Filtering Most Frequent Features to reduce high cardinality

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the interim feature tables, grouped by entity. Each entity has a
# working frame and an `_org` counterpart.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only
# load files produced by this project's own pipeline.
techniques_df = pd.read_pickle('../data/interim/X_technique.pkl')
techniques_org_df = pd.read_pickle('../data/interim/X_technique_org.pkl')

groups_df = pd.read_pickle('../data/interim/X_group.pkl')
groups_org_df = pd.read_pickle('../data/interim/X_group_org.pkl')

In [3]:
from statistics import mode
def get_feature_length_stats(df: pd.DataFrame) -> None:
    """Print the mean, mode and max element length of each feature column.

    The first column of ``df`` is skipped (presumably an identifier column --
    confirm for every frame passed in). For every remaining column, ``len()``
    is applied to each cell and one summary line is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns after the first hold sized values (anything
        ``len()`` accepts, e.g. lists, strings or arrays).

    Returns
    -------
    None
        Results are printed to stdout, one line per column.

    Raises
    ------
    TypeError
        If a cell in a processed column does not support ``len()``.
    statistics.StatisticsError
        If ``df`` has no rows (``statistics.mode`` rejects empty data).
    """
    # Index slicing already yields an iterable of labels; no list() copy needed.
    for col in df.columns[1:]:
        # Pass the builtin directly instead of wrapping it in a lambda.
        list_lengths = df[col].apply(len)
        avg_len = list_lengths.mean()
        # statistics.mode returns the first mode encountered on ties (Python 3.8+).
        mode_len = mode(list_lengths)
        max_len = list_lengths.max()
        print('{col} \tmean_len: {mean_len}\tmode_len: {mode_len}\tmax_len: {max_len}'.format(col = col, mean_len= avg_len, mode_len = mode_len, max_len = max_len))
        
def get_vocab_size(df: pd.DataFrame):
    """Print how many distinct values each feature column holds once exploded.

    The first column of ``df`` is skipped. Every other column is flattened
    with ``Series.explode`` so each list element becomes its own row, and the
    number of unique resulting values is reported. ``unique()`` keeps missing
    values, so a NaN entry counts towards the total.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns after the first hold list-like or scalar cells.
    """
    report_line = '{feature_name}: vocab size = {vocab_size}'
    feature_columns = df.columns[1:]
    for feature_name in feature_columns:
        distinct_values = df[feature_name].explode().unique()
        print(report_line.format(feature_name = feature_name, vocab_size = len(distinct_values)))

In [4]:
# Per-column length statistics and vocabulary sizes for the `_org` technique
# frame, as a baseline to compare with `techniques_df` further down.
get_feature_length_stats(df = techniques_org_df)
get_vocab_size (techniques_org_df)

input_technique_platforms 	mean_len: 2.510708401976936	mode_len: 1	max_len: 10
input_technique_tactics 	mean_len: 1.2932454695222406	mode_len: 1	max_len: 4
input_technique_data_sources 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_defenses_bypassed 	mean_len: 1.2289950576606261	mode_len: 1	max_len: 8
input_technique_permissions_required 	mean_len: 1.1894563426688634	mode_len: 1	max_len: 4
input_technique_mitigation_id 	mean_len: 2.1301482701812193	mode_len: 1	max_len: 11
input_technique_detection_name 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_software_id 	mean_len: 14.179571663920923	mode_len: 1	max_len: 334
input_technique_description 	mean_len: 1044.9093904448105	mode_len: 750	max_len: 4381
input_technique_platforms: vocab size = 11
input_technique_tactics: vocab size = 14
input_technique_data_sources: vocab size = 106
input_technique_defenses_bypassed: vocab size = 25
input_technique_permissions_required: vocab size = 6
input_techniq

In [5]:
# Spot-check one description cell (row position 7) by looking at its shape.
techniques_df['input_technique_description'].iloc[7].shape

(768,)

In [6]:
# Count how often each software ID occurs across all techniques: explode()
# turns every list element into its own row before value_counts() tallies them.
# NOTE(review): the most frequent entry in the output is blank -- looks like an
# empty-string placeholder; confirm and consider excluding it from the vocabulary.
techniques_df['input_technique_software_id'].explode().value_counts()

input_technique_software_id
         201
s0363     72
s0260     72
s0154     72
s0650     71
        ... 
s0112      1
s0001      1
s0016      1
s0014      1
s0026      1
Name: count, Length: 636, dtype: int64

In [7]:
# Same length and vocabulary summary for the working technique frame.
get_feature_length_stats(df = techniques_df)
get_vocab_size (techniques_df)

input_technique_data_sources 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_defenses_bypassed 	mean_len: 1.2289950576606261	mode_len: 1	max_len: 8
input_technique_detection_name 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_mitigation_id 	mean_len: 2.1301482701812193	mode_len: 1	max_len: 11
input_technique_permissions_required 	mean_len: 1.1894563426688634	mode_len: 1	max_len: 4
input_technique_platforms 	mean_len: 2.510708401976936	mode_len: 1	max_len: 10
input_technique_software_id 	mean_len: 14.179571663920923	mode_len: 1	max_len: 334
input_technique_tactics 	mean_len: 1.2932454695222406	mode_len: 1	max_len: 4
input_technique_description 	mean_len: 768.0	mode_len: 768	max_len: 768
input_technique_data_sources: vocab size = 106
input_technique_defenses_bypassed: vocab size = 25
input_technique_detection_name: vocab size = 106
input_technique_mitigation_id: vocab size = 44
input_technique_permissions_required: vocab size = 6
input_technique

In [8]:
# Length and vocabulary summary for the group frame.
# NOTE(review): the vocab size reported for `input_group_description` counts
# distinct float components of the description vectors, not categorical
# tokens -- presumably not meaningful as a cardinality figure.
get_feature_length_stats(df = groups_df)
get_vocab_size (groups_df)

input_group_software_id 	mean_len: 6.1911764705882355	mode_len: 1	max_len: 46
input_group_description 	mean_len: 768.0	mode_len: 768	max_len: 768
input_group_software_id: vocab size = 464
input_group_description: vocab size = 104378


In [9]:
# Preview the group frame (pandas truncates the display to the first and last rows).
groups_df

Unnamed: 0,group_ID,input_group_software_id,input_group_description
0,G0099,[s0434],"[-0.41692165, -0.26227784, -0.47415727, 0.0276..."
1,G0006,"[s0017, s0025, s0119, s0026, s0121, s0002, s00...","[-0.6153141, 0.07517785, -0.5039633, -0.243659..."
2,G0005,"[s0040, s0015, s0003]","[-0.32223642, 0.029669948, -0.7427179, -0.0198..."
3,G0023,[s0064],"[-0.39972576, -0.06616345, -0.55075675, -0.038..."
4,G0025,[s0069],"[-0.608141, -0.007160956, -0.7598587, 0.167140..."
...,...,...,...
131,G0044,"[s0501, s0013, s0141]","[-0.3997315, 0.011453139, -0.5749509, -0.17962..."
132,G0102,"[s0552, s0534, s0521, s0154, s0575, s0024, s03...","[-0.27592593, -0.100573234, -0.37286678, -0.02..."
133,G0128,[],"[-0.48862642, -0.18858606, -0.5323841, -0.0883..."
134,G0018,"[s0043, s0042, s0039, s0012, s0096, s0100, s0104]","[-0.2925966, 0.0350828, -0.64397854, 0.130665,..."
