In [1]:
import pandas as pd
import numpy as np

In [2]:
# Pickled technique/group frames from the interim data folder
# (the "m1pp_*" files), loaded technique first, then group.
techniques_df, groups_df = (
    pd.read_pickle('../data/interim/m1pp_technique.pkl'),
    pd.read_pickle('../data/interim/m1pp_group.pkl'),
)

# The "X_*_org" counterparts, loaded in the same order.
techniques_org_df, groups_org_df = (
    pd.read_pickle('../data/interim/X_technique_org.pkl'),
    pd.read_pickle('../data/interim/X_group_org.pkl'),
)

In [3]:
from statistics import mode
def get_feature_length_stats(df: pd.DataFrame):
    """Print list-length statistics for every column of ``df`` except the first.

    For each column after the first one, exactly one line is printed:

    * ``<col>: empty`` when the column has no rows,
    * ``<col>: scalar`` when the value in the first row is not a ``list``,
    * otherwise the mean, mode and maximum of the per-row list lengths.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose first column is skipped (presumably an identifier
        column -- confirm against the data) and whose remaining columns
        hold either lists or non-list values.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    for col in list(df.columns)[1:]:
        values = df[col]

        # An empty column has no first row to inspect; `.iloc[0]` would
        # raise IndexError, so report it explicitly instead.
        if values.empty:
            print('{col}: empty'.format(col=col))
            continue

        # Only the first row decides whether the column is list-valued.
        if not isinstance(values.iloc[0], list):
            print('{col}: scalar'.format(col=col))
            continue

        list_lengths = values.apply(len)
        avg_len = list_lengths.mean()
        mode_len = mode(list_lengths)
        max_len = list_lengths.max()
        print('{col} \tmean_len: {mean_len}\tmode_len: {mode_len}\tmax_len: {max_len}'.format(
            col=col, mean_len=avg_len, mode_len=mode_len, max_len=max_len))
        
def get_vocab_size (df: pd.DataFrame):
    """Print the number of distinct values in every column after the first.

    Each column is exploded so that list-like entries contribute one
    row per element, and the distinct values of the exploded column
    are counted.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose first column is skipped; the remaining columns are
        reported one per printed line.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    feature_columns = df.columns[1:]
    for column in feature_columns:
        flattened = df[column].explode()
        distinct_values = flattened.unique()
        # NOTE(review): missing values produced by explode (e.g. from empty
        # lists) would be counted as one vocabulary entry -- confirm intent.
        print('{feature_name}: vocab size = {vocab_size}'.format(
            feature_name=column, vocab_size=len(distinct_values)))

- Original cardinality

In [4]:
# Report list-length statistics, then vocabulary sizes, for techniques_org_df.
get_feature_length_stats(techniques_org_df)
get_vocab_size(df=techniques_org_df)

input_technique_platforms 	mean_len: 2.510708401976936	mode_len: 1	max_len: 10
input_technique_tactics 	mean_len: 1.2932454695222406	mode_len: 1	max_len: 4
input_technique_data_sources 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_defenses_bypassed 	mean_len: 1.2289950576606261	mode_len: 1	max_len: 8
input_technique_permissions_required 	mean_len: 1.1894563426688634	mode_len: 1	max_len: 4
input_technique_mitigation_id 	mean_len: 2.1301482701812193	mode_len: 1	max_len: 11
input_technique_detection_name 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_software_id 	mean_len: 14.179571663920923	mode_len: 1	max_len: 334
input_technique_description: scalar
input_technique_platforms: vocab size = 11
input_technique_tactics: vocab size = 14
input_technique_data_sources: vocab size = 106
input_technique_defenses_bypassed: vocab size = 25
input_technique_permissions_required: vocab size = 6
input_technique_mitigation_id: vocab size = 44
input_technique_

In [5]:
# Shape of the description value stored at row position 7 (a spot check of one row).
techniques_df['input_technique_description'].iat[7].shape

(768,)

- Reduced cardinality

In [6]:
# Report list-length statistics, then vocabulary sizes, for techniques_df.
get_feature_length_stats(techniques_df)
get_vocab_size(df=techniques_df)

input_technique_data_sources 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_defenses_bypassed 	mean_len: 1.2289950576606261	mode_len: 1	max_len: 8
input_technique_detection_name 	mean_len: 3.154859967051071	mode_len: 3	max_len: 14
input_technique_mitigation_id 	mean_len: 2.1301482701812193	mode_len: 1	max_len: 11
input_technique_permissions_required 	mean_len: 1.1894563426688634	mode_len: 1	max_len: 4
input_technique_platforms 	mean_len: 2.510708401976936	mode_len: 1	max_len: 10
input_technique_software_id 	mean_len: 14.179571663920923	mode_len: 1	max_len: 334
input_technique_tactics 	mean_len: 1.2932454695222406	mode_len: 1	max_len: 4
input_technique_description: scalar
input_technique_interaction_rate: scalar
input_technique_data_sources: vocab size = 106
input_technique_defenses_bypassed: vocab size = 25
input_technique_detection_name: vocab size = 106
input_technique_mitigation_id: vocab size = 44
input_technique_permissions_required: vocab size = 6
input_tech

In [7]:
# Report list-length statistics for groups_df.
get_feature_length_stats(groups_df)
# NOTE(review): the vocabulary-size report is disabled for groups_df --
# confirm whether that is intentional or the call should be restored.
# get_vocab_size (groups_df)

input_group_software_id 	mean_len: 6.1911764705882355	mode_len: 1	max_len: 46
input_group_description: scalar
input_group_interaction_rate: scalar
input_group_tactics 	mean_len: 19.147058823529413	mode_len: 1	max_len: 108


---

In [8]:
import tensorflow as tf

In [9]:
# Load the saved test dataset and keep the first entry of its element spec,
# which the cells below treat as a mapping keyed by input name.
test_dataset = tf.data.Dataset.load(path='../data/processed/model1/test_dataset/')
inputs = test_dataset.element_spec[0]

In [10]:
# Display the first entry of the test dataset's element spec (bound to `inputs` when the dataset was loaded).
inputs

{'input_technique_platforms': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_group_tactics': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_group_interaction_rate': TensorSpec(shape=(1,), dtype=tf.float32, name=None),
 'input_technique_tactics': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_group_description': TensorSpec(shape=(768,), dtype=tf.float32, name=None),
 'input_technique_mitigation_id': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_technique_software_id': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_technique_permissions_required': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_technique_description': TensorSpec(shape=(768,), dtype=tf.float32, name=None),
 'input_technique_data_sources': RaggedTensorSpec(TensorShape([None]), tf.string, 0, tf.int64),
 'input_technique_defenses_bypassed': RaggedTensorSpec(TensorShape([None]), tf.st

In [11]:
# Names of the inputs whose key starts with 'input_technique', in spec order.
input_technique = [
    name
    for name in inputs
    if name.startswith('input_technique')
]
input_technique

['input_technique_platforms',
 'input_technique_tactics',
 'input_technique_mitigation_id',
 'input_technique_software_id',
 'input_technique_permissions_required',
 'input_technique_description',
 'input_technique_data_sources',
 'input_technique_defenses_bypassed',
 'input_technique_interaction_rate',
 'input_technique_detection_name']

In [12]:
# Names of the inputs whose key starts with 'input_group', in spec order.
input_group = [
    name
    for name in inputs
    if name.startswith('input_group')
]
input_group

['input_group_tactics',
 'input_group_interaction_rate',
 'input_group_description',
 'input_group_software_id']