In [1]:
import pandas as pd
import regex

In [2]:
# Load the inputs used throughout this notebook:
#   * group and technique feature tables (only the ID / tactic columns are kept),
#   * the labelled (group, technique) pairs,
#   * the tactic table read from CSV, using its first column as the index.
# NOTE: despite its name, `pos_df` is loaded unfiltered here; it is reduced
# to the rows with label == 1.0 in a later cell.
groups_df = pd.read_pickle(
    'data/interim/X_group.pkl'
)[['group_ID']]
techniques_df = pd.read_pickle(
    'data/interim/X_technique.pkl'
)[['technique_ID', 'input_technique_tactics']]
pos_df = pd.read_pickle('data/interim/y_cleaned.pkl')
tactics_df = pd.read_csv('data/raw/tactics_order.csv', index_col=0)

In [3]:
# Preview the single-column frame of group IDs loaded above.
groups_df

Unnamed: 0,group_ID
0,G0099
1,G0006
2,G0005
3,G0023
4,G0025
...,...
131,G0044
132,G0102
133,G0128
134,G0018


In [4]:
# Keep only the positive cases, i.e. the rows whose label equals 1.0.
is_positive = pos_df['label'].eq(1.0)
pos_df = pos_df.loc[is_positive]
pos_df

Unnamed: 0,group_ID,technique_ID,label
60,G0099,T1059.005,1.0
199,G0099,T1105,1.0
211,G0099,T1036.004,1.0
230,G0099,T1571,1.0
238,G0099,T1027,1.0
...,...,...,...
52728,G0045,T1049,1.0
52739,G0045,T1199,1.0
52750,G0045,T1204.002,1.0
52753,G0045,T1078,1.0


In [5]:
# Earliest stage of every technique.
# `input_technique_tactics` is exploded so that each listed tactic gets its own
# row, every tactic is mapped to its `stage_order` via `tactics_df`, and the
# smallest `stage_order` per technique is kept.
technique_tactic_stages = pd.merge(
    left=techniques_df.explode('input_technique_tactics'),
    right=tactics_df[['tactic_name', 'stage_order']],
    how='left', left_on='input_technique_tactics', right_on='tactic_name',
)
# Aggregate only the numeric `stage_order` column with the groupby's own
# NaN-skipping `min`. Passing the Python builtin `min` to `.agg` is deprecated
# in recent pandas and also aggregated the two string columns needlessly.
technique_earliest_stage = (
    technique_tactic_stages
    .groupby('technique_ID', as_index=False)['stage_order']
    .min()
    .rename(columns={'stage_order': 'technique_earliest_stage'})
)
technique_earliest_stage

Unnamed: 0,technique_ID,technique_earliest_stage
0,T1001,10
1,T1001.001,10
2,T1001.002,10
3,T1001.003,10
4,T1003,6
...,...,...
602,T1648,2
603,T1649,6
604,T1650,0
605,T1651,2


In [6]:
# Annotate every positive (group, technique) pair with the technique's
# earliest stage. The left join keeps all rows of `pos_df`.
technique_stage_lookup = technique_earliest_stage[['technique_ID', 'technique_earliest_stage']]
pos_tactic_stage = pos_df.merge(
    technique_stage_lookup,
    on='technique_ID',
    how='left',
)
pos_tactic_stage

Unnamed: 0,group_ID,technique_ID,label,technique_earliest_stage
0,G0099,T1059.005,1.0,2
1,G0099,T1105,1.0,10
2,G0099,T1036.004,1.0,5
3,G0099,T1571,1.0,10
4,G0099,T1027,1.0,5
...,...,...,...,...
3047,G0045,T1049,1.0,7
3048,G0045,T1199,1.0,1
3049,G0045,T1204.002,1.0,2
3050,G0045,T1078,1.0,1


In [7]:
# Earliest stage of every group: the smallest `technique_earliest_stage` among
# the techniques the group is positively linked to.
# Uses the groupby's own NaN-skipping `min` instead of passing the Python
# builtin `min` to `.agg`, which is deprecated in recent pandas.
group_earliest_stage = (
    pos_tactic_stage
    .groupby('group_ID', as_index=False)['technique_earliest_stage']
    .min()
    .rename(columns={'technique_earliest_stage': 'group_earliest_stage'})
)
group_earliest_stage

Unnamed: 0,group_ID,group_earliest_stage
0,G0001,0
1,G0002,5
2,G0003,0
3,G0004,0
4,G0005,1
...,...,...
128,G1009,0
129,G1011,0
130,G1012,0
131,G1013,0


In [8]:
# Add the parent of the current working directory to the module search path
# before importing from the project's `src` package.
# NOTE(review): '..' is resolved relative to the kernel's working directory,
# while the data files above are read relative to that same directory --
# confirm both assumptions hold for the intended launch location.
import sys
sys.path.append ('..')
from src.data.cleaning_4 import _make_interaction_matrix

In [9]:
# Build the labelled group/technique table from the ID frames and the
# positive cases, then show how many rows carry each label.
# NOTE(review): `include_unused=False` presumably leaves out techniques that no
# group uses -- confirm against `_make_interaction_matrix`.
im = _make_interaction_matrix(
    group_IDs_df=groups_df[['group_ID']],
    technique_IDs_df=techniques_df[['technique_ID']],
    positive_cases=pos_df[['group_ID', 'technique_ID']],
    include_unused=False,
)
im['label'].value_counts()

label
0.0    49716
1.0     3052
Name: count, dtype: int64

In [10]:
# Attach each group's and each technique's earliest stage to the matrix.
# Stage columns left over from a previous run of this cell are dropped first,
# so re-running it does not produce `_x` / `_y` suffixed duplicates that would
# break the stage filter applied afterwards.
stage_columns = ['group_earliest_stage', 'technique_earliest_stage']
im = (
    im
    .drop(columns=stage_columns, errors='ignore')
    .merge(group_earliest_stage, how='left', on='group_ID')
    .merge(technique_earliest_stage, how='left', on='technique_ID')
)

In [11]:
# Keep a (group, technique) row only when the technique's earliest stage is at
# or after the group's earliest stage; techniques that sit EARLIER than the
# group's earliest stage are removed.
# Positive rows always survive, because a group's earliest stage is the minimum
# over its own positive techniques, so only label == 0 rows are removed.
# Rows whose `group_earliest_stage` is NaN after the left merge (a group with
# no positive case) compare as False and are dropped as well.
stage_not_before_group = im['group_earliest_stage'] <= im['technique_earliest_stage']
im = im[stage_not_before_group]
im['label'].value_counts()


label
0.0    44474
1.0     3052
Name: count, dtype: int64

In [12]:
# Preview the interaction matrix after the stage-based filter.
im

Unnamed: 0,group_ID,technique_ID,label,group_earliest_stage,technique_earliest_stage
0,G0099,T1548.002,0.0,0.0,4
1,G0099,T1134,0.0,0.0,4
2,G0099,T1134.002,0.0,0.0,4
3,G0099,T1134.001,0.0,0.0,4
4,G0099,T1531,0.0,0.0,12
...,...,...,...,...,...
52763,G0045,T1102.002,0.0,0.0,10
52764,G0045,T1102.001,0.0,0.0,10
52765,G0045,T1102.003,0.0,0.0,10
52766,G0045,T1047,1.0,0.0,2


In [13]:
# Keep the positive cases of groups that have strictly more than `n` of them.
n = 10
cases_per_group = pos_df['group_ID'].value_counts()
has_enough_cases = pos_df['group_ID'].map(cases_per_group).gt(n)
result = pos_df.loc[has_enough_cases]
result

Unnamed: 0,group_ID,technique_ID,label
396,G0006,T1087.001,1.0
403,G0006,T1583.001,1.0
422,G0006,T1560.001,1.0
424,G0006,T1119,1.0
449,G0006,T1059.003,1.0
...,...,...,...
52728,G0045,T1049,1.0
52739,G0045,T1199,1.0
52750,G0045,T1204.002,1.0
52753,G0045,T1078,1.0


In [14]:
from src.data.cleaning_4 import _make_interaction_matrix_2

In [19]:
# Run the extended builder on the same inputs, additionally passing the
# technique -> tactics mapping and the tactic table, with the stage-based
# limit switched on and a group-instance limit of 15, then count rows per label.
# NOTE(review): the exact meaning of the two `limit_*` options is defined in
# `_make_interaction_matrix_2` -- confirm there.
res = _make_interaction_matrix_2(
    group_IDs_df=groups_df[['group_ID']],
    technique_IDs_df=techniques_df[['technique_ID']],
    positive_cases=pos_df[['group_ID', 'technique_ID']],
    technique_tactics_df=techniques_df[['technique_ID', 'input_technique_tactics']],
    tactics_order_df=tactics_df,
    limit_technique_based_on_earliest_tactic_stage=True,
    limit_group_instances=15,
)
res['label'].value_counts()

label
0.0    23113
1.0     2571
Name: count, dtype: int64

In [16]:
# Baseline for comparison: the original builder called with only the three
# required frames (every other option left at its default), then the number
# of rows per label.
res = _make_interaction_matrix(
    group_IDs_df=groups_df[['group_ID']],
    technique_IDs_df=techniques_df[['technique_ID']],
    positive_cases=pos_df[['group_ID', 'technique_ID']],
)
res['label'].value_counts()

label
0.0    49716
1.0     3052
Name: count, dtype: int64

In [17]:
# Re-read the label file as stored on disk (not filtered to label == 1.0)
# to inspect its contents.
pd.read_pickle ('data/interim/y_cleaned.pkl')

Unnamed: 0,group_ID,technique_ID,label
0,G0099,T1548.002,0.0
1,G0099,T1134,0.0
2,G0099,T1134.002,0.0
3,G0099,T1134.001,0.0
4,G0099,T1531,0.0
...,...,...,...
52763,G0045,T1102.002,0.0
52764,G0045,T1102.001,0.0
52765,G0045,T1102.003,0.0
52766,G0045,T1047,1.0
