In [49]:
import pandas as pd
from src.models.model1.recommend import build_new_group_profile
from src.data.build_features_3 import build_group_interaction_rate
from sklearn.preprocessing import StandardScaler

In [50]:
# Load the pickled inputs: the test split, the processed group/technique
# feature tables, and the two label frames used further down.
# NOTE(review): the technique-features file name is spelled "technnique";
# presumably this matches the file on disk -- confirm before renaming.
test_data = pd.read_pickle('data/test/m06c_1_1_test_data.pkl')
group_features = pd.read_pickle('data/processed/model1/processed_group_features.pkl')
processed_technique_features = pd.read_pickle(
    'data/processed/model1/processed_technnique_features.pkl'
)

train_labels = pd.read_pickle('data/processed/model1/processed_train_labels.pkl')
y_cleaned = pd.read_pickle('data/interim/y_cleaned.pkl')


In [51]:
# Preview the first rows of the processed technique feature table.
processed_technique_features.head()

Unnamed: 0,technique_ID,input_technique_data_sources,input_technique_detection_name,input_technique_mitigation_id,input_technique_platforms,input_technique_software_id,input_technique_tactics,input_technique_description,input_technique_interaction_rate
0,T1548,"[command_command_execution, file_file_metadata...","[command_execution, file_metadata, file_modifi...","[m1047, m1038, m1028, m1026, m1022, other]","[linux, windows, macos]",[],"[defense_evasion, privilege_escalation]","[-0.36630338, -0.24036503, -0.36260876, 0.0984...",-0.638444
1,T1548.002,"[command_command_execution, process_process_cr...","[command_execution, process_creation, windows_...","[m1047, m1026, other]",[windows],"[s0584, s0640, s0606, s0570, s1068, s0089, s10...","[defense_evasion, privilege_escalation]","[-0.5305501, -0.43114784, -0.67361057, 0.16222...",0.075537
2,T1548.004,"[process_os_api_execution, process_process_cre...","[os_api_execution, process_creation]",[m1038],[macos],[s0402],"[defense_evasion, privilege_escalation]","[-0.57463825, -0.65824693, -0.23797776, -0.254...",-0.638444
3,T1548.001,"[command_command_execution, file_file_metadata...","[command_execution, file_metadata, file_modifi...",[m1028],"[linux, macos]",[other],"[defense_evasion, privilege_escalation]","[-0.7420097, -0.461457, -0.45979652, -0.116038...",-0.638444
4,T1548.003,"[command_command_execution, file_file_modifica...","[command_execution, file_modification, process...","[m1028, m1026, m1022]","[linux, macos]","[s0154, s0279, other]","[defense_evasion, privilege_escalation]","[-0.8037999, -0.6815256, 0.18332177, 0.0635958...",-0.638444


In [52]:
# Fit a StandardScaler on the number of positive (label == 1) rows per group
# in the cleaned label frame.
pos_y = y_cleaned.loc[y_cleaned['label'].eq(1)]
counts = pos_y['group_ID'].value_counts().to_numpy().reshape(-1, 1)
scaler = StandardScaler()
scaler.fit(counts)

1. Interaction rate: the group's interaction count, standardized with a scaler fitted on the per-group interaction counts of the training set.

In [53]:
# Fit the scaler on per-group positive-interaction counts from the training
# labels, then standardize an example interaction count with it.
pos_y = train_labels.loc[train_labels['label'].eq(1.0)]
train_interaction_count = pos_y['group_ID'].value_counts()
scaler = StandardScaler()
scaler.fit(train_interaction_count.to_numpy().reshape(-1, 1))

group_interaction_count = 10
# transform() returns a (1, 1) array; pull out the single scalar.
group_interaction_rate = scaler.transform([[group_interaction_count]])[0][0]
group_interaction_rate


-1.257584368332994

2. Interacted tactics

In [54]:
# Gather the tactics of every detected technique; repeats are kept, so a
# tactic shared by several techniques appears once per technique.
detected_techniques = ['T1078', 'T1047', 'T1059', 'T1059.001']
detected_techniques_features = processed_technique_features.loc[
    processed_technique_features['technique_ID'].isin(detected_techniques)
]
group_interacted_tactics = (
    detected_techniques_features['input_technique_tactics'].explode().tolist()
)

3. Interacted software

In [55]:
# Gather the distinct software IDs linked to any of the detected techniques.
detected_techniques = ['T1078', 'T1047', 'T1059', 'T1059.001', 'T1059.003']
detected_techniques_features = processed_technique_features.loc[
    processed_technique_features['technique_ID'].isin(detected_techniques)
]
group_software = (
    detected_techniques_features['input_technique_software_id']
    .explode()
    .unique()
    .tolist()
)
print (len(group_software))

198


In [90]:
def build_detected_group_profile(processed_group_features: pd.DataFrame,
                                 processed_technique_features: pd.DataFrame,
                                 detected_techniques: list, threshold: int,
                                 train_labels: pd.DataFrame,
                                 group_id: str, settings: dict) -> pd.DataFrame:
    """Build a one-row feature frame for a newly detected group.

    The description embedding is always the element-wise mean of the
    description embeddings of the groups that have at least one positive
    label in ``train_labels``. The remaining features depend on how many
    techniques were detected:

    * fewer than ``threshold`` detected techniques -- fall back to aggregates
      over the interacted training groups:
        - interaction rate: min or mean of their ``input_group_interaction_rate``
          (chosen by ``settings['interaction']``),
        - tactics: each tactic repeated by its rounded average count per group,
        - software: the N most frequent software IDs (placeholders ``'other'``
          and ``''`` excluded), N being the rounded mean software count per group.
    * at least ``threshold`` detected techniques -- derive them from the
      detected techniques themselves:
        - interaction rate: the detected-technique count standardized with a
          ``StandardScaler`` fitted on per-group positive counts in ``train_labels``,
        - tactics: all tactics of the detected techniques (repeats kept),
        - software: distinct software IDs of the detected techniques.

    Args:
        processed_group_features: Group feature table with columns ``group_ID``,
            ``input_group_description``, ``input_group_interaction_rate``,
            ``input_group_tactics`` and ``input_group_software_id``.
        processed_technique_features: Technique feature table with columns
            ``technique_ID``, ``input_technique_tactics`` and
            ``input_technique_software_id``.
        detected_techniques: Technique IDs observed for the group.
        threshold: Minimum number of detected techniques required to build the
            profile from the techniques instead of the training-group fallback.
        train_labels: Label table with columns ``group_ID`` and ``label``.
        group_id: Identifier written to the ``group_ID`` column of the result.
        settings: Options dict; ``settings['interaction']`` selects ``'min'`` or
            ``'avg'`` for the fallback interaction rate. Any other value leaves
            the fallback rate at 0.

    Returns:
        A single-row DataFrame (index 0) with columns ``group_ID``,
        ``input_group_software_id``, ``input_group_tactics``,
        ``input_group_description`` and ``input_group_interaction_rate``.

    Raises:
        ValueError: If ``train_labels`` has no positive rows, or if the fallback
            branch is taken and none of the interacted groups appear in
            ``processed_group_features``.
    """
    # NOTE(review): duplicate IDs in detected_techniques are counted here but
    # collapse in the isin() lookup below -- confirm callers pass unique IDs.
    group_interaction_count = len(detected_techniques)

    pos_y = train_labels[train_labels['label'] == 1]
    if pos_y.empty:
        # Nothing to aggregate over or to fit the scaler on.
        raise ValueError('train_labels contains no positive (label == 1) rows')

    interacted_groups = list(pos_y['group_ID'].unique())
    interacted_group_features = processed_group_features[
        processed_group_features['group_ID'].isin(interacted_groups)
    ]

    # Average pooling of the interacted groups' description embeddings; this
    # value does not depend on the threshold.
    group_description = (
        interacted_group_features['input_group_description']
        .apply(pd.Series)
        .mean()
        .tolist()
    )

    if group_interaction_count < threshold:
        # Too few detections: borrow aggregate statistics from training groups.
        if interacted_group_features.empty:
            raise ValueError(
                'none of the interacted groups are present in processed_group_features'
            )

        group_rates = interacted_group_features['input_group_interaction_rate']
        if settings['interaction'] == 'min':
            group_interaction_rate = group_rates.min()
        elif settings['interaction'] == 'avg':
            group_interaction_rate = group_rates.mean()
        else:
            # Unrecognised mode: keep the historical default of 0.
            group_interaction_rate = 0

        # Average number of times each tactic occurs per interacted group,
        # rounded to an integer repeat count.
        avg_tactic_rate = (
            interacted_group_features['input_group_tactics'].explode().value_counts()
            / len(interacted_groups)
        )
        rounded_avg_tactic_rate = avg_tactic_rate.round().astype(int)
        group_interacted_tactics = [[
            tactic
            for tactic, repeat in rounded_avg_tactic_rate.items()
            for _ in range(repeat)
        ]]

        software_lists = interacted_group_features['input_group_software_id']
        avg_software_count = int(software_lists.apply(len).mean().round())
        software_frequency = (
            software_lists.explode().value_counts().sort_values(ascending=False)
        )
        # Filter the placeholders instead of list.remove(), which raises
        # ValueError when a placeholder is absent from the data.
        most_frequent_software = [
            software for software in software_frequency.index
            if software not in ('other', '')
        ]
        group_software = [most_frequent_software[0:avg_software_count]]
    else:
        # Enough detections: standardize the count against training groups.
        # The scaler is only needed (and therefore only fitted) in this branch.
        train_interaction_count = pos_y['group_ID'].value_counts()
        scaler = StandardScaler()
        scaler.fit(train_interaction_count.values.reshape(-1, 1))
        # transform() returns a (1, 1) array; take the single scalar.
        group_interaction_rate = scaler.transform([[group_interaction_count]])[0][0]

        detected_techniques_features = processed_technique_features[
            processed_technique_features['technique_ID'].isin(detected_techniques)
        ]
        # Tactics keep repeats; software IDs are de-duplicated.
        group_interacted_tactics = [
            list(detected_techniques_features['input_technique_tactics'].explode().values)
        ]
        group_software = [
            list(detected_techniques_features['input_technique_software_id'].explode().unique())
        ]

    # List-valued features are wrapped in an outer list so that each one lands
    # in a single cell of the one-row frame.
    values = {
        'group_ID': group_id,
        'input_group_software_id': group_software,
        'input_group_tactics': group_interacted_tactics,
        'input_group_description': [group_description],
        'input_group_interaction_rate': group_interaction_rate,
    }
    detected_group_features = pd.DataFrame(values, index=[0])
    return detected_group_features


# Example run: build a profile from the first six detected techniques using
# the 'min' fallback setting and a threshold of 5.
detected_techniques = [
    'T1078', 'T1047', 'T1059', 'T1059.001', 'T1059.003', 'T1059.001', 'T1059.003',
]
settings = {'interaction': 'min'}
res = build_detected_group_profile(
    processed_group_features=group_features,
    processed_technique_features=processed_technique_features,
    detected_techniques=detected_techniques[:6],
    threshold=5,
    train_labels=train_labels,
    group_id='yah',
    settings=settings,
)
res

Unnamed: 0,group_ID,input_group_software_id,input_group_tactics,input_group_description,input_group_interaction_rate
0,yah,"[s0234, s0023, s0334, s0695, s0363, s0434, s04...","[execution, execution, execution, defense_evas...","[-0.4318556785583496, -0.12413875013589859, -0...",-1.342618
