In [7]:
import sys
sys.path.append ('..')
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from src.models.model1.evaluate import get_report_data


In [8]:
report_data = pd.read_pickle ('../data/test/report_data.pkl')
look_up_table = pd.read_pickle ('../data/lookup_tables/m06d_1_2_a.pkl')

FileNotFoundError: [Errno 2] No such file or directory: '../data/test/report_data.pkl'

# Building Test Data

In [None]:
def get_interacted_tactic_range (interacted_techniques: list, look_up_table: pd.DataFrame()):
    """From a list of interacted techniques: Returns a tuple containing the earliest and latest tactic stage
    """
    interacted_table = look_up_table[look_up_table['technique_ID'].isin(interacted_techniques)]
    earliest_stage = interacted_table['technique_earliest_stage'].min()
    latest_stage = interacted_table['technique_earliest_stage'].max()
    
    return (earliest_stage, latest_stage)

def get_cadidate_techniques (interacted_techniques: list,  look_up_table: pd.DataFrame(), n: int, mode: str = 'latest'):
    """From a list of interacted techniques: Returns a list of candidate techniques. \n
    Step 1: Takes n most similar techniques for each interacted techniques.\n
    Step 2: From the list of Step 1: filter some techniques based on the tactic stage of the interacted techniques\n
        If `mode == 'latest'`: remove candidate techniques if their latest tactic stage is before the latest interacted stage\n
        If `mode == 'earliest'`: remove candidate techniques if their latest tactic stage is before the earliest interacted stage
    """
    interacted_table = look_up_table[look_up_table['technique_ID'].isin(interacted_techniques)]
    # get the first n items in each list
    interacted_table.loc[:, 'sorted_similar_techniques'] = interacted_table['sorted_similar_techniques'].apply(lambda x: x[0:n])
    # filter duplicates by getting unique values
    candidate_techniques = list(interacted_table['sorted_similar_techniques'].explode().unique())
    
    earliest_interacted_stage, latest_interacted_stage = get_interacted_tactic_range (interacted_techniques, look_up_table)
    candidate_table = look_up_table[look_up_table['technique_ID'].isin(candidate_techniques)]
    if mode == 'latest':
        candidate_techniques = list (candidate_table[candidate_table['technique_latest_stage'] >= latest_interacted_stage]['technique_ID'].values)
    elif mode == 'earliest':
        candidate_techniques = list (candidate_table[candidate_table['technique_latest_stage'] >= earliest_interacted_stage]['technique_ID'].values)
    return candidate_techniques

def make_test_data (report_data: pd.DataFrame, look_up_table: pd.DataFrame(), n: int = 200, mode: str = 'latest'):

    """From the CISA report data, make data for testing. Method:\n
    1. For each report, iteratively take from interacted Techniques as "detected techniques" and the rest as "true subsequent techniques".\n
    2. For each list of "detected techniques", get the candidate Techniques from the provided look-up table

    Args:
        report_data (pd.DataFrame): CiSA report data
        look_up_table (pd.DataFrame): look-up table created from a model
        n (int, optional): number of most similar Technique for each detected techniques. Defaults to 200.
        mode (str, optional): filter mode for look-up table. Defaults to 'latest'.

    Returns:
        _type_: _description_
    """
    test_group_IDs = []
    test_detected_techniques = []
    test_true_subsequent_techniques = []
    test_candidate_techniques = []
    for _, row in report_data.iterrows():
        group_ID = row['group_ID']
        for i in range (len (row['active_techniques'])-1):
            detected_techniques = row['active_techniques'][0:i+1]
            true_subsequent_techniques_techniques = row['active_techniques'][i+1:]
            candidate_techniques = get_cadidate_techniques (interacted_techniques = detected_techniques, look_up_table=look_up_table, n = n, mode = mode)
            
            test_group_IDs.append (group_ID)
            test_detected_techniques.append (detected_techniques)
            test_true_subsequent_techniques.append (true_subsequent_techniques_techniques)
            test_candidate_techniques.append (candidate_techniques)
    data = {
        'group_ID': test_group_IDs,
        'detected_techniques': test_detected_techniques,
        'candidate_techniques': test_candidate_techniques,
        'true_subsequent_techniques': test_true_subsequent_techniques,
    }
    res_df = pd.DataFrame(data = data)
    return res_df

# Making Prediction

In [None]:
def build_detected_group_profile (processed_group_features: pd.DataFrame(),
                                  processed_technique_features: pd.DataFrame(), 
                                  detected_techniques: list , threshold: int,
                                  train_labels: pd.DataFrame(), 
                                  group_id: str, settings: dict):
    """ 
    Build features for a newly detected group, including:
    1. Interaction rate (float): the initial value equals to the avg or min interaction rate of the interacted groups\n
    2. Interacted tactics (list of tactics): initial value: for each tactic from the interacted groups, the number of tactic interaction is the average number of interactions for that tactic\n
    3. Used software (list of software): initial value: the N most commonly used software, where N is the number of average software used by interacted groups\n

    Args:
        settings (dict): `'initial_interaction'`: set the initial interaction rate of the new group to the avg or min of the rate of train set

    """

    group_interaction_count = len(detected_techniques)
    
    # make a standard scaler, fit on train set's distribution
    pos_y = train_labels[train_labels['label'] == 1]
    train_interaction_count = pos_y['group_ID'].value_counts()
    scaler = StandardScaler()
    scaler.fit (train_interaction_count.values.reshape (-1,1))
    
    interacted_groups = list(pos_y['group_ID'].unique())
    interacted_group_features = processed_group_features [processed_group_features['group_ID'].isin(interacted_groups)]    
    
    # get the list of most frequent software from train set
    avg_software_interaction_rate = interacted_group_features['input_group_software_id'].apply(len).mean().round().astype(int)
    most_frequent_software = interacted_group_features['input_group_software_id'].explode().value_counts().sort_values(ascending = False)
    most_frequent_software = list(most_frequent_software.index)
    most_frequent_software.remove('other')
    most_frequent_software.remove('')
    
    group_interaction_rate = 0
    group_interacted_tactics = [[]]
    group_software =  [[]]
    
    ### 👉 Assign initial values if group has interaction count less than threshold 
    if group_interaction_count < threshold:
        if settings['initial_interaction'] == 'min':
            group_interaction_rate = scaler.transform(np.array(train_interaction_count.min()).reshape(1, -1)).item()
        elif settings['initial_interaction'] == 'avg':
            group_interaction_rate = 0
        
        avg_tactic_rate = interacted_group_features['input_group_tactics'].explode().value_counts()/len(interacted_groups)
        rounded_avg_tactic_rate = avg_tactic_rate.round().astype(int)
        group_interacted_tactics = [[idx for idx, val in rounded_avg_tactic_rate.items() for _ in range(val)]]
    
        group_software = [most_frequent_software[0:avg_software_interaction_rate]]
    
    elif group_interaction_count >= threshold:
        group_interaction_rate = scaler.transform([[group_interaction_count]])
        group_interaction_rate = group_interaction_rate[0][0]
        
        detected_techniques_features = processed_technique_features[processed_technique_features['technique_ID'].isin (detected_techniques)]
        group_interacted_tactics = [list (detected_techniques_features['input_technique_tactics'].explode().values)]
        possible_software = [list(detected_techniques_features['input_technique_software_id'].explode().unique())]
        # possible_software = detected_techniques_features['input_technique_software_id'].explode().unique()
        # possible_software = [software for software in possible_software if software in most_frequent_software[0:avg_software_interaction_rate]]
        group_software = [list (detected_techniques_features['input_technique_software_id'].explode().unique())]
    
    values = {
        'group_ID': group_id,
        'input_group_software_id': group_software,
        'input_group_tactics': group_interacted_tactics,
        'input_group_interaction_rate': group_interaction_rate,
        
    }
    detected_group_features = pd.DataFrame(values, index=[0])
    return detected_group_features

In [None]:
import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from src.models.model1.model_preprocess import build_dataset_3
from src.models.model1.predict import get_metrics
group_features = pd.read_pickle ('../data/processed/model1/processed_group_features.pkl')
processed_technique_features = pd.read_pickle ('../data/processed/model1/processed_technnique_features.pkl')
train_labels =pd.read_pickle ('../data/processed/model1/processed_train_labels.pkl')
cv_dataset = Dataset.load ('../data/processed/model1/cv_dataset')

model = keras.models.load_model ('../trained_models/model1/m06d_1_2_a')
metrics = get_metrics (model, cv_dataset)
prediction_threshold = metrics['best_threshold']
metrics




{'best_threshold': 0.61539865,
 'best_f1_score': 0.4622222221723692,
 'auc_pr': 0.41657233}

In [9]:
reports = [
    {'code': 'aa23-339a', 'passive_techniques': ['T1046']},
    {'code': 'aa23-201a', 'passive_techniques': ['T1584', 'T1588.002']},
    {'code': 'aa22-320a', 'passive_techniques': None},
    {'code': 'aa23-074a', 'passive_techniques': ['T1059.006', 'T1190', 'T1105', 'T1505.003', 'T1595.002']}
]
report_data = get_report_data (reports)
test_data = make_test_data(report_data=report_data, look_up_table= look_up_table)

In [15]:
for _, row in report_data.iterrows():
    print (len(row['active_techniques']))

21
33
16
16


In [8]:
test_data_with_preds = test_data.copy()
test_data_with_preds['predicted_techniques'] = None

In [14]:
group_profile_update_threshold = 5
prediction_threshold = metrics['best_threshold'] 
settings = {'initial_interaction' : 'avg'}
 
for index, row in test_data_with_preds.iterrows():
    detected_techniques = row['detected_techniques']
    candidate_techniques = row['candidate_techniques']
    group_ID = row['group_ID']
    candidate_technique_features = processed_technique_features[processed_technique_features['technique_ID'].isin(candidate_techniques)]
    # build group profile based on detected tecniques
    detected_group_profile = build_detected_group_profile (processed_group_features= group_features,
                                    processed_technique_features = processed_technique_features ,
                                    detected_techniques= detected_techniques, threshold=group_profile_update_threshold,train_labels= train_labels,group_id= group_ID, settings=settings)
    groups = pd.concat ([groups, detected_group_profile])
    aligned_group_profile = pd.concat ([detected_group_profile] * len(candidate_techniques), ignore_index= True)

    blank_labels = pd.DataFrame({'label': [-1]* len(candidate_techniques)})

    # make dataset for current group profile and candidate techniques
    test_dataset = build_dataset_3 (X_group_df= aligned_group_profile, X_technique_df= candidate_technique_features, y_df= blank_labels,
                                    selected_ragged_group_features = [f for f in detected_group_profile.columns if f not in ('group_ID', 'input_group_interaction_rate', 'input_group_description')],
                                    selected_ragged_technique_features = [f for f in candidate_technique_features if f not in ('technique_ID', 'input_technique_description', 'input_technique_interaction_rate')])
    test_dataset = test_dataset.batch(32)
    test_dataset.batch(32)
    results = []
    # model makes prediction
    # if the final prediction results in an empty list, keep decreasing the threshold
    current_prediction_threshold = prediction_threshold
    while len(results) == 0:
        results_logit = model.predict(test_dataset,verbose=0)
        results_prob = tf.keras.activations.sigmoid(results_logit)
        results_binary = np.where(results_prob >= current_prediction_threshold, 1, 0)
        results_binary = results_binary.flatten().tolist()
        # convert binary prediction to technique names
        results = [technique for binary_val, technique in zip (results_binary, candidate_techniques) if binary_val == 1.0]
        current_prediction_threshold *= 0.99
    
    test_data_with_preds.at[index, 'predicted_techniques'] = results


In [15]:
test_data_with_preds['accuracy'] = None
test_data_with_preds['precision'] = None
test_data_with_preds['recall'] = None
for index, row in test_data_with_preds.iterrows():
    true_values = row['true_subsequent_techniques']
    predicted_values = row['predicted_techniques']
    correct_predictions = [1 if val in true_values else 0 for val in predicted_values]
    
    accuracy = sum(correct_predictions) / len(predicted_values)
    true_positives = sum([1 for val in predicted_values if val in true_values])
    false_positives = len(predicted_values) - true_positives
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / len(true_values) if len(true_values) > 0 else 0  
    test_data_with_preds.at[index,'accuracy'] = accuracy
    test_data_with_preds.at[index,'precision'] = precision
    test_data_with_preds.at[index,'recall'] = recall

In [22]:
"""
reports = [
    {'code': 'aa23-339a', 'passive_techniques': ['T1046']},
    {'code': 'aa23-201a', 'passive_techniques': ['T1584', 'T1588.002']},
    {'code': 'aa22-320a', 'passive_techniques': None},
    {'code': 'aa23-074a', 'passive_techniques': ['T1059.006', 'T1190', 'T1105', 'T1505.003', 'T1595.002']}
]
"""
test_data_with_preds[test_data_with_preds['group_ID']== 'aa23-201a']

Unnamed: 0,group_ID,detected_techniques,candidate_techniques,true_subsequent_techniques,predicted_techniques,accuracy,precision,recall
20,aa23-201a,[T1190],"[T1548, T1548.002, T1548.003, T1134, T1087.002...","[T1059.004, T1106, T1547, T1505.003, T1548.001...","[T1071.001, T1547.001, T1059.001, T1059.005, T...",0.259259,0.259259,0.21875
21,aa23-201a,"[T1190, T1059.004]","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1106, T1547, T1505.003, T1548.001, T1140, T1...","[T1071.001, T1547.001, T1059.001, T1059.005, T...",0.269231,0.269231,0.225806
22,aa23-201a,"[T1190, T1059.004, T1106]","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1547, T1505.003, T1548.001, T1140, T1222.002...","[T1071.001, T1547.001, T1059.001, T1059.005, T...",0.28,0.28,0.233333
23,aa23-201a,"[T1190, T1059.004, T1106, T1547]","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1505.003, T1548.001, T1140, T1222.002, T1070...","[T1071.001, T1547.001, T1005, T1140, T1083, T1...",0.368421,0.368421,0.241379
24,aa23-201a,"[T1190, T1059.004, T1106, T1547, T1505.003]","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1548.001, T1140, T1222.002, T1070, T1070.002...","[T1105, T1036.005]",1.0,1.0,0.071429
25,aa23-201a,"[T1190, T1059.004, T1106, T1547, T1505.003, T1...","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1140, T1222.002, T1070, T1070.002, T1070.004...","[T1105, T1036.005]",1.0,1.0,0.074074
26,aa23-201a,"[T1190, T1059.004, T1106, T1547, T1505.003, T1...","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1222.002, T1070, T1070.002, T1070.004, T1036...","[T1071.001, T1105, T1036.005]",0.666667,0.666667,0.076923
27,aa23-201a,"[T1190, T1059.004, T1106, T1547, T1505.003, T1...","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1070, T1070.002, T1070.004, T1036, T1036.005...","[T1071.001, T1105, T1036.005]",0.666667,0.666667,0.08
28,aa23-201a,"[T1190, T1059.004, T1106, T1547, T1505.003, T1...","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1070.002, T1070.004, T1036, T1036.005, T1036...","[T1071.001, T1105, T1036.005]",0.666667,0.666667,0.083333
29,aa23-201a,"[T1190, T1059.004, T1106, T1547, T1505.003, T1...","[T1548, T1548.002, T1548.004, T1548.001, T1548...","[T1070.004, T1036, T1036.005, T1036.008, T1552...","[T1071.001, T1105, T1036.005]",0.666667,0.666667,0.086957
