# Post THES3 Defense Notebook 8a

## Rationale:

**"Which malware families/types does the model mistake benign samples for?"**

- Take each benign hash (from the holdout/test set) and predict its label using the model.

- If the hash was a false positive (FP), cross-check it against the clustered hashes and ask: which types and families does the hash belong to?

- Tabulate and synthesize what the results suggest regarding model performance.

## Load Libraries

In [1]:
import pandas as pd
import lightgbm as lgbm
import catboost as catb

from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, average_precision_score, roc_auc_score, recall_score, precision_score, f1_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, RocCurveDisplay

from joblib import load
import warnings
warnings.filterwarnings("ignore")

## Load DF

In [2]:
# Read the LGBM test CSV, keep only the rows labelled benign (malware == 0),
# and renumber the surviving rows from 0.
lgbm_df = (
    pd.read_csv("../Dataset/TB/LGBM_TB_Test.csv", low_memory=False)
    .loc[lambda frame: frame['malware'] == 0]
    .reset_index()
    .drop(columns='index')
)
lgbm_df.head()

Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,0,286,172,117,275,228,208,187,208,240,...,260,141,65,260,215,240,117,71,cd72e12f040e951a2845c34edc8816f8,benign
1,0,82,16,240,117,31,260,240,260,208,...,65,215,56,208,187,208,172,117,3019981659828124a721a9237e2a73cf,benign
2,0,240,117,240,117,240,117,240,117,240,...,275,240,117,275,240,117,208,275,991beff24b33d1f4fcd44756a1c4b9de,benign
3,0,82,240,117,240,117,208,187,208,117,...,117,39,172,117,172,117,172,117,d82c23e820e6106ff24fceecd8e821fc,benign
4,0,82,16,86,25,60,81,60,81,208,...,215,286,106,171,260,240,117,260,6e51234733dec1e25f2fc3245aea3d7c,benign


In [3]:
# Read the CATB test CSV, keep only the rows labelled benign (malware == 0),
# and renumber the surviving rows from 0.
catb_df = (
    pd.read_csv("../Dataset/TB/CATB_TB_Test.csv", low_memory=False)
    .loc[lambda frame: frame['malware'] == 0]
    .reset_index()
    .drop(columns='index')
)
catb_df.head()

Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,0,SetErrorMode,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemDirectoryW,NtProtectVirtualMemory,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrLoadDll,...,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,NtClose,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,cd72e12f040e951a2845c34edc8816f8,benign
1,0,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,LdrLoadDll,LdrGetProcedureAddress,CoInitializeEx,RegOpenKeyExW,LdrLoadDll,RegOpenKeyExW,NtAllocateVirtualMemory,...,RegCloseKey,NtClose,CreateActCtxW,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,3019981659828124a721a9237e2a73cf,benign
2,0,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,GetSystemDirectoryW,LdrLoadDll,LdrGetProcedureAddress,GetSystemDirectoryW,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,GetSystemDirectoryW,991beff24b33d1f4fcd44756a1c4b9de,benign
3,0,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetProcedureAddress,...,LdrGetProcedureAddress,IsDebuggerPresent,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,d82c23e820e6106ff24fceecd8e821fc,benign
4,0,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtCreateMutant,LoadStringW,FindResourceExW,LoadResource,FindResourceExW,LoadResource,NtAllocateVirtualMemory,...,NtClose,SetErrorMode,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,6e51234733dec1e25f2fc3245aea3d7c,benign


In [4]:
# Build the hash -> pca_segment lookup: discard the columns at positions 1..100
# plus the label/component columns, so only `hash` and `pca_segment` are left.
pca_df = (
    pd.read_csv("../PCA/oliveira_pca.csv", low_memory=False)
    .pipe(lambda frame: frame.drop(columns=frame.columns[1:101].to_list(), inplace=False, errors='raise'))
    .drop(columns=['malware', 'type', 'type_int', 'comp_1', 'comp_2', 'comp_3'], inplace=False, errors='raise')
)
pca_df.head()

Unnamed: 0,hash,pca_segment
0,071e8c3f8922e186e57548cd4c703a5d,c_1
1,33f8e6d08a6aae939f25a8e0d63dd523,c_4
2,b68abd064e975e1c6d5f25e748663076,c_0
3,72049be7bd30ea61297ea624ae198067,c_6
4,c9b3700a77facf29172f32df6bc77f48,c_2


## Load Models

In [5]:
# NOTE: joblib.load unpickles the file, so only point it at model files you trust.
default_lgbm = load('../GBDT_Training/Outputs/LGBM/Default/RYZEN3b_LGBM_TB.model') # <== Point these to the respective .model files

# Create an empty CatBoost classifier and load the saved JSON model into it in one step.
default_catb = catb.CatBoostClassifier().load_model("../GBDT_Training/Outputs/CATB/Default/RYZEN3b_CATB_TB.model", format='json') # <== Point these to the respective .model files

## Prediction Function

In [6]:
def col_switch(df, column1, column2): # reference: https://stackoverflow.com/a/56693510
    """Return ``df`` with the positions of ``column1`` and ``column2`` exchanged.

    The input frame is not modified; a re-ordered selection is returned.
    Raises ``ValueError`` if either name is not among ``df``'s columns.
    """
    order = list(df.columns)
    first_pos = order.index(column1)
    second_pos = order.index(column2)
    # Put each name into the slot the other one occupied.
    order[first_pos], order[second_pos] = column2, column1
    return df[order]

def evaluate(model, df, segments=None):
    """Predict every row of ``df`` with ``model`` and tabulate hits and misses.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(features)`` that returns one label per row.
    df : pandas.DataFrame
        Must contain ``malware`` (true label), ``hash`` and ``type`` columns.
        The feature columns are taken by position: columns 1 through 100.
    segments : pandas.DataFrame, optional
        Lookup table with a ``hash`` column that is merged onto the result.
        Defaults to the notebook-level ``pca_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``hash``, ``type``, ``y_actual``, ``y_pred``, ``match`` followed
        by the columns contributed by ``segments``. ``match`` is True where the
        prediction equals the true label.

    Notes
    -----
    The ``match`` value counts are displayed *before* the merge. The merge is an
    inner join, so rows whose hash is absent from ``segments`` are dropped and
    the returned frame can be shorter than the displayed counts suggest.
    """
    features = df.iloc[:, 1:101]
    feature_cols = features.columns.to_list()

    # predict() returns results by position. Bind them to df's own index so the
    # column assignment below lines up row-for-row even when df does not carry
    # a fresh 0..n-1 index (e.g. a filtered frame that was never reset).
    y_pred = pd.Series(model.predict(features), dtype='Int8')
    y_pred.index = df.index

    inner_df = (
        df.drop(columns=feature_cols, errors='raise')
          .rename(columns={"malware": "y_actual"})
          .assign(y_pred=y_pred)
    )

    # Order columns by name instead of relying on positional swaps.
    leading = ['hash', 'type', 'y_actual', 'y_pred']
    trailing = [col for col in inner_df.columns if col not in leading]
    inner_df = inner_df[leading + trailing]

    # Vectorised comparison on named columns; cast to plain bool so the column
    # keeps the same dtype a list of Python booleans would give.
    inner_df = inner_df.assign(
        match=(inner_df['y_actual'] == inner_df['y_pred']).astype(bool)
    )
    display(inner_df["match"].value_counts())

    if segments is None:
        segments = pca_df  # notebook-level hash -> pca_segment table
    inner_df = pd.merge(inner_df, segments, on='hash', how='inner')
    display(inner_df)
    return inner_df

## Load Predictions

In [7]:
# Score the benign LGBM rows, then keep and save only the misclassified ones.
lgbm_predictions = evaluate(default_lgbm, lgbm_df)
lgbm_predictions = lgbm_predictions.loc[lgbm_predictions["match"].eq(False)] #Only "False" matches will be saved to file
lgbm_predictions.to_csv("lgbm_predictions_8a.csv", index=False)

match
True     68
False    45
Name: count, dtype: int64

Unnamed: 0,hash,type,y_actual,y_pred,match,pca_segment
0,cd72e12f040e951a2845c34edc8816f8,benign,0,1,False,c_0
1,3019981659828124a721a9237e2a73cf,benign,0,0,True,c_6
2,991beff24b33d1f4fcd44756a1c4b9de,benign,0,1,False,c_0
3,d82c23e820e6106ff24fceecd8e821fc,benign,0,1,False,c_4
4,6e51234733dec1e25f2fc3245aea3d7c,benign,0,0,True,c_0
...,...,...,...,...,...,...
108,8feb9a5080e1dc5ed66d47d867ee2c25,benign,0,0,True,c_6
109,1921a5574e07c3efbb41ed2ecc38ab1a,benign,0,1,False,c_0
110,b4165525fec4aa6b0c02b11f47693954,benign,0,0,True,c_6
111,e83d386e3568c67713af325872baf4c4,benign,0,1,False,c_4


In [8]:
# Score the benign CATB rows, then keep and save only the misclassified ones.
catb_predictions = evaluate(default_catb, catb_df)
catb_predictions = catb_predictions.loc[catb_predictions["match"].eq(False)] #Only "False" matches will be saved to file
catb_predictions.to_csv("catb_predictions_8a.csv", index=False)

match
True     76
False    37
Name: count, dtype: int64

Unnamed: 0,hash,type,y_actual,y_pred,match,pca_segment
0,cd72e12f040e951a2845c34edc8816f8,benign,0,1,False,c_0
1,3019981659828124a721a9237e2a73cf,benign,0,0,True,c_6
2,991beff24b33d1f4fcd44756a1c4b9de,benign,0,1,False,c_0
3,d82c23e820e6106ff24fceecd8e821fc,benign,0,1,False,c_4
4,6e51234733dec1e25f2fc3245aea3d7c,benign,0,0,True,c_0
...,...,...,...,...,...,...
108,8feb9a5080e1dc5ed66d47d867ee2c25,benign,0,0,True,c_6
109,1921a5574e07c3efbb41ed2ecc38ab1a,benign,0,1,False,c_0
110,b4165525fec4aa6b0c02b11f47693954,benign,0,0,True,c_6
111,e83d386e3568c67713af325872baf4c4,benign,0,1,False,c_4
