# Repeated Holdout Test

**Objectives**
- Load the Model(s)
- Run predictions on the model using the Training/Validation & Holdout/Test* data split using Stratified K-Folds CV (params: k=5 & random_seed=1)
- Record the performance of the models (e.g., accuracy, precision, recall, ROC-AUC, etc.) on every CV instance for use in analysis to determine the possible occurrence of overfitting.

*The Holdout/Test split used here is an addition to the original plan, which used only the Training/Validation data splits.*

**Essentially, this answers the question:** `Is the model overfitted to the training data?`

In [1]:
import pandas as pd

import lightgbm as lgbm
import catboost as catb
 
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report

from joblib import load

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Options shared by every CSV read in this cell.
read_opts = dict(low_memory=False)

# --- LightGBM datasets ---
#<== Point these to the appropriate Training+Validation datasets.
DF_LGBM_TB_TRAIN = pd.read_csv('./Dataset/LGBM_TB.csv', **read_opts)
DF_LGBM_IB_TRAIN = pd.read_csv('./Dataset/LGBM_IB.csv', **read_opts)
DF_LGBM_TB_TEST = pd.read_csv('./Dataset/LGBM_TB_Test.csv', **read_opts)
DF_LGBM_IB_TEST = pd.read_csv('./Dataset/LGBM_IB_Test.csv', **read_opts)

# Preview the first rows of each LightGBM frame.
for lgbm_frame in (DF_LGBM_TB_TRAIN, DF_LGBM_IB_TRAIN, DF_LGBM_TB_TEST, DF_LGBM_IB_TEST):
    display(lgbm_frame.head())

# --- CatBoost datasets ---
DF_CATB_TB_TRAIN = pd.read_csv('./Dataset/CATB_TB.csv', **read_opts)
DF_CATB_IB_TRAIN = pd.read_csv('./Dataset/CATB_IB.csv', **read_opts)
DF_CATB_TB_TEST = pd.read_csv('./Dataset/CATB_TB_Test.csv', **read_opts)
DF_CATB_IB_TEST = pd.read_csv('./Dataset/CATB_IB_Test.csv', **read_opts)

# Cast the feature columns (positions 1..100) of the CatBoost IB frames to strings,
# then rewrite the literal "nan" produced by that cast as "NaN" (frame-wide, in place).
for catb_ib_frame in (DF_CATB_IB_TRAIN, DF_CATB_IB_TEST):
    catb_ib_frame.iloc[:,1:101] = catb_ib_frame.iloc[:,1:101].astype('str')
    catb_ib_frame.replace("nan", "NaN", inplace=True)

# Preview the first rows of each CatBoost frame.
for catb_frame in (DF_CATB_TB_TRAIN, DF_CATB_IB_TRAIN, DF_CATB_TB_TEST, DF_CATB_IB_TEST):
    display(catb_frame.head())

Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,82,240,117,240,117,93,117,172,117,...,117,37,294,240,117,198,208,187,208,240
1,1,82,240,117,240,117,240,117,240,117,...,260,40,159,224,82,209,260,40,209,260
2,1,286,110,172,240,117,240,117,240,117,...,25,71,275,260,240,117,141,65,260,141
3,1,112,274,158,215,274,158,215,298,76,...,117,71,297,135,171,215,35,208,56,71
4,1,82,240,117,240,117,240,117,240,117,...,225,35,225,35,240,117,225,35,225,215


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,82,240,117,93,172,16,215,228,208,...,307,307,307,307,307,307,307,307,307,307
1,1,82,240,117,172,16,11,274,158,215,...,307,307,307,307,307,307,307,307,307,307
2,1,286,110,172,240,117,106,171,260,141,...,307,307,307,307,307,307,307,307,307,307
3,1,112,274,158,215,298,76,208,172,117,...,307,307,307,307,307,307,307,307,307,307
4,1,82,240,117,16,228,208,71,56,172,...,307,307,307,307,307,307,307,307,307,307


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,c0299a7bdcde86ded3c3afd350028c01,trojan
1,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,57cd6a362b9b3808663ebbc84ec17b2c,trojan
2,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,732d1b5f0c4c0d113cbebd5d3c519c44,trojan
3,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,aea1feacee06204634e843b49aeca4ee,trojan
4,1,286,110,172,240,117,240,117,240,117,...,275,260,240,117,141,65,260,141,0f7d52267f2d793bc374520a52a96ce0,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,112,274,158,215,298,76,208,172,117,...,307,307,307,307,307,307,307,307,c0299a7bdcde86ded3c3afd350028c01,trojan
1,1,82,240,117,172,16,11,274,158,215,...,307,307,307,307,307,307,307,307,57cd6a362b9b3808663ebbc84ec17b2c,trojan
2,1,82,240,117,172,16,11,274,158,215,...,307,307,307,307,307,307,307,307,732d1b5f0c4c0d113cbebd5d3c519c44,trojan
3,1,112,274,158,215,298,76,208,172,117,...,307,307,307,307,307,307,307,307,aea1feacee06204634e843b49aeca4ee,trojan
4,1,286,110,172,240,117,106,171,260,141,...,307,307,307,307,307,307,307,307,0f7d52267f2d793bc374520a52a96ce0,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,GetFileType,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,...,LdrGetProcedureAddress,NtOpenKeyEx,NtOpenDirectoryObject,LdrLoadDll,LdrGetProcedureAddress,GetSystemInfo,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrLoadDll
1,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryInfoKeyW,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumKeyExW,RegOpenKeyExW
2,1,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW,LdrLoadDll,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW
3,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW
4,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,DrawTextExW,GetSystemMetrics,DrawTextExW,GetSystemMetrics,LdrLoadDll,LdrGetProcedureAddress,DrawTextExW,GetSystemMetrics,DrawTextExW,NtClose


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,GetFileType,LdrGetDllHandle,SetUnhandledExceptionFilter,NtClose,NtProtectVirtualMemory,NtAllocateVirtualMemory,...,,,,,,,,,,
1,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,...,,,,,,,,,,
2,1,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,...,,,,,,,,,,
3,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,...,,,,,,,,,,
4,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,SetUnhandledExceptionFilter,NtProtectVirtualMemory,NtAllocateVirtualMemory,GetSystemWindowsDirectoryW,CreateActCtxW,LdrGetDllHandle,...,,,,,,,,,,


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,c0299a7bdcde86ded3c3afd350028c01,trojan
1,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,57cd6a362b9b3808663ebbc84ec17b2c,trojan
2,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,732d1b5f0c4c0d113cbebd5d3c519c44,trojan
3,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,aea1feacee06204634e843b49aeca4ee,trojan
4,1,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,GetSystemDirectoryW,RegOpenKeyExW,LdrLoadDll,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,0f7d52267f2d793bc374520a52a96ce0,trojan


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,...,,,,,,,,,c0299a7bdcde86ded3c3afd350028c01,trojan
1,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,...,,,,,,,,,57cd6a362b9b3808663ebbc84ec17b2c,trojan
2,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,SetUnhandledExceptionFilter,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,...,,,,,,,,,732d1b5f0c4c0d113cbebd5d3c519c44,trojan
3,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,...,,,,,,,,,aea1feacee06204634e843b49aeca4ee,trojan
4,1,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,...,,,,,,,,,0f7d52267f2d793bc374520a52a96ce0,trojan


### 0. Defining a helper function for evaluation

In [3]:
def stratified_kfolds_eval(train_set, test_set, model, n_splits=5, random_state=1, label_col='malware'):
    """Print classification reports of an already-fitted model per CV fold and on the test split.

    The model is never re-fitted here: `StratifiedKFold` is only used to carve
    `train_set` into `n_splits` stratified "holdout" folds, and the model predicts
    on each of them. A final report is printed for `test_set` so the per-fold
    results can be compared against the test-split result.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Training/validation data; must contain `label_col`, every other column
        is treated as a feature.
    test_set : pandas.DataFrame
        Test data; must contain `label_col` and every feature column of
        `train_set`. Any additional columns are ignored.
    model : object
        Fitted estimator exposing `predict(X)`.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 1
        Seed for the shuffled fold assignment.
    label_col : str, default 'malware'
        Name of the label column in both frames.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    features = train_set.drop(label_col, axis=1)
    labels = train_set[label_col]
    kfolds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    print('HOLDOUT FOLD PREDICTIONS')
    print('-------------------------------------------------------')
    # Only the holdout indices of each split are needed; the model is not re-trained.
    for fold_no, (_, holdout) in enumerate(kfolds.split(features, labels), start=1):
        X_holdout = features.iloc[holdout]
        Y_holdout = labels.iloc[holdout]
        Y_pred = model.predict(X_holdout)
        print(f"Fold: {fold_no}")
        print(classification_report(Y_holdout, Y_pred, digits=4))
        print('-------------------------------------------------------')

    #comparing each holdout fold result with test split result
    print('\nTEST SPLIT PREDICTIONS')
    print('-------------------------------------------------------')
    # Select test features/labels by name (same columns, same order as the training
    # features) instead of by hard-coded column positions.
    X_test = test_set.loc[:, features.columns]
    Y_test = test_set[label_col]
    Y_pred = model.predict(X_test)
    print(classification_report(Y_test, Y_pred, digits=4))
    print('-------------------------------------------------------')

### 1. LightGBM

In [4]:
# Load the four saved LightGBM models (default/tuned x TB/IB) with joblib.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
# NOTE: the relative paths must exist from the notebook's working directory; the
# recorded run of this cell stopped with FileNotFoundError on the first path.
default_tb = load('./Models/LGBM/Train_Default/DEMO_LGBM_TB.model')
default_ib = load('./Models/LGBM/Train_Default/DEMO_LGBM_IB.model')
tuned_tb = load('./Models/LGBM/Train_Tuned/TUNED_DEMO_LGBM_TB.model')
tuned_ib = load('./Models/LGBM/Train_Tuned/TUNED_DEMO_LGBM_IB.model')

# Per-fold reports on the TB train/validation data, followed by the TB test-split report.
print("1. Default LGBM TB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_LGBM_TB_TRAIN, DF_LGBM_TB_TEST, default_tb)

FileNotFoundError: [Errno 2] No such file or directory: './Models/LGBM/Train_Default/DEMO_LGBM_TB.model'

In [None]:
# Default LightGBM model on the IB data: per-fold reports, then the test-split report.
print("2. Default LGBM IB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_LGBM_IB_TRAIN, DF_LGBM_IB_TEST, default_ib)

In [None]:
# Tuned LightGBM model on the TB data: per-fold reports, then the test-split report.
print("3. Tuned LGBM TB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_LGBM_TB_TRAIN, DF_LGBM_TB_TEST, tuned_tb)

In [None]:
# Tuned LightGBM model on the IB data: per-fold reports, then the test-split report.
print("4. Tuned LGBM IB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_LGBM_IB_TRAIN, DF_LGBM_IB_TEST, tuned_ib)

### 2. CatBoost

In [None]:
# Rebind the model names used in the LightGBM section to the saved CatBoost models.
# Each model is a fresh CatBoostClassifier populated from its JSON model file;
# load_model's return value is what gets bound, as in the two-step form.
default_tb = catb.CatBoostClassifier().load_model(
    "./Models/CATB/Train_Default/DEMO_CATB_TB.model", format='json'
)
default_ib = catb.CatBoostClassifier().load_model(
    "./Models/CATB/Train_Default/DEMO_CATB_IB.model", format='json'
)
tuned_tb = catb.CatBoostClassifier().load_model(
    "./Models/CATB/Train_Tuned/TUNED_DEMO_CATB_TB.model", format='json'
)
tuned_ib = catb.CatBoostClassifier().load_model(
    "./Models/CATB/Train_Tuned/TUNED_DEMO_CATB_IB.model", format='json'
)

# Per-fold reports on the TB train/validation data, followed by the TB test-split report.
print("1. Default CatBoost TB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_CATB_TB_TRAIN, DF_CATB_TB_TEST, default_tb)

In [None]:
# Default CatBoost model on the IB data: per-fold reports, then the test-split report.
print("2. Default CatBoost IB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_CATB_IB_TRAIN, DF_CATB_IB_TEST, default_ib)

In [None]:
# Tuned CatBoost model on the TB data: per-fold reports, then the test-split report.
print("3. Tuned CatBoost TB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_CATB_TB_TRAIN, DF_CATB_TB_TEST, tuned_tb)

In [None]:
# Tuned CatBoost model on the IB data: per-fold reports, then the test-split report.
# The heading says "Tuned" because this cell evaluates `tuned_ib`, not the default model.
print("4. Tuned CatBoost IB, 'holdout' from train/validation split vs test split\n")
stratified_kfolds_eval(DF_CATB_IB_TRAIN, DF_CATB_IB_TEST, tuned_ib)