# Dataset Exclusion Generator

This notebook generates the datasets of the following variation(s):
- Dataset whose malware and benign samples contain at least 1 API Call that is exclusive to each (for training and testing)

## 1. Import Dataset

In [1]:
from joblib import load
import numpy as np
import warnings
warnings.filterwarnings('ignore')

#Load list of API calls
API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

In [2]:
benign_exclusive = ['CryptProtectMemory','FindFirstFileExA','GetFileVersionInfoExW','GetFileVersionInfoSizeExW','GetUserNameExW','IWbemServices_ExecMethod','listen','NtDeleteFile','NtLoadKey','recvfrom','RegDeleteKeyA','WriteConsoleW']
malicious_exclusive = ['connect','ControlService','CopyFileA','CopyFileExW','CreateJobObjectW','CreateRemoteThread','CreateServiceA','CreateServiceW','CryptDecodeObjectEx','CryptDecrypt','CryptEncrypt','CryptProtectData','DeleteUrlCacheEntryA','DeleteUrlCacheEntryW','DnsQuery_A','EnumServicesStatusA','getaddrinfo','GetAddrInfoW','GetBestInterfaceEx','GetDiskFreeSpaceW','gethostbyname','GetUserNameExA','HttpOpenRequestA','HttpOpenRequestW','HttpQueryInfoA','HttpSendRequestA','InternetCloseHandle','InternetConnectA','InternetConnectW','InternetCrackUrlA','InternetGetConnectedState','InternetOpenA','InternetOpenUrlA','InternetOpenUrlW','InternetOpenW','InternetQueryOptionA','InternetReadFile','InternetSetStatusCallback','MoveFileWithProgressW','NtGetContextThread','NtReadVirtualMemory','NtSetContextThread','NtSuspendThread','NtTerminateThread','NtWriteVirtualMemory','ObtainUserAgentString','OpenSCManagerA','OpenServiceA','recv','RegDeleteKeyW','RtlDecompressBuffer','RtlRemoveVectoredExceptionHandler','send','SetFileInformationByHandle','SetFileTime','SetInformationJobObject','shutdown','StartServiceA','WSARecv','WSASocketA']

print("# Benign Exclusive APIs:", len(benign_exclusive))
print("# Malicious Exclusive APIs:", len(malicious_exclusive))

# Benign Exclusive APIs: 12
# Malicious Exclusive APIs: 60


In [3]:
def complete_validation(df):
    collector = []
    for i in df.columns[1:101]:
        collector += df[i].to_list()
    return len(pd.Series(collector).unique())

In [4]:
import pandas as pd
df = pd.read_csv('../Dataset/oliveira_labelled_str.csv')
df
print(f"Raw Dataset (wo False Labelled Malicious Samples) Unique APIs: {complete_validation(df)}/307")

Raw Dataset (wo False Labelled Malicious Samples) Unique APIs: 260/307


It turns out that the real number of API Calls in use by the samples is only **260 out of 307** which indicates that the end product dataset of the dataset's author had changes when they initially made it and when it was released for public use. 

In [5]:
#Loading existing test data (mixed)
lgbm_test = pd.read_csv("LGBM_TB_Test.csv")
catb_test = pd.read_csv("CATB_TB_Test.csv")

print(f"Original Training Dataset Unique APIs: {complete_validation(pd.read_csv('./CATB_TB.csv'))}/260")
print(f"Original Test Dataset Unique APIs: {complete_validation(pd.read_csv('./CATB_TB_Test.csv'))}/260")

Original Training Dataset Unique APIs: 259/260
Original Test Dataset Unique APIs: 223/260


## 2. How many are samples that contain at least 1 of its exclusive API Call?

In [6]:
def count_sample_exclusive(exclusive:list, malware:int):
    size = df[df['malware']==malware].shape[0]
    total = 0
    for x in exclusive:
        matches = df[df['malware']==malware]['pattern'].str.contains(x)
        raw_types = df[df['malware']==malware][df['pattern'].str.contains(x)]['type'].value_counts()
        types = ""
        for r in range(len(raw_types)):
            types += f"{raw_types.index.to_list()[r]} ({raw_types.to_list()[r]}) "
        print(f"{x:36s} {len(matches[matches==True]):4d} | {types}")
        total += len(matches[matches==True])
    print("")
    label = "Malicious"
    if malware == 0:
        label = "Benign"
    #print(f"{label} Samples with at least 1 Exclusive {label} API Call: {total} ({total/size*100:.4f}% of {label} samples)")

In [7]:
count_sample_exclusive(malicious_exclusive, 1)

connect                               190 | adware (103) trojan (62) pua (24) downloader (1) 
ControlService                          1 | trojan (1) 
CopyFileA                             108 | trojan (97) miner (8) virus (2) worm (1) 
CopyFileExW                            30 | trojan (28) pua (2) 
CreateJobObjectW                       32 | trojan (27) pua (3) adware (2) 
CreateRemoteThread                      2 | trojan (2) 
CreateServiceA                         35 | trojan (34) virus (1) 
CreateServiceW                          9 | trojan (9) 
CryptDecodeObjectEx                     2 | trojan (1) downloader (1) 
CryptDecrypt                         3001 | trojan (2551) pua (233) adware (120) downloader (97) 
CryptEncrypt                           39 | trojan (39) 
CryptProtectData                       27 | trojan (27) 
DeleteUrlCacheEntryA                    8 | trojan (6) miner (1) pua (1) 
DeleteUrlCacheEntryW                   22 | pua (20) trojan (2) 
DnsQuery_A            

In [8]:
count_sample_exclusive(benign_exclusive,0)

CryptProtectMemory                      1 | benign (1) 
FindFirstFileExA                        3 | benign (3) 
GetFileVersionInfoExW                   2 | benign (2) 
GetFileVersionInfoSizeExW               2 | benign (2) 
GetUserNameExW                          2 | benign (2) 
IWbemServices_ExecMethod                1 | benign (1) 
listen                                  2 | benign (2) 
NtDeleteFile                            1 | benign (1) 
NtLoadKey                               1 | benign (1) 
recvfrom                                1 | benign (1) 
RegDeleteKeyA                           2 | benign (2) 
WriteConsoleW                          16 | benign (16) 



## 3. Dataset Exclusivity Filtering

In [9]:
def filter_sample_exclusive():
    global benign_exclusive
    global malicious_excusive
    exclusive_df = None
    for x in benign_exclusive:
        exclusive_df = pd.concat([exclusive_df, df[df['pattern'].str.contains(x)]], axis=0)
    for x in malicious_exclusive:
        exclusive_df = pd.concat([exclusive_df, df[df['pattern'].str.contains(x)]], axis=0)
    exclusive_df.drop_duplicates(subset=['hash'], inplace=True, keep='first')
    remaining_df = pd.concat([exclusive_df, df], axis=0)
    remaining_df.drop_duplicates(subset=['hash'], inplace=True, keep=False)
    return exclusive_df, remaining_df

## 4. Dataset whose Malware and Benign samples contain at least 1 API Call exclusive to Malicious and Benign samples (for training & testing)

In [10]:
exclusive_str, remaining_str = filter_sample_exclusive()

In [11]:
print(f"# of samples that qualifies as \"Exclusive\": {exclusive_str.shape[0]} ({exclusive_str.shape[0]/df.shape[0]*100:.4f}%)")
print(f"# of samples that do not qualify as \"Exclusive\": {remaining_str.shape[0]} ({remaining_str.shape[0]/df.shape[0]*100:.4f}%)")

# of samples that qualifies as "Exclusive": 4289 (10.4011%)
# of samples that do not qualify as "Exclusive": 36934 (89.5674%)


In [12]:
print("Composition of samples that contain at least 1 API Call exclusive to it: ", exclusive_str['type'].value_counts())
print("\n")
display(exclusive_str)
display(remaining_str)

Composition of samples that contain at least 1 API Call exclusive to it:  type
trojan        3498
pua            333
adware         282
downloader     103
benign          31
ransomware      22
miner           14
virus            5
worm             1
Name: count, dtype: int64




Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
10798,c662cc346f4c041ee954b9a4333ed28d,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtAllocateVirtualMemory,GetFileAttributesExW,NtCreateFile,NtFreeVirtualMemory,GetFileType,NtClose,DeleteFileW,0,benign,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4489,3e122e753cd94f004797b3b4dddba96a,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,timeGetTime,NtAllocateVirtualMemory,LdrLoadDll,LdrUnloadDll,NtCreateFile,GetFileType,...,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
22077,204113cdf14320ee9db1f88a62a48019,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,timeGetTime,NtAllocateVirtualMemory,timeGetTime,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtCreateFile,...,NtClose,FindFirstFileExA,NtClose,NtCreateFile,GetFileType,NtAllocateVirtualMemory,NtReadFile,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
37146,00a1b66bd25adb59460871ee40e6ebc0,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,GetSystemWindowsDirectoryW,NtAllocateVirtualMemory,GetSystemDirectoryW,RegOpenKeyExA,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,...,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
16396,59147b8b8abf9768ca96badfd91d7bb9,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,NtClose,NtOpenKey,NtQueryValueKey,NtClose,...,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39220,a9009a2935ed87db57d544e28a99510b,GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,FindResourceExW,SetFileTime,FindResourceExW,NtOpenDirectoryObject,FindResourceExW,...,NtFreeVirtualMemory,NtClose,NtOpenFile,NtQueryInformationFile,NtAllocateVirtualMemory,NtReadFile,NtFreeVirtualMemory,1,trojan,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
39380,a3b81a0902ddbaccd03413c90b255387,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,GetTempPathW,SearchPathW,GetTempPathW,SearchPathW,GetTempPathW,SearchPathW,GetTempPathW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
40299,e501a7d4d95d9a87027be637376a329e,NtAllocateVirtualMemory,SetErrorMode,LoadStringA,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,...,NtAllocateVirtualMemory,RegOpenKeyExW,LdrGetDllHandle,FindResourceExW,LoadResource,FindResourceExW,LoadResource,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
40643,5d9c5d82d8070ef29ed745c15b1c0989,NtAllocateVirtualMemory,SetErrorMode,LoadStringA,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,...,FindResourceA,LoadStringA,LdrLoadDll,LdrGetProcedureAddress,__exception__,FindResourceExA,FindResourceA,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
0,071e8c3f8922e186e57548cd4c703a5d,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,1,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,1,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegQueryValueExA,RegCloseKey,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,1,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
4,c9b3700a77facf29172f32df6bc77f48,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryInfoKeyW,RegEnumKeyExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
5,cc6217be863e606e49da90fee2252f52,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetVolumeNameForVolumeMountPointW,LdrGetProcedureAddress,RegOpenKeyExW,RegCloseKey,1,trojan,"LdrGetProcedureAddress,NtAllocateVirtualMemory..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41231,e3d6d58faa040f0f9742c9d0eaf58be4,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41232,9b917bab7f32188ae40c744f2be9aaf8,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,GetSystemTimeAsFileTime,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41233,35a18ee05f75f04912018d9f462cb990,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41234,654139d715abcf7ecdddbef5a84f224b,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [13]:
def convert(api:str):
    return APIS.index(api)
def ordinal_encode(df):
    global APIS
    for j in range(1,101):
        df.iloc[:,j] = pd.Series(list(map(convert, df.iloc[:,j].to_list())))
    return df

exclusive_enc = ordinal_encode(exclusive_str.copy(deep=True))
remaining_enc = ordinal_encode(remaining_str.copy(deep=True))

In [14]:
display(exclusive_enc)
display(remaining_enc)

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
10798,c662cc346f4c041ee954b9a4333ed28d,82,240,117,240,117,240,117,240,117,...,208,73,297,187,93,215,127,0,benign,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4489,3e122e753cd94f004797b3b4dddba96a,82,16,208,57,208,240,50,297,93,...,215,281,215,281,215,281,215,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
22077,204113cdf14320ee9db1f88a62a48019,82,16,208,57,208,57,82,208,297,...,215,281,215,297,93,208,264,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
37146,00a1b66bd25adb59460871ee40e6ebc0,82,16,71,208,275,112,71,25,71,...,215,281,215,281,215,281,215,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
16396,59147b8b8abf9768ca96badfd91d7bb9,82,16,208,240,117,215,274,158,215,...,117,172,117,172,117,172,117,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39220,a9009a2935ed87db57d544e28a99510b,82,172,117,16,60,234,60,294,60,...,187,215,20,34,208,264,187,1,trojan,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
39380,a3b81a0902ddbaccd03413c90b255387,82,240,117,240,117,240,117,240,117,...,271,47,271,47,271,47,271,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
40299,e501a7d4d95d9a87027be637376a329e,208,286,76,110,240,117,208,187,208,...,208,260,172,60,81,60,81,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
40643,5d9c5d82d8070ef29ed745c15b1c0989,208,286,76,110,240,117,208,187,208,...,111,76,240,117,306,291,111,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,135,171,215,35,208,56,71,1,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,117,71,297,135,171,215,35,1,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,123,65,112,123,65,113,112,1,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,40,209,260,141,260,141,260,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
5,cc6217be863e606e49da90fee2252f52,117,208,117,208,117,240,117,240,117,...,260,141,65,9,117,260,65,1,trojan,"LdrGetProcedureAddress,NtAllocateVirtualMemory..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41231,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41232,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,82,159,224,82,159,224,82,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41233,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41234,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [15]:
# Validate if the contents of the dataset contain all 307 unique API Calls originally from the raw dataset (exclude 'NaN').
print(f"Exclusive Dataset: {complete_validation(exclusive_enc)}/260")
print(f"Common Dataset: {complete_validation(remaining_enc)}/260")

Exclusive Dataset: 248/260
Common Dataset: 184/260


## 5. Training on these datasets

In [16]:
import lightgbm as lgbm
import catboost as catb
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split

def get_indexes(numeric=False):
    indexes = []
    if numeric:
        for i in range(100):
            indexes.append(i)
    else:
        for i in range(100):
            indexes.append(f"t_{i}")
    return indexes

lgbm_model = lgbm.LGBMClassifier(random_state=1, n_jobs=0, verbose=-1, categorical_feature=get_indexes(True),
                                 boost_from_average=True, boosting_type='gbdt', class_weight='balanced', 
                                 data_sample_strategy='goss', enable_bundle=True, 
                                 objective='binary', tree_learner='feature', device='gpu')
catb_model = catb.CatBoostClassifier(random_state=1, thread_count=-1, verbose=0, cat_features=get_indexes(), 
                                     nan_mode='Min', one_hot_max_size=256,
                                     boosting_type='Ordered', bootstrap_type='Bayesian', 
                                     grow_policy='SymmetricTree', objective='Logloss', task_type='GPU')

In [17]:
def exclusivity_train_test(model, train:pd.DataFrame, test:pd.DataFrame, encoded:bool, model_name, test_ratio):
    print(f"{model_name}\n")
    X_train, X_test, y_train, y_test = None, None, None, None
    if encoded:
        X_train, X_test, y_train, y_test = train_test_split(train.iloc[:,1:101].astype('int64'), train['malware'], test_size=test_ratio, random_state=1)
    else:
        X_train, X_test, y_train, y_test = train_test_split(train.iloc[:,1:101], train['malware'], test_size=test_ratio, random_state=1)
    model.fit(X_train, y_train)
    if encoded:
        print(f"Testing on Test Split ({test_ratio*100:.0f}%) of Training Data:\n", metrics.classification_report(y_test.astype('int64'), model.predict(X_test.astype('int64')), digits=4),"\n")
        print("Testing on \"External\" Test Dataset:\n", metrics.classification_report(test['malware'].astype('int64'), model.predict(test.iloc[:, 1:101].astype('int64')), digits=4),"\n")
    else:
        print(f"Testing on Test Split ({test_ratio*100:.0f}%) of Training Data:\n", metrics.classification_report(y_test, model.predict(X_test), digits=4),"\n")
        print("Testing on \"External\" Test Dataset:\n", metrics.classification_report(test['malware'], model.predict(test.iloc[:, 1:101]), digits=4),"\n")

In [18]:
exclusivity_train_test(lgbm_model, exclusive_enc, remaining_enc, True, "LightGBM", 0.10)

LightGBM

Testing on Test Split (10%) of Training Data:
               precision    recall  f1-score   support

           0     0.7500    0.7500    0.7500         4
           1     0.9976    0.9976    0.9976       425

    accuracy                         0.9953       429
   macro avg     0.8738    0.8738    0.8738       429
weighted avg     0.9953    0.9953    0.9953       429
 

Testing on "External" Test Dataset:
               precision    recall  f1-score   support

           0     0.8655    0.0997    0.1788      1033
           1     0.9747    0.9996    0.9870     35901

    accuracy                         0.9744     36934
   macro avg     0.9201    0.5496    0.5829     36934
weighted avg     0.9717    0.9744    0.9644     36934
 



In [19]:
exclusivity_train_test(lgbm_model, exclusive_enc, lgbm_test, True, "LightGBM", 0.10)

LightGBM

Testing on Test Split (10%) of Training Data:
               precision    recall  f1-score   support

           0     0.7500    0.7500    0.7500         4
           1     0.9976    0.9976    0.9976       425

    accuracy                         0.9953       429
   macro avg     0.8738    0.8738    0.8738       429
weighted avg     0.9953    0.9953    0.9953       429
 

Testing on "External" Test Dataset:
               precision    recall  f1-score   support

           0     1.0000    0.0965    0.1760       114
           1     0.9750    1.0000    0.9873      4010

    accuracy                         0.9750      4124
   macro avg     0.9875    0.5482    0.5817      4124
weighted avg     0.9756    0.9750    0.9649      4124
 



In [20]:
exclusivity_train_test(catb_model, exclusive_str, remaining_str, False, "CatBoost", 0.10)

CatBoost

Testing on Test Split (10%) of Training Data:
               precision    recall  f1-score   support

           0     0.6667    1.0000    0.8000         4
           1     1.0000    0.9953    0.9976       425

    accuracy                         0.9953       429
   macro avg     0.8333    0.9976    0.8988       429
weighted avg     0.9969    0.9953    0.9958       429
 

Testing on "External" Test Dataset:
               precision    recall  f1-score   support

           0     0.7569    0.1597    0.2638      1033
           1     0.9764    0.9985    0.9873     35901

    accuracy                         0.9751     36934
   macro avg     0.8666    0.5791    0.6256     36934
weighted avg     0.9702    0.9751    0.9671     36934
 



In [21]:
exclusivity_train_test(catb_model, exclusive_str, catb_test, False, "CatBoost", 0.10)

CatBoost

Testing on Test Split (10%) of Training Data:
               precision    recall  f1-score   support

           0     0.6667    1.0000    0.8000         4
           1     1.0000    0.9953    0.9976       425

    accuracy                         0.9953       429
   macro avg     0.8333    0.9976    0.8988       429
weighted avg     0.9969    0.9953    0.9958       429
 

Testing on "External" Test Dataset:
               precision    recall  f1-score   support

           0     0.8571    0.1579    0.2667       114
           1     0.9766    0.9993    0.9878      4010

    accuracy                         0.9760      4124
   macro avg     0.9169    0.5786    0.6272      4124
weighted avg     0.9733    0.9760    0.9679      4124
 



## 6. Predicting on previously trained models

This is to determine if the performance is reliant on the existence of those exclusive API Calls.

As there are no significant differences in terms of performance of the model between time-based and instance-based behaviors, only models trained on time-based behaviors will be tested. 

In [22]:
#Opening existing trained models
lgbm_model = load('Models/LGBM/Default/RYZEN3b_LGBM_TB.model') # <== Point these to the respective .model files
catb_model = catb.CatBoostClassifier()
catb_model = catb_model.load_model("Models/CATB/Default/RYZEN3b_CATB_TB.model", format='json') # <== Point these to the respective .model files

def exclusive_test(input_df, model,encoded:bool):
    if encoded:
        y_pred = model.predict(input_df.iloc[:,1:101].astype('int64'))
    else:
        y_pred = model.predict(input_df.iloc[:,1:101])
    print(metrics.classification_report(input_df['malware'], y_pred, digits=4))

### 6.1. Checking performance on mixed input data (i.e., original test data).

In [23]:
exclusive_test(lgbm_test, lgbm_model, True)

              precision    recall  f1-score   support

           0     0.9851    0.5789    0.7293       114
           1     0.9882    0.9998    0.9939      4010

    accuracy                         0.9881      4124
   macro avg     0.9866    0.7893    0.8616      4124
weighted avg     0.9881    0.9881    0.9866      4124



In [24]:
exclusive_test(catb_test, catb_model, False)

              precision    recall  f1-score   support

           0     0.9571    0.5877    0.7283       114
           1     0.9884    0.9993    0.9938      4010

    accuracy                         0.9879      4124
   macro avg     0.9728    0.7935    0.8610      4124
weighted avg     0.9875    0.9879    0.9865      4124



### 6.2. Checking performance on exclusive and non-exclusive data (i.e., exclusivity data).

In [25]:
exclusive_test(exclusive_enc, lgbm_model, True)

              precision    recall  f1-score   support

           0     0.8889    0.7742    0.8276        31
           1     0.9984    0.9993    0.9988      4258

    accuracy                         0.9977      4289
   macro avg     0.9436    0.8867    0.9132      4289
weighted avg     0.9976    0.9977    0.9976      4289



In [26]:
exclusive_test(remaining_enc, lgbm_model, True)

              precision    recall  f1-score   support

           0     0.9825    0.8141    0.8904      1033
           1     0.9947    0.9996    0.9971     35901

    accuracy                         0.9944     36934
   macro avg     0.9886    0.9069    0.9438     36934
weighted avg     0.9943    0.9944    0.9941     36934



In [27]:
exclusive_test(exclusive_str, catb_model, False)

              precision    recall  f1-score   support

           0     0.8235    0.9032    0.8615        31
           1     0.9993    0.9986    0.9989      4258

    accuracy                         0.9979      4289
   macro avg     0.9114    0.9509    0.9302      4289
weighted avg     0.9980    0.9979    0.9979      4289



In [28]:
exclusive_test(remaining_str, catb_model, False)

              precision    recall  f1-score   support

           0     0.9901    0.7754    0.8697      1033
           1     0.9936    0.9998    0.9967     35901

    accuracy                         0.9935     36934
   macro avg     0.9918    0.8876    0.9332     36934
weighted avg     0.9935    0.9935    0.9931     36934

