# Dataset Exclusion Generator

This notebook generates the datasets of the following variation(s):
- A dataset containing only the malware and benign samples that use at least one API call exclusive to their own class (for training and testing)

## Import Dataset

In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the list of known API calls; a name's position in APIS is its ordinal code.
API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the file even if reading raises.
with open(API_LIST, "r") as API_FILE:
    # Only the first line is read; it holds the comma-separated API names.
    # Dropping the line ending keeps the last name free of a trailing newline.
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

In [2]:
# API calls treated in this notebook as exclusive to benign samples.
benign_exclusive = [
    'CryptProtectMemory', 'FindFirstFileExA', 'GetFileVersionInfoExW',
    'GetFileVersionInfoSizeExW', 'GetUserNameExW', 'IWbemServices_ExecMethod',
    'listen', 'NtDeleteFile', 'NtLoadKey',
    'recvfrom', 'RegDeleteKeyA', 'WriteConsoleW',
]

# API calls treated in this notebook as exclusive to malicious samples.
malicious_exclusive = [
    'connect', 'ControlService', 'CopyFileA', 'CopyFileExW',
    'CreateJobObjectW', 'CreateRemoteThread', 'CreateServiceA', 'CreateServiceW',
    'CryptDecodeObjectEx', 'CryptDecrypt', 'CryptEncrypt', 'CryptProtectData',
    'DeleteUrlCacheEntryA', 'DeleteUrlCacheEntryW', 'DnsQuery_A', 'EnumServicesStatusA',
    'getaddrinfo', 'GetAddrInfoW', 'GetBestInterfaceEx', 'GetDiskFreeSpaceW',
    'gethostbyname', 'GetUserNameExA', 'HttpOpenRequestA', 'HttpOpenRequestW',
    'HttpQueryInfoA', 'HttpSendRequestA', 'InternetCloseHandle', 'InternetConnectA',
    'InternetConnectW', 'InternetCrackUrlA', 'InternetGetConnectedState', 'InternetOpenA',
    'InternetOpenUrlA', 'InternetOpenUrlW', 'InternetOpenW', 'InternetQueryOptionA',
    'InternetReadFile', 'InternetSetStatusCallback', 'MoveFileWithProgressW', 'NtGetContextThread',
    'NtReadVirtualMemory', 'NtSetContextThread', 'NtSuspendThread', 'NtTerminateThread',
    'NtWriteVirtualMemory', 'ObtainUserAgentString', 'OpenSCManagerA', 'OpenServiceA',
    'recv', 'RegDeleteKeyW', 'RtlDecompressBuffer', 'RtlRemoveVectoredExceptionHandler',
    'send', 'SetFileInformationByHandle', 'SetFileTime', 'SetInformationJobObject',
    'shutdown', 'StartServiceA', 'WSARecv', 'WSASocketA',
]

# Report the size of each list.
print("# Benign Exclusive APIs:", len(benign_exclusive))
print("# Malicious Exclusive APIs:", len(malicious_exclusive))

# Benign Exclusive APIs: 12
# Malicious Exclusive APIs: 60


In [3]:
def complete_validation(df):
    """Return how many distinct values occur in the API-call columns of ``df``.

    The API-call columns are taken by position: the columns at positions 1
    through 100 (fewer if the frame is narrower). All their cells are pooled
    into one sequence before counting, so a value is counted once no matter
    which column it appears in. A missing value, if present, counts as one
    distinct value.
    """
    call_columns = df.columns[1:101]
    pooled = [cell for name in call_columns for cell in df[name].to_list()]
    return pd.Series(pooled).nunique(dropna=False)

In [16]:
import pandas as pd

# Labelled dataset. Judging by the frames displayed further down, each row has
# a hash, the API-call columns t_0..t_99, and the malware/type/pattern columns.
df = pd.read_csv('../Dataset/oliveira_labelled_str.csv')
# Distinct API calls actually present in the samples, against the 307 listed ones.
print(f"Unique APIs: {complete_validation(df)}/307")

Unique APIs: 264/307


It turns out that only 264 of the 307 listed API calls are actually used by the samples, which suggests that the dataset changed between the time its author first built it and the time it was released for public use.

## How many samples contain at least one API call exclusive to their class?

In [5]:
def count_sample_exclusive(exclusive:list, malware:int):
    """Print, for every API call in ``exclusive``, how many samples of one class use it.

    Reads the notebook-level ``df`` and considers only its rows whose
    ``malware`` column equals ``malware``. One line is printed per API call:
    the name, the number of matching samples, and the per-``type`` breakdown
    of those samples. A blank line is printed at the end.

    No overall total is printed: adding up the per-API counts would count a
    sample once for every exclusive API call it uses.

    Args:
        exclusive: API call names to look up.
        malware: value of the ``malware`` column selecting the class to count.

    Returns:
        None. The result is the printed report.
    """
    import re  # local import: only needed to escape the API names below

    # Select the class once instead of once per API call.
    class_samples = df[df['malware'] == malware]
    for x in exclusive:
        # Match the name only as a complete entry of `pattern`, so 'recv' does
        # not also count samples that only call 'recvfrom'.
        # Assumes entries are separated by bare commas, as in the displayed frames.
        whole_entry = rf"(?:^|,){re.escape(x)}(?:,|$)"
        # The mask is built on the filtered frame, so it shares its index.
        matches = class_samples['pattern'].str.contains(whole_entry, regex=True, na=False)
        raw_types = class_samples.loc[matches, 'type'].value_counts()
        types = "".join(f"{kind} ({count}) " for kind, count in raw_types.items())
        print(f"{x:36s} {int(matches.sum()):4d} | {types}")
    print("")

In [6]:
# Malicious-exclusive API calls, counted over the rows with malware == 1.
count_sample_exclusive(malicious_exclusive, 1)

connect                               195 | adware (103) trojan (62) pua (24) _ (5) downloader (1) 
ControlService                          1 | trojan (1) 
CopyFileA                             110 | trojan (97) miner (8) _ (2) virus (2) worm (1) 
CopyFileExW                            30 | trojan (28) pua (2) 
CreateJobObjectW                       32 | trojan (27) pua (3) adware (2) 
CreateRemoteThread                      2 | trojan (2) 
CreateServiceA                         35 | trojan (34) virus (1) 
CreateServiceW                          9 | trojan (9) 
CryptDecodeObjectEx                     3 | trojan (1) _ (1) downloader (1) 
CryptDecrypt                         3004 | trojan (2551) pua (233) adware (120) downloader (97) _ (3) 
CryptEncrypt                           39 | trojan (39) 
CryptProtectData                       27 | trojan (27) 
DeleteUrlCacheEntryA                    8 | trojan (6) miner (1) pua (1) 
DeleteUrlCacheEntryW                   22 | pua (20) trojan (2)

In [7]:
# Benign-exclusive API calls, counted over the rows with malware == 0.
count_sample_exclusive(benign_exclusive,0)

CryptProtectMemory                      1 | benign (1) 
FindFirstFileExA                        3 | benign (3) 
GetFileVersionInfoExW                   2 | benign (2) 
GetFileVersionInfoSizeExW               2 | benign (2) 
GetUserNameExW                          2 | benign (2) 
IWbemServices_ExecMethod                1 | benign (1) 
listen                                  2 | benign (2) 
NtDeleteFile                            1 | benign (1) 
NtLoadKey                               1 | benign (1) 
recvfrom                                1 | benign (1) 
RegDeleteKeyA                           2 | benign (2) 
WriteConsoleW                          16 | benign (16) 



## Dataset Exclusivity Filtering

In [8]:
def filter_sample_exclusive():
    """Split the notebook-level ``df`` by use of an exclusive API call.

    A row is "exclusive" when its ``pattern`` contains, as a complete entry,
    at least one name from ``benign_exclusive`` or ``malicious_exclusive``.
    Rows are collected in list order (benign names first, then malicious
    names) and only the first row per ``hash`` is kept.

    Returns:
        A tuple ``(exclusive_df, remaining_df)``. ``remaining_df`` holds the
        rows of ``df`` whose hash is not in ``exclusive_df``, also with one row
        per hash. Both frames keep the original index labels of ``df``.
    """
    import re  # local import: only needed to escape the API names below

    matched_frames = []
    for x in list(benign_exclusive) + list(malicious_exclusive):
        # Match the name only as a complete entry of `pattern`, so 'recv' does
        # not pull in samples that only call 'recvfrom'.
        # Assumes entries are separated by bare commas, as in the displayed frames.
        whole_entry = rf"(?:^|,){re.escape(x)}(?:,|$)"
        hits = df['pattern'].str.contains(whole_entry, regex=True, na=False)
        matched_frames.append(df[hits])

    if matched_frames:
        # Concatenate once; growing a frame inside the loop copies it every time.
        exclusive_df = pd.concat(matched_frames, axis=0).drop_duplicates(subset=['hash'], keep='first')
    else:
        # No API names to look for: nothing is exclusive.
        exclusive_df = df.iloc[0:0].copy()

    # Everything not selected above. Keeping the first row per hash (instead of
    # dropping every duplicated hash) avoids losing samples from both outputs.
    in_exclusive = df['hash'].isin(exclusive_df['hash'])
    remaining_df = df[~in_exclusive].drop_duplicates(subset=['hash'], keep='first')
    return exclusive_df, remaining_df
def convert(api:str):
    """Return the ordinal code of ``api``, i.e. its position in ``APIS``.

    A missing value (``None`` or NaN) is mapped to the position of the
    ``DELIMITER`` label, which ``APIS`` carries for that purpose.

    Raises:
        ValueError: if ``api`` is not present in ``APIS``.
    """
    # NaN is the only value that is not equal to itself.
    if api is None or api != api:
        api = DELIMITER
    return APIS.index(api)
def ordinal_encode(df):
    """Return a copy of ``df`` with its API-call columns replaced by ordinal codes.

    The API-call columns are taken by position: the columns at positions 1
    through 100 (fewer if the frame is narrower). Every cell in them is passed
    through ``convert``; all other columns are left as they are.

    The input frame is not modified, so the same string-valued frame can be
    encoded again (for example when the calling cell is re-run).

    Args:
        df: frame whose first column is an identifier followed by the
            API-call columns.

    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    encoded = df.copy(deep=True)
    for name in encoded.columns[1:101]:
        # Series.map keeps the row index, so codes stay on their own rows.
        encoded[name] = encoded[name].map(convert)
    return encoded

## Dataset whose Malware and Benign samples contain at least 1 API Call exclusive to Malicious and Benign samples (for training & testing)

In [9]:
# Split df into the rows that match an exclusive API call and the rows left over.
exclusive_str, remaining_str = filter_sample_exclusive()

In [10]:
# Per-type sample counts of the exclusive subset, then both frames from the split.
print("Composition of samples that contain at least 1 API Call exclusive to it: ", exclusive_str['type'].value_counts())
print("\n")
display(exclusive_str)
display(remaining_str)

Composition of samples that contain at least 1 API Call exclusive to it:  type
trojan        3498
pua            333
adware         282
_              156
downloader     103
benign          31
ransomware      22
miner           14
virus            5
worm             1
Name: count, dtype: int64




Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
11404,c662cc346f4c041ee954b9a4333ed28d,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtAllocateVirtualMemory,GetFileAttributesExW,NtCreateFile,NtFreeVirtualMemory,GetFileType,NtClose,DeleteFileW,0,benign,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4760,3e122e753cd94f004797b3b4dddba96a,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,timeGetTime,NtAllocateVirtualMemory,LdrLoadDll,LdrUnloadDll,NtCreateFile,GetFileType,...,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
23426,204113cdf14320ee9db1f88a62a48019,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,timeGetTime,NtAllocateVirtualMemory,timeGetTime,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtCreateFile,...,NtClose,FindFirstFileExA,NtClose,NtCreateFile,GetFileType,NtAllocateVirtualMemory,NtReadFile,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
39519,00a1b66bd25adb59460871ee40e6ebc0,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,GetSystemWindowsDirectoryW,NtAllocateVirtualMemory,GetSystemDirectoryW,RegOpenKeyExA,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,...,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,FindFirstFileExA,NtClose,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
17375,59147b8b8abf9768ca96badfd91d7bb9,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,NtClose,NtOpenKey,NtQueryValueKey,NtClose,...,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42662,ede65bc1ae2f644f96a567a7fff0f7ee,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,GetFileType,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,RegOpenKeyExA,RegQueryValueExA,...,NtSetInformationFile,NtClose,NtOpenFile,NtQueryInformationFile,NtAllocateVirtualMemory,NtReadFile,NtFreeVirtualMemory,1,_,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
42882,e501a7d4d95d9a87027be637376a329e,NtAllocateVirtualMemory,SetErrorMode,LoadStringA,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,...,NtAllocateVirtualMemory,RegOpenKeyExW,LdrGetDllHandle,FindResourceExW,LoadResource,FindResourceExW,LoadResource,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
42956,2cae0bf3c1344031ef7951f4758c79f6,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,GetFileType,LdrGetDllHandle,LdrGetProcedureAddress,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,...,LoadResource,SizeofResource,FindResourceExW,LoadResource,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,1,_,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
43251,5d9c5d82d8070ef29ed745c15b1c0989,NtAllocateVirtualMemory,SetErrorMode,LoadStringA,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,...,FindResourceA,LoadStringA,LdrLoadDll,LdrGetProcedureAddress,__exception__,FindResourceExA,FindResourceA,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
0,071e8c3f8922e186e57548cd4c703a5d,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,1,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,1,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegQueryValueExA,RegCloseKey,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,1,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
4,c9b3700a77facf29172f32df6bc77f48,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryInfoKeyW,RegEnumKeyExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
5,cc6217be863e606e49da90fee2252f52,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetVolumeNameForVolumeMountPointW,LdrGetProcedureAddress,RegOpenKeyExW,RegCloseKey,1,trojan,"LdrGetProcedureAddress,NtAllocateVirtualMemory..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,e3d6d58faa040f0f9742c9d0eaf58be4,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
43872,9b917bab7f32188ae40c744f2be9aaf8,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,GetSystemTimeAsFileTime,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
43873,35a18ee05f75f04912018d9f462cb990,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
43874,654139d715abcf7ecdddbef5a84f224b,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [11]:
# Encode copies of the frames: the copy has to be taken before encoding so the
# string-valued frames stay intact and this cell can be re-run.
exclusive_enc = ordinal_encode(exclusive_str.copy(deep=True))
remaining_enc = ordinal_encode(remaining_str.copy(deep=True))

In [12]:
# Show both frames after encoding; the API-call cells now hold positions in APIS.
display(exclusive_enc)
display(remaining_enc)

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
11404,c662cc346f4c041ee954b9a4333ed28d,82,240,117,240,117,240,117,240,117,...,208,73,297,187,93,215,127,0,benign,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4760,3e122e753cd94f004797b3b4dddba96a,82,16,208,57,208,240,50,297,93,...,215,281,215,281,215,281,215,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
23426,204113cdf14320ee9db1f88a62a48019,82,16,208,57,208,57,82,208,297,...,215,281,215,297,93,208,264,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
39519,00a1b66bd25adb59460871ee40e6ebc0,82,16,71,208,275,112,71,25,71,...,215,281,215,281,215,281,215,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
17375,59147b8b8abf9768ca96badfd91d7bb9,82,16,208,240,117,215,274,158,215,...,117,172,117,172,117,172,117,0,benign,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42662,ede65bc1ae2f644f96a567a7fff0f7ee,208,187,208,93,172,117,16,112,123,...,119,215,20,34,208,264,187,1,_,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
42882,e501a7d4d95d9a87027be637376a329e,208,286,76,110,240,117,208,187,208,...,208,260,172,60,81,60,81,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
42956,2cae0bf3c1344031ef7951f4758c79f6,208,187,208,93,172,117,112,123,65,...,81,140,60,81,208,187,208,1,_,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
43251,5d9c5d82d8070ef29ed745c15b1c0989,208,286,76,110,240,117,208,187,208,...,111,76,240,117,306,291,111,1,trojan,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,135,171,215,35,208,56,71,1,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,117,71,297,135,171,215,35,1,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,123,65,112,123,65,113,112,1,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,40,209,260,141,260,141,260,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
5,cc6217be863e606e49da90fee2252f52,117,208,117,208,117,240,117,240,117,...,260,141,65,9,117,260,65,1,trojan,"LdrGetProcedureAddress,NtAllocateVirtualMemory..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
43872,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,82,159,224,82,159,224,82,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
43873,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
43874,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [13]:
# Check how many distinct API calls each subset contains, against the 307
# unique API calls of the raw dataset (the 'NaN' label is not one of the 307).
print(f"Exclusive: {complete_validation(exclusive_enc)}/307")
print(f"Common: {complete_validation(remaining_enc)}/307")

Exclusive: 251/307
Common: 187/307


In [14]:
# Reference point: distinct API calls across the full dataset in df.
# Alternative kept for reference (counts over the two encoded subsets combined):
# complete_validation(pd.concat([exclusive_enc, remaining_enc], axis=0))
complete_validation(df)

264

Due to these results, it is apparent that model training on these exclusivity datasets won't be possible, because neither subset covers the full set of API calls (251/307 and 187/307 respectively).

## Testing these datasets as training (`exclusive_xx`) and test (`remaining_xx`)

This is to determine if the performance is reliant on the existence of those exclusive API Calls.