# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import time

start_time = time.time()

#Evaluation and Testing Systems
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

#ML Models
from sklearn.ensemble import AdaBoostClassifier #AdaBoost
import xgboost as xgb #XGBoost; Install via pip first
import lightgbm as lgb #LightGBM; pip install lightgbm

# Importing Raw Datasets

## MalBehavD-V1

**Dataset URL:** https://github.com/mpasco/MalbehavD-V1

In [2]:
malbehavd = pd.read_csv("../Datasets/MalBehavD_2022/MalBehavD-V1-dataset.csv")
malbehavd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2570 entries, 0 to 2569
Columns: 177 entries, sha256 to Unnamed: 176
dtypes: int64(1), object(176)
memory usage: 3.5+ MB


## Oliveira Dataset

**Dataset URL:** https://ieee-dataport.org/open-access/malware-analysis-datasets-api-call-sequences

**Modification:** The version of the dataset here was converted from the

In [3]:
og_oliveira = pd.read_csv("../Datasets/Oliveira_2019/dynamic_api_call_sequence_per_malware_100_0_306.csv")
og_oliveira.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43876 entries, 0 to 43875
Columns: 102 entries, hash to malware
dtypes: int64(101), object(1)
memory usage: 34.1+ MB


## Catak (2021)

**Dataset URL:** https://github.com/ocatak/malware_api_class/

**Note:** The Catak Dataset is designed more for malware classification than detection, hence the first column for the pre-cleaned dataset is designated as a the malware type column.

In [4]:
og_catak = pd.read_csv("../Datasets/Catak_etal_2021/Original/all_analysis_data.csv")
og_catak.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7106 entries, 0 to 7105
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

# Cleaning & Examining Datasets

## MalBehavD-V1

**Dataset URL:** https://github.com/mpasco/MalbehavD-V1

### 1. Entire Dataset

In [5]:
malbehavd

Unnamed: 0,sha256,labels,0,1,2,3,4,5,6,7,...,Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171,Unnamed: 172,Unnamed: 173,Unnamed: 174,Unnamed: 175,Unnamed: 176
0,5c18291c481a192ed5003084dab2d8a117fd3736359218...,0,LdrUnloadDll,CoUninitialize,NtQueryKey,NtDuplicateObject,GetShortPathNameW,GetSystemInfo,IsDebuggerPresent,GetSystemWindowsDirectoryW,...,,,,,,,,,,
1,4683faf3da550ffb594cf5513c4cbb34f64df85f27fd1c...,0,NtOpenMutant,GetForegroundWindow,NtQueryKey,DrawTextExW,NtSetInformationFile,RegQueryValueExA,LdrGetProcedureAddress,CoUninitialize,...,,,,,,,,,,
2,9a0aea1c7290031d7c3429d0e921f107282cc6eab854ee...,0,GetForegroundWindow,DrawTextExW,GetSystemInfo,IsDebuggerPresent,GetSystemWindowsDirectoryW,NtQueryValueKey,RegCloseKey,GetFileAttributesW,...,,,,,,,,,,
3,e0f3e4d5f50afd9c31e51dd9941c5a52d57c7c524f5d11...,0,NtQueryValueKey,LdrUnloadDll,GlobalMemoryStatus,WriteConsoleA,NtOpenKey,LdrGetProcedureAddress,NtTerminateProcess,NtClose,...,,,,,,,,,,
4,ec2b6d29992f13e74015ff0b129150b4afae15c593e4b7...,0,LdrUnloadDll,GetSystemTimeAsFileTime,NtOpenKey,WSAStartup,SetUnhandledExceptionFilter,NtTerminateProcess,NtClose,NtAllocateVirtualMemory,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2565,ed6a3fc04af435176b9c2f3024eb53c31d1e522da52c5c...,1,CreateToolhelp32Snapshot,GetCursorPos,CoUninitialize,RegCloseKey,LdrUnloadDll,DrawTextExW,NtSetInformationFile,CopyFileA,...,,,,,,,,,,
2566,ed5d70a13633a46355c0c2f9905ba29b7b74dfdb4db321...,1,NtDuplicateObject,RegCloseKey,LdrUnloadDll,NtSetInformationFile,RegQueryValueExA,NtTerminateProcess,NtQueryValueKey,RegQueryValueExW,...,,,,,,,,,,
2567,ed5addbdbe5f56f108530148c71ab7db806ac9324395d0...,1,GetCursorPos,NtOpenSection,CoUninitialize,RegCloseKey,LdrUnloadDll,GetSystemInfo,RegQueryValueExA,NtTerminateProcess,...,,,,,,,,,,
2568,ed4f4518e3120a4fd8ff6c61bf072d4de60264711a9196...,1,NtAllocateVirtualMemory,LdrGetProcedureAddress,SetUnhandledExceptionFilter,GetFileType,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetDllHandle,NtProtectVirtualMemory,...,,,,,,,,,,


### 2. Features Only

In [6]:
featr = malbehavd.iloc[:, 2:177]
featr

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171,Unnamed: 172,Unnamed: 173,Unnamed: 174,Unnamed: 175,Unnamed: 176
0,LdrUnloadDll,CoUninitialize,NtQueryKey,NtDuplicateObject,GetShortPathNameW,GetSystemInfo,IsDebuggerPresent,GetSystemWindowsDirectoryW,NtClose,GetFileVersionInfoSizeW,...,,,,,,,,,,
1,NtOpenMutant,GetForegroundWindow,NtQueryKey,DrawTextExW,NtSetInformationFile,RegQueryValueExA,LdrGetProcedureAddress,CoUninitialize,NtQueryValueKey,RegCloseKey,...,,,,,,,,,,
2,GetForegroundWindow,DrawTextExW,GetSystemInfo,IsDebuggerPresent,GetSystemWindowsDirectoryW,NtQueryValueKey,RegCloseKey,GetFileAttributesW,RegQueryValueExW,NtMapViewOfSection,...,,,,,,,,,,
3,NtQueryValueKey,LdrUnloadDll,GlobalMemoryStatus,WriteConsoleA,NtOpenKey,LdrGetProcedureAddress,NtTerminateProcess,NtClose,NtAllocateVirtualMemory,LdrGetDllHandle,...,,,,,,,,,,
4,LdrUnloadDll,GetSystemTimeAsFileTime,NtOpenKey,WSAStartup,SetUnhandledExceptionFilter,NtTerminateProcess,NtClose,NtAllocateVirtualMemory,NtQueryAttributesFile,LdrGetDllHandle,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2565,CreateToolhelp32Snapshot,GetCursorPos,CoUninitialize,RegCloseKey,LdrUnloadDll,DrawTextExW,NtSetInformationFile,CopyFileA,GetSystemWindowsDirectoryW,NtQueryValueKey,...,,,,,,,,,,
2566,NtDuplicateObject,RegCloseKey,LdrUnloadDll,NtSetInformationFile,RegQueryValueExA,NtTerminateProcess,NtQueryValueKey,RegQueryValueExW,NtFreeVirtualMemory,NtCreateThreadEx,...,,,,,,,,,,
2567,GetCursorPos,NtOpenSection,CoUninitialize,RegCloseKey,LdrUnloadDll,GetSystemInfo,RegQueryValueExA,NtTerminateProcess,NtQueryValueKey,GetFileAttributesW,...,,,,,,,,,,
2568,NtAllocateVirtualMemory,LdrGetProcedureAddress,SetUnhandledExceptionFilter,GetFileType,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetDllHandle,NtProtectVirtualMemory,NtQueryValueKey,LdrUnloadDll,...,,,,,,,,,,


### 3. Unique API calls on MalBehavD-V1

In [7]:
combined_summary = []
featr['summary'] = featr.values.tolist()
for i in range(featr['summary'].shape[0]):
    length = len(featr['summary'].loc[i])
    for j in range(length-1):
        combined_summary.append(featr['summary'].loc[i][j])
print("combined_summary:", len(combined_summary))
malbd_featr = pd.Series(combined_summary).dropna().drop_duplicates().reset_index()
malbd_featr.rename(columns={'index': 'index', 0: 'api_calls'}, inplace=True)
malbd_featr.sort_values(inplace=True, by='api_calls')
malbd_featr.drop(axis=1, labels="index", inplace=True)
malbd_featr
#malbd_featr = pd.Series(combined_summary).dropna().drop_duplicates()

combined_summary: 447180


Unnamed: 0,api_calls
142,CertControlStore
245,CertCreateCertificateContext
138,CertOpenStore
277,CertOpenSystemStoreW
89,CoCreateInstance
...,...
264,sendto
83,setsockopt
222,shutdown
168,socket


In [8]:
#Write to file
f = open("MalbehavD_Features.txt", mode="w")
f.write(str(malbd_featr['api_calls'].tolist()))
f.flush()
f.close()

## Oliveira Dataset

**Dataset URL:** https://ieee-dataport.org/open-access/malware-analysis-datasets-api-call-sequences

**Modification:** The version of the dataset here was converted from the

### 1. Entire Dataset (Original)

In [9]:
mal_col = og_oliveira.pop('malware')
og_oliveira.insert(1, mal_col.name, mal_col)
og_oliveira

Unnamed: 0,hash,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,071e8c3f8922e186e57548cd4c703a5d,1,112,274,158,215,274,158,215,298,...,117,71,297,135,171,215,35,208,56,71
1,33f8e6d08a6aae939f25a8e0d63dd523,1,82,208,187,208,172,117,172,117,...,60,81,240,117,71,297,135,171,215,35
2,b68abd064e975e1c6d5f25e748663076,1,16,110,240,117,240,117,240,117,...,123,65,112,123,65,112,123,65,113,112
3,72049be7bd30ea61297ea624ae198067,1,82,208,187,208,172,117,172,117,...,215,208,302,208,302,187,208,302,228,302
4,c9b3700a77facf29172f32df6bc77f48,1,82,240,117,240,117,240,117,240,...,40,209,260,40,209,260,141,260,141,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,e3d6d58faa040f0f9742c9d0eaf58be4,1,82,240,117,240,117,240,117,240,...,260,141,260,141,260,141,260,141,260,141
43872,9b917bab7f32188ae40c744f2be9aaf8,1,82,240,117,240,117,240,117,240,...,82,159,224,82,159,224,82,159,224,82
43873,35a18ee05f75f04912018d9f462cb990,1,82,240,117,240,117,240,117,240,...,141,260,141,260,141,260,141,260,141,260
43874,654139d715abcf7ecdddbef5a84f224b,1,82,240,117,240,117,240,117,240,...,260,141,260,141,260,141,260,141,260,141


### 2. Entire Dataset (Pre-Cleaned)

**The code for pre-cleaning is as follows:**

```
#CONVERTS THE NUMBER NOTATION OF API CALLS IN OLVIEIRA (2019) DATASET TO ACTUAL API CALLS

import os

def cls():
    os.system('cls')

print("Group CDEF\n")

dataset = open(os.getcwd()+"\\Original\\dynamic_api_call_sequence_per_malware_100_0_306.csv", mode="r")
new_dataset = open("cleaned_dynamic_api_call_sequence_per_malware_100_0_306.csv", mode="w+")

oliveira_idx = ['NtOpenThread', 'ExitWindowsEx', 'FindResourceW', 'CryptExportKey', 'CreateRemoteThreadEx', 'MessageBoxTimeoutW', 'InternetCrackUrlW', 'StartServiceW', 'GetFileSize', 'GetVolumeNameForVolumeMountPointW', 'GetFileInformationByHandle', 'CryptAcquireContextW', 'RtlDecompressBuffer', 'SetWindowsHookExA', 'RegSetValueExW', 'LookupAccountSidW', 'SetUnhandledExceptionFilter', 'InternetConnectA', 'GetComputerNameW', 'RegEnumValueA', 'NtOpenFile', 'NtSaveKeyEx', 'HttpOpenRequestA', 'recv', 'GetFileSizeEx', 'LoadStringW', 'SetInformationJobObject', 'WSAConnect', 'CryptDecrypt', 'GetTimeZoneInformation', 'InternetOpenW', 'CoInitializeEx', 'CryptGenKey', 'GetAsyncKeyState', 'NtQueryInformationFile', 'GetSystemMetrics', 'NtDeleteValueKey', 'NtOpenKeyEx', 'sendto', 'IsDebuggerPresent', 'RegQueryInfoKeyW', 'NetShareEnum', 'InternetOpenUrlW', 'WSASocketA', 'CopyFileExW', 'connect', 'ShellExecuteExW', 'SearchPathW', 'GetUserNameA', 'InternetOpenUrlA', 'LdrUnloadDll', 'EnumServicesStatusW', 'EnumServicesStatusA', 'WSASend', 'CopyFileW', 'NtDeleteFile', 'CreateActCtxW', 'timeGetTime', 'MessageBoxTimeoutA', 'CreateServiceA', 'FindResourceExW', 'WSAAccept', 'InternetConnectW', 'HttpSendRequestA', 'GetVolumePathNameW', 'RegCloseKey', 'InternetGetConnectedStateExW', 'GetAdaptersInfo', 'shutdown', 'NtQueryMultipleValueKey', 'NtQueryKey', 'GetSystemWindowsDirectoryW', 'GlobalMemoryStatusEx', 'GetFileAttributesExW', 'OpenServiceW', 'getsockname', 'LoadStringA', 'UnhookWindowsHookEx', 'NtCreateUserProcess', 'Process32NextW', 'CreateThread', 'LoadResource', 'GetSystemTimeAsFileTime', 'SetStdHandle', 'CoCreateInstanceEx', 'GetSystemDirectoryA', 'NtCreateMutant', 'RegCreateKeyExW', 'IWbemServices_ExecQuery', 'NtDuplicateObject', 'Thread32First', 'OpenSCManagerW', 'CreateServiceW', 'GetFileType', 'MoveFileWithProgressW', 'NtDeviceIoControlFile', 'GetFileInformationByHandleEx', 'CopyFileA', 'NtLoadKey', 'GetNativeSystemInfo', 'NtOpenProcess', 'CryptUnprotectMemory', 'InternetWriteFile', 'ReadProcessMemory', 'gethostbyname', 'WSASendTo', 'NtOpenSection', 'listen', 'WSAStartup', 'socket', 'OleInitialize', 'FindResourceA', 'RegOpenKeyExA', 'RegEnumKeyExA', 'NtQueryDirectoryFile', 'CertOpenSystemStoreW', 'ControlService', 'LdrGetProcedureAddress', 'GlobalMemoryStatus', 'NtSetInformationFile', 'OutputDebugStringA', 'GetAdaptersAddresses', 'CoInitializeSecurity', 'RegQueryValueExA', 'NtQueryFullAttributesFile', 'DeviceIoControl', '__anomaly__', 'DeleteFileW', 'GetShortPathNameW', 'NtGetContextThread', 'GetKeyboardState', 'RemoveDirectoryA', 'InternetSetStatusCallback', 'NtResumeThread', 'SetFileInformationByHandle', 'NtCreateSection', 'NtQueueApcThread', 'accept', 'DecryptMessage', 'GetUserNameExW', 'SizeofResource', 'RegQueryValueExW', 'SetWindowsHookExW', 'HttpOpenRequestW', 'CreateDirectoryW', 'InternetOpenA', 'GetFileVersionInfoExW', 'FindWindowA', 'closesocket', 'RtlAddVectoredExceptionHandler', 'IWbemServices_ExecMethod', 'GetDiskFreeSpaceExW', 'TaskDialog', 'WriteConsoleW', 'CryptEncrypt', 'WSARecvFrom', 'NtOpenMutant', 'CoGetClassObject', 'NtQueryValueKey', 'NtDelayExecution', 'select', 'HttpQueryInfoA', 'GetVolumePathNamesForVolumeNameW', 'RegDeleteValueW', 'InternetCrackUrlA', 'OpenServiceA', 'InternetSetOptionA', 'CreateDirectoryExW', 'bind', 'NtShutdownSystem', 'DeleteUrlCacheEntryA', 'NtMapViewOfSection', 'LdrGetDllHandle', 'NtCreateKey', 'GetKeyState', 'CreateRemoteThread', 'NtEnumerateValueKey', 'SetFileAttributesW', 'NtUnmapViewOfSection', 'RegDeleteValueA', 'CreateJobObjectW', 'send', 'NtDeleteKey', 'SetEndOfFile', 'GetUserNameExA', 'GetComputerNameA', 'URLDownloadToFileW', 'NtFreeVirtualMemory', 'recvfrom', 'NtUnloadDriver', 'NtTerminateThread', 'CryptUnprotectData', 'NtCreateThreadEx', 'DeleteService', 'GetFileAttributesW', 'GetFileVersionInfoSizeExW', 'OpenSCManagerA', 'WriteProcessMemory', 'GetSystemInfo', 'SetFilePointer', 'Module32FirstW', 'ioctlsocket', 'RegEnumKeyW', 'RtlCompressBuffer', 'SendNotifyMessageW', 'GetAddrInfoW', 'CryptProtectData', 'Thread32Next', 'NtAllocateVirtualMemory', 'RegEnumKeyExW', 'RegSetValueExA', 'DrawTextExA', 'CreateToolhelp32Snapshot', 'FindWindowW', 'CoUninitialize', 'NtClose', 'WSARecv', 'CertOpenStore', 'InternetGetConnectedState', 'RtlAddVectoredContinueHandler', 'RegDeleteKeyW', 'SHGetSpecialFolderLocation', 'CreateProcessInternalW', 'NtCreateDirectoryObject', 'EnumWindows', 'DrawTextExW', 'RegEnumValueW', 'SendNotifyMessageA', 'NtProtectVirtualMemory', 'NetUserGetLocalGroups', 'GetUserNameW', 'WSASocketW', 'getaddrinfo', 'AssignProcessToJobObject', 'SetFileTime', 'WriteConsoleA', 'CryptDecodeObjectEx', 'EncryptMessage', 'system', 'NtSetContextThread', 'LdrLoadDll', 'InternetGetConnectedStateExA', 'RtlCreateUserThread', 'GetCursorPos', 'Module32NextW', 'RegCreateKeyExA', 'NtLoadDriver', 'NetUserGetInfo', 'SHGetFolderPathW', 'GetBestInterfaceEx', 'CertControlStore', 'StartServiceA', 'NtWriteFile', 'Process32FirstW', 'NtReadVirtualMemory', 'GetDiskFreeSpaceW', 'GetFileVersionInfoW', 'FindFirstFileExW', 'FindWindowExW', 'GetSystemWindowsDirectoryA', 'RegOpenKeyExW', 'CoCreateInstance', 'NtQuerySystemInformation', 'LookupPrivilegeValueW', 'NtReadFile', 'ReadCabinetState', 'GetForegroundWindow', 'InternetCloseHandle', 'FindWindowExA', 'ObtainUserAgentString', 'CryptCreateHash', 'GetTempPathW', 'CryptProtectMemory', 'NetGetJoinInformation', 'NtOpenKey', 'GetSystemDirectoryW', 'DnsQuery_A', 'RegQueryInfoKeyA', 'NtEnumerateKey', 'RegisterHotKey', 'RemoveDirectoryW', 'FindFirstFileExA', 'CertOpenSystemStoreA', 'NtTerminateProcess', 'NtSetValueKey', 'CryptAcquireContextA', 'SetErrorMode', 'UuidCreate', 'RtlRemoveVectoredExceptionHandler', 'RegDeleteKeyA', 'setsockopt', 'FindResourceExA', 'NtSuspendThread', 'GetFileVersionInfoSizeW', 'NtOpenDirectoryObject', 'InternetQueryOptionA', 'InternetReadFile', 'NtCreateFile', 'NtQueryAttributesFile', 'HttpSendRequestW', 'CryptHashMessage', 'CryptHashData', 'NtWriteVirtualMemory', 'SetFilePointerEx', 'CertCreateCertificateContext', 'DeleteUrlCacheEntryW', '__exception__']
oliveira_idx.sort()

row_size = 0
for r in dataset:
    row_size += 1

dataset.seek(0, 0)

ctr = 0
print(f"Converting #s to API Calls...")
for row_data in dataset:
    if "hash" in row_data:
        new_dataset.write(row_data.replace('\n','')+"\n")
    else:
        new_row = row_data.replace('\n','').split(',')
        for col in range(1,len(new_row)-1):
            new_row[col] = oliveira_idx[int(new_row[col])]
        new_dataset.write(','.join(new_row)+"\n")
    ctr += 1
```

In [10]:
oliveira = pd.read_csv("../Datasets/Oliveira_2019/cleaned_dynamic_api_call_sequence_per_malware_100_0_306.csv")
mal_col = oliveira.pop('malware')
oliveira.insert(1, mal_col.name, mal_col)
oliveira

Unnamed: 0,hash,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,071e8c3f8922e186e57548cd4c703a5d,1,HttpSendRequestA,WSAAccept,NtCreateSection,Process32NextW,WSAAccept,NtCreateSection,Process32NextW,recvfrom,...,InternetConnectA,GetComputerNameW,recv,LdrGetProcedureAddress,NtLoadDriver,Process32NextW,CryptHashData,OleInitialize,FindFirstFileExW,GetComputerNameW
1,33f8e6d08a6aae939f25a8e0d63dd523,1,GetFileVersionInfoExW,OleInitialize,NtQueryKey,OleInitialize,NtLoadKey,InternetConnectA,NtLoadKey,InternetConnectA,...,FindResourceW,GetFileType,RemoveDirectoryW,InternetConnectA,GetComputerNameW,recv,LdrGetProcedureAddress,NtLoadDriver,Process32NextW,CryptHashData
2,b68abd064e975e1c6d5f25e748663076,1,CreateActCtxW,HttpOpenRequestW,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,...,InternetGetConnectedStateExW,GetAdaptersAddresses,HttpSendRequestA,InternetGetConnectedStateExW,GetAdaptersAddresses,HttpSendRequestA,InternetGetConnectedStateExW,GetAdaptersAddresses,HttpSendRequestW,HttpSendRequestA
3,72049be7bd30ea61297ea624ae198067,1,GetFileVersionInfoExW,OleInitialize,NtQueryKey,OleInitialize,NtLoadKey,InternetConnectA,NtLoadKey,InternetConnectA,...,Process32NextW,OleInitialize,setsockopt,OleInitialize,setsockopt,NtQueryKey,OleInitialize,setsockopt,RegEnumValueA,setsockopt
4,c9b3700a77facf29172f32df6bc77f48,1,GetFileVersionInfoExW,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,...,CryptUnprotectMemory,OpenSCManagerA,SetStdHandle,CryptUnprotectMemory,OpenSCManagerA,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,e3d6d58faa040f0f9742c9d0eaf58be4,1,GetFileVersionInfoExW,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,...,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW
43872,9b917bab7f32188ae40c744f2be9aaf8,1,GetFileVersionInfoExW,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,...,GetFileVersionInfoExW,NtCreateThreadEx,RegDeleteValueW,GetFileVersionInfoExW,NtCreateThreadEx,RegDeleteValueW,GetFileVersionInfoExW,NtCreateThreadEx,RegDeleteValueW,GetFileVersionInfoExW
43873,35a18ee05f75f04912018d9f462cb990,1,GetFileVersionInfoExW,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,...,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle
43874,654139d715abcf7ecdddbef5a84f224b,1,GetFileVersionInfoExW,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,InternetConnectA,RemoveDirectoryW,...,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW,SetStdHandle,LookupAccountSidW


### 3. Expected API calls on Oliveira (2019)

In [11]:
#Expected API calls as per https://ieee-dataport.org/open-access/malware-analysis-datasets-api-call-sequences
oli_apis = ['NtOpenThread', 'ExitWindowsEx', 'FindResourceW', 'CryptExportKey', 'CreateRemoteThreadEx', 'MessageBoxTimeoutW', 'InternetCrackUrlW', 'StartServiceW', 'GetFileSize', 'GetVolumeNameForVolumeMountPointW', 'GetFileInformationByHandle', 'CryptAcquireContextW', 'RtlDecompressBuffer', 'SetWindowsHookExA', 'RegSetValueExW', 'LookupAccountSidW', 'SetUnhandledExceptionFilter', 'InternetConnectA', 'GetComputerNameW', 'RegEnumValueA', 'NtOpenFile', 'NtSaveKeyEx', 'HttpOpenRequestA', 'recv', 'GetFileSizeEx', 'LoadStringW', 'SetInformationJobObject', 'WSAConnect', 'CryptDecrypt', 'GetTimeZoneInformation', 'InternetOpenW', 'CoInitializeEx', 'CryptGenKey', 'GetAsyncKeyState', 'NtQueryInformationFile', 'GetSystemMetrics', 'NtDeleteValueKey', 'NtOpenKeyEx', 'sendto', 'IsDebuggerPresent', 'RegQueryInfoKeyW', 'NetShareEnum', 'InternetOpenUrlW', 'WSASocketA', 'CopyFileExW', 'connect', 'ShellExecuteExW', 'SearchPathW', 'GetUserNameA', 'InternetOpenUrlA', 'LdrUnloadDll', 'EnumServicesStatusW', 'EnumServicesStatusA', 'WSASend', 'CopyFileW', 'NtDeleteFile', 'CreateActCtxW', 'timeGetTime', 'MessageBoxTimeoutA', 'CreateServiceA', 'FindResourceExW', 'WSAAccept', 'InternetConnectW', 'HttpSendRequestA', 'GetVolumePathNameW', 'RegCloseKey', 'InternetGetConnectedStateExW', 'GetAdaptersInfo', 'shutdown', 'NtQueryMultipleValueKey', 'NtQueryKey', 'GetSystemWindowsDirectoryW', 'GlobalMemoryStatusEx', 'GetFileAttributesExW', 'OpenServiceW', 'getsockname', 'LoadStringA', 'UnhookWindowsHookEx', 'NtCreateUserProcess', 'Process32NextW', 'CreateThread', 'LoadResource', 'GetSystemTimeAsFileTime', 'SetStdHandle', 'CoCreateInstanceEx', 'GetSystemDirectoryA', 'NtCreateMutant', 'RegCreateKeyExW', 'IWbemServices_ExecQuery', 'NtDuplicateObject', 'Thread32First', 'OpenSCManagerW', 'CreateServiceW', 'GetFileType', 'MoveFileWithProgressW', 'NtDeviceIoControlFile', 'GetFileInformationByHandleEx', 'CopyFileA', 'NtLoadKey', 'GetNativeSystemInfo', 'NtOpenProcess', 'CryptUnprotectMemory', 'InternetWriteFile', 'ReadProcessMemory', 'gethostbyname', 'WSASendTo', 'NtOpenSection', 'listen', 'WSAStartup', 'socket', 'OleInitialize', 'FindResourceA', 'RegOpenKeyExA', 'RegEnumKeyExA', 'NtQueryDirectoryFile', 'CertOpenSystemStoreW', 'ControlService', 'LdrGetProcedureAddress', 'GlobalMemoryStatus', 'NtSetInformationFile', 'OutputDebugStringA', 'GetAdaptersAddresses', 'CoInitializeSecurity', 'RegQueryValueExA', 'NtQueryFullAttributesFile', 'DeviceIoControl', '__anomaly__', 'DeleteFileW', 'GetShortPathNameW', 'NtGetContextThread', 'GetKeyboardState', 'RemoveDirectoryA', 'InternetSetStatusCallback', 'NtResumeThread', 'SetFileInformationByHandle', 'NtCreateSection', 'NtQueueApcThread', 'accept', 'DecryptMessage', 'GetUserNameExW', 'SizeofResource', 'RegQueryValueExW', 'SetWindowsHookExW', 'HttpOpenRequestW', 'CreateDirectoryW', 'InternetOpenA', 'GetFileVersionInfoExW', 'FindWindowA', 'closesocket', 'RtlAddVectoredExceptionHandler', 'IWbemServices_ExecMethod', 'GetDiskFreeSpaceExW', 'TaskDialog', 'WriteConsoleW', 'CryptEncrypt', 'WSARecvFrom', 'NtOpenMutant', 'CoGetClassObject', 'NtQueryValueKey', 'NtDelayExecution', 'select', 'HttpQueryInfoA', 'GetVolumePathNamesForVolumeNameW', 'RegDeleteValueW', 'InternetCrackUrlA', 'OpenServiceA', 'InternetSetOptionA', 'CreateDirectoryExW', 'bind', 'NtShutdownSystem', 'DeleteUrlCacheEntryA', 'NtMapViewOfSection', 'LdrGetDllHandle', 'NtCreateKey', 'GetKeyState', 'CreateRemoteThread', 'NtEnumerateValueKey', 'SetFileAttributesW', 'NtUnmapViewOfSection', 'RegDeleteValueA', 'CreateJobObjectW', 'send', 'NtDeleteKey', 'SetEndOfFile', 'GetUserNameExA', 'GetComputerNameA', 'URLDownloadToFileW', 'NtFreeVirtualMemory', 'recvfrom', 'NtUnloadDriver', 'NtTerminateThread', 'CryptUnprotectData', 'NtCreateThreadEx', 'DeleteService', 'GetFileAttributesW', 'GetFileVersionInfoSizeExW', 'OpenSCManagerA', 'WriteProcessMemory', 'GetSystemInfo', 'SetFilePointer', 'Module32FirstW', 'ioctlsocket', 'RegEnumKeyW', 'RtlCompressBuffer', 'SendNotifyMessageW', 'GetAddrInfoW', 'CryptProtectData', 'Thread32Next', 'NtAllocateVirtualMemory', 'RegEnumKeyExW', 'RegSetValueExA', 'DrawTextExA', 'CreateToolhelp32Snapshot', 'FindWindowW', 'CoUninitialize', 'NtClose', 'WSARecv', 'CertOpenStore', 'InternetGetConnectedState', 'RtlAddVectoredContinueHandler', 'RegDeleteKeyW', 'SHGetSpecialFolderLocation', 'CreateProcessInternalW', 'NtCreateDirectoryObject', 'EnumWindows', 'DrawTextExW', 'RegEnumValueW', 'SendNotifyMessageA', 'NtProtectVirtualMemory', 'NetUserGetLocalGroups', 'GetUserNameW', 'WSASocketW', 'getaddrinfo', 'AssignProcessToJobObject', 'SetFileTime', 'WriteConsoleA', 'CryptDecodeObjectEx', 'EncryptMessage', 'system', 'NtSetContextThread', 'LdrLoadDll', 'InternetGetConnectedStateExA', 'RtlCreateUserThread', 'GetCursorPos', 'Module32NextW', 'RegCreateKeyExA', 'NtLoadDriver', 'NetUserGetInfo', 'SHGetFolderPathW', 'GetBestInterfaceEx', 'CertControlStore', 'StartServiceA', 'NtWriteFile', 'Process32FirstW', 'NtReadVirtualMemory', 'GetDiskFreeSpaceW', 'GetFileVersionInfoW', 'FindFirstFileExW', 'FindWindowExW', 'GetSystemWindowsDirectoryA', 'RegOpenKeyExW', 'CoCreateInstance', 'NtQuerySystemInformation', 'LookupPrivilegeValueW', 'NtReadFile', 'ReadCabinetState', 'GetForegroundWindow', 'InternetCloseHandle', 'FindWindowExA', 'ObtainUserAgentString', 'CryptCreateHash', 'GetTempPathW', 'CryptProtectMemory', 'NetGetJoinInformation', 'NtOpenKey', 'GetSystemDirectoryW', 'DnsQuery_A', 'RegQueryInfoKeyA', 'NtEnumerateKey', 'RegisterHotKey', 'RemoveDirectoryW', 'FindFirstFileExA', 'CertOpenSystemStoreA', 'NtTerminateProcess', 'NtSetValueKey', 'CryptAcquireContextA', 'SetErrorMode', 'UuidCreate', 'RtlRemoveVectoredExceptionHandler', 'RegDeleteKeyA', 'setsockopt', 'FindResourceExA', 'NtSuspendThread', 'GetFileVersionInfoSizeW', 'NtOpenDirectoryObject', 'InternetQueryOptionA', 'InternetReadFile', 'NtCreateFile', 'NtQueryAttributesFile', 'HttpSendRequestW', 'CryptHashMessage', 'CryptHashData', 'NtWriteVirtualMemory', 'SetFilePointerEx', 'CertCreateCertificateContext', 'DeleteUrlCacheEntryW', '__exception__']
oli_apis.sort()
print(oli_apis)
print("# of Expected API Calls", len(oli_apis))

['AssignProcessToJobObject', 'CertControlStore', 'CertCreateCertificateContext', 'CertOpenStore', 'CertOpenSystemStoreA', 'CertOpenSystemStoreW', 'CoCreateInstance', 'CoCreateInstanceEx', 'CoGetClassObject', 'CoInitializeEx', 'CoInitializeSecurity', 'CoUninitialize', 'ControlService', 'CopyFileA', 'CopyFileExW', 'CopyFileW', 'CreateActCtxW', 'CreateDirectoryExW', 'CreateDirectoryW', 'CreateJobObjectW', 'CreateProcessInternalW', 'CreateRemoteThread', 'CreateRemoteThreadEx', 'CreateServiceA', 'CreateServiceW', 'CreateThread', 'CreateToolhelp32Snapshot', 'CryptAcquireContextA', 'CryptAcquireContextW', 'CryptCreateHash', 'CryptDecodeObjectEx', 'CryptDecrypt', 'CryptEncrypt', 'CryptExportKey', 'CryptGenKey', 'CryptHashData', 'CryptHashMessage', 'CryptProtectData', 'CryptProtectMemory', 'CryptUnprotectData', 'CryptUnprotectMemory', 'DecryptMessage', 'DeleteFileW', 'DeleteService', 'DeleteUrlCacheEntryA', 'DeleteUrlCacheEntryW', 'DeviceIoControl', 'DnsQuery_A', 'DrawTextExA', 'DrawTextExW', '

### 4. Actual API Calls found in Oliveira (2019)

In [12]:
featr = oliveira.iloc[:, 2:]
combined_summary = []
featr['summary'] = featr.values.tolist()
for i in range(featr['summary'].shape[0]):
    length = len(featr['summary'].loc[i])
    for j in range(length-1):
        combined_summary.append(featr['summary'].loc[i][j])
oli_featr = pd.Series(combined_summary).dropna().drop_duplicates().reset_index()
oli_featr.rename(columns={'index': 'index', 0: 'api_calls'}, inplace=True)
oli_featr.sort_values(inplace=True, by='api_calls')
oli_featr.drop(axis=1, labels="index", inplace=True)
oli_featr

Unnamed: 0,api_calls
157,AssignProcessToJobObject
57,CertCreateCertificateContext
225,CertOpenStore
176,CertOpenSystemStoreW
205,CoCreateInstance
...,...
50,sendto
47,setsockopt
95,shutdown
202,system


In [13]:
#Write to file
f = open("Oliveira_Features.txt", mode="w")
f.write(str(oli_featr['api_calls'].tolist()))
f.flush()
f.close()

## Catak (2021)

**Dataset URL:** https://github.com/ocatak/malware_api_class/

**Note:** The Catak Dataset is designed more for malware classification than detection, hence the first column for the pre-cleaned dataset is designated as a the malware type column.

### 1. Entire Dataset (Original)

In [14]:
og_catak

Unnamed: 0,ldrloaddll ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress regopenkeyexa regopenkeyexa regopenkeyexa ntopenkey ntqueryvaluekey ntclose ntopenkey ntqueryvaluekey ntclose ntclose ntqueryattributesfile ntqueryattributesfile ntqueryattributesfile ntqueryattributesfile loadstringa ntallocatevirtualmemory ntallocatevirtualmemory loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa loadstringa ldrgetdllhandle ldrgetprocedureaddress ldrgetdllhandle ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrgetprocedureaddress ldrloaddll ldrgetprocedureaddress ldrunloaddll findfirstfileexw copyfilea regcreatekeyexa regsetvalueexa regclosekey createprocessinternalw ntclose ntclose ntclose ntfreevirtualmemory ntterminateprocess ntterminateprocess ntclose ntclose ntclose ntclose ntclose ntclose ntclose ldrunloaddll ntopenkey ntqueryvaluekey ntclose ntclose ntclose ntclose ntterminateprocess
0,getsystemtimeasfiletime ntallocatevirtualmemor...
1,ldrgetdllhandle ldrgetprocedureaddress getsyst...
2,ldrloaddll ldrgetprocedureaddress ldrloaddll l...
3,ldrloaddll ldrgetprocedureaddress ldrgetproced...
4,ntprotectvirtualmemory ntprotectvirtualmemory ...
...,...
7101,ldrloaddll ldrgetprocedureaddress ldrgetproced...
7102,ldrloaddll ldrgetprocedureaddress ldrgetproced...
7103,ldrloaddll ldrgetprocedureaddress ldrgetproced...
7104,createthread ntallocatevirtualmemory ntfreevir...


### 2. Entire Dataset (Pre-Cleaned)

**The code for pre-cleaning is as follows:**

```
#CONVERTS THE CASING OF THE API CALLS IN CATAK (2021) DATASET TO THE SAME CASING AS FOUND IN OLIVEIRA (2019) AND REMOVE THE DUPLICATE API CALLS

import os
import threading

LABELS = []
API_CALLS = []

print("Group CDEF\n")

def cls():
    os.system('cls')

def read_api(dataset):
    global API_CALLS
    print("Running [read_api] thread...")
    for row in dataset:
        API_CALLS.append(row.replace("\n","").split(","))
    print("Finishing [read_api] thread...")

def read_labels(dataset_label):
    global LABELS
    print("Running [read_labels] thread...")
    for row in dataset_label:
        LABELS.append(row.replace("\n",""))
    print("Finishing [read_labels] thread...")

oliveira_idx = ['NtOpenThread', 'ExitWindowsEx', 'FindResourceW', 'CryptExportKey', 'CreateRemoteThreadEx', 'MessageBoxTimeoutW', 'InternetCrackUrlW', 'StartServiceW', 'GetFileSize', 'GetVolumeNameForVolumeMountPointW', 'GetFileInformationByHandle', 'CryptAcquireContextW', 'RtlDecompressBuffer', 'SetWindowsHookExA', 'RegSetValueExW', 'LookupAccountSidW', 'SetUnhandledExceptionFilter', 'InternetConnectA', 'GetComputerNameW', 'RegEnumValueA', 'NtOpenFile', 'NtSaveKeyEx', 'HttpOpenRequestA', 'recv', 'GetFileSizeEx', 'LoadStringW', 'SetInformationJobObject', 'WSAConnect', 'CryptDecrypt', 'GetTimeZoneInformation', 'InternetOpenW', 'CoInitializeEx', 'CryptGenKey', 'GetAsyncKeyState', 'NtQueryInformationFile', 'GetSystemMetrics', 'NtDeleteValueKey', 'NtOpenKeyEx', 'sendto', 'IsDebuggerPresent', 'RegQueryInfoKeyW', 'NetShareEnum', 'InternetOpenUrlW', 'WSASocketA', 'CopyFileExW', 'connect', 'ShellExecuteExW', 'SearchPathW', 'GetUserNameA', 'InternetOpenUrlA', 'LdrUnloadDll', 'EnumServicesStatusW', 'EnumServicesStatusA', 'WSASend', 'CopyFileW', 'NtDeleteFile', 'CreateActCtxW', 'timeGetTime', 'MessageBoxTimeoutA', 'CreateServiceA', 'FindResourceExW', 'WSAAccept', 'InternetConnectW', 'HttpSendRequestA', 'GetVolumePathNameW', 'RegCloseKey', 'InternetGetConnectedStateExW', 'GetAdaptersInfo', 'shutdown', 'NtQueryMultipleValueKey', 'NtQueryKey', 'GetSystemWindowsDirectoryW', 'GlobalMemoryStatusEx', 'GetFileAttributesExW', 'OpenServiceW', 'getsockname', 'LoadStringA', 'UnhookWindowsHookEx', 'NtCreateUserProcess', 'Process32NextW', 'CreateThread', 'LoadResource', 'GetSystemTimeAsFileTime', 'SetStdHandle', 'CoCreateInstanceEx', 'GetSystemDirectoryA', 'NtCreateMutant', 'RegCreateKeyExW', 'IWbemServices_ExecQuery', 'NtDuplicateObject', 'Thread32First', 'OpenSCManagerW', 'CreateServiceW', 'GetFileType', 'MoveFileWithProgressW', 'NtDeviceIoControlFile', 'GetFileInformationByHandleEx', 'CopyFileA', 'NtLoadKey', 'GetNativeSystemInfo', 'NtOpenProcess', 'CryptUnprotectMemory', 'InternetWriteFile', 'ReadProcessMemory', 'gethostbyname', 'WSASendTo', 'NtOpenSection', 'listen', 'WSAStartup', 'socket', 'OleInitialize', 'FindResourceA', 'RegOpenKeyExA', 'RegEnumKeyExA', 'NtQueryDirectoryFile', 'CertOpenSystemStoreW', 'ControlService', 'LdrGetProcedureAddress', 'GlobalMemoryStatus', 'NtSetInformationFile', 'OutputDebugStringA', 'GetAdaptersAddresses', 'CoInitializeSecurity', 'RegQueryValueExA', 'NtQueryFullAttributesFile', 'DeviceIoControl', '__anomaly__', 'DeleteFileW', 'GetShortPathNameW', 'NtGetContextThread', 'GetKeyboardState', 'RemoveDirectoryA', 'InternetSetStatusCallback', 'NtResumeThread', 'SetFileInformationByHandle', 'NtCreateSection', 'NtQueueApcThread', 'accept', 'DecryptMessage', 'GetUserNameExW', 'SizeofResource', 'RegQueryValueExW', 'SetWindowsHookExW', 'HttpOpenRequestW', 'CreateDirectoryW', 'InternetOpenA', 'GetFileVersionInfoExW', 'FindWindowA', 'closesocket', 'RtlAddVectoredExceptionHandler', 'IWbemServices_ExecMethod', 'GetDiskFreeSpaceExW', 'TaskDialog', 'WriteConsoleW', 'CryptEncrypt', 'WSARecvFrom', 'NtOpenMutant', 'CoGetClassObject', 'NtQueryValueKey', 'NtDelayExecution', 'select', 'HttpQueryInfoA', 'GetVolumePathNamesForVolumeNameW', 'RegDeleteValueW', 'InternetCrackUrlA', 'OpenServiceA', 'InternetSetOptionA', 'CreateDirectoryExW', 'bind', 'NtShutdownSystem', 'DeleteUrlCacheEntryA', 'NtMapViewOfSection', 'LdrGetDllHandle', 'NtCreateKey', 'GetKeyState', 'CreateRemoteThread', 'NtEnumerateValueKey', 'SetFileAttributesW', 'NtUnmapViewOfSection', 'RegDeleteValueA', 'CreateJobObjectW', 'send', 'NtDeleteKey', 'SetEndOfFile', 'GetUserNameExA', 'GetComputerNameA', 'URLDownloadToFileW', 'NtFreeVirtualMemory', 'recvfrom', 'NtUnloadDriver', 'NtTerminateThread', 'CryptUnprotectData', 'NtCreateThreadEx', 'DeleteService', 'GetFileAttributesW', 'GetFileVersionInfoSizeExW', 'OpenSCManagerA', 'WriteProcessMemory', 'GetSystemInfo', 'SetFilePointer', 'Module32FirstW', 'ioctlsocket', 'RegEnumKeyW', 'RtlCompressBuffer', 'SendNotifyMessageW', 'GetAddrInfoW', 'CryptProtectData', 'Thread32Next', 'NtAllocateVirtualMemory', 'RegEnumKeyExW', 'RegSetValueExA', 'DrawTextExA', 'CreateToolhelp32Snapshot', 'FindWindowW', 'CoUninitialize', 'NtClose', 'WSARecv', 'CertOpenStore', 'InternetGetConnectedState', 'RtlAddVectoredContinueHandler', 'RegDeleteKeyW', 'SHGetSpecialFolderLocation', 'CreateProcessInternalW', 'NtCreateDirectoryObject', 'EnumWindows', 'DrawTextExW', 'RegEnumValueW', 'SendNotifyMessageA', 'NtProtectVirtualMemory', 'NetUserGetLocalGroups', 'GetUserNameW', 'WSASocketW', 'getaddrinfo', 'AssignProcessToJobObject', 'SetFileTime', 'WriteConsoleA', 'CryptDecodeObjectEx', 'EncryptMessage', 'system', 'NtSetContextThread', 'LdrLoadDll', 'InternetGetConnectedStateExA', 'RtlCreateUserThread', 'GetCursorPos', 'Module32NextW', 'RegCreateKeyExA', 'NtLoadDriver', 'NetUserGetInfo', 'SHGetFolderPathW', 'GetBestInterfaceEx', 'CertControlStore', 'StartServiceA', 'NtWriteFile', 'Process32FirstW', 'NtReadVirtualMemory', 'GetDiskFreeSpaceW', 'GetFileVersionInfoW', 'FindFirstFileExW', 'FindWindowExW', 'GetSystemWindowsDirectoryA', 'RegOpenKeyExW', 'CoCreateInstance', 'NtQuerySystemInformation', 'LookupPrivilegeValueW', 'NtReadFile', 'ReadCabinetState', 'GetForegroundWindow', 'InternetCloseHandle', 'FindWindowExA', 'ObtainUserAgentString', 'CryptCreateHash', 'GetTempPathW', 'CryptProtectMemory', 'NetGetJoinInformation', 'NtOpenKey', 'GetSystemDirectoryW', 'DnsQuery_A', 'RegQueryInfoKeyA', 'NtEnumerateKey', 'RegisterHotKey', 'RemoveDirectoryW', 'FindFirstFileExA', 'CertOpenSystemStoreA', 'NtTerminateProcess', 'NtSetValueKey', 'CryptAcquireContextA', 'SetErrorMode', 'UuidCreate', 'RtlRemoveVectoredExceptionHandler', 'RegDeleteKeyA', 'setsockopt', 'FindResourceExA', 'NtSuspendThread', 'GetFileVersionInfoSizeW', 'NtOpenDirectoryObject', 'InternetQueryOptionA', 'InternetReadFile', 'NtCreateFile', 'NtQueryAttributesFile', 'HttpSendRequestW', 'CryptHashMessage', 'CryptHashData', 'NtWriteVirtualMemory', 'SetFilePointerEx', 'CertCreateCertificateContext', 'DeleteUrlCacheEntryW', '__exception__']
oliveira_idx.sort()
oliveira_idx_lower = []
for o in oliveira_idx:
    oliveira_idx_lower.append(o.lower())

dataset = open(os.getcwd()+"\\Original\\"+"all_analysis_data.csv", mode='r')   
dataset_label = open(os.getcwd()+"\\Original\\"+"labels.csv", mode='r')
new_dataset = open("cleaned_all_analysis_data.csv", mode="w")

api_t = threading.Thread(target=read_api, args=[dataset])
label_t = threading.Thread(target=read_labels, args=[dataset_label])
api_t.start()
label_t.start()
api_t.join()
label_t.join()
print("API Calls:", len(API_CALLS))
print("Labels Size:", len(LABELS))

combined = []
print("Converting casing...")
for i in range(len(LABELS)):
    sub = [LABELS[i]]
    apis = API_CALLS[i]
    for a in range(len(apis)):
        apis[a] = apis[a].replace("\n","").split(" ")
        apis[a] = list(dict.fromkeys(apis[a]))
        for s in range(len(apis[a])):
            for o in range(len(oliveira_idx_lower)):
                if apis[a][s] == oliveira_idx_lower[o]:
                    sub.append(oliveira_idx[o])
    combined.append(sub)
print("Casing Converted")
API_CALLS = None
LABELS = None

print("Finding size of pre-cleaned dataset...")
breadth = 0
depth = len(combined)
for c in combined:
    if len(c)-1 > breadth:
        breadth = len(c)-1
print("Depth:", depth)
print("Breadth:", breadth)

header = "malware_type,"
for i in range(1,breadth+1):
    header += "t_" + str(i)
    if i < breadth:
        header += ","
header += "\n"
print("Writing to file...")
new_dataset.write(header)
for c in range(0,len(combined)):
    new_dataset.write(','.join(combined[c])+"\n")
```

In [15]:
catak = pd.read_csv("../Datasets/Catak_etal_2021/cleaned_all_analysis_data.csv")
catak

  catak = pd.read_csv("../Datasets/Catak_etal_2021/cleaned_all_analysis_data.csv")


Unnamed: 0,malware_type,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,t_9,...,t_137,t_138,t_139,t_140,t_141,t_142,t_143,t_144,t_145,t_146
0,Trojan,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,...,,,,,,,,,,
1,Trojan,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,NtCreateMutant,NtClose,LdrLoadDll,...,,,,,,,,,,
2,Backdoor,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemDirectoryA,CopyFileA,RegOpenKeyExA,RegSetValueExA,RegCloseKey,RegCreateKeyExA,NtCreateFile,...,,,,,,,,,,
3,Backdoor,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,...,,,,,,,,,,
4,Trojan,LdrLoadDll,LdrGetProcedureAddress,WSAStartup,NtCreateMutant,RegOpenKeyExA,RegDeleteKeyA,RegCloseKey,CopyFileA,RegSetValueExA,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7102,Virus,LdrLoadDll,LdrGetProcedureAddress,SetErrorMode,NtOpenDirectoryObject,NtCreateSection,NtMapViewOfSection,CreateThread,NtCreateMutant,NtAllocateVirtualMemory,...,,,,,,,,,,
7103,Virus,LdrLoadDll,LdrGetProcedureAddress,SetErrorMode,NtOpenDirectoryObject,NtCreateSection,NtMapViewOfSection,CreateThread,NtCreateMutant,NtAllocateVirtualMemory,...,,,,,,,,,,
7104,Virus,LdrLoadDll,LdrGetProcedureAddress,FindResourceExW,LoadResource,NtOpenKey,NtQueryValueKey,NtClose,GetSystemWindowsDirectoryW,NtCreateFile,...,,,,,,,,,,
7105,Virus,CreateThread,NtAllocateVirtualMemory,NtFreeVirtualMemory,GetFileType,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,LoadStringA,RegOpenKeyExA,...,,,,,,,,,,


### 3. Unique & Actual API calls on Catak (2021)

In [16]:
featr = catak.iloc[:, 1:]
combined_summary = []
featr['summary'] = featr.values.tolist()
for i in range(featr['summary'].shape[0]):
    length = len(featr['summary'].loc[i])
    for j in range(length-1):
        combined_summary.append(featr['summary'].loc[i][j])
catak_featr = pd.Series(combined_summary).dropna().drop_duplicates().reset_index()
catak_featr.rename(columns={'index': 'index', 0: 'api_calls'}, inplace=True)
catak_featr.sort_values(inplace=True, by='api_calls')
catak_featr.drop(axis=1, labels="index", inplace=True)
catak_featr

Unnamed: 0,api_calls
176,CertOpenSystemStoreA
102,CoCreateInstance
174,CoCreateInstanceEx
137,CoGetClassObject
26,CoInitializeEx
...,...
215,sendto
170,setsockopt
153,shutdown
87,socket


In [17]:
#Write to file
f = open("Catak_Features.txt", mode="w")
f.write(str(catak_featr['api_calls'].tolist()))
f.flush()
f.close()

## Comparing MalbehavD-V1 & Oliveira Dataset

**Finding Same API calls in MalbehavD-V1 from Oliveira**

In [18]:
oli_apis = oli_featr['api_calls'].tolist()
catak_apis = catak_featr['api_calls'].tolist()

malbd_oli_catak_featr = {'malbehavd_featr':malbd_featr['api_calls'].tolist(), 
                         'match_to_oliveira': malbd_featr['api_calls'].isin(oli_apis).tolist(),
                         'match_to_catak': malbd_featr['api_calls'].isin(catak_apis).tolist()}
malbd_oli_catak_featr = pd.DataFrame(data=malbd_oli_catak_featr)

malbd_oli_catak_featr

Unnamed: 0,malbehavd_featr,match_to_oliveira,match_to_catak
0,CertControlStore,False,False
1,CertCreateCertificateContext,True,False
2,CertOpenStore,True,False
3,CertOpenSystemStoreW,True,False
4,CoCreateInstance,True,True
...,...,...,...
286,sendto,True,True
287,setsockopt,True,True
288,shutdown,True,True
289,socket,False,True


In [19]:
oli_ctr = 0
catak_ctr = 0
oli_mismatch = []
catak_mismatch = []
for row in range(malbd_oli_catak_featr.shape[0]):
    #print(malbd_oli_featr_ls[0][row], malbd_oli_featr_ls[1][row])
    if malbd_oli_catak_featr['match_to_oliveira'][row]:
        oli_ctr += 1
    else:
        oli_mismatch.append(malbd_oli_catak_featr['malbehavd_featr'][row])
    if malbd_oli_catak_featr['match_to_catak'][row]:
        catak_ctr += 1
    else:
        catak_mismatch.append(malbd_oli_catak_featr['malbehavd_featr'][row])

print("")
print("True: MalbehavD and [X] has the same API call.")
print("False: MalbehavD has the API call but [X] doesn't.")
print("")
print(f"Oliveira Match Rate: {oli_ctr:.0f}/{malbd_oli_catak_featr.shape[0]:.0f} ({100*(oli_ctr/malbd_oli_catak_featr.shape[0]):.4f}%)")
print(f"Catak Match Rate: {catak_ctr:.0f}/{malbd_oli_catak_featr.shape[0]:.0f} ({100*(catak_ctr/malbd_oli_catak_featr.shape[0]):.4f}%)")
print("")
print("Oliveira Mismatched Items:")
print(oli_mismatch)
print("")
print("Catak Mismatched Items:")
print(catak_mismatch)


True: MalbehavD and [X] has the same API call.
False: MalbehavD has the API call but [X] doesn't.

Oliveira Match Rate: 249/291 (85.5670%)
Catak Match Rate: 269/291 (92.4399%)

Oliveira Mismatched Items:
['CertControlStore', 'CreateRemoteThread', 'CryptAcquireContextA', 'DecryptMessage', 'DnsQuery_W', 'EnumServicesStatusA', 'EnumWindows', 'FindWindowA', 'GetAdaptersInfo', 'GetBestInterfaceEx', 'GetFileInformationByHandleEx', 'GetUserNameExA', 'GetUserNameExW', 'GetVolumePathNameW', 'InternetOpenUrlW', 'LdrUnloadDll', 'LoadResource', 'NtAllocateVirtualMemory', 'NtCreateFile', 'NtEnumerateKey', 'NtFreeVirtualMemory', 'NtQueryInformationFile', 'NtQuerySystemInformation', 'NtQueueApcThread', 'NtReadVirtualMemory', 'RegCreateKeyExA', 'RegDeleteValueA', 'RegEnumKeyW', 'RegQueryInfoKeyW', 'RegSetValueExW', 'RegisterHotKey', 'RtlAddVectoredContinueHandler', 'RtlAddVectoredExceptionHandler', 'RtlRemoveVectoredExceptionHandler', 'SendNotifyMessageA', 'SizeofResource', 'UuidCreate', 'WSAStartup'

# Time Taken

In [20]:
print(f"{time.time()-start_time}s")

300.39131116867065s
