# Pattern Compare

Aims to answer the question **Are there any key similarities or differences between malicious and benign samples in terms of API Call Patterns?** in **4.2.6. Dataset Analysis** of the study.

## 1. Import Libraries/Datasets

In [36]:
import pandas as pd
from difflib import SequenceMatcher
import time

malicious_df = pd.read_csv('./Clustering/[EDITED]KMeans_SampleHash_Common.csv', low_memory=False) #This should point to a verified SampleHash_Common.csv file
benign_df = pd.read_csv('./Clustering/Benign/API_Patterns.csv') #This should point to the API_Patterns.csv file

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
# APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

## 2. DataFrame Preview

In [37]:
malicious_df.replace(to_replace='-',value='_', inplace=True)
malicious_df

Unnamed: 0,cluster,hash,type1,type2,type3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


In [38]:
#Drop row that is falsely labelled.
malicious_df.drop(malicious_df[(malicious_df['type1']=='_')&(malicious_df['type2']=='_')&(malicious_df['type3']=='_')].index, inplace=True)
malicious_df

Unnamed: 0,cluster,hash,type1,type2,type3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
5,1,1ff43aa97f19dc8543aeaa1cd53e3885,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


In [39]:
benign_df

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,5b51d65972a349f90a86984c26b12b30,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,CoCreateInstance,NtOpenSection,CreateDirectoryW,NtCreateFile,LdrGetProcedureAddress,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,ceb8cc125478fad641daa4e04e9b2f19,GetSystemInfo,NtAllocateVirtualMemory,NtOpenSection,GetTempPathW,CreateDirectoryW,GetFileAttributesW,FindFirstFileExW,DeleteFileW,NtQueryDirectoryFile,...,NtClose,NtCreateMutant,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,NtClose,NtCreateMutant,NtClose,NtCreateFile,"GetSystemInfo,NtAllocateVirtualMemory,NtOpenSe..."
2,f108600edf46d7c20f6acc522aeba6df,GetSystemTimeAsFileTime,NtProtectVirtualMemory,SetUnhandledExceptionFilter,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,GetSystemTimeAsFileTime,...,SetErrorMode,GetFileAttributesExW,SetErrorMode,NtAllocateVirtualMemory,SetErrorMode,GetFileAttributesExW,SetErrorMode,FindFirstFileExW,NtQueryDirectoryFile,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
3,711be6337cb78a948f04759a0bd210ce,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,GetSystemMetrics,LdrLoadDll,LdrGetProcedureAddress,GetSystemMetrics,NtAllocateVirtualMemory,LdrLoadDll,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,6de26f67ceb1e3303b889489010f4c3f,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,c6c5563b17b7c763e51e4dbc3378ef1a,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
66,67db2476f1e9e962ca343f799b669225,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,FindFirstFileExW,NtCreateFile,SetErrorMode,ReadProcessMemory,NtAllocateVirtualMemory,Module32NextW,ReadProcessMemory,SetErrorMode,FindFirstFileExW,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
67,6e51234733dec1e25f2fc3245aea3d7c,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,NtCreateMutant,LoadStringW,FindResourceExW,LoadResource,FindResourceExW,LoadResource,NtAllocateVirtualMemory,...,FindFirstFileExW,NtClose,SetErrorMode,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
68,cfbd8d062e9baa98737a0260996f48c6,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,SetErrorMode,OleInitialize,SetWindowsHookExW,NtAllocateVirtualMemory,SetWindowsHookExW,NtAllocateVirtualMemory,GetForegroundWindow,...,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,GetSystemMetrics,LdrLoadDll,LdrGetProcedureAddress,GetSystemMetrics,LdrGetProcedureAddress,GetKeyState,"SetUnhandledExceptionFilter,NtAllocateVirtualM..."


## 3. Identify Malware Types

In [40]:
'''Identify popular malware types in the dataset per Type as validated by VirusTotal.'''
types = ['type1', 'type2', 'type3']

def sortbyquantity(ls):
    return ls[1]

def identify(malware_type:str):
    print(f"{malware_type.upper()} LABEL")
    unique = list(malicious_df[malware_type].unique())
    if '_' in unique:
        unique.remove('_')
    print(unique)
    quantities = []
    for t in unique:
        quantities.append([t, len(malicious_df[malicious_df[malware_type]==t])])
    quantities.sort(key=sortbyquantity, reverse=True)
    for q in quantities:
        print(q)
    print("")
    
for i in types:
    identify(i)

TYPE1 LABEL
['trojan', 'downloader', 'adware', 'ransomware', 'pua', 'softomate']
['trojan', 399]
['downloader', 29]
['adware', 23]
['ransomware', 8]
['pua', 2]
['softomate', 1]

TYPE2 LABEL
['dropper', 'hacktool', 'adware', 'downloader', 'miner', 'trojan', 'ransomware', 'spyware', 'pua', 'virus', 'banker']
['adware', 280]
['trojan', 38]
['downloader', 17]
['ransomware', 9]
['dropper', 8]
['miner', 6]
['spyware', 5]
['banker', 5]
['pua', 4]
['virus', 2]
['hacktool', 1]

TYPE3 LABEL
['pua', 'trojan', 'worm', 'adware', 'downloader', 'dropper', 'virus', 'ransomware', 'spyware', 'hacktool']
['pua', 56]
['virus', 24]
['downloader', 23]
['trojan', 19]
['adware', 12]
['dropper', 5]
['hacktool', 2]
['worm', 1]
['ransomware', 1]
['spyware', 1]



## 4. Extract API Patterns

In [41]:
malicious_patterns = []

print(len(malicious_patterns))
malicious_patterns[0:4]

0


[]

In [42]:
benign_patterns = benign_df['pattern'].to_list()
print(len(benign_patterns))
benign_patterns[0:4]

70


['SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,RegOpenKeyExA,LdrGetProcedureAddress,CreateActCtxW,LdrLoadDll,LdrGetProcedureAddress,GetSystemDirectoryW,RegOpenKeyExA,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,GetVolumeNameForVolumeMountPointW

## 5. Compare API Call Patterns

The samples that match in terms of pattern (i.e., states) are ideally the following:
1. The malicious sample is a `trojan` hence must have similar behaviors as a benign software. It might be the limitation of the dataset having only the first 100 API calls and may not have included the rest.
2. The malicious sample is falsely labelled as malicious (i.e., false-positive) which is supported by it matching the behavior of benign samples.

**Note that this uses the full API Call Patterns (i.e., Time-based Behaviors)**

In [43]:
ratio = 0
ratios = []
print("Comparing API Call Patterns...")
start = time.time()
for b in range(benign_df.shape[0]):
    for m in range(malicious_df.shape[0]):
        new_ratio = SequenceMatcher(None, malicious_df['pattern'].iloc[m], benign_df['pattern'].iloc[b]).ratio()
        ratio += new_ratio
        ratios.append([benign_df['pattern'].iloc[b],malicious_df['pattern'].iloc[m],new_ratio,
                       malicious_df['type1'].iloc[m],malicious_df['type2'].iloc[m],malicious_df['type3'].iloc[m], 
                       malicious_df['hash'].iloc[m], benign_df['hash'].iloc[b]])
elapsed = time.time()-start
size = len(benign_patterns) * len(malicious_patterns)
ratio /= size
ratios.sort(reverse=True,key=lambda ratio: ratio[2])
print("Average Similarity Ratio:", ratio)
print(f"Time Elapsed: {elapsed:0.4f}s")
print(f"Time per Sample: {elapsed/size:0.4f}s")
print("")

Comparing API Call Patterns...


ZeroDivisionError: float division by zero

In [None]:
def print_comparison(type:str, ratios:list, max:int):
    if max > len(ratios):
        print(f"The specified `max` value ({max}) exceeds the number available ratios to select.")
        max = len(ratios)
    states = []
    output = ""
    for r in range(0,max):
        output += f"{r+1}\n"
        output += f"Malicious Hash: {ratios[r][6]}\n"
        output += f"Benign Hash: {ratios[r][7]}\n"
        output += f"Score: {ratios[r][2]}\n"
        output += f"Type: {ratios[r][3]} {ratios[r][4]} {ratios[r][5]}\n"
        output += f"Malicious API Call Pattern: {ratios[r][1]}\n"
        output += f"Benign API Call Pattern: {ratios[r][0]}\n"
        output += "\n"
        states.append(ratios[r][3]+' '+ratios[r][4]+' '+ratios[r][5])
    print(output)
    print(output, file=f"{type}_comparison.txt")
    return states

In [None]:
# MOST SIMILAR API CALL PATTERNS TO BOTH MALICIOUS AND BENIGN SAMPLES
ratios.sort(reverse=True,key=lambda ratio: ratio[2])
states = print_comparison("similar", ratios, 50)
common_states = list(pd.Series(states).sort_values().unique())
print(f"Most Similar API Call Patterns between Malicious and Benign Samples:\n", common_states, "\n")

1
Malicious Hash: 382ad6882681103fad2b2e2f4129dc04
Benign Hash: 6de26f67ceb1e3303b889489010f4c3f
Score: 0.9897172236503856
Type: downloader trojan _
Malicious API Call Pattern: SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,RegOpenKeyExA,LdrGetProcedureAddress,CreateActCtxW,LdrLoadDll,LdrGetProcedureAddress,GetSystemDirectoryW,RegOpenKeyExA,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,LdrGetProcedu

AttributeError: 'str' object has no attribute 'write'

In [None]:
# MOST DIFFERENT API CALL PATTERNS TO BOTH MALICIOUS AND BENIGN SAMPLES
ratios.sort(reverse=False,key=lambda ratio: ratio[2])
states = print_comparison("different",ratios, 20)
uncommon_states = list(pd.Series(states).sort_values().unique())
print(f"Most Different API Call Patterns between Malicious and Benign Samples:\n", uncommon_states, "\n")

1
Benign API Call Pattern: SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,RegOpenKeyExA,LdrGetProcedureAddress,CreateActCtxW,LdrLoadDll,LdrGetProcedureAddress,GetSystemDirectoryW,RegOpenKeyExA,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,GetVolum