# Pattern Compare

## 1. Import Libraries/Datasets

In [1]:
import pandas as pd
from difflib import SequenceMatcher
import time

malicious_df = pd.read_csv('./Clustering/Malicious/[EDITED]KMeans_SampleHash_Common.csv', low_memory=False)
benign_df = pd.read_csv('./Clustering/Benign/API_Patterns.csv')

## 2. DataFrame Preview

In [2]:
malicious_df.replace(to_replace='-',value='_', inplace=True)
malicious_df

Unnamed: 0.1,Unnamed: 0,cluster,hash,type1,type2,type3,pattern
0,0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...,...
490,490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


In [3]:
benign_df

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,5b51d65972a349f90a86984c26b12b30,286,110,172,240,117,240,117,240,117,...,215,114,215,117,261,106,144,297,117,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,ceb8cc125478fad641daa4e04e9b2f19,198,208,106,271,144,194,257,127,114,...,215,86,215,172,117,215,86,215,297,"GetSystemInfo,NtAllocateVirtualMemory,NtOpenSe..."
2,f108600edf46d7c20f6acc522aeba6df,82,228,16,29,82,29,82,29,82,...,286,73,286,208,286,73,286,257,114,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
3,711be6337cb78a948f04759a0bd210ce,82,240,117,240,117,240,117,240,117,...,117,208,117,35,240,117,35,208,240,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,6de26f67ceb1e3303b889489010f4c3f,286,110,172,240,117,240,117,240,117,...,215,114,215,117,71,25,71,275,260,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,c6c5563b17b7c763e51e4dbc3378ef1a,240,117,240,117,240,117,240,117,240,...,65,113,112,123,65,113,112,123,65,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
66,67db2476f1e9e962ca343f799b669225,82,240,117,240,117,240,117,240,117,...,257,297,286,103,208,244,103,286,257,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
67,6e51234733dec1e25f2fc3245aea3d7c,82,16,86,25,60,81,60,81,208,...,257,215,286,106,171,260,240,117,260,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
68,cfbd8d062e9baa98737a0260996f48c6,16,208,286,110,142,208,142,208,266,...,117,208,117,35,240,117,35,117,174,"SetUnhandledExceptionFilter,NtAllocateVirtualM..."


## 3. Identify Malware Types

In [4]:
'''Identify popular malware types in the dataset per Type as validated by VirusTotal.'''
types = ['type1', 'type2', 'type3']

def identify(malware_type:str):
    print(f"{malware_type.upper()} LABEL")
    unique = list(malicious_df[malware_type].unique())
    unique.remove('_')
    print(unique)
    for t in unique:
        print(t, len(malicious_df[malicious_df[malware_type]==t]))
    print("")

for i in types:
    identify(i)

TYPE1 LABEL
['trojan', 'downloader', 'adware', 'ransomware', 'pua', 'softomate']
trojan 399
downloader 29
adware 23
ransomware 8
pua 2
softomate 1

TYPE2 LABEL
['dropper', 'hacktool', 'adware', 'downloader', 'miner', 'trojan', 'ransomware', 'spyware', 'pua', 'virus', 'banker']
dropper 8
hacktool 1
adware 280
downloader 17
miner 6
trojan 38
ransomware 9
spyware 5
pua 4
virus 2
banker 5

TYPE3 LABEL
['pua', 'trojan', 'worm', 'adware', 'downloader', 'dropper', 'virus', 'ransomware', 'spyware', 'hacktool']
pua 56
trojan 19
worm 1
adware 12
downloader 23
dropper 5
virus 24
ransomware 1
spyware 1
hacktool 2



## 4. Extract API Patterns

In [5]:
malicious_patterns = []
for i in range(malicious_df.shape[0]): #Only allow those with 
    if not (malicious_df['type1'].iloc[i] == '_' and malicious_df['type2'].iloc[i] == '_' and malicious_df['type3'].iloc[i] == '_'):
        malicious_patterns.append(malicious_df['pattern'].iloc[i])
print(len(malicious_patterns))
malicious_patterns

462


['GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetSystemTimeAsFileTime,NtQuerySystemInformation,NtProtectVirtualMemory,GetSystemDirectoryW,LdrGetDllHandle,LdrLoadDll,GetSystemDirectoryW,LdrGetDllHandle,NtOpenKey,Nt

In [6]:
benign_patterns = benign_df['pattern'].to_list()
print(len(benign_patterns))
benign_patterns

70


['SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,RegOpenKeyExA,LdrGetProcedureAddress,CreateActCtxW,LdrLoadDll,LdrGetProcedureAddress,GetSystemDirectoryW,RegOpenKeyExA,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,GetVolumeNameForVolumeMountPointW

## 5. Compare API Call Patterns

The samples that match in terms of pattern (i.e., states) are ideally the following:
1. The malicious sample is a `trojan` hence must have similar behaviors as a benign software. It might be the limitation of the dataset having only the first 100 API calls and may not have included the rest.
2. The malicious sample is falsely labelled as malicious (i.e., false-positive) which is supported by it matching the behavior of benign samples.

In [11]:
ratio = 0
ratios = []
print("Comparing API Call Patterns...")
start = time.time()
for b in range(benign_df.shape[0]):
    for m in range(malicious_df.shape[0]):
        new_ratio = SequenceMatcher(None, malicious_df['pattern'].iloc[m], benign_df['pattern'].iloc[b]).ratio()
        ratio += new_ratio
        ratios.append([benign_df['pattern'].iloc[b],malicious_df['pattern'].iloc[m],new_ratio,
                       malicious_df['type1'].iloc[m],malicious_df['type2'].iloc[m],malicious_df['type3'].iloc[m], 
                       malicious_df['hash'].iloc[m]])
elapsed = time.time()-start
size = len(benign_patterns) * len(malicious_patterns)
ratio /= size
ratios.sort(reverse=True,key=lambda ratio: ratio[2])
print("Average Similarity Ratio:", ratio)
print(f"Time Elapsed: {elapsed:0.4f}s")
print(f"Time per Sample: {elapsed/size:0.4f}s")
print("")
states = []
max = 20
for r in range(0,max):
    print(r+1)
    print("Benign:",ratios[r][0])
    print("Malicious:",ratios[r][1])
    print("Score:",ratios[r][2])
    print("Type:",ratios[r][3], ratios[r][4], ratios[r][5])
    states.append(ratios[r][3]+' '+ratios[r][4]+' '+ratios[r][5])
    print("Hash:",ratios[r][6])
states = list(pd.Series(states).sort_values().unique())
print(f"Top {max} Reasons of Matching:", states)

Comparing API Call Patterns...
Average Similarity Ratio: 0.8524672073376841
Time Elapsed: 44.0643s
Time per Sample: 0.0014s

1
Benign: GetSystemTimeAsFileTime,GetSystemInfo,NtOpenKey,NtQueryValueKey,NtClose,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetDllHandle,LdrGetProcedureAddress,RegOpenKeyExW,Reg

In [13]:
states = []
for r in range(0,20):
    print(r+1)
    print("Benign:",ratios[r][0])
    print("Malicious:",ratios[r][1])
    print("Score:",ratios[r][2])
    print("Type:",ratios[r][3], ratios[r][4], ratios[r][5])
    states.append(ratios[r][3]+' '+ratios[r][4]+' '+ratios[r][5])
    print("Hash:",ratios[r][6])
states = list(pd.Series(states).sort_values().unique())
print(states)

1
Benign: GetSystemTimeAsFileTime,GetSystemInfo,NtOpenKey,NtQueryValueKey,NtClose,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetDllHandle,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtQuerySystemInformation,NtProtectVirtualMemory,GetSystemDirectoryW,LdrGetDllHandle,LdrLoadDll,GetS