# Pattern Compare

Aims to answer the question **Are there any key similarities or differences between malicious and benign samples in terms of API Call Patterns?** in **4.2.6. Dataset Analysis** of the study.

## 1. Import Libraries/Datasets

In [1]:
import pandas as pd
from difflib import SequenceMatcher
import time
import threading

malicious_df = pd.read_csv('./Clustering/[EDITED]KMeans_SampleHash_Common.csv', low_memory=False) #This should point to a verified <DataClustering>_SampleHash_Common.csv file
benign_df = pd.read_csv('./Clustering/Benign/API_Patterns.csv') #This should point to the API_Patterns.csv file

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split
# APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

## 2. DataFrame Preview

In [2]:
malicious_df.replace(to_replace='-',value='_', inplace=True)
malicious_df

Unnamed: 0,cluster,hash,type1,type2,type3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


In [3]:
#Drop row that is falsely labelled.
malicious_df.drop(malicious_df[(malicious_df['type1']=='_')&(malicious_df['type2']=='_')&(malicious_df['type3']=='_')].index, inplace=True)
malicious_df

Unnamed: 0,cluster,hash,type1,type2,type3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
5,1,1ff43aa97f19dc8543aeaa1cd53e3885,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


In [4]:
benign_df

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,5b51d65972a349f90a86984c26b12b30,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,CoCreateInstance,NtOpenSection,CreateDirectoryW,NtCreateFile,LdrGetProcedureAddress,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,ceb8cc125478fad641daa4e04e9b2f19,GetSystemInfo,NtAllocateVirtualMemory,NtOpenSection,GetTempPathW,CreateDirectoryW,GetFileAttributesW,FindFirstFileExW,DeleteFileW,NtQueryDirectoryFile,...,NtClose,NtCreateMutant,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,NtClose,NtCreateMutant,NtClose,NtCreateFile,"GetSystemInfo,NtAllocateVirtualMemory,NtOpenSe..."
2,f108600edf46d7c20f6acc522aeba6df,GetSystemTimeAsFileTime,NtProtectVirtualMemory,SetUnhandledExceptionFilter,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,GetSystemTimeAsFileTime,GetTimeZoneInformation,GetSystemTimeAsFileTime,...,SetErrorMode,GetFileAttributesExW,SetErrorMode,NtAllocateVirtualMemory,SetErrorMode,GetFileAttributesExW,SetErrorMode,FindFirstFileExW,NtQueryDirectoryFile,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
3,711be6337cb78a948f04759a0bd210ce,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetProcedureAddress,GetSystemMetrics,LdrLoadDll,LdrGetProcedureAddress,GetSystemMetrics,NtAllocateVirtualMemory,LdrLoadDll,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,6de26f67ceb1e3303b889489010f4c3f,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtQueryDirectoryFile,NtClose,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1074,d282ef96a93986f89825508812958354,SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,LdrGetProcedureAddress,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1075,c0389d256f976044adf570f0df908953,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,GetCursorPos,SetErrorMode,FindResourceW,SetWindowsHookExW,CoInitializeEx,NtDuplicateObject,NtAllocateVirtualMemory,...,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,NtAllocateVirtualMemory,GetSystemMetrics,RegOpenKeyExW,NtAllocateVirtualMemory,GetSystemMetrics,NtAllocateVirtualMemory,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
1076,20316e717de5db169aecbb67377504ce,SetUnhandledExceptionFilter,NtCreateMutant,NtAllocateVirtualMemory,NtClose,NtCreateMutant,NtClose,NtCreateMutant,NtClose,NtAllocateVirtualMemory,...,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,"SetUnhandledExceptionFilter,NtCreateMutant,NtA..."
1077,ce945d424b93ea73fbbedf0254f6bc07,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,...,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,FindResourceExW,LoadResource,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."


## 3. Identify Malware Types

In [5]:
'''Identify popular malware types in the dataset per Type as validated by VirusTotal.'''

def identify(malware_type:str):
    '''Identify and count the malware types per type'''
    print(f"{malware_type.upper()} LABEL")
    unique = list(malicious_df[malware_type].unique())
    if '_' in unique:
        unique.remove('_')
    quantities = []
    for t in unique:
        quantities.append([t, len(malicious_df[malicious_df[malware_type]==t])])
    quantities.sort(key=lambda ls: ls[1], reverse=True)
    for q in quantities:
        print(q)
    print("")

'''Executes the identification for each type'''
types = ['type1', 'type2', 'type3']
for i in types:    
    identify(i)

TYPE1 LABEL
['trojan', 399]
['downloader', 29]
['adware', 23]
['ransomware', 8]
['pua', 2]
['softomate', 1]

TYPE2 LABEL
['adware', 280]
['trojan', 38]
['downloader', 17]
['ransomware', 9]
['dropper', 8]
['miner', 6]
['spyware', 5]
['banker', 5]
['pua', 4]
['virus', 2]
['hacktool', 1]

TYPE3 LABEL
['pua', 56]
['virus', 24]
['downloader', 23]
['trojan', 19]
['adware', 12]
['dropper', 5]
['hacktool', 2]
['worm', 1]
['ransomware', 1]
['spyware', 1]



## 4. Extract API Patterns

In [6]:
malicious_patterns = malicious_df['pattern'].to_list()
print(len(malicious_patterns))
malicious_patterns[0:4]

462


['GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetSystemTimeAsFileTime,NtQuerySystemInformation,NtProtectVirtualMemory,GetSystemDirectoryW,LdrGetDllHandle,LdrLoadDll,GetSystemDirectoryW,LdrGetDllHandle,NtOpenKey,Nt

In [7]:
benign_patterns = benign_df['pattern'].to_list()
print(len(benign_patterns))
benign_patterns[0:4]

1079


['SetErrorMode,OleInitialize,LdrGetDllHandle,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtOpenSection,NtMapViewOfSection,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,RegOpenKeyExA,LdrGetProcedureAddress,CreateActCtxW,LdrLoadDll,LdrGetProcedureAddress,GetSystemDirectoryW,RegOpenKeyExA,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetProcedureAddress,GetVolumeNameForVolumeMountPointW

## 5. Compare API Call Patterns

**Note that this uses the full API Call Patterns (i.e., Time-based Behaviors)**

In [8]:
#This code block may take a while to complete. SequenceMatcher.ratio() is deemed slow even by its documentation.
print("Comparing API Call Patterns...")
ratio = 0
ratios = []
start = time.time()

unique_malicious = list(malicious_df['pattern'].unique())
unique_benign = list(benign_df['pattern'].unique())
print(f"No. of unique malicious patterns: {len(unique_malicious)}")
print(f"No. of unique benign patterns: {len(unique_benign)}")
print("")
for m,ma in enumerate(unique_malicious): #For efficiecy's sake, let's just compare it to unique malicious samples. Hashes that match the same malicious pattern will be collected in batches instead of one by one.
    mal_df = malicious_df[malicious_df['pattern']== ma]
    st = time.time()
    for b,be in enumerate(unique_benign): #For efficiecy's sake, let's just compare it to unique benign samples. Hashes that match the same benign pattern will be collected in batches instead of one by one.
        ratios.append({'ratio': SequenceMatcher(None, ma, be).ratio(), 
                       'benign_pattern':be, 'malicious_pattern': ma,
                       'type1':mal_df['type1'].to_list(), 
                       'type2':mal_df['type2'].to_list(), 
                       'type3':mal_df['type3'].to_list(), 
                       'malicious_hash':mal_df['hash'].to_list(), 
                       'benign_hash':benign_df[benign_df['pattern'] == be]['hash'].to_list()})
    print(f"{m} @ {time.time()-st:.4f}")
elapsed = time.time()-start

Comparing API Call Patterns...
No. of unique malicious patterns: 99
No. of unique benign patterns: 624



0 @ 5.7374
1 @ 5.4467
2 @ 5.8863
3 @ 4.8530
4 @ 5.6535
5 @ 5.6638
6 @ 5.2199
7 @ 4.7802
8 @ 5.0853
9 @ 5.9400
10 @ 5.3065
11 @ 5.1873
12 @ 4.7382
13 @ 5.0962
14 @ 5.2900
15 @ 6.5317
16 @ 5.8464
17 @ 5.2561
18 @ 5.4105
19 @ 4.8425
20 @ 4.9307
21 @ 5.2117
22 @ 5.1808
23 @ 5.2573
24 @ 4.2400
25 @ 5.0083
26 @ 5.2883
27 @ 5.8603
28 @ 4.8348
29 @ 5.2695
30 @ 4.8621
31 @ 4.4953
32 @ 5.5989
33 @ 4.8632
34 @ 4.2107
35 @ 4.5667
36 @ 5.4392
37 @ 4.6802
38 @ 5.5030
39 @ 5.1330
40 @ 5.9044
41 @ 4.5527
42 @ 4.2529
43 @ 4.4975
44 @ 5.0446
45 @ 4.8007
46 @ 5.0553
47 @ 5.1002
48 @ 4.2322
49 @ 5.2048
50 @ 4.5506
51 @ 4.9518
52 @ 5.6426
53 @ 4.6914
54 @ 5.5530
55 @ 4.4025
56 @ 5.6968
57 @ 5.1609
58 @ 4.7023
59 @ 4.7984
60 @ 4.7673
61 @ 4.5772
62 @ 5.7344
63 @ 5.7280
64 @ 5.3009
65 @ 5.2950
66 @ 5.9160
67 @ 4.7950
68 @ 5.7910
69 @ 5.8566
70 @ 5.8408
71 @ 5.7902
72 @ 5.7242
73 @ 5.6222
74 @ 8.1743
75 @ 5.3006
76 @ 5.2275
77 @ 5.1612
78 @ 5.3079
79 @ 4.4513
80 @ 5.7996
81 @ 5.3976
82 @ 5.8561
83 @ 5.9268
84

In [9]:
size = len(benign_patterns) * len(malicious_patterns)
for r in ratios:
    ratio += r['ratio']
ratio /= size
ratios.sort(reverse=True,key=lambda ratio: ratio['ratio'])
print(f"Average Similarity Ratio: {ratio*100:.4f}%")
print(f"Time Elapsed: {elapsed:0.4f}s")
print(f"Time per Sample: {elapsed/size:0.4f}s")
print("")

Average Similarity Ratio: 1.4945%
Time Elapsed: 516.7505s
Time per Sample: 0.0010s



In [10]:
def print_comparison(types:str, ratios:list, max:int):
    if max > len(ratios):
        print(f"The specified `max` value ({max}) exceeds the number available ratios to select.")
        max = len(ratios)
    states = []
    output = ""
    for r in range(0,max):
        output += f"{r+1}\n"
        output += f"Malicious Hashes ({len(ratios[r]['malicious_hash'])}):\n"
        for t in range(len(ratios[r]['malicious_hash'])):
            output += f"\t{ratios[r]['malicious_hash'][t]} - {ratios[r]['type1'][t]} {ratios[r]['type2'][t]} {ratios[r]['type3'][t]}" + "\n"
        output += f"Benign Hash ({len(ratios[r]['benign_hash'])}):\n"
        for b in range(len(ratios[r]['benign_hash'])):
            output += f"\t{ratios[r]['benign_hash'][b]}\n"
        output += f"Score: {ratios[r]['ratio']:.4f}\n"
        output += f"Malicious API Call Pattern: {ratios[r]['malicious_pattern']}\n"
        output += f"Benign API Call Pattern: {ratios[r]['benign_pattern']}\n"
        output += "\n"
        for t in range(len(ratios[r]['malicious_hash'])):
            states.append(ratios[r]['type1'][t]+' '+ratios[r]['type2'][t]+' '+ratios[r]['type3'][t])
    print(output)
    print(output, file=open(f"PatternCompare_{types}.txt",'w'))
    return states

### How to read the outputs of `print_comparison()`?

```
1
Type: ['trojan', 'trojan', 'trojan', 'trojan'] ['dropper', 'hacktool', '_', 'dropper'] ['pua', '_', '_', '_']
Malicious Hash: ['490d584c7d303ed35c673460b63f3ca8', '9ab8ea1d2d68a0d4110df413e677976c', 'adbc74815ef2bd1ea4967abad812233d', 'f5a0ad49337ebc87897698e70d03364e']
Benign Hash: ['1d5be1a1f06fb4af76db691286667685', 'e17c7fc99e55278c9377e0ba686b4002']
Score: 1.0000
Malicious API Call Pattern: GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTim...
Benign API Call Pattern: GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTime...
```

The three separate lists indicate the three Popular Threat Categories of VirusTotal on the given sample. Each index of the three `Type` lists correspond to the hashes in the list shown in the `Malicious Hash` 

Examples:
1. `490d584c7d303ed35c673460b63f3ca8` = `trojan` `dropper` `pua`
2. `adbc74815ef2bd1ea4967abad812233d` = `trojan` `_` `_`
3. `9ab8ea1d2d68a0d4110df413e677976c` = `trojan` `hacktool` `_`

The `Benign Hash` represents the benign samples with a matching API Call Pattern as those malicious ones. 

The succeeding lines are `Malicious API Call Pattern` & `Benign API Call Pattern` which show the API Call Pattern (shortened in this example)

### 5.1 High Matching Ratios

Samples with high matching ratio in terms of pattern (i.e., states) are ideally the following:
1. The malicious sample is a `trojan` hence must have similar behaviors as a benign software. It might be the limitation of the dataset having only the first 100 API calls and may not have included the rest.
2. The malicious sample is falsely labelled (i.e., `_ _ _`)as malicious (i.e., false-positive) which is supported by it matching the behavior of benign samples.

In [11]:
# MOST SIMILAR API CALL PATTERNS TO BOTH MALICIOUS AND BENIGN SAMPLES
top = 20 #Prints Top 50 Most Similar
ratios.sort(reverse=True,key=lambda ratio: ratio['ratio'])
states = print_comparison("similar", ratios, 20) 
common_states = pd.Series(states).sort_values()
display(f"Malware Types of the Top {top} Most Matching API Call Patterns to Benign Samples:", pd.Series(common_states).sort_values().value_counts())

1
Malicious Hashes (4):
	490d584c7d303ed35c673460b63f3ca8 - trojan dropper pua
	9ab8ea1d2d68a0d4110df413e677976c - trojan hacktool _
	adbc74815ef2bd1ea4967abad812233d - trojan _ _
	f5a0ad49337ebc87897698e70d03364e - trojan dropper _
Benign Hash (2):
	1d5be1a1f06fb4af76db691286667685
	e17c7fc99e55278c9377e0ba686b4002
Score: 1.0000
Malicious API Call Pattern: GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegC

'Malware Types of the Top 20 Most Matching API Call Patterns to Benign Samples:'

trojan adware _             45
trojan _ _                  18
trojan dropper _             8
trojan adware downloader     6
trojan downloader adware     4
trojan dropper pua           2
trojan hacktool _            2
adware pua _                 1
downloader trojan _          1
trojan adware pua            1
trojan miner worm            1
trojan pua ransomware        1
Name: count, dtype: int64

### 5.2 Low Matching Ratios

Samples with low matching ratio in terms of pattern (i.e., states) are ideally the following:
1. The malicious sample is a `trojan` hence must have similar behaviors as a benign software. It might be the limitation of the dataset having only the first 100 API calls and may not have included the rest.
2. The malicious sample is falsely labelled as malicious (i.e., false-positive) which is supported by it matching the behavior of benign samples.

In [12]:
# MOST DIFFERENT API CALL PATTERNS TO BOTH MALICIOUS AND BENIGN SAMPLES
top = 20 #Prints Top 20
ratios.sort(reverse=False,key=lambda ratio: ratio['ratio'])
states = print_comparison("different",ratios, 20) #Prints Top 20
uncommon_states = pd.Series(states).sort_values()
display(f"Malware Types of the Top {top} Least Matching API Call Patterns to Benign Samples:", pd.Series(uncommon_states).sort_values().value_counts())

1
Malicious Hashes (5):
	1da9e321b4857c7ffc9e54a6449749c5 - trojan banker _
	1901fc38186ae1bed1b5da4874cfa382 - trojan banker _
	ead8622597de71be8384b3849c20216f - trojan banker _
	1feb2a391cd4f44a9400b9b85caa141f - trojan banker _
	ded641f741d8eda8be254e981d37c29c - trojan banker _
Benign Hash (2):
	5af545fe09a6680a1b4e1b3aaa2a8031
	fb4462fd7462e2e5dd100132ed7b38df
Score: 0.0005
Malicious API Call Pattern: NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInformationFile,NtQuerySystemInformation,NtQueryInform

'Malware Types of the Top 20 Least Matching API Call Patterns to Benign Samples:'

trojan banker _              40
trojan _ _                   34
ransomware trojan _           6
trojan adware _               5
trojan downloader _           3
trojan ransomware _           2
trojan spyware _              2
adware trojan downloader      1
ransomware trojan dropper     1
ransomware trojan pua         1
trojan dropper _              1
trojan miner spyware          1
trojan pua _                  1
Name: count, dtype: int64