# Malicious ClusterScan

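Describes/summarizes a cluster or set of clusters of verified malicious samples. Useful for when you aim to enlist the samples in each cluster that appear to be falsely labelled.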

## 1. Import Libraries/Datasets

In [1]:
import pandas as pd

# Verified malicious samples.
# This should point to a verified <DataClustering>_SampleHash_Common.csv file.
malicious_df = pd.read_csv(
    './Clustering/[EDITED]KMeans_SampleHash_Common.csv',
    low_memory=False,
)

# Benign reference samples.
# This should point to the API_Patterns.csv file.
benign_df = pd.read_csv(
    './Clustering/Benign/API_Patterns.csv',
    low_memory=False,
)

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the handle even if reading the file raises.
with open(API_LIST,"r") as API_FILE:
    # Only the first line of the file is read and split on commas.
    # readline() keeps the line terminator, so strip it; otherwise the last
    # API name would carry a trailing newline.
    APIS = API_FILE.readline().rstrip('\n').split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

def get_unique_clusters(df:pd.DataFrame):
    """Return the distinct values of the 'cluster' column as a list.

    Values are listed in order of first appearance in `df`.

    Args:
        df: DataFrame that contains a 'cluster' column.

    Returns:
        A list with one entry per distinct cluster value.
    """
    distinct_clusters = df['cluster'].unique()
    return [cluster_id for cluster_id in distinct_clusters]

## 2. DataFrame Preview

In [2]:
# Normalise the empty malware-type placeholder: every cell equal to '-'
# becomes '_' so a single placeholder is used throughout the frame.
malicious_df = malicious_df.replace(to_replace='-', value='_')
malicious_df

Unnamed: 0,cluster,hash,type1,type2,type3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


## 3. How many of the verified samples are falsely labelled?

In [3]:
# A sample is treated as falsely labelled when all three malware-type columns
# hold the '_' placeholder, i.e. it has no malware type at all.
type_columns = ['type1', 'type2', 'type3']
has_no_type = (malicious_df[type_columns] == '_').all(axis=1)

# Keep only those rows; the type columns carry no information for them.
false_labelled = malicious_df[has_no_type].drop(columns=type_columns)

print(f"No. of falsely labelled samples from verified samples: {false_labelled.shape[0]}", end="\n\n")

# Per-cluster tally of the falsely labelled samples.
print("Counts of Falsely Labelled Samples in each Cluster")
print(false_labelled['cluster'].value_counts(), end="\n\n")

display(false_labelled)

No. of falsely labelled samples from verified samples: 33

Counts of Falsely Labelled Samples in each Cluster
cluster
61    5
55    4
25    3
28    3
58    3
77    3
22    2
23    2
44    2
85    2
0     1
19    1
45    1
98    1
Name: count, dtype: int64



Unnamed: 0,cluster,hash,pattern
3,0,f6eb4841bba3a4cee747700dc0ee1609,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
95,19,9fbb71598d55fb6ac2d788a4edbe7ae6,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
111,22,0f7d7c34ed0cfd0c481df870d67ac98f,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
112,22,fce6af9ba629f6195f625217de2f2bc2,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
116,23,3555236a4ee1bd9ea8b9c33ffcf89dca,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
117,23,bf042f37cde592df4d7cd54a40b0e2bc,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
125,25,bbefd38b6cbb856a60a7e717bbece419,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
127,25,72d69a114513bf55f8efb6089423e09a,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
129,25,a8da6503536530a1da8592810550de86,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
140,28,4633b8a44132e2aa1dbb7a7fbced3090,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


## 4. Do the API Call Patterns of the falsely labelled samples match the API Call Patterns of the Benign samples?

**Remember that the samples labelled as benign in Oliveira came from Win7 executables, which guarantees that they are truly benign and makes them safe to use for comparison.**

In [4]:
# Distinct patterns of the falsely labelled samples, in order of first appearance.
# De-duplicating ensures a pattern shared by several samples is reported and
# counted only once.
unique_false_patterns = list(false_labelled['pattern'].unique())

# Patterns of falsely labelled samples that also occur among the benign samples.
same = []
print("Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples","\n")
for f in unique_false_patterns:
    benign_matches = benign_df[benign_df['pattern']==f]
    if benign_matches.shape[0]>0:
        # Iterate over the *filtered* rows; indexing the unfiltered frames by
        # position would list unrelated samples instead of the matching ones.
        false_matches = false_labelled[false_labelled['pattern']==f]
        print("API Call Pattern:")
        print("\t" + f)
        print("Clusters & Hashes of Matching Falsely Labelled Samples:")
        for cluster_id, sample_hash in zip(false_matches['cluster'], false_matches['hash']):
            print(f"\t{cluster_id} - {sample_hash}")
        print("Hashes of Benign Samples with Matching API Call Patterns:")
        for benign_hash in benign_matches['hash']:
            print(f"\t{benign_hash}")
        same.append(f)
        print("\n")
print("")
# The percentage is relative to the number of rows in benign_df; guard against
# an empty benign frame to avoid a ZeroDivisionError.
benign_total = benign_df.shape[0]
match_pct = len(same)/benign_total*100 if benign_total else 0.0
print(f"No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: {len(same)} ({match_pct:.4f}%)")

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples 

API Call Pattern:
	GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetSystemTimeAsFileTime,NtQuerySystemInformation,NtProtectVirtualM

In [5]:
# For each matching pattern, list its distinct API calls in order of first
# occurrence within the comma-separated sequence.
print("In terms of unique API Calls:")
for position, pattern in enumerate(same):
    distinct_calls = list(dict.fromkeys(pattern.split(',')))
    print(position, distinct_calls)
    print("")

In terms of unique API Calls:
0 ['GetSystemTimeAsFileTime', 'NtCreateMutant', 'NtOpenKeyEx', 'NtQueryKey', 'LdrLoadDll', 'LdrGetProcedureAddress', 'RegOpenKeyExW', 'RegQueryInfoKeyW', 'RegEnumKeyExW', 'RegEnumValueW', 'RegCloseKey', 'GetFileAttributesW', 'RegQueryValueExW', 'NtOpenFile', 'NtQueryDirectoryFile', 'NtClose', 'NtQuerySystemInformation', 'NtProtectVirtualMemory', 'GetSystemDirectoryW', 'LdrGetDllHandle', 'NtOpenKey', 'NtQueryValueKey', 'NtCreateFile', 'GetFileSize', 'NtCreateSection', 'NtMapViewOfSection', 'GetSystemInfo', 'NtUnmapViewOfSection']

