# Malicious ClusterScan

Describes/summarizes a cluster or a set of clusters. Useful when you aim to list the malware types present in each cluster.

## 1. Import Libraries/Datasets

In [1]:
from collections import Counter

import pandas as pd

#This should point to a VirusTotal verified <Data Clustering>_SampleHash_Common.csv file
malicious_df = pd.read_csv('./Clustering/[EDITED]KMeans_SampleHash_Common.csv', low_memory=False, index_col=False)

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
#Only the first line of the file is read and split on commas.
#The context manager closes the file even if reading raises.
with open(API_LIST,"r") as API_FILE:
    #readline() keeps the line ending; strip it so the last API name does not end in '\n'
    APIS = API_FILE.readline().rstrip('\r\n').split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

def get_unique_clusters(df:pd.DataFrame):
    """Return the distinct values of the 'cluster' column of `df` as a list.

    The values are listed in the order `Series.unique()` yields them.
    """
    cluster_column = df['cluster']
    return [cluster_id for cluster_id in cluster_column.unique()]

## 2. DataFrame Preview

In [2]:
#Normalise the "no malware type" marker: cells equal to '-' become '_' for consistency
malicious_df.replace('-', '_', inplace=True)
malicious_df

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


In [3]:
#Drop rows that carry no label at all (i.e. '_' on all three popularity levels of VirusTotal)
type_levels = malicious_df[['Type 1', 'Type 2', 'Type 3']]
unlabelled_rows = type_levels.eq('_').all(axis=1) #True where every type column is '_'
malicious_df.drop(index=malicious_df.index[unlabelled_rows], inplace=True)
malicious_df

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
5,1,1ff43aa97f19dc8543aeaa1cd53e3885,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,trojan,dropper,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,trojan,miner,hacktool,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


## 3. Identify "What Malware Types are there in each cluster?"

This indicates which individual malware types are most associated with each cluster in terms of quantity, regardless of their popularity order (Type 1, Type 2, Type 3) as given by VirusTotal (VT).

In [4]:
#Identify the overall list of types each cluster is as designated by VirusTotal.
unique_clusters = get_unique_clusters(malicious_df)
type_columns = ['Type 1', 'Type 2', 'Type 3']

summary = []
for u in unique_clusters:
    cluster_rows = malicious_df[malicious_df['cluster'] == u] #read-only view of this cluster; no copy needed
    #Combine the 3 levels of classifications of VirusTotal: all 'Type 1' labels, then 'Type 2', then 'Type 3'
    labels = [label for column in type_columns for label in cluster_rows[column]]
    #Count every label once instead of calling list.count() per element inside a sort key.
    #most_common() lists labels by descending count; labels with equal counts keep first-seen order.
    label_counts = Counter(labels)
    ranked_types = [label for label, _count in label_counts.most_common() if label != '_'] #'_' marks "no type"
    summary.append([int(u), ' '.join(ranked_types)])
summary = pd.DataFrame(summary, columns=['cluster', 'types'])
for cluster_id, type_summary in zip(summary['cluster'], summary['types']):
    print(f"{cluster_id:2d} | {type_summary}")
# Note that the malware types of each cluster are ordered from most to least frequent across Type 1, Type 2, and Type 3 combined
# (ties keep the order in which the types were first encountered, scanning Type 1, then Type 2, then Type 3).

 0 | trojan dropper hacktool pua
 1 | trojan adware
 2 | trojan adware pua
 3 | trojan adware
 4 | downloader adware trojan
 5 | trojan dropper miner worm
 6 | trojan downloader ransomware adware pua
 7 | trojan adware
 8 | trojan adware
 9 | trojan adware downloader pua
10 | trojan adware
11 | trojan adware dropper
12 | trojan adware
13 | downloader adware trojan
14 | trojan adware virus
15 | trojan ransomware dropper pua
16 | trojan adware pua
17 | trojan spyware
18 | trojan adware
19 | adware trojan pua
20 | adware trojan virus dropper
21 | adware trojan downloader virus
22 | trojan downloader adware pua
23 | trojan adware pua spyware downloader
24 | trojan adware
25 | pua adware trojan ransomware
26 | trojan adware
27 | trojan spyware
28 | softomate trojan virus adware
29 | trojan adware
30 | trojan downloader
31 | trojan adware
32 | trojan adware pua
33 | trojan adware
34 | trojan adware
35 | trojan adware
36 | trojan adware pua downloader
37 | trojan adware
38 | trojan dropper
39

In [9]:
# Summarize Clusters that have the same malware types as per VirusTotal.
unique_type_summary = list(summary['types'].unique())
print("# of Unique Type Summaries:", len(unique_type_summary),"\n")

count_summary = []
for i, u in enumerate(unique_type_summary): # reuse the list computed above instead of recomputing unique()
    # Filter once per summary; tolist() yields plain Python scalars so the printed list
    # reads the same regardless of how the installed numpy version renders its scalars.
    matching_clusters = summary.loc[summary['types'] == u, 'cluster'].tolist()
    print("Malware Type Summary:", u)
    print("Matching Clusters:", matching_clusters)
    count_summary.append([i, u, len(matching_clusters)])
    print("")
count_summary.sort(key=lambda x: x[2]) # ascending by number of matching clusters

# The first column is the position of the summary in unique_type_summary, not a cluster ID,
# so it is labelled 'summary_id' rather than 'cluster'.
pd.DataFrame(count_summary, columns=['summary_id','malware_type_summary', 'count'])

# of Unique Type Summaries: 40 

Malware Type Summary: trojan dropper hacktool pua
Matching Clusters: [0]

Malware Type Summary: trojan adware
Matching Clusters: [1, 3, 7, 8, 10, 12, 18, 24, 26, 29, 31, 33, 34, 35, 37, 42, 46, 47, 57, 63, 66, 68, 74, 80, 84, 92, 94]

Malware Type Summary: trojan adware pua
Matching Clusters: [2, 16, 32, 52, 54, 56, 72, 86]

Malware Type Summary: downloader adware trojan
Matching Clusters: [4, 13, 73]

Malware Type Summary: trojan dropper miner worm
Matching Clusters: [5]

Malware Type Summary: trojan downloader ransomware adware pua
Matching Clusters: [6]

Malware Type Summary: trojan adware downloader pua
Matching Clusters: [9, 71]

Malware Type Summary: trojan adware dropper
Matching Clusters: [11]

Malware Type Summary: trojan adware virus
Matching Clusters: [14, 39, 41, 43, 62, 64, 76, 79, 82, 90, 93]

Malware Type Summary: trojan ransomware dropper pua
Matching Clusters: [15]

Malware Type Summary: trojan spyware
Matching Clusters: [17, 27]

Malwa

Unnamed: 0,cluster,malware_type_summary,count
0,0,trojan dropper hacktool pua,1
1,4,trojan dropper miner worm,1
2,5,trojan downloader ransomware adware pua,1
3,7,trojan adware dropper,1
4,9,trojan ransomware dropper pua,1
5,11,adware trojan pua,1
6,12,adware trojan virus dropper,1
7,13,adware trojan downloader virus,1
8,14,trojan downloader adware pua,1
9,15,trojan adware pua spyware downloader,1
