# PCA Analysis

Based on the results of **PCA All**.

## Loading Dataset

In [1]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
warnings.filterwarnings("ignore")

df = pd.read_csv('./oliveira_pca.csv')

API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
# Load the comma-separated API names from the first line of the file.
# The context manager closes the handle even if reading raises.
with open(API_LIST, "r") as API_FILE:
    # readline() keeps the line terminator; strip it so the last API name
    # does not end with "\n".
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

#Inverse Label Encoding
def inverse_label(item:pd.Series):
    '''Map label-encoded API call values back to their API names.

    Parameters
    ----------
    item : pd.Series
        Values that can be converted with ``int()`` and used as positions
        in the module-level ``APIS`` list (this function is applied row by
        row through ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    pd.Series
        Same index as ``item``; each value replaced by ``APIS[int(value)]``.

    Raises
    ------
    IndexError
        If a value falls outside the range of ``APIS``.
    ValueError
        If a value cannot be converted with ``int()`` (e.g. a real NaN).
    '''
    # APIS is only read here, so no `global` declaration is required.
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).

    Every element is converted with ``str()`` and the pieces are joined with
    a single comma and no trailing separator. An empty list yields ``""``.
    '''
    # str.join is linear in the output size, unlike repeated `+=` on a string.
    return ",".join(str(element) for element in ls)

def inject_patterns(inner_df:pd.DataFrame, start:int=1, stop:int=101):
    '''Injects the API call patterns of each sample as its last column

    For every row, the values in column positions ``start`` (inclusive) to
    ``stop`` (exclusive) are converted with ``str()`` and joined with commas.
    The resulting strings are stored in a ``'pattern'`` column.

    Parameters
    ----------
    inner_df : pd.DataFrame
        Frame to annotate. It is modified in place.
    start, stop : int
        Positional column range holding the API calls. The defaults (1, 101)
        reproduce the original hard-coded ``1:101`` slice.

    Returns
    -------
    pd.DataFrame
        The same ``inner_df`` object, now carrying the ``'pattern'`` column.
    '''
    print("Injecting API patterns...")
    api_calls = inner_df.iloc[:, start:stop]
    # One pass over plain tuples instead of one .iloc lookup per row.
    inner_df['pattern'] = [
        ",".join(map(str, calls))
        for calls in api_calls.itertuples(index=False, name=None)
    ]
    return inner_df

def ib_convert(input_df:pd.DataFrame, pad_value=307, start:int=1, stop:int=101):
    '''Remove repeated API calls within each row and pad the row back to width.

    For every row, the values in column positions ``start`` (inclusive) to
    ``stop`` (exclusive) are de-duplicated, keeping the first occurrence and
    the original order. The row is then right-padded with ``pad_value`` so it
    keeps its original width.

    The previous version called ``input_df.transpose()`` twice and discarded
    the result. Those calls did nothing, so they and their progress messages
    were removed; the row-wise loop never needed a transposed frame.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame to convert. It is modified in place.
    pad_value
        Filler for the freed trailing positions. The default 307 is the
        original hard-coded value.
        NOTE(review): presumably the index of the "NaN" delimiter label in
        the API list - confirm.
    start, stop : int
        Positional column range holding the API calls (defaults match the
        original ``1:101`` slice).

    Returns
    -------
    pd.DataFrame
        The same ``input_df`` object after conversion.
    '''
    print("Removing duplicates...")
    print("Row:", end=" ")
    # Use the real slice width, so frames with fewer columns still work.
    width = input_df.iloc[:, start:stop].shape[1]
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, start:stop].drop_duplicates(keep='first').to_list()
        input_df.iloc[r, start:stop] = row + [pad_value] * (width - len(row))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    return input_df

# Optional filter: drop benign samples
# df = df[df['type'] != 'benign']

# Drop samples whose type is '_' (falsely labelled malicious samples)
df = df[df['type'].ne('_')]

# Optional filter: drop specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

# Pull out the columns that need to be repositioned
type_col = df.pop('type')
hash_col = df.pop('hash')
label_col = df.pop('malware')

# Column order: 'malware' label first, the remaining columns next, then
# 'hash' and 'type' last (both are retained for the benefit of model evaluation).
df = pd.concat([label_col, df, hash_col, type_col], axis=1)

# Replace the label-encoded API calls (column positions 1-100) with API names,
# then append the comma-joined pattern of each sample.
df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
df = inject_patterns(df).copy(deep=True)

df

Injecting API patterns...


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_98,t_99,type_int,comp_1,comp_2,comp_3,pca_segment,hash,type,pattern
0,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,CreateActCtxW,GetSystemWindowsDirectoryW,0,8.028000,-5.097104,2.267355,c_5,071e8c3f8922e186e57548cd4c703a5d,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtClose,GetSystemMetrics,2,1.611520,1.859686,-0.806068,c_3,33f8e6d08a6aae939f25a8e0d63dd523,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,1,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegEnumKeyExA,RegOpenKeyExA,0,-0.354560,2.660250,-2.721597,c_3,b68abd064e975e1c6d5f25e748663076,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtProtectVirtualMemory,NtWriteVirtualMemory,0,-2.381073,1.430765,1.791827,c_1,72049be7bd30ea61297ea624ae198067,trojan,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,0,-4.715926,-4.372803,1.031899,c_0,c9b3700a77facf29172f32df6bc77f48,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41233,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,0,-5.881912,-3.868208,1.650292,c_6,e3d6d58faa040f0f9742c9d0eaf58be4,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41234,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,EnumWindows,GetSystemTimeAsFileTime,0,-3.959985,-5.032700,-0.459489,c_0,9b917bab7f32188ae40c744f2be9aaf8,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41235,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,0,-4.511803,-3.429224,0.578653,c_6,35a18ee05f75f04912018d9f462cb990,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41236,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,0,-5.881912,-3.868208,1.650292,c_6,654139d715abcf7ecdddbef5a84f224b,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [2]:
# Count the distinct cluster labels (a missing label is counted as one value).
print("Number of PCA Kmeans Clusters:", df['pca_segment'].nunique(dropna=False))

Number of PCA Kmeans Clusters: 7


## Exploring PCA Kmeans Clusters

In [3]:
def explore_cluster(cluster_id:str, data:pd.DataFrame=None, top_n:int=20, sample_ratio:float=0.1):
    '''Print an eight-section profile of one PCA K-Means cluster.

    Sections printed, in order:
      1. sample-type counts and the cluster's share of all rows,
      2. unique pattern counts (total / malicious / benign) and the patterns
         shared by malicious and benign samples,
      3. the ``top_n`` most frequent malicious patterns (first 128 characters),
      4. the ``top_n`` most frequent benign patterns (first 128 characters),
      5. the sample types using each top malicious pattern,
      6. a sample of hashes per top malicious pattern and type, printed as
         ``cluster_id,pattern_rank,type,hash``,
      7. the top malicious patterns in full,
      8. the top benign patterns in full.

    Parameters
    ----------
    cluster_id : str
        Value of ``'pca_segment'`` selecting the cluster.
    data : pd.DataFrame, optional
        Frame with the columns ``'pca_segment'``, ``'malware'``, ``'type'``,
        ``'pattern'`` and ``'hash'``. Defaults to the notebook-level ``df``.
    top_n : int
        Number of most frequent patterns to report (default 20).
    sample_ratio : float
        Fraction of hashes listed per (pattern, type) pair in section 6.
        ``int(len(hashes) * sample_ratio) + 1`` hashes are printed.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    if data is None:
        data = df  # fall back to the notebook-level DataFrame

    #Load PCA Kmeans Cluster
    cluster = data[data['pca_segment'] == cluster_id]
    malicious = cluster[cluster['malware'] == 1]
    benign = cluster[cluster['malware'] == 0]

    print("1. What sample types are in the cluster (including benign if any)?\n".upper())
    type_counts = cluster['type'].value_counts()
    print("Item Classes\n", type_counts)
    total = int(type_counts.sum())
    # Guard the percentage against an empty input frame.
    share = total / data.shape[0] * 100 if data.shape[0] else 0.0
    print("Total:", total, f"({share:.4f}%)")
    print("")

    print(f"2. How many unique API Call Patterns are there in cluster {cluster_id}?\n".upper())
    print("Total:", len(cluster['pattern'].unique()))
    mal = malicious['pattern'].unique()
    ben = benign['pattern'].unique()
    print("Malicious:", len(mal))
    print("Benign:", len(ben))
    print("Shared API Call Patterns:\n")
    # Walk the malicious patterns in order of first appearance so the listing
    # is identical on every run (iterating a set of strings is not).
    benign_lookup = set(ben)
    for shared in (p for p in mal if p in benign_lookup):
        ls = cluster[cluster['pattern'] == shared]['type'].value_counts()
        print(f"{shared[:256]}...", "\n", ls.index.tolist(), ls.tolist(),"\n")
    print("")

    # Computed once and reused by sections 3-8. `.iloc` makes the positional
    # slice explicit.
    mal_pattern = malicious['pattern'].value_counts().iloc[:top_n]
    ben_pattern = benign['pattern'].value_counts().iloc[:top_n]

    print(f"3. What Top {top_n} API Call Patterns of the Cluster (Malicious Samples)\n".upper())
    # `.items()` avoids integer `[]` look-ups on a string-indexed Series,
    # whose positional fallback is deprecated in pandas.
    for pattern, count in mal_pattern.items():
        print(f"{pattern[:128]}... - {count}")
    print("")

    print(f"4. What Top {top_n} API Call Patterns of the Cluster (Benign Samples)\n".upper())
    for pattern, count in ben_pattern.items():
        print(f"{pattern[:128]}... - {count}")
    print("")

    print(f"5. What Malware Types use these Top {top_n} API Call Patterns? (Malicious Samples)\n".upper())
    # NOTE(review): sections 5 and 6 match patterns against the whole cluster,
    # so benign samples sharing a top malicious pattern are listed too -
    # confirm this is intended.
    for p in mal_pattern.index.to_list():
        print(f"{p[:128]}...\n")
        m_types = cluster[cluster['pattern'] == p]['type'].value_counts()
        for type_name, count in m_types.items():
            print(f"{type_name} - {count}")
        print("")
    print("")

    print("6. What are the hashes of these malicious samples?\n".upper())
    for ctr, p in enumerate(mal_pattern.index.to_list()):
        pattern_filter = cluster[cluster['pattern'] == p]
        for t in pattern_filter['type'].value_counts().index.to_list():
            hashes = pattern_filter[pattern_filter['type'] == t]['hash'].to_list()
            # Always list at least one hash per (pattern, type) pair.
            for h in hashes[:int(len(hashes)*sample_ratio)+1]:
                print(cluster_id+","+str(ctr)+","+t+","+h)
    print("")

    print(f"7. What Top {top_n} API Call Patterns of the Cluster (Malicious Samples; In Whole)\n".upper())
    for pattern in mal_pattern.index:
        print(f"{pattern}\n")
    print("")

    print(f"8. What Top {top_n} API Call Patterns of the Cluster (Benign Samples; In Whole)\n".upper())
    for pattern in ben_pattern.index:
        print(f"{pattern}\n")
    print("")

### c_0

In [4]:
# Print the cluster report for segment c_0.
explore_cluster(cluster_id="c_0")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        4654
adware         167
pua             22
downloader       7
Name: count, dtype: int64
Total: 4850 (11.7610%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_0?

Total: 2725
Malicious: 2725
Benign: 0
Shared API Call Patterns:


3. WHAT TOP 20 API CALL PATTERNS OF THE CLUSTER (MALICIOUS SAMPLES)

GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 343
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 168
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 135
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 134
GetSystemTimeAsFileTime,LdrLoadDll,L

### c_1

In [5]:
# Print the cluster report for segment c_1.
explore_cluster(cluster_id="c_1")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        4242
pua            710
adware         558
ransomware     330
benign         286
miner           87
virus           29
downloader      20
spyware          8
hacktool         5
dropper          2
worm             2
Name: count, dtype: int64
Total: 6279 (15.2262%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_1?

Total: 1952
Malicious: 1717
Benign: 241
Shared API Call Patterns:

LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetP... 
 ['trojan', 'benign'] [1, 1] 

SetErrorMode,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemDirectoryW,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrLoadDll,GetSystemDirectoryW,LdrLoadDll,GetSystemDirectoryW,GetSystemW

### c_2

In [6]:
# Print the cluster report for segment c_2.
explore_cluster(cluster_id="c_2")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        4366
downloader     403
pua             64
benign          62
adware          57
virus            4
miner            3
Name: count, dtype: int64
Total: 4959 (12.0253%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_2?

Total: 213
Malicious: 162
Benign: 51
Shared API Call Patterns:


3. WHAT TOP 20 API CALL PATTERNS OF THE CLUSTER (MALICIOUS SAMPLES)

NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHand... - 1093
NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHand... - 861
NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHand... - 801
NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,Ld

### c_3

In [7]:
# Print the cluster report for segment c_3.
explore_cluster(cluster_id="c_3")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        10659
downloader     1803
pua            1296
adware          904
benign          700
virus            45
spyware          36
miner            27
ransomware        6
dropper           3
hacktool          2
Name: count, dtype: int64
Total: 15481 (37.5406%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_3?

Total: 6254
Malicious: 5949
Benign: 307
Shared API Call Patterns:

GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDl... 
 ['trojan', 'benign'] [1, 1] 

GetSystemTimeAsFileTime,NtOpenKey,NtQueryValueKey,NtClose,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW

### c_4

In [8]:
# Print the cluster report for segment c_4.
explore_cluster(cluster_id="c_4")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        2517
adware          23
pua             13
benign          11
miner            7
downloader       5
ransomware       4
spyware          2
Name: count, dtype: int64
Total: 2582 (6.2612%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_4?

Total: 27
Malicious: 25
Benign: 6
Shared API Call Patterns:

GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,Ld... 
 ['trojan', 'benign', 'miner'] [6, 5, 1] 

GetSystemTimeAsFileTime,GetSystemInfo,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,L... 
 ['trojan', 'miner', '

### c_5

In [9]:
# Print the cluster report for segment c_5.
explore_cluster(cluster_id="c_5")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        4253
benign          19
pua             14
adware          12
downloader       5
spyware          2
miner            1
worm             1
virus            1
Name: count, dtype: int64
Total: 4308 (10.4467%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_5?

Total: 273
Malicious: 255
Benign: 18
Shared API Call Patterns:


3. WHAT TOP 20 API CALL PATTERNS OF THE CLUSTER (MALICIOUS SAMPLES)

RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVi... - 3305
RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVi... - 44
RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVi... - 37
RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose

c_5,18,trojan,6b638207e887929c4903d8a6930f6897
c_5,18,trojan,f672964a33aac309f4ea54d93fb5533c
c_5,18,trojan,d1cde7821c33f1b0621ea0b7426d84e6
c_5,18,trojan,ddff91e7b840a31c193c41dc97f07b25
c_5,18,trojan,9a055fd20637ea7c3c3eeb678def6623
c_5,18,trojan,81f9a072f94e63307d57a7f2cf86e853
c_5,18,trojan,22965189e8327de9be5363cbce9a4b0f
c_5,18,trojan,2f6433bcaea2a416e36c257e997816c0
c_5,18,trojan,e05ff53b7b20b952c46de773c8084070
c_5,18,trojan,d01d1e9811f0eb5d1b0fde6994f02232
c_5,18,trojan,aab9a65a870337e5583013c6218407ba
c_5,18,trojan,a4549bd444a89f1970ea913602b36f51
c_5,18,trojan,ebc1761c9188b2fed5429b3812e43025
c_5,18,pua,57053363499d350a72d6cf95b29631f7
c_5,19,trojan,a9b3f3f84ea5cfcf0fcf44d4fda5e86d
c_5,19,trojan,6b106429d95d6532c62fd679f2b492a1
c_5,19,trojan,55f1c3e0aa031a198bce930eb7606a0c
c_5,19,trojan,c9d2cb122eb8f7f04e8983020acaaaf4
c_5,19,trojan,c2112affc07f2739515a32a9905c4579
c_5,19,trojan,a368108e44e57cede72da04f8074079a
c_5,19,trojan,f8f7be4300f5ab87e298546a75b4b0d1
c_5,19,trojan,ea

### c_6

In [10]:
# Print the cluster report for segment c_6.
explore_cluster(cluster_id="c_6")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        2661
adware          94
pua             16
downloader       6
benign           1
ransomware       1
Name: count, dtype: int64
Total: 2779 (6.7389%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_6?

Total: 660
Malicious: 659
Benign: 1
Shared API Call Patterns:


3. WHAT TOP 20 API CALL PATTERNS OF THE CLUSTER (MALICIOUS SAMPLES)

GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 738
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 200
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 76
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 69
Get