# PCA Analysis

Based from the results of **PCA All**.

## Loading Dataset

In [1]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
warnings.filterwarnings("ignore")

df = pd.read_csv('./oliveira_pca.csv')

API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:101] = row + ([307]*(100-len(row)))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove benign samples
# df = df[df['type'] != 'benign']

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#Inverse Label Encoding
df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
df = inject_patterns(df).copy(deep=True)

df

  from pandas.core import (


Injecting API patterns...


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_97,t_98,t_99,comp_1,comp_2,comp_3,pca_segment,hash,type,pattern
0,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,8.027009,-5.098860,2.266480,c_2,071e8c3f8922e186e57548cd4c703a5d,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtMapViewOfSection,NtClose,GetSystemMetrics,1.611804,1.859529,-0.805991,c_5,33f8e6d08a6aae939f25a8e0d63dd523,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,1,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,-0.354380,2.660978,-2.721049,c_5,b68abd064e975e1c6d5f25e748663076,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtWriteVirtualMemory,NtProtectVirtualMemory,NtWriteVirtualMemory,-2.380784,1.431129,1.791917,c_5,72049be7bd30ea61297ea624ae198067,trojan,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,-4.716947,-4.371760,1.031883,c_6,c9b3700a77facf29172f32df6bc77f48,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41231,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,-5.882868,-3.867033,1.650539,c_0,e3d6d58faa040f0f9742c9d0eaf58be4,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41232,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,-3.961226,-5.031721,-0.459638,c_6,9b917bab7f32188ae40c744f2be9aaf8,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41233,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,-4.512618,-3.428349,0.578631,c_0,35a18ee05f75f04912018d9f462cb990,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41234,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,-5.882868,-3.867033,1.650539,c_0,654139d715abcf7ecdddbef5a84f224b,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [2]:
print("Number of PCA Kmeans Clusters:", len(df['pca_segment'].unique()))

Number of PCA Kmeans Clusters: 7


## Exploring PCA Kmeans Clusters

In [3]:
def explore_cluster(cluster_id:str):
    #Load PCA Kmeans Cluster
    cluster = df[df['pca_segment'] == cluster_id]
    print("1. What malware types are in the cluster (including benign if any)?\n".upper())
    print("Item Classes\n",cluster['type'].value_counts())
    print("")

    print("2. What Top 10 API Call Patterns of the Cluster\n".upper())
    top5_pattern = cluster['pattern'].value_counts()[0:10]
    print("Top 10 Patterns\n", top5_pattern)
    print("")

    print("3. What sample classes use these Top 10 API Call Patterns?\n".upper())
    for p in top5_pattern.index.to_list():
        print(p, cluster[cluster['pattern'] == p]['type'].value_counts(),'\n')
    print("")

    print("4. What are the hashes of these samples?\n".upper())
    for p in top5_pattern.index.to_list():
        print(p)
        pattern_filter = cluster[cluster['pattern'] == p]
        for t in pattern_filter['type'].value_counts().index.to_list():
            hashes = pattern_filter[pattern_filter['type'] == t]['hash'].to_list()
            print("\t"+t+f" ({len(hashes)}, {int(len(hashes)*0.10)+1})")
            for h in hashes[0:int(len(hashes)*0.25)+1]:
                print("\t\t"+h)
    print("")

### c_0

In [4]:
explore_cluster("c_0")

1. WHAT MALWARE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        3403
adware         120
pua             22
downloader       6
Name: count, dtype: int64

2. WHAT TOP 10 API CALL PATTERNS OF THE CLUSTER

Top 10 Patterns
 pattern
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,LdrLoadDll,LdrGetProcedureAddress,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetProcedureAddress,CryptCreateHash,LdrGetProcedureAddress,CryptHashData,LdrGetProcedureAddress,NtCreateFile,GetFileSize,SetFilePointer,NtReadFile,NtClose,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,CoInitializeEx,RegOpenKeyExW,RegQueryValueExW,RegEnumKeyW,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegEnumKeyW,RegCloseKey,RegOpenKeyExW,RegQue

### c_1

In [5]:
explore_cluster("c_1")

1. WHAT MALWARE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        3747
pua            687
adware         553
ransomware     326
benign         275
miner           81
virus           28
downloader      16
hacktool         6
dropper          2
spyware          2
worm             1
Name: count, dtype: int64

2. WHAT TOP 10 API CALL PATTERNS OF THE CLUSTER

Top 10 Patterns
 pattern
GetSystemInfo,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemDirectoryW,NtAllocateVirtualMemory,SetErrorMode,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrLoadDll,SetErrorMode,LdrLoadDll,SetErrorMode,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW,LdrLoadDll,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,RegCloseKey,NtCreateMutant,GetNativeSystemInfo,GetSystemWindowsDirectoryW,NtClose,LdrLoadDll,Se

### c_2

In [6]:
explore_cluster("c_2")

1. WHAT MALWARE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        4259
benign          25
pua             14
adware          12
downloader       5
spyware          2
miner            1
worm             1
virus            1
Name: count, dtype: int64

2. WHAT TOP 10 API CALL PATTERNS OF THE CLUSTER

Top 10 Patterns
 pattern
RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,LoadStringA,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LoadStringA,LdrGetProcedureAddress,GetSystemMetrics,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadRes

RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,LoadStringA,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LoadStringA,LdrGetProcedureAddress,GetSystemMetrics,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,LdrGetProcedureAddress,FindResourceExW,LoadResource,FindResourceExW,LoadResource,NtAllocateVirtualMemory,GetSystemMetrics,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,FindResourceExW,LoadResource,FindResourceExW,LoadResource,DrawTextExW,GetSy

### c_3

In [7]:
explore_cluster("c_3")

1. WHAT MALWARE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        2428
adware          23
pua             13
benign          10
miner            7
downloader       5
ransomware       4
spyware          2
Name: count, dtype: int64

2. WHAT TOP 10 API CALL PATTERNS OF THE CLUSTER

Top 10 Patterns
 pattern
GetSystemTimeAsFileTime,GetSystemInfo,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW

	trojan (1024, 103)
		54d1c19fa3bd274f1c7376fb341b2848
		bf8f04b479d5e1a3d9345cad5a395023
		f0a5c81459868d6ce0c3b03c5c84a7e9
		8de40561abda4349c746a9c52fe2f815
		1de8d54ef45c22f588e2dd6c52a01025
		361b481084c41d0df4c4eb763f268043
		127c3c579480c24aa5ab6ce265072928
		11cedfa324977e4dde5e41148223b4aa
		6ec1432fba76b37d1586279b017ce09a
		29b3ae1b6d68b362ec8956b4fa7c5dd7
		d84cdff50f7f6b2d20a1c07f80eaba0f
		16e66c0cd7f5ca6a6ebb398a1837a2cb
		09571672aec8049a954cc3f0b9d3b14b
		523e640306a305acf35c11886544803e
		8c5701328ee3dd8375dd0be9e419c868
		6814527b7872411ae83b47835bb32bff
		1bcf37c9521354e42aa57bf7688f26a3
		17e2ea5508023f6565da80414f870ca4
		1d8741acd20711f17199d8e2cdd38403
		df81ce901b9f95edb93dd049f42a36ec
		3348e702acb55e6a73963bc530f26c45
		4dbd2259b79716cc2cbe6a1a5ba5ab3a
		f6c3700fecafc9a337a2d3610ca472c5
		dc4200ac514006f084ead7f83b84c928
		1d3c2d4883cb307d21ebfc13aee4af1c
		77031e1f87a144b9138dc68d65dac9f7
		a28a1afbf0de5749a27a8ba471449fa7
		3851d2d3688ebaf443143069af5d8343


### c_4

This cluster will be excempted from further analysis since the majority of benign samples wasn't found in this cluster. 

In [8]:
# explore_cluster("c_4")

### c_5

In [9]:
explore_cluster("c_5")

1. WHAT MALWARE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        11747
downloader     1809
pua            1332
adware          947
benign          755
virus            49
spyware          42
miner            35
ransomware       10
dropper           3
hacktool          1
worm              1
Name: count, dtype: int64

2. WHAT TOP 10 API CALL PATTERNS OF THE CLUSTER

Top 10 Patterns
 pattern
GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,NtAllocateVirtualMemory,SetUnhandledExceptionFilter,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,GetSystemTimeAsFileTime,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtClose,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGet

LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,NtProtectVirtualMemory,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemInfo,NtAllocateVirtualMemory,RegOpenKeyExW,FindFirstFileExW,LoadStringW,LdrLoadDll,LdrGetProcedureAddress,GetNativeSystemInfo,LoadStringW,LdrGetDllHandle,LdrGetProcedureAddress,RegOpenKeyExW,NtOpenKey,NtQueryValueKey,LdrGetDllHandl

### c_6

In [10]:
explore_cluster("c_6")

1. WHAT MALWARE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        3903
adware         141
pua             16
downloader       7
ransomware       1
Name: count, dtype: int64

2. WHAT TOP 10 API CALL PATTERNS OF THE CLUSTER

Top 10 Patterns
 pattern
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,SetUnhandledExceptionFilter,LdrLoadDll,LdrGetProcedureAddress,CryptAcquireContextW,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetProcedureAddress,CryptCreateHash,LdrGetProcedureAddress,CryptHashData,LdrGetProcedureAddress,NtCreateFile,GetFileSize,SetFilePointer,NtReadFile,NtClose,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,CoInitializeEx,RegOpenKeyExW,RegQueryValueExW,RegEnumKeyW,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegEnumKeyW,RegCloseKey,R