# PCA Analysis

Based from the results of **PCA All**.

## Loading Dataset

In [1]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
warnings.filterwarnings("ignore")

df = pd.read_csv('./oliveira_pca.csv')

API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:101] = row + ([307]*(100-len(row)))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove benign samples
# df = df[df['type'] != 'benign']

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#Inverse Label Encoding
df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
df = inject_patterns(df).copy(deep=True)

df

Injecting API patterns...


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_97,t_98,t_99,comp_1,comp_2,comp_3,Segment,hash,type,pattern
0,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,8.027009,-5.098860,2.266480,c_4,071e8c3f8922e186e57548cd4c703a5d,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtMapViewOfSection,NtClose,GetSystemMetrics,1.611804,1.859529,-0.805991,c_3,33f8e6d08a6aae939f25a8e0d63dd523,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,1,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,-0.354380,2.660978,-2.721049,c_3,b68abd064e975e1c6d5f25e748663076,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtWriteVirtualMemory,NtProtectVirtualMemory,NtWriteVirtualMemory,-2.380784,1.431129,1.791917,c_5,72049be7bd30ea61297ea624ae198067,trojan,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,-4.716947,-4.371760,1.031883,c_2,c9b3700a77facf29172f32df6bc77f48,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41231,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,-5.882868,-3.867033,1.650539,c_2,e3d6d58faa040f0f9742c9d0eaf58be4,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41232,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,-3.961226,-5.031721,-0.459638,c_2,9b917bab7f32188ae40c744f2be9aaf8,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41233,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,-4.512618,-3.428349,0.578631,c_2,35a18ee05f75f04912018d9f462cb990,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41234,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,-5.882868,-3.867033,1.650539,c_2,654139d715abcf7ecdddbef5a84f224b,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


## Exploring PCA Kmeans Clusters

In [2]:
def explore_cluster(cluster_id:str):
    #Load PCA Kmeans Cluster
    cluster = df[df['Segment'] == cluster_id]
    #Check number of malware types in the cluster (including benign if any)
    display(df['type'].value_counts())

### c_0

In [8]:
def explore_cluster(cluster_id:str):
    #Load PCA Kmeans Cluster
    cluster = df[df['Segment'] == cluster_id]
    #Check number of malware types in the cluster (including benign if any)
    display("Item Classes",df['type'].value_counts())
    #Check the Top 5 API Call Patterns of the Cluster
    top5_pattern = df['pattern'].value_counts()[0:5]
    display("Top 5 Patterns", top5_pattern)
    #What sample classes calls these Top 5 API Calls

explore_cluster("c_0")

'Item Classes'

type
trojan        33352
downloader     2249
pua            2135
adware         1815
benign         1077
ransomware      341
miner           125
virus            79
spyware          48
hacktool          7
dropper           5
worm              3
Name: count, dtype: int64

'Top 5 Patterns'

pattern
RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,NtAllocateVirtualMemory,LoadStringA,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LoadStringA,LdrGetProcedureAddress,GetSystemMetrics,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,LdrGetProcedureAddress,FindResourceExW,LoadResource,FindResourceExW,LoadResource,NtAllocateVirtualMemory,GetSystemMetrics,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,FindResourceExW,LoadResource,FindResourceExW,LoadResource,DrawTextE