# PCA Analysis

Based from the results of **PCA All**.

## Loading Dataset

In [1]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
warnings.filterwarnings("ignore")

df = pd.read_csv('./oliveira_pca.csv')

API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:101] = row + ([307]*(100-len(row)))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove benign samples
# df = df[df['type'] != 'benign']

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

#Inverse Label Encoding
df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
df = inject_patterns(df).copy(deep=True)

df

  from pandas.core import (


Injecting API patterns...


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_97,t_98,t_99,comp_1,comp_2,comp_3,pca_segment,hash,type,pattern
0,1,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,8.027009,-5.098860,2.266480,c_2,071e8c3f8922e186e57548cd4c703a5d,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtMapViewOfSection,NtClose,GetSystemMetrics,1.611804,1.859529,-0.805991,c_5,33f8e6d08a6aae939f25a8e0d63dd523,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,1,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,-0.354380,2.660978,-2.721049,c_5,b68abd064e975e1c6d5f25e748663076,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,1,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtWriteVirtualMemory,NtProtectVirtualMemory,NtWriteVirtualMemory,-2.380784,1.431129,1.791917,c_5,72049be7bd30ea61297ea624ae198067,trojan,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,-4.716947,-4.371760,1.031883,c_6,c9b3700a77facf29172f32df6bc77f48,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41231,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,-5.882868,-3.867033,1.650539,c_0,e3d6d58faa040f0f9742c9d0eaf58be4,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41232,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,-3.961226,-5.031721,-0.459638,c_6,9b917bab7f32188ae40c744f2be9aaf8,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41233,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,-4.512618,-3.428349,0.578631,c_0,35a18ee05f75f04912018d9f462cb990,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41234,1,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,-5.882868,-3.867033,1.650539,c_0,654139d715abcf7ecdddbef5a84f224b,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [2]:
print("Number of PCA Kmeans Clusters:", len(df['pca_segment'].unique()))

Number of PCA Kmeans Clusters: 7


## Exploring PCA Kmeans Clusters

In [3]:
def explore_cluster(cluster_id:str):
    #Load PCA Kmeans Cluster
    cluster = df[df['pca_segment'] == cluster_id]
    print("1. What sample types are in the cluster (including benign if any)?\n".upper())
    print("Item Classes\n",cluster['type'].value_counts())
    total = sum(cluster['type'].value_counts())
    print("Total:", total, f"({total/df.shape[0]*100:.4f}%)")
    print("")

    print(f"2. How many unique API Call Patterns are there in cluster {cluster_id}?\n".upper())
    print("Total:", len(cluster['pattern'].unique()))
    mal = cluster[cluster['malware']==1]['pattern'].unique()
    ben = cluster[cluster['malware']==0]['pattern'].unique()
    print("Malicious:", len(mal))
    print("Benign:", len(ben))
    print("Shared API Call Patterns:\n")
    for i in list(set(mal).intersection(set(ben))):
        ls = cluster[cluster['pattern'] == i]['type'].value_counts()
        print(f"{i[:256]}...", "\n", ls.index.tolist(), ls.tolist(),"\n")
    print("")
    
    print("3. What Top 20 API Call Patterns of the Cluster (Malicious Samples)\n".upper())
    mal_pattern = cluster[cluster['malware']==1]['pattern'].value_counts()[0:20]
    for i in range(len(mal_pattern)):
        print(f"{mal_pattern.index[i][:128]}... - {mal_pattern[i]}")
    print("")
    
    print("4. What Top 20 API Call Patterns of the Cluster (Benign Samples)\n".upper())
    ben_pattern = cluster[cluster['malware']==0]['pattern'].value_counts()[0:20]
    for i in range(len(ben_pattern)):
        print(f"{ben_pattern.index[i][:128]}... - {ben_pattern[i]}")
    print("")

    print("5. What Malware Types use these Top 20 API Call Patterns? (Malicious Samples)\n".upper())
    for p in mal_pattern.index.to_list():
        print(f"{p[:128]}...\n")
        m_types = cluster[cluster['pattern'] == p]['type'].value_counts()
        for i in range(len(m_types)):
            print(f"{m_types.index[i]} - {m_types[i]}")
        print("")
    print("")

    print("6. What are the hashes of these malicious samples?\n".upper())
    sample_ratio = 0.1
    ctr = 0
    for p in mal_pattern.index.to_list():
        #print(f"{p[:128]}...", end=",")
        pattern_filter = cluster[cluster['pattern'] == p]
        for t in pattern_filter['type'].value_counts().index.to_list():
            hashes = pattern_filter[pattern_filter['type'] == t]['hash'].to_list()
            #print(t+f" ({len(hashes)}, {int(len(hashes)*sample_ratio)+1})")
            for h in hashes[0:int(len(hashes)*sample_ratio)+1]:
                print(cluster_id+","+str(ctr)+","+t+","+h)
        ctr += 1
    print("")
    
    print("7. What Top 20 API Call Patterns of the Cluster (Malicious Samples; In Whole)\n".upper())
    mal_pattern = cluster[cluster['malware']==1]['pattern'].value_counts()[0:20]
    for i in range(len(mal_pattern)):
        print(f"{mal_pattern.index[i]}\n")
    print("")
    
    print("8. What Top 20 API Call Patterns of the Cluster (Benign Samples; In Whole)\n".upper())
    ben_pattern = cluster[cluster['malware']==0]['pattern'].value_counts()[0:20]
    for i in range(len(ben_pattern)):
        print(f"{ben_pattern.index[i]}\n")
    print("")

### c_0

In [4]:
explore_cluster("c_0")

1. WHAT SAMPLE TYPES ARE IN THE CLUSTER (INCLUDING BENIGN IF ANY)?

Item Classes
 type
trojan        3403
adware         120
pua             22
downloader       6
Name: count, dtype: int64
Total: 3551 (8.6114%)

2. HOW MANY UNIQUE API CALL PATTERNS ARE THERE IN CLUSTER C_0?

Total: 1266
Malicious: 1266
Benign: 0
Shared API Call Patterns:


3. WHAT TOP 20 API CALL PATTERNS OF THE CLUSTER (MALICIOUS SAMPLES)

GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 738
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 200
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 135
GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,Ld... - 76
GetSystemTimeAsFileTime,LdrLoadDll,Ldr

TypeError: can only concatenate str (not "int") to str

### c_1

In [None]:
explore_cluster("c_1")

### c_2

In [None]:
explore_cluster("c_2")

### c_3

In [None]:
explore_cluster("c_3")

### c_4

In [None]:
explore_cluster("c_4")

### c_5

In [None]:
explore_cluster("c_5")

### c_6

In [None]:
explore_cluster("c_6")