# PCA

**References:**
- https://365datascience.com/tutorials/python-tutorials/pca-k-means/
- https://www.datacamp.com/tutorial/introduction-t-sne
- https://www.geeksforgeeks.org/difference-between-pca-vs-t-sne/

The contents of this notebook only clusters both **benign & malicious** samples.

**Objectives**
- To reduce dimensionality of the dataset.
- To obtain optimum number of features whilst retaining dataset's context and value.


## 1. Import Data

In [1]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
warnings.filterwarnings("ignore")

df = pd.read_csv('./Dataset/oliveira_labelled.csv')

API_LIST = "./Dataset/api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Inverse Label Encoding
def inverse_label(item:str):
    global APIS
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def inject_patterns(inner_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inner_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    return inner_df # DBSCAN requires only the numeric label encoded version of the API Calls

def ib_convert(input_df:pd.DataFrame):
    print("Transposing IB...")
    input_df.transpose()
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        row = input_df.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
        input_df.iloc[r, 1:101] = row + ([307]*(100-len(row)))
        if r % 100 == 0:
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    input_df.transpose()
    print("IB Retransposed!")
    return input_df

# Remove benign samples
# df = df[df['type'] != 'benign']

# Remove falsely labelled malicious samples
df = df[df['type'] != '_']

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

#Remove type column
type_col = df.pop('type')

#Removing hash column
hash_col = df.pop('hash')

#Re-arranging column positions
label_col = df.pop('malware')
df = pd.concat([label_col, df], axis=1)
df = pd.concat([df, hash_col], axis=1) # <=== This will be retained for the benefit of model evaluation.
df = pd.concat([df, type_col], axis=1) # <=== This will be retained for the benefit of model evaluation.

df

Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type
0,1,112,274,158,215,274,158,215,298,76,...,297,135,171,215,35,208,56,71,071e8c3f8922e186e57548cd4c703a5d,trojan
1,1,82,208,187,208,172,117,172,117,172,...,240,117,71,297,135,171,215,35,33f8e6d08a6aae939f25a8e0d63dd523,pua
2,1,16,110,240,117,240,117,240,117,240,...,112,123,65,112,123,65,113,112,b68abd064e975e1c6d5f25e748663076,trojan
3,1,82,208,187,208,172,117,172,117,172,...,302,208,302,187,208,302,228,302,72049be7bd30ea61297ea624ae198067,trojan
4,1,82,240,117,240,117,240,117,240,117,...,260,40,209,260,141,260,141,260,c9b3700a77facf29172f32df6bc77f48,trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,e3d6d58faa040f0f9742c9d0eaf58be4,trojan
43872,1,82,240,117,240,117,240,117,240,117,...,224,82,159,224,82,159,224,82,9b917bab7f32188ae40c744f2be9aaf8,trojan
43873,1,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,35a18ee05f75f04912018d9f462cb990,trojan
43874,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,654139d715abcf7ecdddbef5a84f224b,trojan


In [2]:
#Convert malware types to its numeric equivalents
malware_types = ['trojan', 'downloader', 'pua', 'adware', 'ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm', 'benign']

print("Conversion of Malware Type to its Ordinal Encoded Form:")
for m in range(len(malware_types)):
    print(f"{malware_types[m]:10s} = {m}")

malware_types_int = []
for i in range(df.shape[0]):
    malware_types_int.append(malware_types.index(df.iloc[i,102]))
df.insert(103, "type_int", malware_types_int, True)
df

Conversion of Malware Type to its Ordinal Encoded Form:
trojan     = 0
downloader = 1
pua        = 2
adware     = 3
ransomware = 4
miner      = 5
virus      = 6
spyware    = 7
hacktool   = 8
dropper    = 9
worm       = 10
benign     = 11


Unnamed: 0,malware,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,hash,type,type_int
0,1,112,274,158,215,274,158,215,298,76,...,135,171,215,35,208,56,71,071e8c3f8922e186e57548cd4c703a5d,trojan,0
1,1,82,208,187,208,172,117,172,117,172,...,117,71,297,135,171,215,35,33f8e6d08a6aae939f25a8e0d63dd523,pua,2
2,1,16,110,240,117,240,117,240,117,240,...,123,65,112,123,65,113,112,b68abd064e975e1c6d5f25e748663076,trojan,0
3,1,82,208,187,208,172,117,172,117,172,...,208,302,187,208,302,228,302,72049be7bd30ea61297ea624ae198067,trojan,0
4,1,82,240,117,240,117,240,117,240,117,...,40,209,260,141,260,141,260,c9b3700a77facf29172f32df6bc77f48,trojan,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43871,1,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,e3d6d58faa040f0f9742c9d0eaf58be4,trojan,0
43872,1,82,240,117,240,117,240,117,240,117,...,82,159,224,82,159,224,82,9b917bab7f32188ae40c744f2be9aaf8,trojan,0
43873,1,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,35a18ee05f75f04912018d9f462cb990,trojan,0
43874,1,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,654139d715abcf7ecdddbef5a84f224b,trojan,0


## 2. Applying PCA

### 2.1 Applying StandardScaler

In [3]:
# Standardization
sc = StandardScaler()
segmentation_std = sc.fit_transform(df.iloc[:,1:101])
# pca = PCA(random_state=1)
# pca.fit(segmentation_std)

### 2.2. Visualizing Ideal `n_components` value

In [4]:
# plt.clf()
# plt.figure(figsize=(10,6), dpi=300)
# plt.plot(range(1,101),pca.explained_variance_ratio_.cumsum(), marker = 'o', linestyle='--')
# plt.title('Explained Variance by Components')
# plt.xlabel('Number of Components')
# plt.ylabel('Cumulative Explained Variance')
# plt.xticks(np.arange(0, 100, 5))
# plt.yticks(np.arange(0, 1.05, .05))
# plt.show()

While Dr. Mac suggested values of 2,3,4, and 16, the PCA variance only reaches >=95% at **69 components** which is still a lot in the grand scheme of things.

For Comparison (at 1e10):
- 2 = ~0.2500
- 3 = ~0.3250
- 4 = ~0.3750
- 16 = ~0.6500
- 31 = ~0.8500

### 2.3. Applying `n_components` to PCA

In [5]:
N_COMPONENTS = 69
pca = PCA(n_components=N_COMPONENTS, random_state=1)
pca.fit(segmentation_std)
scores_pca = pca.transform(segmentation_std)

### 2.4. Applying PCA results to K-Means

In [6]:
# wcss = [] #Within Cluster Sum of Squares or WCSS
# silhouette = []
# for i in range(1,100):
#     kmeans_pca = KMeans(n_clusters=i, random_state=1)
#     kmeans_pca.fit(scores_pca)
#     wcss.append(kmeans_pca.inertia_)
#     #silhouette.append(silhouette)
    
# plt.clf()
# plt.figure(figsize=(10,5), dpi=300)
# plt.plot(range(1,100), wcss, marker='o', linestyle='--')
# plt.xlabel("Number of Clusters")
# plt.ylabel("WCSS")
# plt.xticks(np.arange(0, 105, 5))
# plt.title("K-Means with PCA Clustering")
# plt.show()

In [7]:
def kmeans_test(CLUSTERS:int):
    kmeans = KMeans(n_clusters = CLUSTERS, random_state=1)
    kmeans.fit(df.iloc[:,1:101])
    silhouette = silhouette_score(scores_pca, kmeans.labels_, random_state=1)
    davies_bouldin = davies_bouldin_score(scores_pca, kmeans.labels_)
    print(f"Silhouette Score: {silhouette:.4f}")
    print(f"Davies Bouldin: {davies_bouldin:.4f}\n")
    return kmeans

# As per WCSS (n_components=3)
k8 = kmeans_test(8)

# As per WCSS (n_components=69)
k12 = kmeans_test(12)

# Preferrably, let's just stick with the actual number of malware types which there are 11.
CLUSTERS = 12

Silhouette Score: 0.2753
Davies Bouldin: 2.0902

Silhouette Score: 0.2705
Davies Bouldin: 2.0444



### 2.5. Aggregating PCA Dataframes

In [8]:
component_list = []
for i in range(1,N_COMPONENTS+1):
    component_list.append(f"comp_{i}")
cluster_id = []
for i in range(CLUSTERS):
    cluster_id.append(f"c_{i}")
    
pca_kmeans = pd.concat([df["malware"].reset_index(drop=True), 
                        pd.DataFrame(scores_pca, columns=component_list), 
                        df.iloc[:,1:101].reset_index(drop=True), df["type"].reset_index(drop=True), 
                        df["hash"].reset_index(drop=True)], axis=1)
pca_kmeans['Segment'] = k12.labels_
pca_kmeans['Segment'] = pca_kmeans['Segment'].map({0:'c_0', 1:'c_1', 2:'c_2', 3:'c_3', 4:'c_4',
                                                               5:'c_5', 6:'c_6', 7:'c_7', 8:'c_8', 9:'c_9',
                                                               10:'c_10', 11:'c_11', 12:'c_12', 13:'c_13', 14:'c_14',
                                                               15:'c_15', 16:'c_16', 17:'c_17', 18:'c_18', 19:'c_19'})
# pca_kmeans['Segment'].fillna('unseg', inplace=True)
# pca_kmeans = pca_kmeans.copy(deep=True)
display(pca_kmeans)

Unnamed: 0,malware,comp_1,comp_2,comp_3,comp_4,comp_5,comp_6,comp_7,comp_8,comp_9,...,t_93,t_94,t_95,t_96,t_97,t_98,t_99,type,hash,Segment
0,1,8.027009,-5.098860,2.266475,1.431478,0.721348,0.021623,0.351406,-0.202911,-0.796159,...,135,171,215,35,208,56,71,trojan,071e8c3f8922e186e57548cd4c703a5d,c_2
1,1,1.611804,1.859529,-0.805995,0.608988,-2.197815,0.372387,0.996577,0.128649,0.205597,...,117,71,297,135,171,215,35,pua,33f8e6d08a6aae939f25a8e0d63dd523,c_5
2,1,-0.354380,2.660978,-2.721029,1.243753,-0.604172,0.142703,4.158563,0.605894,-1.318086,...,123,65,112,123,65,113,112,trojan,b68abd064e975e1c6d5f25e748663076,c_5
3,1,-2.380784,1.431129,1.791900,2.110833,0.367195,-1.849076,-0.802305,0.699975,0.805707,...,208,302,187,208,302,228,302,trojan,72049be7bd30ea61297ea624ae198067,c_11
4,1,-4.716947,-4.371760,1.031914,-0.983038,-0.935810,-0.783044,-0.032827,1.024113,-0.353131,...,40,209,260,141,260,141,260,trojan,c9b3700a77facf29172f32df6bc77f48,c_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41231,1,-5.882868,-3.867033,1.650529,-2.728901,-1.147328,3.533687,1.299515,-2.025259,0.426001,...,141,260,141,260,141,260,141,trojan,e3d6d58faa040f0f9742c9d0eaf58be4,c_7
41232,1,-3.961226,-5.031721,-0.459609,-0.383707,0.535719,-1.302950,1.127014,2.419843,-0.553669,...,82,159,224,82,159,224,82,trojan,9b917bab7f32188ae40c744f2be9aaf8,c_0
41233,1,-4.512618,-3.428349,0.578645,-1.804005,-0.814422,-1.630722,-0.634713,-1.608661,-1.553272,...,260,141,260,141,260,141,260,trojan,35a18ee05f75f04912018d9f462cb990,c_7
41234,1,-5.882868,-3.867033,1.650529,-2.728901,-1.147328,3.533687,1.299515,-2.025259,0.426001,...,141,260,141,260,141,260,141,trojan,654139d715abcf7ecdddbef5a84f224b,c_7


### 2.7. Exporting Dataset for ML Use

In [9]:
pca_kmeans.to_csv('oliveira_pca_69.csv', index=False)