# PCA

**References:**
- https://365datascience.com/tutorials/python-tutorials/pca-k-means/
- https://www.datacamp.com/tutorial/introduction-t-sne
- https://www.geeksforgeeks.org/difference-between-pca-vs-t-sne/

This notebook clusters only the **malicious** samples; benign samples are removed during the import step below.

**Objectives**
- To reduce dimensionality of the dataset.
- To obtain the optimum number of features whilst retaining the dataset's context and value.


## 1. Import Data

In [None]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA, TruncatedSVD
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
# NOTE: this silences every warning for the rest of the notebook, including
# deprecation notices raised by pandas and scikit-learn.
warnings.filterwarnings("ignore")

df = pd.read_csv('../Dataset/oliveira_labelled.csv')

API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"
# Only the first line of the file is read and split on commas. The context
# manager closes the handle even if reading fails, and the line terminator is
# stripped so the last API name does not carry a trailing newline.
with open(API_LIST, "r") as API_FILE:
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

#Inverse Label Encoding
def inverse_label(item: pd.Series, apis=None):
    '''Map label-encoded values back to their API names.

    Each element of ``item`` is converted with ``int()`` and used as an index
    into the lookup list.

    item: pandas Series (anything exposing ``.map``) of label-encoded values.
    apis: optional list of names to index into; defaults to the module-level
          ``APIS`` list when omitted.

    Returns a Series with the same index holding the looked-up names.
    Raises IndexError when an encoded value is outside the lookup list.
    '''
    # Reading a module-level name needs no ``global`` declaration.
    lookup = APIS if apis is None else apis
    return item.map(lambda x: lookup[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).

    Every element is passed through ``str()`` and the pieces are joined with
    a single comma (no trailing comma). An empty list yields an empty string.
    '''
    # str.join avoids the repeated string re-allocation of ``+=`` in a loop
    # and removes the need to slice off a trailing comma afterwards.
    return ",".join(str(item) for item in ls)

def inject_patterns(inner_df:pd.DataFrame):
    '''Add a ``pattern`` column holding each sample's API call sequence.

    For every row, the values in positional columns 1 through 100 are turned
    into one comma-delimited string via ``list_to_str``. ``inner_df`` is
    modified in place and is also returned.
    '''
    print("Injecting API patterns...")
    row_count = inner_df.shape[0]
    inner_df['pattern'] = [
        list_to_str(inner_df.iloc[idx, 1:101].to_list())
        for idx in range(row_count)
    ]
    # Original note: DBSCAN requires only the numeric label encoded version of the API Calls
    return inner_df

def ib_convert(input_df:pd.DataFrame):
    '''De-duplicate the API calls of every sample, keeping first occurrences.

    For each row, the values in positional columns 1 through 100 are reduced
    to their first occurrences (order preserved) and right-padded with the
    filler value so that 100 entries are written back. ``input_df`` is
    modified in place and is also returned.

    NOTE(review): the write-back assumes the 1:101 slice spans exactly
    100 columns; narrower frames will fail on assignment.
    '''
    width = 100
    # Presumably the label-encoded index of the "NaN" delimiter in the API
    # list -- TODO confirm against api_calls.txt.
    pad_value = 307

    # The frame used to be transposed here and again at the end, but the
    # results were discarded, so the calls only cost a full copy each.
    # The progress messages are kept so the printed log stays the same.
    print("Transposing IB...")
    print("IB Transposed!")
    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        unique_calls = input_df.iloc[r, 1:101].drop_duplicates(keep='first').to_list()
        input_df.iloc[r, 1:101] = unique_calls + [pad_value] * (width - len(unique_calls))
        if r % 100 == 0:
            # Progress indicator every 100 rows.
            print(r, end=" ")
    print("\nDuplicates removed!")
    print("Retransposing IB (revert)...")
    print("IB Retransposed!")
    return input_df

# Keep only usable malicious samples: drop benign rows (malware == 0) and
# rows whose type is the placeholder '_' (falsely labelled as malicious).
is_malicious = df['malware'] != 0
has_known_type = df['type'] != '_'
df = df[is_malicious & has_known_type]

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

# Detach the non-feature columns so they can be re-attached in a fixed order.
type_col = df.pop('type')
hash_col = df.pop('hash')
label_col = df.pop('malware')

# Final layout: label first, remaining columns in the middle, then hash and
# type last (both retained for the benefit of model evaluation).
df = pd.concat([label_col, df, hash_col, type_col], axis=1)

df

In [None]:
# Convert malware types to their numeric equivalents; the code of a type is
# its position in this list.
malware_types = ['trojan', 'downloader', 'pua', 'adware', 'ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm', 'benign']

print("Conversion of Malware Type to its Ordinal Encoded Form:")
for code, name in enumerate(malware_types):
    print(f"{name:10s} = {code}")

# Positional column 102 is read as the malware type string -- assumes the
# layout label + 100 features + hash + type; confirm if the dataset changes.
# list.index raises ValueError for a type that is not in malware_types.
malware_types_int = [malware_types.index(label) for label in df.iloc[:, 102]]
df.insert(103, "type_int", malware_types_int, True)
df

## 2. Applying PCA

### 2.1 Applying StandardScaler

In [None]:
# Standardization: fit the scaler on positional columns 1..100, then apply it
# to the same columns.
sc = StandardScaler()
sc.fit(df.iloc[:,1:101])
segmentation_std = sc.transform(df.iloc[:,1:101])

# PCA without n_components, so the full variance profile can be inspected.
pca = PCA(random_state=1)
pca.fit(segmentation_std)

### 2.2. Visualizing Ideal `n_components` value

In [None]:
plt.clf()
plt.figure(figsize=(10,6), dpi=300)
# Cumulative share of variance explained by the first k components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
# The first cumulative value belongs to ONE component, so the x-axis starts at
# 1 (a 0-based axis makes every reading off by one). The length is taken from
# the fitted PCA instead of being hard-coded to 100.
component_counts = range(1, len(cumulative_variance) + 1)
plt.plot(component_counts, cumulative_variance, marker = 'o', linestyle='--')
plt.title('Explained Variance by Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.xticks(np.arange(0, len(cumulative_variance) + 1, 5))
plt.yticks(np.arange(0, 1.05, .05))
plt.show()

Dr. Mac suggested values of 2, 3, 4, and 16, with a target PCA variance of 0.95, while other online sources suggest that 0.80 is good enough. The cumulative explained variance only reaches >=0.80 at **32 components** and >=0.95 at **69 components**, which is still a lot in the grand scheme of things.

Results (at 1e10):
- 2 = ~0.2500
- 3 = ~0.3250
- 4 = ~0.3750
- 16 = ~0.6500
- 32 = ~0.8000
- 42 = ~0.8500

The reason the ideal value should be as low as possible is that the analysis grows quickly with `n_components`: every pair of components has to be inspected, which is n(n-1)/2 pairs (quadratic growth), as illustrated by the sample code below.

```
n_components = 4
ctr = 0
for i in range(n_components):
    for j in range(i+1,n_components):
        print(ctr, i, j)
        ctr += 1
```

If it takes a large number of components to reach 0.80, PCA may not be suitable for the dataset. An alternative is to try Kernel PCA, albeit in a limited capacity, given how expensive it is in both computation and memory.

- https://www.quora.com/How-do-I-check-if-my-dataset-is-suitable-for-PCA-Principal-component-analysis-in-ML-PCA-requires-data-to-be-linear-but-if-my-dataset-has-many-columns-how-do-I-check-that-it-is-linear-especially-when-my-output=
- https://medium.com/@khwabkalra1/unleashing-the-power-of-kernel-pca-bce7f4d2923d

### 2.3. Applying `n_components` to PCA

In [None]:
# Number of principal components kept for clustering and plotting.
N_COMPONENTS = 3

# fit() returns the estimator itself, so construction and fitting can be chained.
pca = PCA(n_components=N_COMPONENTS, random_state=1).fit(segmentation_std)

# Project the standardized features onto the retained components.
scores_pca = pca.transform(segmentation_std)

### 2.4. Applying PCA results to K-Means

In [None]:
wcss = [] # Within-Cluster Sum of Squares (WCSS), a.k.a. inertia
silhouette = []
# Fit one K-Means model per candidate cluster count (1..50) and record its inertia.
for n_clusters in range(1,51):
    kmeans_pca = KMeans(n_clusters=n_clusters, random_state=1).fit(scores_pca)
    wcss.append(kmeans_pca.inertia_)
    #silhouette.append(silhouette)

# Elbow plot: WCSS against the number of clusters.
plt.clf()
plt.figure(figsize=(10,5), dpi=300)
plt.plot(range(1,51), wcss, marker='o', linestyle='--')
plt.title("K-Means with PCA Clustering")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.xticks(np.arange(0, 55, 5))
plt.show()

In [None]:
# Print the change in WCSS between consecutive entries of ``wcss``.
# NOTE(review): the printed labels are 0-based list positions, not cluster counts.
for idx, (current, following) in enumerate(zip(wcss, wcss[1:])):
    print(f"{idx:2d}-{idx+1:2d}: {current:.4f}-{following:.4f}, Delta: {following-current:.4f}")
# NOTE(review): this is the negated mean of the WCSS values themselves, not the
# mean of the deltas printed above -- confirm which one the heuristic intends.
print("Average:", -(sum(wcss) / len(wcss))) # Look for the last value that is lower than the ave.

In [None]:
def kmeans_test(CLUSTERS:int, df):
    '''Fit K-Means on ``df`` and print two clustering quality scores.

    CLUSTERS: number of clusters passed to KMeans.
    df: data to cluster (here, the PCA scores).

    Prints the silhouette score and the Davies-Bouldin score of the fitted
    labels, then returns the fitted KMeans estimator.
    '''
    kmeans = KMeans(n_clusters=CLUSTERS, random_state=1).fit(df)
    assigned = kmeans.labels_
    # Both scores are computed before anything is printed.
    sil_score = silhouette_score(df, assigned, random_state=1)
    db_score = davies_bouldin_score(df, assigned)
    print(f"Silhouette Score: {sil_score:.4f}")
    print(f"Davies Bouldin: {db_score:.4f}\n")
    return kmeans

# As per PCA WCSS: the cluster count is defined once and reused for the fit.
CLUSTERS = 7

k7 = kmeans_test(CLUSTERS, scores_pca)

### 2.5. Aggregating PCA Dataframes

In [None]:
# Column names for the PCA scores (comp_1..comp_N) and display names for the
# K-Means clusters (c_0..c_{CLUSTERS-1}).
component_list = [f"comp_{i}" for i in range(1, N_COMPONENTS + 1)]
cluster_id = [f"c_{i}" for i in range(CLUSTERS)]

# Side-by-side frame: original feature columns, PCA scores, and the malware
# type in string and ordinal form. Indexes are reset so rows align by position
# with the score matrix.
pca_kmeans = pd.concat(
    [
        df.iloc[:,1:101].reset_index(drop=True),
        pd.DataFrame(scores_pca, columns=component_list),
        df["type"].reset_index(drop=True),
        df["type_int"].reset_index(drop=True),
    ],
    axis=1,
)

# Translate numeric K-Means labels into names. The mapping is derived from
# cluster_id, so it covers any CLUSTERS value instead of a fixed 20 entries.
# Labels with no entry (k7 fitted with more clusters than CLUSTERS) become NaN
# and are reported as 'unseg' below.
pca_kmeans['pca_segment'] = k7.labels_
pca_kmeans['pca_segment'] = pca_kmeans['pca_segment'].map(dict(enumerate(cluster_id)))
# Assign the result back: fillna(inplace=True) on a column selection is chained
# assignment and does not update the frame under pandas Copy-on-Write.
pca_kmeans['pca_segment'] = pca_kmeans['pca_segment'].fillna('unseg')
pca_kmeans = pca_kmeans.copy(deep=True)
display(pca_kmeans.iloc[:,100:100+68])
display(pca_kmeans)

### 2.6. Verifying results of PCA Kmeans Clustering

In [None]:
# Count the rows that did not receive a named segment.
is_unsegmented = pca_kmeans['pca_segment'] == 'unseg'
unsegmented = pca_kmeans[is_unsegmented].shape[0]
unsegmented_pct = unsegmented / df.shape[0] * 100
print("# of Unsegmented Samples:", unsegmented, f"({unsegmented_pct:.4f}%)", "\n")

# Per-cluster breakdown of malware types.
for c in cluster_id:
    in_cluster = pca_kmeans['pca_segment'] == c
    print(c)
    print(pca_kmeans.loc[in_cluster, 'type'].value_counts())
    print("")

### 2.7. Visualizing PCA Scatter

In [None]:
def pca_scatter(x:str, y:str, color_scheme:pd.Series, control_df:pd.DataFrame, filename:str, show:bool=True):
    '''Draw a 2-D scatter of two columns, save it as a PNG, and optionally show it.

    x, y: names of the columns of ``control_df`` plotted on each axis.
    color_scheme: values used to colour the points.
    control_df: frame that holds the ``x`` and ``y`` columns.
    filename: prefix of the image name; the file is written to
              "./PCA Visualization/PCA Malicious/" as "<filename>_<x>+<y>_Malware.png".
    show: when True, the figure is also displayed after saving.
    '''
    from pathlib import Path

    fig = px.scatter(x=control_df[x], y=control_df[y], color=color_scheme,opacity=0.8)
    fig.update_layout(
        title=f"PCA Scatter Plot: {x} & {y}",
        xaxis_title=x,
        yaxis_title=y,
        width=800,
        height=600
    )
    # Create the output directory first so saving does not fail on a fresh checkout.
    out_dir = Path("./PCA Visualization/PCA Malicious/")
    out_dir.mkdir(parents=True, exist_ok=True)
    fig.write_image(str(out_dir / f"{filename}_{x}+{y}_Malware.png"))
    if show:
        fig.show()

In [None]:
# View by PCA Kmeans Segment
from itertools import combinations

# Plot every unordered pair of components, i.e. n(n-1)/2 figures. For three
# components this gives the same pairs in the same order as the previous
# hand-written calls, and it no longer fails when fewer than three are kept.
for comp_x, comp_y in combinations(component_list, 2):
    pca_scatter(comp_x, comp_y, pca_kmeans['pca_segment'], pca_kmeans, "PCA_Segment")

In [None]:
from itertools import combinations

# Convert certain malware types to others to simplify viewing
# pca_kmeans.replace("hacktool","others",inplace=True)
# pca_kmeans.replace("miner","others",inplace=True)
# pca_kmeans.replace("virus","others",inplace=True)
# pca_kmeans.replace("spyware","others",inplace=True)
# pca_kmeans.replace("ransomware","others",inplace=True)
# pca_kmeans.replace("dropper","others",inplace=True)
# pca_kmeans.replace("worm","others",inplace=True)

# View by type: plot every unordered pair of components, i.e. n(n-1)/2
# figures. For three components this gives the same pairs in the same order as
# the previous hand-written calls, and it no longer fails when fewer than
# three are kept.
for comp_x, comp_y in combinations(component_list, 2):
    pca_scatter(comp_x, comp_y, pca_kmeans['type'], pca_kmeans, "PCA_Type")