# Raw Scatter Plot

## 1. Import Data

In [None]:
# Import Libraries
import pandas as pd
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px

sns.set()
warnings.filterwarnings("ignore")

# Labelled dataset; later cells read its 'type', 'hash' and 'malware' columns.
df = pd.read_csv('../Dataset/oliveira_labelled.csv')

API_LIST = "../Dataset/api_calls.txt"
DELIMITER = "NaN"

# Read the comma-separated API names from the first line of the file. The
# context manager closes the handle even if reading raises.
# NOTE(review): the last entry keeps any trailing newline present in the
# file -- confirm whether it should be stripped.
with open(API_LIST, "r") as API_FILE:
    APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER)  # serves as a label for NaN values for Instance-based datasets

#Inverse Label Encoding
def inverse_label(item: pd.Series) -> pd.Series:
    '''Map label-encoded values back to their entries in the module-level APIS list.

    item: object exposing an element-wise ``.map`` (e.g. a pandas Series);
          every element must be convertible with ``int()`` and be a valid
          index into APIS.
    Returns the result of ``item.map`` with each value replaced by
    ``APIS[int(value)]``.
    Raises IndexError / ValueError / TypeError from the lookup when an element
    is out of range or not convertible to int.
    '''
    # APIS is only read here, so no ``global`` declaration is needed.
    return item.map(lambda x: APIS[int(x)])

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).

    Every element is converted with ``str()`` and the pieces are joined with
    ",". An empty list yields an empty string.
    '''
    # str.join avoids the quadratic ``+=`` build-up and the trailing-comma slice.
    return ",".join(str(element) for element in ls)

def inject_patterns(inner_df:pd.DataFrame):
    '''Append a 'pattern' column holding each row's API call sequence as text.

    For every row, the values at column positions 1..100 are turned into one
    comma-delimited string via list_to_str. The frame is modified in place
    and also returned.
    '''
    print("Injecting API patterns...")
    row_count = inner_df.shape[0]
    inner_df['pattern'] = [
        list_to_str(inner_df.iloc[idx, 1:101].to_list())
        for idx in range(row_count)
    ]
    # DBSCAN requires only the numeric label encoded version of the API Calls
    return inner_df

def ib_convert(input_df:pd.DataFrame):
    '''De-duplicate the API calls of every row, keeping first occurrences.

    For each row, the values at column positions 1..100 (fewer if the frame
    is narrower) are reduced to their first occurrences in original order,
    then right-padded with the filler label so the row keeps its width.
    Rows are processed directly, so no transpose of the frame is needed.

    input_df: frame whose column 0 is left untouched and whose following
              columns hold the label-encoded calls.
    Returns the same frame object, modified in place.
    '''
    # Filler written into the slots freed by removed duplicates.
    # NOTE(review): presumably the index of the "NaN" delimiter appended to
    # APIS -- confirm against the API list length.
    PAD_LABEL = 307

    print("Removing duplicates...")
    print("Row:", end=" ")
    for r in range(input_df.shape[0]):
        window = input_df.iloc[r, 1:101]
        unique_calls = window.drop_duplicates(keep='first').to_list()
        # Pad to the actual window width rather than a fixed 100 so frames
        # with fewer feature columns do not fail on assignment.
        padding = [PAD_LABEL] * (len(window) - len(unique_calls))
        input_df.iloc[r, 1:101] = unique_calls + padding
        if r % 100 == 0:
            print(r, end=" ")  # progress marker every 100 rows
    print("\nDuplicates removed!")
    return input_df

# Optional filter: drop benign samples
# df = df[df['type'] != 'benign']

# Drop the falsely labelled malicious samples (type recorded as '_')
df = df[df['type'] != '_']

# Optional filter: drop specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     df = df[df['type'] != r]

# Detach the type, hash and label columns so they can be repositioned
type_col = df.pop('type')
hash_col = df.pop('hash')
label_col = df.pop('malware')

# Reassemble: label first, then the remaining columns, then hash and type.
# hash and type are retained for the benefit of model evaluation.
df = pd.concat([label_col, df, hash_col, type_col], axis=1)

df

In [None]:
#Convert malware types to its numeric equivalents
malware_types = ['trojan', 'downloader', 'pua', 'adware', 'ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm', 'benign']

print("Conversion of Malware Type to its Ordinal Encoded Form:")
for code, name in enumerate(malware_types):
    print(f"{name:10s} = {code}")

# Encode each sample's type as its position in malware_types. The column is
# addressed by name instead of a hard-coded position, so the encoding does
# not silently read the wrong column if the number of feature columns changes.
# A type missing from malware_types raises KeyError.
type_codes = {name: code for code, name in enumerate(malware_types)}
malware_types_int = [type_codes[sample_type] for sample_type in df['type']]

# Place the encoded column immediately after 'type'.
df.insert(df.columns.get_loc('type') + 1, "type_int", malware_types_int, True)
df

## 2. Preview of Dataset via Scatter Plot

Compares each pair of adjacent features (e.g., t_0 and t_1).

In [None]:
def scatter_plot(x_label:str, y_label:str, filename:str, df:pd.DataFrame):
    '''Save a scatter plot of two columns of df as a PNG image.

    x_label, y_label: names of the columns plotted on the x and y axes.
    filename: prefix of the output file; "_<x_label>+<y_label>.png" is appended.
    df: frame supplying both columns plus the 'type' column used for colouring.
    '''
    figure = px.scatter(
        x=df[x_label],
        y=df[y_label],
        color=df['type'],
        opacity=0.8,
    )
    figure.update_layout(
        title=f"Visualization of Raw Data: {x_label} & {y_label}",
        xaxis_title=x_label,
        yaxis_title=y_label,
    )
    output_path = "".join((filename, "_", x_label, "+", y_label, ".png"))
    figure.write_image(output_path)
    # figure.show()  # enable to display the figure inline instead of only saving it

In [None]:
def render_scatter(df, cols, title:str, path:str):
    '''Render scatter plots for consecutive column pairs of df.

    Two series of images are written through scatter_plot:
      * one plot per adjacent pair (cols[i], cols[i+1]);
      * a sparser "_10s" series covering every tenth adjacent pair.

    df: frame to plot.
    cols: ordered list of column names of df.
    title: base file name of the images.
    path: directory prefix prepended to title.
    '''
    prefix = path + title
    for x_label, y_label in zip(cols, cols[1:]):
        scatter_plot(x_label, y_label, prefix, df)
    # Plot the frame passed in, not the notebook-global control_df, so the
    # "_10s" series always matches the data of the first series.
    for i in range(0, len(cols)-1, 10):
        scatter_plot(cols[i], cols[i+1], prefix+"_10s", df)

In [None]:
# Malware-only subset (rows where malware == 1)
control_df = df.loc[df['malware'] == 1]
cols = control_df.columns[1:101].to_list()  # columns at positions 1..100
render_scatter(control_df, cols, "Scatter_Malware", "./Malicious/")

In [None]:
# Benign-only subset (rows where malware == 0)
control_df = df.loc[df['malware'] == 0]
cols = control_df.columns[1:101].to_list()  # columns at positions 1..100
render_scatter(control_df, cols, "Scatter_Benign", "./Benign/")

In [None]:
# Whole dataset, malware and benign together
control_df = df
cols = control_df.columns[1:101].to_list()  # columns at positions 1..100
render_scatter(control_df, cols, "Scatter_All", "./All/")