# **Cortus Malware Analyzer**
This section of the notebook contains the data importing and cleaning of the features extracted from memory.

For the purpose of runtime, not all GUI displays are enabled, if you wish to see them uncomment the provided lines.

**Import libraries needed for model**

In [2]:
import ast
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
import sklearn
import seaborn as sns

from random import shuffle
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score, accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, LabelBinarizer, MultiLabelBinarizer, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import svm

np.set_printoptions(threshold=sys.maxsize)

In [3]:
# Cleaning the file
# Drop useless index column and replace nans with 0
# May have to chunk it due to memory limitations
conversions = {'relocationContentFull': ast.literal_eval, 'stringContentFull': ast.literal_eval, 
               'sectionContentFull': ast.literal_eval, 'importNameContentFull': ast.literal_eval, 'importLibContentFull': ast.literal_eval }

finalFrame = pd.read_csv(dataset, sep=',', low_memory=False, index_col=[0] )
finalFrame = finalFrame.dropna(axis=1, how='all')
finalFrame = finalFrame.fillna(0)

In [4]:
finalFrame

Unnamed: 0,processName,processType,arch,binsz,bits,canary,retguard,crypto,endian,flags,...,.didat_7_perms,C:\Windows\System32\windowscodecs.dll_perms,.idata_9_perms,C:\Windows\System32\mshtml.dll_perms,.idata_10_perms,.didat_8_perms,C:\Windows\System32\rtm.dll_size,C:\Windows\System32\rtm.dll_perms,C:\Windows\CCTV.exe_size,C:\Windows\CCTV.exe_perms
0,soffice.exe_220223_021039.dmp_benign,benign,x86,29220056,64,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
1,explorer.exe_220123_232523.dmp_benign,benign,x86,300397859,64,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
2,soffice.exe_220223_020953.dmp_benign,benign,x86,29220056,64,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
3,Powder32.exe_220409_234758.dmp_benign,benign,x86,101173938,32,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
4,vlc.exe_220409_234158.dmp_benign,benign,x86,102608960,32,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,conlhost.exe_220309_021439.dmp_malicious,malicious,x86,51458484,32,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
607,01828be39e9c87bcfe2f59374c1dc5e9fb963bffcf3bf5...,malicious,x86,125974158,32,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
608,cmd.exe_220215_002931.dmp_malicious,malicious,x86,20335580,32,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0
609,reg.exe_220220_013323.dmp_malicious,malicious,x86,20375836,32,False,False,False,little,0x00061826,...,0,0,0,0,0,0,0.0,0,0.0,0


**Data Cleaning and Analysis**
1.   Data filtering
2.   Data transformation
3.   Feature visualisation and Analysis
4.   Final data creation

In [5]:
# Find counts of level of permissions
finalFrame[finalFrame.filter(regex='_perms').columns] = finalFrame[finalFrame.filter(regex='_perms').columns].apply(lambda col:(pd.Categorical(col).codes))
finalFrame = pd.concat([finalFrame, pd.DataFrame(finalFrame[finalFrame.filter(regex='_perms').columns].stack().groupby(level=0).value_counts().unstack(fill_value=0).add_prefix("permissionCount_"))], axis=1)
finalFrame = finalFrame.drop(finalFrame.filter(regex='_perms').columns, axis=1)

# Grab count of interesting memory sections per process dump
finalFrame = finalFrame.drop(finalFrame.filter(regex='Memory_Section').columns, axis=1)
dataUniqueMemorySectionCount = finalFrame[finalFrame.filter(regex='_size').columns].gt(0).sum(axis=1)
finalFrame['uniqueMemorySectionCount'] = dataUniqueMemorySectionCount
finalFrame = finalFrame.drop(finalFrame.filter(regex='_size').columns, axis=1)

# Clean up string data into categorical data
finalFrame['processType'] = pd.Categorical(finalFrame['processType']).codes
finalFrame['arch'] = pd.Categorical(finalFrame['arch']).codes
finalFrame['bits'] = pd.Categorical(finalFrame['bits']).codes
finalFrame['canary'] = pd.Categorical(finalFrame['canary']).codes
finalFrame['retguard'] = pd.Categorical(finalFrame['retguard']).codes
finalFrame['crypto'] = pd.Categorical(finalFrame['crypto']).codes
finalFrame['endian'] = pd.Categorical(finalFrame['endian']).codes
finalFrame['flags'] = pd.Categorical(finalFrame['flags']).codes
finalFrame['havecode'] = pd.Categorical(finalFrame['havecode']).codes
finalFrame['machine'] = pd.Categorical(finalFrame['machine']).codes
finalFrame['static'] = pd.Categorical(finalFrame['static']).codes

finalFrame_true_labels = finalFrame[['processType', 'processName']]
finalFrame = finalFrame.drop(['processType', 'processName'], 1)



In [6]:
finalFrame

Unnamed: 0,arch,binsz,bits,canary,retguard,crypto,endian,flags,havecode,hdr.csum,...,ebp,eip,eflags,permissionCount_0,permissionCount_1,permissionCount_2,permissionCount_3,permissionCount_4,permissionCount_5,uniqueMemorySectionCount
0,0,29220056,1,0,0,0,0,0,0,0x00000000,...,0.0,0.000000e+00,0.0,2969,250,28,27,7,0,195
1,0,300397859,1,0,0,0,0,0,0,0x00000000,...,0.0,0.000000e+00,0.0,550,1515,533,287,395,1,1339
2,0,29220056,1,0,0,0,0,0,0,0x00000000,...,0.0,0.000000e+00,0.0,2969,250,28,27,7,0,195
3,0,101173938,0,0,0,0,0,0,0,0x00000000,...,0.0,1.916213e+07,0.0,2753,281,104,90,53,0,242
4,0,102608960,0,0,0,0,0,0,0,0x00000000,...,0.0,1.717574e+07,0.0,2395,359,227,148,152,0,291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,51458484,0,0,0,0,0,0,0,0x00000000,...,0.0,1.077568e+07,0.0,2565,333,150,146,87,0,309
607,0,125974158,0,0,0,0,0,0,0,0x00000000,...,0.0,4.202496e+06,0.0,2433,360,155,152,165,16,281
608,0,20335580,0,0,0,0,0,0,0,0x00000000,...,0.0,1.248887e+09,0.0,3022,120,46,63,30,0,93
609,0,20375836,0,0,0,0,0,0,0,0x00000000,...,0.0,4.725706e+06,0.0,3019,130,46,56,30,0,98


LSH

Due to the importance of Strings, Imports and Relocations, we use LSH to define buckets that each process can be assigned to as a feature

In [7]:
# Convert to the equivalent of our "Shingles" (We can use the full words except for strings)
finalFrame['relocationContentFull'] = finalFrame['relocationContentFull'].apply(ast.literal_eval)
finalFrame['importNameContentFull'] = finalFrame['importNameContentFull'].apply(ast.literal_eval)
finalFrame['importLibContentFull'] = finalFrame['importLibContentFull'].apply(ast.literal_eval)

# finalFrame['stringContentFull'] = finalFrame['stringContentFull'].apply(ast.literal_eval)
# finalFrame['sectionContentFull'] = finalFrame['sectionContentFull'].apply(ast.literal_eval)

In [8]:
relocationVocab = set().union(*finalFrame['relocationContentFull'])
importNameVocab = set().union(*finalFrame['importNameContentFull'])
importLibVocab = set().union(*finalFrame['importLibContentFull'])

# sectionVocab = set().union(*finalFrame['sectionContentFull'])
# stringVocab = set().union(*finalFrame['stringContentFull'])

In [9]:
relocList = []
for index, value in finalFrame['relocationContentFull'].items() :
  valueList = [1 if x in value else 0 for x in relocationVocab]
  relocList.append(valueList)

importNameList = []
for index, value in finalFrame['importNameContentFull'].items() :
  valueList = [1 if x in value else 0 for x in importNameVocab]
  importNameList.append(valueList)

importLibList = []
for index, value in finalFrame['importLibContentFull'].items() :
  valueList = [1 if x in value else 0 for x in importLibVocab]
  importLibList.append(valueList)

In [10]:
finalFrame['relocationContentEncoding'] = relocList
finalFrame['importNameContentEncoding'] = importNameList
finalFrame['importLibContentEncoding'] = importLibList

In [11]:
def create_hash_func(size):
    # function for creating the hash vector/function
    hash_ex = list(range(1, size+1))
    shuffle(hash_ex)
    return hash_ex

def build_minhash_func(vocab_size, nbits):
    # function for building multiple minhash vectors
    hashes = []
    for _ in range(nbits):
        hashes.append(create_hash_func(vocab_size))
    return hashes

def create_hash(vocab, vector, minhash_func):
    # use this function for creating our signatures (eg the matching)
    signature = []
    for func in minhash_func:
        for i in range(1, len(vocab)+1):
            idx = func.index(i)
            print(vector)
            signature_val = vector[idx]
            if signature_val == 1:
                signature.append(idx)
                break
    return signature

In [12]:
reloc_minhash = build_minhash_func(len(relocationVocab), 20)
importName_minhash = build_minhash_func(len(importNameVocab), 20)
importLib_minhash = build_minhash_func(len(importLibVocab), 20)

In [None]:
finalFrame['relocationHash'] = finalFrame['relocationContentEncoding'].apply(lambda x: create_hash(relocationVocab, x, reloc_minhash))
finalFrame['importNameHash'] = finalFrame['importNameContentEncoding'].apply(lambda x: create_hash(importNameVocab, x, importName_minhash))
finalFrame['importLibHash'] = finalFrame['importLibContentEncoding'].apply(lambda x: create_hash(importLibVocab, x, importLib_minhash))

In [14]:
finalFrame = finalFrame.drop(['relocationContentEncoding', 'importNameContentEncoding', 'importLibContentEncoding'], 1)
finalFrame = finalFrame.drop(['relocationContentFull', 'importNameContentFull', 'importLibContentFull'], 1)

  """Entry point for launching an IPython kernel.
  


In [25]:
finalFrame

Unnamed: 0,arch,binsz,bits,canary,retguard,crypto,endian,flags,havecode,hdr.csum,...,10,11,12,13,14,15,16,17,18,19
0,0,29220056,1,0,0,0,0,0,0,0x00000000,...,73,249,91,56,56,217,173,127,299,119
1,0,300397859,1,0,0,0,0,0,0,0x00000000,...,73,249,156,56,5,217,173,127,313,347
2,0,29220056,1,0,0,0,0,0,0,0x00000000,...,73,249,91,56,56,217,173,127,299,119
3,0,101173938,0,0,0,0,0,0,0,0x00000000,...,73,249,195,56,56,217,173,127,96,347
4,0,102608960,0,0,0,0,0,0,0,0x00000000,...,73,249,340,56,257,217,173,269,96,347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,51458484,0,0,0,0,0,0,0,0x00000000,...,73,249,156,56,257,217,173,127,96,347
607,0,125974158,0,0,0,0,0,0,0,0x00000000,...,73,249,156,56,56,217,173,127,96,347
608,0,20335580,0,0,0,0,0,0,0,0x00000000,...,73,309,156,116,311,309,96,338,96,119
609,0,20375836,0,0,0,0,0,0,0,0x00000000,...,73,309,156,116,311,140,96,338,96,119


In [16]:
# new df from the column of lists
splitRelocs = pd.DataFrame(finalFrame['relocationHash'].tolist())
splitImportNames = pd.DataFrame(finalFrame['importNameHash'].tolist())
splitImportLibs = pd.DataFrame(finalFrame['importLibHash'].tolist())
# concat df and split_df
finalFrame = pd.concat([finalFrame, splitRelocs], axis=1)
finalFrame = pd.concat([finalFrame, splitImportNames], axis=1)
finalFrame = pd.concat([finalFrame, splitImportLibs], axis=1)
# display df
finalFrame

Unnamed: 0,arch,binsz,bits,canary,retguard,crypto,endian,flags,havecode,hdr.csum,...,10,11,12,13,14,15,16,17,18,19
0,0,29220056,1,0,0,0,0,0,0,0x00000000,...,73,249,91,56,56,217,173,127,299,119
1,0,300397859,1,0,0,0,0,0,0,0x00000000,...,73,249,156,56,5,217,173,127,313,347
2,0,29220056,1,0,0,0,0,0,0,0x00000000,...,73,249,91,56,56,217,173,127,299,119
3,0,101173938,0,0,0,0,0,0,0,0x00000000,...,73,249,195,56,56,217,173,127,96,347
4,0,102608960,0,0,0,0,0,0,0,0x00000000,...,73,249,340,56,257,217,173,269,96,347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,51458484,0,0,0,0,0,0,0,0x00000000,...,73,249,156,56,257,217,173,127,96,347
607,0,125974158,0,0,0,0,0,0,0,0x00000000,...,73,249,156,56,56,217,173,127,96,347
608,0,20335580,0,0,0,0,0,0,0,0x00000000,...,73,309,156,116,311,309,96,338,96,119
609,0,20375836,0,0,0,0,0,0,0,0x00000000,...,73,309,156,116,311,140,96,338,96,119


In [19]:
finalFrameV2 = finalFrame[finalFrame.T[finalFrame.dtypes!=np.object].index]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


In [26]:
finalFrameV2

Unnamed: 0,arch,binsz,bits,canary,retguard,crypto,endian,flags,havecode,laddr,...,16,17,17.1,17.2,18,18.1,18.2,19,19.1,19.2
0,0,29220056,1,0,0,0,0,0,0,0,...,173,9937,2623,127,8373,5292,299,2272,7798,119
1,0,300397859,1,0,0,0,0,0,0,0,...,173,9937,4458,127,1618,5292,313,6617,6945,347
2,0,29220056,1,0,0,0,0,0,0,0,...,173,9937,2623,127,8373,5292,299,2272,7798,119
3,0,101173938,0,0,0,0,0,0,0,0,...,173,9937,9942,127,8373,5292,96,6617,1860,347
4,0,102608960,0,0,0,0,0,0,0,0,...,173,9937,5429,269,8417,5292,96,6617,7618,347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0,51458484,0,0,0,0,0,0,0,0,...,173,9937,4458,127,8745,5292,96,6617,1860,347
607,0,125974158,0,0,0,0,0,0,0,0,...,173,9937,2623,127,8373,5292,96,6617,1860,347
608,0,20335580,0,0,0,0,0,0,0,0,...,96,9937,9942,338,8373,5292,96,2272,7587,119
609,0,20375836,0,0,0,0,0,0,0,0,...,96,9937,9942,338,8373,5292,96,7584,7587,119


In [None]:
relocList

**Model Creation and Development**
1.   Functions for creation of ML Model
2.   Model Training
3.   Model Testing
4.   Model visulisation and Analysis

In [None]:
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

#-------------------------------------------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(finalFrameV2, finalFrame_true_labels['processType'], test_size=0.2) # 70% training and 30% test

preprocessor = Pipeline(
    [
        ("scaler", MinMaxScaler()),
        ("pca", PCA(n_components=6, random_state=42)),
    ]
)

classifier = Pipeline(
   [
       (
           "SVC",
           svm.SVC(kernel='rbf')
       ),
   ]
)

clusterer = Pipeline(
    [
      (
           "KNN",
           KMeans(n_clusters=2)
      )
    ]
)

pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("clusterer", classifier)
    ]
)


In [None]:

label_encoder = LabelEncoder()
true_labels = label_encoder.fit_transform(finalFrame_true_labels['processType'])
clf = pipe.fit(X_train, y_train)

preprocessed_data = pipe["preprocessor"].transform(X_test)
predicted_labels = pipe["clusterer"]["SVC"].predict(preprocessed_data)

print("Accuracy:", accuracy_score(y_test, predicted_labels))

fig, ax = plt.subplots()
# title for the plots
title = ('Decision surface of RBF SVC ')
# Set-up grid for plotting.
X0, X1 = preprocessed_data[:, 0], preprocessed_data[:, 1]
xx, yy = make_meshgrid(X0, X1)

plot_contours(ax, pipe["clusterer"]["SVC"], xx, yy, cmap=plt.cm.coolwarm, alpha=0.8)
ax.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
ax.set_ylabel('PCA Component 1')
ax.set_xlabel('PCA Component 2')
ax.set_xticks(())
ax.set_yticks(())
ax.set_title(title)
ax.legend()
plt.show()