# Exploring SMOTE

The aim of this notebook is to explore SMOTE in the "Oliveira" Dataset.

For this example, only Time-based Behavior will be processed.

In [1]:
#Python Libraries
import time
import math

#Data/Dataset libraries
import pandas as pd
import numpy as np

#Split Sampler/Data Splitting
from sklearn.model_selection import train_test_split

#Oversampler
from imblearn.over_sampling import RandomOverSampler, SMOTEN

#Label Encoding
from sklearn.preprocessing import LabelEncoder

#Filenames
filename = ["oliveira_lite.csv", "oliveira.csv"]
DATASET_FILENAME = filename[1] #Change accordingly;
API_LIST = "api_calls.txt"

#APIs List
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append("NaN") #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Timer
start = end = 0
def start_time():
    global start
    start = time.time()
def end_time():
    global start
    elapse = time.time()-start
    start = 0
    print(f"Elapsed Time: {round(elapse, 6)}s\n")
    return round(elapse, 6)

# Loading Dataset File

In [2]:
start_time()
oli = pd.read_csv(DATASET_FILENAME, low_memory=False, memory_map=True)
end_time()

#Dataset Information
print("Dataset Information:")
print(oli.info())
print("\nDataset Shape:", oli.shape)

#Dataset Labels:
print('\nDataset Labels:\n',oli['malware'].value_counts())

oli.head()

Elapsed Time: 0.729802s

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43876 entries, 0 to 43875
Columns: 102 entries, hash to malware
dtypes: int64(101), object(1)
memory usage: 34.1+ MB
None

Dataset Shape: (43876, 102)

Dataset Labels:
 malware
1    42797
0     1079
Name: count, dtype: int64


Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,malware
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,1
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,1
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,1
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,1
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,1


# Dataset Preparation

In [3]:
start_time()
#Dataset Cleaning: Nothing much here as the dataset is already clean enough for our purposes.
#Dataset Formatting: Changing order of columns to malware (label), hash, t_1, t_2, ..., t_99
label_col = oli.pop('malware')
oli.insert(0, label_col.name, label_col)
label_col = None
#Inverse Label Encoding
def inverse_label(item):
    global APIS
    return item.map(lambda x: APIS[int(x)])
oli.iloc[:, 2:] = oli.iloc[:, 2:].apply(inverse_label, axis=1, result_type='reduce')
print("")
end_time()
    
oli.head()


Elapsed Time: 11.355398s



Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
0,1,071e8c3f8922e186e57548cd4c703a5d,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,...,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW
1,1,33f8e6d08a6aae939f25a8e0d63dd523,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,...,FindResourceExW,LoadResource,LdrLoadDll,LdrGetProcedureAddress,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics
2,1,b68abd064e975e1c6d5f25e748663076,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExA,RegCloseKey,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA
3,1,72049be7bd30ea61297ea624ae198067,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,...,NtClose,NtAllocateVirtualMemory,NtWriteVirtualMemory,NtAllocateVirtualMemory,NtWriteVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,NtWriteVirtualMemory,NtProtectVirtualMemory,NtWriteVirtualMemory
4,1,c9b3700a77facf29172f32df6bc77f48,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegQueryInfoKeyW,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumKeyExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW


# Building Holdout Dataset

Using 90:10 split for Train:Holdout respectively

In [4]:
start_time()

#Time-based
X = oli.iloc[:,1:] #Features (including hash for now)
y = oli.iloc[:,0] #Labels
Train_X, Reserve_X, Train_y, Reserve_y = train_test_split(X, y, test_size=.10, random_state=1, shuffle=True)

Train_X.insert(0, 'malware', Train_y)
Reserve_X.insert(0, 'malware', Reserve_y)

Train = Train_X.copy(deep=True)
Reserve = Reserve_X.copy(deep=True)

Reserve.drop(axis=1, labels='hash', inplace=True)
Reserve.to_csv(index=False, chunksize=100, mode='w', path_or_buf='TB/Reserve.csv')

end_time()

  Train_X.insert(0, 'malware', Train_y)
  Reserve_X.insert(0, 'malware', Reserve_y)


Elapsed Time: 0.79582s



0.79582

In [5]:
print("TB Shape:", Train.shape)
Train.head()

TB Shape: (39488, 102)


Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
21089,1,c666e239ce25f91048ced38ccfe24659,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,LdrGetProcedureAddress,LookupAccountSidW,LdrGetProcedureAddress,LookupAccountSidW,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress
40618,1,96bd371670a23303d35dbe0a96f83d1e,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,...,FindResourceExW,LoadResource,FindResourceExW,LoadResource,LdrGetDllHandle,LdrGetProcedureAddress,LoadStringW,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle
7233,1,ea332897b590b8feca645d219d42838b,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,LdrGetDllHandle,...,FindResourceExW,LoadResource,FindResourceExW,LoadResource,LdrGetDllHandle,LdrGetProcedureAddress,LoadStringW,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle
31038,0,97c015f5978914e256445fa84c7d6613,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,SetFilePointer,NtReadFile,SetFilePointer,NtReadFile,SetFilePointer,NtReadFile,SetFilePointer,NtReadFile,SetFilePointer,NtReadFile
10082,1,4b795ce47b92f32bef67915ec79622ea,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,LoadResource,FindResourceExW,LoadResource,FindResourceExW,LoadResource,OleInitialize,FindResourceExW,LoadResource,FindResourceExW,LoadResource


# Data Rebalancing

To be applied to the train split (i.e., TB & IB)

RandomOverSampler is less prone to oversampling a sample/set of samples as is the case shown in [Balance Oliveira](https://github.com/jm55DLSU/THESIS/blob/main/ML%20Test/Balancing%20Oliveira/Balancing_Oliveira.ipynb).

In [6]:
def sample_distribution(dataset):
    distribution = []
    total_size = dataset.shape[0]
    dataset = str(dataset[['hash']].value_counts()[0:10]).split('\n')
    dataset.pop(0)
    dataset.pop(len(dataset)-1)
    for d in dataset:
        hs = d.split(' ')[0]
        qt = int(d.split(' ')[len(d.split(' '))-1])
        ratio = str(round(qt/total_size*100,2))+"%"
        distribution.append([hs,qt,ratio])
        print([hs,qt,ratio])
def rebalance(balancer):
    global Train
    #Time-based
    X = Train.iloc[:,1:]
    y = Train.iloc[:,0]
    print("Rebalance...")
    X, y = balancer.fit_resample(X, y)
    print("Rebalancing Finished!")
    X.insert(0, 'malware', y)
    Train = X.copy(deep=True)
    #Check sample distribution
    print("Sample Distribution (Top 10)")
    sample_distribution(Train)
    #Drop hash
    #Train.drop(axis=1, labels='hash', inplace=True)
    print("Time-based Label Counts:")
    print(Train['malware'].value_counts())
    Train.head()
    #Save to CSV
    Train.to_csv(index=False, chunksize=100, mode='w', path_or_buf='TB/Train.csv')
print("Time-based Label Counts:")
print(Train['malware'].value_counts())

Time-based Label Counts:
malware
1    38513
0      975
Name: count, dtype: int64


In [7]:
#ros = RandomOverSampler(sampling_strategy='minority', random_state=1, shrinkage=None)
#smoten = SMOTEN(sampling_strategy='minority', random_state=1, k_neighbors=5)

start_time()
rebalance(SMOTEN(sampling_strategy='minority', random_state=1, k_neighbors=math.ceil(math.sqrt(Train.shape[0]))))
end_time()

Rebalance...


Rebalancing Finished!
Sample Distribution (Top 10)


  X.insert(0, 'malware', y)


['03384ab6368b68ed16ecb9e6352539af', 26029, '33.79%']
['79b78bb3d583748040c41ded09555fd3', 7441, '9.66%']
['01ebb84c337b5e53b08603ab67b7f4b4', 3830, '4.97%']
['0084df58a48b23705f5f41c71c9789a7', 103, '0.13%']
['0470c05786ded1a769d47a092fad93ca', 44, '0.06%']
['0327301655f2e1c6bdbd4536a3349216', 43, '0.06%']
['022fee1488e6a0d06bc5882b998663f3', 31, '0.04%']
['01fe0f8b2484fa3995c192d660befdb1', 27, '0.04%']
['bdaaac3fa3f6796825a51ef1c0e5b3fd', 2, '0.0%']
['a71a2319cf8c74a89501eb80acd04fe6', 2, '0.0%']
Time-based Label Counts:
malware
1    38513
0    38513
Name: count, dtype: int64
Elapsed Time: 354.412796s



354.412796

In [11]:
Train[Train['hash']=='03384ab6368b68ed16ecb9e6352539af'].drop_duplicates()

Unnamed: 0,malware,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,...,t_90,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99
11905,0,03384ab6368b68ed16ecb9e6352539af,SetUnhandledExceptionFilter,LdrGetDllHandle,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,...,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey
39488,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,LdrGetProcedureAddress
39489,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtClose,RegCloseKey,LdrLoadDll,NtClose,LdrGetProcedureAddress,NtClose,NtClose,NtClose,NtClose
39490,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,NtClose,NtClose
39491,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,LdrGetProcedureAddress,NtClose
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43865,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,SetUnhandledExceptionFilter,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose
43874,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,NtClose
44055,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,NtClose,LdrGetProcedureAddress
44091,0,03384ab6368b68ed16ecb9e6352539af,GetSystemTimeAsFileTime,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtClose,NtClose,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,LdrGetProcedureAddress,NtClose,NtClose,NtClose,NtClose
