# Evaluation and Comparison of Boosted ML Models in Behavior-Based Malware Detection


## Notebook: Dataset Processing

***

**What is the objective of this file?**

To process the dataset so that it is suitable for use in Model Training and Model Evaluation.

## Checklist

- Ensure that you have installed the libraries needed to execute the training process.
- You can view the list of the specific versions in the thesis document or through the `.sh` or `.bat` files in the repository's home directory.

# 0. Preparations

In [None]:
#Python Libraries
import time, threading, math
from datetime import datetime

#Data/Dataset libraries
import pandas as pd
import numpy as np

#Split Sampler/Data Splitting
from sklearn.model_selection import train_test_split

#Oversampler
from imblearn.over_sampling import SMOTEN

#Label Encoding
# from sklearn.preprocessing import LabelEncoder

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#Timer
start = end = 0
LOG_FILENAME = "MalbehavD_Log.txt"
def logging(message):
    """Append ``message`` verbatim to the file named by ``LOG_FILENAME``.

    Nothing is added to the text (no newline, no timestamp); callers are
    responsible for any formatting. The file is created if it does not exist.

    NOTE: this function's name shadows the standard-library ``logging``
    module inside this notebook.
    """
    # The context manager closes the handle even when write() raises, which
    # a manual open()/close() pair does not guarantee.
    with open(LOG_FILENAME, "a") as log:
        log.write(message)
def start_time():
    """Store the current wall-clock time in the shared ``start`` marker."""
    global start
    start = time.time()
def end_time(process):
    """Log how long ``process`` took since the last ``start_time()`` call.

    One line of the form
    ``<now>@<DATASET_FILENAME>: <process> - <seconds>s`` is appended to the
    log through ``logging``, and the shared ``start`` marker is reset to 0.

    Returns the elapsed seconds, rounded to six decimal places.
    """
    global start
    seconds = round(time.time() - start, 6)
    start = 0
    logging(f"{datetime.now()}@{DATASET_FILENAME}: {process} - {seconds}s\n")
    return seconds

# 1. Import Datasets

**Notice:** 
1. Make sure to check the value on line 3 of the code cell below (`DATASET_FILENAME`) before running.
2. A backup of the processed datasets is already available in the `/Processed Datasets` folder. Simply unzip it to replace the contents of the `/Processed Datasets/IB` and `/Processed Datasets/TB` folders.

In [None]:
#Filenames
filename = ["malbd.csv"]
DATASET_FILENAME = filename[0] # <== CHANGE THIS ACCORDINGLY
API_LIST = "../../api_calls.txt"

#Load list of API calls (first line of API_LIST, comma-separated)
DELIMITER = "NaN"
# The context manager closes the file even if reading fails.
with open(API_LIST,"r") as API_FILE:
    # readline() keeps the line terminator; strip it first, otherwise the last
    # API name would end in "\n" and never match a value in the dataset.
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

#Importing Datasets
oli = pd.read_csv(DATASET_FILENAME)
print("Oliveira Info:")
oli.info()

In [None]:
# Display the loaded API-call vocabulary; the DELIMITER entry is the last item.
APIS

In [None]:
#Previewing Dataset (first rows of the frame exactly as loaded, before any cleaning)
oli.head()

In [None]:
# Class balance of the loaded dataset, counted on the `malware` label column.
print("Oliveira Label Counts")
print("0 as Benign, 1 as Malicious")
oli["malware"].value_counts()

In [None]:
print("Oliveira Unique API calls list")
# Flatten the columns at positions 1 through 100 into one long Series of values.
oli_unique = pd.Series(oli[list(oli.columns.values)[1:101]].values.ravel())
oli_unique.sort_values(inplace=True, ascending=False)
# head(20) yields exactly twenty bars, matching the "Top 20" title
# (a [0:19] slice would stop at nineteen).
oli_unique.value_counts().head(20).plot(kind='barh', figsize=(7,3.5), title='Top 20 API calls in \'Oliveira\'') #Top 20 only

In [None]:
# Distinct values of the flattened columns, in the descending order oli_unique was sorted into.
oli_unique.unique()

# 2. Dataset Preparation

## 2.1. Dataset Cleaning and Dataset Formatting

In [None]:
# Remove falsely labelled malicious samples
# reset_index gives the filtered frame a gap-free 0..n-1 index. Later cells
# build fresh Series (which carry their own 0..n-1 index) and assign them back
# into this frame; with gaps left by the filter, an index-aligned assignment
# would misplace values or produce NaN.
oli = oli[oli['type'] != '_'].reset_index(drop=True)

# Remove specific malware types
# removables = ['ransomware', 'miner', 'virus', 'spyware', 'hacktool', 'dropper', 'worm']
# for r in removables:
#     oli = oli[oli['type'] != r]

#Remove type column
type_col = oli.pop('type')

#Removing hash column
hash_col = oli.pop('hash')

#Re-arranging column positions: label first, remaining columns next,
#then hash and type last. Hash and type are retained for the benefit of
#model evaluation.
label_col = oli.pop('malware')
oli = pd.concat([label_col, oli, hash_col, type_col], axis=1)

oli

## 2.2. Inverse Data Encoding

*Encoded (Ordinal) API calls to String API Calls*

In [None]:
# Position of every API name in APIS, built once so each lookup is O(1)
# instead of a linear list search per cell. Walking the list in reverse lets
# the FIRST position of a repeated name win, which is what list.index returns.
API_TO_INDEX = {name: position for position, name in reversed(list(enumerate(APIS)))}

def convert(api:str):
    """Return the ordinal code of an API-call name, i.e. its position in ``APIS``.

    Raises ValueError when ``api`` is not in ``APIS`` (the same exception
    ``list.index`` raises).
    """
    try:
        return API_TO_INDEX[api]
    except (KeyError, TypeError):
        raise ValueError(f"{api!r} is not in list") from None

for j in range(1,101):
    # A plain list is written back positionally, row by row. Wrapping it in a
    # new Series would attach a fresh 0..n-1 index that need not match oli's
    # own index labels.
    oli.iloc[:,j] = [convert(api) for api in oli.iloc[:,j].to_list()]

oli

In [None]:
#Inverse Label Encoding
def inverse_label(item):
    global APIS
    return item.map(lambda x: APIS[int(x)])
start_time()
oli.iloc[:, 1:101] = oli.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')
end_time("Inverse Data Encode")

oli.head()

In [None]:
print("Oliveira Unique API calls list")
# Flatten the API-call columns (positions 1 through 100) into one long Series.
oli_unique = pd.Series(oli[list(oli.columns.values)[1:101]].values.ravel())
oli_unique.sort_values(inplace=True, ascending=False)
# head(20) yields exactly twenty bars, matching the "Top 20" title
# (a [0:19] slice would stop at nineteen).
oli_unique.value_counts().head(20).plot(kind='barh', figsize=(7,3.5), title='Top 20 API calls in \'Oliveira\'') #Top 20 only

In [None]:
# Distinct values of the flattened API-call columns, in the descending order oli_unique was sorted into.
oli_unique.unique()

## 2.3. Feature Duplicate Processing

*Building Time-based and Instance-Based Datasets*

In [None]:
TB = oli.copy(deep=True) #Time-based behavior (same as original)
IB = oli.copy(deep=True) #Instance-based behavior (to be created)

start_time()
# No transpose step is needed: every row of IB is already one sample, and
# DataFrame.transpose() returns a new frame rather than modifying IB, so a
# bare IB.transpose() call only costs time without changing anything.
print("Removing duplicates...")
print("Row:", end=" ")
for r in range(IB.shape[0]):
    #Per row (sample) removal of duplicates, thus cannot scale into the whole dataframe (which is way faster)
    row = IB.iloc[r, 1:101].drop_duplicates(keep='first', inplace=False).to_list()
    # Keep the first occurrence of each call in order, then pad with DELIMITER
    # so the sample still fills all 100 API-call slots.
    IB.iloc[r, 1:101] = row + ([DELIMITER]*(100-len(row)))
    if r % 100 == 0:
        print(r, end=" ")
print("\nDuplicates removed!")
end_time("Feature Duplicate Process")

In [None]:
# Display the instance-based frame (duplicates removed per row, padded with DELIMITER).
IB

# 5. (Manual) Ordinal Encoding

In [None]:
# Position of every API name in APIS, built once so each lookup is O(1)
# instead of a linear list search per cell. Walking the list in reverse lets
# the FIRST position of a repeated name win, which is what list.index returns.
API_TO_INDEX = {name: position for position, name in reversed(list(enumerate(APIS)))}

def convert(api:str):
    """Return the ordinal code of an API-call name, i.e. its position in ``APIS``.

    Raises ValueError when ``api`` is not in ``APIS`` (the same exception
    ``list.index`` raises).
    """
    try:
        return API_TO_INDEX[api]
    except (KeyError, TypeError):
        raise ValueError(f"{api!r} is not in list") from None
def ordinal_encode(offset):
    """Replace the API-call names in columns 1..100 of ``ENCODED[offset]`` with ordinal codes, in place."""
    frame = ENCODED[offset]
    for j in range(1,101):
        # A plain list is written back positionally; a freshly built Series
        # would carry its own 0..n-1 index that need not match the frame's.
        frame.iloc[:,j] = [convert(api) for api in frame.iloc[:,j].to_list()]
def _encode_or_record(offset):
    """Thread target: run ``ordinal_encode`` and keep any exception for the main thread."""
    try:
        ordinal_encode(offset)
    except Exception as error:
        ENCODE_ERRORS.append(error)

ENCODED = [TB.copy(deep=True), IB.copy(deep=True)]
ENCODE_ERRORS = []

tb_thread = threading.Thread(target=_encode_or_record, args=(0,))
ib_thread = threading.Thread(target=_encode_or_record, args=(1,))

start_time()
tb_thread.start()
ib_thread.start()
tb_thread.join()
ib_thread.join()
end_time("Label Encode")

# An exception raised inside a thread does not propagate through join(), so
# the notebook would otherwise carry on and save a partly encoded frame.
# Re-raise the first failure here instead.
if ENCODE_ERRORS:
    raise ENCODE_ERRORS[0]

# 6. Saving to File

In [None]:
def save_to_csv(dfs, filenames):
    """Write each DataFrame in ``dfs`` to the CSV target at the same position in ``filenames``.

    Only the first ``len(dfs)`` entries of ``filenames`` are used; any extra
    names are ignored. Files are overwritten and written without the index
    column. For path targets, missing parent directories are created first.

    Raises IndexError if ``filenames`` has fewer entries than ``dfs``.
    """
    import os  # local import: the rest of the notebook does not need it

    for position, frame in enumerate(dfs):
        target = filenames[position]
        if isinstance(target, (str, os.PathLike)):
            # to_csv does not create directories, so make sure a folder such
            # as "TB/" exists before writing into it.
            parent = os.path.dirname(os.fspath(target))
            if parent:
                os.makedirs(parent, exist_ok=True)
        frame.to_csv(index=False, chunksize=100, mode='w', path_or_buf=target)

# Outputs with API calls kept as strings. save_to_csv writes one file per
# frame, so with two frames per list only the first two names are used here.
STR_FILENAMES = [
    'TB/M-CATB_TB.csv',
    'IB/M-CATB_IB.csv',
    'TB/M-CATB_TB_Test.csv',
    'IB/M-CATB_IB_Test.csv',
]
STR_API = [frame.copy(deep=True) for frame in (TB, IB)]

# Outputs with API calls ordinal-encoded (same TB, IB order).
ENC_FILENAMES = [
    'TB/M-LGBM_TB.csv',
    'IB/M-LGBM_IB.csv',
    'TB/M-LGBM_TB_Test.csv',
    'IB/M-LGBM_IB_Test.csv',
]
ENC_API = ENCODED

start_time()
save_to_csv(STR_API, STR_FILENAMES)
save_to_csv(ENC_API, ENC_FILENAMES)
end_time('save_to_file')

In [None]:
logging("\n") #Appends a blank line to the log so the next run's entries are visually separated.