# Load Libraries

In [1]:
import time, datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.model_selection as model_selection
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import HistGradientBoostingClassifier as HGBT

# Load Dataset

In [2]:
# Read the label-encoded MalbehavD dataset into a DataFrame.
# low_memory=False tells pandas not to parse the file in chunks, which avoids
# mixed-type inference within a column.
DATASET_CSV = 'MalbehavD_LabelEncoded.csv'
malbehavd = pd.read_csv(DATASET_CSV, low_memory=False)

# Preview the first rows
malbehavd.head()

Unnamed: 0,sha256,malware,0,1,2,3,4,5,6,7,...,165,166,167,168,169,170,171,172,173,174
0,5c18291c481a192ed5003084dab2d8a117fd3736359218...,0,26,2,65,52,29,31,44,37,...,1,1,1,1,1,1,1,1,1,1
1,4683faf3da550ffb594cf5513c4cbb34f64df85f27fd1c...,0,35,19,65,9,78,92,46,1,...,1,1,1,1,1,1,1,1,1,1
2,9a0aea1c7290031d7c3429d0e921f107282cc6eab854ee...,0,14,7,26,35,34,77,94,22,...,1,1,1,1,1,1,1,1,1,1
3,e0f3e4d5f50afd9c31e51dd9941c5a52d57c7c524f5d11...,0,39,31,35,95,66,43,85,54,...,1,1,1,1,1,1,1,1,1,1
4,ec2b6d29992f13e74015ff0b129150b4afae15c593e4b7...,0,26,23,60,94,104,83,56,53,...,1,1,1,1,1,1,1,1,1,1


In [3]:
# Feature columns are named '0', '1', '2', ... as strings.
# The 2 subtracted from the column count accounts for the non-feature columns
# (presumably 'sha256' and 'malware' -- confirm if the CSV layout changes).
n_features = malbehavd.shape[1] - 2
malbehavd_feats = [str(col_idx) for col_idx in range(n_features)]

# Feature matrix: only the numbered columns
X = malbehavd[malbehavd_feats]
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,165,166,167,168,169,170,171,172,173,174
0,26,2,65,52,29,31,44,37,63,31,...,1,1,1,1,1,1,1,1,1,1
1,35,19,65,9,78,92,46,1,85,98,...,1,1,1,1,1,1,1,1,1,1
2,14,7,26,35,34,77,94,22,105,73,...,1,1,1,1,1,1,1,1,1,1
3,39,31,35,95,66,43,85,54,62,48,...,1,1,1,1,1,1,1,1,1,1
4,26,23,60,94,104,83,56,53,81,48,...,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2565,3,13,1,77,44,13,83,2,43,85,...,1,1,1,1,1,1,1,1,1,1
2566,31,62,39,68,93,83,78,93,71,66,...,1,1,1,1,1,1,1,1,1,1
2567,10,51,1,77,44,31,100,79,85,25,...,1,1,1,1,1,1,1,1,1,1
2568,28,30,90,21,33,44,45,70,85,51,...,1,1,1,1,1,1,1,1,1,1


In [4]:
# Target vector: the 'malware' column as a NumPy array
malware_col = malbehavd['malware']
y = malware_col.to_numpy()

# Distinct class labels found in the target column
labels = malware_col.unique()
print("MalbehavD - No. of unique labels: ", labels.size)
print(labels)

MalbehavD - No. of unique labels:  2
[0 1]


In [5]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# Train Model

In [6]:
# Hyper-parameters for the histogram-based gradient boosting classifier,
# spelled out in full so the configuration does not depend on library defaults.
# NOTE(review): per the scikit-learn docs, early_stopping='auto' only enables
# early stopping for more than 10000 samples; this dataset looks smaller, so
# scoring / validation_fraction / n_iter_no_change / tol are likely unused
# and all max_iter boosting rounds run -- confirm.
hgbt_params = dict(
    loss='log_loss',
    learning_rate=0.1,
    max_iter=1000,
    max_leaf_nodes=None,
    max_depth=None,
    min_samples_leaf=20,
    l2_regularization=0.0,
    max_bins=255,
    categorical_features=None,
    monotonic_cst=None,
    interaction_cst=None,
    warm_start=False,
    early_stopping='auto',
    scoring='loss',
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1e-07,
    verbose=False,
    random_state=1,
    class_weight=None,
)

# Fit on the training split
hgbt = HGBT(**hgbt_params)
hgbt.fit(X_train, y_train)

# Predict on the held-out split and report per-class metrics to 4 decimals
y_pred = hgbt.predict(X_test)
hgbt_cr = classification_report(y_true=y_test, y_pred=y_pred, digits=4)
print(hgbt_cr)

              precision    recall  f1-score   support

           0     0.8777    0.9799    0.9260       249
           1     0.9788    0.8717    0.9222       265

    accuracy                         0.9241       514
   macro avg     0.9283    0.9258    0.9241       514
weighted avg     0.9298    0.9241    0.9240       514



# Saving Model

In [7]:
from pathlib import Path

from joblib import dump, load

# Where the fitted model is persisted; the file extension can technically be arbitrary
MODEL_PATH = Path('./Saved Model/saved_model.joblib')

# dump() does not create missing directories, so make sure the target folder
# exists first (otherwise a fresh checkout fails with FileNotFoundError).
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Dump the trained/fitted model to file
dump(hgbt, MODEL_PATH)

# Load the trained/fitted model back from file.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
hgbt_file = load(MODEL_PATH)

# Re-evaluating the Saved Model

In [8]:
# Score the held-out split again, this time with the model reloaded from disk;
# the report is expected to match the one from the in-memory model.
y_pred = hgbt_file.predict(X_test)
hgbt_cr = classification_report(y_true=y_test, y_pred=y_pred, digits=4)
print(hgbt_cr)

              precision    recall  f1-score   support

           0     0.8777    0.9799    0.9260       249
           1     0.9788    0.8717    0.9222       265

    accuracy                         0.9241       514
   macro avg     0.9283    0.9258    0.9241       514
weighted avg     0.9298    0.9241    0.9240       514

