In [153]:
import os

import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam


In [154]:
# Path to the BODMAS .npz archive read by the data-loading cell.
# It is a relative path, so it resolves against the kernel's working directory.
file_path = "../data/bodmas.npz"

In [155]:
# Human-readable labels for the 35 candidate feature columns, in column order.
# The names follow the field names of the PE/COFF file header and optional header.

# Fields named after the COFF file header (indices 0-6).
coff_header_fields = [
    "Machine", "NumberOfSections", "TimeDateStamp", "PointerToSymbolTable",
    "NumberOfSymbols", "SizeOfOptionalHeader", "Characteristics",
]

# Fields named after the optional header (indices 7-34).
optional_header_fields = [
    "Magic", "MajorLinkerVersion", "MinorLinkerVersion",
    "SizeOfCode", "SizeOfInitializedData", "SizeOfUninitializedData",
    "AddressOfEntryPoint", "BaseOfCode", "ImageBase",
    "SectionAlignment", "FileAlignment",
    "MajorOperatingSystemVersion", "MinorOperatingSystemVersion",
    "MajorImageVersion", "MinorImageVersion",
    "MajorSubsystemVersion", "MinorSubsystemVersion",
    "SizeOfImage", "SizeOfHeaders", "CheckSum",
    "Subsystem", "DllCharacteristics",
    "SizeOfStackReserve", "SizeOfStackCommit",
    "SizeOfHeapReserve", "SizeOfHeapCommit",
    "LoaderFlags", "NumberOfRvaAndSizes",
]

# Position in this list is the column index printed by the feature-selection cell.
features = coff_header_fields + optional_header_fields

In [156]:
# Load the feature matrix X and the label vector y from the .npz archive.
# np.load on an .npz returns an NpzFile that keeps the underlying file open,
# so it is used as a context manager: both arrays are read into memory inside
# the block and the file handle is released as soon as the block exits.
with np.load(file_path) as data:
    X = data['X']
    y = data['y']

# Every sample needs exactly one label; fail here rather than inside the split.
assert X.shape[0] == y.shape[0], "X and y have different numbers of rows"
print(X.shape, y.shape)

(134435, 2381) (134435,)


In [157]:
# Show every candidate feature name next to its column index
for idx, name in enumerate(features):
    print(idx, name)

# Feature selection: keep every listed column (indices 0 through 34).
# To drop features, replace this with an explicit list of the indices to keep.
# NOTE(review): this assumes column i of X corresponds to features[i]. X is
# loaded with 2381 columns and nothing in this notebook verifies the mapping
# between these names and the first 35 columns — confirm against the dataset docs.
selected_features = list(range(len(features)))

# X is overwritten in place: with a proper subset selected, re-running this
# cell would index into the already-reduced matrix instead of the loaded one.
X = X[:, selected_features]

0 Machine
1 NumberOfSections
2 TimeDateStamp
3 PointerToSymbolTable
4 NumberOfSymbols
5 SizeOfOptionalHeader
6 Characteristics
7 Magic
8 MajorLinkerVersion
9 MinorLinkerVersion
10 SizeOfCode
11 SizeOfInitializedData
12 SizeOfUninitializedData
13 AddressOfEntryPoint
14 BaseOfCode
15 ImageBase
16 SectionAlignment
17 FileAlignment
18 MajorOperatingSystemVersion
19 MinorOperatingSystemVersion
20 MajorImageVersion
21 MinorImageVersion
22 MajorSubsystemVersion
23 MinorSubsystemVersion
24 SizeOfImage
25 SizeOfHeaders
26 CheckSum
27 Subsystem
28 DllCharacteristics
29 SizeOfStackReserve
30 SizeOfStackCommit
31 SizeOfHeapReserve
32 SizeOfHeapCommit
33 LoaderFlags
34 NumberOfRvaAndSizes


In [158]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

# Shapes are printed in the order X_train, X_test, y_train, y_test.
print(*(part.shape for part in split_arrays))

(107548, 35) (26887, 35) (107548,) (26887,)


In [159]:
# Number of samples (rows) in the training and test partitions
train_rows, test_rows = len(X_train), len(X_test)
print(train_rows, test_rows)

107548 26887


In [160]:
# Convert the held-out integer labels to a one-hot matrix with Keras' to_categorical.
# NOTE(review): y_binary is not referenced by any other cell visible in this
# notebook — presumably left over from a Keras experiment; confirm before removing.
y_binary = to_categorical(y_test)
print(y_binary.shape)

(26887, 2)


In [165]:
# Train Random Forest
# Hyperparameters are gathered in one place; random_state pins the forest's randomness.
rf_params = dict(n_estimators=64, max_depth=10, random_state=0)
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Predict
# Score the held-out split and report the accuracy as a percentage.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % (100 * accuracy))

Accuracy: 93.15


In [162]:
# Save Model
# Persist the fitted classifier with joblib. The target directory is created
# first so that a fresh checkout without a models folder does not fail on save.
# NOTE(review): the recorded execution counts (training cell In [165], this
# cell In [162]) suggest this cell last ran before the classifier was re-fitted;
# re-run it after training so the file on disk matches the reported accuracy.
model_path = "../models/bodmas/model.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(clf, model_path)

['../models/bodmas/model.joblib']