In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Dataset 1

In [3]:
# Dataset 1: join the PE header table with the PE section table on the sample
# hash and label, keeping only rows that appear in both files.
header_df = pd.read_csv('/root/ByteMe/datasets/1/PE_Header.csv')
section_df = pd.read_csv('/root/ByteMe/datasets/1/PE_Section.csv')
ds1 = header_df.merge(section_df, how="inner", on=["SHA256", "Type"])

# Binarise the label: 0 stays 0, every other value becomes 1.
ds1['Type'] = ds1['Type'].ne(0).astype(int)

In [14]:
# Distribution


# Dataset 2

In [4]:
# Dataset 2: join the PE header table with the PE section table on the sample
# hash and label, keeping only rows that appear in both files.
header_df = pd.read_csv('/root/ByteMe/datasets/PE_Header2.csv')
section_df = pd.read_csv('/root/ByteMe/datasets/PE_Section2.csv')
ds2 = header_df.merge(section_df, how="inner", on=["SHA256", "Type"])

# Binarise the label: 0 stays 0, every other value becomes 1.
ds2['Type'] = ds2['Type'].ne(0).astype(int)

In [15]:
# Distribution


# Dataset 3: Dataset 1 + Dataset 2

In [5]:
# Dataset 3: stack both datasets, keep only the first row seen for each
# SHA256, and renumber the rows from 0.
ds3 = (
    pd.concat([ds1, ds2])
    .drop_duplicates(subset='SHA256')
    .reset_index(drop=True)
)

In [6]:
# Preview the first five rows of the combined, de-duplicated dataset.
ds3.head()

Unnamed: 0,SHA256,Type,e_magic,e_cblp,e_cp,e_crlc,e_cparhdr,e_minalloc,e_maxalloc,e_ss,...,tls_Characteristics,pdata_Misc_VirtualSize,pdata_VirtualAddress,pdata_SizeOfRawData,pdata_PointerToRawData,pdata_PointerToRelocations,pdata_PointerToLinenumbers,pdata_NumberOfRelocations,pdata_NumberOfLinenumbers,pdata_Characteristics
0,dacbe8cb72dd746539792a50e84965fefef73feaa07b5d...,0,23117,144,3,0,4,0,65535,0,...,0,0,0,0,0,0,0,0,0,0
1,d3dc7512ce75db33b2c3063fa99245e9ca9fe3b086462f...,0,23117,144,3,0,4,0,65535,0,...,0,0,0,0,0,0,0,0,0,0
2,b350fac81533f02981dc2176ed17163177d92d9405758e...,0,23117,144,3,0,4,0,65535,0,...,0,0,0,0,0,0,0,0,0,0
3,dfee618043a47b7b09305df0ca460559d9f567ee246c7b...,0,23117,144,3,0,4,0,65535,0,...,0,0,0,0,0,0,0,0,0,0
4,c7b2e4e4fb2fcc44c953673ff57c3d14bdf5d2008f35e9...,0,23117,144,3,0,4,0,65535,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Distribution


# Train

In [7]:
# Shuffle every row of the combined dataset (fixed seed, so the order is
# repeatable) and then renumber the index from 0.
dataset = ds3.sample(frac=1, random_state=42)
dataset = dataset.reset_index(drop=True)

In [8]:
# Features are every column except the sample hash and the label.
X = dataset.drop(columns=["SHA256", "Type"])
y = dataset["Type"]

# Hold out 30% of the rows for testing; the seed fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
from joblib import dump, load

In [11]:
# Fit a 100-tree random forest on the training split (fit returns the estimator).
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Persist the fitted forest; dump's return value is displayed as the cell output.
dump(rf_classifier, 'rf_classifier.joblib')



['rf_classifier.joblib']

In [14]:
# Reload the persisted random forest from disk, replacing the in-memory one.
# NOTE(review): joblib.load is pickle-based, so only load artifacts produced by
# this notebook or another trusted source.
rf_classifier = load('rf_classifier.joblib')

In [15]:
# Score the test split with the random forest; column 1 of predict_proba is
# the probability of the positive class (class 1).
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]
# Decision threshold: a sample is labelled 1 only when its probability is >= 0.8.
threshold = 0.8
y_pred = (y_pred_proba >= threshold).astype(int)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nThreshold: {threshold}")
print("Test Accuracy:", test_accuracy)

# Confusion matrix. labels=[0, 1] pins the matrix to 2x2 so the four-way
# unpacking below still works when one class is missing from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()
FPR = FP / (FP + TN)  # share of label-0 samples predicted as 1
TPR = TP / (TP + FN)  # share of label-1 samples predicted as 1

print("False Positive Rate:", FPR)
print("True Positive Rate:", TPR)


Threshold: 0.8
Test Accuracy: 0.9823451607337972
False Positive Rate: 0.015766653527788728
True Positive Rate: 0.9820226232157285


In [16]:
# Fit a decision tree on the existing train split and persist it.
# X_train / X_test / y_train / y_test come from the earlier train_test_split
# cell; repeating the identical split (same X, y, test_size and seed) here was
# redundant and has been dropped.
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
dump(dt_classifier, 'dt_classifier.joblib')

# Get predicted probabilities for the positive class (class 1)
y_pred_proba = dt_classifier.predict_proba(X_test)[:, 1]
# NOTE(review): no depth or leaf-size limit is set, so leaf probabilities are
# presumably almost always 0 or 1 and the threshold has little effect - confirm.
threshold = 0.9
# Convert probabilities to binary predictions using the threshold
y_pred = (y_pred_proba >= threshold).astype(int)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nThreshold: {threshold}")
print("Test Accuracy:", test_accuracy)

# Confusion matrix. labels=[0, 1] pins the matrix to 2x2 so the four-way
# unpacking below still works when one class is missing from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()
FPR = FP / (FP + TN)  # share of label-0 samples predicted as 1
TPR = TP / (TP + FN)  # share of label-1 samples predicted as 1

print("False Positive Rate:", FPR)
print("True Positive Rate:", TPR)


Threshold: 0.9
Test Accuracy: 0.9861981712576916
False Positive Rate: 0.05084745762711865
True Positive Rate: 0.9925262590896848


In [17]:
# Fit a histogram-based gradient boosting classifier on the train split and
# persist it.
gb_classifier = HistGradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train, y_train)
dump(gb_classifier, 'gb_classifier.joblib')

# Get predicted probabilities for the positive class (class 1)
y_pred_proba = gb_classifier.predict_proba(X_test)[:, 1]
# Decision threshold: a sample is labelled 1 only when its probability is >= 0.9.
threshold = 0.9
# Convert probabilities to binary predictions using the threshold
y_pred = (y_pred_proba >= threshold).astype(int)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nThreshold: {threshold}")
print("Test Accuracy:", test_accuracy)

# Confusion matrix. labels=[0, 1] pins the matrix to 2x2 so the four-way
# unpacking below still works when one class is missing from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()
FPR = FP / (FP + TN)  # share of label-0 samples predicted as 1
TPR = TP / (TP + FN)  # share of label-1 samples predicted as 1

print("False Positive Rate:", FPR)
print("True Positive Rate:", TPR)
    


Threshold: 0.9
Test Accuracy: 0.9870607855540859
False Positive Rate: 0.016160819865983445
True Positive Rate: 0.9876110961486668


# Neural networks


In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense, Add, Activation, BatchNormalization, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed


2024-04-11 03:18:57.421689: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-11 03:18:57.494845: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-11 03:18:57.785286: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [19]:

# Preprocessing: standardise the features. The scaler's statistics are learned
# from the training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the neural network model with residual connections
def residual_block(x, units, dropout_rate=0.5):
    """Apply two Dense -> BatchNorm -> ReLU -> Dropout stages plus a skip connection.

    Args:
        x: Input tensor of the block (the output of a previous Keras layer).
        units: Width of both Dense layers, and therefore of the block output.
        dropout_rate: Dropout rate applied after each activation.

    Returns:
        The element-wise sum of the shortcut tensor and the transformed tensor.
    """
    shortcut = x
    # Add() requires both operands to have the same width. When the incoming
    # tensor is not `units` wide, project the shortcut with a linear Dense
    # layer; when the widths already match, the input is added unchanged.
    if x.shape[-1] != units:
        shortcut = Dense(units)(x)

    y = x
    for _ in range(2):
        y = Dense(units)(y)
        y = BatchNormalization()(y)
        y = Activation('relu')(y)
        y = Dropout(dropout_rate)(y)

    return Add()([shortcut, y])  # Residual connection

# Seed Python, NumPy and the Keras backend before any layer is created so that
# weight initialisation, dropout masks and batch shuffling are repeatable.
set_random_seed(42)

# Stem: project the standardised features to 64 units.
input_layer = Input(shape=(X_train_scaled.shape[1],))
x = Dense(64)(input_layer)
x = BatchNormalization()(x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)

# Add residual blocks
num_blocks = 3  # Adjust the number of residual blocks as needed
for _ in range(num_blocks):
    x = residual_block(x, units=64, dropout_rate=0.5)

# Output layer: a single sigmoid unit giving the probability of class 1.
output_layer = Dense(1, activation='sigmoid')(x)

# Create the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train for 10 epochs with a batch size of 32; validation_split=0.2 holds out
# 20% of the training rows for per-epoch validation.
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2)



Epoch 1/10
[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.8124 - loss: 0.4908 - val_accuracy: 0.9031 - val_loss: 0.2511
Epoch 2/10
[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9163 - loss: 0.2140 - val_accuracy: 0.9342 - val_loss: 0.1912
Epoch 3/10
[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9289 - loss: 0.1854 - val_accuracy: 0.9539 - val_loss: 0.1600
Epoch 4/10
[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9423 - loss: 0.1626 - val_accuracy: 0.9596 - val_loss: 0.1532
Epoch 5/10
[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.9503 - loss: 0.1457 - val_accuracy: 0.9688 - val_loss: 0.1265
Epoch 6/10
[1m1015/1015[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 5ms/step - accuracy: 0.9603 - loss: 0.1186 - val_accuracy: 0.9730 - val_loss: 0.1105
Epoch 7/10
[1

<keras.src.callbacks.history.History at 0x7f2355f7d2a0>

In [21]:
# Persist the trained network to a single .keras file.
model.save(filepath="keras_model.keras")

# Persist the fitted scaler with joblib; dump's return value is the cell output.
scaler_filename = "scaler.save"
dump(value=scaler, filename=scaler_filename)

['scaler.save']

In [22]:
# Evaluate the model on the test set. The single sigmoid output arrives as an
# (n_samples, 1) column; flatten it so the predictions are 1-D like y_test and
# like the other models' predictions.
y_pred_proba = model.predict(X_test_scaled).ravel()
# Decision threshold: a sample is labelled 1 only when its probability is >= 0.85.
threshold = 0.85
# Convert probabilities to binary predictions using the threshold
y_pred = (y_pred_proba >= threshold).astype(int)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"\nThreshold: {threshold}")
print("Test Accuracy:", test_accuracy)

# Confusion matrix. labels=[0, 1] pins the matrix to 2x2 so the four-way
# unpacking below still works when one class is missing from y_test / y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()
FPR = FP / (FP + TN)  # share of label-0 samples predicted as 1
TPR = TP / (TP + FN)  # share of label-1 samples predicted as 1

print("False Positive Rate:", FPR)
print("True Positive Rate:", TPR)

[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

Threshold: 0.85
Test Accuracy: 0.973488987290816
False Positive Rate: 0.07725660228616477
True Positive Rate: 0.9821572852141126


In [16]:
# Recompute each model's thresholded test-set predictions, using the same
# per-model thresholds as the individual evaluation cells.

# Random forest
rf_threshold = 0.8
rf_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]
rf_pred = (rf_pred_proba >= rf_threshold).astype(int)

# Decision tree
dt_threshold = 0.9
dt_pred_proba = dt_classifier.predict_proba(X_test)[:, 1]
dt_pred = (dt_pred_proba >= dt_threshold).astype(int)

# Histogram gradient boosting
gb_threshold = 0.9
gb_pred_proba = gb_classifier.predict_proba(X_test)[:, 1]
gb_pred = (gb_pred_proba >= gb_threshold).astype(int)

# Neural network: predictions are flattened to 1-D to match the other models.
nn_threshold = 0.85
nn_pred_proba = model.predict(X_test_scaled)
nn_pred = (nn_pred_proba >= nn_threshold).astype(int).flatten()

# Sanity check: all four prediction vectors should report the same shape.
print("rf_pred shape:", rf_pred.shape)
print("dt_pred shape:", dt_pred.shape)
print("gb_pred shape:", gb_pred.shape)
print("nn_pred shape:", nn_pred.shape)


[1m544/544[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
rf_pred shape: (17389,)
dt_pred shape: (17389,)
gb_pred shape: (17389,)
nn_pred shape: (17389,)


In [18]:
# Stack the per-model 0/1 predictions: one row per model, one column per sample.
combined_preds = np.array([rf_pred, dt_pred, gb_pred, nn_pred])

# Majority vote: predict 1 only when strictly more than half of the models
# vote 1. With an even number of models a tie therefore resolves to 0, which
# is what np.round(mean) did implicitly (0.5 rounds to the even value 0). The
# result is an integer label array rather than floats.
n_models = combined_preds.shape[0]
majority_vote = (combined_preds.sum(axis=0) * 2 > n_models).astype(int)

# Evaluate the majority voting ensemble
ensemble_accuracy = accuracy_score(y_test, majority_vote)
print("Ensemble Accuracy:", ensemble_accuracy)

# labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking below still
# works when one class is missing from y_test / majority_vote.
conf_matrix = confusion_matrix(y_test, majority_vote, labels=[0, 1])

# Extract TN, FP, FN, TP from confusion matrix
TN, FP, FN, TP = conf_matrix.ravel()

# Calculate True Positive Rate (TPR) and False Positive Rate (FPR)
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)

print("Ensemble True Positive Rate (TPR):", TPR)
print("Ensemble False Positive Rate (FPR):", FPR)

Ensemble Accuracy: 0.9849330036229801
Ensemble True Positive Rate (TPR): 0.9851198491785618
Ensemble False Positive Rate (FPR): 0.016160819865983445
