# In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from IPython.display import display, HTML
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Load the raw dataset from the CSV file.
data = pd.read_csv('malware_BinaryImbalanced.csv')

# Columns kept for modelling (selected based on ReadMe.txt).
cols = ['classification', 'os', 'usage_counter', 'prio', 'static_prio', 'normal_prio', 'vm_pgoff',
        'vm_truncate_count', 'task_size', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm',
        'exec_vm', 'reserved_vm', 'nr_ptes', 'nvcsw', 'nivcsw', 'signal_nvcsw']

# Strip whitespace from the header names *before* selecting, so a padded header
# (e.g. ' os') still matches the names in `cols` instead of raising KeyError.
# `data` itself is left untouched; `.copy()` yields an independent frame that is
# safe to modify below.
df = data.rename(columns=lambda x: x.strip())[cols].copy()
cols = df.columns

# Replace missing values in numeric columns with the column mean.
# The result is assigned back to the frame: `df[col].fillna(..., inplace=True)`
# is chained assignment, which does not update `df` under pandas Copy-on-Write.
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].fillna(df[col].mean())

# Encode the class labels as integers.
# NOTE(review): the integer assigned to each class follows `le.classes_`;
# confirm that the class of interest ends up encoded as 1.
y = df['classification']
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)
df['classification'] = y_encoded

# Convert the nominal 'os' variable to binary indicator columns.
df_num = df.copy(deep=True)
df_dummies = pd.get_dummies(df_num[['os']])
df_num = df_num.join(df_dummies)
# Drop the original nominal column and the 'os_Windows' indicator, leaving the
# remaining indicators (raises KeyError if either column is absent).
df_num = df_num.drop(columns=['os', 'os_Windows'])

# Split the data into training and testing sets (75% / 25%).
X = df_num.drop('classification', axis=1)
y = df_num['classification']
# stratify=y keeps the class proportions the same in both splits, so the
# held-out metrics are not distorted by an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Scale the data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize models.
# Each estimator is seeded so repeated runs give the same scores and the same
# selected hyperparameters.
mlp = MLPClassifier(max_iter=300, random_state=42)
rf = RandomForestClassifier(random_state=42)
# `use_label_encoder` is deprecated and later removed in XGBoost, so it is no
# longer passed.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)

# Define hyperparameters for each model
mlp_params = {'hidden_layer_sizes': [(50,), (100,), (50, 50)],
              'activation': ['relu', 'tanh'],
              'solver': ['adam', 'sgd']}

rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20, 30]}

xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6, 10]}

# Every search below uses 5-fold cross-validation on the scaled training data
# and ranks candidates by F1 score.

# Grid Search for MLP
mlp_grid = GridSearchCV(mlp, mlp_params, scoring='f1', cv=5)
mlp_grid.fit(X_train_scaled, y_train)
mlp_best = mlp_grid.best_estimator_

# Grid Search for Random Forest
rf_grid = GridSearchCV(rf, rf_params, scoring='f1', cv=5)
rf_grid.fit(X_train_scaled, y_train)
rf_best = rf_grid.best_estimator_

# Grid Search for XGBoost
xgb_grid = GridSearchCV(xgb, xgb_params, scoring='f1', cv=5)
xgb_grid.fit(X_train_scaled, y_train)
xgb_best = xgb_grid.best_estimator_

# Predictions and Metrics
models = {'MLP': mlp_best, 'Random Forest': rf_best, 'XGBoost': xgb_best}
# Hyperparameters actually chosen by each grid search. `model.get_params()`
# would return every constructor argument of the estimator instead, which hides
# the values the search selected.
best_params = {'MLP': mlp_grid.best_params_,
               'Random Forest': rf_grid.best_params_,
               'XGBoost': xgb_grid.best_params_}
results = {}

plt.figure(figsize=(10, 8))

for name, model in models.items():
    # Hard class predictions and the predicted probability of class 1.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    results[name] = {'Accuracy': accuracy, 'F1 Score': f1, 'AUC': auc_score,
                     'Best Parameters': best_params[name]}

    # Add this model's ROC curve to the shared comparison figure.
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc="lower right")
plt.show()

# Print a per-model summary: numeric metrics to four decimals, and the
# 'Best Parameters' entry verbatim.
print("Model Comparison Results:")
for model_name, summary in results.items():
    print(f"\n{model_name}:")
    for label, entry in summary.items():
        if label == 'Best Parameters':
            print(f"  Best Parameters: {entry}")
            continue
        print(f"  {label}: {entry:.4f}")