In [6]:
import warnings
warnings.filterwarnings("ignore")

In [7]:


# Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

In [8]:
# Load the dataset from the CSV file
df = pd.read_csv("Malware_BinaryImbalanced.csv")

# Display dataset info.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hash               100000 non-null  object
 1   millisecond        100000 non-null  int64 
 2   classification     100000 non-null  object
 3   os                 100000 non-null  object
 4   state              100000 non-null  int64 
 5   usage_counter      100000 non-null  int64 
 6   prio               100000 non-null  int64 
 7   static_prio        100000 non-null  int64 
 8   normal_prio        100000 non-null  int64 
 9   policy             100000 non-null  int64 
 10  vm_pgoff           100000 non-null  int64 
 11  vm_truncate_count  100000 non-null  int64 
 12  task_size          100000 non-null  int64 
 13  cached_hole_size   100000 non-null  int64 
 14  free_area_cache    100000 non-null  int64 
 15  mm_users           100000 non-null  int64 
 16  map_count          10

In [None]:


# Selecting relevant columns based on ReadMe.txt
cols = ['classification', 'os', 'usage_counter', 'prio', 'static_prio', 'normal_prio', 'vm_pgoff',
        'vm_truncate_count', 'task_size', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm',
        'exec_vm', 'reserved_vm', 'nr_ptes', 'nvcsw', 'nivcsw', 'signal_nvcsw']
# The raw frame is loaded as `df`; take an explicit copy so the column
# assignment below writes to an independent frame instead of a view of `df`
# (avoids SettingWithCopyWarning and keeps `df` untouched for re-runs).
data = df[cols].copy()

# Encode the target variable to binary
# malware = 1, beign = 0
# NOTE(review): 'beign' is the spelling this notebook has always mapped; the
# correctly spelled 'benign' is accepted as well -- confirm against the CSV.
label_map = {'malware': 1, 'beign': 0, 'benign': 0}
raw_labels = data['classification']
encoded_labels = raw_labels.map(label_map)
# Series.map() turns every label missing from the mapping into NaN; fail here
# with a clear message instead of letting NaN targets reach the models.
if encoded_labels.isna().any():
    unknown = sorted(raw_labels[encoded_labels.isna()].astype(str).unique())
    raise ValueError(f"Unmapped 'classification' labels: {unknown}")
data['classification'] = encoded_labels.astype(int)

# One-hot encoding for categorical column 'os'
data = pd.get_dummies(data, columns=['os'], drop_first=True)

# Split the data into training and testing sets
X = data.drop('classification', axis=1)
y = data['classification']
# Stratify on the target so both splits keep the same class ratio; the file
# name suggests the classes are imbalanced, where a plain random split can
# leave the minority class under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Kernel 2: Initialize and tune models
# Seed shared by every stochastic estimator so a Restart & Run All reproduces
# the same scores (same value as the train/test split seed).
SEED = 42

# Initialize models
# The MLP is sensitive to feature scale, so it is wrapped in a pipeline that
# standardises the inputs first. Because the scaler is part of the estimator,
# GridSearchCV refits it on each training fold (no leakage from validation
# folds) and it is applied automatically at predict time.
mlp = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter=300, random_state=SEED)),
])
rf = RandomForestClassifier(random_state=SEED)
# NOTE(review): `use_label_encoder` is kept as in the original call; newer
# xgboost releases may ignore it -- confirm against the installed version.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=SEED)

# Define hyperparameters for each model
# Keys carry the 'mlp__' prefix so the grid search routes them to the
# MLPClassifier step of the pipeline.
mlp_params = {'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50)],
              'mlp__activation': ['relu', 'tanh'],
              'mlp__solver': ['adam', 'sgd']}

rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20, 30]}

xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6, 10]}

# Kernel 3: Perform Grid Search for MLP
# 5-fold search over `mlp_params`, ranked by F1. fit() hands back the search
# object itself, so construction and fitting are chained.
mlp_grid = GridSearchCV(
    estimator=mlp,
    param_grid=mlp_params,
    scoring='f1',
    cv=5,
).fit(X_train, y_train)
# Keep the estimator selected by the search for the evaluation step.
mlp_best = mlp_grid.best_estimator_

# Kernel 4: Perform Grid Search for Random Forest
# 5-fold search over `rf_params`, ranked by F1. fit() hands back the search
# object itself, so construction and fitting are chained.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='f1',
    cv=5,
).fit(X_train, y_train)
# Keep the estimator selected by the search for the evaluation step.
rf_best = rf_grid.best_estimator_

# Kernel 5: Perform Grid Search for XGBoost
# 5-fold search over `xgb_params`, ranked by F1. fit() hands back the search
# object itself, so construction and fitting are chained.
xgb_grid = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring='f1',
    cv=5,
).fit(X_train, y_train)
# Keep the estimator selected by the search for the evaluation step.
xgb_best = xgb_grid.best_estimator_

# Kernel 6: Evaluate the models
# Evaluate the models on the test data
models = {'MLP': mlp_best, 'Random Forest': rf_best, 'XGBoost': xgb_best}
results = []

# Iterate through each model and calculate performance metrics
for name, model in models.items():
    # Hard 0/1 labels for the threshold-based metrics (accuracy, F1).
    y_pred = model.predict(X_test)
    # ROC AUC ranks samples by a continuous score, so it needs the predicted
    # probability of the positive class (column 1), not the hard labels;
    # feeding it 0/1 predictions evaluates a single threshold only.
    y_score = model.predict_proba(X_test)[:, 1]
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_score),
    })

# Create a results DataFrame (explicit column order for the report)
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'F1 Score', 'AUC'])
print(results_df)

# Kernel 7: Save the results and notebook
from pathlib import Path

import nbformat as nbf

# Single place for the output location; created on demand so the writes below
# do not fail on a missing directory.
output_dir = Path('/mnt/data')
output_dir.mkdir(parents=True, exist_ok=True)

# Save this code as a one-cell notebook (.ipynb).
# `__file__` only exists when the code runs as a script; inside a notebook
# kernel it is undefined, so look it up defensively instead of raising
# NameError.
source_path = globals().get('__file__')
if source_path is not None:
    nb = nbf.v4.new_notebook()
    # read_text() opens and closes the source file (no leaked handle).
    nb.cells.append(nbf.v4.new_code_cell(Path(source_path).read_text(encoding='utf-8')))
    with open(output_dir / 'malware_classification.ipynb', 'w', encoding='utf-8') as f:
        nbf.write(nb, f)
else:
    print("Skipping notebook export: '__file__' is not defined in this session.")

# Also save the results table as HTML
results_df.to_html(output_dir / 'comparison_results.html')
