In [1]:
# Suppress warnings (ignore any warnings during the execution)
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import necessary libraries for data processing and modeling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from IPython.display import display, HTML
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [6]:
# Load the dataset
# Read 'malware_BinaryImbalanced.csv' into a pandas DataFrame
data = pd.read_csv('malware_BinaryImbalanced.csv')

# Display dataset info
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()
print(data.head())

# Selecting relevant columns as per the ReadMe.txt (list of columns that are relevant)
cols = ['classification', 'os', 'usage_counter', 'prio', 'static_prio', 'normal_prio', 'vm_pgoff',
        'vm_truncate_count', 'task_size', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm',
        'exec_vm', 'reserved_vm', 'nr_ptes', 'nvcsw', 'nivcsw', 'signal_nvcsw']
# Take an explicit copy: later cells assign into `df`, and doing that on a
# plain column selection of `data` triggers SettingWithCopyWarning and makes it
# unclear whether `data` is affected. With a copy, `data` stays untouched.
df = data[cols].copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hash               100000 non-null  object
 1   millisecond        100000 non-null  int64 
 2   classification     100000 non-null  object
 3   os                 100000 non-null  object
 4   state              100000 non-null  int64 
 5   usage_counter      100000 non-null  int64 
 6   prio               100000 non-null  int64 
 7   static_prio        100000 non-null  int64 
 8   normal_prio        100000 non-null  int64 
 9   policy             100000 non-null  int64 
 10  vm_pgoff           100000 non-null  int64 
 11  vm_truncate_count  100000 non-null  int64 
 12  task_size          100000 non-null  int64 
 13  cached_hole_size   100000 non-null  int64 
 14  free_area_cache    100000 non-null  int64 
 15  mm_users           100000 non-null  int64 
 16  map_count          10

In [7]:
# Normalize column names
# Remove leading/trailing whitespace from every column label
df = df.rename(columns={label: label.strip() for label in df.columns})
cols = df.columns  # Keep `cols` in sync with the cleaned labels

In [8]:

# Handle missing values
# Replace missing (NaN) values in numerical columns with the column's mean value.
#
# This is done with a single frame-level fillna instead of
# `df[col].fillna(..., inplace=True)`: the latter is chained assignment through
# an in-place method, which acts on an intermediate Series and is not
# guaranteed to write back into `df` (pandas warns that it stops working under
# copy-on-write). `df.mean(numeric_only=True)` yields one mean per numeric
# column, and fillna applies each mean to its own column only, so the
# non-numeric columns are left as they are. Rebinding `df` to the result also
# makes the cell safe to re-run.
df = df.fillna(df.mean(numeric_only=True))

In [9]:
# Encode the target variable 'classification'
# Map the categorical class labels in 'classification' to integer codes
le = preprocessing.LabelEncoder()
y = df['classification']
y_encoded = le.fit_transform(y)  # Learn the label set and encode it in one step (0 or 1)
df['classification'] = y_encoded  # Overwrite the original labels with their integer codes

In [10]:
# One-hot encode the categorical 'os' column
# Build the indicator columns from 'os' first, then assemble the numeric frame
df_dummies = pd.get_dummies(df[['os']])  # One indicator column per 'os' category
df_num = (
    df.copy(deep=True)                       # Work on a deep copy so `df` itself is not modified
      .join(df_dummies)                      # Append the indicator columns
      .drop(columns=['os', 'os_Windows'])    # Remove the raw column and one indicator ('os_Windows') to avoid multicollinearity
)

In [11]:
# Separate the target from the predictors
# y is the dependent variable; X holds every remaining column of df_num
y = df_num['classification']  # Target column
X = df_num.drop(columns=['classification'])  # All columns except the target

In [12]:

# Split the dataset into a training set (75%) and a testing set (25%).
# The split is stratified on y so that both partitions keep the class
# proportions of the full data; with an imbalanced target, a purely random
# split can leave the minority class under- or over-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [13]:

# Scale the data
# Standardize each feature to zero mean and unit standard deviation.
# The scaler learns its statistics from the training split only and the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)  # Learn per-feature mean and standard deviation from the training data
X_train_scaled = scaler.transform(X_train)  # Standardize the training data
X_test_scaled = scaler.transform(X_test)  # Standardize the test data with the training statistics

In [14]:
# Initialize models
# Initialize the machine learning models we will use.
# All three learners are stochastic (weight initialisation, bootstrap sampling,
# feature subsampling), so each one gets a fixed random_state; without it the
# tuned hyperparameters and reported metrics change from run to run.
# The value matches the random_state used for the train/test split.
mlp = MLPClassifier(max_iter=300, random_state=42)  # Multi-layer Perceptron (neural network)
rf = RandomForestClassifier(random_state=42)  # Random Forest classifier
# NOTE(review): `use_label_encoder` is kept as in the original call; it looks
# deprecated in recent XGBoost releases — confirm against the installed version.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)  # XGBoost classifier

In [15]:
# Hyperparameter search spaces
# One grid per model; every combination in a grid is evaluated by GridSearchCV

# MLP: layer layout, activation function and optimizer
mlp_params = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=['relu', 'tanh'],
    solver=['adam', 'sgd'],
)

# Random Forest: number of trees and maximum tree depth
rf_params = dict(n_estimators=[100, 200], max_depth=[10, 20, 30])

# XGBoost: number of boosting rounds and maximum tree depth
xgb_params = dict(n_estimators=[100, 200], max_depth=[3, 6, 10])

In [None]:
# Hyperparameter Tuning using Grid Search
# GridSearchCV performs cross-validation to find the best hyperparameters


def _run_grid_search(estimator, param_grid, X, y, scoring='f1', cv=5, n_jobs=-1):
    """Tune `estimator` over `param_grid` with cross-validated grid search.

    Parameters
    ----------
    estimator : scikit-learn compatible classifier to tune.
    param_grid : dict mapping parameter names to the lists of values to try.
    X, y : training features and labels the search is fitted on.
    scoring : metric used to rank parameter combinations (F1 by default).
    cv : number of cross-validation folds.
    n_jobs : number of parallel workers; -1 uses all available cores. The
        individual fits are independent, so this only affects wall-clock time.

    Returns
    -------
    The fitted GridSearchCV object (exposes `best_estimator_`, `best_params_`).
    """
    search = GridSearchCV(estimator, param_grid, scoring=scoring, cv=cv, n_jobs=n_jobs)
    search.fit(X, y)
    return search


# Grid Search for MLP (Neural Network)
mlp_grid = _run_grid_search(mlp, mlp_params, X_train_scaled, y_train)
mlp_best = mlp_grid.best_estimator_  # Best model after tuning (refit on the full training data)

# Grid Search for Random Forest
rf_grid = _run_grid_search(rf, rf_params, X_train_scaled, y_train)
rf_best = rf_grid.best_estimator_

# Grid Search for XGBoost
xgb_grid = _run_grid_search(xgb, xgb_params, X_train_scaled, y_train)
xgb_best = xgb_grid.best_estimator_


In [None]:
# Evaluate models and compare performance
# Score every tuned model on the held-out test set and draw all ROC curves on
# one figure.
#
# Evaluation and plotting live in a single cell on purpose: the inline
# backend renders and closes a figure at the end of the cell that created it,
# so finishing the plot (diagonal, labels, legend) in a later cell would draw
# onto a new, empty figure instead of the one holding the ROC curves.
models = {'MLP': mlp_best, 'Random Forest': rf_best, 'XGBoost': xgb_best}
# Hyperparameters actually selected by each grid search. This is what
# "Best Parameters" should report, rather than the full get_params() dump
# that also lists every untouched default.
best_params = {
    'MLP': mlp_grid.best_params_,
    'Random Forest': rf_grid.best_params_,
    'XGBoost': xgb_grid.best_params_,
}
results = {}

# Explicit figure/axes so every plotting call targets the same axes
fig, ax = plt.subplots(figsize=(10, 8))

# Loop through each model, make predictions, and calculate performance metrics
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)  # Predicted labels for the test set
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]  # Probability of the positive class

    accuracy = accuracy_score(y_test, y_pred)  # Calculate accuracy
    f1 = f1_score(y_test, y_pred)  # Calculate F1 score
    auc_score = roc_auc_score(y_test, y_pred_proba)  # Area under the ROC curve

    results[name] = {
        'Accuracy': accuracy,
        'F1 Score': f1,
        'AUC': auc_score,
        'Best Parameters': best_params[name],
    }

    # ROC curve for this model; the legend reuses the AUC computed above
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)  # False/true positive rates over all thresholds
    ax.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.2f})')

# ROC plot
ax.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random classifier
ax.set_xlim(0.0, 1.0)  # Set axis limits
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')  # Label for the x-axis
ax.set_ylabel('True Positive Rate')  # Label for the y-axis
ax.set_title('ROC Curve Comparison')  # Plot title
ax.legend(loc="lower right")  # Display the legend
plt.show()  # Show the plot

In [None]:

# Display Results
# Report each model's metrics: numeric scores are shown with four decimals,
# the parameter dictionary is printed as-is
print("Model Comparison Results:")
for model_name, model_metrics in results.items():
    print(f"\n{model_name}:")
    for label, score in model_metrics.items():
        if label == 'Best Parameters':
            print(f"  Best Parameters: {score}")
            continue
        print(f"  {label}: {score:.4f}")
