In [1]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from IPython.display import display, HTML
from sklearn import preprocessing

In [6]:
# Load the dataset from the CSV file
data = pd.read_csv('malware_BinaryImbalanced.csv')

# Display dataset info
print(data.info())
print(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 36 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hash               100000 non-null  object
 1   millisecond        100000 non-null  int64 
 2   classification     100000 non-null  object
 3   os                 100000 non-null  object
 4   state              100000 non-null  int64 
 5   usage_counter      100000 non-null  int64 
 6   prio               100000 non-null  int64 
 7   static_prio        100000 non-null  int64 
 8   normal_prio        100000 non-null  int64 
 9   policy             100000 non-null  int64 
 10  vm_pgoff           100000 non-null  int64 
 11  vm_truncate_count  100000 non-null  int64 
 12  task_size          100000 non-null  int64 
 13  cached_hole_size   100000 non-null  int64 
 14  free_area_cache    100000 non-null  int64 
 15  mm_users           100000 non-null  int64 
 16  map_count          10

In [7]:
# Selecting relevant columns based on ReadMe.txt
cols = ['classification', 'os', 'usage_counter', 'prio', 'static_prio', 'normal_prio', 'vm_pgoff', 
        'vm_truncate_count', 'task_size', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm',
        'exec_vm', 'reserved_vm', 'nr_ptes', 'nvcsw', 'nivcsw', 'signal_nvcsw']
df = data[cols]

# strip column names
df = df.rename(columns=lambda x: x.strip())
cols = df.columns

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

# replace missing values in numerical variables by using mean value #################################
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]): # Use pandas to check for numeric dtype.
        df[col].fillna(df[col].mean(), inplace=True)

# check again whether there are missing values
print('ColumnName, DataType, MissingValues')
for i in cols:
    print(i, ',', df[i].dtype, ',', df[i].isnull().any())

# remove column which are not appropriate to be included in this classification task
# In this case we do not drop any columns, but we can drop the classification column to create the X data.
# df=df.drop('classification',axis=1)

# encode labels
y = df['classification']  # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)  # encode nominal labels to integers #####################################
df['classification'] = y_encoded

# print out and display dataframe as tables in HTML
display(HTML(df.head(10).to_html()))

print('Column Datatypes:\n', df.dtypes)

# convert all nominal variables to binary variables
df_num = df.copy(deep=True)
# create new binary columns
df_dummies = pd.get_dummies(df_num[['os']])
# add them to dataframe
df_num = df_num.join(df_dummies)
# drop original columns
df_num = df_num.drop('os', axis=1)

# drop extra binary columns, since we only need N-1 binary columns
df_num = df_num.drop('os_Windows', axis=1) #Drop one of the OS columns.

display(HTML(df_num.head(10).to_html())) #correct display.

Unnamed: 0,classification,os,usage_counter,prio,static_prio,normal_prio,vm_pgoff,vm_truncate_count,task_size,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,nvcsw,nivcsw,signal_nvcsw
0,benign,Ubuntu,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
1,benign,CentOS,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
2,benign,Ubuntu,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
3,benign,CentOS,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
4,benign,Mac,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
5,benign,Windows,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
6,benign,Windows,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
7,benign,CentOS,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
8,benign,Ubuntu,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
9,benign,Mac,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0


ColumnName, DataType, MissingValues
classification , object , False
os , object , False
usage_counter , int64 , False
prio , int64 , False
static_prio , int64 , False
normal_prio , int64 , False
vm_pgoff , int64 , False
vm_truncate_count , int64 , False
task_size , int64 , False
map_count , int64 , False
hiwater_rss , int64 , False
total_vm , int64 , False
shared_vm , int64 , False
exec_vm , int64 , False
reserved_vm , int64 , False
nr_ptes , int64 , False
nvcsw , int64 , False
nivcsw , int64 , False
signal_nvcsw , int64 , False


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


Unnamed: 0,classification,os,usage_counter,prio,static_prio,normal_prio,vm_pgoff,vm_truncate_count,task_size,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,nvcsw,nivcsw,signal_nvcsw
0,0,Ubuntu,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
1,0,CentOS,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
2,0,Ubuntu,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
3,0,CentOS,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
4,0,Mac,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
5,0,Windows,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
6,0,Windows,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
7,0,CentOS,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
8,0,Ubuntu,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0
9,0,Mac,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0


Column Datatypes:
 classification        int64
os                   object
usage_counter         int64
prio                  int64
static_prio           int64
normal_prio           int64
vm_pgoff              int64
vm_truncate_count     int64
task_size             int64
map_count             int64
hiwater_rss           int64
total_vm              int64
shared_vm             int64
exec_vm               int64
reserved_vm           int64
nr_ptes               int64
nvcsw                 int64
nivcsw                int64
signal_nvcsw          int64
dtype: object


Unnamed: 0,classification,usage_counter,prio,static_prio,normal_prio,vm_pgoff,vm_truncate_count,task_size,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,nvcsw,nivcsw,signal_nvcsw,os_CentOS,os_Debian,os_Mac,os_Ubuntu
0,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,False,True
1,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,True,False,False,False
2,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,False,True
3,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,True,False,False,False
4,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,True,False
5,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,False,False
6,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,False,False
7,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,True,False,False,False
8,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,False,True
9,0,0,3069403136,16447,0,0,14739,0,7903,0,88,120,120,80,0,349169,0,0,False,False,True,False


In [8]:
# Split the data into training and testing sets
X = data.drop('classification', axis=1)
y = data['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
# Initialize models
mlp = MLPClassifier(max_iter=300)
rf = RandomForestClassifier()
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Define hyperparameters for each model
mlp_params = {'hidden_layer_sizes': [(50,), (100,), (50,50)],
               'activation': ['relu', 'tanh'],
               'solver': ['adam', 'sgd']}

rf_params = {'n_estimators': [100, 200], 'max_depth': [10, 20, 30]}

xgb_params = {'n_estimators': [100, 200], 'max_depth': [3, 6, 10]}

In [10]:
# Grid Search for MLP
mlp_grid = GridSearchCV(mlp, mlp_params, scoring='f1', cv=5)
mlp_grid.fit(X_train, y_train)
mlp_best = mlp_grid.best_estimator_

# Grid Search for Random Forest
rf_grid = GridSearchCV(rf, rf_params, scoring='f1', cv=5)
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_estimator_

# Grid Search for XGBoost
xgb_grid = GridSearchCV(xgb, xgb_params, scoring='f1', cv=5)
xgb_grid.fit(X_train, y_train)
xgb_best = xgb_grid.best_estimator_

ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 754, in fit
    return self._fit(X, y, incremental=False)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 442, in _fit
    X, y = self._validate_input(X, y, incremental, reset=first_pass)
           ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 1114, in _validate_input
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        reset=reset,
        ^^^^^^^^^^^^
    )
    ^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: '1dec265aeda7b58e4173f47af0641a949937edbf21904ff1b6681c5348642387'

--------------------------------------------------------------------------------
48 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 754, in fit
    return self._fit(X, y, incremental=False)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 442, in _fit
    X, y = self._validate_input(X, y, incremental, reset=first_pass)
           ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 1114, in _validate_input
    X, y = validate_data(
           ~~~~~~~~~~~~~^
        self,
        ^^^^^
    ...<5 lines>...
        reset=reset,
        ^^^^^^^^^^^^
    )
    ^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 2961, in validate_data
    X, y = check_X_y(X, y, **check_params)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1370, in check_X_y
    X = check_array(
        X,
    ...<12 lines>...
        input_name="X",
    )
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\utils\_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "c:\Users\harip\AppData\Local\Programs\Python\Python313\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
ValueError: could not convert string to float: '21b41814b140cd9824b72e765270800f7975fa4c8955396a95370567d95138ae'


ModuleNotFoundError: No module named 'imblearn'