In [25]:
# Template for Coursework of GINT
import pandas as pd
from matplotlib import pyplot
from sklearn import metrics 
from sklearn.model_selection import KFold
import numpy as np 
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
#paths to the file containing the samples with the processed features
feature_of_counts = "../processed_data/feature_vectors_counts.csv"

In [3]:
# Importing the dataset, and performing an undersampling of the benignware
dataset = pd.read_csv(feature_of_counts, index_col=0)

malware= dataset.loc[dataset["malware"] == True]
#print(type(malware))
benign = dataset.loc[dataset["malware"] == False]
#print(len(benign))
benign_undersample = benign.sample(n=len(malware), replace= False, random_state=42)
#print(len(benign_undersample))

dataset= pd.concat([malware, benign_undersample])
X = dataset.iloc[:,1:9].values
print(len(X))
y = dataset.iloc[:, 9].values
print(len(y))

11120
11120


In [4]:
# Splitting (randomly) the dataset into the Training set and the (unseen) Test set
# Note this is only for the first task of the coursework. You'll need a different approach for the other tasks, as they also need a validation stage in addition to the test with unseen data.
# Also note the split is training 80% and test 20%) 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y.astype(int), random_state=42, test_size = 0.2)


In [5]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


## Random Forest Classifier

Based on [this](https://blog.dataiku.com/narrowing-the-search-which-hyperparameters-really-matter) article, min_samples_leaf and max_features are the most important hyperparameters for the Random Forest Classifier. At least for his project.

In [6]:
# Fitting the model to the Training set
from sklearn.ensemble import RandomForestClassifier

In [11]:
# creating a RF classifier
AUC_rfc_min = 0
X_train_RF , y_train_RF = np.array(X_train), np.array(y_train)
kf = KFold(n_splits=3,shuffle=True)
avarage_auc_RF_min = 0 
for min_samples_leaf in [1, 20, 40]:
    for max_features in [None, "sqrt", 0.2]:
        avarage_auc = 0
        for train, test in kf.split(X_train_RF):
            training_X , training_y = X_train_RF[train], y_train_RF[train]
            test_X, train_y = X_train_RF[test], y_train_RF[test]

            clf = RandomForestClassifier(min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=42)
            clf.fit(training_X, training_y)

            scores_clf = clf.predict_proba(test_X)[:,1]

            AUC_rfc = metrics.roc_auc_score(train_y, scores_clf)
            if AUC_rfc > AUC_rfc_min:
                AUC_rfc_min = AUC_rfc
                min_samples_leaf_best = min_samples_leaf
                max_features_best = max_features
                print(f"min_samples_leaf: {min_samples_leaf_best}, max_features: {max_features_best} gives AUC: {AUC_rfc_min}")
            avarage_auc = avarage_auc + AUC_rfc
        avarage_auc = avarage_auc/3
        if avarage_auc > avarage_auc_RF_min:
            avarage_auc_RF_min = avarage_auc
            avarage_min_samples_leaf_best = min_samples_leaf
            avarage_max_features_best = max_features
            print(f"min_samples_leaf: {avarage_min_samples_leaf_best}, max_features: {avarage_max_features_best} gives the avarage AUC: {avarage_auc_RF_min}")
            

min_samples_leaf: 1, max_features: None gives AUC: 0.9721065530537266
min_samples_leaf: 1, max_features: None gives AUC: 0.9765431767345951
min_samples_leaf: 1, max_features: None gives AUC: 0.9780497349441448
min_samples_leaf: 1, max_features: None gives the avarage AUC: 0.9755664882441556
min_samples_leaf: 1, max_features: sqrt gives AUC: 0.9791345061254889
min_samples_leaf: 1, max_features: sqrt gives AUC: 0.979617920545617
min_samples_leaf: 1, max_features: sqrt gives the avarage AUC: 0.9788276904559304
min_samples_leaf: 1, max_features: 0.2 gives AUC: 0.9801724836240218


In [9]:
# Training the model on the training dataset
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

In [17]:
# performing probability predictions on the test dataset
# scores_clf = clf.predict_proba(X_test)[:,1]

## Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

In [16]:
AUC_log_min = 0
X_train_log , y_train_log = np.array(X_train), np.array(y_train)
kf = KFold(n_splits=3,shuffle=True)
avarage_auc_log_min = 0 

for algorithm in ['lbfgs', 'newton-cg']:
  for toleration in (1e-4, 1e-3,1e-2):
    for regulation in (1e-1, 1e1):
      for iter in (100, 1000):
        avarage_auc = 0 
        for train, test in kf.split(X_train_RF):
          training_X , training_y = X_train_log[train], y_train_log[train]
          test_X, train_y = X_train_log[test], y_train_log[test]
          
          log = LogisticRegression(solver = algorithm, tol = toleration, C= regulation, random_state = 42,  max_iter=iter)
          log.fit(training_X, training_y)
          
          scores_log = log.predict_proba(test_X)[:,1]

          AUC_log = metrics.roc_auc_score(train_y, scores_log)
          if AUC_log > AUC_log_min:
            AUC_log_min = AUC_log
            print(f"Solver {algorithm!s}, toleration {toleration:g}, regulation {regulation:g}, itteration {iter:g} gives this AUC {AUC_log_min:g}")
          avarage_auc += AUC_log
        avarage_auc = avarage_auc/3
        if avarage_auc > avarage_auc_log_min:
            avarage_auc_log_min = avarage_auc 
            print(f"Solver {algorithm!s}, toleration {toleration:g}, regulation {regulation:g}, itteration {iter:g} gives the avarage AUC of {avarage_auc_log_min:g}")
 

Solver lbfgs, toleration 0.0001, regulation 0.1, itteration 100 gives this AUC 0.889782
Solver lbfgs, toleration 0.0001, regulation 0.1, itteration 100 gives this AUC 0.895114
Solver lbfgs, toleration 0.0001, regulation 0.1, itteration 100 gives this AUC 0.899883
Solver lbfgs, toleration 0.0001, regulation 0.1, itteration 100 gives the avarage AUC of 0.894926
Solver lbfgs, toleration 0.0001, regulation 0.1, itteration 1000 gives this AUC 0.900137
Solver lbfgs, toleration 0.0001, regulation 0.1, itteration 1000 gives the avarage AUC of 0.895085
Solver lbfgs, toleration 0.0001, regulation 10, itteration 100 gives the avarage AUC of 0.896566
Solver lbfgs, toleration 0.001, regulation 10, itteration 100 gives this AUC 0.903565
Solver lbfgs, toleration 0.001, regulation 10, itteration 1000 gives this AUC 0.904771
Solver newton-cg, toleration 0.01, regulation 0.1, itteration 100 gives this AUC 0.907501


## Gradient Boosting Classifier

[This](https://aiml.com/what-are-the-key-hyper-parameters-for-a-gbm-model/) article defines *learning_rate* and *n_estimators* as the two most important hyperparameters

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

In [20]:
AUC_gb_min = 0
X_train_GBC , y_train_GBC = np.array(X_train), np.array(y_train)
kf = KFold(n_splits=3,shuffle=True)
avarage_auc_gbc_min = 0 

for learning_rate in [0.05, 0.1, 0.2]:
    for n_estimators in [2, 8, 16]:
        avarage_auc = 0 
        for train, test in kf.split(X_train_RF):
            training_X , training_y = X_train_GBC[train], y_train_GBC[train]
            test_X, train_y = X_train_GBC[test], y_train_GBC[test]

            gb= GradientBoostingClassifier(learning_rate=learning_rate, random_state=42)
            gb.fit(training_X, training_y)
            scores_gb = gb.predict_proba(test_X)[:,1]
            AUC_gb = metrics.roc_auc_score(train_y, scores_gb)

            if AUC_gb > AUC_gb_min:
                AUC_gb_min = AUC_gb
                learning_rate_best = learning_rate
                n_estimators_best = n_estimators
                print(f"learning_rate: {learning_rate_best}, n_estimators: {n_estimators_best} gives AUC: {AUC_gb_min}")
            avarage_auc += AUC_gb
        avarage_auc = avarage_auc/3
        if avarage_auc > avarage_auc_gbc_min:
            avarage_auc_gbc_min = avarage_auc
            learning_rate_best = learning_rate
            n_estimators_best = n_estimators
            print(f"learning_rate: {learning_rate_best}, n_estimators: {n_estimators_best} gives the avarage AUC: {avarage_auc_gbc_min}")

learning_rate: 0.05, n_estimators: 2 gives AUC: 0.9373116524359618
learning_rate: 0.05, n_estimators: 2 gives AUC: 0.9411152417033657
learning_rate: 0.05, n_estimators: 2 gives the avarage AUC: 0.9385351442130544
learning_rate: 0.05, n_estimators: 16 gives AUC: 0.9490993654118269
learning_rate: 0.1, n_estimators: 2 gives AUC: 0.95126904038377
learning_rate: 0.1, n_estimators: 2 gives the avarage AUC: 0.9476585100207107
learning_rate: 0.1, n_estimators: 8 gives AUC: 0.9526132196294118
learning_rate: 0.2, n_estimators: 2 gives AUC: 0.9610316595724542
learning_rate: 0.2, n_estimators: 2 gives the avarage AUC: 0.9556588692785889
learning_rate: 0.2, n_estimators: 8 gives the avarage AUC: 0.9561420900706828
learning_rate: 0.2, n_estimators: 16 gives the avarage AUC: 0.9567193509327643


## Ada Boost Classifier

[This](https://medium.com/swlh/the-hyperparameter-cheat-sheet-770f1fed32ff) article defines *n_estimators* as the most important hyperparameter, with *learning_rate* and *base_estimator* as two other important ones. We will tweak n_estimators and learning_rate.

In [21]:
from sklearn.ensemble import AdaBoostClassifier

In [26]:
AUC_abc_min = 0
X_train_abc , y_train_abc = np.array(X_train), np.array(y_train)
kf = KFold(n_splits=3,shuffle=True)
avarage_auc_abc_min = 0 

for learning_rate in [0.05, 0.1, 0.2]:
    for n_estimators in [2, 8, 16]:
        avarage_auc = 0 
        for train, test in kf.split(X_train_RF):
            training_X , training_y = X_train_abc[train], y_train_abc[train]
            test_X, train_y = X_train_abc[test], y_train_abc[test]

            abc= AdaBoostClassifier(learning_rate=learning_rate, random_state=42)
            abc.fit(training_X, training_y)
            scores_abc = abc.predict_proba(test_X)[:,1]
            AUC_abc = metrics.roc_auc_score(train_y, scores_abc)
            
            if AUC_abc > AUC_abc_min:
                AUC_abc_min = AUC_abc
                learning_rate_best = learning_rate
                n_estimators_best = n_estimators
                print(f"learning_rate: {learning_rate_best}, n_estimators: {n_estimators_best} gives AUC: {AUC_abc_min}")
            avarage_auc = AUC_abc
        avarage_auc = avarage_auc/3
        if avarage_auc > avarage_auc_abc_min:
            avarage_auc_abc_min = avarage_auc
            learning_rate_best = learning_rate
            n_estimators_best = n_estimators
            print(f"learning_rate: {learning_rate_best}, n_estimators: {n_estimators_best} gives the avarage AUC: {avarage_auc_abc_min}")
            

learning_rate: 0.05, n_estimators: 2 gives AUC: 0.8755657312273635
learning_rate: 0.05, n_estimators: 2 gives the avarage AUC: 0.28996779789600385
learning_rate: 0.05, n_estimators: 8 gives the avarage AUC: 0.29084212930857173
learning_rate: 0.05, n_estimators: 16 gives AUC: 0.8796703306704236
learning_rate: 0.05, n_estimators: 16 gives the avarage AUC: 0.29322344355680785
learning_rate: 0.1, n_estimators: 2 gives AUC: 0.8823290022621606
learning_rate: 0.1, n_estimators: 2 gives AUC: 0.8870348429297217
learning_rate: 0.1, n_estimators: 2 gives AUC: 0.8995561003442543
learning_rate: 0.1, n_estimators: 2 gives the avarage AUC: 0.29985203344808475
learning_rate: 0.2, n_estimators: 2 gives AUC: 0.9121586937527304
learning_rate: 0.2, n_estimators: 2 gives the avarage AUC: 0.30267644939900656
learning_rate: 0.2, n_estimators: 8 gives AUC: 0.913461363458497
learning_rate: 0.2, n_estimators: 16 gives AUC: 0.9135633856522851
