In [1]:
# Template for Coursework of GINT
import pandas as pd
from matplotlib import pyplot
from sklearn import metrics 

In [2]:
# Path to the CSV file containing the samples with the processed features.
# The path is relative ("../"), so it depends on the notebook's working directory.
feature_of_counts = "../processed_data/feature_vectors_counts.csv"

In [5]:
# The benign samples have to be undersampled to obtain a better model.
# The undersampling itself is performed in the next cell, right after the dataset is loaded.


In [28]:
# Load the processed feature table, then balance the two classes by randomly
# down-sampling the benign rows to the number of malware rows.
dataset = pd.read_csv(feature_of_counts, index_col=0)

is_malware = dataset["malware"] == True
is_benign = dataset["malware"] == False
malware = dataset.loc[is_malware]
benign = dataset.loc[is_benign]

# Draw as many benign rows as there are malware rows, without replacement and
# with a fixed seed so the same subset is selected on every run.
benign_undersample = benign.sample(
    n=len(malware),
    replace=False,
    random_state=42,
)

# From here on `dataset` refers to the balanced table (malware rows first).
dataset = pd.concat([malware, benign_undersample])

# Features and labels are selected by column position.
# NOTE(review): position 9 is presumably the "malware" label column - confirm
# against the CSV header.
X = dataset.iloc[:, 1:9].values
y = dataset.iloc[:, 9].values
print(len(X))
print(len(y))

11120
11120


In [31]:
# Random hold-out split into a training set (80%) and an unseen test set (20%).
# This simple split only covers the first task of the coursework: the other
# tasks also need a validation stage in addition to the test on unseen data,
# so they require a different approach.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.astype(int),  # labels are cast to integers before splitting
    test_size=0.2,
    random_state=42,
)


[0 1 0 ... 1 1 0]
2224


In [8]:
# Feature scaling: the scaler is fitted on the training data only and the
# fitted scaler is then applied to the test data.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# The tuple is evaluated left to right, so the scaler is fitted (on X_train)
# before it is used to transform X_test.
X_train, X_test = sc.fit_transform(X_train), sc.transform(X_test)


## Random Forest Classifier

According to [this](https://blog.dataiku.com/narrowing-the-search-which-hyperparameters-really-matter) article, `min_samples_leaf` and `max_features` are the most important hyperparameters for the Random Forest Classifier, at least for the project described by its author.

In [7]:
# Fitting the model to the Training set
from sklearn.ensemble import RandomForestClassifier

In [24]:
# Grid search for the Random Forest classifier over min_samples_leaf and
# max_features. Each candidate is trained on the training set and scored by
# ROC AUC on X_test; a line is printed every time a candidate improves on the
# best AUC seen so far.
AUC_rfc_min = 0  # despite the "_min" suffix this holds the highest AUC so far

rf_candidates = [
    (leaf_size, feature_share)
    for leaf_size in [1, 20, 40]
    for feature_share in [None, "sqrt", 0.2]
]

for min_samples_leaf, max_features in rf_candidates:
    clf = RandomForestClassifier(
        random_state=42,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
    )
    clf.fit(X_train, y_train)

    # Probability of the positive class (second column of predict_proba).
    scores_clf = clf.predict_proba(X_test)[:, 1]
    AUC_rfc = metrics.roc_auc_score(y_test, scores_clf)

    # Nothing to record unless this candidate beats the current best.
    if not AUC_rfc > AUC_rfc_min:
        continue

    AUC_rfc_min = AUC_rfc
    min_samples_leaf_best = min_samples_leaf
    max_features_best = max_features
    print(f"min_samples_leaf: {min_samples_leaf_best}, max_features: {max_features_best} gives AUC: {AUC_rfc_min}")

min_samples_leaf: 1, max_features: None gives AUC: 0.978322473831702
min_samples_leaf: 1, max_features: sqrt gives AUC: 0.9812284828839274
min_samples_leaf: 1, max_features: 0.2 gives AUC: 0.9813686617325267


In [9]:
# Training the final Random Forest model on the training dataset.
# After the grid search `clf` is simply the LAST candidate that was tried, not
# the best one, so the model is rebuilt here from the best hyperparameters
# recorded by the search (min_samples_leaf_best / max_features_best).
clf = RandomForestClassifier(
    min_samples_leaf=min_samples_leaf_best,
    max_features=max_features_best,
    random_state=42,
)
# fit function is used to train the model using the training sets as parameters
clf.fit(X_train, y_train)

In [17]:
# performing probability predictions on the test dataset
# scores_clf = clf.predict_proba(X_test)[:,1]

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Best (highest) ROC AUC found so far by the Logistic Regression search loop;
# despite the "_min" suffix it tracks a maximum. It starts at 0 so that any
# positive AUC replaces it.
AUC_log_min = 0

In [14]:
# Grid search for the Logistic Regression hyperparameters: solver, stopping
# tolerance (tol), inverse regularisation strength (C) and iteration cap.
# Each candidate is trained once on the training set and scored by ROC AUC on
# X_test; a line is printed whenever a candidate beats the best AUC so far.
# NOTE(review): candidates are compared on the test set, so the reported AUC is
# optimistic - a separate validation split is needed for an unbiased estimate.
AUC_log_min = 0  # reset here so that re-running this cell repeats the full search
for algorithm in ['lbfgs', 'newton-cg']:
    for toleration in (1e-4, 1e-3, 1e-2):
        for regulation in (1e-1, 1e1):
            # Named max_iter (not iter) so the built-in iter() is not shadowed
            # for the rest of the notebook.
            for max_iter in (100, 1000):
                log = LogisticRegression(
                    solver=algorithm,
                    tol=toleration,
                    C=regulation,
                    random_state=23,
                    max_iter=max_iter,
                )
                # Fit exactly once; fitting the same estimator a second time on
                # the same data only doubled the run time.
                log.fit(X_train, y_train)
                # Probability of the positive class (second column).
                scores_log = log.predict_proba(X_test)[:, 1]
                AUC_log = metrics.roc_auc_score(y_test, scores_log)
                if AUC_log > AUC_log_min:
                    AUC_log_min = AUC_log
                    print(f"Solver {algorithm!s}, toleration {toleration:g}, regulation {regulation:g}, itteration{max_iter:g} gives this AUC {AUC_log_min:g}")

0.8823322414049278
Solver lbfgs, toleration 0.0001, regulation 0.1, itteration100 gives this AUC 0.882332
0.8823322414049278
0.8821594009392839
0.8821594009392839
0.880417654595526
0.880417654595526
0.8801347452094959
0.8801347452094959
0.8703605740454168
0.8703605740454168
0.8703888248860525
0.8703888248860525
0.882009580352171
0.882009580352171
0.8815238846094232
0.8815238846094232
0.882756240956997
Solver newton-cg, toleration 0.001, regulation 0.1, itteration100 gives this AUC 0.882756
0.882756240956997
0.882509018761912
0.882509018761912
0.885283834555495
Solver newton-cg, toleration 0.01, regulation 0.1, itteration100 gives this AUC 0.885284
0.885283834555495
0.8852659727336738
0.8852659727336738


## Gradient Boosting Classifier

[This](https://aiml.com/what-are-the-key-hyper-parameters-for-a-gbm-model/) article defines *learning_rate* and *n_estimators* as the two most important hyperparameters

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [25]:
# Best (highest) ROC AUC found so far by the Gradient Boosting search loop;
# despite the "_min" suffix it tracks a maximum. It starts at 0 so that any
# positive AUC replaces it.
AUC_gb_min = 0

In [15]:
# Grid search for the Gradient Boosting classifier over learning_rate and
# n_estimators. Each candidate is trained on the training set and scored by
# ROC AUC on X_test; a line is printed whenever a candidate beats the best AUC
# seen so far.
# NOTE(review): candidates are compared on the test set, so the reported AUC is
# optimistic - a separate validation split is needed for an unbiased estimate.
AUC_gb_min = 0  # reset here so that re-running this cell repeats the full search
for learning_rate in [0.05, 0.1, 0.2]:
    for n_estimators in [2, 8, 16]:
        # n_estimators must be passed explicitly; otherwise every iteration of
        # this inner loop trains the same model with the library default.
        gb = GradientBoostingClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=42,
        )
        gb.fit(X_train, y_train)
        # Probability of the positive class (second column).
        scores_gb = gb.predict_proba(X_test)[:, 1]
        AUC_gb = metrics.roc_auc_score(y_test, scores_gb)
        if AUC_gb > AUC_gb_min:
            AUC_gb_min = AUC_gb
            learning_rate_best = learning_rate
            n_estimators_best = n_estimators
            print(f"learning_rate: {learning_rate_best}, n_estimators: {n_estimators_best} gives AUC: {AUC_gb_min}")

## Ada Boost Classifier

[This](https://medium.com/swlh/the-hyperparameter-cheat-sheet-770f1fed32ff) article defines *n_estimators* as the most important hyperparameter, with *learning_rate* and *base_estimator* as two other important ones. We will tweak n_estimators and learning_rate.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Best (highest) ROC AUC found so far by the AdaBoost search loop; despite the
# "_min" suffix it tracks a maximum. It starts at 0 so that any positive AUC
# replaces it.
AUC_abc_min = 0

In [None]:
# Grid search for the AdaBoost classifier over learning_rate and n_estimators.
# Each candidate is trained on the training set and scored by ROC AUC on
# X_test; a line is printed whenever a candidate beats the best AUC so far.
# NOTE(review): candidates are compared on the test set, so the reported AUC is
# optimistic - a separate validation split is needed for an unbiased estimate.
# NOTE(review): learning_rate_best / n_estimators_best are the same names the
# Gradient Boosting search writes to, so this cell overwrites those values.
AUC_abc_min = 0  # reset here so that re-running this cell repeats the full search
for learning_rate in [0.05, 0.1, 0.2]:
    for n_estimators in [2, 8, 16]:
        # n_estimators must be passed explicitly; otherwise every iteration of
        # this inner loop trains the same model with the library default.
        abc = AdaBoostClassifier(
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            random_state=42,
        )
        abc.fit(X_train, y_train)
        # Probability of the positive class (second column).
        scores_abc = abc.predict_proba(X_test)[:, 1]
        AUC_abc = metrics.roc_auc_score(y_test, scores_abc)
        if AUC_abc > AUC_abc_min:
            AUC_abc_min = AUC_abc
            learning_rate_best = learning_rate
            n_estimators_best = n_estimators
            print(f"learning_rate: {learning_rate_best}, n_estimators: {n_estimators_best} gives AUC: {AUC_abc_min}")