## MLP Classification of Certificate Features

In [12]:
# Import necessary packages

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import scale
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [13]:
# Read in the certificate features excel files with the "phish" classification
cert_phish = pd.read_csv('cert_features_phish.csv')

# Create a column for binary output where if the class is phish, it equals 1
cert_phish['class_bin'] = pd.Series([1 for x in range(len(cert_phish.index))])

In [14]:
# Read in the certificate features excel file with the "alexa" or non-phish classification
cert_alexa = pd.read_csv('cert_features_alexa.csv')

# Create a column for binary output where if the class is alexa, it equals 0
cert_alexa['class_bin'] = pd.Series([0 for x in range(len(cert_alexa.index))])

In [15]:
# Input the two previously loaded dataframes into a list
certs = [cert_phish, cert_alexa]

# Concatenate all the items of the list
cert = pd.concat(certs)

cert.head(3)

Unnamed: 0,domain,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,globalsign nv-sa,google inc,gandi,...,reviews-by.ml,sewaknepal.org,watchvslivestreamtv.club,nehanaxane.ga,alexis-collins10.tk,laborindonesia.com,egdemo.info,theflatbellydrink.com,class,class_bin
0,yusqa.com,True,365,True,False,257,False,0,0,0,...,0,0,0,0,0,0,0,0,phish,1
1,yonnaforexbureau.gm,True,90,True,False,61,False,0,0,0,...,0,0,0,0,0,0,0,0,phish,1
2,www.serveradmin.gq,True,57,True,False,6,True,0,0,0,...,0,0,0,0,0,0,0,0,phish,1


In [16]:
cert.shape

(8398, 2209)

In [17]:
X = cert.iloc[:, 1:7 ]
y = cert.iloc[:,-1]
X.head()

Unnamed: 0,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn
0,True,365,True,False,257,False
1,True,90,True,False,61,False
2,True,57,True,False,6,True
3,True,90,True,False,1,False
4,True,90,True,False,7,False


In [33]:
# Scale the X values
XS=scale(X)

# Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(XS, y, test_size=0.3, random_state=42)

# Create the MLP classifier model with particular parameters
mlp = MLPClassifier(max_iter=10000, random_state=42, hidden_layer_sizes=(256,128,128,128,256))

# Fit the mlp model to the data
mlp.fit(X_train, y_train)

# Perform a 5-fold cross validation
scores = cross_val_score(mlp, X_train, y_train, cv=5, scoring="accuracy")

# Print the mean of the validation scores
np.mean(scores)

0.8751274781898877

In [34]:
mlp.score(X_test, y_test)

0.8551587301587301

In [35]:
pred = mlp.predict(X_test)

In [36]:
print(confusion_matrix(y_test,pred))

[[2073   36]
 [ 329   82]]


In [37]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.86      0.98      0.92      2109
           1       0.69      0.20      0.31       411

    accuracy                           0.86      2520
   macro avg       0.78      0.59      0.61      2520
weighted avg       0.84      0.86      0.82      2520

