## Random Forest Classification of Certificate Features

In [123]:
# Import necessary packages

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ParameterGrid
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

In [124]:
# Read in the certificate features excel files with the "phish" classification
cert_phish = pd.read_csv('cert_features_phish.csv')

# Create a column for binary output where if the class is phish, it equals 1
cert_phish['class_bin'] = pd.Series([1 for x in range(len(cert_phish.index))])

In [125]:
# Read in the certificate features excel file with the "alexa" or non-phish classification
cert_alexa = pd.read_csv('cert_features_alexa.csv')

# Create a column for binary output where if the class is alexa, it equals 0
cert_alexa['class_bin'] = pd.Series([0 for x in range(len(cert_alexa.index))])

In [126]:
# Input the two previously loaded dataframes into a list
certs = [cert_phish, cert_alexa]

# Concatenate all the items of the list
cert = pd.concat(certs)
cert.head(3)

Unnamed: 0,domain,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn,globalsign nv-sa,google inc,gandi,...,reviews-by.ml,sewaknepal.org,watchvslivestreamtv.club,nehanaxane.ga,alexis-collins10.tk,laborindonesia.com,egdemo.info,theflatbellydrink.com,class,class_bin
0,yusqa.com,True,365,True,False,257,False,0,0,0,...,0,0,0,0,0,0,0,0,phish,1
1,yonnaforexbureau.gm,True,90,True,False,61,False,0,0,0,...,0,0,0,0,0,0,0,0,phish,1
2,www.serveradmin.gq,True,57,True,False,6,True,0,0,0,...,0,0,0,0,0,0,0,0,phish,1


In [127]:
X = cert.iloc[:, 1:7]
y = cert.iloc[:,-1]
X.head()

Unnamed: 0,has_cert,longetivity,valid_cert,extended_validation,cert_age,multi_mtn
0,True,365,True,False,257,False
1,True,90,True,False,61,False
2,True,57,True,False,6,True
3,True,90,True,False,1,False
4,True,90,True,False,7,False


In [128]:
# Scale the X values
XS=scale(X)

# Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(XS, y, test_size=0.3, random_state=42)


# Create the random forest classifier model with particular parameters
random_forest = RandomForestRegressor(n_estimators=200,
                                      max_depth=20,
                                      random_state=42)


# Fit the random forest model to the data
random_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [129]:
random_forest.score(X_test, y_test)

0.5323140019452102

In [130]:
# Make predictions with our model
pred = random_forest.predict(X_test)

In [131]:
# Example of a confusion matrix in Python
y_test = y_test
pred = np.around(pred)

results = confusion_matrix(y_test, pred)
print(results)

[[2019   90]
 [ 128  283]]


In [132]:
accuracy_score( y_test, pred)

0.9134920634920635

In [133]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2109
           1       0.76      0.69      0.72       411

    accuracy                           0.91      2520
   macro avg       0.85      0.82      0.84      2520
weighted avg       0.91      0.91      0.91      2520

