In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [17]:

def read_data(file_path):
    """Load an email CSV and return TF-IDF features with binary labels.

    Parameters
    ----------
    file_path : str or path-like
        CSV file, decoded as latin-1, that contains the columns
        'Email Text' and 'Email Type'.

    Returns
    -------
    X : sparse matrix
        TF-IDF representation of the 'Email Text' column, one row per
        record that remains after rows with missing values are dropped.
    y : numpy.ndarray of int64
        1 where 'Email Type' equals 'Phishing Email', 0 for anything else.

    Notes
    -----
    The vectorizer is fitted on every row returned here. If the result is
    split afterwards for validation, the vocabulary and IDF weights have
    already seen the held-out rows; fit the vectorizer inside each training
    split when an unbiased estimate is required.
    """
    # Any row with a missing value in any column is discarded.
    data = pd.read_csv(file_path, encoding='latin-1').dropna()

    # One vectorized comparison yields the 0/1 target directly. This avoids
    # writing into the DataFrame's backing array element by element, and it
    # keeps phishing == 1 even when the file contains a single class (a
    # LabelEncoder pass would renumber a lone class to 0).
    y = data['Email Type'].eq('Phishing Email').astype('int64').to_numpy()

    X = TfidfVectorizer().fit_transform(data['Email Text'].values)
    return X, y


def print_report(y_test, predictions, fold):
    """Print the evaluation summary for a single cross-validation fold.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels of the validation rows.
    predictions : array-like
        Labels predicted for the same rows.
    fold : int
        Fold number shown in the header line.

    Writes the fold header, then the accuracy score, the confusion matrix
    and the classification report to stdout, in that order.
    """
    print(f'Fold: {fold}')

    # Each metric is computed right before it is printed, heading first.
    sections = (
        ('Accuracy Score: ', accuracy_score),
        ('Confusion Matrix: \n ', confusion_matrix),
        ('Classification Report: \n ', classification_report),
    )
    for heading, metric in sections:
        print(f'{heading}{metric(y_test, predictions)}')


In [18]:
# 5-fold cross-validation of an XGBoost classifier on the phishing-email data
X, y = read_data('phishing_email.csv')

num_folds = 5
# Shuffled splits with a fixed seed so the folds are the same on every run.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

fold = 1
for train_index, val_index in kfold.split(X):
    # Carve this fold's training and validation partitions out of X and y.
    X_train = X[train_index]
    X_val = X[val_index]
    y_train = y[train_index]
    y_val = y[val_index]

    # A new model is built for every fold so no fitted state carries over.
    model = XGBClassifier(
        n_estimators=400,
        learning_rate=0.5,
        max_depth=3,
        colsample_bytree=0.2,
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)
    fold += 1



Fold: 1
Accuracy Score: 0.9680708344513013
Confusion Matrix: 
 [[2113   23]
 [  96 1495]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.99      0.97      2136
           1       0.98      0.94      0.96      1591

    accuracy                           0.97      3727
   macro avg       0.97      0.96      0.97      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 2
Accuracy Score: 0.9702173329755835
Confusion Matrix: 
 [[2205   37]
 [  74 1411]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.98      0.98      2242
           1       0.97      0.95      0.96      1485

    accuracy                           0.97      3727
   macro avg       0.97      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 3
Accuracy Score: 0.9635095250872016
Confusion Matrix: 
 [[2169   45]
 [  91 1422]]
Classification Report: 
  

In [10]:
# k-fold experiment using AdaBoost
X, y = read_data('phishing_email.csv')

num_folds = 5
# Shuffled splits with a fixed seed so the folds are the same on every run.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  # Depth-3 tree restricted to 500 candidate features per split, used as the
  # weak learner that AdaBoost boosts.
  tree = DecisionTreeClassifier(max_depth=3, max_features=500, random_state=42)
  # The weak learner is passed positionally: its keyword was renamed from
  # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
  # removed in 1.4, whereas the first positional slot works on both.
  model = AdaBoostClassifier(tree, n_estimators=400, random_state=42, learning_rate=0.1)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_report(y_val, y_pred, fold)
  # Advance the counter; without this every report is labelled "Fold: 1".
  fold += 1




--------------------------------------------------
[-----Adaboost-----]
Fold accuracies:  [0.9608264019318487, 0.9645827743493426, 0.9584115910920311, 0.9635095250872016, 0.9648416532474503]
Average accuracy:  0.9624343891415748
Average confusion matrix: 1st row: safe, 2nd row: phishing
[[2164.   100.4]
 [  39.6 1422.8]]


In [13]:
# k-fold experiment using random forest
X, y = read_data('phishing_email.csv')

num_folds = 5
# Shuffled splits with a fixed seed so the folds are the same on every run.
# (This setup was previously written out twice in a row; once is enough.)
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold = 1

for train_index, val_index in kfold.split(X):

  X_train, X_val = X[train_index], X[val_index]
  y_train, y_val = y[train_index], y[val_index]

  # 400 trees, each split considering 1000 candidate features; a new forest
  # is built for every fold so no fitted state carries over.
  model = RandomForestClassifier(n_estimators=400, max_features=1000, random_state=42, n_jobs=-1)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_val)

  print_report(y_val, y_pred, fold)
  fold += 1



--------------------------------------------------
[-----Random Forest-----]
Fold accuracies:  [0.9632412127716662, 0.9576066541454252, 0.9597531526697075, 0.9667292728736249, 0.966183574879227]
Average accuracy:  0.9627027734679302
Average confusion matrix: 1st row: safe, 2nd row: phishing
[[2196.4   68. ]
 [  71.  1391.4]]
