In [4]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [5]:

def read_data(file_path, encoding='latin-1'):
    """Load the e-mail CSV and return TF-IDF features plus binary labels.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains the columns 'Email Text' and 'Email Type'.
    encoding : str, default 'latin-1'
        Text encoding forwarded to ``pd.read_csv``. The default keeps the
        value this function has always used.

    Returns
    -------
    X
        The result of ``TfidfVectorizer().fit_transform`` on the
        'Email Text' column, one row per retained e-mail.
    y : numpy.ndarray of int
        1 where 'Email Type' equals 'Phishing Email', 0 for any other value.

    Notes
    -----
    The vectorizer is fitted on every retained row. Callers that split X
    into train/validation folds afterwards therefore share vocabulary and
    IDF statistics between the folds.
    """
    data = pd.read_csv(file_path, encoding=encoding)
    # Rows with a missing value in any column are discarded; reassigning
    # instead of inplace=True keeps the freshly read frame unmodified.
    data = data.dropna()

    X = TfidfVectorizer().fit_transform(data['Email Text'].values)

    # Build the 0/1 target directly. Re-encoding the already-numeric labels
    # with LabelEncoder would renumber the classes by sorted order, so a file
    # containing only phishing rows would come back labelled 0 instead of 1.
    y = (data['Email Type'] == 'Phishing Email').values.astype(int)
    return X, y


def print_report(y_val, y_pred, fold):
    """Print the evaluation summary for one cross-validation fold.

    Parameters
    ----------
    y_val : array-like
        Reference labels of the validation split.
    y_pred : array-like
        Labels predicted for the same samples.
    fold : int
        Fold number shown in the header line.
    """
    print(f'Fold: {fold}')

    # Each entry pairs the printed heading with the metric that produces
    # the text following it; metrics are evaluated one at a time, in order.
    sections = (
        ('Accuracy Score: ', accuracy_score),
        ('Confusion Matrix: \n ', confusion_matrix),
        ('Classification Report: \n ', classification_report),
    )
    for heading, metric in sections:
        print(f'{heading}{metric(y_val, y_pred)}')


In [11]:
# Dataset preview: first rows, then the number of e-mails per class.
preview_path = 'datasets/phishing_emails/phishing_email.csv'
data = pd.read_csv(preview_path)

first_rows = data.head()
print(first_rows)

class_counts = data['Email Type'].value_counts()
print(class_counts)

   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  
Safe Email        11322
Phishing Email     7328
Name: Email Type, dtype: int64


In [12]:
# K-fold cross-validation with an XGBoost classifier.
X, y = read_data('datasets/phishing_emails/phishing_email.csv')

num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Hyperparameters applied to the model built for every fold.
xgb_params = dict(
    n_estimators=400,
    learning_rate=0.5,
    max_depth=3,
    colsample_bytree=0.2,
    n_jobs=-1,
    random_state=42,
)

fold = 1
for train_index, val_index in kfold.split(X):
    # Rows selected for training and for validation in this fold.
    X_train = X[train_index]
    X_val = X[val_index]
    y_train = y[train_index]
    y_val = y[val_index]

    # A new model per fold, so nothing learned carries over between folds.
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)
    fold = fold + 1



FileNotFoundError: [Errno 2] No such file or directory: 'phishing_email.csv'

In [24]:
# K-fold cross-validation with AdaBoost over shallow decision trees.
X, y = read_data('datasets/phishing_emails/phishing_email.csv')

num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold = 1

for train_index, val_index in kfold.split(X):

    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Weak learner: a depth-3 tree that considers 500 features per split.
    tree = DecisionTreeClassifier(max_depth=3, max_features=500, random_state=42)
    # The weak learner is passed positionally on purpose: the keyword was
    # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
    # old spelling was later removed, while the first positional slot has
    # stayed the weak learner throughout.
    model = AdaBoostClassifier(tree, n_estimators=400, random_state=42, learning_rate=0.1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)
    fold += 1




Fold: 1
Accuracy Score: 0.9608264019318487
Confusion Matrix: 
 [[2102  107]
 [  39 1479]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.95      0.97      2209
           1       0.93      0.97      0.95      1518

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727

Fold: 2
Accuracy Score: 0.9645827743493426
Confusion Matrix: 
 [[2190   89]
 [  43 1405]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      2279
           1       0.94      0.97      0.96      1448

    accuracy                           0.96      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.96      0.96      3727

Fold: 3
Accuracy Score: 0.9584115910920311
Confusion Matrix: 
 [[2159  101]
 [  54 1413]]
Classification Report: 
  

In [25]:
# K-fold cross-validation with a random forest classifier.
X, y = read_data('datasets/phishing_emails/phishing_email.csv')

num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Hyperparameters applied to the forest built for every fold.
forest_params = dict(
    n_estimators=400,
    max_features=1000,
    random_state=42,
    n_jobs=-1,
)

fold = 1
for train_index, val_index in kfold.split(X):
    # Rows selected for training and for validation in this fold.
    X_train = X[train_index]
    X_val = X[val_index]
    y_train = y[train_index]
    y_val = y[val_index]

    # A new forest per fold, so nothing learned carries over between folds.
    model = RandomForestClassifier(**forest_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)
    fold = fold + 1



Fold: 1
Accuracy Score: 0.9632412127716662
Confusion Matrix: 
 [[2138   71]
 [  66 1452]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.97      0.97      2209
           1       0.95      0.96      0.95      1518

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727

Fold: 2
Accuracy Score: 0.9576066541454252
Confusion Matrix: 
 [[2209   70]
 [  88 1360]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.97      0.97      2279
           1       0.95      0.94      0.95      1448

    accuracy                           0.96      3727
   macro avg       0.96      0.95      0.96      3727
weighted avg       0.96      0.96      0.96      3727

Fold: 3
Accuracy Score: 0.9597531526697075
Confusion Matrix: 
 [[2187   73]
 [  77 1390]]
Classification Report: 
  