In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
import lightgbm as lgb

In [3]:

def read_data(file_path):
    """Load the phishing-email CSV and return raw texts with binary labels.

    Rows that contain a missing value in any column are dropped before the
    text and label columns are extracted.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with an 'Email Text' and an 'Email Type' column. The file is
        decoded as latin-1.

    Returns
    -------
    X : numpy.ndarray
        Object array holding the raw email text of every remaining row.
    y : numpy.ndarray
        int64 array with 1 where 'Email Type' equals 'Phishing Email' and 0
        for every other value.
    """
    # dropna() returns a new frame, so the freshly read frame is not mutated
    # in place.
    data = pd.read_csv(file_path, encoding='latin-1').dropna()

    X = data['Email Text'].to_numpy()

    # Fixed mapping: phishing is always 1. Re-indexing the labels afterwards
    # (as a label encoder does) would turn an all-phishing file into all zeros,
    # and writing into the column's backing array element by element is avoided.
    is_phishing = data['Email Type'] == 'Phishing Email'
    y = is_phishing.to_numpy().astype(np.int64)
    return X, y


def print_report(y_val, y_pred, fold):
    """Print the evaluation summary for one cross-validation fold.

    Writes the fold number, then the accuracy score, the confusion matrix and
    the classification report computed from the true labels ``y_val`` and the
    predicted labels ``y_pred``.
    """
    print(f'Fold: {fold}')

    accuracy = accuracy_score(y_val, y_pred)
    print(f'Accuracy Score: {accuracy}')

    matrix = confusion_matrix(y_val, y_pred)
    print(f'Confusion Matrix: \n {matrix}')

    report = classification_report(y_val, y_pred)
    print(f'Classification Report: \n {report}')


In [4]:
# data set preview
# Load the raw CSV with pandas' default encoding (read_data() reads the same
# file as latin-1) and keep every row, including rows with missing values.
data = pd.read_csv('datasets/phishing_emails/phishing_email.csv')
# First five rows of the frame.
print(data.head())
# Number of rows per label in the 'Email Type' column, before any rows are dropped.
print(data['Email Type'].value_counts())


   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \r\nHello I am your hot lil horny toy.\r\n    ...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  
Safe Email        11322
Phishing Email     7328
Name: Email Type, dtype: int64


In [5]:
# X holds the raw email texts and y the binary labels (1 = 'Phishing Email'),
# with rows containing missing values already removed by read_data().
X, y = read_data('datasets/phishing_emails/phishing_email.csv')
num_folds = 5
# Shuffled (not stratified) K-fold splitter with a fixed seed; the experiment
# cells each iterate over kfold.split(X).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Sanity check: there should be exactly one label per email.
print(X.shape, y.shape)

(18634,) (18634,)


In [6]:
# k-fold experiment using XGBoost
# Every fold builds its own TF-IDF vocabulary from the training split only, so
# the validation emails never influence the features the model is trained on.
for fold, (train_index, val_index) in enumerate(kfold.split(X), start=1):
    X_train_text, X_val_text = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 3), max_features=10000)
    # fit_transform learns the vocabulary and vectorises the training text in
    # one pass instead of tokenising it twice (fit, then transform).
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)

    model = XGBClassifier(n_estimators=800, learning_rate=0.1, max_depth=4, colsample_bytree=0.2, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)



Fold: 1
Accuracy Score: 0.967802522135766
Confusion Matrix: 
 [[2112   97]
 [  23 1495]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.96      0.97      2209
           1       0.94      0.98      0.96      1518

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 2
Accuracy Score: 0.9704856452911189
Confusion Matrix: 
 [[2196   83]
 [  27 1421]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.96      0.98      2279
           1       0.94      0.98      0.96      1448

    accuracy                           0.97      3727
   macro avg       0.97      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 3
Accuracy Score: 0.964851086664878
Confusion Matrix: 
 [[2174   86]
 [  45 1422]]
Classification Report: 
    

In [8]:
# k-fold experiment using Adaboost with Decision Tree as base estimator
# Every fold builds its own TF-IDF vocabulary from the training split only, so
# the validation emails never influence the features the model is trained on.
for fold, (train_index, val_index) in enumerate(kfold.split(X), start=1):
    X_train_text, X_val_text = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 3), max_features=10000)
    # fit_transform learns the vocabulary and vectorises the training text in
    # one pass instead of tokenising it twice (fit, then transform).
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)

    tree = DecisionTreeClassifier(max_depth=3, max_features=0.2, random_state=42)
    # The weak learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, and the
    # first positional slot works with both spellings.
    model = AdaBoostClassifier(tree, n_estimators=800, random_state=42, learning_rate=0.1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)



Fold: 1
Accuracy Score: 0.9632412127716662
Confusion Matrix: 
 [[2107  102]
 [  35 1483]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.95      0.97      2209
           1       0.94      0.98      0.96      1518

    accuracy                           0.96      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.96      0.96      0.96      3727

Fold: 2
Accuracy Score: 0.9694123960289778
Confusion Matrix: 
 [[2201   78]
 [  36 1412]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      2279
           1       0.95      0.98      0.96      1448

    accuracy                           0.97      3727
   macro avg       0.97      0.97      0.97      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 3
Accuracy Score: 0.9640461497182721
Confusion Matrix: 
 [[2168   92]
 [  42 1425]]
Classification Report: 
  

In [67]:
# k-fold experiment using random forest
# Every fold builds its own TF-IDF vocabulary from the training split only, so
# the validation emails never influence the features the model is trained on.
for fold, (train_index, val_index) in enumerate(kfold.split(X), start=1):
    X_train_text, X_val_text = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 3), max_features=10000)
    # fit_transform learns the vocabulary and vectorises the training text in
    # one pass instead of tokenising it twice (fit, then transform).
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)

    model = RandomForestClassifier(n_estimators=500, max_features=0.15, n_jobs=-1, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)



Fold: 1
Accuracy Score: 0.9581432787764959
Confusion Matrix: 
 [[2100  109]
 [  47 1471]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.95      0.96      2209
           1       0.93      0.97      0.95      1518

    accuracy                           0.96      3727
   macro avg       0.95      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727

Fold: 2
Accuracy Score: 0.9557284679366783
Confusion Matrix: 
 [[2175  104]
 [  61 1387]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.95      0.96      2279
           1       0.93      0.96      0.94      1448

    accuracy                           0.96      3727
   macro avg       0.95      0.96      0.95      3727
weighted avg       0.96      0.96      0.96      3727

Fold: 3
Accuracy Score: 0.9549235309900724
Confusion Matrix: 
 [[2167   93]
 [  75 1392]]
Classification Report: 
  

In [66]:
# Gradient boosting tree
# Every fold builds its own TF-IDF vocabulary from the training split only, so
# the validation emails never influence the features the model is trained on.
for fold, (train_index, val_index) in enumerate(kfold.split(X), start=1):
    X_train_text, X_val_text = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 3), max_features=10000)
    # fit_transform learns the vocabulary and vectorises the training text in
    # one pass instead of tokenising it twice (fit, then transform).
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)

    model = GradientBoostingClassifier(n_estimators=500, learning_rate=0.5, max_depth=3, random_state=42, max_features=0.15)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    print_report(y_val, y_pred, fold)

Fold: 1
Accuracy Score: 0.9659243359270191
Confusion Matrix: 
 [[2116   93]
 [  34 1484]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      2209
           1       0.94      0.98      0.96      1518

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 2
Accuracy Score: 0.9659243359270191
Confusion Matrix: 
 [[2199   80]
 [  47 1401]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      2279
           1       0.95      0.97      0.96      1448

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.97      0.97      3727

Fold: 3
Accuracy Score: 0.9643144620338073
Confusion Matrix: 
 [[2183   77]
 [  56 1411]]
Classification Report: 
  

In [22]:
# k-fold experiment using LightGBM
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'learning_rate': 0.2,
    'max_depth': 7,
    'n_estimators': 100,
    # NOTE(review): the LightGBM parameter docs state that row subsampling is
    # only applied when a non-zero bagging frequency is also set; none is set
    # here, so confirm whether this value is meant to have an effect.
    'subsample': 0.9,
    'verbose': -1,
}

# Every fold builds its own TF-IDF vocabulary from the training split only, so
# the validation emails never influence the features the model is trained on.
for fold, (train_index, val_index) in enumerate(kfold.split(X), start=1):
    X_train_text, X_val_text = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 4), max_features=15000)
    # fit_transform learns the vocabulary and vectorises the training text in
    # one pass instead of tokenising it twice (fit, then transform).
    X_train = vectorizer.fit_transform(X_train_text)
    X_val = vectorizer.transform(X_val_text)

    train_data = lgb.Dataset(X_train, label=y_train)

    # No validation set or early stopping is handed to lgb.train, so there is
    # no best iteration to select and prediction uses every boosted tree.
    model = lgb.train(params, train_data)

    # The booster returns continuous scores; threshold them into integer class
    # labels. A score of exactly 0.5 maps to class 0, which is what rounding
    # half-to-even produced before.
    y_score = model.predict(X_val)
    y_pred = (y_score > 0.5).astype(int)

    print_report(y_val, y_pred, fold)




Fold: 1
Accuracy Score: 0.9618996511939898
Confusion Matrix: 
 [[2095  114]
 [  28 1490]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.99      0.95      0.97      2209
           1       0.93      0.98      0.95      1518

    accuracy                           0.96      3727
   macro avg       0.96      0.96      0.96      3727
weighted avg       0.96      0.96      0.96      3727





Fold: 2
Accuracy Score: 0.9653877112959485
Confusion Matrix: 
 [[2196   83]
 [  46 1402]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      2279
           1       0.94      0.97      0.96      1448

    accuracy                           0.97      3727
   macro avg       0.96      0.97      0.96      3727
weighted avg       0.97      0.97      0.97      3727



KeyboardInterrupt: 

: 