In [74]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [12]:
# Read in the data and drop empty rows (does not quite make sense to clasify these)
data = pd.read_csv(r'Phishing_Email.csv', encoding='latin-1')
data.dropna(inplace=True)

# vectorize text data and put labels in the right format
vectorizer = CountVectorizer()
y = data['Email Type'].values
for i in range(len(y)):
  if y[i] == 'Phishing Email':
    y[i] = 1
  else:
    y[i] = 0

X = data['Email Text'].values
X = vectorizer.fit_transform(X)

# Split train an test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)



In [13]:
# decision tree classifier

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
predictions = tree.predict(X_test)

print('-'*50)
print('[ Decision tree classifier ]')
print(f'Accuracy: {accuracy_score(y_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

print('Classification report:')
print(classification_report(y_test, predictions))

--------------------------------------------------
[ Decision tree classifier ]
Accuracy: 0.9018997531394226
Confusion matrix:
[[5123  513]
 [ 401 3280]]
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      5636
           1       0.86      0.89      0.88      3681

    accuracy                           0.90      9317
   macro avg       0.90      0.90      0.90      9317
weighted avg       0.90      0.90      0.90      9317



In [15]:
# random forest classifier
random_forest = RandomForestClassifier(n_estimators=200, random_state=42, max_features=1000, n_jobs=-1)
random_forest.fit(X_train, y_train)
predictions = random_forest.predict(X_test)

print('-'*50)
print('[Random Forest]')
print(f'Accuracy: {accuracy_score(y_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y_test, predictions))


--------------------------------------------------
[Random Forest]
Accuracy: 0.9505205538263389
Confusion matrix:
[[5439  197]
 [ 264 3417]]
Classification report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      5636
           1       0.95      0.93      0.94      3681

    accuracy                           0.95      9317
   macro avg       0.95      0.95      0.95      9317
weighted avg       0.95      0.95      0.95      9317



In [27]:
tree = DecisionTreeClassifier(max_depth=5, random_state=42, min_samples_leaf=5, max_features=1000)
ada_boost = AdaBoostClassifier(tree, n_estimators=200, random_state=42)
ada_boost.fit(X_train, y_train)
predictions = ada_boost.predict(X_test)

print('-'*50)
print('[Adaboost with Decision Tree]')
print(f'Accuracy: {accuracy_score(y_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y_test, predictions))

--------------------------------------------------
[Adaboost with Decision Tree]
Accuracy: 0.9422560910164216
Confusion matrix:
[[5326  310]
 [ 228 3453]]
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5636
           1       0.92      0.94      0.93      3681

    accuracy                           0.94      9317
   macro avg       0.94      0.94      0.94      9317
weighted avg       0.94      0.94      0.94      9317



In [37]:
xgb = XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1, n_jobs=-1, colsample_bytree=0.2, nthread = -1,  random_state=42)
xgb.fit(X_train, y_train)

print('-'*50)
print('[XGBoost]')
print(f'Accuracy: {accuracy_score(y_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y_test, predictions))

--------------------------------------------------
[XGBoost]
Accuracy: 0.9422560910164216
Confusion matrix:
[[5326  310]
 [ 228 3453]]
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5636
           1       0.92      0.94      0.93      3681

    accuracy                           0.94      9317
   macro avg       0.94      0.94      0.94      9317
weighted avg       0.94      0.94      0.94      9317



In [59]:
data = pd.read_csv(r'datasets/phishing_emails/Phishing_Email.csv', encoding='latin-1')
data.dropna(inplace=True)
label_encoder = LabelEncoder()
print(data.head())
X2 = data['Email Text'].values
y2 = data['Email Type'].values

for i in range(len(y2)):
  if y2[i] == 'Phishing Email':
    y2[i] = 1
  else:
    y2[i] = 0

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# vectorize text data using tf-idf
T_vectorizer = TfidfVectorizer()
X2_train = T_vectorizer.fit_transform(X2_train)
X2_test = T_vectorizer.transform(X2_test)
y2_train = label_encoder.fit_transform(y2_train)
y2_test = label_encoder.fit_transform(y2_test)


   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


In [43]:
tree2 = DecisionTreeClassifier()
tree2.fit(X2_train, y2_train)
predictions = tree2.predict(X2_test)

print('-'*50)
print('[Decision Tree]')
print(f'Accuracy: {accuracy_score(y2_test, predictions)}')
print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y2_test, predictions))


--------------------------------------------------
[Decision Tree]
Accuracy: 0.87828418230563
Confusion matrix:
[[7970 1071]
 [ 745 5134]]
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      9041
           1       0.83      0.87      0.85      5879

    accuracy                           0.88     14920
   macro avg       0.87      0.88      0.87     14920
weighted avg       0.88      0.88      0.88     14920



In [44]:
# random forest classifier
random_forest2 = RandomForestClassifier(n_estimators=200, random_state=42, max_features=1000, n_jobs=-1)
random_forest2.fit(X2_train, y2_train)
predictions = random_forest2.predict(X2_test)

print('-'*50)
print('[Random Forest]')
print(f'Accuracy {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)

print('Classification report:')
print(classification_report(y2_test, predictions))

--------------------------------------------------
[Random Forest]
Accuracy 0.9476541554959785
Confusion matrix:
[[8664  377]
 [ 404 5475]]
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      9041
           1       0.94      0.93      0.93      5879

    accuracy                           0.95     14920
   macro avg       0.95      0.94      0.95     14920
weighted avg       0.95      0.95      0.95     14920



In [71]:
tree2 = DecisionTreeClassifier(max_depth=3, random_state=42, min_samples_leaf=5, max_features=1000)
ada_boost2 = AdaBoostClassifier(tree2, n_estimators=400, random_state=42)
ada_boost2.fit(X2_train, y2_train)
predictions = ada_boost2.predict(X2_test)

print('[AdaBoost]')
print(f'Accuracy: {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y2_test, predictions))

[AdaBoost]
Accuracy: 0.9568364611260054
Confusion matrix:
[[2184   89]
 [  72 1385]]
Classification report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2273
           1       0.94      0.95      0.95      1457

    accuracy                           0.96      3730
   macro avg       0.95      0.96      0.95      3730
weighted avg       0.96      0.96      0.96      3730



In [70]:
tree2 = DecisionTreeClassifier(max_features=1000, max_depth=10)
bagging = BaggingClassifier(tree2, n_estimators=400, random_state=42)
bagging.fit(X2_train, y2_train)
predictions = bagging.predict(X2_test)

print('[Bagging]')
print(f'Accuracy: {accuracy_score(y2_test, predictions)}')
print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y2_test, predictions))

[Bagging]
Accuracy: 0.8477211796246649
Confusion matrix:
[[2269    4]
 [ 564  893]]
Classification report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89      2273
           1       1.00      0.61      0.76      1457

    accuracy                           0.85      3730
   macro avg       0.90      0.81      0.82      3730
weighted avg       0.88      0.85      0.84      3730



In [57]:
gbrt = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
gbrt.fit(X2_train, y2_train)
predictions = gbrt.predict(X2_test)

print('-'*50)
print('[Gradient Boosting]')
print(f'Accuracy: {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y2_test, predictions))

--------------------------------------------------
[Gradient Boosting]
Accuracy: 0.9217158176943699
Confusion matrix:
[[8369  672]
 [ 496 5383]]
Classification report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      9041
           1       0.89      0.92      0.90      5879

    accuracy                           0.92     14920
   macro avg       0.92      0.92      0.92     14920
weighted avg       0.92      0.92      0.92     14920



In [66]:
xgb = XGBClassifier(n_estimators=400, learning_rate=0.5, random_state=42, n_jobs=-1, max_depth=2, colsample_bytree=0.2)
xgb.fit(X2_train, y2_train)
predictions = xgb.predict(X2_test)

print('-'*50)
print('[XGBoost]')
print(f'Accuracy: {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y2_test, predictions))

--------------------------------------------------
[XGBoost]
Accuracy: 0.9646112600536193
Confusion matrix:
[[2186   87]
 [  45 1412]]
Classification report:
              precision    recall  f1-score   support

           0       0.98      0.96      0.97      2273
           1       0.94      0.97      0.96      1457

    accuracy                           0.96      3730
   macro avg       0.96      0.97      0.96      3730
weighted avg       0.97      0.96      0.96      3730



In [75]:
# Final k-fold experiment using XGBoost
data = pd.read_csv(r'datasets/phishing_emails/Phishing_Email.csv', encoding='latin-1')
data.dropna(inplace=True)

label_encoder = LabelEncoder()
T_vectorizer = TfidfVectorizer()

X3 = data['Email Text'].values
y3 = data['Email Type'].values

for i in range(len(y3)):
  if y3[i] == 'Phishing Email':
    y3[i] = 1
  else:
    y3[i] = 0

X3 = T_vectorizer.fit_transform(X3)
y3 = label_encoder.fit_transform(y3)

num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=42)

fold_acc = []

for train_index, val_index in kfold.split(X3):

  X3_train, X3_val = X3[train_index], X3[val_index]
  y3_train, y3_val = y3[train_index], y3[val_index]

  model = XGBClassifier(n_estimators=400, learning_rate=0.5, random_state=42, n_jobs=-1, max_depth=2, colsample_bytree=0.2, nthread=-1)
  model.fit(X3_train, y3_train)

  y3_pred = model.predict(X3_val)
  acc = accuracy_score(y3_val, y3_pred)
  fold_acc.append(acc)

print(fold_acc)

print('Average accuracy: ', sum(fold_acc)/len(fold_acc))






[0.9694206008583691, 0.9624463519313304, 0.9694206008583691, 0.971030042918455, 0.9651100375738056, 0.964573268921095, 0.9688674181427804, 0.9688674181427804, 0.9753086419753086, 0.9667203435319377]
Average accuracy:  0.9681764724854232
