In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier

In [None]:
# read in data
data = pd.read_csv(r'Phishing_Email.csv', encoding='latin-1')
print(data.head())

# replace NaN values
data.dropna(inplace=True)


# One-hot encode the labels
#ohe = OneHotEncoder(sparse_output=False)
#y = ohe.fit_transform(data[['Email Type']])

# vectorize text data and put labels in the right format
vectorizer = CountVectorizer()
y = data['Email Type'].values
for i in range(len(y)):
  if y[i] == 'Phishing Email':
    y[i] = 1
  else:
    y[i] = 0

X = data['Email Text'].values
X = vectorizer.fit_transform(X)

# Split train an test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)



   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  


In [None]:
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)

predictions = tree.predict(X_test)
print(f'The accuracy on a single decision tree is {accuracy_score(y_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

The accuracy on a single decision tree is 0.9160857908847185
Confusion matrix:
[[2087  186]
 [ 127 1330]]


In [None]:
# random forest classifier
random_forest = RandomForestClassifier(n_estimators=100, random_state=42, max_features=2000, n_jobs=-1)
random_forest.fit(X_train, y_train)

predictions = random_forest.predict(X_test)
print(f'The accuracy on a random forest is {accuracy_score(y_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)
print('Classification report:')
print(classification_report(y_test, predictions))


The accuracy on a random forest is 0.9624664879356568
Confusion matrix:
[[1120   33]
 [  37  675]]
Classification report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1153
           1       0.95      0.95      0.95       712

    accuracy                           0.96      1865
   macro avg       0.96      0.96      0.96      1865
weighted avg       0.96      0.96      0.96      1865



In [None]:
ada_boost = AdaBoostClassifier(tree, n_estimators=20, random_state=42)
ada_boost.fit(X_train, y_train)

In [None]:
predictions = ada_boost.predict(X_test)
print(f'accuracy: {accuracy_score(y_test, predictions)}')

accuracy: 0.9560321715817695


In [3]:
data = pd.read_csv(r'datasets/phishing_emails/Phishing_Email.csv', encoding='latin-1')
data.fillna(' ', inplace=True)
label_encoder = LabelEncoder()
print(data.head())
X2 = data['Email Text'].values
y2 = data['Email Type'].values

for i in range(len(y2)):
  if y2[i] == 'Phishing Email':
    y2[i] = 1
  else:
    y2[i] = 0

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)


T_vectorizer = TfidfVectorizer()
X2_train = T_vectorizer.fit_transform(X2_train)
X2_test = T_vectorizer.transform(X2_test)
y2_train = label_encoder.fit_transform(y2_train)
y2_test = label_encoder.fit_transform(y2_test)

print(y2_train.shape)
print(y2_test.shape)




   Unnamed: 0                                         Email Text  \
0           0  re : 6 . 1100 , disc : uniformitarianism , re ...   
1           1  the other side of * galicismos * * galicismo *...   
2           2  re : equistar deal tickets are you still avail...   
3           3  \nHello I am your hot lil horny toy.\n    I am...   
4           4  software at incredibly low prices ( 86 % lower...   

       Email Type  
0      Safe Email  
1      Safe Email  
2      Safe Email  
3  Phishing Email  
4  Phishing Email  
(14920,)
(3730,)


In [None]:
tree2 = DecisionTreeClassifier()
tree2.fit(X2_train, y2_train)

predictions = tree2.predict(X2_test)
print(f'The accuracy on a single decision tree is {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)


The accuracy on a single decision tree is 0.9179624664879357
Confusion matrix:
[[1071   82]
 [  71  641]]


In [None]:
# random forest classifier
random_forest2 = RandomForestClassifier(n_estimators=100, random_state=42, max_features=1000)
random_forest2.fit(X2_train, y2_train)

predictions = random_forest2.predict(X2_test)
print(f'The accuracy on a random forest is {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)

print('Classification report:')
print(classification_report(y2_test, predictions))

The accuracy on a random forest is 0.9597855227882037
Confusion matrix:
[[1121   32]
 [  43  669]]
Classification report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      1153
           1       0.95      0.94      0.95       712

    accuracy                           0.96      1865
   macro avg       0.96      0.96      0.96      1865
weighted avg       0.96      0.96      0.96      1865



In [None]:
tree2 = DecisionTreeClassifier(max_features=500, max_depth=3)
ada_boost2 = AdaBoostClassifier(tree2, n_estimators=200, random_state=42)
ada_boost2.fit(X2_train, y2_train)

predictions = ada_boost2.predict(X2_test)
print(f'The accuracy of Adaboost is: {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)

The accuracy of Adaboost is: 0.9439678284182306
Confusion matrix:
[[2154  119]
 [  90 1367]]


In [None]:
tree2 = DecisionTreeClassifier(max_features=500, max_depth=20)
bagging = BaggingClassifier(tree2, n_estimators=100, random_state=42)
bagging.fit(X2_train, y2_train)

predictions = bagging.predict(X2_test)
print(f'The accuracy of Bagging is: {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)

The accuracy of Bagging is: 0.8616621983914209
Confusion matrix:
[[2265    8]
 [ 508  949]]


In [None]:
gbrt = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=42)
gbrt.fit(X2_train, y2_train)

predictions = gbrt.predict(X2_test)
print(f'The accuracy of Gradient Boosting is: {accuracy_score(y2_test, predictions)}')

print('Confusion matrix:')
cm = confusion_matrix(y2_test, predictions)
print(cm)

The accuracy of Gradient Boosting is: 0.9394101876675603
Confusion matrix:
[[2145  128]
 [  98 1359]]
