## Phishing Email Classification

In [8]:
#import libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [9]:
#import data and print to check form
df = pd.read_csv('Phishing_Email.csv', usecols=['Email Text', 'Email Type'], nrows=10000)
print(df.shape)
df.sample(5)

(10000, 2)


Unnamed: 0,Email Text,Email Type
1217,"\n >> If and when we package this, perhaps ...",Safe Email
3700,fw : ferc meeting summary the commission held ...,Safe Email
6315,Matthew Cline writes:> There must be *some* w...,Safe Email
8327,thank you for the memories dear all : the last...,Safe Email
6141,URL: http://www.livejournal.com/talkread.bml?j...,Safe Email


In [10]:
#one hto encode email type
one_hot = pd.get_dummies(df['Email Type'], drop_first=True)
df = df.drop('Email Type', axis=1)
df = df.join(one_hot)

In [11]:
df.sample(10)

Unnamed: 0,Email Text,Safe Email
9781,super - discounts on víagra and cialis up to 8...,False
7546,"more on "" mazel tov "" as far as i know , hebre...",True
3478,8 - bit characters dear fellow linguists : in ...,True
9840,awali is there anyone who can tell me which is...,True
8462,saturday on saturday i came to work and did so...,True
8483,language technology in multimedia information ...,True
1462,fw : press release - enron announces progress ...,True
4152,re : sorry i forgot the attachment . aimee lan...,True
7835,http://spineless.org/~mod/pix/octoberMoon.jpg\...,True
7280,woww . . 8 o - % off paliourg buy med ' s on -...,False


#### Code is all processed and formatted, now we pass through models for best classification model

(**Disclaimer:** *Switching data types for each 12000 rows of data takes a while, making this cell below run slower than usual (1-2 min). The memory allocated for the array would be exceeded if we used the full dataset given. We still get good accuracies with all the models, therefore making our scaling justified*.)

In [12]:
X = df['Email Text'].values.astype('U')
y = df['Safe Email']

print(type(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

<class 'numpy.ndarray'>


In [21]:
#create pipelines
pipeMNB = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

pipeCNB = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', ComplementNB())
])

pipeSVC = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

pipeDTC = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier())
])

pipeKNN = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', KNeighborsClassifier())
])

pipeMLP = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MLPClassifier())
])

#### Time to fit and train models and print accuracy scores

In [14]:
#MNB PipeLine
pipeMNB.fit(X_train, y_train)
#preds
MNBpredict = pipeMNB.predict(X_test)

#score
MNBscore = accuracy_score(y_test, MNBpredict)
print(f"MNB Accuracy: {MNBscore}")

MNB Accuracy: 0.8835


In [15]:
#CNB Pipeline
pipeCNB.fit(X_train, y_train)
#preds
CNBpredict = pipeCNB.predict(X_test)

CNBscore = accuracy_score(y_test, CNBpredict)
print(f"CNB Accuracy: {CNBscore}")

CNB Accuracy: 0.9165


In [16]:
#LinearSVC Pipeline
pipeSVC.fit(X_train, y_train)
#preds
SVCpredict = pipeSVC.predict(X_test)

SVCscore = accuracy_score(y_test, SVCpredict)
print(f"SVC Accuracy: {SVCscore}")

SVC Accuracy: 0.9785


In [17]:
#Decision Tree Pipeline
pipeDTC.fit(X_train, y_train)
#preds
DTCpredict = pipeDTC.predict(X_test)

DTCscore = accuracy_score(y_test, DTCpredict)
print(f"Decision Tree Classifier Accuracy: {DTCscore}")

Decision Tree Classifier Accuracy: 0.9085


In [20]:
#KNearest Neighbors Pipeline
pipeKNN.fit(X_train, y_train)
#preds
KNNpredict = pipeKNN.predict(X_test)

KNNscore = accuracy_score(y_test, KNNpredict)
print(f"K Nearest Neighbors Accuracy: {KNNscore}")


K Nearest Neighbors Accuracy: 0.927


In [22]:
#MLP Neural Network Pipeline
pipeMLP.fit(X_test, y_test)
#preds
MLPpredict = pipeMLP.predict(X_test)

MLPscore = accuracy_score(y_test, MLPpredict)
print(f"MLP Accuracy: {MLPscore}")



MLP Accuracy: 0.9895
