# Phishing Detection

### Libraries Import & data import

Import the following libraries in the notebook
1. Pandas
2. Numpy (Numerical Python)
3. matplotlib
4. sklearn and pretrained models from sklearn
5. warnings (to suppress warning messages regarding deprecation)

Also import the dataset in the notebook using pandas.read_csv() function. Also, set the encoding to windows-1252

In [43]:
# import libraries

import pandas as pd # pandas library 
import numpy as np # numerical python library
import matplotlib.pyplot as plt # plotting library


import sklearn as sk
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

from datetime import datetime, timedelta, date

from __future__ import division # use / to perform floor division

# show charts and graphs inline in jupyter notebook
%matplotlib inline 

In [44]:
# here we import algorithms

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [45]:
#supress all the warnings of deprecation

import warnings
warnings.filterwarnings("ignore")

In [46]:
#read data
df = pd.read_csv('Phising_Training_Dataset.csv') 

In [47]:
df.shape

(8955, 32)

In [48]:
df.head()

Unnamed: 0,key,having_IP,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,12344,-1,1,1,1,-1,-1,-1,-1,-1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,12345,1,1,1,1,1,-1,0,1,-1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,12346,1,0,1,1,1,-1,-1,-1,-1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,12347,1,0,1,1,1,-1,-1,-1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,12348,1,0,-1,1,1,-1,1,1,-1,...,-1,1,-1,-1,0,-1,1,1,1,1


### Model Selection & Training

1 = Legitimate <br>
-1 = Phishing

In [49]:
models = {}

models['SVC'] = SVC()
models['Linear SVC'] = LinearSVC()
models['GaussianNB'] = GaussianNB()
models['GradientBoostingClassifier'] = GradientBoostingClassifier()
models['DecisionTreeClassifier'] = DecisionTreeClassifier()
models['KNeighborsClassifier'] = KNeighborsClassifier()
models['RandomForestClassifier'] = RandomForestClassifier()
models['Logistic Regression'] = LogisticRegression()
models['MLPClassifier'] = MLPClassifier()

In [50]:
# Split training data into test & validation data

x = df.drop("key",axis=1).drop('Result',axis=1)
y = df['Result']

print(x)
print(y)

      having_IP  URL_Length  Shortining_Service  having_At_Symbol  \
0            -1           1                   1                 1   
1             1           1                   1                 1   
2             1           0                   1                 1   
3             1           0                   1                 1   
4             1           0                  -1                 1   
...         ...         ...                 ...               ...   
8950          1          -1                   1                -1   
8951         -1           1                   1                -1   
8952          1          -1                   1                 1   
8953         -1          -1                   1                 1   
8954         -1          -1                   1                 1   

      double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  \
0                           -1             -1                 -1   
1                            1     

In [59]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

x_train = x_train.reset_index().drop('index',axis=1)
x_test = x_test.reset_index().drop('index',axis=1)
y_train = y_train.reset_index().drop('index',axis=1)
y_test = y_test.reset_index().drop('index',axis=1)

print(x_train)
print(x_test)
print(y_train)
print(y_test)

      having_IP  URL_Length  Shortining_Service  having_At_Symbol  \
0             1          -1                   1                 1   
1            -1           1                  -1                 1   
2             1          -1                   1                 1   
3             1          -1                   1                 1   
4             1          -1                   1                 1   
...         ...         ...                 ...               ...   
6711         -1          -1                   1                 1   
6712          1          -1                   1                 1   
6713         -1          -1                  -1                -1   
6714          1          -1                   1                 1   
6715         -1          -1                   1                 1   

      double_slash_redirecting  Prefix_Suffix  having_Sub_Domain  \
0                            1             -1                  1   
1                           -1     

In [60]:
accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(x_train, y_train)
    
    # Prediction 
    predictions = models[key].predict(x_test)

    # Calculate Accuracy, Precision and Recall Metrics
    accuracy[key] = accuracy_score(predictions, y_test)
    precision[key] = precision_score(predictions, y_test)
    recall[key] = recall_score(predictions, y_test)

In [66]:
df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model.to_csv("Classifiers Report.csv")
df_model

Unnamed: 0,Accuracy,Precision,Recall
SVC,0.937025,0.966929,0.925396
Linear SVC,0.92586,0.956693,0.91629
GaussianNB,0.590889,0.27874,1.0
GradientBoostingClassifier,0.945065,0.967717,0.937452
DecisionTreeClassifier,0.958017,0.974803,0.952308
KNeighborsClassifier,0.932113,0.954331,0.928025
RandomForestClassifier,0.969629,0.987402,0.960184
Logistic Regression,0.922733,0.949606,0.91711
MLPClassifier,0.96159,0.972441,0.960342


In [62]:
test_data = pd.read_csv('Phising_Testing_Dataset.csv')

test_data

Unnamed: 0,key,having_IP,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,...,RightClick,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report
0,21338,1,1,1,1,1,1,-1,1,-1,...,1,1,1,1,1,1,1,-1,0,1
1,21339,1,-1,1,1,1,-1,0,-1,1,...,1,1,1,-1,-1,0,-1,-1,0,1
2,21340,1,-1,1,1,1,-1,0,0,-1,...,-1,-1,-1,1,-1,1,-1,1,1,-1
3,21341,-1,-1,-1,1,-1,-1,-1,-1,1,...,1,-1,-1,1,-1,1,-1,1,1,-1
4,21342,1,-1,1,1,1,-1,1,1,-1,...,1,1,1,1,1,1,-1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2095,23433,-1,-1,-1,1,-1,-1,1,1,1,...,1,1,1,1,-1,1,-1,-1,1,1
2096,23434,1,1,1,-1,1,-1,1,1,-1,...,1,-1,1,-1,1,-1,1,1,1,1
2097,23435,1,-1,1,1,1,-1,-1,-1,1,...,1,1,1,-1,1,0,-1,1,0,1
2098,23436,1,-1,1,-1,1,-1,0,-1,-1,...,1,-1,-1,1,1,-1,-1,1,1,1


In [56]:
x_testing = test_data.drop('key', axis=1) 
accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    # Prediction 
    result = models[key].predict(x_testing)
    
    ans = test_data[['key']]
    ans["Result"] = result.tolist()
    ans.to_csv("Results/%s.csv"%(key),index=False)