**Classification into Phishing and Non-phishing sites**

The dataset has 30 features. The class of each site is given in the `Result` column as 1 (legitimate) or -1 (phishing).

Each site is uniquely identified by a key value.

In [243]:
import pandas as pd
import numpy as np

# Raw CSV locations for the training and testing splits.
traindata = 'https://raw.githubusercontent.com/kshitijabits/ML_Phishing/CSVs/Phising_Training_Dataset.csv'
testdata = 'https://raw.githubusercontent.com/kshitijabits/ML_Phishing/CSVs/Phising_Testing_Dataset.csv'

# Read both splits into DataFrames in one pass (train first, then test).
dftrain, dftest = (pd.read_csv(csv_url) for csv_url in (traindata, testdata))

In [None]:
# Summary statistics of the training frame, followed by one blank separator line
print(dftrain.describe(), end='\n\n')

In [245]:
# Multi-level categorical features, mapped to the prefix used for their dummy columns.
DUMMY_PREFIXES = {
    'URL_Length': 'URL_len',
    'having_Sub_Domain': 'Has_Subdomain',
    'SSLfinal_State': 'SSL_final',
    'URL_of_Anchor': 'URLofAnchor',
    'Links_in_tags': 'Tag_links',
    'SFH': 'SFH_column',
    'web_traffic': 'traffic',
    'Links_pointing_to_page': 'Links_pointing2page',
}


def _one_hot_encode(frame, levels_by_column, prefixes=DUMMY_PREFIXES):
    """Replace every column named in ``prefixes`` with drop-first dummy columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing all columns listed in ``prefixes``.
    levels_by_column : dict
        Maps each column name to the ordered list of levels to encode. The
        first level of each list is the reference level and gets no dummy.
    prefixes : dict
        Maps each column name to the prefix of its dummy columns.

    Returns
    -------
    pandas.DataFrame
        A new frame: the untouched columns of ``frame`` in their original
        order, followed by the dummy columns in the order of ``prefixes``.
        Values absent from ``levels_by_column`` yield all-zero dummies.
    """
    dummy_frames = []
    for column, prefix in prefixes.items():
        # Pinning the categories makes the set of dummy columns (and the dropped
        # reference level) independent of which values happen to occur in `frame`.
        pinned = frame[column].astype(
            pd.CategoricalDtype(categories=levels_by_column[column])
        )
        # dtype=int keeps the feature matrix purely numeric; newer pandas would
        # otherwise emit bool dummies next to the integer features.
        dummy_frames.append(
            pd.get_dummies(pinned, drop_first=True, prefix=prefix, dtype=int)
        )
    # Drop the source columns now that dummies exist for them.
    remaining = frame.drop(list(prefixes), axis=1)
    return pd.concat([remaining] + dummy_frames, axis=1)


# Levels are learned from the training data only and then applied to BOTH frames.
# Encoding each frame on its own can produce different dummy columns (or a
# different dropped level) whenever the test set lacks one of the values.
train_levels = {
    column: sorted(dftrain[column].dropna().unique()) for column in DUMMY_PREFIXES
}

dftrain = _one_hot_encode(dftrain, train_levels)
dftest = _one_hot_encode(dftest, train_levels)


In [246]:
# Importing necessary libraries to split the data
from sklearn.model_selection import train_test_split

# Columns that identify or label a row rather than describe it.
NON_FEATURE_COLS = ['key', 'Result']

# Placeholder label column on the test frame (presumably the test CSV ships
# without labels - TODO confirm); real predictions are produced further down.
dftest['Result'] = 0

X_train = dftrain.drop(NON_FEATURE_COLS, axis=1)
# Labels are stored as 1 / -1, but the Binomial GLM fitted below expects a
# 0/1 response, so recode: 1 -> 1, everything else (-1) -> 0.
y_train = (dftrain['Result'] == 1).astype(int)

# Same feature-only view for the test frame, so both matrices line up.
X_test = dftest.drop(NON_FEATURE_COLS, axis=1)
y_test = dftest['Result']

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Let's see the correlation matrix 
plt.figure(figsize = (20,10))        # Size of the figure
sns.heatmap(dftrain.corr(),annot = True)
plt.show()

In [None]:
import statsmodels.api as sm

# First pass: Binomial GLM on every feature plus an intercept column.
logm1 = sm.GLM(
    endog=y_train,
    exog=sm.add_constant(X_train),
    family=sm.families.Binomial(),
)
logm1.fit().summary()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Recursive feature elimination with a logistic-regression estimator,
# keeping 30 features.
rfe = RFE(estimator=LogisticRegression(), n_features_to_select=30).fit(X_train, y_train)

# One (feature name, support flag, ranking) tuple per candidate column.
[
    (feature, kept, rank)
    for feature, kept, rank in zip(X_train.columns, rfe.support_, rfe.ranking_)
]

In [None]:
# Names of the features that RFE marked as supported.
col = X_train.columns[rfe.support_]

# Refit the Binomial GLM on those features only, with an intercept column.
X_train_sm = sm.add_constant(X_train[col])
logm2 = sm.GLM(
    endog=y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm2.fit()
res.summary()

In [None]:
# In-sample predictions from the reduced model; show the first ten rows.
y_train_pred = res.predict(X_train_sm)
y_train_pred.iloc[:10]

In [None]:
# Score the test set with exactly the features the fitted model was trained on.
col1 = X_train.columns[rfe.support_]

# has_constant='add' always appends the intercept column. The default ('skip')
# leaves it out whenever some selected test column happens to be constant,
# and the design matrix would then no longer match the fitted model.
X_test_sm = sm.add_constant(X_test[col1], has_constant='add')
y_test = res.predict(X_test_sm)

# Predicted probability per test row, plus the thresholded class label:
# 1 if Result_Prob > 0.5, else -1 (the dataset's label coding).
y_test_final = pd.DataFrame({'Result_Prob': y_test})
y_test_final['predicted'] = np.where(y_test_final['Result_Prob'] > 0.5, 1, -1)

# Final output: one 'Result' column, indexed by each site's key.
finaldata = y_test_final[['predicted']].rename(columns={'predicted': 'Result'})
finaldata.index = dftest['key']

# Preview only; the full frame is available in `finaldata`.
finaldata.head(10)


In [263]:
# Write the key-indexed predictions to a CSV file in the working directory.
finaldata.to_csv(path_or_buf='Phishing_output.csv')