# IMPORTING DATA

In [1]:
import pandas as pd
import numpy as np
#Show up to 500 characters per cell so the email text is readable in previews
pd.set_option('display.max_colwidth', 500)

#Load the email dataset and preview its first five rows
df = pd.read_csv("email.csv")
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'email.csv'

In [None]:
#Report how many values are missing in each column *before* removing anything
print(df.isna().sum())

#Drop every row that has at least one missing value (new frame, no in-place mutation)
df = df.dropna()
df.info()

# DATA CLEANING & PROCESSING

In [None]:
import re

#Normalise the text column in one pass:
#  1. cast every value to str
#  2. convert to lower case
#  3. remove every literal "=" character
df['Text'] = (
    df['Text']
    .astype(str)
    .str.lower()
    .str.replace("=", '', regex=False)
)
df.head(5)

In [None]:
#Extract every URL from the text: "http" followed by any run of non-whitespace.
#Raw strings are used so that "\S" reaches the regex engine instead of being
#treated as an (invalid) string escape sequence.
df['Url'] = df['Text'].str.findall(r"http\S+")
#Url_Count: number of URLs found in each email
df['Url_Count'] = df['Url'].apply(len)

#Extract every email address from the text
df['Email'] = df['Text'].str.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+")
#Email_Count: number of email addresses found in each email
df['Email_Count'] = df['Email'].apply(len)

df.head(8)


In [None]:
#Plotting graphs to analyse the data
import matplotlib.pyplot as plt
import seaborn as sns

#Number of phishing emails vs number of non-phishing emails
print(df["Class"].value_counts())
#Pass the column by keyword: a positional Series is interpreted as `data`
#(wide-form) by current seaborn releases and would not count per class.
sns.countplot(x="Class", data=df)
plt.title('Number of Emails by Class')
plt.xlabel('Class (0 = Non-phishing, 1 = Phishing)')
plt.ylabel('Number of Emails')

#Analysing count of Url in text: one group of bars per class, one bar per Url_Count value
pd.crosstab(df['Class'],df['Url_Count']).plot(kind='bar')
plt.title('Count of Url in Text')
plt.xlabel('Class (0 = Non-phishing, 1 = Phishing)')
plt.ylabel('Frequency of URLs in Text')
plt.show()




In [None]:
#Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
#histograms
#histogram of the number of URLs per email, coloured by class
sns.histplot(df, x="Url_Count", binwidth=1, hue="Class")
plt.title('Distribution of Url Count')
#The x axis is Url_Count (the class is encoded by colour), the y axis counts emails
plt.xlabel('Number of URLs per Email')
plt.ylabel('Number of Emails')
plt.show()

# URL COLUMNS AND CLASS

In [None]:
#Keep only the extracted URL lists and the class label
url = df[['Url','Class']]
#Preview the first rows instead of rendering the whole frame
url.head()

In [None]:
#One row per extracted URL, keeping the original email index and its Class.
#Built from df (not from `url`) so the cell can be re-run safely.
#Emails without any URL keep a single row whose url is the empty string.
url = (
    df[['Class', 'Url']]
    .explode('Url')
    .rename(columns={'Url': 'url'})
)
url['url'] = url['url'].fillna("")
url.head()

In [None]:
#URLs found in phishing emails (Class == 1); rows holding the "" placeholder are skipped.
#A boolean mask is used instead of an in-place drop on a slice.
phishing_url = url.loc[(url['Class'] == 1) & (url['url'] != ""), 'url']
#One URL per line, no header and no index column
phishing_url.to_csv("phishing_url.csv", encoding='utf-8', header=False, index=False)

In [None]:
#URLs found in non-phishing emails (Class == 0); rows holding the "" placeholder are skipped.
#A boolean mask is used instead of an in-place drop on a slice.
normal_url = url.loc[(url['Class'] == 0) & (url['url'] != ""), 'url']
#One URL per line, no header and no index column
normal_url.to_csv("normal_url.csv", encoding='utf-8', header=False, index=False)

# EXTRACTING FEATURES

###### 
Information entropy measures how much randomness is present in a string; a high degree of randomness in a URL is often an indicator of a malicious site.

A fragment is an internal page reference, sometimes called a named anchor. It usually appears at the end of a URL and begins with a hash (#) character followed by an identifier. It refers to a section within a web page.

![image.png](attachment:image.png)

In [None]:
import whois
from datetime import datetime, timezone
import math
import pandas as pd
import numpy as np
from pyquery import PyQuery
from requests import get

class UrlFeaturizer(object):
    """Compute string, WHOIS and page-content features for one URL.

    Constructing an instance performs best-effort network I/O: a WHOIS query
    for the URL's host and an HTTP GET of the URL itself. When either lookup
    fails the corresponding attribute is set to ``None`` and every feature
    that depends on it falls back to 0.

    Attributes:
        url: the URL exactly as passed in.
        domain: text between the first ``//`` (if any) and the first
            ``/``, ``?`` or ``#`` that follows it.
        today: naive ``datetime`` captured at construction time.
        whois: ``__dict__`` of the WHOIS query result, or ``None``.
        response: object returned by ``get(url)``, or ``None``.
        pq: ``PyQuery`` document built from the response text, or ``None``.
    """

    def __init__(self, url, timeout=10):
        """Parse the host out of `url` and run the WHOIS / HTTP lookups.

        Args:
            url: URL string to featurize.
            timeout: seconds to wait for the HTTP GET before giving up, so a
                single unresponsive host cannot stall a whole batch.
        """
        self.url = url
        # Split on the FIRST '//' only, so a URL embedded in the query string
        # (e.g. a redirect target) is not mistaken for the host.
        host = url.split('//', 1)[-1]
        for sep in ('/', '?', '#'):
            host = host.split(sep, 1)[0]
        self.domain = host
        self.today = datetime.now().replace(tzinfo=None)

        # Best effort: any failure (lookup error, no record, ...) means "no WHOIS data".
        try:
            self.whois = whois.query(self.domain).__dict__
        except Exception:
            self.whois = None

        # Best effort: any failure fetching or parsing the page means "no page data".
        try:
            self.response = get(self.url, timeout=timeout)
            self.pq = PyQuery(self.response.text)
        except Exception:
            self.response = None
            self.pq = None

    def _host(self):
        """Return `self.domain` without a leading ``user@`` part or a trailing ``:port``."""
        return self.domain.strip().rsplit('@', 1)[-1].split(':', 1)[0]

    ## URL string Features
    def entropy(self):
        """Return the Shannon entropy (in bits per character) of the stripped URL.

        An empty URL has entropy 0. The result is non-negative:
        ``-sum(p * log2(p))`` over the distinct characters.
        """
        string = self.url.strip()
        if not string:
            return 0
        length = len(string)
        # dict.fromkeys keeps first-seen order, so the float sum is deterministic
        probs = [string.count(c) / length for c in dict.fromkeys(string)]
        return -sum(p * math.log2(p) for p in probs)

    def ip(self):
        """Return 1 when the host is a dotted-quad IPv4 address, else 0."""
        octets = self._host().split('.')
        if len(octets) != 4:
            return 0
        is_ipv4 = all(o.isdecimal() and int(o) <= 255 for o in octets)
        return 1 if is_ipv4 else 0

    def numDigits(self):
        """Return how many characters of the URL are digits."""
        return sum(1 for ch in self.url if ch.isdigit())

    def urlLength(self):
        """Return the number of characters in the URL."""
        return len(self.url)

    def numParameters(self):
        """Return the number of '&' separators in the URL."""
        return self.url.count('&')

    def numFragments(self):
        """Return the number of '#' characters in the URL."""
        return self.url.count('#')

    def numSubDomains(self):
        """Return the number of '/' characters after the scheme part of the URL.

        NOTE(review): despite its name this counts '/'-separated segments,
        not DNS labels; it is not used by run(). Confirm the intended
        definition before relying on it.
        """
        subdomains = self.url.split('http')[-1].split('//')[-1].split('/')
        return len(subdomains)-1

    def domainExtension(self):
        """Return the text after the last '.' of the host ('' if the host has no '.')."""
        host = self._host()
        if '.' not in host:
            return ''
        return host.rsplit('.', 1)[-1]

    ## URL domain features
    def hasHttp(self):
        """Return True when the URL contains 'http:'."""
        return 'http:' in self.url

    def hasHttps(self):
        """Return True when the URL contains 'https:'."""
        return 'https:' in self.url

    def daysSinceRegistration(self):
        """Return whole days between the WHOIS creation date and `today` (0 if unknown)."""
        created = self.whois.get('creation_date') if self.whois else None
        if not created:
            return 0
        return (self.today - created.replace(tzinfo=None)).days

    def daysSinceExpiration(self):
        """Return whole days from `today` to the WHOIS expiration date (0 if unknown).

        The value is expiration minus today, i.e. positive while the
        expiration date lies in the future.
        """
        expires = self.whois.get('expiration_date') if self.whois else None
        if not expires:
            return 0
        return (expires.replace(tzinfo=None) - self.today).days

    ## URL Page Features
    def bodyLength(self):
        """Return the length of the page's <html> text, or 0 if the URL is not live."""
        if self.pq is not None and self.urlIsLive():
            return len(self.pq('html').text())
        return 0

    def numTitles(self):
        """Return the number of heading elements (h1..h6) in the page."""
        if self.pq is None:
            return 0
        headings = ['h{}'.format(level) for level in range(1, 7)]
        return sum(len(list(self.pq(tag).items())) for tag in headings)

    def numImages(self):
        """Return the number of <img> elements in the page."""
        if self.pq is None:
            return 0
        return len(list(self.pq('img').items()))

    def numLinks(self):
        """Return the number of <a> elements in the page."""
        if self.pq is None:
            return 0
        return len(list(self.pq('a').items()))

    def scriptLength(self):
        """Return the length of the text of all <script> elements."""
        if self.pq is None:
            return 0
        return len(self.pq('script').text())

    def specialCharacters(self):
        """Return how many characters of the <html> text are neither digits nor letters."""
        if self.pq is None:
            return 0
        bodyText = self.pq('html').text()
        return sum(1 for ch in bodyText if not ch.isdigit() and not ch.isalpha())

    def scriptToSpecialCharsRatio(self):
        """Return scriptLength / specialCharacters, or 0 when the divisor is 0."""
        v = self.specialCharacters()
        if self.pq is not None and v != 0:
            return self.scriptLength()/v
        return 0

    def scriptTobodyRatio(self):
        """Return scriptLength / bodyLength, or 0 when the divisor is 0."""
        v = self.bodyLength()
        if self.pq is not None and v != 0:
            return self.scriptLength()/v
        return 0

    def bodyToSpecialCharRatio(self):
        """Return specialCharacters / bodyLength, or 0 when the divisor is 0."""
        v = self.bodyLength()
        if self.pq is not None and v != 0:
            return self.specialCharacters()/v
        return 0

    def urlIsLive(self):
        """Return True when the HTTP GET succeeded with status code 200."""
        return self.response is not None and self.response.status_code == 200

    def run(self):
        """Return a dict mapping each of the 22 feature names to its value."""
        data = {}
        # 22 features
        data['entropy'] = self.entropy()
        data['numDigits'] = self.numDigits()
        data['urlLength'] = self.urlLength()
        data['numParams'] = self.numParameters()
        data['hasHttp'] = self.hasHttp()
        data['hasHttps'] = self.hasHttps()
        data['urlIsLive'] = self.urlIsLive()
        data['bodyLength'] = self.bodyLength()
        data['numTitles'] = self.numTitles()
        data['numImages'] = self.numImages()
        data['numLinks'] = self.numLinks()
        data['scriptLength'] = self.scriptLength()
        data['specialChars'] = self.specialCharacters()
        data['ext'] = self.domainExtension()
        data['dsr'] = self.daysSinceRegistration()
        data['dse'] = self.daysSinceExpiration()
        data['sscr'] = self.scriptToSpecialCharsRatio()
        data['sbr'] = self.scriptTobodyRatio()
        data['bscr'] = self.bodyToSpecialCharRatio()
        data['num_%20'] = self.url.count("%20")
        data['num_@'] = self.url.count("@")
        data['has_ip'] = self.ip()

        return data

In [None]:
import os
#CSV files written by the URL export cells; the file stem doubles as the class label
l = ['normal_url.csv','phishing_url.csv']

#Column layout of the feature table: the feature names produced by run() plus the label.
#NOTE: UrlFeaturizer("") still attempts its WHOIS/HTTP lookups; both fail and are ignored.
emp = list(UrlFeaturizer("").run().keys())

t=[]
for j in l:
    print(j)
    d=pd.read_csv(j,header=None).to_numpy().flatten()
    for i in d:
        if not isinstance(i, str):
            # e.g. NaN read from an empty field: nothing to featurize
            continue
        try:
            temp=UrlFeaturizer(i).run()
        except Exception as err:
            # Best effort: report the URL that failed and carry on with the rest
            print("skipping %r: %s" % (i, err))
            continue
        temp["Class"]=j.split(".")[0]
        t.append(temp)

#Build the frame once from the list of records (DataFrame.append no longer exists in pandas 2)
A = pd.DataFrame(t, columns=emp + ["Class"])
#Write next to the parent directory without changing the working directory,
#so the relative paths used by other cells keep working on re-run
A.to_csv(os.path.join(os.pardir, "features.csv"))

In [None]:
#Preview only the first rows of the feature table
A.head()

In [None]:
#Distinct values present in the Class column
{label for label in A['Class']}

In [None]:
#Encode the label: phishing_url -> 1, normal_url -> 0.
#Only the Class column is touched (not every cell of the frame), and the result is
#cast to int so the label has a numeric dtype; re-running the cell is a no-op.
A['Class'] = A['Class'].replace({"phishing_url": 1, "normal_url": 0}).astype(int)

In [None]:
#Convert the boolean features returned by UrlFeaturizer.run() to 1/0.
#The columns are named explicitly instead of replacing True/False across the whole frame.
bool_features = ['hasHttp', 'hasHttps', 'urlIsLive']
A[bool_features] = A[bool_features].astype(int)


In [None]:
#Preview only the first rows of the encoded feature table
A.head()

# MODELING & ANALYSIS

In [None]:
#Target vector: the class label of every URL
y = A['Class']
#Preview only the first rows instead of rendering the whole frame
A.head()

In [None]:
#Split dataset to test and train data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

#Feature matrix: everything except the label and the textual extension.
#A itself is left untouched so this cell can be re-run.
X = A.drop(columns=["Class", "ext"])

#Split first, then fit the scaler on the training rows only, so no statistic
#of the test rows leaks into the transformation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

scaler = MinMaxScaler(feature_range=(0, 1))
#Keep the column names (and row index) so the feature-importance plots are labelled by name
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

In [None]:
# Plotting feature importance graph
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance : array-like
        One importance score per feature.
    names : array-like
        Feature names, in the same order as `importance`. They are converted
        to strings so that numeric names are still treated as categories.
    model_type : str
        Model label; the chart title is "<model_type> FEATURE IMPORTANCE".

    Returns
    -------
    None
        The chart is drawn on a new matplotlib figure.
    """
    #Pair every feature name with its score and sort by decreasing importance
    fi_df = pd.DataFrame({
        'feature_names': np.asarray(names).astype(str),
        'feature_importance': np.asarray(importance),
    }).sort_values(by='feature_importance', ascending=False)

    #Define size of bar plot
    plt.figure(figsize=(10,8))
    #Plot Seaborn bar chart; the orientation is stated explicitly
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'], orient='h')
    #Add chart labels (note the space between the model name and the suffix)
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
#Model 1 - Gaussian Naive Bayes (supports continuous data)
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

#Fit on the training split, then predict the held-out split
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

#Overall accuracy followed by the per-class precision/recall/F1 report
print('accuracy: %s' % accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

In [None]:
# feature importance
from sklearn.inspection import permutation_importance

#Permutation importance of every feature for the Naive Bayes model, measured on
#the test split; random_state makes the shuffles reproducible
imps = permutation_importance(nb, X_test, y_test, random_state=42)

print(imps.importances_mean)

#Reuse the shared plotting helper instead of repeating its body here
plot_feature_importance(imps.importances_mean, X_train.columns, 'NAIVE BAYES')

In [None]:
#Model 2 - Logistic regression
from sklearn.linear_model import LogisticRegression
from matplotlib.pyplot import figure


#NOTE(review): `multi_class` is deprecated in recent scikit-learn releases and the
#target here is binary -- confirm whether the argument can simply be dropped.
log_reg = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

#Arguments in (y_true, y_pred) order, like every other model cell
print('accuracy: %s' % accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

In [None]:
# logistic regression for feature importance
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot

# importance = first row of the fitted coefficient matrix
importance = log_reg.coef_[0]

# one line per feature: positional index and score
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

print(importance)

# bar chart of the scores in feature order
pyplot.bar(range(len(importance)), importance)
pyplot.show()

# sorted, labelled chart
plot_feature_importance(importance,X_train.columns,'LOGISTIC REGRESSION')

In [None]:
#Model 3 - SVM
from sklearn import svm

#Linear-kernel support vector classifier, fitted on the training split
clf = svm.SVC(kernel='linear').fit(X_train, y_train)
y_pred = clf.predict(X_test)

#Overall accuracy followed by the per-class report
print('accuracy: %s' % accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))


In [None]:
#Model 4 - Decision Tree
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

#random_state fixes the tree's internal randomness so the scores and the
#feature importances are the same on every run
dt = DecisionTreeClassifier(random_state=42)
dt = dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print('accuracy: %s' % accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

In [None]:
# decision tree for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot

# importance scores exposed by the fitted tree
importance = dt.feature_importances_

# one line per feature: positional index and score
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# bar chart of the scores in feature order
pyplot.bar(range(len(importance)), importance)
pyplot.show()

# sorted, labelled chart
plot_feature_importance(importance,X_train.columns,'DECISION TREE')

In [None]:
#Model 5 - Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

#Forest with trees up to 100 levels deep and a fixed seed, fitted on the training split
rf = RandomForestClassifier(max_depth=100, random_state=0).fit(X_train, y_train)
y_pred = rf.predict(X_test)

#Overall accuracy, per-class report, then the full set of hyper-parameters in use
print('accuracy: %s' % accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))
print(rf.get_params())


In [None]:
# random forest for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

# importance scores exposed by the fitted forest
importance = rf.feature_importances_

# one line per feature: positional index and score
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# bar chart of the scores in feature order
pyplot.bar(range(len(importance)), importance)
pyplot.show()

# sorted, labelled chart
plot_feature_importance(importance,X_train.columns,'RANDOM FOREST')

In [None]:
#Model 6 - Random Forest with parameter tuning (randomized search)
from sklearn.model_selection import RandomizedSearchCV
import time

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 300, num = 10)]
# Number of features to consider at every split.
# 'auto' is no longer accepted by scikit-learn (for classifiers it was the same
# as 'sqrt'), so the two options are now genuinely different.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# Minimum number of samples required to split a node
min_samples_split = [10,50, 100, 300,500]
# Minimum number of samples required at each leaf node
min_samples_leaf = [10,50, 100,300,500]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# 50 sampled parameter combinations, each scored with 3-fold cross-validation
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Wall-clock duration of the search, in seconds
start = time.time()
rf_random.fit(X_train, y_train)
end = time.time()
print(end - start)

In [None]:
#Model 6 - Continue
#Best parameter combination found by the search
print(rf_random.best_params_)

#Score the estimator refitted with those parameters on the held-out split
best_rf = rf_random.best_estimator_
y_pred = best_rf.predict(X_test)

print('accuracy: %s' % accuracy_score(y_test, y_pred))
print('\n')
print(classification_report(y_test, y_pred))

In [None]:
# random forest for feature importance on a classification problem
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

# importance scores of the best estimator found by the randomized search
importance = rf_random.best_estimator_.feature_importances_

# one line per feature: positional index and score
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# bar chart of the scores in feature order
pyplot.bar(range(len(importance)), importance)
pyplot.show()

# sorted, labelled chart
plot_feature_importance(importance,X_train.columns,'RANDOM FOREST W PARAMETER TUNING')

In [None]:
#Model 7 - XGBoost
from xgboost import XGBClassifier

#Gradient-boosted classifier with its default parameters
xg = XGBClassifier()
xg.fit(X_train, y_train)

#Predict the held-out split, then report accuracy and the per-class metrics
y_pred = xg.predict(X_test)
accuracy_line = 'accuracy: %s' % accuracy_score(y_test, y_pred)
print(accuracy_line)
print('\n')
print(classification_report(y_test, y_pred))


In [None]:
# xgboost for feature importance on a classification problem
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from matplotlib import pyplot

# importance scores exposed by the fitted booster
importance = xg.feature_importances_

# one line per feature: positional index and score
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))

# bar chart of the scores in feature order
pyplot.bar(range(len(importance)), importance)
pyplot.show()

# sorted, labelled chart
plot_feature_importance(importance,X_train.columns,'XGBOOST')