In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from urllib.parse import urlparse
import numpy as np
from math import log
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

In [43]:
df = pd.read_csv("malicious_phish.csv")


In [44]:
#LE = LabelEncoder()
#df['label'] = LE.fit_transform(df['type'])

df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [45]:
# Create new features based on URL
df["url length"] = df["url"].str.len()

pathlength = []
entropy = []
hostlength = []
isportinstring = []
numofdigits = []
numofparams = []
numoffrags = []
isencoded = []
numencodedchar = []
numsubdir = []
numperiods = []
clientinstring = []
admininstring = []
serverinstring = []
logininstring = []
tld =[]

#Extract various features from URL
for url in df["url"].tolist():
    pathlength.append(len(urlparse(url).path))
    text = url.lower()
    probs = [text.count(c) / len(text) for c in set(text)]
    entropy.append( -sum([p * log(p) / log(2.0) for p in probs]))
    hostlength.append(len(urlparse(url).netloc))
    withport = urlparse(url).netloc.split(':')
    isportinstring.append(len(withport) > 1 and withport[-1].isdigit())
    numofdigits.append(len([i for i in url if i.isdigit()]))
    numofparams.append(0 if urlparse(url).query == '' else len(urlparse(url).query.split('&')))
    numoffrags.append(len(urlparse(url).fragment.split('#')) - 1 if urlparse(url).fragment == '' else 0)
    isencoded.append('%' in url.lower())
    numencodedchar.append(len([i for i in url if i == '%']))
    numsubdir.append(len(urlparse(url).path.split('/')))
    numperiods.append(len([i for i in url if i == '.']))
    clientinstring.append('client' in url.lower())
    admininstring.append('admin' in url.lower())
    serverinstring.append('server' in url.lower())
    logininstring.append('login' in url.lower())
    tld.append(urlparse(url).netloc.split('.')[-1].split(':')[0])

df["path length"] = pathlength
df["entropy"] = entropy
df["host length"] = hostlength
df["is port in string"] = isportinstring
df["number of digits"] = numofdigits
df["number of params"] = numofparams
df["number of fragments"] = numoffrags
df["is encoded"] = isencoded
df["num_encoded_char"] = numencodedchar
df["number of subdirectories"] = numsubdir
df["number of periods"] = numperiods
df["is client in string"] = clientinstring
df["is admin in string"] = admininstring
df["is server in string"] = serverinstring
df["is login in string"] = logininstring
df["tld"] = tld

LE = LabelEncoder()
df['tld2'] = LE.fit_transform(df['tld'])

df.head()

Unnamed: 0,url,type,url length,path length,entropy,host length,is port in string,number of digits,number of params,number of fragments,is encoded,num_encoded_char,number of subdirectories,number of periods,is client in string,is admin in string,is server in string,is login in string,tld,tld2
0,br-icloud.com.br,phishing,16,16,3.375,0,False,0,0,0,False,0,1,2,False,False,False,False,,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,35,4.079143,0,False,1,0,0,False,0,3,2,False,False,False,False,,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,31,3.708093,0,False,1,0,0,False,0,4,2,False,False,False,False,,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,88,10,4.660343,21,False,7,4,0,False,0,2,3,False,False,False,False,be,279
4,http://adventure-nicaragua.net/index.php?optio...,defacement,235,10,4.980518,23,False,22,3,0,False,0,2,2,False,False,False,False,net,412


In [46]:
#models
y = df["type"]
X = df.drop(labels=['url', 'type', 'tld'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)


# will avoid K-nearest neighbors as it is not good on large datasets

# will avoid decision tree as it is also not good on large datasets

#Naive Bayes
NBclassifier = GaussianNB()
NBmodel = NBclassifier.fit(X_train, y_train)
y_pred = NBmodel.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("F1: ",f1_score(y_test, y_pred, average='weighted'))
print("Precision: ",precision_score(y_test, y_pred, average='weighted'))
print("Recall: ",recall_score(y_test, y_pred, average='weighted'))

Accuracy:  0.773001361602801
F1:  0.7326380608416796
Precision:  0.7345327869516831
Recall:  0.773001361602801


In [47]:
#Random Forest
rf = RandomForestClassifier(n_estimators = 100)  
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("F1: ",f1_score(y_test, y_pred, average='weighted'))
print("Precision: ",precision_score(y_test, y_pred, average='weighted'))
print("Recall: ",recall_score(y_test, y_pred, average='weighted'))


Accuracy:  0.9313056030467143
F1:  0.9299888337524302
Precision:  0.9293709737390874
Recall:  0.9313056030467143


In [None]:
#XGBoost
boost = xgb.XGBClassifier()
boost.fit(X_train, y_train)
y_pred = boost.predict(X_test)
print("Accuracy: ",accuracy_score(y_test, y_pred))
print("F1: ",f1_score(y_test, y_pred, average='weighted'))
print("Precision: ",precision_score(y_test, y_pred, average='weighted'))
print("Recall: ",recall_score(y_test, y_pred, average='weighted'))
