In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import requests
from requests import ReadTimeout
from tld import get_tld, is_tld
from urllib.parse import urlparse
import re

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

import pickle

In [4]:
urldata = pd.read_csv('urldata.csv')

In [5]:
urldata.columns

Index(['Unnamed: 0', 'url_len', '@', '?', '-', '=', '.', '#', '%', '+', '$',
       '!', '*', ',', '//', 'https', 'digits', 'letters', 'Shortining_Service',
       'having_ip_address', 'tld', 'len_subdomain', 'is_subdomain',
       'ratioUrlDomLen', 'count_spl', 'ratioSpltinurl'],
      dtype='object')

In [6]:
urldata.drop(['Unnamed: 0'], axis=1)

Unnamed: 0,url,type,url_len,domain,@,?,-,=,.,#,...,$,!,*,",",//,https,digits,letters,Shortining_Service,having_ip_address
0,br-icloud.com.br,malicious,16,br-icloud.com.br,0,0,1,0,2,0,...,0,0,0,0,0,0,0,13,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,35,mp3raid.com,0,0,0,0,2,0,...,0,0,0,0,0,0,1,29,0,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,bopsecrets.org,0,0,0,0,2,0,...,0,0,0,0,0,0,1,25,0,0
3,http://garage-pirenne.be/index.php?option=com_...,malicious,88,garage-pirenne.be,0,1,1,4,2,0,...,0,0,0,0,1,0,7,60,0,0
4,http://adventure-nicaragua.net/index.php?optio...,malicious,235,adventure-nicaragua.net,0,1,1,3,2,0,...,0,0,0,0,1,0,22,199,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090222,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,43,ecct-it.com,0,0,1,0,2,0,...,0,0,0,0,1,0,0,34,1,0
1090223,http://faboleena.com/js/infortis/jquery/plugin...,malicious,159,faboleena.com,0,0,0,1,2,0,...,0,0,0,0,1,0,21,118,0,0
1090224,http://faboleena.com/js/infortis/jquery/plugin...,malicious,147,faboleena.com,0,0,0,1,1,0,...,0,0,0,0,1,0,20,109,0,0
1090225,http://atualizapj.com/,malicious,22,atualizapj.com,0,0,0,0,1,0,...,0,0,0,0,1,0,0,17,0,0


In [29]:
def process_tld(url):
    try:
#         Extract the top level domain (TLD) from the URL given
        res = get_tld(url, as_object = True, fail_silently=False,fix_protocol=True)
        pri_domain= res.tld
    except :
        pri_domain= None
    return pri_domain

In [28]:
process_tld(urldata['url'][:1][0])

'com.br'

In [30]:
urldata['tld'] = urldata['url'].apply(lambda i: process_tld(i))

In [35]:
urldata['tld'].isnull().value_counts()

False    1073292
True       16935
Name: tld, dtype: int64

In [48]:
def process_subdomain(url):
    try:
#         Extract the top level domain (TLD) from the URL given
        res = get_tld(url, as_object = True, fail_silently=False,fix_protocol=True)
        pri_domain= res.subdomain
    except :
        pri_domain= None
    return pri_domain

In [56]:
urldata['subdomain'] = urldata['url'].apply(lambda i: process_subdomain(i))

In [53]:
urldata['len_subdomain'] = urldata['is_subdomain'].apply(lambda i: len(str(i)))

In [60]:
urldata = urldata.drop('Unnamed: 0', axis=1)

In [65]:
def is_subdomain(url):
    if url == 0:
        return 0
    else:
        return 1
        

In [66]:
urldata['is_subdomain'] = urldata['len_subdomain'].apply(lambda i: is_subdomain(i))

In [67]:
urldata['is_subdomain'].value_counts()

0    849854
1    240373
Name: is_subdomain, dtype: int64

In [68]:
urldata.to_csv('urldata.csv')

In [80]:
urldata['ratioUrlDomLen'] = urldata['url_len'] / len(urldata['domain'])

In [89]:
urldata['rationNumAlphaUrl'] = urldata['digits'] / urldata['letters']

In [108]:
urldata['count_spl'] = urldata[['@','?','-','=','.','#','%','+','$','!','*',',','//']].sum(axis=1)

Unnamed: 0,@,?,-,=,.,#,%,+,$,!,*,",",//
0,0,0,1,0,2,0,0,0,0,0,0,0,0
1,0,0,0,0,2,0,0,0,0,0,0,0,0
2,0,0,0,0,2,0,0,0,0,0,0,0,0
3,0,1,1,4,2,0,0,0,0,0,0,0,1
4,0,1,1,3,2,0,0,0,0,0,0,0,1


In [110]:
urldata['ratioSpltinurl'] = urldata['count_spl']/ urldata['url_len']

In [111]:
urldata.head()

Unnamed: 0,url,type,url_len,domain,@,?,-,=,.,#,...,Shortining_Service,having_ip_address,tld,len_subdomain,subdomain,is_subdomain,ratioUrlDomLen,rationAlphaNumUrl,count_spl,ratioSpltinurl
0,br-icloud.com.br,malicious,16,br-icloud.com.br,0,0,1,0,2,0,...,0,0,com.br,0,,0,1.5e-05,0.0,3,0.1875
1,mp3raid.com/music/krizz_kaliko.html,benign,35,mp3raid.com,0,0,0,0,2,0,...,0,0,com,0,,0,3.2e-05,0.034483,2,0.057143
2,bopsecrets.org/rexroth/cr/1.htm,benign,31,bopsecrets.org,0,0,0,0,2,0,...,0,0,org,0,,0,2.8e-05,0.04,2,0.064516
3,http://garage-pirenne.be/index.php?option=com_...,malicious,88,garage-pirenne.be,0,1,1,4,2,0,...,0,0,be,0,,0,8.1e-05,0.116667,9,0.102273
4,http://adventure-nicaragua.net/index.php?optio...,malicious,235,adventure-nicaragua.net,0,1,1,3,2,0,...,0,0,net,0,,0,0.000216,0.110553,8,0.034043


In [150]:
urldata.to_csv('urldata.csv')

In [120]:
urldata.columns

Index(['url', 'type', 'url_len', 'domain', '@', '?', '-', '=', '.', '#', '%',
       '+', '$', '!', '*', ',', '//', 'https', 'digits', 'letters',
       'Shortining_Service', 'having_ip_address', 'tld', 'len_subdomain',
       'subdomain', 'is_subdomain', 'ratioUrlDomLen', 'rationAlphaNumUrl',
       'count_spl', 'ratioSpltinurl'],
      dtype='object')

In [6]:
num_features = ['https', 'digits', 'letters','url_len','len_subdomain','is_subdomain', 'ratioUrlDomLen',  'count_spl', 
                'ratioSpltinurl','Shortining_Service', 'having_ip_address', '@', '?', '-', '=', '.', '#', '%', '+', '$', '!', '*', ',', '//']
cat_features = ['tld']

In [7]:
enc = OneHotEncoder(handle_unknown='ignore')
scaler = MinMaxScaler()


In [8]:
#Creating a pipeline transformer for encoding categorical attributes and normalising numerical attributes
pipeline = ColumnTransformer([
("num", scaler, num_features),
("cat", enc, cat_features)])

In [9]:
X = urldata

In [10]:
urldata = pd.read_csv('urldata_nlp.csv')

In [11]:
def type_encoder(x):
    if x == 'malicious':
        return 1
    elif x == 'benign':
        return 0
    else:
        return x

urldata['target'] = urldata['type'].apply(lambda x: type_encoder(x))

urldata[['type', 'target']]

Unnamed: 0,type,target
0,malicious,1
1,benign,0
2,benign,0
3,malicious,1
4,malicious,1
...,...,...
1090222,malicious,1
1090223,malicious,1
1090224,malicious,1
1090225,malicious,1


In [12]:
y = urldata.target

In [13]:
X_trans = 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_trans,y,test_size=0.2,shuffle=True)

In [168]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression (W2v)
lr_w2v=LogisticRegression(solver = 'liblinear', C=10, penalty = 'l2')
lr_w2v.fit(X_train, y_train)  #model
#Predict y value for test dataset
y_predict = lr_w2v.predict(X_test)
y_prob = lr_w2v.predict_proba(X_test)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94    154830
           1       0.93      0.75      0.83     63216

    accuracy                           0.91    218046
   macro avg       0.92      0.86      0.88    218046
weighted avg       0.91      0.91      0.91    218046

Confusion Matrix: [[151142   3688]
 [ 15879  47337]]
AUC: 0.93249290657009


In [169]:
filename = 'lr_lexical_model.sav'
pickle.dump(lr_w2v, open(filename, 'wb'))

In [172]:
nb_lexical = MultinomialNB()
nb_lexical.fit(X_train, y_train)  
#Predict y value for test dataset
y_predict = nb_lexical.predict(X_test)
y_prob = nb_lexical.predict_proba(X_test)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.80      0.96      0.88    154830
           1       0.83      0.42      0.56     63216

    accuracy                           0.81    218046
   macro avg       0.82      0.69      0.72    218046
weighted avg       0.81      0.81      0.78    218046

Confusion Matrix: [[149244   5586]
 [ 36542  26674]]
AUC: 0.8594669343094319


In [15]:
rf_lexical =  RandomForestClassifier(n_estimators=100)
rf_lexical.fit(X_train,y_train)
y_pred = rf_lexical.predict(X_test)
y_prob = rf_lexical.predict_proba(X_test)[:,1]
print(classification_report(y_test,y_pred))
print('Confusion Matrix:',confusion_matrix(y_test, y_pred))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98    154877
           1       0.97      0.93      0.95     63169

    accuracy                           0.97    218046
   macro avg       0.97      0.96      0.97    218046
weighted avg       0.97      0.97      0.97    218046

Confusion Matrix: [[152861   2016]
 [  4117  59052]]
AUC: 0.9912118365880945
