In [2]:

import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import base64
from urllib import parse

In [5]:
# # Data Load
df_normal = pd.read_csv('Dataset/Benign_list_big_final.csv')
df_deface = pd.read_csv('Dataset/DefacementSitesURLFiltered.csv',encoding='latin-1')
df_malware = pd.read_csv('Dataset/Malware_dataset.csv')
df_phishing = pd.read_csv('Dataset/phishing_dataset.csv')
df_spam = pd.read_csv('Dataset/spam_dataset.csv')

In [6]:
df_normal.rename(columns={'Begin URL':'URL'},inplace=True)
df_deface.rename(columns={'Defacement URL':'URL'},inplace=True)
df_malware.rename(columns={'Malware URL':'URL'},inplace=True)
df_phishing.rename(columns={'Phishing URL':'URL'},inplace=True)
df_spam.rename(columns={'Spam URL':'URL'},inplace=True)


In [7]:
df_spam

Unnamed: 0,URL
0,http://astore.amazon.co.uk/allezvinsfrenchr/de...
1,http://archive.salisburyjournal.co.uk/2007/3/6...
2,http://appbasic.jettons.co.uk/links/index.html
3,http://archive.yorkpress.co.uk/2003/11/6/25684...
4,http://acard4u.co.uk/product_reviews.php?cPath...
...,...
11995,http://archive.salisburyjournal.co.uk/2001/3/7/
11996,http://astore.amazon.co.uk/allezvinsfrenchr/de...
11997,http://archive.thisischeshire.co.uk/2000/1/14/...
11998,http://applerugs.co.uk/rugs/product_info.php?p...


In [12]:
# df = sorted([x for x in Path("Dataset/").glob("*.csv")])
# df

[WindowsPath('Dataset/Benign_list_big_final.csv'),
 WindowsPath('Dataset/DefacementSitesURLFiltered.csv'),
 WindowsPath('Dataset/Malware_dataset.csv'),
 WindowsPath('Dataset/phishing_dataset.csv'),
 WindowsPath('Dataset/spam_dataset.csv')]

# Normal, Attack(Malware/Phishing/Spam/Defacement) Labeling

In [8]:
# Data Label List 
normal_label_list = []
malware_label_list = []
deface_label_list = []
spam_label_list = []
phishing_label_list = []
for i in range(len(df_normal)):
    normal_label_list.append('Normal')
for j in range(len(df_malware)):
    malware_label_list.append('Malware')
for n in range(len(df_deface)):
    deface_label_list.append('Defacement')
for m in range(len(df_phishing)):
    phishing_label_list.append('Phishing')
for k in range(len(df_spam)):
    spam_label_list.append('Spam')
 

In [9]:
# Labeling
df_normal['Label'] = normal_label_list
df_deface['Label'] = deface_label_list
df_malware['Label'] = malware_label_list
df_spam['Label'] = spam_label_list
df_phishing['Label'] = phishing_label_list

In [27]:
# 5개 data concat
df_total = pd.concat([df_normal, df_deface, df_malware, df_phishing, df_spam],axis=0)
df_total

Unnamed: 0,URL,Label
0,http://1337x.to/torrent/1048648/American-Snipe...,Normal
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,Normal
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,Normal
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,Normal
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,Normal
...,...,...
11995,http://archive.salisburyjournal.co.uk/2001/3/7/,Spam
11996,http://astore.amazon.co.uk/allezvinsfrenchr/de...,Spam
11997,http://archive.thisischeshire.co.uk/2000/1/14/...,Spam
11998,http://applerugs.co.uk/rugs/product_info.php?p...,Spam


# URL Decode

In [33]:
url_decode_data =df_total['URL']
url_decode_list = list(df_total['URL'])

In [34]:
def Decode(decode_text):
    decode_text = parse.unquote(decode_text)
    return decode_text

In [35]:
decode_res = []
for j in range(len(url_decode_list)):
    decode_res.append(Decode(url_decode_list[j]))

In [36]:
Decode_res = pd.DataFrame(decode_res, columns=['URL'])

Decode_res

Unnamed: 0,URL
0,http://1337x.to/torrent/1048648/American-Snipe...
1,http://1337x.to/torrent/1110018/Blackhat-2015-...
2,http://1337x.to/torrent/1122940/Blackhat-2015-...
3,http://1337x.to/torrent/1124395/Fast-and-Furio...
4,http://1337x.to/torrent/1145504/Avengers-Age-o...
...,...
165359,http://archive.salisburyjournal.co.uk/2001/3/7/
165360,http://astore.amazon.co.uk/allezvinsfrenchr/de...
165361,http://archive.thisischeshire.co.uk/2000/1/14/...
165362,http://applerugs.co.uk/rugs/product_info.php?p...


In [59]:
# save URL total csv
# df_total.to_csv('Output/Total_URL.csv',index=False)

In [37]:
len(df_malware)

11566

In [38]:
normal_ratio = len(df_normal) / len(df_total)
malware_ratio = len(df_malware) / len(df_total)
defacement_ratio = len(df_deface) / len(df_total)
spam_ratio = len(df_spam) / len(df_total)
phishing_ratio = len(df_phishing) / len(df_total)


In [39]:
print("Normal : ",normal_ratio)
print("Malware : ",malware_ratio)
print("Defacement : ",defacement_ratio)
print("Spam : ",spam_ratio)
print("Phishing : ",phishing_ratio)

Normal :  0.2139401562613386
Malware :  0.06994267192375608
Defacement :  0.5832889867202051
Spam :  0.07256718511888924
Phishing :  0.06026099997581094


In [40]:
# df_total -> df_url
# 아래부터는 Using df_url dataset 
df_url = df_total

# URL Lexical Featuring을 하기위한 split작업

In [42]:
import re
from urllib.parse import urlparse
import numpy as np

## URL내에서 Host 부분 추출
def URL_Host(url):
    url_0 = url
    if 'http' in url_0.split('.')[0]:
        proto = url_0.split('://')[0]
        del_proto = proto +'://'
        url_0 = url_0.replace(del_proto,'')
        if '/' in url_0:
            host = url_0.split('/')[0]
        else:
            host = url_0
    else:
        if '/' in url_0:
            host = url_0.split('/')[0]
        else:
            host = url_0
    return host

In [43]:
## URL내에서 Path 부분 추출

def URL_Path(url):
    url_0 = url
    if 'http' in url_0.split('.')[0]:
        proto = url_0.split('://')[0]
        del_proto = proto +'://'
        url_0 = url_0.replace(del_proto,'')
        if '?' in url_0:
            url_1 = url_0.split('?')[0]
            host = url_1.split('/')[0]
            path = url_1.replace(host, '')
        else:
            url_1 = url_0
            if '#' in url_1:
                url_1 = url_0.split('#')[0]
                host = url_1.split('/')[0]
                path = url_1.replace(host, '')
            else:
                url_1 = url_0
                host = url_1.split('/')[0]
                path = url_1.replace(host, '')
    else:
        if '?' in url_0:
            url_1 = url_0.split('?')[0]
            host = url_1.split('/')[0]
            path = url_1.replace(host, '')
        else:
            url_1 = url_0
            if '#' in url_1:
                url_1 = url_0.split('#')[0]
                host = url_1.split('/')[0]
                path = url_1.replace(host, '')
            else:
                url_1 = url_0
                host = url_1.split('/')[0]
                path = url_1.replace(host, '')
    return path

In [44]:
## URL내에서 query 부분 추출

def URL_Query(url):
    url_0 = url
    if 'http' in url_0.split('.')[0]:
        proto = url_0.split('://')[0]
        del_proto = proto +'://'
        url_0 = url_0.replace(del_proto,'')
        if '?' in url_0:
            host = url_0.split('/')[0]
            url_1 = url_0.replace(host, '')
            if '#' in url_1:
                url_2 = url_1.split('#')[0]
                path = url_2.split('?')[0]
                url_2 = url_2.replace(path, '')
                query = url_2.split('?')[-1]
            else:
                url_2 = url_1
                path = url_2.split('?')[0]
                url_2 = url_2.replace(path, '')
                query = url_2.split('?')[-1]
        else:
            query = ''

    else:
        if '?' in url_0:
            host = url_0.split('/')[0]
            url_1 = url_0.replace(host, '')
            if '#' in url_1:
                url_2 = url_1.split('#')[0]
                path = url_2.split('?')[0]
                url_2 = url_2.replace(path, '')
                query = url_2.split('?')[-1]
            else:
                url_2 = url_1
                path = url_2.split('?')[0]
                url_2 = url_2.replace(path, '')
                query = url_2.split('?')[-1]
        else:
            query = ''
    return query

In [45]:
df_featuring = pd.DataFrame()

# Featuring
### 사용할 feature들 리스트 만든 후 각각 featuring 진행(총 10개 사용)

In [46]:

dot_list = []
slash_list = []
ampersand_list = []
dash_list = []
space_list = []
doubleslash_list = []
Hash_list = []
len_host = []
len_path = []
len_query = []



for f in df_url['URL']:
    parts = urlparse(f)
    dot_list.append(f.count('.') + f.count('%2E'))    
    slash_list.append(f.count('/') + f.count('%2F'))
    ampersand_list.append(f.count('&') + f.count('%26'))
    dash_list.append(f.count('-') + f.count('%2D'))
    space_list.append(f.count(' ') + f.count('%20'))
    doubleslash_list.append(f.count('://'))
    Hash_list.append(1 if f.count('#') else 0)
    len_host.append(len(URL_Host(f))) # 추출된 url 호스트 length return
    len_path.append(len(URL_Path(f))) #추출된 path length
    len_query.append(len(URL_Query(f))) # 추출된 query length


# -------------------------------------------

# 총 10개 Feature사용
df_featuring['num_dot'] = dot_list
df_featuring['num_slash'] = slash_list
df_featuring['num_ampersand'] = ampersand_list
df_featuring['num_dash'] = dash_list
df_featuring['num_space'] = space_list
df_featuring['num_doubleslash'] = doubleslash_list
df_featuring['HashInURL'] = Hash_list
df_featuring['len_host'] = len_host
df_featuring['len_path'] = len_path
df_featuring['len_query'] = len_query


In [93]:
# Lexical Featuring Total save csv
# df_featuring.to_csv('URL_Feautring.csv',index=False)

In [47]:
df_featuring ## featuring restul

Unnamed: 0,num_dot,num_slash,num_ampersand,num_dash,num_space,num_doubleslash,HashInURL,len_host,len_path,len_query
0,1,6,0,8,0,1,0,8,68,0
1,1,6,0,9,0,1,0,8,68,0
2,1,6,0,9,0,1,0,8,68,0
3,1,6,0,11,0,1,0,8,68,0
4,1,6,0,9,0,1,0,8,68,0
...,...,...,...,...,...,...,...,...,...,...
165359,3,6,0,0,0,1,0,30,10,0
165360,3,6,0,2,0,1,0,19,55,0
165361,4,6,0,0,0,1,0,28,22,0
165362,3,4,1,0,0,1,0,15,22,7


# Min-Max Scaling

In [48]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
scaler = Pipeline([('scaler', MinMaxScaler())])
scaled_df = scaler.fit_transform(df_featuring)



# Train/ Test Split

In [49]:
df_url['Label'] = df_url['Label'].replace({'Normal':0, 'Defacement':1,'Phishing':2,'Malware':3,'Spam':4})
target = df_url['Label'].tolist()

In [50]:
df_url['Label'].value_counts()

Label
1    96455
0    35378
4    12000
3    11566
2     9965
Name: count, dtype: int64

In [None]:
# a = df_url[df_url['Label'].isin([0])]
# a

In [57]:
X_train, X_test, y_train, y_test = train_test_split(scaled_df,target, test_size=0.3, stratify=target, random_state=21)

In [67]:
len(X_train)

115754

In [68]:
len(X_test)

49610

In [59]:
# XGBoost Model
import xgboost as xgb
import pickle

model = xgb.XGBClassifier(
    booster='gbtree',
    max_depth=3,
    objective='multi:softmax',
    num_class=4, eval_metric='merror'
)
model.fit(X_train,y_train)

In [60]:
# AI Model Prediction
ypred_xgb=model.predict(X_test)

In [61]:
ypred_xgb

array([1, 1, 1, ..., 1, 3, 1])

In [62]:

# AI Model Accuracy
print(accuracy_score(y_test, ypred_xgb))

0.9137472283813747


In [65]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report # model 평가

In [66]:
acc = accuracy_score(y_test, ypred_xgb)
print("XGBoost Accuracy: ", acc)
print("\n")
con_mat = confusion_matrix(y_test, ypred_xgb)
report = classification_report(y_test, ypred_xgb)
print(report)

XGBoost Accuracy:  0.9137472283813747


              precision    recall  f1-score   support

           0       0.92      0.97      0.94     10614
           1       0.92      0.97      0.94     28937
           2       0.87      0.53      0.66      2989
           3       0.89      0.75      0.81      3470
           4       0.91      0.79      0.85      3600

    accuracy                           0.91     49610
   macro avg       0.90      0.80      0.84     49610
weighted avg       0.91      0.91      0.91     49610

