In [1]:
# importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

In [2]:
model_file = "../Dumps/model.sav"
scaler_file = "../Dumps/scaler.sav"
encoder_file = "../Dumps/encoder.sav"

# Wczytywanie danych

In [3]:
kaggle = pd.read_csv("../Data/URLs-mixed/kaggle_labeled.csv")
kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420464 non-null  object
 1   label   420464 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


In [4]:
kaggle.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


## Dane polska

In [5]:
gov = pd.read_csv("../Data/URLs-good/gov_poland.csv", skiprows=2, sep=';', usecols=[1], names=["url"])
gov.head(2)

Unnamed: 0,url
0,http://1bcz.wp.mil.pl
1,http://1bdm.wp.mil.pl/pl/28.html


In [6]:
gov["url"] = gov["url"].astype(str)
gov["label"] = "good"
gov.head(2)

Unnamed: 0,url,label
0,http://1bcz.wp.mil.pl,good
1,http://1bdm.wp.mil.pl/pl/28.html,good


## Dane cert

In [7]:
cert = pd.read_csv("https://hole.cert.pl/domains/domains.txt", names=["url"])
cert.head(2)

Unnamed: 0,url
0,008753331120.com
1,02-wiadomosci.com.pl


In [8]:
cert["url"] = cert["url"].astype(str)
cert["label"] = "bad"
cert.head(2)

Unnamed: 0,url,label
0,008753331120.com,bad
1,02-wiadomosci.com.pl,bad


## Dane alexa

In [9]:
alexa = pd.read_csv("../Data/URLs-good/alexa1m.csv", names=["url"])
alexa = alexa.iloc[0:10000,:]
alexa["url"] = alexa["url"].apply(lambda x : "https://www." + x)
alexa["label"] = "good"
alexa.head(5)

Unnamed: 0,url,label
1,https://www.google.com,good
2,https://www.youtube.com,good
3,https://www.tmall.com,good
4,https://www.baidu.com,good
5,https://www.qq.com,good


## Dane ISCX

In [10]:
iscx = pd.read_csv("../Data/URLs-good/ISCX_benign.csv", names=["url"])
iscx['label'] = "good"
iscx

Unnamed: 0,url,label
0,http://1337x.to/torrent/1048648/American-Snipe...,good
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,good
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,good
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,good
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,good
...,...,...
35373,https://lastpass.com/signup2.php?ac=1&from_uri...,good
35374,https://lastpass.com/signup2.php?ac=1&from_uri...,good
35375,https://lastpass.com/signup2.php?ac=1&from_uri...,good
35376,https://lastpass.com/signup2.php?ac=1&from_uri...,good


In [11]:
bad = pd.read_csv("../Data/unique_bad_links.csv")
bad = pd.DataFrame(bad.domain)
bad.columns = ['url']
bad['label'] = "bad"
bad

Unnamed: 0,url,label
0,inpost.pl-paydelivery.cyou,bad
1,inpost.pl-paydelivery.work,bad
2,inpost.pl-paydelivery.xyz,bad
3,www.inpost.new-pl-oferta.cyou,bad
4,inpost.new-pl-oferta.casa,bad
...,...,...
15271,https://www.vdtpt.com/login.php,bad
15272,https://login-live-comjobmen.s3.us-east-2.amaz...,bad
15273,https://pssmedicareworkshop.com/login.php?cmd=...,bad
15274,http://bsmcing.lh52.cn/,bad


## Łączenie danych

In [12]:
df = pd.concat([alexa, iscx, bad])
df.head(2)

Unnamed: 0,url,label
1,https://www.google.com,good
2,https://www.youtube.com,good


In [13]:
df["url"] = df.url.astype(str)

# Trenowanie modelu

Ciekawe: niektóre linki zaczynają się od **http** co jest bardzo podejrzane. 

In [14]:
#df = df[df["url"].str.startswith("http")].reset_index()

## Czyszczenie danych

In [15]:
from urllib3.util import parse_url
import re

def parseurl(url):
    try:
        url = url.translate({'[': None, ']': None})
        url = parse_url(url)
        return url
    except Exception as e:
        return np.nan
    
def parseurl2(url):
    try:
        url = url.translate({'[': "", ']': ""})
        h = re.search("https?:?//", url)
        if h is not None:
            url = url[:h.start()] + url[h.end():]
        url = str(parse_url(url).host)
        
        if url.count('.') == 0:
            return np.nan
        else:
            return url
    except:
        return np.nan
    

In [16]:
df['parsed_url'] = df.url.apply(lambda x : parseurl(x))

In [17]:
df.groupby("label").count()

Unnamed: 0_level_0,url,parsed_url
label,Unnamed: 1_level_1,Unnamed: 2_level_1
bad,15276,15276
good,45378,45378


In [18]:
df.dropna(inplace=True)

In [19]:
df = df.reset_index()

In [20]:
u = df.url.apply(lambda x : list(parseurl(x)))

In [21]:
parseurl("https://goog.lp")

Url(scheme='https', auth=None, host='goog.lp', port=None, path=None, query=None, fragment=None)

In [22]:
df['schema'] = u.apply(lambda x : x[0])
df['auth'] = u.apply(lambda x : x[1])
df['host'] = u.apply(lambda x : x[2])
df['port'] = u.apply(lambda x : x[3])
df['path'] = u.apply(lambda x : x[4])
df['query'] = u.apply(lambda x : x[5])
df['fragment'] = u.apply(lambda x : x[6])

In [23]:
df = df[df['host'].notna()]

In [24]:
df.head(5)

Unnamed: 0,index,url,label,parsed_url,schema,auth,host,port,path,query,fragment
0,1,https://www.google.com,good,"(https, None, www.google.com, None, None, None...",https,,www.google.com,,,,
1,2,https://www.youtube.com,good,"(https, None, www.youtube.com, None, None, Non...",https,,www.youtube.com,,,,
2,3,https://www.tmall.com,good,"(https, None, www.tmall.com, None, None, None,...",https,,www.tmall.com,,,,
3,4,https://www.baidu.com,good,"(https, None, www.baidu.com, None, None, None,...",https,,www.baidu.com,,,,
4,5,https://www.qq.com,good,"(https, None, www.qq.com, None, None, None, None)",https,,www.qq.com,,,,


In [25]:
df.groupby("label").count()

Unnamed: 0_level_0,index,url,parsed_url,schema,auth,host,port,path,query,fragment
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
bad,15276,15276,15276,2102,0,15276,1,2102,319,3
good,45378,45378,45378,45378,0,45378,0,35378,9675,0


## Obliczanie metryk

### Popularność domeny

df['domain'] = df["url"].str.split('.', n=1, expand=True)[1]
count = df.groupby(["domain"]).count().reset_index().iloc[:, [0,1]]
count.columns = ["domain", "count"]
df= df.merge(count,on=["domain"])

In [26]:
df

Unnamed: 0,index,url,label,parsed_url,schema,auth,host,port,path,query,fragment
0,1,https://www.google.com,good,"(https, None, www.google.com, None, None, None...",https,,www.google.com,,,,
1,2,https://www.youtube.com,good,"(https, None, www.youtube.com, None, None, Non...",https,,www.youtube.com,,,,
2,3,https://www.tmall.com,good,"(https, None, www.tmall.com, None, None, None,...",https,,www.tmall.com,,,,
3,4,https://www.baidu.com,good,"(https, None, www.baidu.com, None, None, None,...",https,,www.baidu.com,,,,
4,5,https://www.qq.com,good,"(https, None, www.qq.com, None, None, None, None)",https,,www.qq.com,,,,
...,...,...,...,...,...,...,...,...,...,...,...
60649,15271,https://www.vdtpt.com/login.php,bad,"(https, None, www.vdtpt.com, None, /login.php,...",https,,www.vdtpt.com,,/login.php,,
60650,15272,https://login-live-comjobmen.s3.us-east-2.amaz...,bad,"(https, None, login-live-comjobmen.s3.us-east-...",https,,login-live-comjobmen.s3.us-east-2.amazonaws.com,,/milanion/index.html,,
60651,15273,https://pssmedicareworkshop.com/login.php?cmd=...,bad,"(https, None, pssmedicareworkshop.com, None, /...",https,,pssmedicareworkshop.com,,/login.php,cmd=login_submit&id=f5462a2ff7fcc421c4026f9829...,
60652,15274,http://bsmcing.lh52.cn/,bad,"(http, None, bsmcing.lh52.cn, None, /, None, N...",http,,bsmcing.lh52.cn,,/,,


### Pozostałe

In [27]:
import re
import ipaddress

def count_special_symbols(domain):
    counter = 0
    for char in domain:
        if char.isalpha() or char.isdigit():
            continue
        else:
            counter += 1
    return counter

def count_digits(domain):
    counter = 0
    for char in domain:
        if char.isdigit():
            counter += 1
    return counter

def count_digit_letter(domain):
    """Zlicza kombinacje litera cyfra"""
    res = re.findall("[A-Za-z][0-9]", domain)
    return len(res)

def count_sus(domain):
    res = re.findall("https|http|www", domain)
    return len(res)

def has_a(domain):
    """Sprawdza czy domain ma @"""
    res = re.findall("@", domain)
    return len(res)

def has_pref_or_suff(domain):
    """Sprawdza czy domain ma -"""
    res = re.findall("-", domain)
    return len(res)

def is_ipv4(string):
    try:
        ipaddress.IPv4Network(string)
        return True
    except ValueError:
        return False

In [28]:
def calculate_metrics(df_):
    ##### Schema
    schema = df_.schema
    schema = schema.fillna("n")
    schema = schema.replace({"https": 1, "http": 0, "n": 0.5, 'none': 0.5, 'hhtp': 0, 'nttps': 1, "htpps": 1, "htps": 1, "htt": 0})
    schema = schema.astype('float64')
    
    #### Host
    host = df_.host
    host_len = host.apply(lambda x : len(x)) # Długość 
    host_subdomains_count = host.apply(lambda x : x.count('.')) # Ilość subdomen
    host_subdomains_mean_len = host.apply(lambda x : (len(x)-x.count('.'))/x.count('.')) # Średnia długość domen
    host_digit = host.apply(lambda x : count_digits(x)) # Ilość cyfr
    host_nspecial = host.apply(lambda x : count_special_symbols(x) - x.count('.')) # Ilość znaków specjalnych - kropki
    host_first_len = host.apply(lambda x : len(x.split('.')[0])) # Długość pierwszej subdomeny
    host_digit_letter_count = host.apply(lambda x : count_digit_letter(x)) # Ilość kombinacji litera-cyfra
    host_has_a = host.apply(lambda x : has_a(x)) # Sprawdza czy host ma @
    host_pref_suf_number = host.apply(lambda x : has_pref_or_suff(x)) # Sprawdza czy ma -
    host_is_ipv4 = host.apply(lambda x : is_ipv4(x)) # Sprawdza czy host to adres ipv4s
    

    return np.array([schema, host_len, host_subdomains_count, host_subdomains_mean_len, host_digit, host_nspecial, host_first_len, host_digit_letter_count, host_has_a, host_pref_suf_number, host_is_ipv4]).T

In [29]:
calculate_metrics(df)

array([[ 1., 14.,  2., ...,  0.,  0.,  0.],
       [ 1., 15.,  2., ...,  0.,  0.,  0.],
       [ 1., 13.,  2., ...,  0.,  0.,  0.],
       ...,
       [ 1., 23.,  1., ...,  0.,  0.,  0.],
       [ 0., 15.,  2., ...,  0.,  0.,  0.],
       [ 0., 16.,  3., ...,  0.,  1.,  0.]])

# Trenowanie modelu

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
sc = StandardScaler()

dane = df

In [31]:
from sklearn import preprocessing

balanced_df = pd.concat([dane[dane.label == 'bad'],dane[dane.label == 'good'].iloc[1:100000,:]])
le = preprocessing.LabelEncoder()
le.fit(balanced_df.label)
balanced_df["enc"] = le.transform(balanced_df.label)
balanced_df

Unnamed: 0,index,url,label,parsed_url,schema,auth,host,port,path,query,fragment,enc
45378,0,inpost.pl-paydelivery.cyou,bad,"(None, None, inpost.pl-paydelivery.cyou, None,...",,,inpost.pl-paydelivery.cyou,,,,,0
45379,1,inpost.pl-paydelivery.work,bad,"(None, None, inpost.pl-paydelivery.work, None,...",,,inpost.pl-paydelivery.work,,,,,0
45380,2,inpost.pl-paydelivery.xyz,bad,"(None, None, inpost.pl-paydelivery.xyz, None, ...",,,inpost.pl-paydelivery.xyz,,,,,0
45381,3,www.inpost.new-pl-oferta.cyou,bad,"(None, None, www.inpost.new-pl-oferta.cyou, No...",,,www.inpost.new-pl-oferta.cyou,,,,,0
45382,4,inpost.new-pl-oferta.casa,bad,"(None, None, inpost.new-pl-oferta.casa, None, ...",,,inpost.new-pl-oferta.casa,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45373,35373,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1
45374,35374,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1
45375,35375,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1
45376,35376,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1


In [32]:
features = calculate_metrics(balanced_df)
features

array([[ 0.5, 26. ,  2. , ...,  0. ,  1. ,  0. ],
       [ 0.5, 26. ,  2. , ...,  0. ,  1. ,  0. ],
       [ 0.5, 25. ,  2. , ...,  0. ,  1. ,  0. ],
       ...,
       [ 1. , 12. ,  1. , ...,  0. ,  0. ,  0. ],
       [ 1. , 12. ,  1. , ...,  0. ,  0. ,  0. ],
       [ 1. ,  9. ,  1. , ...,  0. ,  0. ,  0. ]])

In [33]:
trainX, testX, trainY, testY = train_test_split(features, balanced_df.enc, test_size=0.2)
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

# Wektory wspierające

In [34]:
from sklearn import svm

clf = svm.SVC(verbose=True)
clf.fit(trainX, trainY)

[LibSVM]

SVC(verbose=True)

In [35]:
clf.score(testX, testY)

0.990602588409859

In [36]:
clf.score(testX[testY == 1], testY[testY == 1]) # Dane dobre = 1

0.9987789987789988

In [37]:
clf.score(testX[testY == 0], testY[testY == 0]) # Dane złe = 1

0.9670083279948751

In [38]:
#clf.predict(sc.transform(calculate_metrics(pd.DataFrame(["shbwgen.blogspot.com	"], columns = ["url"]))))

# Drzewo losowe

In [52]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(max_depth=3, random_state=0, min_samples_split=2, min_samples_leaf=2,verbose=1)
clf2.fit(trainX, trainY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


RandomForestClassifier(max_depth=3, min_samples_leaf=2, random_state=0,
                       verbose=1)

In [53]:
clf2.score(testX, testY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.9209463358338142

In [54]:
clf2.score(testX[testY == 1], testY[testY == 1]) # Dane dobre

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.9839637248396372

In [55]:
clf2.score(testX[testY == 0], testY[testY == 0]) # Dane złe

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7364842991259307

In [39]:
def transform(url):
    d = pd.DataFrame([url], columns = ['url'])
    d['parsed_url'] = d.url.apply(lambda x : parseurl(x))
    u = d.url.apply(lambda x : list(parseurl(x)))
    d['schema'] = u.apply(lambda x : x[0])
    d['auth'] = u.apply(lambda x : x[1])
    d['host'] = u.apply(lambda x : x[2])
    d['port'] = u.apply(lambda x : x[3])
    d['path'] = u.apply(lambda x : x[4])
    d['query'] = u.apply(lambda x : x[5])
    d['fragment'] = u.apply(lambda x : x[6])
    return d

In [40]:
transform("allegro.pl")

Unnamed: 0,url,parsed_url,schema,auth,host,port,path,query,fragment
0,allegro.pl,"(None, None, allegro.pl, None, None, None, None)",,,allegro.pl,,,,


# Podsumowanie
Model w obecnym stanie jest w stanie rozpoznać proste podmiany i działa szybko, jak dla mnie nadaje się do wrzucenia.

In [41]:
clf.predict(sc.transform(calculate_metrics(transform("https://allegro.pl"))))

array([1])

In [42]:
clf.predict(sc.transform(calculate_metrics(transform("https://a11egro.pl"))))

array([0])

# Zapisanie modelu

In [45]:
import joblib

joblib.dump(clf, model_file) # SVM
joblib.dump(sc, scaler_file) # StandardScaler
joblib.dump(le, encoder_file) # LabelEncoder

['../Dumps/encoder.sav']