In [138]:
# importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

# Wczytywanie danych

In [139]:
kaggle = pd.read_csv("../Data/URLs-mixed/kaggle_labeled.csv")
kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420460 entries, 0 to 420459
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420460 non-null  object
 1   label   420460 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


In [140]:
kaggle.head()

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad
2,iamagameaddict.com,bad
3,kalantzis.net,bad
4,slightlyoffcenter.net,bad


## Dane polska

In [141]:
gov = pd.read_csv("../Data/URLs-good/gov_poland.csv", skiprows=2, sep=';', usecols=[1], names=["url"])
gov.head(2)

Unnamed: 0,url
0,http://1bcz.wp.mil.pl
1,http://1bdm.wp.mil.pl/pl/28.html


In [142]:
gov["url"] = gov["url"].astype(str)
gov["label"] = "good"
gov.head(2)

Unnamed: 0,url,label
0,http://1bcz.wp.mil.pl,good
1,http://1bdm.wp.mil.pl/pl/28.html,good


## Dane cert

In [143]:
cert = pd.read_csv("https://hole.cert.pl/domains/domains.txt", names=["url"])
cert.head(2)

Unnamed: 0,url
0,008753331120.com
1,02-wiadomosci.com.pl


In [144]:
cert["url"] = cert["url"].astype(str)
cert["label"] = "bad"
cert.head(2)

Unnamed: 0,url,label
0,008753331120.com,bad
1,02-wiadomosci.com.pl,bad


## Dane alexa

In [145]:
alexa = pd.read_csv("../Data/URLs-good/alexa1m.csv", names=["url"])
alexa = alexa.iloc[0:10000,:]
alexa["url"] = alexa["url"].apply(lambda x : "https://www." + x)
alexa["label"] = "good"
alexa.head(5)

Unnamed: 0,url,label
1,https://www.google.com,good
2,https://www.youtube.com,good
3,https://www.tmall.com,good
4,https://www.baidu.com,good
5,https://www.qq.com,good


## Dane ISCX

In [146]:
iscx = pd.read_csv("../Data/URLs-good/ISCX_benign.csv", names=["url"])
iscx['label'] = "good"
iscx

Unnamed: 0,url,label
0,http://1337x.to/torrent/1048648/American-Snipe...,good
1,http://1337x.to/torrent/1110018/Blackhat-2015-...,good
2,http://1337x.to/torrent/1122940/Blackhat-2015-...,good
3,http://1337x.to/torrent/1124395/Fast-and-Furio...,good
4,http://1337x.to/torrent/1145504/Avengers-Age-o...,good
...,...,...
35373,https://lastpass.com/signup2.php?ac=1&from_uri...,good
35374,https://lastpass.com/signup2.php?ac=1&from_uri...,good
35375,https://lastpass.com/signup2.php?ac=1&from_uri...,good
35376,https://lastpass.com/signup2.php?ac=1&from_uri...,good


## Łączenie danych

In [147]:
df = pd.concat([kaggle,gov,cert, alexa, iscx])
df.head(2)

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad


In [148]:
df["url"] = df.url.astype(str)

# Trenowanie modelu

Ciekawe: niektóre linki zaczynają się od **http** co jest bardzo podejrzane. 

In [149]:
df = df[df["url"].str.startswith("http")].reset_index()

## Czyszczenie danych

In [150]:
from urllib3.util import parse_url
import re

def parseurl(url):
    try:
        url = url.translate({'[': None, ']': None})
        url = parse_url(url)
        return url
    except Exception as e:
        return np.nan
    
def parseurl2(url):
    try:
        url = url.translate({'[': "", ']': ""})
        h = re.search("https?:?//", url)
        if h is not None:
            url = url[:h.start()] + url[h.end():]
        url = str(parse_url(url).host)
        
        if url.count('.') == 0:
            return np.nan
        else:
            return url
    except:
        return np.nan
    

In [151]:
df['parsed_url'] = df.url.apply(lambda x : parseurl(x))

In [152]:
df.dropna(inplace=True)

In [153]:
df = df.reset_index()

In [154]:
u = df.url.apply(lambda x : list(parseurl(x)))

In [155]:
parseurl("https://goog.lp")

Url(scheme='https', auth=None, host='goog.lp', port=None, path=None, query=None, fragment=None)

In [156]:
df['schema'] = u.apply(lambda x : x[0])
df['auth'] = u.apply(lambda x : x[1])
df['host'] = u.apply(lambda x : x[2])
df['port'] = u.apply(lambda x : x[3])
df['path'] = u.apply(lambda x : x[4])
df['query'] = u.apply(lambda x : x[5])
df['fragment'] = u.apply(lambda x : x[6])

In [158]:
df.head(5)

Unnamed: 0,level_0,index,url,label,parsed_url,schema,auth,host,port,path,query,fragment
0,0,2628,http://www.oilchangeasheville.com/https:/www2....,bad,"(http, None, www.oilchangeasheville.com, None,...",http,,www.oilchangeasheville.com,,/https:/www2.Santander.com.br/TopoEscolhaAcess...,,
1,1,2783,https-paypal.verifications-updates.com/,bad,"(None, None, https-paypal.verifications-update...",,,https-paypal.verifications-updates.com,,/,,
2,2,3511,http://leticiaaraujo.com.br/VAN-GOG.969/ATUALI...,bad,"(http, None, leticiaaraujo.com.br, None, /VAN-...",http,,leticiaaraujo.com.br,,/VAN-GOG.969/ATUALIZACAO.CLIENTE.SANTANDER.ATI...,"%20id=03,04,03,AM,287,10,10,000000,14,3,2016,F...",
3,3,3524,http://leticiaaraujo.com.br/VAN-GOG.969/ATUALI...,bad,"(http, None, leticiaaraujo.com.br, None, /VAN-...",http,,leticiaaraujo.com.br,,/VAN-GOG.969/ATUALIZACAO.CLIENTE.SANTANDER.ATI...,"+id=03,04,03,AM,287,10,10,000000,14,3,2016,Fri...",
4,4,4280,http://www.ime.edu.co/firmasime/Santander/pess...,bad,"(http, None, www.ime.edu.co, None, /firmasime/...",http,,www.ime.edu.co,,/firmasime/Santander/pessoa-fisica/index1.php,"%20id=20,37,43,PM,283,10,10,000000,10,8,2016,M...",


In [159]:
df.groupby("label").count()

Unnamed: 0_level_0,level_0,index,url,parsed_url,schema,auth,host,port,path,query,fragment
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
bad,124,124,124,124,110,0,121,0,121,52,2
good,60381,60381,60381,60381,60375,0,60371,1,43960,10309,16


## Obliczanie metryk

### Popularność domeny

In [120]:
df['domain'] = df["url"].str.split('.', n=1, expand=True)[1]
count = df.groupby(["domain"]).count().reset_index().iloc[:, [0,1]]
count.columns = ["domain", "count"]
df= df.merge(count,on=["domain"])

In [121]:
df

Unnamed: 0,url,label,parsed_url,domain,count
0,diaryofagameaddict.com,bad,diaryofagameaddict.com,com,1613
1,iamagameaddict.com,bad,iamagameaddict.com,com,1613
2,toddscarwash.com,bad,toddscarwash.com,com,1613
3,tubemoviez.com,bad,tubemoviez.com,com,1613
4,sn-gzzx.com,bad,sn-gzzx.com,com,1613
...,...,...,...,...,...
506330,zaktualizacja-platnosci.netaidver.site,bad,zaktualizacja-platnosci.netaidver.site,netaidver.site,1
506331,zaktualizuj-konto.zaktualizuj4562.co.pl,bad,zaktualizuj-konto.zaktualizuj4562.co.pl,zaktualizuj4562.co.pl,1
506332,zaktualizuj.neftlix.tajbirds.com,bad,zaktualizuj.neftlix.tajbirds.com,neftlix.tajbirds.com,1
506333,zap588693-1.plesk10.zap-webspace.com,bad,zap588693-1.plesk10.zap-webspace.com,plesk10.zap-webspace.com,1


### Pozostałe

In [15]:
def count_special_symbols(domain):
    counter = 0
    for char in domain:
        if char.isalpha() or char.isdigit():
            continue
        else:
            counter += 1
    return counter


def count_digits(domain):
    counter = 0
    for char in domain:
        if char.isdigit():
            counter += 1
    return counter


In [122]:
def calculate_metrics(df_):
    
    series = df_["url"]
    # Długość
    length = series.apply(lambda x : len(x))
    
    # Ilość subdomen
    nsubdomains = series.apply(lambda x : x.count('.'))
    
    # Średnia długość subdomen
    meanlensubdomains = series.apply(lambda x : (len(x)-x.count('.'))/x.count('.'))
    
    # Ilość cyfr
    ndigits = series.apply(lambda x : count_digits(x))
    
    # Ilość znaków specjalnych - kropki
    nspecial = series.apply(lambda x : count_special_symbols(x) - x.count('.'))
    
    # Długość pierwszej subdomeny
    lastlen = series.apply(lambda x : len(x.split('.')[0]))
    
    # Popularność domeny

    df_['domain'] = df_["url"].str.split('.', n=1, expand=True)[1]
    df_ = df_.merge(count,on=["domain"])
    
    #df_["top"] = df_["url"].apply(lambda x : x.split('.')[-1])
    #popularity =  df_.apply(lambda x : len( df_[(df_["top"] == x.top) ]) , axis=1)
    
    # Polularność słów występujących
    
    
    return np.array([length, nsubdomains, meanlensubdomains, ndigits, nspecial, lastlen]).T

# Trenowanie modelu

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
sc = StandardScaler()

dane = pd.DataFrame(df.iloc[:,[1,2]])
dane.columns = ["label", "url"]

In [146]:
from sklearn import preprocessing

balanced_df = pd.concat([dane[dane.label == 'bad'],dane[dane.label == 'good'].iloc[1:100000,:]])
le = preprocessing.LabelEncoder()
le.fit(balanced_df.label)
balanced_df["enc"] = le.transform(balanced_df.label)
balanced_df

Unnamed: 0,label,url,enc
0,bad,diaryofagameaddict.com,0
1,bad,iamagameaddict.com,0
2,bad,toddscarwash.com,0
3,bad,tubemoviez.com,0
4,bad,sn-gzzx.com,0
...,...,...,...
148244,good,freepages.genealogy.rootsweb.ancestry.com,1
148245,good,freepages.genealogy.rootsweb.ancestry.com,1
148246,good,freepages.genealogy.rootsweb.ancestry.com,1
148247,good,freepages.genealogy.rootsweb.ancestry.com,1


In [125]:
features = calculate_metrics(balanced_df)
features

array([[22.  ,  1.  , 21.  ,  0.  ,  0.  , 18.  ],
       [18.  ,  1.  , 17.  ,  0.  ,  0.  , 14.  ],
       [16.  ,  1.  , 15.  ,  0.  ,  0.  , 12.  ],
       ...,
       [41.  ,  4.  ,  9.25,  0.  ,  0.  ,  9.  ],
       [41.  ,  4.  ,  9.25,  0.  ,  0.  ,  9.  ],
       [41.  ,  4.  ,  9.25,  0.  ,  0.  ,  9.  ]])

In [160]:
trainX, testX, trainY, testY = train_test_split(features, balanced_df.enc)
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

# Wektory wspierające

In [127]:
from sklearn import svm

clf = svm.SVC(verbose=True)
clf.fit(trainX, trainY)

[LibSVM]

SVC(verbose=True)

In [128]:
clf.score(testX, testY)

0.67564

In [129]:
clf.score(testX[testY == 1], testY[testY == 1]) # Dane dobre = 1

0.6932590743230267

In [130]:
clf.score(testX[testY == 0], testY[testY == 0]) # Dane złe = 1

0.6564990611308158

In [117]:
clf.predict(sc.transform(calculate_metrics(pd.DataFrame(["shbwgen.blogspot.com	"], columns = ["url"]))))

array([0])

# Drzewo losowe

In [161]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(max_depth=6, random_state=0, criterion="entropy", min_samples_split=2, min_samples_leaf=2,verbose=1)
clf2.fit(trainX, trainY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.0s finished


RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_leaf=2,
                       random_state=0, verbose=1)

In [162]:
clf2.score(testX, testY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


0.6768932766808298

In [163]:
clf2.score(testX[testY == 1], testY[testY == 1]) # Dane dobre

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.7854372425926667

In [164]:
clf2.score(testX[testY == 0], testY[testY == 0]) # Dane złe

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.5588836238751467

In [179]:
clf2.predict(sc.transform(calculate_metrics(pd.DataFrame(["podroznik.pl/"], columns = ["url"]))))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([0])

# ŚMIECI