In [1]:
# importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

# Wczytywanie danych

In [180]:
kaggle = pd.read_csv("../Data/URLs-mixed/kaggle_labeled.csv")
kaggle.info()
kaggle = kaggle[kaggle["label"] == "bad"]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420464 non-null  object
 1   label   420464 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


## Dane polska

In [3]:
gov = pd.read_csv("../Data/URLs-good/gov_poland.csv", skiprows=2, sep=';', usecols=[1], names=["url"])
gov.head(2)

Unnamed: 0,url
0,http://1bcz.wp.mil.pl
1,http://1bdm.wp.mil.pl/pl/28.html


In [4]:
gov["url"] = gov["url"].astype(str)
gov["label"] = "good"
gov.head(2)

Unnamed: 0,url,label
0,http://1bcz.wp.mil.pl,good
1,http://1bdm.wp.mil.pl/pl/28.html,good


## Dane cert

In [5]:
cert = pd.read_csv("https://hole.cert.pl/domains/domains.txt", names=["url"])
cert.head(2)

Unnamed: 0,url
0,008753331120.com
1,02-wiadomosci.com.pl


In [6]:
cert["url"] = cert["url"].astype(str)
cert["label"] = "bad"
cert.head(2)

Unnamed: 0,url,label
0,008753331120.com,bad
1,02-wiadomosci.com.pl,bad


In [7]:
df = pd.concat([kaggle,gov,cert])
df.head(2)

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad


In [8]:
df["url"] = df.url.astype(str)

# Trenowanie modelu

## Czyszczenie danych

In [9]:
from urllib3.util import parse_url
import re

def parseurl(url):
    try:
        url = url.translate({'[': None, ']': None})
        url = parse_url(url)
        return str(url.host)
    except Exception as e:
        if "//" in url:
            url = url.split("//")[1]
        url = url[:url.find("/")]
        return str(url)
    
def parseurl2(url):
    try:
        url = url.translate({'[': "", ']': ""})
        h = re.search("https?:?//", url)
        if h is not None:
            url = url[:h.start()] + url[h.end():]
        url = str(parse_url(url).host)
        
        if url.count('.') == 0:
            return np.nan
        else:
            return url
    except:
        return np.nan

In [10]:
df["url"] = df["url"].astype(str)
df["parsed_url"] = df.url.apply(parseurl2)

In [11]:
df.dropna(inplace=True)

In [12]:
good_domains = df[df["label"] == "good"]
bad = df[df["label"] == "bad"]

In [13]:
df.head(5)

Unnamed: 0,url,label,parsed_url
0,diaryofagameaddict.com,bad,diaryofagameaddict.com
1,espdesign.com.au,bad,espdesign.com.au
2,iamagameaddict.com,bad,iamagameaddict.com
3,kalantzis.net,bad,kalantzis.net
4,slightlyoffcenter.net,bad,slightlyoffcenter.net


In [14]:
df.groupby("label").count()

Unnamed: 0_level_0,url,parsed_url
label,Unnamed: 1_level_1,Unnamed: 2_level_1
bad,92049,92049
good,414286,414286


## Obliczanie metryk

### Popularność domeny

In [120]:
df['domain'] = df["url"].str.split('.', n=1, expand=True)[1]
count = df.groupby(["domain"]).count().reset_index().iloc[:, [0,1]]
count.columns = ["domain", "count"]
df= df.merge(count,on=["domain"])

In [121]:
df

Unnamed: 0,url,label,parsed_url,domain,count
0,diaryofagameaddict.com,bad,diaryofagameaddict.com,com,1613
1,iamagameaddict.com,bad,iamagameaddict.com,com,1613
2,toddscarwash.com,bad,toddscarwash.com,com,1613
3,tubemoviez.com,bad,tubemoviez.com,com,1613
4,sn-gzzx.com,bad,sn-gzzx.com,com,1613
...,...,...,...,...,...
506330,zaktualizacja-platnosci.netaidver.site,bad,zaktualizacja-platnosci.netaidver.site,netaidver.site,1
506331,zaktualizuj-konto.zaktualizuj4562.co.pl,bad,zaktualizuj-konto.zaktualizuj4562.co.pl,zaktualizuj4562.co.pl,1
506332,zaktualizuj.neftlix.tajbirds.com,bad,zaktualizuj.neftlix.tajbirds.com,neftlix.tajbirds.com,1
506333,zap588693-1.plesk10.zap-webspace.com,bad,zap588693-1.plesk10.zap-webspace.com,plesk10.zap-webspace.com,1


### Pozostałe

In [15]:
def count_special_symbols(domain):
    counter = 0
    for char in domain:
        if char.isalpha() or char.isdigit():
            continue
        else:
            counter += 1
    return counter


def count_digits(domain):
    counter = 0
    for char in domain:
        if char.isdigit():
            counter += 1
    return counter


In [122]:
def calculate_metrics(df_):
    
    series = df_["url"]
    # Długość
    length = series.apply(lambda x : len(x))
    
    # Ilość subdomen
    nsubdomains = series.apply(lambda x : x.count('.'))
    
    # Średnia długość subdomen
    meanlensubdomains = series.apply(lambda x : (len(x)-x.count('.'))/x.count('.'))
    
    # Ilość cyfr
    ndigits = series.apply(lambda x : count_digits(x))
    
    # Ilość znaków specjalnych - kropki
    nspecial = series.apply(lambda x : count_special_symbols(x) - x.count('.'))
    
    # Długość pierwszej subdomeny
    lastlen = series.apply(lambda x : len(x.split('.')[0]))
    
    # Popularność domeny

    df_['domain'] = df_["url"].str.split('.', n=1, expand=True)[1]
    df_ = df_.merge(count,on=["domain"])
    
    #df_["top"] = df_["url"].apply(lambda x : x.split('.')[-1])
    #popularity =  df_.apply(lambda x : len( df_[(df_["top"] == x.top) ]) , axis=1)
    
    # Polularność słów występujących
    
    
    return np.array([length, nsubdomains, meanlensubdomains, ndigits, nspecial, lastlen]).T

# Trenowanie modelu

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
sc = StandardScaler()

dane = pd.DataFrame(df.iloc[:,[1,2]])
dane.columns = ["label", "url"]

In [146]:
from sklearn import preprocessing

balanced_df = pd.concat([dane[dane.label == 'bad'],dane[dane.label == 'good'].iloc[1:100000,:]])
le = preprocessing.LabelEncoder()
le.fit(balanced_df.label)
balanced_df["enc"] = le.transform(balanced_df.label)
balanced_df

Unnamed: 0,label,url,enc
0,bad,diaryofagameaddict.com,0
1,bad,iamagameaddict.com,0
2,bad,toddscarwash.com,0
3,bad,tubemoviez.com,0
4,bad,sn-gzzx.com,0
...,...,...,...
148244,good,freepages.genealogy.rootsweb.ancestry.com,1
148245,good,freepages.genealogy.rootsweb.ancestry.com,1
148246,good,freepages.genealogy.rootsweb.ancestry.com,1
148247,good,freepages.genealogy.rootsweb.ancestry.com,1


In [125]:
features = calculate_metrics(balanced_df)
features

array([[22.  ,  1.  , 21.  ,  0.  ,  0.  , 18.  ],
       [18.  ,  1.  , 17.  ,  0.  ,  0.  , 14.  ],
       [16.  ,  1.  , 15.  ,  0.  ,  0.  , 12.  ],
       ...,
       [41.  ,  4.  ,  9.25,  0.  ,  0.  ,  9.  ],
       [41.  ,  4.  ,  9.25,  0.  ,  0.  ,  9.  ],
       [41.  ,  4.  ,  9.25,  0.  ,  0.  ,  9.  ]])

In [160]:
trainX, testX, trainY, testY = train_test_split(features, balanced_df.enc)
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

# Wektory wspierające

In [127]:
from sklearn import svm

clf = svm.SVC(verbose=True)
clf.fit(trainX, trainY)

[LibSVM]

SVC(verbose=True)

In [128]:
clf.score(testX, testY)

0.67564

In [129]:
clf.score(testX[testY == 1], testY[testY == 1]) # Dane dobre = 1

0.6932590743230267

In [130]:
clf.score(testX[testY == 0], testY[testY == 0]) # Dane złe = 1

0.6564990611308158

In [117]:
clf.predict(sc.transform(calculate_metrics(pd.DataFrame(["shbwgen.blogspot.com	"], columns = ["url"]))))

array([0])

# Drzewo losowe

In [161]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(max_depth=6, random_state=0, criterion="entropy", min_samples_split=2, min_samples_leaf=2,verbose=1)
clf2.fit(trainX, trainY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.0s finished


RandomForestClassifier(criterion='entropy', max_depth=6, min_samples_leaf=2,
                       random_state=0, verbose=1)

In [162]:
clf2.score(testX, testY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished


0.6768932766808298

In [163]:
clf2.score(testX[testY == 1], testY[testY == 1]) # Dane dobre

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.7854372425926667

In [164]:
clf2.score(testX[testY == 0], testY[testY == 0]) # Dane złe

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.5588836238751467

In [179]:
clf2.predict(sc.transform(calculate_metrics(pd.DataFrame(["podroznik.pl/"], columns = ["url"]))))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


array([0])

# ŚMIECI