In [1]:
# importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

# Wczytywanie danych

In [2]:
kaggle = pd.read_csv("../Data/URLs-mixed/kaggle_labeled.csv")
kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420464 entries, 0 to 420463
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   url     420464 non-null  object
 1   label   420464 non-null  object
dtypes: object(2)
memory usage: 6.4+ MB


## Dane polska

In [3]:
gov = pd.read_csv("../Data/URLs-good/gov_poland.csv", skiprows=2, sep=';', usecols=[1], names=["url"])
gov.head(2)

Unnamed: 0,url
0,http://1bcz.wp.mil.pl
1,http://1bdm.wp.mil.pl/pl/28.html


In [4]:
gov["url"] = gov["url"].astype(str)
gov["label"] = "good"
gov.head(2)

Unnamed: 0,url,label
0,http://1bcz.wp.mil.pl,good
1,http://1bdm.wp.mil.pl/pl/28.html,good


## Dane cert

In [5]:
cert = pd.read_csv("https://hole.cert.pl/domains/domains.txt", names=["url"])
cert.head(2)

Unnamed: 0,url
0,008753331120.com
1,02-wiadomosci.com.pl


In [6]:
cert["url"] = cert["url"].astype(str)
cert["label"] = "bad"
cert.head(2)

Unnamed: 0,url,label
0,008753331120.com,bad
1,02-wiadomosci.com.pl,bad


In [7]:
df = pd.concat([kaggle,gov,cert])
df.head(2)

Unnamed: 0,url,label
0,diaryofagameaddict.com,bad
1,espdesign.com.au,bad


In [8]:
df["url"] = df.url.astype(str)

# Trenowanie modelu

## Czyszczenie danych

In [9]:
from urllib3.util import parse_url
import re

def parseurl(url):
    try:
        url = url.translate({'[': None, ']': None})
        url = parse_url(url)
        return str(url.host)
    except Exception as e:
        if "//" in url:
            url = url.split("//")[1]
        url = url[:url.find("/")]
        return str(url)
    
def parseurl2(url):
    try:
        url = url.translate({'[': "", ']': ""})
        h = re.search("https?:?//", url)
        if h is not None:
            url = url[:h.start()] + url[h.end():]
        url = str(parse_url(url).host)
        
        if url.count('.') == 0:
            return np.nan
        else:
            return url
    except:
        return np.nan

In [10]:
df["url"] = df["url"].astype(str)
df["parsed_url"] = df.url.apply(parseurl2)

In [11]:
df.dropna(inplace=True)

In [12]:
good_domains = df[df["label"] == "good"]
bad = df[df["label"] == "bad"]

In [13]:
df.head(5)

Unnamed: 0,url,label,parsed_url
0,diaryofagameaddict.com,bad,diaryofagameaddict.com
1,espdesign.com.au,bad,espdesign.com.au
2,iamagameaddict.com,bad,iamagameaddict.com
3,kalantzis.net,bad,kalantzis.net
4,slightlyoffcenter.net,bad,slightlyoffcenter.net


In [14]:
df.groupby("label").count()

Unnamed: 0_level_0,url,parsed_url
label,Unnamed: 1_level_1,Unnamed: 2_level_1
bad,92049,92049
good,414286,414286


## Obliczanie metryk

### Popularność domeny

### Pozostałe

In [15]:
def count_special_symbols(domain):
    counter = 0
    for char in domain:
        if char.isalpha() or char.isdigit():
            continue
        else:
            counter += 1
    return counter


def count_digits(domain):
    counter = 0
    for char in domain:
        if char.isdigit():
            counter += 1
    return counter


In [16]:
def calculate_metrics(df_):
    
    series = df_["url"]
    # Długość
    length = series.apply(lambda x : len(x))
    
    # Ilość subdomen
    nsubdomains = series.apply(lambda x : x.count('.'))
    
    # Średnia długość subdomen
    meanlensubdomains = series.apply(lambda x : (len(x)-x.count('.'))/x.count('.'))
    
    # Ilość cyfr
    ndigits = series.apply(lambda x : count_digits(x))
    
    # Ilość znaków specjalnych - kropki
    nspecial = series.apply(lambda x : count_special_symbols(x) - x.count('.'))
    
    # Długość pierwszej subdomeny
    lastlen = series.apply(lambda x : len(x.split('.')[0]))
    
    # Popularność domeny
    df_['domain'] = df_["url"].str.split('.', n=1, expand=True)[1]
    count = df_.groupby(["label", "domain"]).count().reset_index().iloc[:, [0,1,2]]
    count.columns = ["label", "domain", "count"]
    df_ = df_.merge(count,on=["label", "domain"])
    
    #df_["top"] = df_["url"].apply(lambda x : x.split('.')[-1])
    #popularity =  df_.apply(lambda x : len( df_[(df_["top"] == x.top) ]) , axis=1)
    
    # Polularność słów występujących
    
    
    return np.array([length, nsubdomains, meanlensubdomains, ndigits, nspecial, lastlen, df_["count"]]).T

# Trenowanie modelu

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
sc = StandardScaler()

dane = pd.DataFrame(df.iloc[:,[1,2]])
dane.columns = ["label", "url"]

In [57]:
from sklearn import preprocessing

balanced_df = pd.concat([dane[dane.label == 'bad'],dane[dane.label == 'good'].iloc[1:100000,:]])
le = preprocessing.LabelEncoder()
le.fit(balanced_df.label)
balanced_df["enc"] = le.transform(balanced_df.label)
balanced_df

Unnamed: 0,label,url,enc
0,bad,diaryofagameaddict.com,0
1,bad,espdesign.com.au,0
2,bad,iamagameaddict.com,0
3,bad,kalantzis.net,0
4,bad,slightlyoffcenter.net,0
...,...,...,...
142762,good,shayneandamberhoskins.blogspot.com,1
142763,good,shaystallings.blogspot.com,1
142764,good,shbwgen.blogspot.com,1
142765,good,shdownloadmovies.com,1


In [58]:
features = calculate_metrics(balanced_df)
features

array([[2.2000e+01, 1.0000e+00, 2.1000e+01, ..., 0.0000e+00, 1.8000e+01,
        2.7987e+04],
       [1.6000e+01, 2.0000e+00, 7.0000e+00, ..., 0.0000e+00, 9.0000e+00,
        2.7987e+04],
       [1.8000e+01, 1.0000e+00, 1.7000e+01, ..., 0.0000e+00, 1.4000e+01,
        2.7987e+04],
       ...,
       [2.0000e+01, 2.0000e+00, 9.0000e+00, ..., 0.0000e+00, 7.0000e+00,
        1.0000e+00],
       [2.0000e+01, 1.0000e+00, 1.9000e+01, ..., 0.0000e+00, 1.6000e+01,
        1.0000e+00],
       [3.0000e+01, 2.0000e+00, 1.4000e+01, ..., 0.0000e+00, 2.4000e+01,
        1.0000e+00]])

In [59]:
trainX, testX, trainY, testY = train_test_split(features, balanced_df.enc, stratify=balanced_df.enc, train_size=50000, test_size=50000)
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

# Wektory wspierające

In [60]:
from sklearn import svm

clf = svm.SVC(verbose=True)
clf.fit(trainX, trainY)

[LibSVM]

SVC(verbose=True)

In [61]:
clf.score(testX, testY)

0.86418

In [62]:
clf.score(testX[testY == 1], testY[testY == 1]) # Dane dobre

0.8362588822738621

In [63]:
clf.score(testX[testY == 0], testY[testY == 0]) # Dane złe

0.8945128312121844

# Drzewo losowe

In [64]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(max_depth=4, random_state=0, criterion="entropy", min_samples_split=2, min_samples_leaf=5,verbose=1)
clf2.fit(trainX, trainY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished


RandomForestClassifier(criterion='entropy', max_depth=4, min_samples_leaf=5,
                       random_state=0, verbose=1)

In [65]:
clf2.score(testX, testY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


0.84628

In [66]:
clf2.score(testX[testY == 1], testY[testY == 1]) # Dane dobre

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.850124831956981

In [67]:
clf2.score(testX[testY == 0], testY[testY == 0]) # Dane złe

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.8421030669726685

# ŚMIECI

In [18]:
from multiprocessing import  Pool
import random
def parallelize_dataframe(df__, func, n_cores=4):
    df_split = np.array_split(df__, n_cores)
    pool = Pool(n_cores)
    df_ = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df_

In [None]:
get_best_lev_match(data)

In [None]:
train = parallelize_dataframe(data, get_best_lev_match,n_cores=16)

In [74]:
parse_url("pm137lodz.wikom.pl")

Url(scheme=None, auth=None, host='pm137lodz.wikom.pl', port=None, path=None, query=None, fragment=None)

In [95]:
re.search("https?:?\/\/", "pm137lodz.wikom.pl")