In [1]:
# importing some useful libraries
import pandas as pd
%pylab inline
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from PIL import Image

Populating the interactive namespace from numpy and matplotlib


In [2]:
MODEL_FILE = "../Dumps/model-combined-URLs.sav"
SCALER_FILE = "../Dumps/scaler-combined-URLs.sav"
ENCODER_FILE = "../Dumps/encoder-combined-URLs.sav"

URL_REGEX = ("^((?P<scheme>[^:/?#]+):(?=//))?(//)?(((?P<login>[^:]+)" + 
    "(?::(?P<password>[^@]+)?)?@)?(?P<host>[^@/?#:]*)(?::(?P<port>\d+)?)?)?" + 
    "(?P<path>[^?#]*)(\?(?P<query>[^#]*))?(#(?P<fragment>.*))?")

# Wczytywanie danych

In [73]:
df = pd.read_csv("../Data/cleaned_combined_urls.csv", index_col=0)
df.head(2)

  mask |= (ar1 == a)


Unnamed: 0,url,label
0,?guid=Windows Updates Manager,1.0
1,69.162.100.198/,1.0


In [74]:
df["url"] = df.url.astype(str)

## Czyszczenie danych

In [75]:
print(df.shape)
print(df[df.url.str.count(".") > 0].shape)

(1958842, 2)
(1958842, 2)


In [76]:
url_parse_groups = ["scheme", 4, "host", "port", "path", "query", "fragment"]
df = pd.concat(
    [df, df.url.str.extract(URL_REGEX)[url_parse_groups]],
    axis=1
)    
print(df.shape)
df.columns = ["url", "label", "scheme", "user", "host", "port", "path", "query", "fragment"]
df.head()

(1958842, 9)


Unnamed: 0,url,label,scheme,user,host,port,path,query,fragment
0,?guid=Windows Updates Manager,1.0,,,,,,guid=Windows Updates Manager,
1,69.162.100.198/,1.0,,,69.162.100.198,,/,,
2,babicz123.ddns.net/,1.0,,,babicz123.ddns.net,,/,,
3,highpowerresources.com,1.0,,,highpowerresources.com,,,,
4,intent.nofrillspace.com/users/web11_focus/380...,1.0,,,intent.nofrillspace.com,,/users/web11_focus/3807/space.gif,,


In [77]:
df.groupby("label").count()

Unnamed: 0_level_0,url,scheme,user,host,port,path,query,fragment
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,1725805,963664,0,1725805,182,1725805,19340,518
1.0,233037,154514,0,233037,1121,233037,10189,135


## Obliczanie metryk

### Popularność domeny

In [78]:
df['tld'] = df.host.str.split(".").apply(lambda x : x[-1])
count = df.groupby(["tld"]).count().reset_index().iloc[:, [0,1]]
count.columns = ["tld", "count"]
df = df.merge(count, on=["tld"])
df.head(10)

Unnamed: 0,url,label,scheme,user,host,port,path,query,fragment,tld,count
0,?guid=Windows Updates Manager,1.0,,,,,,guid=Windows Updates Manager,,,16
1,250sb.com./jynvmx,1.0,,,250sb.com.,,/jynvmx,,,,16
2,asseveravronnakiewietsblom.shopdentalsupply.com.,1.0,,,asseveravronnakiewietsblom.shopdentalsupply.com.,,,,,,16
3,bkent.net./Doc/simple5.htm,0.0,,,bkent.net.,,/Doc/simple5.htm,,,,16
4,komunistycznymi.afshinnejad.com.,1.0,,,komunistycznymi.afshinnejad.com.,,,,,,16
5,http://medicalofficeoutsourcing.com.,0.0,http,,medicalofficeoutsourcing.com.,,,,,,16
6,nubeculaminor-blossgestellter.f-oaks.com.,1.0,,,nubeculaminor-blossgestellter.f-oaks.com.,,,,,,16
7,perverselymotorbikes.shopmedicalgloves.com.,1.0,,,perverselymotorbikes.shopmedicalgloves.com.,,,,,,16
8,pseudoalcaligenes.nhconstruction.com.,1.0,,,pseudoalcaligenes.nhconstruction.com.,,,,,,16
9,ssl-allegro.comuf.com./allegro.html,1.0,,,ssl-allegro.comuf.com.,,/allegro.html,,,,16


### Pozostałe

In [27]:
import re
import ipaddress

def count_special_symbols(domain):
    counter = 0
    for char in domain:
        if char.isalpha() or char.isdigit():
            continue
        else:
            counter += 1
    return counter

def count_digits(domain):
    counter = 0
    for char in domain:
        if char.isdigit():
            counter += 1
    return counter

def count_digit_letter(domain):
    """Zlicza kombinacje litera cyfra"""
    res = re.findall("[A-Za-z][0-9]", domain)
    return len(res)

def count_sus(domain):
    res = re.findall("https|http|www", domain)
    return len(res)

def has_a(domain):
    """Sprawdza czy domain ma @"""
    res = re.findall("@", domain)
    return len(res)

def has_pref_or_suff(domain):
    """Sprawdza czy domain ma -"""
    res = re.findall("-", domain)
    return len(res)

def is_ipv4(string):
    try:
        ipaddress.IPv4Network(string)
        return True
    except ValueError:
        return False

In [28]:
def calculate_metrics(df_):
    ##### Schema
    schema = df_.schema
    schema = schema.fillna("n")
    schema = schema.replace({"https": 1, "http": 0, "n": 0.5, 'none': 0.5, 'hhtp': 0, 'nttps': 1, "htpps": 1, "htps": 1, "htt": 0})
    schema = schema.astype('float64')
    
    #### Host
    host = df_.host
    host_len = host.apply(lambda x : len(x)) # Długość 
    host_subdomains_count = host.apply(lambda x : x.count('.')) # Ilość subdomen
    host_subdomains_mean_len = host.apply(lambda x : (len(x)-x.count('.'))/x.count('.')) # Średnia długość domen
    host_digit = host.apply(lambda x : count_digits(x)) # Ilość cyfr
    host_nspecial = host.apply(lambda x : count_special_symbols(x) - x.count('.')) # Ilość znaków specjalnych - kropki
    host_first_len = host.apply(lambda x : len(x.split('.')[0])) # Długość pierwszej subdomeny
    host_digit_letter_count = host.apply(lambda x : count_digit_letter(x)) # Ilość kombinacji litera-cyfra
    host_has_a = host.apply(lambda x : has_a(x)) # Sprawdza czy host ma @
    host_pref_suf_number = host.apply(lambda x : has_pref_or_suff(x)) # Sprawdza czy ma -
    host_is_ipv4 = host.apply(lambda x : is_ipv4(x)) # Sprawdza czy host to adres ipv4s
    

    return np.array([schema, host_len, host_subdomains_count, host_subdomains_mean_len, host_digit, host_nspecial, host_first_len, host_digit_letter_count, host_has_a, host_pref_suf_number, host_is_ipv4]).T

In [29]:
calculate_metrics(df)

array([[ 1., 14.,  2., ...,  0.,  0.,  0.],
       [ 1., 15.,  2., ...,  0.,  0.,  0.],
       [ 1., 13.,  2., ...,  0.,  0.,  0.],
       ...,
       [ 1., 23.,  1., ...,  0.,  0.,  0.],
       [ 0., 15.,  2., ...,  0.,  0.,  0.],
       [ 0., 16.,  3., ...,  0.,  1.,  0.]])

# Trenowanie modelu

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
sc = StandardScaler()

dane = df

In [31]:
from sklearn import preprocessing

balanced_df = pd.concat([dane[dane.label == 'bad'],dane[dane.label == 'good'].iloc[1:100000,:]])
le = preprocessing.LabelEncoder()
le.fit(balanced_df.label)
balanced_df["enc"] = le.transform(balanced_df.label)
balanced_df

Unnamed: 0,index,url,label,parsed_url,schema,auth,host,port,path,query,fragment,enc
45378,0,inpost.pl-paydelivery.cyou,bad,"(None, None, inpost.pl-paydelivery.cyou, None,...",,,inpost.pl-paydelivery.cyou,,,,,0
45379,1,inpost.pl-paydelivery.work,bad,"(None, None, inpost.pl-paydelivery.work, None,...",,,inpost.pl-paydelivery.work,,,,,0
45380,2,inpost.pl-paydelivery.xyz,bad,"(None, None, inpost.pl-paydelivery.xyz, None, ...",,,inpost.pl-paydelivery.xyz,,,,,0
45381,3,www.inpost.new-pl-oferta.cyou,bad,"(None, None, www.inpost.new-pl-oferta.cyou, No...",,,www.inpost.new-pl-oferta.cyou,,,,,0
45382,4,inpost.new-pl-oferta.casa,bad,"(None, None, inpost.new-pl-oferta.casa, None, ...",,,inpost.new-pl-oferta.casa,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
45373,35373,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1
45374,35374,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1
45375,35375,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1
45376,35376,https://lastpass.com/signup2.php?ac=1&from_uri...,good,"(https, None, lastpass.com, None, /signup2.php...",https,,lastpass.com,,/signup2.php,ac=1&from_uri=%2Fsignup2.php%3Fac%3D1%26from_u...,,1


In [32]:
features = calculate_metrics(balanced_df)
features

array([[ 0.5, 26. ,  2. , ...,  0. ,  1. ,  0. ],
       [ 0.5, 26. ,  2. , ...,  0. ,  1. ,  0. ],
       [ 0.5, 25. ,  2. , ...,  0. ,  1. ,  0. ],
       ...,
       [ 1. , 12. ,  1. , ...,  0. ,  0. ,  0. ],
       [ 1. , 12. ,  1. , ...,  0. ,  0. ,  0. ],
       [ 1. ,  9. ,  1. , ...,  0. ,  0. ,  0. ]])

In [33]:
trainX, testX, trainY, testY = train_test_split(features, balanced_df.enc, test_size=0.2)
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

# Wektory wspierające

In [34]:
from sklearn import svm

clf = svm.SVC(verbose=True)
clf.fit(trainX, trainY)

[LibSVM]

SVC(verbose=True)

In [35]:
clf.score(testX, testY)

0.990602588409859

In [36]:
clf.score(testX[testY == 1], testY[testY == 1]) # Dane dobre = 1

0.9987789987789988

In [37]:
clf.score(testX[testY == 0], testY[testY == 0]) # Dane złe = 1

0.9670083279948751

In [38]:
#clf.predict(sc.transform(calculate_metrics(pd.DataFrame(["shbwgen.blogspot.com	"], columns = ["url"]))))

# Drzewo losowe

In [52]:
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(max_depth=3, random_state=0, min_samples_split=2, min_samples_leaf=2,verbose=1)
clf2.fit(trainX, trainY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


RandomForestClassifier(max_depth=3, min_samples_leaf=2, random_state=0,
                       verbose=1)

In [53]:
clf2.score(testX, testY)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.9209463358338142

In [54]:
clf2.score(testX[testY == 1], testY[testY == 1]) # Dane dobre

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.9839637248396372

In [55]:
clf2.score(testX[testY == 0], testY[testY == 0]) # Dane złe

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


0.7364842991259307

In [39]:
def transform(url):
    d = pd.DataFrame([url], columns = ['url'])
    d['parsed_url'] = d.url.apply(lambda x : parseurl(x))
    u = d.url.apply(lambda x : list(parseurl(x)))
    d['schema'] = u.apply(lambda x : x[0])
    d['auth'] = u.apply(lambda x : x[1])
    d['host'] = u.apply(lambda x : x[2])
    d['port'] = u.apply(lambda x : x[3])
    d['path'] = u.apply(lambda x : x[4])
    d['query'] = u.apply(lambda x : x[5])
    d['fragment'] = u.apply(lambda x : x[6])
    return d

In [40]:
transform("allegro.pl")

Unnamed: 0,url,parsed_url,schema,auth,host,port,path,query,fragment
0,allegro.pl,"(None, None, allegro.pl, None, None, None, None)",,,allegro.pl,,,,


# Podsumowanie
Model w obecnym stanie jest w stanie rozpoznać proste podmiany i działa szybko, jak dla mnie nadaje się do wrzucenia.

In [41]:
clf.predict(sc.transform(calculate_metrics(transform("https://allegro.pl"))))

array([1])

In [42]:
clf.predict(sc.transform(calculate_metrics(transform("https://a11egro.pl"))))

array([0])

# Zapisanie modelu

In [45]:
import joblib

joblib.dump(clf, model_file) # SVM
joblib.dump(sc, scaler_file) # StandardScaler
joblib.dump(le, encoder_file) # LabelEncoder

['../Dumps/encoder.sav']