In [1]:
# importing some useful libraries
%pylab inline

import pandas as pd
import time 
import joblib

from sklearn import metrics
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from PIL import Image
rng = np.random.default_rng(12345)

Populating the interactive namespace from numpy and matplotlib


In [2]:
MODEL_FILE = "../Dumps/model-combined-URLs.sav"
SCALER_FILE = "../Dumps/scaler-combined-URLs.sav"
ENCODER_FILE = "../Dumps/encoder-combined-URLs.sav"

URL_REGEX = ("^((?P<scheme>[^:/?#]+):(?=//))?(//)?(((?P<login>[^:]+)" + 
    "(?::(?P<password>[^@]+)?)?@)?(?P<host>[^@/?#:]*)(?::(?P<port>\d+)?)?)?" + 
    "(?P<path>[^?#]*)(\?(?P<query>[^#]*))?(#(?P<fragment>.*))?")

# Wczytywanie danych

In [3]:
df = pd.read_csv("../Data/cleaned_combined_urls.csv", index_col=0)
df.head(2)

  mask |= (ar1 == a)


Unnamed: 0,url,label
0,?guid=Windows Updates Manager,1.0
1,69.162.100.198/,1.0


In [4]:
df["url"] = df.url.astype(str)

## Czyszczenie danych

In [5]:
print(df.shape)
print(df[df.url.str.count(".") > 0].shape)

(1958842, 2)
(1958842, 2)


In [6]:
url_parse_groups = ["scheme", 4, "host", "port", "path", "query", "fragment"]
df = pd.concat(
    [df, df.url.str.extract(URL_REGEX)[url_parse_groups]],
    axis=1
)    
print(df.shape)
df.columns = ["url", "label", "scheme", 
              "user", "host", "port", 
              "path", "query", "fragment"]
df.head()

(1958842, 9)


Unnamed: 0,url,label,scheme,user,host,port,path,query,fragment
0,?guid=Windows Updates Manager,1.0,,,,,,guid=Windows Updates Manager,
1,69.162.100.198/,1.0,,,69.162.100.198,,/,,
2,babicz123.ddns.net/,1.0,,,babicz123.ddns.net,,/,,
3,highpowerresources.com,1.0,,,highpowerresources.com,,,,
4,intent.nofrillspace.com/users/web11_focus/380...,1.0,,,intent.nofrillspace.com,,/users/web11_focus/3807/space.gif,,


In [41]:
df.drop(np.where(df.host.str.len() == 0)[0], inplace=True)

In [42]:
df.groupby("label").count()

Unnamed: 0_level_0,url,scheme,user,host,port,path,query,fragment,tld,count
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,1725805,963664,0,1725805,182,1725805,19340,518,1725805,1725805
1.0,233036,154514,0,233036,1121,233036,10188,135,233036,233036


## Obliczanie metryk

### Popularność domeny

In [43]:
df['tld'] = df.host.str.split(".").apply(lambda x : x[-1])
count = df.groupby(["tld"]).count().reset_index().iloc[:, [0,1]]
count.columns = ["tld", "count"]
df = df.merge(count, on=["tld"])
df.head(10)

Unnamed: 0,url,label,scheme,user,host,port,path,query,fragment,tld,count_x,count_y
0,250sb.com./jynvmx,1.0,,,250sb.com.,,/jynvmx,,,,16,15
1,asseveravronnakiewietsblom.shopdentalsupply.com.,1.0,,,asseveravronnakiewietsblom.shopdentalsupply.com.,,,,,,16,15
2,bkent.net./Doc/simple5.htm,0.0,,,bkent.net.,,/Doc/simple5.htm,,,,16,15
3,komunistycznymi.afshinnejad.com.,1.0,,,komunistycznymi.afshinnejad.com.,,,,,,16,15
4,http://medicalofficeoutsourcing.com.,0.0,http,,medicalofficeoutsourcing.com.,,,,,,16,15
5,nubeculaminor-blossgestellter.f-oaks.com.,1.0,,,nubeculaminor-blossgestellter.f-oaks.com.,,,,,,16,15
6,perverselymotorbikes.shopmedicalgloves.com.,1.0,,,perverselymotorbikes.shopmedicalgloves.com.,,,,,,16,15
7,pseudoalcaligenes.nhconstruction.com.,1.0,,,pseudoalcaligenes.nhconstruction.com.,,,,,,16,15
8,ssl-allegro.comuf.com./allegro.html,1.0,,,ssl-allegro.comuf.com.,,/allegro.html,,,,16,15
9,syydettyjendatumm.brockalumni.com.,1.0,,,syydettyjendatumm.brockalumni.com.,,,,,,16,15


### Pozostałe

In [44]:
def calculate_metrics(df_):
    ##### Scheme
    scheme = df_.scheme
    scheme = scheme.fillna("n")
    scheme = scheme.replace({
        "[nh]+t+p+s+": 0, 
        "[nh]+t+p*(?![a-z]*s)" : 1,
        "n": 0.5, 
        "none": 0.5,
        "\w+" : 0.5
    }, regex=True)
    scheme = scheme.astype('float64')    
    
    #### Host
    host = df_.host
    host_len = host.str.len()
    host_subdomains_count = host.str.count(".")
    host_subdomains_mean_len = (host_len - host_subdomains_count) / host_subdomains_count
    host_digit = host.str.count("\d")
    host_nspecial = host.str.count("([^A-Za-z\d\s])")
    host_first_len = host.str.split(".", n=1, expand=True)[0].str.len()
    host_digit_letter_count = host.str.count("[A-Za-z]\d|\d[A-Za-z]")
    host_has_a = host.str.count("@")
    host_pref_suf_number = host.str.count("[-_]")
    host_is_ipv4 = host.str.count("(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" + 
                                  "(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}")
    return np.array([
        scheme, host_len, host_subdomains_count, host_subdomains_mean_len, 
        host_digit, host_nspecial, host_first_len, host_digit_letter_count, 
        host_has_a, host_pref_suf_number, host_is_ipv4
    ]).T

In [45]:
df[["url", "scheme"]].groupby("scheme").count()

Unnamed: 0_level_0,url
scheme,Unnamed: 1_level_1
http,1
ftp,207
gopher,18
hXXp,24
hXXps,2
htpp,1
http,1106003
https,11888
https.portal.apple.com.idmswebauth.login.html.appidkey.05c7e09b5896b0334b3af1139274f266b2hxxp,1
http,1


In [46]:
calculate_metrics(df)

array([[ 0.5, 10. , 10. , ...,  0. ,  0. ,  0. ],
       [ 0.5, 48. , 48. , ...,  0. ,  0. ,  0. ],
       [ 0.5, 10. , 10. , ...,  0. ,  0. ,  0. ],
       ...,
       [ 0.5, 27. , 27. , ...,  0. ,  4. ,  0. ],
       [ 1. , 22. , 22. , ...,  0. ,  0. ,  0. ],
       [ 1. , 20. , 20. , ...,  0. ,  0. ,  0. ]])

# Trenowanie modelu

In [52]:
sc = StandardScaler()
dane = df
label_distribution = df.groupby("label").count()
label_distribution

Unnamed: 0_level_0,url,scheme,user,host,port,path,query,fragment,tld,count_x,count_y
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,1725805,963664,0,1725805,182,1725805,19340,518,1725805,1725805,1725805
1.0,233036,154514,0,233036,1121,233036,10188,135,233036,233036,233036


In [58]:
balanced_df_half_size = round(min(label_distribution.url) / 100) * 100
good_indices = rng.integers(0, label_distribution.url[0]-1, balanced_df_half_size)
bad_indices = rng.integers(0, label_distribution.url[1]-1, balanced_df_half_size)

balanced_df = pd.concat(
    [dane[dane.label == 0].iloc[good_indices],
     dane[dane.label == 1].iloc[bad_indices]])
le = preprocessing.LabelEncoder()
le.fit(balanced_df.label)
balanced_df["enc"] = le.transform(balanced_df.label)
balanced_df

Unnamed: 0,url,label,scheme,user,host,port,path,query,fragment,tld,count_x,count_y,enc
1738329,http://www.hlpcontrols.com.au/,0.0,http,,www.hlpcontrols.com.au,,/,,,au,26017,26017,0
610968,vrbrothers.com,0.0,,,vrbrothers.com,,,,,com,1186983,1186983,0
273609,frederatorblogs.com/channel_frederator/categor...,0.0,,,frederatorblogs.com,,/channel_frederator/category/cartoon-network/,,,com,1186983,1186983,0
1166523,http://www.stocknod.com,0.0,http,,www.stocknod.com,,,,,com,1186983,1186983,0
630692,http://www.2steppin.com/,0.0,http,,www.2steppin.com,,/,,,com,1186983,1186983,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1933377,http://www.saservices.com.ph/KWLIVE/docss/cont...,1.0,http,,www.saservices.com.ph,,/KWLIVE/docss/contactform.php,,,ph,1021,1021,1
1919942,http://www.trinityweblinks-hubertchristian.6x.to,1.0,http,,www.trinityweblinks-hubertchristian.6x.to,,,,,to,663,663,1
61291,http://www.gctf.net,1.0,http,,www.gctf.net,,,,,net,90987,90987,1
1139714,http://www.siabtgb.com,1.0,http,,www.siabtgb.com,,,,,com,1186983,1186983,1


In [59]:
features = calculate_metrics(balanced_df)
features

array([[ 1. , 22. , 22. , ...,  0. ,  0. ,  0. ],
       [ 0.5, 14. , 14. , ...,  0. ,  0. ,  0. ],
       [ 0.5, 19. , 19. , ...,  0. ,  0. ,  0. ],
       ...,
       [ 1. , 12. , 12. , ...,  0. ,  0. ,  0. ],
       [ 1. , 15. , 15. , ...,  0. ,  0. ,  0. ],
       [ 0.5, 25. , 25. , ...,  0. ,  0. ,  0. ]])

In [60]:
trainX, testX, trainY, testY = train_test_split(features, balanced_df.enc, test_size=0.2)
trainX = sc.fit_transform(trainX)
testX = sc.transform(testX)

# Wektory wspierające

In [None]:
from sklearn import svm

clf = svm.SVC(verbose=True)
clf.fit(trainX, trainY)

[LibSVM]

In [35]:
clf.score(testX, testY)

0.990602588409859

In [36]:
clf.score(testX[testY == 1], testY[testY == 1]) # Dane dobre = 1

0.9987789987789988

In [37]:
clf.score(testX[testY == 0], testY[testY == 0]) # Dane złe = 1

0.9670083279948751

In [None]:
def transform(url):
    df_ = pd.DataFrame([url], columns = ['url'])
    df_ = pd.concat(
        [df_, df_.url.str.extract(URL_REGEX)[url_parse_groups]],
        axis=1
    )
    df_.columns = ["url", "label", "scheme", 
                  "user", "host", "port", 
                  "path", "query", "fragment"]
    return df_    

# Podsumowanie
Model w obecnym stanie jest w stanie rozpoznać proste podmiany i działa szybko, jak dla mnie nadaje się do wrzucenia.

In [41]:
clf.predict(sc.transform(calculate_metrics(transform("https://allegro.pl"))))

array([1])

In [42]:
clf.predict(sc.transform(calculate_metrics(transform("https://a11egro.pl"))))

array([0])

# Zapisanie modelu

In [45]:
import joblib

joblib.dump(clf, MODEL_FILE) # SVM
joblib.dump(sc, SCALER_FILE) # StandardScaler
joblib.dump(le, ENCODER_FILE) # LabelEncoder

['../Dumps/encoder.sav']