In [5]:
# importing some useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.tokenize import RegexpTokenizer  
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.pipeline import make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pickle 

# Wczytywanie danych

In [102]:
kaggle = pd.read_csv("../Data/URLs-mixed/kaggle_labeled.csv")
kaggle.head(2) # url label
kaggle.groupby("label").count()

Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
bad,75643
good,344821


## Dane polska

In [100]:
gov = pd.read_csv("../Data/URLs-good/gov_poland.csv", skiprows=2, sep=';', usecols=[1], names=["url"])
gov

Unnamed: 0,url
0,http://1bcz.wp.mil.pl
1,http://1bdm.wp.mil.pl/pl/28.html
2,www.1bsp.wp.mil.pl
3,pulawypz.6.e-bip.pl
4,https://lo1.gizycko.edu.pl/
...,...
69521,https://www.gov.pl/web/hayastan
69522,https://www.gov.pl/web/hanguk
69523,https://www.gov.pl/web/nippon
69524,https://www.gov.pl/web/zhongguo


In [68]:
gov["url"] = gov["url"].str.replace("https://","")
gov["url"] = gov["url"].str.replace("http://","")
gov["url"] = gov["url"].str.replace("www.","")
gov["url"] = gov["url"].astype(str)
gov["label"] = "good"
gov.head(2)

  gov["url"] = gov["url"].str.replace("www.","")


Unnamed: 0,url,label
0,1bcz.wp.mil.pl,good
1,1bdm.wp.mil.pl/pl/28.html,good


## Dane cert

In [101]:
cert = pd.read_csv("https://hole.cert.pl/domains/domains.txt", names=["url"])
cert

Unnamed: 0,url
0,008753331120.com
1,02-wiadomosci.com.pl
2,03e1i.csb.app
3,0b8cc331813049912.temporary.link
4,0lx.group
...,...
15372,zwrocsluch.site
15373,zycienieuslaneluksusem.website
15374,zyciezdnianadzien.cyou
15375,zyjesiedlatakichhchwil.website


In [83]:
cert["url"] = cert["url"].astype(str)
cert["label"] = "bad"
cert.head(2)

Unnamed: 0,url,label
0,008753331120.com,bad
1,02-wiadomosci.com.pl,bad


In [103]:
df = pd.concat([kaggle,gov,cert])
df.groupby("label").count()

Unnamed: 0_level_0,url
label,Unnamed: 1_level_1
bad,75643
good,344821


# Trenowanie modelu

In [86]:
## RegexpTokenizer
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
# Tokenizing all the rows 
print('Getting words tokenized ...')
t0= time.perf_counter()
df['text_tokenized'] = df.url.map(lambda t: tokenizer.tokenize(t))
t1 = time.perf_counter() - t0
print('Time taken',t1 ,'sec')

## SnowballStemmer
stemmer = SnowballStemmer("english") # choose a language
# Getting all the stemmed words
df['text_stemmed'] = df['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])

# Joining all the stemmmed words.
df['text_sent'] = df['text_stemmed'].map(lambda l: ' '.join(l))

## Creating model
cv = CountVectorizer()
feature = cv.fit_transform(df.text_sent) #transform all text which we tokenize and stemed

trainX, testX, trainY, testY = train_test_split(feature, df.label)

Getting words tokenized ...
Time taken 2.010560700000042 sec


In [87]:
## LogisticRegression
lr = LogisticRegression(solver='saga',max_iter=350)
lr.fit(trainX,trainY)

print("Score: ",lr.score(testX,testY))



Score:  0.9674054139623239


In [98]:
lr.predict(cv.transform([' '.join([stemmer.stem(word) for word in tokenizer.tokenize("1bcz.wp.mil.pl")])]))

array(['good'], dtype=object)

In [92]:
tokenizer.tokenize("a11egro.pl")

['a', 'egro', 'pl']

In [97]:
cv.transform([' '.join([stemmer.stem(word) for word in tokenizer.tokenize("1bcz.wp.mil.pl")])])

<1x290653 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>