# 0. Init

## 0.1 Import Modules

In [1]:
from urllib.parse import urlparse, quote
import wordsegment as ws
from os import cpu_count

# custom modules
from code.util import *
from code.preprocessor import *
from code.gofaster import *

# dataframe parallelizer
# os.cpu_count() returns None when the core count cannot be determined, and on
# a single-core machine `cpu_count() - 1` would request zero workers, so fall
# back to one CPU and always keep at least one job.
n_cpus = cpu_count() or 1
gf = GoFaster(n_jobs=max(1, n_cpus - 1), n_partitions=n_cpus * 3)

# load wordsegment dictionary
ws.load()

## 0.2 Load Data

In [2]:
# Alternative input (presumably the full, unsampled dataset — TODO confirm);
# left disabled so the notebook runs on the smaller sampled file.
# urls = load("data/urls.pkl")
# Sampled dataset; this is the file the sampling cell in section 1.1 writes.
urls = load("data/urls_sampled.pkl")

# 1. Preprocessing

## 1.1 Sampling

Take a random 5% sample of the normal URLs (`target == 0`) and keep all phishing URLs (`target == 1`), for a total of ~67k data points.

In [9]:
# Always sample from the full dataset. The load cell binds `urls` to
# data/urls_sampled.pkl, so sampling from `urls` on a fresh Run All would take
# 5% of an already-sampled frame and overwrite the saved sample with it.
urls_full = load("data/urls.pkl")

# 5% of the target == 0 rows (fixed seed), plus every target == 1 row.
sample = urls_full[urls_full.target == 0].sample(frac=0.05, random_state=42)
urls_sampled = pd.concat([sample, urls_full[urls_full.target == 1]])
save(urls_sampled, "data/urls_sampled.pkl")
print(len(urls_sampled))

67690


In [3]:
def cleaner(df):
    """Add a ``url_clean`` column by applying ``clean`` to every value of ``df.url``.

    Written to be handed to ``gf.parallelize`` together with the frame to process.

    Args:
        df: DataFrame with a ``url`` column.

    Returns:
        A new DataFrame with the columns of ``df`` plus ``url_clean``. The
        input frame is not modified.
    """
    # assign() builds a new frame instead of writing into `df`, which may be a
    # slice of the caller's DataFrame (presumably a partition — TODO confirm
    # how GoFaster splits the frame).
    return df.assign(url_clean=df.url.apply(clean))

# Run `cleaner` through the GoFaster parallelizer and rebind `urls` to the result.
urls = gf.parallelize(urls, cleaner)

In [None]:
# CAUTION! Takes forever...

def segmenter(df):
    """Add a ``url_segment`` column holding the word-segmented form of ``url_clean``.

    Each cleaned URL is passed to ``ws.segment`` and the resulting tokens are
    joined with single spaces into one string.

    Args:
        df: DataFrame with a ``url_clean`` column.

    Returns:
        A new DataFrame with the columns of ``df`` plus ``url_segment``. The
        input frame is not modified.
    """
    def segment_url(url):
        # ws.segment returns a list of tokens; store them as one string.
        return " ".join(ws.segment(url))

    # assign() builds a new frame instead of writing into `df`, which may be a
    # slice of the caller's DataFrame (presumably a partition — TODO confirm).
    return df.assign(url_segment=df.url_clean.apply(segment_url))

# Run `segmenter` through the GoFaster parallelizer and rebind `urls` to the result.
# NOTE(review): this cell has no execution count in the saved notebook.
urls = gf.parallelize(urls, segmenter)

In [9]:
# First raw URL in the frame; it is the input of the segmentation probe that follows.
urls.url.iloc[0]

'http://vioz.org'

In [8]:
# Segment the raw (uncleaned) first URL. In the recorded output the host and
# the TLD are split across the dot as 'vio' + 'zorg'.
ws.segment(urls.url.iloc[0])

['http', 'vio', 'zorg']

In [15]:
# Probe with a made-up URL; the recorded output splits it into four tokens.
ws.segment("https://www.who.www")

['https', 'www', 'who', 'www']

In [14]:
# Probe with a real URL; the recorded output leaves 'wwwgooglecom' as one unsplit token.
ws.segment("https://www.google.com")

['https', 'wwwgooglecom']

# Testing Grounds

In [None]:
# Load the sampled data under a separate name (`df`) for the experiments in this section.
df = load("data/urls_sampled.pkl")

In [None]:
def encoder(df):
    """Add a ``url_encode`` column by applying ``c_level_encoder`` to ``url_clean``.

    Args:
        df: DataFrame with a ``url_clean`` column.

    Returns:
        A new DataFrame with the columns of ``df`` plus ``url_encode``. The
        input frame is not modified.
    """
    # assign() builds a new frame instead of writing into `df`, which may be a
    # slice of the caller's DataFrame (presumably a partition — TODO confirm).
    return df.assign(url_encode=df.url_clean.apply(c_level_encoder))

In [None]:
# Encode into a scratch frame. This runs on `urls`, which must already carry
# the `url_clean` column that `encoder` reads, and not on `df`.
temp = gf.parallelize(urls, encoder)

In [None]:
# Preview the first rows of the encoded frame.
temp.head()

In [None]:
# NOTE(review): imports belong in the import cell at the top of the notebook.
from sklearn.model_selection import train_test_split

# NOTE(review): no visible cell creates a `vect` column on `df`; presumably it
# is already stored in data/urls_sampled.pkl — TODO confirm.
X = df.vect
y = df.target
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# The elements of X_train are stacked into one 2-D training matrix
# (assumes every `vect` entry is an equal-length array — TODO confirm).
model_mult = MultinomialNB().fit(np.vstack(X_train), y_train)

In [None]:
from sklearn.metrics import classification_report

# Predict on the held-out rows, stacked the same way as the training data.
y_pred = model_mult.predict(np.vstack(X_test))

In [None]:
# Text report comparing the predictions with the held-out labels.
print(classification_report(y_test, y_pred))

In [None]:
def count_non_ascii(s):
    """Return how many characters of ``s`` have a code point of 128 or above.

    Args:
        s: String to inspect.

    Returns:
        int: Number of non-ASCII characters; 0 for an empty string.
    """
    # A generator feeds sum() directly, so no intermediate lists are built.
    return sum(1 for ch in s if ord(ch) >= 128)

# Per-row count of non-ASCII characters in the raw URL.
df["n"] = df.url.apply(count_non_ascii)

In [None]:
# Total number of non-ASCII characters over all URLs in `df`.
np.sum(df.n)

In [None]:
def find_non_ascii(s):
    """Collect every character of ``s`` whose code point is above 127.

    Args:
        s: String to scan.

    Returns:
        list: The matching characters in order of appearance, duplicates
        included; empty when ``s`` contains none.
    """
    return [ch for ch in s if ord(ch) >= 128]

# Per-row list of the non-ASCII characters found in the raw URL.
df["non_ascii"] = df.url.apply(find_non_ascii)

In [None]:
# Rows with at least one non-ASCII character, shown with their count, characters and label.
df[df.n != 0][["n", "non_ascii", "target"]]

In [None]:
from urllib.parse import quote

In [None]:
# Print one URL next to its percent-encoded form. Every character in
# string.printable is marked safe, so only characters outside it get encoded.
# NOTE(review): 3507 is a hard-coded row position, and `string` is not imported
# in any visible cell (presumably provided by a star import) — TODO confirm.
print(df.url.iloc[3507], "\n")
print(quote(df.url.iloc[3507], safe=string.printable))

In [None]:
# Scratch check: str.rstrip() returns a copy with trailing whitespace removed.
a = "hello "
b = a.rstrip()

In [23]:
# Reference example: the recorded output shows how urlparse splits a URL into
# scheme, netloc, path, params, query and fragment.
urlparse("https://www.google.com/news/?cat=100")

ParseResult(scheme='https', netloc='www.google.com', path='/news/', params='', query='cat=100', fragment='')