In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from urllib.parse import urlparse
from tldextract import extract

from code.util import *
from code.preprocessor import *

# Rows sampled from each raw file: N0 rows receive target 0, N1 rows target 1.
N0, N1 = 45_000, 15_000

# Word-level vectorizer: vocabulary cap handed to the Tokenizer (num_words)
# and the padded sequence length (maxlen).
VOCAB, WVECT = 5_000, 20

# Character-level vectorizer: CVECT is passed to char_level_encoder and
# UCHARS to char_onehot.
CVECT, UCHARS = 100, 139

# Number of leading entries of `words` that become binary "mi<i>" features.
MI = 30

# Sampling

In [2]:
# Draw a reproducible random sample (random_state=11) of N0 / N1 rows from the
# two raw CSV files and renumber the sampled rows from 0.
url0 = (
    pd.read_csv("data/raw/url_0.csv", skiprows=0)
    .sample(n=N0, random_state=11)
    .reset_index(drop=True)
)

url1 = (
    pd.read_csv("data/raw/url_1.csv", skiprows=0)
    .sample(n=N1, random_state=11)
    .reset_index(drop=True)
)

In [3]:
# Label every sampled row: 0 for rows of url0, 1 for rows of url1 (int32).
# The Series are aligned on the 0..N-1 index produced by reset_index.
url0["target"] = pd.Series(np.zeros(N0, dtype=np.int32))
url1["target"] = pd.Series(np.ones(N1, dtype=np.int32))

In [4]:
# Stack the two labelled samples into one frame, keeping only the URL and its
# label. ignore_index=True gives the result a fresh 0..len-1 index; without it
# each half keeps its own 0-based labels, so the combined index contains
# duplicates and label-based lookups such as url.loc[0] return two rows.
url = pd.concat(
    [
        url0[["url", "target"]],
        url1[["url", "target"]],
    ],
    ignore_index=True,
)

In [5]:
# Persist the combined, labelled sample so it can be reloaded without
# re-sampling.
# NOTE(review): `save` arrives through a star import (code.util or
# code.preprocessor); presumably it pickles the object given the .pkl
# path — confirm.
save(url, "data/url_45_15.pkl")

# X, Y

In [6]:
# Model inputs (the URL column) and labels (the target column).
x0, y0 = url["url"], url["target"]

# Target Onehot

In [7]:
# One-hot encode the labels as int32 and store the resulting matrix on disk.
y_onehot = to_categorical(y0, dtype=np.int32)
np.save("data/xy/y_onehot.npy", y_onehot)

# Vectorizers

In [8]:
# Punctuation removed from each URL before tokenization. "/" is not listed
# because it is the token separator; per the Keras docs, filtered characters
# are replaced by the split string, so URLs end up split on all of these.
punct_filters = '!"#$%&()*+,-.:;<=>?@[\\]^_`{|}~'

# Word-level tokenizer limited to the VOCAB most frequent tokens.
tk = Tokenizer(num_words=VOCAB, filters=punct_filters, split="/")

In [9]:
# Learn the vocabulary from the URLs, map each URL to a list of token ids and
# force every list to exactly WVECT entries (pad_sequences defaults apply for
# padding and truncation).
tk.fit_on_texts(x0)
word_id_seqs = tk.texts_to_sequences(x0)
x_word = pad_sequences(word_id_seqs, maxlen=WVECT)
x_word.shape

(60000, 20)

In [10]:
# Store the padded word-id matrix for later use.
np.save("data/xy/x_word_45_15.npy", x_word)

# Character Level Vectorizer

In [11]:
# Encode every URL with the project's char_level_encoder (CVECT is forwarded
# as its extra argument) and stack the per-URL results into one 2-D array.
char_rows = url["url"].apply(char_level_encoder, args=(CVECT,))
x_char = np.vstack(char_rows)

## Onehot

In [12]:
# Sorted distinct values occurring anywhere in the encoded matrix, cast to int.
unique_chars = np.unique(x_char).astype(int)

In [13]:
# One-hot encode the character matrix with the project's char_onehot helper.
# NOTE(review): judging by the recorded output shape (60000, 13900), i.e.
# 60000 x (CVECT * UCHARS), the helper returns one flattened one-hot row per
# URL — confirm against its implementation.
x_char_onehot = char_onehot(x_char, unique_chars, UCHARS)
x_char_onehot.shape

(60000, 13900)

In [14]:
# Write the one-hot matrix as a compressed .npz archive. The array is passed
# positionally, so NumPy stores it under the default key "arr_0".
np.savez_compressed("data/xy/x_char_onehot_45_15.npz", x_char_onehot)

# Feature Engineering

In [4]:
# Reload the labelled URL sample written earlier in this notebook.
# NOTE(review): `load` is a star-imported project helper; if it unpickles,
# only use it on files produced by this project.
data = load("data/url_45_15.pkl")

In [2]:
# Indexable collection of words; the first MI entries become the "mi<i>"
# presence features below.
# NOTE(review): the file name suggests the top 100 words ranked by mutual
# information — confirm where mi_100.pkl is generated.
words = load("data/mi_100.pkl")

In [3]:
# Top-level domains treated as suspicious; a URL whose extracted suffix is in
# this list gets suspicious = 1.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated ('work' 'ryukyu' -> 'workryukyu'), which previously
# kept 'ryukyu' from ever matching. Duplicate entries were removed as well.
suspicious_tlds = [
    'country', 'stream', 'download', 'xin', 'gdn', 'racing',
    'jetzt', 'win', 'bid', 'vip', 'ren', 'kim', 'loan',
    'mom', 'party', 'review', 'trade', 'date', 'wang',
    'accountants', 'zip', 'cricket', 'link', 'work', 'gq',
    'science', 'tk', 'world', 'fit', 'ryukyu',
    'life', 'cloud', 'desi', 'okinawa', 'ooo', 'men',
    'click', 'top', 'cf', 'ml', 'ga',
]

In [6]:
# Lexical features computed from the raw URL string.
# `tokenize` is a star-imported project helper; its result is only used via
# len() and membership tests in this notebook.
data["tokens"] = data.url.apply(tokenize)
data["len_url"] = data.url.apply(len)
data["len_tok"] = data.tokens.apply(len)

# Occurrence counts of individual delimiters. Plain str.count is used on
# purpose: pandas' .str.count interprets its pattern as a regular expression,
# which would misread ".", "?" and similar characters.
# The dict order defines the column order and therefore the layout of the
# saved feature matrix — do not reorder.
delimiter_columns = {
    "slash": "/",
    "dbl_slash": "//",
    "dot": ".",
    "at": "@",
    "dash": "-",
    "qmark": "?",
    "amp": "&",
    "hash": "#",
    "perc": "%",
    "eq": "=",
    "colon": ":",
    "scolon": ";",
}
for column, delimiter in delimiter_columns.items():
    data[column] = data.url.apply(lambda url, d=delimiter: url.count(d))

data["scheme"] = data.url.apply(lambda url: urlparse(url).scheme)
data["netloc"] = data.url.apply(lambda url: urlparse(url).netloc)

# Extract the public suffix from the full URL rather than from `netloc`:
# urlparse leaves netloc empty when the URL has no "scheme://" prefix, which
# made `tld` empty and `suspicious` 0 for every such URL. tldextract accepts
# URLs with or without a scheme.
data["tld"] = data.url.apply(lambda url: extract(url).suffix)
data["suspicious"] = data.tld.apply(lambda tld: 1 if tld in suspicious_tlds else 0)

def non_ascii(url):
    """Return how many characters of ``url`` lie outside the ASCII range.

    A character counts as non-ASCII when its code point is greater than 127.
    """
    return sum(1 for ch in url if ord(ch) > 127)

# Number of non-ASCII characters in each URL.
data["non_ascii"] = data.url.apply(non_ascii)

# Binary presence flags for the first MI entries of `words`: column "mi<i>"
# is 1 when words[i] occurs among the URL's tokens, otherwise 0. The word is
# bound as a default argument so the lambda does not depend on the loop
# variable.
for i in range(MI):
    word = words[i]
    data["mi%d" % (i)] = data.tokens.apply(lambda tokens, w=word: 1 if w in tokens else 0)

# One indicator column per URL scheme, with the first level dropped.
# dtype is pinned to uint8: pandas >= 2.0 makes get_dummies return bool
# columns by default, and bool columns next to the integer features turn
# data.values into an object array that np.save can only write via pickle.
dummies = pd.get_dummies(data.scheme, drop_first=True, dtype=np.uint8)
data = pd.concat([data, dummies], axis=1)

In [8]:
# Columns that must not reach the feature matrix: the label, the raw URL and
# the intermediate helper columns derived from it.
drop = ["target", "url", "tokens", "scheme", "netloc", "tld"]

data = data.drop(columns=drop)

In [10]:
# Store the remaining feature columns as a plain NumPy matrix; column order
# follows the DataFrame's column order at this point.
np.save("data/xy/x_feat_45_15.npy", data.values)