In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from urllib.parse import urlparse
from tldextract import extract

from code.util import *
from code.preprocessor import *
from code.gofaster import *

In [2]:
# Load the sampled URL dataset; later cells read its `url` and `target` columns.
# NOTE(review): `load` arrives via one of the star imports above and, given the
# .pkl extension, presumably unpickles — only point it at trusted files.
data = load("data/urls_sampled.pkl")

In [3]:
# Vocabulary cap handed to the Keras Tokenizer as `num_words`.
MAX_WORDS = 4000
# How many leading entries of `words` become binary "mi<i>" feature columns.
MI_WORDS = 30
# `maxlen` used when padding/truncating the word-index sequences.
MAX_SENTENCE_LEN = 24

# Vectorizers

In [4]:
# Word-level tokenizer for URLs: tokens are split on "/" and the vocabulary is
# capped at MAX_WORDS. The punctuation listed in `filters` is removed from the
# text before splitting; "/" is not in that list, presumably because it is the
# split character itself.
tk = Tokenizer(
    num_words=MAX_WORDS,
    filters='!"#$%&()*+,-.:;<=>?@[\\]^_`{|}~',
    split="/"
)

In [5]:
# Raw inputs (URL strings) and labels, taken straight from the dataset columns.
x0 = data.url
y0 = data.target

In [6]:
# One-hot encode the labels as int32 and persist them to disk.
y_onehot = to_categorical(y0, dtype=np.int32)
# Save the encoded labels themselves: the whole `data` DataFrame used to be
# written to this path, so "y_onehot.npy" did not contain the labels at all.
np.save("data/y_onehot.npy", y_onehot)

In [7]:
# Learn the vocabulary from the URLs, then turn each URL into a sequence of
# word indices padded/truncated to MAX_SENTENCE_LEN entries.
tk.fit_on_texts(x0)
word_sequences = tk.texts_to_sequences(x0)
x_word = pad_sequences(word_sequences, maxlen=MAX_SENTENCE_LEN)

# Persist the word-level matrix.
np.save("data/x_word.npy", x_word)

# Character Level Vectorizer

In [11]:
# Encode every URL at character level, then stack the per-URL results
# vertically into one matrix.
# NOTE(review): presumably one row per URL — depends on the shape returned by
# `char_level_encoder`; confirm.
encoded_urls = data.url.apply(char_level_encoder)
x_char = np.vstack(encoded_urls)

## Onehot

In [28]:
# Sorted set of character codes that occur in the character-level matrix.
# This must be computed from `x_char` (the encoded matrix), not from `data`,
# which at this point is the URL DataFrame and cannot be cast to int.
unique_chars = np.unique(x_char).astype(int)

In [29]:
# Dimensions of the character-level encoding. They are read from the encoded
# matrix `x_char`; `data` is the URL DataFrame, whose column count is unrelated
# to the number of encoded positions.
zlen = x_char.shape[0]        # number of rows in the encoded matrix
ylen = x_char.shape[1]        # encoded positions per row
xlen = unique_chars.shape[0]  # number of distinct character codes

In [30]:
def char_onehot(data, alphabet=None):
    """One-hot encode a 2-D matrix of character codes.

    Each position ``j`` of a row owns a contiguous slot of ``len(alphabet)``
    columns in the output; inside that slot the column matching the code found
    at ``data[i][j]`` is set to 1. Codes that do not occur in ``alphabet``
    leave their slot all-zero.

    Parameters
    ----------
    data : array-like, shape (n_rows, n_positions)
        Matrix of character codes.
    alphabet : array-like, optional
        The distinct codes to encode against. Defaults to the module-level
        ``unique_chars`` so existing ``char_onehot(x)`` calls keep working.

    Returns
    -------
    numpy.ndarray of float64, shape (n_rows, n_positions * len(alphabet))

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional.
    """
    codes = np.asarray(data)
    if codes.ndim != 2:
        raise ValueError("char_onehot expects a 2-D matrix of character codes")

    vocab = np.asarray(unique_chars if alphabet is None else alphabet).ravel()

    # Dimensions come from the argument itself rather than from notebook
    # globals, so the function encodes whatever matrix it is given.
    n_rows, n_positions = codes.shape
    n_symbols = vocab.shape[0]

    # Broadcasting compares every cell against every symbol at once, giving a
    # (rows, positions, symbols) mask. Flattening the last two axes yields a
    # per-position stride of n_symbols. The old loop offset slots by the number
    # of positions instead, which made slots overlap or run out of bounds
    # whenever the two sizes differed.
    matches = codes[:, :, None] == vocab[None, None, :]
    return matches.reshape(n_rows, n_positions * n_symbols).astype(np.float64)

In [31]:
# One-hot encode the character-level matrix. The argument has to be the
# encoded matrix `x_char`; `data` is the URL DataFrame and holds no character
# codes.
x_char_onehot = char_onehot(x_char)

In [33]:
# Write the one-hot matrix to a compressed .npz archive. The array is passed
# positionally, so NumPy stores it under the default key "arr_0".
np.savez_compressed("data/x_char_onehot.npz", x_char_onehot)

# Feature Engineering

In [8]:
# Word list backing the "mi<i>" token-presence features; only the first
# MI_WORDS entries are used.
# NOTE(review): presumably words ranked by mutual information (file name
# "mi_100") — confirm. `load` likely unpickles, so only use trusted files.
words = load("data/mi_100.pkl")

In [9]:
# TLD suffixes treated as suspicious; a URL whose suffix appears here gets the
# binary `suspicious` feature set to 1.
# A missing comma used to merge 'work' and 'ryukyu' into the single entry
# 'workryukyu', so 'ryukyu' never matched. The duplicate 'loan' and 'work'
# entries are removed as well.
suspicious_tlds = ['country','stream','download','xin','gdn','racing', 
                   'jetzt','win','bid','vip', 'ren', 'kim', 'loan',
                   'mom', 'party', 'review', 'trade', 'date', 'wang',
                   'accountants', 'zip','cricket','link','work','gq',
                   'science','tk', 'world', 'fit', 'ryukyu',
                   'life', 'cloud', 'desi', 'okinawa', 'ooo','men',
                   'click', 'top', 'cf', 'ml', 'ga']

In [10]:
# --- Length features --------------------------------------------------------
data["tokens"] = data["url"].apply(tokenize)
data["len_url"] = data["url"].apply(len)
data["len_tok"] = data["tokens"].apply(len)

# --- Delimiter counts -------------------------------------------------------
# (column name, substring counted in the URL); the tuple order fixes the
# column order of the exported feature matrix.
delimiter_features = (
    ("slash", "/"),
    ("dbl_slash", "//"),
    ("dot", "."),
    ("at", "@"),
    ("dash", "-"),
    ("qmark", "?"),
    ("amp", "&"),
    ("hash", "#"),
    ("perc", "%"),
    ("eq", "="),
    ("colon", ":"),
    ("scolon", ";"),
)
for col_name, symbol in delimiter_features:
    # Bind `symbol` as a default so each lambda counts its own substring.
    data[col_name] = data["url"].apply(lambda url, symbol=symbol: url.count(symbol))

# --- URL components ---------------------------------------------------------
data["scheme"] = data["url"].apply(lambda url: urlparse(url).scheme)
data["netloc"] = data["url"].apply(lambda url: urlparse(url).netloc)
data["tld"] = data["netloc"].apply(lambda netloc: extract(netloc).suffix)

# 1 when the extracted suffix is on the suspicious list, else 0.
data["suspicious"] = data["tld"].apply(lambda tld: int(tld in suspicious_tlds))

def non_ascii(url):
    """Return how many characters of ``url`` have a code point above 127."""
    return sum(1 for ch in url if ord(ch) > 127)

# Number of non-ASCII characters in each URL.
data["non_ascii"] = data.url.apply(non_ascii)

# Binary presence flag per URL for each of the first MI_WORDS entries of
# `words`: 1 when the word is among the URL's tokens, else 0.
for i in range(MI_WORDS):
    word = words[i]  # looked up once per column instead of once per row
    data["mi%d" % (i)] = data.tokens.apply(lambda tokens, word=word: 1 if word in tokens else 0)

# One-hot encode the URL scheme (first category dropped). The dtype is pinned
# to uint8 because pandas >= 2.0 defaults to bool; bool columns mixed with the
# integer features make `data.values` fall back to an object array, which
# np.save can only store via pickle.
dummies = pd.get_dummies(data.scheme, drop_first=True, dtype=np.uint8)
data = pd.concat([data, dummies], axis=1)

In [11]:
# Columns removed before the feature matrix is exported: the label ("target")
# and the raw/intermediate text columns the engineered features were derived
# from.
drop = [
    "target",
    "url",
    "tokens",
    "scheme",
    "netloc",
    "tld"
]

In [12]:
# Strip the label and helper text columns, keeping only the engineered
# features. `columns=` already targets the column axis, so no `axis` argument
# is needed.
data = data.drop(columns=drop)

In [14]:
# Persist the remaining feature columns as a single NumPy array.
# NOTE(review): this assumes every remaining column is numeric; a non-numeric
# or bool column would make `data.values` an object array — confirm the dtype.
np.save("data/x_feat.npy", data.values)