In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from urllib.parse import urlparse
from tldextract import extract

import wordsegment as ws
ws.load()

from code.util import *
from code.preprocessor import *
from code.gofaster import *

# Project helper used later in this notebook as `gf.parallelize(data, worker)`.
# NOTE(review): the meaning of the two constructor arguments (11, 22) is not
# visible here — confirm against code.gofaster.
gf = GoFaster(11, 22)

# Number of rows sampled from data/raw/url_0.csv (labelled target 0).
N0 = 45000
# Number of rows sampled from data/raw/url_1.csv (labelled target 1).
N1 = 15000
# `num_words` passed to both Keras Tokenizer instances.
VOCAB = 5000
# Extra argument forwarded to char_level_encoder.
# NOTE(review): presumably the fixed per-URL character length — confirm.
CVECT = 100
# `maxlen` passed to pad_sequences for the word and segment sequences.
WVECT = 20
# Third argument passed to char_onehot.
# NOTE(review): the recorded output of np.unique(x_char) has 140 entries while
# this constant is 139 — confirm the difference is intentional.
UCHARS = 139
# Number of mi<i> indicator columns built from the first entries of `words`.
MI = 30

# Sampling

In [2]:
# Draw a seeded (random_state=11) sample of N0 / N1 rows from each raw CSV and
# renumber the sampled rows from 0 so the original row labels are discarded.
url0 = (
    pd.read_csv("data/raw/url_0.csv", skiprows=0)
    .sample(n=N0, random_state=11)
    .reset_index(drop=True)
)

url1 = (
    pd.read_csv("data/raw/url_1.csv", skiprows=0)
    .sample(n=N1, random_state=11)
    .reset_index(drop=True)
)

In [3]:
# Label every row: 0 for the url_0 sample, 1 for the url_1 sample.
# Each Series carries a default 0..N-1 index, which lines up with the frames'
# freshly reset index when it is assigned as a column.
url0["target"] = pd.Series(np.zeros(N0, dtype=np.int32))
url1["target"] = pd.Series(np.ones(N1, dtype=np.int32))

In [4]:
# Stack the two labelled samples into one frame holding only the columns used
# downstream. ignore_index=True renumbers the rows 0..len-1; without it the
# labels 0..N1-1 would each appear twice (once per source frame), which makes
# label-based lookups and index-aligned operations on the saved frame ambiguous.
url = pd.concat(
    [
        url0[["url", "target"]],
        url1[["url", "target"]],
    ],
    ignore_index=True,
)

In [5]:
# Persist the combined sample with the project helper `save`; later sections of
# this notebook reload it from the same path via `load`.
save(url, "data/url_45_15.pkl")

# X, Y

In [6]:
# Split the frame into the raw URL strings and their integer labels.
x0 = url["url"]
y0 = url["target"]

# Target One-hot

In [7]:
# Convert the integer labels to a one-hot matrix with Keras' to_categorical
# (int32 output) and write it to disk as a .npy file.
y_onehot = to_categorical(y0, dtype=np.int32)
np.save("data/xy/y_onehot_45_15.npy", y_onehot)

# Vectorizers

In [20]:
# Word-level tokenizer for raw URLs: num_words is capped by VOCAB and the
# text is split on "/". The characters in `filters` are removed by the
# tokenizer; note that "/" is not in that list — it is the split character.
tk = Tokenizer(
    num_words=VOCAB,
    filters='!"#$%&()*+,-.:;<=>?@[\\]^_`{|}~',
    split="/"
)

In [9]:
# Learn the vocabulary from the URLs, map each URL to its integer token
# sequence, then pad/truncate every sequence to WVECT entries.
tk.fit_on_texts(x0)
word_sequences = tk.texts_to_sequences(x0)
x_word = pad_sequences(word_sequences, maxlen=WVECT)
x_word.shape

(60000, 20)

In [10]:
# Write the padded word-index matrix to disk as a .npy file.
np.save("data/xy/x_word_45_15.npy", x_word)

# Character-Level Vectorizer

In [3]:
# Reload the combined URL sample written earlier in this notebook.
# NOTE(review): `load` is a project helper; the .pkl extension suggests pickle,
# so only point this at files you trust — confirm against code.util.
url = load("data/url_45_15.pkl")

In [4]:
# Encode every URL with the project's char_level_encoder (CVECT is forwarded as
# its extra positional argument), then stack the per-URL results row by row.
encoded_urls = url["url"].apply(char_level_encoder, args=(CVECT,))
x_char = np.vstack(encoded_urls)

In [5]:
# Inspect the encoded matrix (NumPy abbreviates large arrays when displayed).
x_char

array([[104, 116, 116, ...,   0,   0,   0],
       [104, 116, 116, ...,   0,   0,   0],
       [104, 116, 116, ...,   0,   0,   0],
       ...,
       [104, 116, 116, ..., 117, 115, 116],
       [104, 116, 116, ...,   0,   0,   0],
       [104, 116, 116, ...,   0,   0,   0]])

## One-hot

In [7]:
# Sorted distinct integer codes that occur anywhere in x_char; the cell
# displays how many there are.
unique_chars = np.unique(x_char).astype(int)
len(unique_chars)

140

In [8]:
# One-hot encode the character codes with the project helper char_onehot and
# display the resulting shape.
# NOTE(review): the recorded unique-code count is 140 while UCHARS is 139, and
# the recorded output width 13900 equals 100 * 139 — presumably one code
# (possibly the padding value 0) is deliberately left out; confirm in the
# helper's implementation.
x_char_onehot = char_onehot(x_char, unique_chars, UCHARS)
x_char_onehot.shape

(60000, 13900)

In [11]:
# Sanity check: list the distinct values found in one encoded row.
np.unique(x_char_onehot[1])

array([0., 1.])

In [14]:
# Write the one-hot matrix as a compressed .npz archive. The array is passed
# positionally, so NumPy stores it under the default name "arr_0".
np.savez_compressed("data/xy/x_char_onehot_45_15.npz", x_char_onehot)

# Feature Engineering

In [5]:
# Start feature engineering from the saved URL sample (columns: url, target).
data = load("data/url_45_15.pkl")

In [6]:
# Indexable collection of tokens; its first MI entries become the mi<i>
# indicator columns below.
# NOTE(review): the file name suggests the top 100 tokens by mutual
# information — confirm how data/mi_100.pkl was produced.
words = load("data/mi_100.pkl")

In [7]:
# TLD suffixes treated as suspicious; each URL's tldextract suffix is tested
# for membership in this list to build the "suspicious" feature.
#
# Fix: the original list contained `'work' 'ryukyu'` with no comma between the
# two literals. Python concatenates adjacent string literals, so the list held
# the single bogus entry 'workryukyu' and 'ryukyu' never matched. The duplicate
# 'work' and 'loan' entries were removed as well (membership is unaffected).
suspicious_tlds = [
    'country', 'stream', 'download', 'xin', 'gdn', 'racing',
    'jetzt', 'win', 'bid', 'vip', 'ren', 'kim', 'loan',
    'mom', 'party', 'review', 'trade', 'date', 'wang',
    'accountants', 'zip', 'cricket', 'link', 'work', 'gq',
    'science', 'tk', 'world', 'fit', 'ryukyu',
    'life', 'cloud', 'desi', 'okinawa', 'ooo', 'men',
    'click', 'top', 'cf', 'ml', 'ga',
]

In [8]:
# Tokenise each URL and record the URL length and the token count.
data["tokens"] = data["url"].apply(tokenize)
data["len_url"] = data["url"].apply(len)
data["len_tok"] = data["tokens"].apply(len)

# Occurrence counts of individual substrings, one column per entry. The dict's
# insertion order fixes the order in which the columns are added.
substring_counts = {
    "slash": "/",
    "dbl_slash": "//",
    "dot": ".",
    "at": "@",
    "dash": "-",
    "qmark": "?",
    "amp": "&",
    "hash": "#",
    "perc": "%",
    "eq": "=",
    "colon": ":",
    "scolon": ";",
}
for column, needle in substring_counts.items():
    data[column] = data["url"].apply(lambda url, needle=needle: url.count(needle))

# URL components from urlparse, the public suffix from tldextract, and a 0/1
# flag for suffixes listed in suspicious_tlds.
data["scheme"] = data["url"].apply(lambda url: urlparse(url).scheme)
data["netloc"] = data["url"].apply(lambda url: urlparse(url).netloc)
data["tld"] = data["netloc"].apply(lambda netloc: extract(netloc).suffix)
data["suspicious"] = data["tld"].apply(lambda tld: int(tld in suspicious_tlds))

def non_ascii(url):
    """Return how many characters of ``url`` fall outside the ASCII range.

    A character is counted when its code point is greater than 127.
    """
    return sum(1 for ch in url if ord(ch) > 127)

# Number of non-ASCII characters per URL.
data["non_ascii"] = data.url.apply(non_ascii)

# One 0/1 indicator column per word: mi<i> is 1 when words[i] occurs among the
# URL's tokens. The word is looked up once per column instead of once per row.
for i in range(MI):
    word = words[i]
    data["mi%d" % i] = data.tokens.apply(
        lambda tokens, word=word: 1 if word in tokens else 0
    )

# One-hot encode the URL scheme, dropping the first category. The dtype is
# pinned to uint8 (the default before pandas 2.0): newer pandas defaults to
# bool, and mixing bool columns with the integer features turns `data.values`
# into an object array instead of a numeric matrix.
dummies = pd.get_dummies(data.scheme, drop_first=True, dtype=np.uint8)
data = pd.concat([data, dummies], axis=1)

In [9]:
# Remove the label, the raw URL and the intermediate text columns so that only
# the engineered feature columns remain in `data`.
drop = ["target", "url", "tokens", "scheme", "netloc", "tld"]

data = data.drop(columns=drop)

In [10]:
# Shape of the feature matrix that is about to be saved: (rows, features).
data.values.shape

(60000, 50)

In [20]:
# Write the engineered feature matrix to disk as a .npy file.
np.save("data/xy/x_feat_45_15.npy", data.values)

# Segments

In [2]:
# Reload the saved URL sample for the segmentation section; this rebinds
# `data`, replacing the feature frame built above.
data = load("data/url_45_15.pkl")

In [4]:
def segment(tokens):
    """Split every token into words and return them as one string.

    Each token is passed through ``ws.segment``; the resulting words are
    joined with single spaces, and the per-token strings are joined with
    single spaces in turn.
    """
    return " ".join(" ".join(ws.segment(token)) for token in tokens)

In [11]:
def worker(df):
    """Add a ``segments`` column to ``df`` and return the same frame.

    The column holds ``segment`` applied to each row's ``tokens`` value. The
    frame is modified in place; it is returned so the function can be handed
    to ``gf.parallelize``.
    """
    df["segments"] = df["tokens"].apply(segment)
    return df

In [9]:
# Tokenise every URL with the project helper `tokenize`; `worker` reads this
# column when it builds the segments.
data["tokens"] = data.url.apply(tokenize)

In [12]:
# Run `worker` over the frame through the GoFaster helper and keep the result.
# NOTE(review): presumably this splits `data` into chunks, processes them in
# parallel and concatenates the pieces — confirm against code.gofaster.
data = gf.parallelize(data, worker)

In [18]:
# Preview the tokens next to their segmented form for the first 30 rows.
data[["tokens", "segments"]].head(30)

Unnamed: 0,tokens,segments
0,"[http, www, meigaoyi, com]",http www mei gao yi com
1,"[http, www, cybertruffle, org, uk, vinales, en...",http www cyber truffle org uk vina les eng sch...
2,"[http, prague, tv, funny, pictures, archive]",http prague tv funny pictures archive
3,"[http, www, f1autographs, com]",http www f1 autographs com
4,"[http, members3, boardhost, com, ratterriers]",http members 3 boardhost com rat terriers
5,"[http, octts, com]",http oct ts com
6,"[http, www, consortiuminfo, org]",http www consortium info org
7,"[http, www, hmbreview, com]",http www hmb review com
8,"[http, www, slopezphotography, com]",http www s lopez photography com
9,"[http, www, metacritic, com, video, titles, ti...",http www metacritic com video titles timecode


In [24]:
# Fresh tokenizer for the space-joined segment strings: only num_words is set,
# every other option keeps its Keras default. Rebinds `tk`, replacing the
# word-level tokenizer created earlier in the notebook.
tk = Tokenizer(num_words=VOCAB)

In [25]:
# Learn the vocabulary from the segment strings, map each string to its integer
# sequence, then pad/truncate every sequence to WVECT entries.
tk.fit_on_texts(data.segments)
segment_sequences = tk.texts_to_sequences(data.segments)
x_segments = pad_sequences(segment_sequences, maxlen=WVECT)
x_segments.shape

(60000, 20)

In [26]:
# Write the padded segment-index matrix to disk as a .npy file.
np.save("data/xy/x_segments_45_15.npy", x_segments)