In [2]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from urllib.parse import urlparse
from tldextract import extract
from numba import jit

import wordsegment as ws
ws.load()

from code.util import *
from code.preprocessor import *
from code.gofaster import *

# Project helper from code.gofaster; used further down as gf.parallelize(data, worker).
# NOTE(review): the meaning of the two arguments (11, 22) is not visible here — confirm.
gf = GoFaster(11, 22)

# Number of rows sampled from data/raw/url_0.csv (labelled target 0).
N0 = 45000
# Number of rows sampled from data/raw/url_1.csv (labelled target 1).
N1 = 15000
# num_words passed to the Keras Tokenizer instances.
VOCAB = 10000
# Extra argument handed to char_level_encoder for raw URLs; the recorded
# x_char shape is (60000, 100), i.e. one row of CVECT codes per URL.
CVECT = 100
# maxlen used by pad_sequences for the word and segment sequences.
WVECT = 20
# Third argument to char_onehot; the recorded one-hot width is CVECT * UCHARS.
UCHARS = 139
# NOTE(review): MI is not referenced anywhere in the visible notebook — confirm it is still needed.
MI = 30

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Sampling

In [None]:
# Draw a fixed-size, seeded random sample from each raw CSV and renumber
# the sampled rows from 0.
url0 = (
    pd.read_csv("data/raw/url_0.csv", skiprows=0)
    .sample(n=N0, random_state=11)
    .reset_index(drop=True)
)

url1 = (
    pd.read_csv("data/raw/url_1.csv", skiprows=0)
    .sample(n=N1, random_state=11)
    .reset_index(drop=True)
)

In [None]:
# Label every row of the first sample with 0 and every row of the second
# with 1 (int32), aligned on the freshly reset 0-based index.
url0["target"] = pd.Series(np.zeros(N0, dtype=np.int32))
url1["target"] = pd.Series(np.ones(N1, dtype=np.int32))

In [None]:
# Stack the target-0 and target-1 samples into a single two-column frame.
# ignore_index=True renumbers the rows 0..len-1; without it each input keeps
# its own 0-based index, so index labels repeat in the combined (and
# pickled) frame and any later label-based lookup/alignment is ambiguous.
url = pd.concat(
    [
        url0[["url", "target"]],
        url1[["url", "target"]],
    ],
    ignore_index=True,
)

In [None]:
# Persist the combined, labelled sample; later sections reload this file
# via load(). save() is provided by the star-imported project helpers.
save(url, "data/url_45_15.pkl")

# X, Y

In [None]:
# Split the frame into model inputs (URL strings) and labels (0/1 target).
x0 = url.url
y0 = url.target

# Target Onehot

In [None]:
# One-hot encode the labels with Keras (int32 output) and write them to disk.
y_onehot = to_categorical(y0, dtype=np.int32)
np.save("data/xy/y_onehot_45_15.npy", y_onehot)

# Vectorizers

In [None]:
# Word-level tokenizer for raw URLs: "/" is the token separator, the
# punctuation listed in `filters` is stripped, and the vocabulary is
# capped through num_words=VOCAB.
tk = Tokenizer(
    num_words=VOCAB,
    filters='!"#$%&()*+,-.:;<=>?@[\\]^_`{|}~',
    split="/"
)

In [None]:
# Learn the vocabulary from the URLs, convert each URL to a sequence of
# token ids, and pad/truncate every sequence to WVECT entries.
tk.fit_on_texts(x0)
x_word = pad_sequences(tk.texts_to_sequences(x0), maxlen=WVECT)
# Displayed as the cell result to check the matrix dimensions.
x_word.shape

In [None]:
# Store the padded word-id matrix for the modelling notebooks.
np.save("data/xy/x_word_45_15_10000.npy", x_word)

# Character Level Vectorizer

In [2]:
# Reload the sampled URLs written by the Sampling section, so this section
# can start from the saved file instead of the in-memory frame.
url = load("data/url_45_15.pkl")

In [3]:
# Run the project's char_level_encoder on every URL (CVECT is passed as its
# extra argument) and stack the per-URL results into one 2-D array.
x_char = np.vstack(url.url.apply(char_level_encoder, args=(CVECT,)))

In [4]:
# Display the encoded matrix (NumPy abbreviates it) as a quick sanity check.
x_char

array([[104, 116, 116, ...,   0,   0,   0],
       [104, 116, 116, ...,   0,   0,   0],
       [104, 116, 116, ...,   0,   0,   0],
       ...,
       [104, 116, 116, ..., 117, 115, 116],
       [104, 116, 116, ...,   0,   0,   0],
       [104, 116, 116, ...,   0,   0,   0]])

In [5]:
# Expect one row per URL and CVECT columns.
x_char.shape

(60000, 100)

## Onehot

In [6]:
# Sorted distinct character codes that occur anywhere in x_char.
unique_chars = np.unique(x_char).astype(int)
# Displayed as the cell result: how many distinct codes were found.
len(unique_chars)

140

In [7]:
# One-hot encode the character codes with the project's char_onehot helper.
# NOTE(review): the recorded run found 140 distinct codes but UCHARS is 139;
# presumably one code (e.g. the 0 padding value) is intentionally not
# given its own slot — confirm against char_onehot.
x_char_onehot = char_onehot(x_char, unique_chars, UCHARS)
# Recorded width is CVECT * UCHARS columns per URL.
x_char_onehot.shape

(60000, 13900)

In [8]:
# Write the one-hot matrix as a compressed .npz archive; the array is passed
# positionally, so NumPy stores it under its default key "arr_0".
np.savez_compressed("data/xy/x_char_onehot_45_15.npz", x_char_onehot)

# Feature Engineering

In [51]:
# Start feature engineering from the saved URL sample (columns: url, target).
data = load("data/url_45_15.pkl")

In [52]:
# Words loaded from disk; only those with at least 3 characters are kept.
# (The original filtered an undefined name `words` instead of the list that
# was just loaded, which only worked with leftover kernel state.)
# NOTE(review): "mi" presumably stands for mutual information — confirm.
mi_words = [word for word in load("data/mi_100.pkl") if len(word) >= 3]

# Hand-picked substrings used as keyword features.
suspicious_words = [
    "confirm", "account", "banking", "secure", "ebayisapi", "webscr",
    "login", "signin", "exe", "zip",
]

# Brand / company names used as keyword features.
companies = [
    "ebay", "paypal", "bankofscotland", "volksbank", "wellsfargo",
    "bankofamerica", "privatebanking", "hsbc", "chase", "amazon",
    "banamex", "barclays",
]

# De-duplicated union of all keyword sources. sorted() pins the order: the
# position of a keyword decides which kw<i> feature column it becomes, and
# plain set iteration order for strings changes between interpreter runs.
keywords = sorted(set(mi_words + suspicious_words + companies))

In [53]:
# Top-level domains flagged by the "suspicious" feature.
# Fixes a missing comma that silently merged 'work' and 'ryukyu' into the
# single entry 'workryukyu' (implicit string concatenation), and removes
# the repeated 'work' / 'loan' entries.
suspicious_tlds = [
    'country', 'stream', 'download', 'xin', 'gdn', 'racing',
    'jetzt', 'win', 'bid', 'vip', 'ren', 'kim', 'loan',
    'mom', 'party', 'review', 'trade', 'date', 'wang',
    'accountants', 'zip', 'cricket', 'link', 'work', 'gq',
    'science', 'tk', 'world', 'fit', 'ryukyu',
    'life', 'cloud', 'desi', 'okinawa', 'ooo', 'men',
    'click', 'top', 'cf', 'ml', 'ga',
]

In [54]:
@jit
def non_ascii(url):
    """Return the number of characters in ``url`` whose code point exceeds 127."""
    count = 0
    for ch in url:
        # Plain ASCII characters do not contribute to the tally.
        if ord(ch) <= 127:
            continue
        count += 1
    return count

In [55]:
def contains_ip(url):
    """Return 1 if ``url`` contains a dotted-quad-looking substring, else 0.

    The check is purely lexical: four groups of one to three digits separated
    by dots. Octet ranges are not validated, so "999.999.999.999" also counts.

    Parameters
    ----------
    url : str
        Text to scan.

    Returns
    -------
    int
        1 when at least one match exists, otherwise 0.
    """
    # Raw string: "\d" and "\." are invalid escapes in a normal string literal
    # and trigger a warning on current Python versions.
    pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
    # Only existence matters, so stop at the first match instead of
    # collecting every match with findall.
    return 1 if re.search(pattern, url) else 0

In [56]:
# Build the hand-crafted feature columns, one value per URL. Column order is
# significant: the frame is later exported with data.values.
data["tokens"] = data.url.apply(tokenize)
data["len_url"] = data.url.apply(len)
data["len_tok"] = data.tokens.apply(len)

# (column name, substring) pairs: each column holds str.count(substring) for
# the raw URL. Kept in the original column order.
substring_counts = [
    ("slash", "/"),
    ("dbl_slash", "//"),
    ("dot", "."),
    ("at", "@"),
    ("at2", "%40"),
    ("dash", "-"),
    ("qmark", "?"),
    ("amp", "&"),
    ("hash", "#"),
    ("perc", "%"),
    ("eq", "="),
    ("colon", ":"),
    ("scolon", ";"),
]
for column, needle in substring_counts:
    # Bind `needle` as a default argument so the lambda does not depend on
    # the loop variable's later values.
    data[column] = data.url.apply(lambda url, needle=needle: url.count(needle))

# URL structure: scheme and network location via urlparse, public suffix via
# tldextract. These helper columns are dropped again before export.
data["scheme"] = data.url.apply(lambda url: urlparse(url).scheme)
data["netloc"] = data.url.apply(lambda url: urlparse(url).netloc)
data["tld"] = data.netloc.apply(lambda netloc: extract(netloc).suffix)

# Sets are built once here instead of once per row.
suspicious_tld_set = set(suspicious_tlds)
keyword_set = set(keywords)

data["suspicious"] = data.tld.apply(lambda tld: 1 if tld in suspicious_tld_set else 0)
data["non_ascii"] = data.url.apply(non_ascii)
data["contains_ip"] = data.url.apply(contains_ip)
data["kw_count"] = data.tokens.apply(lambda tokens: len(set(tokens) & keyword_set))

# One 0/1 indicator column per keyword (kw0, kw1, ...). All columns are
# assembled in a single frame and attached with one concat; inserting ~100
# columns one at a time fragments the DataFrame and triggers pandas'
# PerformanceWarning.
kw_flags = pd.DataFrame(
    {
        "kw%d" % i: [1 if keyword in tokens else 0 for tokens in data.tokens]
        for i, keyword in enumerate(keywords)
    },
    index=data.index,
)

# Indicator columns for the URL scheme (first category dropped). The dtype is
# pinned to uint8: newer pandas defaults to bool, which would turn
# data.values into an object array when mixed with the integer features.
dummies = pd.get_dummies(data.scheme, drop_first=True, dtype=np.uint8)

data = pd.concat([data, kw_flags, dummies], axis=1)

In [57]:
# Columns that are not numeric model features: the label, the raw text, and
# the intermediate helper columns derived from it.
drop = ["target", "url", "tokens", "scheme", "netloc", "tld"]

# `columns=` already identifies the axis, so no separate axis argument is needed.
data = data.drop(columns=drop)

In [58]:
# Dimensions of the exported feature matrix: (rows, feature columns).
data.values.shape

(60000, 118)

In [59]:
# Export the engineered features as a plain NumPy array (column names are lost).
np.save("data/xy/x_feat_45_15.npy", data.values)

# Segments

In [None]:
# Reload the saved URL sample; this rebinds `data`, discarding the feature
# frame built in the previous section.
data = load("data/url_45_15.pkl")

In [None]:
def segment(tokens):
    """Split every token into words with wordsegment and join them all.

    Each token is passed to ``ws.segment``; the resulting words are joined
    with single spaces, and the per-token strings are in turn joined with
    single spaces into one string.
    """
    return " ".join(" ".join(ws.segment(token)) for token in tokens)

In [None]:
def worker(df):
    """Return ``df`` with an added ``segments`` column.

    ``segments`` holds the result of ``segment`` applied to each row's
    ``tokens`` value. A new frame is returned rather than writing into
    ``df``: the function is handed chunks by ``gf.parallelize``, and
    assigning a column on a chunk that is a slice of another frame is the
    classic SettingWithCopy hazard.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or chunk of one) with a ``tokens`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` including the ``segments`` column.
    """
    return df.assign(segments=df.tokens.apply(segment))

In [None]:
# Tokenize every URL with the project's tokenize helper; worker() reads this column.
data["tokens"] = data.url.apply(tokenize)

In [None]:
# Run worker() over the frame through the GoFaster helper and keep its result.
# NOTE(review): presumably this splits `data` into chunks processed in
# parallel and concatenates them — confirm in code.gofaster.
data = gf.parallelize(data, worker)

In [None]:
# Eyeball the first 30 rows to compare the raw tokens with their segmented form.
data[["tokens", "segments"]].head(30)

In [None]:
# Fresh tokenizer for the segmented text, using Keras' default filters and
# separator; only the vocabulary cap is customised.
tk = Tokenizer(num_words=VOCAB)

In [None]:
# Learn the vocabulary from the segmented strings, map each string to a
# sequence of word ids, and pad/truncate to WVECT entries.
tk.fit_on_texts(data.segments)
x_segments = pad_sequences(tk.texts_to_sequences(data.segments), maxlen=WVECT)
# Displayed as the cell result to check the matrix dimensions.
x_segments.shape

In [None]:
# Store the padded segment-id matrix.
np.save("data/xy/x_segments_45_15.npy", x_segments)

# MD5

In [3]:
# Reload the saved URL sample as the starting point for the MD5 features.
data = load("data/url_45_15.pkl")

In [4]:
import hashlib

In [6]:
# Hex MD5 digest of each URL (url.encode() uses UTF-8 by default); used here
# only to derive a fixed-length string, not for security.
data["md5"] = data.url.apply(lambda url: hashlib.md5(url.encode()).hexdigest())

In [8]:
# Encode every digest with char_level_encoder; 32 is the length of an MD5
# hex digest. Note this rebinds x_char, replacing the URL-level matrix from
# the Character Level Vectorizer section.
x_char = np.vstack(data.md5.apply(char_level_encoder, args=(32,)))
# Expect one row per URL and 32 columns.
x_char.shape

(60000, 32)

In [9]:
# Sorted distinct character codes that occur in the encoded digests
# (rebinds unique_chars from the earlier section).
unique_chars = np.unique(x_char).astype(int)
# Displayed as the cell result: how many distinct codes were found.
len(unique_chars)

16

In [10]:
# One-hot encode the digest characters. The literal 16 equals the number of
# distinct codes found in the recorded run (hex digits); the recorded width
# is 32 * 16 columns.
x_char_onehot = char_onehot(x_char, unique_chars, 16)
x_char_onehot.shape

(60000, 512)

In [11]:
# Store the one-hot encoded MD5 digests.
np.save("data/xy/x_md5.npy", x_char_onehot)