In [1]:
import pandas as pd
import numpy as np
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from urllib.parse import urlparse
from tldextract import extract

from code.util import *
from code.preprocessor import *
from code.gofaster import *

Using TensorFlow backend.


In [2]:
# Load the sampled URL dataset; later cells read its `url` and `target` columns.
# NOTE(review): `load` comes from a star-import of a project module — presumably
# a pickle loader, so only point it at trusted files. TODO confirm.
data = load("data/urls_sampled.pkl")

In [3]:
# Vocabulary cap: passed as `max_features` to the count/TF-IDF vectorizers
# and as `num_words` to the Keras tokenizer.
MAX_WORDS = 4000
# Number of entries of `words` that become binary mi<i> presence features.
MI_WORDS = 30
# Target length (`maxlen`) handed to pad_sequences for the word-index sequences.
MAX_SENTENCE_LEN = 20

# Vectorizers

In [4]:
# Raw term-count vectorizer driven by the project tokenizer; at most
# MAX_WORDS features are kept.
cnt = CountVectorizer(tokenizer=tokenize, max_features=MAX_WORDS)

In [5]:
# TF-IDF vectorizer configured identically to the count vectorizer
# (same tokenizer, same MAX_WORDS feature cap).
tf = TfidfVectorizer(tokenizer=tokenize, max_features=MAX_WORDS)

In [6]:
# Keras tokenizer for word-index sequences: URLs are split on "/" and the
# punctuation listed in `filters` is filtered out of the text.
# "/" is deliberately absent from `filters` because it is the split character.
tk = Tokenizer(
    split="/",
    filters='!"#$%&()*+,-.:;<=>?@[\\]^_`{|}~',
    num_words=MAX_WORDS,
)

In [7]:
# Raw model input (URL column) and label column, shared by every encoding below.
x0 = data["url"]
y0 = data["target"]

In [8]:
# One-hot encode the labels and persist the result.
# assumes `target` holds non-negative integer class ids, which is what
# to_categorical expects — TODO confirm against the dataset.
y_onehot = to_categorical(y0)

save(y_onehot, "data/largefiles/y_onehot.pkl")

In [9]:
# Fit both vectorizers on the raw URLs and materialise the results as dense
# arrays via .toarray().
# NOTE(review): densifying yields up to MAX_WORDS columns per URL, which can be
# memory-hungry on large samples; kept dense because that is what gets saved.
x_count = cnt.fit_transform(x0).toarray()
x_tfidf = tf.fit_transform(x0).toarray()

# Persist both encodings.
save(x_count, "data/largefiles/x_count.pkl")
save(x_tfidf, "data/largefiles/x_tfidf.pkl")

In [10]:
# Build the tokenizer vocabulary from the URLs, convert every URL to a
# sequence of word indices, and pad/truncate each one to MAX_SENTENCE_LEN.
tk.fit_on_texts(x0)
x_word = pad_sequences(tk.texts_to_sequences(x0), maxlen=MAX_SENTENCE_LEN)

save(x_word, "data/largefiles/x_word.pkl")

In [11]:
# Encode every URL with the project's char_level_encoder and stack the
# per-URL results row-wise into one array.
# assumes char_level_encoder returns equal-length arrays for every URL
# (np.vstack requires it) — TODO confirm.
x_char = np.vstack(data.url.apply(char_level_encoder))

save(x_char, "data/largefiles/x_char.pkl")

# Feature Engineering

In [12]:
# Token list used for the mi<i> presence features; it is indexed by
# integer position (words[i]) in the feature-engineering cell.
# NOTE(review): file name suggests the top-100 tokens by mutual information — confirm.
words = load("data/mi_100.pkl")

In [13]:
# TLD suffixes treated as suspicious; a URL whose extracted suffix appears
# here gets suspicious=1 in the feature-engineering cell.
# One entry per string and a trailing comma after every line: the previous
# version had `'work' 'ryukyu'` (missing comma), which Python silently
# concatenated into the single bogus entry 'workryukyu', so 'ryukyu' never
# matched. Duplicate 'work' and 'loan' entries were also removed.
suspicious_tlds = [
    'country', 'stream', 'download', 'xin', 'gdn', 'racing',
    'jetzt', 'win', 'bid', 'vip', 'ren', 'kim', 'loan',
    'mom', 'party', 'review', 'trade', 'date', 'wang',
    'accountants', 'zip', 'cricket', 'link', 'work', 'gq',
    'science', 'tk', 'world', 'fit', 'ryukyu',
    'life', 'cloud', 'desi', 'okinawa', 'ooo', 'men',
    'click', 'top', 'cf', 'ml', 'ga',
]

In [14]:
# Hand-crafted lexical URL features, appended to `data` as new columns.
# Column insertion order is significant: the feature matrix is later saved as
# a bare array (data0.values), so the order below matches the original cell.
# NOTE: the final concat re-binds `data`; re-running this cell on the same
# frame appends the scheme dummies a second time, so reload `data` first.

data["tokens"] = data.url.apply(tokenize)
data["len_url"] = data.url.apply(len)
data["len_tok"] = data.tokens.apply(len)

# Occurrence counts of URL punctuation: (column name, substring to count).
punct_counts = [
    ("slash", "/"),
    ("dbl_slash", "//"),
    ("dot", "."),
    ("at", "@"),
    ("dash", "-"),
    ("qmark", "?"),
    ("amp", "&"),
    ("hash", "#"),
    ("perc", "%"),
    ("eq", "="),
    ("colon", ":"),
    ("scolon", ";"),
]
for col, needle in punct_counts:
    # `needle` is bound as a default so the lambda never depends on the
    # loop variable's later value.
    data[col] = data.url.apply(lambda url, needle=needle: url.count(needle))

# Parse every URL once and reuse the result for both scheme and netloc
# (previously urlparse ran twice per URL).
parsed_urls = [urlparse(url) for url in data.url]
data["scheme"] = [parsed.scheme for parsed in parsed_urls]
data["netloc"] = [parsed.netloc for parsed in parsed_urls]

# urlparse returns an empty netloc when the URL has no "scheme://" prefix,
# which used to force tld == "" (and suspicious == 0) for such rows. Fall back
# to the full URL in that case; rows with a netloc behave exactly as before.
data["tld"] = [
    extract(parsed.netloc or url).suffix
    for parsed, url in zip(parsed_urls, data.url)
]

# Set membership is O(1) per row; the result is the same 0/1 flag.
suspicious_lookup = set(suspicious_tlds)
data["suspicious"] = data.tld.apply(lambda tld: int(tld in suspicious_lookup))

# Binary presence flag for each of the first MI_WORDS entries of `words`.
for i in range(MI_WORDS):
    word = words[i]
    data["mi%d" % i] = data.tokens.apply(
        lambda tokens, word=word: int(word in tokens)
    )

# One-hot encode the URL scheme; drop_first removes the first category to
# avoid a redundant column.
dummies = pd.get_dummies(data.scheme, drop_first=True)
data = pd.concat([data, dummies], axis=1)

In [15]:
# Display the column index to check which feature and scheme-dummy columns
# the frame now contains.
data.columns

Index(['target', 'url', 'tokens', 'len_url', 'len_tok', 'slash', 'dbl_slash',
       'dot', 'at', 'dash', 'qmark', 'amp', 'hash', 'perc', 'eq', 'colon',
       'scolon', 'scheme', 'netloc', 'tld', 'suspicious', 'mi0', 'mi1', 'mi2',
       'mi3', 'mi4', 'mi5', 'mi6', 'mi7', 'mi8', 'mi9', 'mi10', 'mi11', 'mi12',
       'mi13', 'mi14', 'mi15', 'mi16', 'mi17', 'mi18', 'mi19', 'mi20', 'mi21',
       'mi22', 'mi23', 'mi24', 'mi25', 'mi26', 'mi27', 'mi28', 'mi29', 'ftp',
       'gopher', 'http', 'https', 'sherlock'],
      dtype='object')

In [16]:
# Columns excluded from the numeric feature matrix: the label, the raw URL,
# and the intermediate token/scheme/netloc/tld columns.
drop = ["target", "url", "tokens", "scheme", "netloc", "tld"]

In [17]:
# Feature-only view of the frame: everything except the columns listed in `drop`.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
data0 = data.drop(columns=drop)

In [18]:
# Persist the engineered features as a bare array (.values): column names
# are not stored, so consumers rely on the column order of data0.
save(data0.values, "data/largefiles/x_feat.pkl")