# 0. Imports

In [1]:
import numpy as np
import pandas as pd
import wordsegment as ws
import tldextract as tld

from urllib.parse import urlparse
from os import cpu_count

# custom modules
from code.util import *
from code.preprocessor import *
from code.gofaster import *

# dataframe parallelizer
# os.cpu_count() can return None when the CPU count cannot be determined, and on a
# single-core machine `cpu_count() - 1` would be 0, so fall back to one CPU and
# always keep at least one worker.
_available_cpus = cpu_count() or 1
gf = GoFaster(n_jobs=max(1, _available_cpus - 1), n_partitions=_available_cpus * 3)

# load wordsegment dictionary
ws.load()

# 1. Preprocessing

In [2]:
# Load data
# `load` is provided by one of the star-imported project modules above.
# NOTE(review): the .pkl extension suggests a pickle file -- unpickling can execute
# arbitrary code, so only load files from a trusted source (confirm what `load` does).
data = load("data/urls.pkl")

## 1.1 Sampling
Take a random 5% sample of the normal URLs and keep all phishing URLs, for a total of ~67k data points.

In [3]:
# Keep a reproducible 5% sample of the target == 0 rows plus every target == 1 row,
# renumber the index, and persist the result.
sample = pd.concat(
    [
        data[data.target == 0].sample(frac=0.05, random_state=42),
        data[data.target == 1],
    ]
).reset_index(drop=True)
save(sample, "data/urls_sampled.pkl")

# NOTE: `data` is rebound to the sample here, so re-running this cell without
# reloading would sample the already-sampled frame again.
data = sample
print(len(data))

67690


## 1.2 Parsing

In [4]:
def parser(df):
    """Split every URL in ``df.url`` into its ``urlparse`` components.

    Adds the columns ``scheme``, ``netloc``, ``path``, ``params``, ``query``
    and ``fragment`` to ``df`` (in place) and returns the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or partition of a frame) with a ``url`` column of strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the six component columns added.
    """
    # Use `df`, not the notebook-global `data`: this function is handed one
    # partition at a time and must only parse the rows it was given.
    parse_result = df.url.apply(urlparse)

    for field in ("scheme", "netloc", "path", "params", "query", "fragment"):
        # `field=field` binds the current name; a plain closure would see only the last one.
        df[field] = parse_result.apply(lambda parts, field=field: getattr(parts, field))

    return df


# Apply `parser` to `data` via the GoFaster instance `gf` and rebind `data` to the result.
# Guarded so it only runs when this module is the main program
# (presumably to keep re-imports by worker processes from relaunching the job -- confirm).
if __name__ == "__main__":
    data = gf.parallelize(data, parser)

## 1.3 Domain Extraction
Split each netloc into subdomain, domain, and TLD (top-level domain).

In [5]:
def tldextracter(df):
    """Break each ``netloc`` into subdomain, domain and suffix with ``tld.extract``.

    Adds the columns ``subdomain``, ``domain`` and ``tld`` to ``df`` (in place)
    and returns the same frame. ``tld`` is filled from the ``suffix`` attribute
    of the extraction result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``netloc`` column.
    """
    extracted = df["netloc"].apply(tld.extract)

    # Output column -> attribute of the extraction result it is read from.
    column_sources = (
        ("subdomain", "subdomain"),
        ("domain", "domain"),
        ("tld", "suffix"),
    )
    for column, attribute in column_sources:
        df[column] = extracted.apply(
            lambda parts, attribute=attribute: getattr(parts, attribute)
        )

    return df

# Apply `tldextracter` to `data` via `gf` and rebind `data` to the result.
# Only runs when this module is the main program (see the __main__ guard).
if __name__ == "__main__":
    data = gf.parallelize(data, tldextracter)

## 1.4 Domain Segmentation
Segment each domain into subwords using the wordsegment vocabulary.

In [6]:
def segmenter(df):
    """Add a ``segments`` column holding ``ws.segment`` applied to each domain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``domain`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the ``segments`` column added in place.
    """
    domain_words = df["domain"].apply(ws.segment)
    df["segments"] = domain_words
    return df


# Apply `segmenter` to `data` via `gf` and rebind `data` to the result.
# Only runs when this module is the main program (see the __main__ guard).
if __name__ == "__main__":
    data = gf.parallelize(data, segmenter)

## 1.5 Concatenation

In [7]:
# concatenate path, params, query, and fragment
def cat(df):
    """Build ``url_tail`` as path + params + query + fragment for every row.

    The concatenation is done column-wise, which also works on an empty
    frame (a row-wise ``df.apply(..., axis=1)`` returns a DataFrame there
    and cannot be assigned to a single column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``path``, ``params``, ``query`` and ``fragment``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the ``url_tail`` column added in place.
    """
    # Bracket access is required for "query": `df.query` is the DataFrame.query
    # method, not the column.
    df["url_tail"] = df["path"] + df["params"] + df["query"] + df["fragment"]

    return df


# Apply `cat` to `data` via `gf` and rebind `data` to the result.
# Only runs when this module is the main program (see the __main__ guard).
if __name__ == "__main__":
    data = gf.parallelize(data, cat)

In [8]:
# Finding leftover substrings after parsing
def scavenger(df):
    """Store in ``leftovers`` whatever remains of ``netloc`` after the known parts are removed.

    For each row, the first occurrence of the ``subdomain``, ``domain`` and
    ``tld`` values is removed from ``netloc`` (in that order), and then every
    remaining "." is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``netloc``, ``subdomain``, ``domain`` and ``tld``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the ``leftovers`` column added in place.
    """

    def strip_known_parts(row):
        # Each replace removes at most one occurrence; an empty part removes nothing.
        return (
            row.netloc
            .replace(row.subdomain, "", 1)
            .replace(row.domain, "", 1)
            .replace(row.tld, "", 1)
            .replace(".", "")
        )

    remainders = df.apply(strip_known_parts, axis=1)
    df["leftovers"] = remainders

    return df


# Apply `scavenger` to `data` via `gf` and rebind `data` to the result.
# Only runs when this module is the main program (see the __main__ guard).
if __name__ == "__main__":
    data = gf.parallelize(data, scavenger)

## 1.6 Additional Features

In [9]:
# counting special characters, subdomains etc.
def count_stuff(df):
    """Add simple count and length features derived from the URL and its parts.

    Columns added (in this order): ``dots``, ``hyphens``, ``ats``, ``slashes``
    (counted in ``url``), ``doubleslashes`` (counted in ``url_tail``),
    ``subdomains`` (number of "."-separated labels in ``subdomain``),
    ``queries`` (number of "&"-separated pieces in ``query``), ``len_url``
    and ``len_tail``.

    ``subdomains`` and ``queries`` are computed from the ``subdomain`` and
    ``query`` columns. Counting them on the whole URL only yields dots + 1
    and ampersands + 1, and the empty-string guard could never trigger.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``url``, ``url_tail``, ``subdomain`` and ``query``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the feature columns added in place.
    """

    def occurrences(column, token):
        """Count non-overlapping occurrences of `token` in each string of `column`."""
        return column.apply(lambda text: text.count(token))

    def pieces(column, separator):
        """Number of `separator`-delimited pieces per string; 0 for an empty string."""
        return column.apply(lambda text: len(text.split(separator)) if text else 0)

    df["dots"] = occurrences(df["url"], ".")
    df["hyphens"] = occurrences(df["url"], "-")
    df["ats"] = occurrences(df["url"], "@")
    df["slashes"] = occurrences(df["url"], "/")
    # Counted on the tail so the "//" after the scheme is not included.
    df["doubleslashes"] = occurrences(df["url_tail"], "//")
    df["subdomains"] = pieces(df["subdomain"], ".")
    # Bracket access is required: `df.query` is the DataFrame.query method.
    df["queries"] = pieces(df["query"], "&")
    df["len_url"] = df["url"].apply(len)
    df["len_tail"] = df["url_tail"].apply(len)

    return df


# Apply `count_stuff` to `data` via `gf` and rebind `data` to the result.
# Only runs when this module is the main program (see the __main__ guard).
if __name__ == "__main__":
    data = gf.parallelize(data, count_stuff)

---

In [10]:
# 2017~2019's top 20 most suspicious TLD and words
# https://www.symantec.com/blogs/feature-stories/top-20-shady-top-level-domains
# NOTE: every entry needs its own comma -- Python silently concatenates adjacent
# string literals ('work' 'ryukyu' becomes the single entry 'workryukyu').
suspicious_tlds = ['country', 'stream', 'download', 'xin', 'gdn', 'racing',
                   'jetzt', 'win', 'bid', 'vip', 'ren', 'kim', 'loan',
                   'mom', 'party', 'review', 'trade', 'date', 'wang',
                   'accountants', 'zip', 'cricket', 'link', 'work', 'gq',
                   'science', 'tk', 'world', 'fit', 'ryukyu',
                   'life', 'cloud', 'desi', 'okinawa', 'ooo', 'men',
                   'click', 'top', 'cf', 'ml', 'ga']

# Port numbers (kept as strings) that are treated as suspicious.
suspicious_port = ['21', '22', '445', '1433', '1521', '3306', '3389']

In [11]:
def suspicious(df):
    """Flag rows whose TLD or port is on the module-level suspicious lists.

    Adds two 0/1 integer columns to ``df`` (in place) and returns it:

    * ``suspicious_tld``  -- 1 when the last label of the ``tld`` column is in
      ``suspicious_tlds``.
    * ``suspicious_port`` -- 1 when the text after the final ":" in
      ``leftovers`` is exactly one of ``suspicious_port``.

    Matching is exact. Substring matching against the whole URL flags almost
    everything (e.g. "ml" occurs in every ".html" URL, "ga" in many words),
    and substring matching on ports flags ":8021" for "21".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``tld`` and ``leftovers``.
    """
    flagged_tlds = set(suspicious_tlds)
    flagged_ports = set(suspicious_port)

    def has_suspicious_tld(suffix):
        # A suffix can have several labels (e.g. "co.uk"); the TLD is the last one.
        return int(suffix.rsplit(".", 1)[-1].lower() in flagged_tlds)

    def has_suspicious_port(leftover):
        # Without a ":" there is no port to check.
        _, colon, port = leftover.rpartition(":")
        return int(bool(colon) and port in flagged_ports)

    df["suspicious_tld"] = df["tld"].apply(has_suspicious_tld)
    df["suspicious_port"] = df["leftovers"].apply(has_suspicious_port)

    return df


# Apply `suspicious` to `data` via `gf` and rebind `data` to the result.
# Only runs when this module is the main program (see the __main__ guard).
if __name__ == "__main__":
    data = gf.parallelize(data, suspicious)

---

In [14]:
# Preview the first rows, transposed so each column shows up as a row.
data.head().T

Unnamed: 0,0,1,2,3,4
target,0,0,0,0,0
url,http://vioz.org,http://www.bwca.cc/wildflowers/flora.htm,http://www.appleseedenterprises.com/travel20.htm,http://www.soyouwanna.com/site/syws/kitchen/ki...,http://www.peterbelanger.com/newton.html
scheme,http,http,http,http,http
netloc,vioz.org,www.bwca.cc,www.appleseedenterprises.com,www.soyouwanna.com,www.peterbelanger.com
path,,/wildflowers/flora.htm,/travel20.htm,/site/syws/kitchen/kitchen.html,/newton.html
params,,,,,
query,,,,,
fragment,,,,,
subdomain,,www,www,www,www
domain,vioz,bwca,appleseedenterprises,soyouwanna,peterbelanger


In [12]:
# Write the preprocessed frame to disk with the project `save` helper.
save(data, "data/urls_preprocessed.pkl")

# 2. Visualization

In [13]:
# def quantize(df):
#     df["cvect_url"] = df.url.apply(char_level_encoder)
#     df["cvect_url_tail"] = df.url_tail.apply(char_level_encoder)
#     return df


# if __name__ == "__main__":
#     data = gf.parallelize(data, quantize)