# 0. Imports

In [1]:
import numpy as np
import pandas as pd
import wordsegment as ws
import tldextract as tld

from urllib.parse import urlparse
from os import cpu_count

# custom modules
from code.util import *
from code.preprocessor import *
from code.gofaster import *

# dataframe parallelizer
# os.cpu_count() returns None when the CPU count cannot be determined, and on a
# single-CPU host `cpu_count() - 1` would be 0. Fall back to one CPU and keep at
# least one worker so the sizing arithmetic never fails or yields zero jobs.
# NOTE(review): assumes GoFaster needs n_jobs >= 1 — confirm in code.gofaster.
gf = GoFaster(
    n_jobs=max(1, (cpu_count() or 1) - 1),
    n_partitions=(cpu_count() or 1) * 3,
)

# load wordsegment dictionary
ws.load()

# 1. Preprocessing

In [2]:
# Load data
# NOTE(review): `load` is provided by one of the star-imported project modules above.
# The .pkl extension suggests it unpickles the file; if so, only point it at
# trusted files, since unpickling can execute arbitrary code — confirm.
data = load("data/urls.pkl")

## 1.1 Sampling
Keep all phishing urls and add a random 5% sample of the normal urls, for a total of ~67k data points

In [3]:
# Reproducible 5% draw of the normal urls (target == 0) stacked on top of
# every phishing url (target == 1), with a fresh 0..n-1 index.
sample = pd.concat(
    [
        data.loc[data.target == 0].sample(frac=0.05, random_state=42),
        data.loc[data.target == 1],
    ]
).reset_index(drop=True)

# Persist the sampled frame, then continue the notebook on it.
save(sample, "data/urls_sampled.pkl")
data = sample

print(len(data))

67690


## 1.2 Parsing

In [4]:
def parser(df):
    """Split every url in ``df`` into its ``urlparse`` components.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or partition of a frame) with a ``url`` column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns ``scheme``, ``netloc``,
        ``path``, ``params``, ``query`` and ``fragment`` added in place.
    """
    # Parse the frame that was passed in. Reading the notebook-level `data`
    # here would make every partition re-parse the whole dataset and would
    # break the function wherever that global is not defined.
    parse_result = df.url.apply(urlparse)

    # One output column per ParseResult field, in the order urlparse reports them.
    for field in ("scheme", "netloc", "path", "params", "query", "fragment"):
        df[field] = parse_result.apply(lambda parsed, name=field: getattr(parsed, name))

    return df


# Runs only when this code executes as the main module.
# NOTE(review): the guard presumably protects against re-execution in worker processes — confirm.
if __name__ == "__main__":
    # Rebind `data` to whatever gf.parallelize returns for the `parser` step.
    data = gf.parallelize(data, parser)

## 1.3 Domain Extraction
Separate netloc into subdomain, domain, and tld (top level domain)

In [5]:
def tldextracter(df):
    """Break each ``netloc`` into ``subdomain``, ``domain`` and ``tld`` columns.

    ``tld.extract`` is applied to every value of ``df.netloc``; the
    ``subdomain``, ``domain`` and ``suffix`` attributes of each result are
    written to the ``subdomain``, ``domain`` and ``tld`` columns respectively.
    The frame is modified in place and also returned.
    """
    pieces = df["netloc"].apply(tld.extract)

    # (output column, attribute on the extract result)
    column_sources = (
        ("subdomain", "subdomain"),
        ("domain", "domain"),
        ("tld", "suffix"),
    )
    for column, attribute in column_sources:
        df[column] = pieces.apply(lambda res, attr=attribute: getattr(res, attr))

    return df

# Runs only when this code executes as the main module.
if __name__ == "__main__":
    # Rebind `data` to whatever gf.parallelize returns for the `tldextracter` step.
    data = gf.parallelize(data, tldextracter)

## 1.4 Domain Segmentation
Split each domain into sub-words using the wordsegment vocabulary

In [6]:
def segmenter(df):
    """Add a ``segments`` column with ``ws.segment`` applied to each domain.

    The frame is modified in place and also returned.
    NOTE(review): presumably relies on ``ws.load()`` having been called first — confirm.
    """
    domain_segments = df["domain"].apply(ws.segment)
    df["segments"] = domain_segments
    return df


# Runs only when this code executes as the main module.
if __name__ == "__main__":
    # Rebind `data` to whatever gf.parallelize returns for the `segmenter` step.
    data = gf.parallelize(data, segmenter)

## 1.5 Concatenation

In [7]:
# concatenate path, params, query, and fragment
def cat(df):
    """Add a ``tail`` column: ``path + params + query + fragment`` for each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string columns ``path``, ``params``, ``query`` and
        ``fragment``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``tail`` column added in place.
    """
    # Column-wise string concatenation gives the same strings as a row-wise
    # ``df.apply(..., axis=1)`` without building a Series for every row.
    df["tail"] = df["path"] + df["params"] + df["query"] + df["fragment"]

    return df


# Runs only when this code executes as the main module.
if __name__ == "__main__":
    # Rebind `data` to whatever gf.parallelize returns for the `cat` step.
    data = gf.parallelize(data, cat)

In [8]:
# Collect the netloc text not accounted for by subdomain, domain, or tld
def scavenger(df):
    """Add a ``leftovers`` column holding what remains of each ``netloc``.

    For every row, one occurrence of the ``subdomain``, then the ``domain``,
    then the ``tld`` is removed from ``netloc`` (in that order), and all dots
    are dropped from the remainder. The frame is modified in place and
    also returned.
    """

    def strip_known_parts(row):
        remainder = row.netloc
        # Remove the first occurrence of each extracted part, left to right.
        remainder = remainder.replace(row.subdomain, "", 1)
        remainder = remainder.replace(row.domain, "", 1)
        remainder = remainder.replace(row.tld, "", 1)
        # Drop every remaining dot from the result.
        return "".join(remainder.split("."))

    leftovers = df.apply(strip_known_parts, axis=1)
    df["leftovers"] = leftovers

    return df


# Runs only when this code executes as the main module.
if __name__ == "__main__":
    # Rebind `data` to whatever gf.parallelize returns for the `scavenger` step.
    data = gf.parallelize(data, scavenger)

In [9]:
# Show the last 10 rows to eyeball the columns added by the steps above.
data.tail(10)

Unnamed: 0,target,url,scheme,netloc,path,params,query,fragment,subdomain,domain,tld,segments,tail,leftovers
67680,1,http://www.mymvm.com/public/public/public/_cac...,http,www.mymvm.com,/public/public/public/_cache/images/event/www....,,,,www,mymvm,com,"[my, mvm]",/public/public/public/_cache/images/event/www....,
67681,1,http://dinas.tomsk.ru/pma/?www.paypal.com/de/c...,http,dinas.tomsk.ru,/pma/,,www.paypal.com/de/cgi-bin/webscr.cmd=_login-run.,,dinas,tomsk,ru,[tomsk],/pma/www.paypal.com/de/cgi-bin/webscr.cmd=_log...,
67682,1,http://dinas.tomsk.ru/err/?www.paypal.ch/ch/cg...,http,dinas.tomsk.ru,/err/,,www.paypal.ch/ch/cgi-bin/webscr1.htm?cmd=_logi...,,dinas,tomsk,ru,[tomsk],/err/www.paypal.ch/ch/cgi-bin/webscr1.htm?cmd=...,
67683,1,http://paypal.com.cgi-bin.login.submited.elkkl...,http,paypal.com.cgi-bin.login.submited.elkklkdgh54m...,/,,,,paypal.com.cgi-bin.login.submited.elkklkdgh54m...,begiu,com,"[be, giu]",/,
67684,1,http://creditiperhabbogratissicuro100.blogspot...,http,creditiperhabbogratissicuro100.blogspot.com,/2011/02/habbo-crediti-gratis-sicuro-100.html,,,,creditiperhabbogratissicuro100,blogspot,com,[blogspot],/2011/02/habbo-crediti-gratis-sicuro-100.html,
67685,1,https://sites.google.com/site/freehabbocoinsgb...,https,sites.google.com,/site/freehabbocoinsgbbo00/,,,,sites,google,com,[google],/site/freehabbocoinsgbbo00/,
67686,1,http://mundovirtualhabbo.blogspot.com/2009_01_...,http,mundovirtualhabbo.blogspot.com,/2009_01_01_archive.html,,,,mundovirtualhabbo,blogspot,com,[blogspot],/2009_01_01_archive.html,
67687,1,http://aijcs.blogspot.com/2005/03/colourful-li...,http,aijcs.blogspot.com,/2005/03/colourful-life-of-aij.html,,,,aijcs,blogspot,com,[blogspot],/2005/03/colourful-life-of-aij.html,
67688,1,http://tnet.at.ua/index/0-13,http,tnet.at.ua,/index/0-13,,,,tnet,at,ua,[at],/index/0-13,
67689,1,http://tudu-free.blogspot.com/2008/02/jogos-ja...,http,tudu-free.blogspot.com,/2008/02/jogos-java-aplicativos.html,,,footer-wrap2,tudu-free,blogspot,com,[blogspot],/2008/02/jogos-java-aplicativos.htmlfooter-wrap2,
