# 0. Imports

In [1]:
import numpy as np
import pandas as pd
import wordsegment as ws
import tldextract as tld

from urllib.parse import urlparse
from os import cpu_count

# custom modules
from code.util import *
from code.preprocessor import *
from code.gofaster import *

# dataframe parallelizer
# os.cpu_count() may return None when the count cannot be determined, and on a
# single-core host `cpu_count() - 1` would be 0 -- clamp both so that at least
# one job is always requested. On multi-core machines the values are unchanged.
N_CPUS = cpu_count() or 1
gf = GoFaster(n_jobs=max(1, N_CPUS - 1), n_partitions=N_CPUS * 3)

# load wordsegment dictionary (must run before any ws.segment call)
ws.load()

# 1. Preprocessing

In [2]:
# Load the full URL dataset.
# NOTE(review): `load` is not defined in this notebook (presumably it comes from
# the star-imported `code.*` modules); the .pkl extension suggests pickle --
# confirm, and only load files from a trusted source.
RAW_URLS_PATH = "data/urls.pkl"
data = load(RAW_URLS_PATH)

## 1.1 Sampling
Take a random 5% sample of the normal URLs (`target == 0`) and keep all phishing URLs (`target == 1`), for a total of ~67k data points.

In [3]:
# Down-sample the target == 0 rows to 5% (fixed seed for reproducibility)
# and keep every target == 1 row.
normal_subset = data.loc[data.target == 0].sample(frac=0.05, random_state=42)
phishing_rows = data.loc[data.target == 1]

sample = pd.concat([normal_subset, phishing_rows])
save(sample, "data/urls_sampled.pkl")

# Gotcha: `data` is rebound to the reduced frame, so re-running this cell
# without re-running the load cell samples 5% of the already-sampled rows.
data = sample
print(len(data))

67690


## 1.2 Parsing

In [4]:
def parser(df):
    """Split every URL in ``df.url`` into its ``urlparse`` components.

    Adds six columns to ``df`` in place -- ``scheme``, ``netloc``, ``path``,
    ``params``, ``query`` and ``fragment`` -- and returns the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or a partition of one) with a ``url`` column of URL strings.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the six component columns added.
    """
    # Parse the rows of the frame that was passed in, not the notebook-level
    # `data`: reading the global made every partition re-parse the whole
    # dataset and made the function unusable on any other frame.
    parse_result = df.url.apply(urlparse)

    for field in ("scheme", "netloc", "path", "params", "query", "fragment"):
        # `field=field` binds the current name so each lambda reads its own attribute
        df[field] = parse_result.apply(lambda parts, field=field: getattr(parts, field))

    return df


if __name__ == "__main__":
    # Hand `parser` to the GoFaster helper and rebind `data` to whatever it returns.
    # NOTE(review): the guard is presumably there so multiprocessing workers do not
    # re-run this on import -- confirm against GoFaster's implementation.
    data = gf.parallelize(data, parser)

In [5]:
# Preview the last five rows to check the URL component columns
data.tail(n=5)

Unnamed: 0,target,url,scheme,netloc,path,params,query,fragment
1063832,1,https://sites.google.com/site/freehabbocoinsgb...,https,sites.google.com,/site/freehabbocoinsgbbo00/,,,
1063833,1,http://mundovirtualhabbo.blogspot.com/2009_01_...,http,mundovirtualhabbo.blogspot.com,/2009_01_01_archive.html,,,
1063834,1,http://aijcs.blogspot.com/2005/03/colourful-li...,http,aijcs.blogspot.com,/2005/03/colourful-life-of-aij.html,,,
1063835,1,http://tnet.at.ua/index/0-13,http,tnet.at.ua,/index/0-13,,,
1063836,1,http://tudu-free.blogspot.com/2008/02/jogos-ja...,http,tudu-free.blogspot.com,/2008/02/jogos-java-aplicativos.html,,,footer-wrap2


## 1.3 Domain Extraction

In [6]:
def tldextracter(df):
    """Break each ``netloc`` into subdomain, domain and suffix columns.

    Runs ``tld.extract`` on every value of ``df.netloc`` and copies the
    ``subdomain``, ``domain`` and ``suffix`` attributes of each result into
    same-named columns of ``df`` (in place). Returns ``df``.
    """
    pieces = df.netloc.apply(tld.extract)

    for attribute in ("subdomain", "domain", "suffix"):
        # default argument pins the attribute name for this iteration's lambda
        df[attribute] = pieces.apply(lambda hit, attribute=attribute: getattr(hit, attribute))

    return df

if __name__ == "__main__":
    # Hand `tldextracter` to the GoFaster helper and rebind `data` to whatever it returns.
    # NOTE(review): the guard is presumably there so multiprocessing workers do not
    # re-run this on import -- confirm against GoFaster's implementation.
    data = gf.parallelize(data, tldextracter)

In [7]:
# Preview the last five rows to check the subdomain / domain / suffix columns
data.tail(n=5)

Unnamed: 0,target,url,scheme,netloc,path,params,query,fragment,subdomain,domain,suffix
1063832,1,https://sites.google.com/site/freehabbocoinsgb...,https,sites.google.com,/site/freehabbocoinsgbbo00/,,,,sites,google,com
1063833,1,http://mundovirtualhabbo.blogspot.com/2009_01_...,http,mundovirtualhabbo.blogspot.com,/2009_01_01_archive.html,,,,mundovirtualhabbo,blogspot,com
1063834,1,http://aijcs.blogspot.com/2005/03/colourful-li...,http,aijcs.blogspot.com,/2005/03/colourful-life-of-aij.html,,,,aijcs,blogspot,com
1063835,1,http://tnet.at.ua/index/0-13,http,tnet.at.ua,/index/0-13,,,,tnet,at,ua
1063836,1,http://tudu-free.blogspot.com/2008/02/jogos-ja...,http,tudu-free.blogspot.com,/2008/02/jogos-java-aplicativos.html,,,footer-wrap2,tudu-free,blogspot,com


## 1.4 Domain Segmentation

In [9]:
def segmenter(df):
    """Add a ``segments`` column built from each row's ``domain``.

    Every value of ``df.domain`` is passed to ``ws.segment`` and the result
    is stored in ``df["segments"]`` (in place). Returns ``df``.
    """
    domain_words = df["domain"].apply(ws.segment)
    df["segments"] = domain_words
    return df


if __name__ == "__main__":
    # Hand `segmenter` to the GoFaster helper and rebind `data` to whatever it returns.
    # NOTE(review): the guard is presumably there so multiprocessing workers do not
    # re-run this on import -- confirm against GoFaster's implementation.
    data = gf.parallelize(data, segmenter)

In [10]:
# Preview the last five rows to check the new segments column
data.tail(n=5)

Unnamed: 0,target,url,scheme,netloc,path,params,query,fragment,subdomain,domain,suffix,segments
1063832,1,https://sites.google.com/site/freehabbocoinsgb...,https,sites.google.com,/site/freehabbocoinsgbbo00/,,,,sites,google,com,[google]
1063833,1,http://mundovirtualhabbo.blogspot.com/2009_01_...,http,mundovirtualhabbo.blogspot.com,/2009_01_01_archive.html,,,,mundovirtualhabbo,blogspot,com,[blogspot]
1063834,1,http://aijcs.blogspot.com/2005/03/colourful-li...,http,aijcs.blogspot.com,/2005/03/colourful-life-of-aij.html,,,,aijcs,blogspot,com,[blogspot]
1063835,1,http://tnet.at.ua/index/0-13,http,tnet.at.ua,/index/0-13,,,,tnet,at,ua,[at]
1063836,1,http://tudu-free.blogspot.com/2008/02/jogos-ja...,http,tudu-free.blogspot.com,/2008/02/jogos-java-aplicativos.html,,,footer-wrap2,tudu-free,blogspot,com,[blogspot]
