# Project Info
## Author: Johnny Zhong
## Purpose: CS373 - DADA Homework 4


### Credits
URL Parsing: https://github.com/surajr/URL-Classification/blob/master/URL%20Classification.ipynb

### Breakdown of URL String
First, we break down the url into component parts and provide scores for what we find.
I used some steps from the link above to count up some factors.

In [1]:
# most suspicious tlds
# source: https://www.symantec.com/blogs/feature-stories/top-20-shady-top-level-domains
Suspicious_TLD = ['.country', '.kim', '.science', '.gq', '.work', '.ninja', '.xyz',
                  '.date', '.faith', '.zip', '.racing', '.cricket', '.win', '.space',
                  '.accountant', '.realtor', '.top', '.stream', '.christmas', '.gdn',
                  '.mom', '.pro', '.men']

### url parsing methods

In [2]:
import ipaddress as ip #works only in python 3
def isip(uri):
    try:
        if ip.ip_address(uri):
            return 1
    except:
        return 0

#method to check the presence of hyphens
def isPresentHyphen(url):
    return url.count('-')

# method to check the presence of @
# Using “@” symbol in the URL leads the browser to 
# ignore everything preceding the “@” symbol and the 
# real address often follows the “@” symbol.
def isPresentAt(url):
    return url.count('@')

def isPresentDSlash(url):
    return url.count('//')

def countSubDir(url):
    return url.count('/')

# Method to count number of dots
def countdots(url):  
    return url.count('.')

# method to count delims
def countdelim(url):
    count = 0
    delim=[';','_','?','=','&']
    for each in url:
        if each in delim:
            count = count + 1
    
    return count

def get_ext(url):
    """Return the filename extension from url, or ''."""
    
    root, ext = splitext(url)
    return ext

def countSubDomain(subdomain):
    if not subdomain:
        return 0
    else:
        return len(subdomain.split('.'))
    
def countQueries(query):
    if not query:
        return 0
    else:
        return len(query.split('&'))

In [3]:
from urllib.parse import urlparse
import tldextract
def getFeatures(url): 
    result = []
    url = str(url)    
    #parse the URL and extract the domain information
    path = urlparse(url)
    ext = tldextract.extract(url)
    
    #counting number of dots in subdomain    
    result.append(countdots(ext.subdomain))
    
    #checking hyphen in domain   
    result.append(isPresentHyphen(path.netloc))
    
    #length of URL    
    result.append(len(url))
    
    #checking @ in the url    
    result.append(isPresentAt(path.netloc))
    
    #checking presence of double slash    
    result.append(isPresentDSlash(path.path))
    
    #Count number of subdir    
    result.append(countSubDir(path.path))
    
    #number of sub domain    
    result.append(countSubDomain(ext.subdomain))
    
    #length of domain name    
    result.append(len(path.netloc))
    
    # len of queries    
    result.append(len(path.query))
    
    #Adding domain information
    
    #if IP address is being used as a URL     
    result.append(isip(ext.domain))
    
    # presence of Suspicious_TLD
    result.append(1 if ext.suffix in Suspicious_TLD else 0)
    
    return result

Import the data here:

In [4]:
# get data from json
# this includes some basic features
import pandas as pd

df = pd.read_json('train.json')
df.head()

Unnamed: 0,host_len,fragment,url_len,default_port,domain_age_days,tld,num_domain_tokens,ips,malicious_url,url,...,registered_domain,scheme,path,path_len,port,host,domain_tokens,mxhosts,path_tokens,num_path_tokens
0,12,,84,80,5621,com,3,"[{'geo': 'CN', 'ip': '115.236.98.124', 'type':...",0,http://www.oppo.com/?utm_source=WeiBo&amp;utm_...,...,oppo.com,http,/,1,80,www.oppo.com,"[www, oppo, com]","[{'mxhost': 'mail1.oppo.com', 'ips': [{'geo': ...",[],1
1,29,,58,80,172,com,3,"[{'geo': 'NL', 'ip': '5.39.220.76', 'type': 'A'}]",1,http://googledrive.royal-moments.com/docs/docs...,...,royal-moments.com,http,/docs/docs/googledocs/,22,80,googledrive.royal-moments.com,"[googledrive, royal-moments, com]","[{'mxhost': 'royal-moments.com', 'ips': [{'geo...","[docs, docs, googledocs, ]",4
2,39,,53,80,9,com,6,"[{'geo': 'NL', 'ip': '46.21.161.247', 'type': ...",1,http://www.coinbase.com.agreement.advicecm.com...,...,advicecm.com,http,/wallet,7,80,www.coinbase.com.agreement.advicecm.com,"[www, coinbase, com, agreement, advicecm, com]","[{'mxhost': 'ASPMX2.GOOGLEMAIL.com', 'ips': [{...",[wallet],1
3,56,,76,80,22,com,5,"[{'geo': 'HK', 'ip': '122.10.94.86', 'type': '...",1,http://eu.battle.net.blizzardentertainmentfree...,...,blizzardentertainmentfreeofactivitiese.com,http,/login/14.htm,13,80,eu.battle.net.blizzardentertainmentfreeofactiv...,"[eu, battle, net, blizzardentertainmentfreeofa...",,"[login, 14.htm]",2
4,20,,40,443,840,com,3,"[{'geo': 'US', 'ip': '199.59.149.243', 'type':...",0,https://business.twitter.com/twitter-101,...,twitter.com,https,/twitter-101,12,443,business.twitter.com,"[business, twitter, com]","[{'mxhost': 'ASPMX3.GOOGLEMAIL.com', 'ips': [{...",[twitter-101],1


### More Scoring
Parsing the URL isn't enough. There's a lot of other factors to consider here as well.

Since the machine learning algorithms I have in mind consider continuous factors and I'm not lookiong into NLP, I'm just going to get some counts and categorize some values.

In [5]:
# easy counting (to add features)
def transform_params(df):
    # count mxhosts
    df['num_mxhosts'] = df['mxhosts'].str.len()
    df['num_mxhosts'].fillna(0, inplace=True)

    # count ips
    df['num_ips'] = df['ips'].str.len()
    df['num_ips'].fillna(0, inplace=True)

    # factorize tld
    df['tld_category'] = df['tld'].factorize()[0]

    # factorize scheme
    df['scheme_category'] = df['scheme'].factorize()[0]

    # factorize file extensions
    df['file_extension_category'] = df['file_extension'].factorize()[0]

    # factorize ports
    df['default_port_category'] = df['default_port'].factorize()[0]
    df['port_category'] = df['port'].factorize()[0]
    
    return df

In [6]:
df = transform_params(df)

### Create url feature set

In [7]:
# create a feature set from the url
url_feature_set = pd.DataFrame(columns=('no of dots', 'presence of hyphen', 'len of url',
                                        'presence of at', 'presence of double slash',
                                        'no of subdir', 'no of subdomain', 'len of domain',
                                        'len of query', 'is IP', 'presence of suspicious tld'),
                              dtype='int32')

for i in range(len(df)):
    url_features = getFeatures(df['url'].loc[i])
    url_feature_set.loc[i] = url_features

In [8]:
url_feature_set.head()

Unnamed: 0,no of dots,presence of hyphen,len of url,presence of at,presence of double slash,no of subdir,no of subdomain,len of domain,len of query,is IP,presence of suspicious tld
0,0,0,84,0,0,1,1,12,63,0,0
1,0,1,58,0,0,4,1,29,0,0,0
2,3,0,53,0,0,1,4,39,0,0,0
3,2,0,76,0,0,2,3,56,0,0,0
4,0,0,40,0,0,1,1,20,0,0,0


### Join the feature set back into our dataset
We merge the feature set here, so we have all of our datapoints together.

In [9]:
# merge the url feature set back to the basic feature set
df_with_features = df.merge(url_feature_set, left_index=True, right_index=True)
df_with_features.head()

Unnamed: 0,host_len,fragment,url_len,default_port,domain_age_days,tld,num_domain_tokens,ips,malicious_url,url,...,presence of hyphen,len of url,presence of at,presence of double slash,no of subdir,no of subdomain,len of domain,len of query,is IP,presence of suspicious tld
0,12,,84,80,5621,com,3,"[{'geo': 'CN', 'ip': '115.236.98.124', 'type':...",0,http://www.oppo.com/?utm_source=WeiBo&amp;utm_...,...,0,84,0,0,1,1,12,63,0,0
1,29,,58,80,172,com,3,"[{'geo': 'NL', 'ip': '5.39.220.76', 'type': 'A'}]",1,http://googledrive.royal-moments.com/docs/docs...,...,1,58,0,0,4,1,29,0,0,0
2,39,,53,80,9,com,6,"[{'geo': 'NL', 'ip': '46.21.161.247', 'type': ...",1,http://www.coinbase.com.agreement.advicecm.com...,...,0,53,0,0,1,4,39,0,0,0
3,56,,76,80,22,com,5,"[{'geo': 'HK', 'ip': '122.10.94.86', 'type': '...",1,http://eu.battle.net.blizzardentertainmentfree...,...,0,76,0,0,2,3,56,0,0,0
4,20,,40,443,840,com,3,"[{'geo': 'US', 'ip': '199.59.149.243', 'type':...",0,https://business.twitter.com/twitter-101,...,0,40,0,0,1,1,20,0,0,0


### Manual Inspection Followed by Filtering
I looked at the dataset and did some testing and determined that there were some columns from the derived features that were completely filled with 0s. I didn't want the algorithsm to overfit on these points and because I couldn't be certain that the classifiy set of data I chose to trim those features out. The following code looks for columns that are only filled with 0s.

The columns list is referenced later with the classify set to be consistent in what we're looking at.

In [10]:
# drop columns where all values are false or 0
dropindices = [x for x, y in enumerate(df_with_features.any()) if y != True]
droplist = [y for x, y in enumerate(df_with_features.columns) if x in dropindices]
df_with_features = df_with_features.drop(columns=droplist)

In [11]:
df_with_features

Unnamed: 0,host_len,fragment,url_len,default_port,domain_age_days,tld,num_domain_tokens,ips,malicious_url,url,...,port_category,no of dots,presence of hyphen,len of url,presence of double slash,no of subdir,no of subdomain,len of domain,len of query,is IP
0,12,,84,80,5621,com,3,"[{'geo': 'CN', 'ip': '115.236.98.124', 'type':...",0,http://www.oppo.com/?utm_source=WeiBo&amp;utm_...,...,0,0,0,84,0,1,1,12,63,0
1,29,,58,80,172,com,3,"[{'geo': 'NL', 'ip': '5.39.220.76', 'type': 'A'}]",1,http://googledrive.royal-moments.com/docs/docs...,...,0,0,1,58,0,4,1,29,0,0
2,39,,53,80,9,com,6,"[{'geo': 'NL', 'ip': '46.21.161.247', 'type': ...",1,http://www.coinbase.com.agreement.advicecm.com...,...,0,3,0,53,0,1,4,39,0,0
3,56,,76,80,22,com,5,"[{'geo': 'HK', 'ip': '122.10.94.86', 'type': '...",1,http://eu.battle.net.blizzardentertainmentfree...,...,0,2,0,76,0,2,3,56,0,0
4,20,,40,443,840,com,3,"[{'geo': 'US', 'ip': '199.59.149.243', 'type':...",0,https://business.twitter.com/twitter-101,...,1,0,0,40,0,1,1,20,0,0
5,13,,21,80,3789,com,3,"[{'geo': 'CN', 'ip': '123.126.99.31', 'type': ...",0,http://www.youku.com/,...,0,0,0,21,0,1,1,13,0,0
6,17,,72,80,5647,cn,4,"[{'geo': 'HK', 'ip': '203.90.242.124', 'type':...",0,http://login.sina.com.cn/visitor/visitor?a=cro...,...,0,0,0,72,0,2,1,17,31,0
7,18,,54,80,52,br,3,"[{'geo': 'US', 'ip': '192.185.216.217', 'type'...",1,http://ciclobikelp.com.br/script_wideimage/ven...,...,0,0,0,54,0,3,0,18,0,0
8,17,,25,80,87,com,3,"[{'geo': 'US', 'ip': '67.215.65.133', 'type': ...",1,http://www.urunalarm.com/,...,0,0,0,25,0,1,1,17,0,0
9,22,,69,80,11,com,3,"[{'geo': 'US', 'ip': '209.188.7.134', 'type': ...",1,http://www.errandshopping.com/reb/bl.returns/p...,...,0,0,0,69,0,5,1,22,0,0


### Keep Numerical Columns
The column data types are checked to determine which ones are of some numeric type (int64 or float64). Since we can't use any strings or booleans, those have already been either factorized or turned into a binary representation.

The columns that are still strings are filtered out.

In [12]:
df_with_features.dtypes

host_len                      int64
fragment                     object
url_len                       int64
default_port                  int64
domain_age_days               int64
tld                          object
num_domain_tokens             int64
ips                          object
malicious_url                 int64
url                          object
alexa_rank                  float64
query                        object
file_extension               object
registered_domain            object
scheme                       object
path                         object
path_len                      int64
port                          int64
host                         object
domain_tokens                object
mxhosts                      object
path_tokens                  object
num_path_tokens               int64
num_mxhosts                 float64
num_ips                     float64
tld_category                  int64
scheme_category               int64
file_extension_category     

In [13]:
# get all non numerical types
df_only_features = df_with_features.select_dtypes(include=['int64', 'float64']).fillna(0)

### Machine Learning

We first load the libraries necessary. Then we drop some features that we don't need that were left over from the numeric type trim.

In [14]:
# import the ML libraries

import sklearn.ensemble as ek
from sklearn import model_selection, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis



In [15]:
for i, x in enumerate(df_only_features.drop(['malicious_url', 'default_port', 'port'],axis=1).columns):
    print(f'{i}: {x}')

0: host_len
1: url_len
2: domain_age_days
3: num_domain_tokens
4: alexa_rank
5: path_len
6: num_path_tokens
7: num_mxhosts
8: num_ips
9: tld_category
10: scheme_category
11: file_extension_category
12: default_port_category
13: port_category
14: no of dots
15: presence of hyphen
16: len of url
17: presence of double slash
18: no of subdir
19: no of subdomain
20: len of domain
21: len of query
22: is IP


### Split the Dataset

First we split the dataset into inputs (features) and outputs (malicious_url classification). Then we use a builtin function to split the inputs and outputs into train and test groups. The train group is there to teach the model while the test group is there to validate the model works.

In [16]:
print("Safe = 0, Malicious = 1")
print(df_only_features.groupby(df_only_features['malicious_url']).size())

X = df_only_features.drop(['malicious_url', 'default_port'],axis=1).values
y = df_only_features['malicious_url'].values

Safe = 0, Malicious = 1
malicious_url
0    1072
1     934
dtype: int64


In [17]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y ,test_size=0.3)

### Try Some Models
We gather a bunch of models to try on our dataset and we run each one, scoring the performance of each model by its accuracy in predicting the output for our test dataset. 

We see that some of the models (Decision Tree, Random Forest, AdaBoost, Gradient Boosting) perform perfectly for the dataset. I'm suspicious of this, as it may be a consequence of overfitting to dataset provided and we might not see the same type of data in the classify set.

In [18]:
model = { "DecisionTree":tree.DecisionTreeClassifier(max_depth=10),
         "RandomForest":ek.RandomForestClassifier(n_estimators=50),
         "Adaboost":ek.AdaBoostClassifier(n_estimators=50),
         "GradientBoosting":ek.GradientBoostingClassifier(n_estimators=50),
         "GNB":GaussianNB(),
         "LogisticRegression":LogisticRegression(),
         "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
}

In [19]:
results = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train,y_train)
    score = clf.score(X_test,y_test)
    print ("%s : %s " %(algo, score))
    results[algo] = score

DecisionTree : 1.0 
RandomForest : 1.0 
Adaboost : 1.0 
GradientBoosting : 1.0 
GNB : 0.7009966777408638 
LogisticRegression : 0.9833887043189369 
LinearDiscriminantAnalysis : 0.9302325581395349 




In [20]:
winner = max(results, key=results.get)
print(winner)

DecisionTree


In [21]:
clf = model[winner]
for algo in model:
    clf = model[algo]
    res = clf.predict(X)
    mt = confusion_matrix(y, res)
    print(algo)
    print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
    print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

DecisionTree
False positive rate : 0.000000 %
False negative rate : 0.000000 %
RandomForest
False positive rate : 0.000000 %
False negative rate : 0.000000 %
Adaboost
False positive rate : 0.000000 %
False negative rate : 0.000000 %
GradientBoosting
False positive rate : 0.000000 %
False negative rate : 0.000000 %
GNB
False positive rate : 5.037313 %
False negative rate : 61.456103 %
LogisticRegression
False positive rate : 0.559701 %
False negative rate : 2.462527 %
LinearDiscriminantAnalysis
False positive rate : 5.410448 %
False negative rate : 10.706638 %


### Load the Classify Dataset
We load the classify dataset and perform the same breakdown of the features and filter out for unwanted columns like we did with the initial training set.

In [22]:
# load the unknown dataset
unknown_df = pd.read_json('classify.json')
unknown_df_transformed = transform_params(unknown_df)

# create a feature set from the url
url_feature_set = pd.DataFrame(columns=('no of dots', 'presence of hyphen', 'len of url',
                                        'presence of at', 'presence of double slash',
                                        'no of subdir', 'no of subdomain', 'len of domain',
                                        'len of query', 'is IP', 'presence of suspicious tld'),
                              dtype='int32')

for i in range(len(unknown_df)):
    url_features = getFeatures(unknown_df['url'].loc[i])
    url_feature_set.loc[i] = url_features

unknown_df_with_features = unknown_df_transformed.merge(url_feature_set, left_index=True, right_index=True)

In [23]:
# keep only the columns we need
unknown_df_with_features = unknown_df_with_features[df_only_features.columns]
unknown_df_with_features.head()

Unnamed: 0,host_len,url_len,default_port,domain_age_days,num_domain_tokens,malicious_url,alexa_rank,path_len,port,num_path_tokens,...,port_category,no of dots,presence of hyphen,len of url,presence of double slash,no of subdir,no of subdomain,len of domain,len of query,is IP
0,20,82,80,72,3,,,55,80,4,...,0,0,0,82,0,4,1,20,0,0
1,22,90,80,1018,4,,11328.0,61,80,3,...,0,1,0,90,0,3,2,22,0,0
2,20,46,80,65,3,,,19,80,2,...,0,0,0,46,0,2,0,20,0,0
3,13,48,80,50,2,,,28,80,4,...,0,0,0,48,0,4,0,13,0,0
4,14,31,80,816,4,,19.0,10,80,2,...,0,0,0,31,0,2,1,14,0,0


In [24]:
unknown_df_only_features = unknown_df_with_features.select_dtypes(include=['int64', 'float64']).fillna(0)
classify_X = unknown_df_only_features.drop(['default_port', 'malicious_url'], axis=1).values

### Run Classification Model Against Classify Dataset
I chose the Linear Discriminant Analysis model here to decrease the probability of overfitting the data based on the observed results from before. If I had more time or this was at my job, I would have run a Principal Component Analysis on this dataset in order to filter out features that might have been overvalued.

In [25]:
results = model['LinearDiscriminantAnalysis'].predict(classify_X)

In [26]:
classified_df = pd.DataFrame({'url': unknown_df['url'], 'malicious_url': results})

### Sanity Check
Because we know that there are supposed to be 50% malicious and 50% safe, we guess at how off the classifiers are for the classify.json dataset.

In [27]:
len(classified_df)

2024

### number of malicious urls

In [28]:
print(str(round(abs(len(classified_df[classified_df['malicious_url'] == 1]) 
    - len(classified_df)/2)/len(classified_df)/2 * 100, 2)) + '% diff')

1.24% diff


### number of safe urls

In [29]:
print(str(round(abs(len(classified_df[classified_df['malicious_url'] == 0]) 
    - len(classified_df)/2)/len(classified_df)/2 * 100, 2)) + '% diff')

1.24% diff


### Observations

We see that the difference in count we have from the expected count is 1.24%. Not too bad, but we can't be completely certain that what we have is correct.

In [30]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(classified_df)

                                                    url  malicious_url
0     http://www.oblogdacarla.com/~wscrxcom/paypal.c...              1
1     http://www.sanders.senate.gov/newsroom/press-r...              0
2        http://instruminahui.edu.ec/201403/editor.html              1
3      http://sirdarryl.com/wp-admin/network/gogdocsrt/              1
4                       http://id.yahoo.co.jp/security/              0
5       http://nlznrsil.co.cc/showthread.php?t=20140028              0
6                             https://id.pinterest.com/              0
7     http://aseandental.com.vn/en/upload/faqs/vodaf...              1
8     http://domestic.hotel.travel.yahoo.co.jp/seaso...              0
9       http://cd.focus.cn/news/2014-04-24/4970695.html              0
10    http://images.neobux.com/imagens/banner9/?u=ma...              0
11    http://www.ahorroenergialucense.es/images/leo/...              1
12            http://www.stumbleupon.com/submit/visitor              0
13    