In [1]:
import os, sys
import feather
import re
import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# project_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.path.curdir)))
project_dir = os.path.dirname(os.path.abspath(os.path.curdir))
new_path = os.path.join(project_dir, 'src')
sys.path.append(new_path)

import util as u
from model import pipeline as p

pd.options.display.max_columns = 100

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
full_df = feather.read_dataframe('../data/processed/train_df.feather')

In [3]:
full_df.shape

(7449443, 27)

In [4]:
full_df['label'].value_counts(1)

benign      0.998858
phishing    0.001142
Name: label, dtype: float64

In [5]:
# take subset of data for initial model assessment
df = full_df.sample(frac=0.2).reset_index(drop=True)

In [6]:
df.shape

(1489889, 27)

In [7]:
# Make sure we've retained proportion of labels
df['label'].value_counts(1)

benign      0.998892
phishing    0.001108
Name: label, dtype: float64

In [8]:
del full_df

Convert int to float64. Avoids a warning error while fitting some algorithms

In [9]:
int_cols = df.select_dtypes(include='int').columns
int_cols = [col for col in int_cols if re.search('_ind', col) is None]

for col in int_cols:
    df[col] = df[col].astype(np.float64)

From EDA: Correlated features:
- length_url with length_path, url_X_cnt, and url_entropy
    - convert these to fraction

In [10]:
cols_to_convert = ['length_path', 'length_domain', 'url_slash_cnt',
       'url_digit_cnt', 'url_special_char_cnt', 'url_reserved_char_cnt']

for col in cols_to_convert:
    new_col_name = col + '_frac_url_len'
    df[new_col_name] = df[col] / df['length_url'] 

Don't need this, since can't include suffix, since benign only contains .com

In [12]:
# def extract_base_suffix(s):
#     if len(s) > 0:
#         return s.split('.')[0]
#     else:
#         return s

# df['base_suffix'] = df['suffix'].apply(extract_base_suffix)

### Split into X, y and a further train-test

In [15]:
X.reset_index(inplace=True, drop=True)
y.reset_index(inplace=True, drop=True)

In [16]:
target = 'label'
X = df.drop(columns=target)
y = df[target]

# some algorithms require encoding of our target / label
enc = LabelEncoder()
y_enc = enc.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.25)

# Preprocessing

In [12]:
all_cols = df.columns

target = 'label'

proc_dict = {
#     'base_suffix':[p.Consolidate(1), OneHotEncoder(handle_unknown='ignore')]
            }

num_cols = [col for col in all_cols if re.search('_cnt', col) is not None] + \
            ['length_url', 'hostname_entropy', 'url_entropy']

bool_cols = [col for col in all_cols if re.search('_ind', col) is not None]

pass_thru_cols = [col for col in all_cols if re.search('_frac_url_len', col) is not None]

for col in num_cols:
    proc_dict[col] = [StandardScaler()]

for col in bool_cols + pass_thru_cols:
    proc_dict[col] = [p.PassThrough()]

Again, do not include suffix, as benign only contains '.com', so this is a leaky variable of sorts. Also exclude features that were found to have low association / correlation with the target (i.e. HEX indicator, 'abuse' suspicious word indicator).

In [13]:
model_cols = ['subdomain_null_ind', 'subdomain_www_ind', 'length_url',
        'domain_dot_cnt', 'path_dot_cnt',
       'hostname_dash_cnt',
       'hostname_entropy', 'url_entropy', 'php_ind', 'admin_ind',
       'verification_ind', 
              'length_path_frac_url_len',
       'length_domain_frac_url_len', 
       'url_slash_cnt_frac_url_len',
       'url_digit_cnt_frac_url_len', 'url_special_char_cnt_frac_url_len',
       'url_reserved_char_cnt_frac_url_len']

preproc_pipe = FeatureUnion(p.gen_pipeline(model_cols, proc_dict))

# Modeling

The `assess_model` function in the custom `util` (`u`) module peforms 5-fold cross-validation

Start with good old Logistic Regression

In [18]:
from importlib import reload
reload(u)

<module 'util' from '/Users/kendra/Documents/data_science/Projects/phishing-urls/src/util.py'>

In [19]:
model = LogisticRegression(penalty='l2', random_state=19, solver='lbfgs')
lr_res = u.assess_model_df(preproc_pipe, model, X, y_enc, n=5)



In [20]:
lr_res

Precision-0                    0.998913
Recall-0 (Specificty)          0.999997
F1score-0                      0.999455
Precision-1                    0.887778
Recall-1 (Sensitivity)         0.019379
F1score-1                      0.037857
TN                        297646.600000
FN                           323.800000
FP                             1.000000
TP                             6.400000
AUC                            0.826314
Accuracy                       0.998910
dtype: float64

Failure to converge could be an indication of a "leaky" variable, or a feature that is "too good" at predicting the target.

In [22]:
model = GaussianNB()
nb_res = u.assess_model_df(preproc_pipe, model, X, y_enc, n=5)

In [23]:
nb_res

Precision-0                    0.998989
Recall-0 (Specificty)          0.996595
F1score-0                      0.997791
Precision-1                    0.028809
Recall-1 (Sensitivity)         0.090840
F1score-1                      0.043734
TN                        296634.200000
FN                           300.200000
FP                          1013.400000
TP                            30.000000
AUC                            0.832448
Accuracy                       0.995592
dtype: float64

As RandomForest takes longer to fit, let's start with one round:

In [25]:
model = RandomForestClassifier(n_estimators=100)

start_time = time.time()
rf_res = u.assess_model_no_cv(preproc_pipe, model, X_train, y_train, X_test, y_test)
duration = time.time() - start_time
print('Elapsed time: {} seconds'.format(duration))

Elapsed time: 602.4431281089783 seconds


In [26]:
rf_res

Precision-0                    0.999125
Recall-0 (Specificty)          0.999984
F1score-0                      0.999554
Precision-1                    0.916667
Recall-1 (Sensitivity)         0.168367
F1score-1                      0.284483
TN                        372075.000000
FN                           326.000000
FP                             6.000000
TP                            66.000000
AUC                            0.846035
Accuracy                       0.999109
dtype: float64

A few notes on why I didn't choose other models to examine:
- Logistic Regression using L1 penalty didn't converge either (not shown)
- SVMs are not efficient on large datasets (est O(n^3))
- KNNs also are not efficient on large datasets (O(nd))
- I would have liked to examine an implementation of Gradient Boosting of Decision Trees, but I did not, given time contrainsts

## Compare models

In [27]:
res_dict = {'Log Reg': lr_res,
            'Naive Bayes': nb_res,
            'Random Forest': rf_res}

In [28]:
res_df = pd.DataFrame(res_dict)
res_df

Unnamed: 0,Log Reg,Naive Bayes,Random Forest
Precision-0,0.998913,0.998989,0.999125
Recall-0 (Specificty),0.999997,0.996595,0.999984
F1score-0,0.999455,0.997791,0.999554
Precision-1,0.887778,0.028809,0.916667
Recall-1 (Sensitivity),0.019379,0.09084,0.168367
F1score-1,0.037857,0.043734,0.284483
TN,297646.6,296634.2,372075.0
FN,323.8,300.2,326.0
FP,1.0,1013.4,6.0
TP,6.4,30.0,66.0


In [29]:
def fpr(tn, fp):
    return 1 - (tn / (tn + fp))

In [30]:
fpr(297646.600000, 1)

3.359677685943474e-06

In [31]:
fpr(296634.200000, 1013.4)

0.0034046973669534797

In [32]:
fpr(372075.000000, 6)

1.6125521055898595e-05

Random Forest performs the best (per F1-score)

# Tune RandomForest

As the algorithm took X min to fit-predict one iteration, 3- to 5-fold cross validation inside a `GridSearchCV` would be time prohibitive. Continue to use our train-test-split of our subset of our train data (so, we are still sufficiently protected from out hold-out test set), and perform tuning "by hand".

In [19]:
from sklearn.pipeline import make_pipeline

In [20]:
from sklearn.metrics import f1_score

In [30]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, criterion='gini')
pipe = make_pipeline(preproc_pipe, rf)
pipe.fit(X_train, y_train)
y_pred_test = pipe.predict(X_test)
f1_score(y_test, y_pred_test)

0.0665188470066519

In [31]:
y_pred_train = pipe.predict(X_train)
f1_score(y_train, y_pred_train)

0.12700729927007298