In [2]:
# Print the script's command-line help (-h/--help shows its docstring).
%run train_model.py -h

train_model.py

Trains RandomForestClassifier model on <input> file located in
 ../../data/processed. Saves model to ../../models.

Usage:
    train_model.py <filename> [options]

Options
    -i --input <file>       Filename, with extension, but without path
                            (../../data/processed). [default: train_df.feather]
    -o --output <file>      Filename for resulting model, without extension.
                            Will be saved to ../../models/. [default: rf_model]
    -h --help               Show docstring.
    -t                      Test mode.


In [7]:
# Run the script in test mode (-t); the output below is the list of column names it printed.
%run train_model.py -t

Running test...
['subdomain_null_ind', 'subdomain_www_ind', 'length_url', 'domain_dot_cnt', 'path_dot_cnt', 'hostname_dash_cnt', 'hostname_entropy', 'url_entropy', 'php_ind', 'abuse_ind', 'admin_ind', 'verification_ind', 'length_path_frac_url_len', 'length_domain_frac_url_len', 'url_slash_cnt_frac_url_len', 'url_digit_cnt_frac_url_len', 'url_special_char_cnt_frac_url_len', 'url_reserved_char_cnt_frac_url_len']


In [11]:
# Run the script with no arguments.
# NOTE(review): this run ends in the KeyError shown below, which names the six
# '*_frac_url_len' columns -- presumably the frame the script indexes does not
# contain them; confirm in train_model.py.
%run train_model.py

Loading data...
Prepping data...


KeyError: ['length_path_frac_url_len', 'length_domain_frac_url_len', 'url_slash_cnt_frac_url_len', 'url_digit_cnt_frac_url_len', 'url_special_char_cnt_frac_url_len', 'url_reserved_char_cnt_frac_url_len']

In [12]:
import os, sys
import re
import time

import feather
import joblib
import numpy as np
import pandas as pd
from docopt import docopt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Make the project's `src` directory importable. The project root is taken to be
# two directory levels above the current working directory.
working_dir = os.path.abspath(os.curdir)
project_dir = os.path.dirname(os.path.dirname(working_dir))
new_path = os.path.join(project_dir, 'src')
sys.path.append(new_path)

import util as u
from model import pipeline as p

In [19]:
from sklearn.pipeline import FeatureUnion, make_pipeline

In [13]:
# Load the processed training set. pandas' own feather reader is used instead of
# the standalone `feather` wrapper package, which is deprecated; both return a
# DataFrame read from the same file.
input_file = 'train_df.feather'
input_path = os.path.join('../../data/processed/', input_file)
df = pd.read_feather(input_path)

In [14]:
# Cast every integer-typed column to float64, except columns whose name contains
# '_ind' (presumably indicator flags), which keep their integer dtype.
int_cols = [
    name
    for name in df.select_dtypes(include='int').columns
    if '_ind' not in name
]
for name in int_cols:
    df[name] = df[name].astype(np.float64)

# Express each of these measures as a fraction of 'length_url', stored in a new
# '<column>_frac_url_len' column.
cols_to_convert = [
    'length_path',
    'length_domain',
    'url_slash_cnt',
    'url_digit_cnt',
    'url_special_char_cnt',
    'url_reserved_char_cnt',
]
url_length = df['length_url']
for base_col in cols_to_convert:
    df[base_col + '_frac_url_len'] = df[base_col].div(url_length)

In [16]:
# Columns fed to the model, one per line.
model_cols = [
    # '_ind' columns and raw measures
    'subdomain_null_ind',
    'subdomain_www_ind',
    'length_url',
    'domain_dot_cnt',
    'path_dot_cnt',
    'hostname_dash_cnt',
    'hostname_entropy',
    'url_entropy',
    'php_ind',
    'abuse_ind',
    'admin_ind',
    'verification_ind',
    # measures divided by 'length_url'
    'length_path_frac_url_len',
    'length_domain_frac_url_len',
    'url_slash_cnt_frac_url_len',
    'url_digit_cnt_frac_url_len',
    'url_special_char_cnt_frac_url_len',
    'url_reserved_char_cnt_frac_url_len',
]

In [17]:
# Keep only rows that have a value in every model column (a row is dropped if
# any of them is missing).
df = df.dropna(how='any', subset=model_cols)

In [None]:
target = 'label'
RANDOM_STATE = 42  # fixed seed so that repeated fits build the same forest

# Features are every column except the target; the target is the 'label' column.
X = df.drop(columns=target)
y = df[target]

all_cols = df.columns

# Preprocessing
print('Getting pipeline ready...')

# Maps a column name to the list of transformer steps for that column; it is
# handed to p.gen_pipeline together with model_cols below.
proc_dict = {}

# NOTE: the '*_cnt_frac_url_len' columns also contain '_cnt', so they are picked
# up here and first get a StandardScaler. The pass-through loop further down
# overwrites those entries, so they end up as PassThrough.
num_cols = [col for col in all_cols if '_cnt' in col] + \
            ['length_url', 'hostname_entropy', 'url_entropy']

bool_cols = [col for col in all_cols if '_ind' in col]

pass_thru_cols = [col for col in all_cols if '_frac_url_len' in col]

for col in num_cols:
    proc_dict[col] = [StandardScaler()]

for col in bool_cols + pass_thru_cols:
    proc_dict[col] = [p.PassThrough()]

# Pipeline
# NOTE(review): p.gen_pipeline presumably returns the (name, transformer) list
# that FeatureUnion expects -- confirm in src/model/pipeline.py.
preproc_pipe = FeatureUnion(p.gen_pipeline(model_cols, proc_dict))
clf = RandomForestClassifier(n_estimators=100, max_depth=10, criterion='entropy',
                             random_state=RANDOM_STATE)
pipe = make_pipeline(preproc_pipe, clf)

print('Fitting model....')
pipe.fit(X, y)

Getting pipeline ready...
Fitting model....


In [None]:
# Score the fitted pipeline on the same data it was trained on.
# NOTE(review): f1_score uses binary averaging by default -- confirm that
# 'label' is a binary target.
print('Predicting train...')
y_pred = pipe.predict(X)
score = f1_score(y, y_pred)
print('Training score: ', score)

# Persist the fitted pipeline. The file name has no extension and matches the
# default of the script's -o/--output option.
print('Saving model...')
output_file = 'rf_model'
output_path = os.path.join('../../models/', output_file)
joblib.dump(pipe, output_path)

# Also store the training-set predictions next to the processed data.
u.pickle_this(y_pred, '../../data/processed/train_pred.pkl')
print('Script complete!')