In [2]:
import os, sys
import logging
import pickle
import feather

import pandas as pd
from docopt import docopt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB

# project_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.path.curdir)))
project_dir = os.path.dirname(os.path.abspath(os.path.curdir))
new_path = os.path.join(project_dir, 'src')
sys.path.append(new_path)

from model import pipeline as p
import util as u

pd.options.display.max_columns = 100

In [4]:
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load data
filename = 'train_df.pkl'
file_path = os.path.join('../../data/interim', filename)
save_path = '../../data/processed/'

print('Loading data...')
train_df = feather.read_dataframe('../data/processed/train_df.feather')
print('Data loaded!')

# define stop words
stopwords = set(ENGLISH_STOP_WORDS).union(set(('com', 'net', 'gov', 'edu', 'http', 'https', 'www')))

Loading data...


In [6]:
X_urls = train_df['url']
X_path = train_df['path']

target = 'label'
y = train_df['label']
enc = LabelEncoder()
y_enc = enc.fit_transform(y)

In [7]:
del train_df

## TFIDF

In [9]:
# The token pattern excludes "words" that start with a digit or
# an underscore (_).
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
                             stop_words=stopwords)
tfidf_output = vectorizer.fit_transform(X_path)
tfidf_features = vectorizer.get_feature_names()
print('Total number of features: {}'.format(len(tfidf_features)))

Total number of features: 1565431


In [10]:
mnb = MultinomialNB()
mnb_res = u.assess_model_only(mnb, tfidf_output, y_enc, n=5)

In [11]:
mnb_res

Precision-0               9.988630e-01
Recall-0 (Specificty)     9.999989e-01
F1score-0                 9.994306e-01
Precision-1               7.940476e-01
Recall-1 (Sensitivity)    3.997994e-03
F1score-1                 7.951334e-03
TN                        1.488186e+06
FN                        1.694000e+03
FP                        1.600000e+00
TP                        6.800000e+00
AUC                       8.873018e-01
Accuracy                  9.988619e-01
dtype: float64

## Most important terms

In [21]:
import numpy as np

In [30]:
neg_class_prob_list = list(zip(tfidf_features, mnb.feature_log_prob_[0, :]))
pos_class_prob_list = list(zip(tfidf_features, mnb.feature_log_prob_[1, :]))

In [None]:
neg_class_prob_list.sort(key=lambda x: x[1], reverse=True)
pos_class_prob_list.sort(key=lambda x: x[1], reverse=True)

In [None]:
# neg_class_prob_sorted = mnb.feature_log_prob_[0, :].argsort()
# pos_class_prob_sorted = mnb.feature_log_prob_[1, :].argsort()

In [39]:
print('Most important terms for benign class:')
print([x[0] for x in neg_class_prob_list[:10]])
print('')
print('Most important terms for phishing class')
# print(np.take(tfidf_features, pos_class_prob_sorted[:10]))
print([x[0] for x in pos_class_prob_list[:10]])

Most important terms for benign class:
['html', 'php', 'txt', 'robots', 'index', 'tag', 'news', 'products', 'en', 'product']

Most important terms for phishing class
['php', 'login', 'wp', 'index', 'bankofamerica', 'includes', 'html', 'content', 'signin', 'myaccount']


Modeling TF-IDF on urls:

In [20]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
                             stop_words=stopwords)
tfidf_output = vectorizer.fit_transform(X_path)
tfidf_features = vectorizer.get_feature_names()
mnb_res = u.assess_model_only(mnb, tfidf_output, y_enc, n=5)
mnb_res

Precision-0               9.988584e-01
Recall-0 (Specificty)     9.999970e-01
F1score-0                 9.994274e-01
Precision-1               0.000000e+00
Recall-1 (Sensitivity)    0.000000e+00
F1score-1                 0.000000e+00
TN                        1.488183e+06
FN                        1.700800e+03
FP                        4.400000e+00
TP                        0.000000e+00
AUC                       5.026199e-01
Accuracy                  9.988555e-01
dtype: float64

## Count Vectorizer

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
                             stop_words=stopwords)
cv_output = vectorizer.fit_transform(train_df['url'])
cv_features = vectorizer.get_feature_names()

In [28]:
print('Total number of features: {}'.format(len(cv_features)))

Total number of features: 1850497


In [29]:
mnb_res = u.assess_model_only(mnb, cv_output, y_enc, n=5)
mnb_res

Precision-0               9.992449e-01
Recall-0 (Specificty)     9.994575e-01
F1score-0                 9.993512e-01
Precision-1               4.167435e-01
Recall-1 (Sensitivity)    3.391339e-01
F1score-1                 3.739356e-01
TN                        1.487380e+06
FN                        1.124000e+03
FP                        8.074000e+02
TP                        5.768000e+02
AUC                       9.568612e-01
Accuracy                  9.987037e-01
dtype: float64

Constrain tokens returns to reduce # of features

In [30]:
vectorizer = CountVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
                             stop_words=stopwords, 
                            max_df = 0.5,
                            min_df = 5)
cv_output = vectorizer.fit_transform(train_df['url'])
cv_features = vectorizer.get_feature_names()

print('Total number of features: {}'.format(len(cv_features)))

Total number of features: 308153


In [31]:
mnb_res = u.assess_model_only(mnb, cv_output, y_enc, n=5)
mnb_res

Precision-0               9.992974e-01
Recall-0 (Specificty)     9.988764e-01
F1score-0                 9.990868e-01
Precision-1               2.817344e-01
Recall-1 (Sensitivity)    3.854645e-01
F1score-1                 3.255104e-01
TN                        1.486516e+06
FN                        1.045200e+03
FP                        1.672200e+03
TP                        6.556000e+02
AUC                       9.483653e-01
Accuracy                  9.981761e-01
dtype: float64

F1-socre is worse

More proper way of cross-validating: perform CountVect inside cross-val

In [37]:
vectorizer = CountVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
                             stop_words=stopwords,)
# cv_output = vectorizer.fit_transform(train_df['url'])
# cv_features = vectorizer.get_feature_names()

In [41]:
mnb_res = u.assess_model(vectorizer, mnb, train_df['url'], y_enc, n=5)

In [42]:
mnb_res

Precision-0               9.991810e-01
Recall-0 (Specificty)     9.999413e-01
F1score-0                 9.995610e-01
Precision-1               8.463615e-01
Recall-1 (Sensitivity)    2.828077e-01
F1score-1                 4.239280e-01
TN                        1.488100e+06
FN                        1.219800e+03
FP                        8.740000e+01
TP                        4.810000e+02
AUC                       9.436324e-01
Accuracy                  9.991226e-01
dtype: float64

Fit to entire dataset; save model (to apply to test); save predictions to use as feature

In [43]:
vectorizer = CountVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z]\\w+\\b',
                             stop_words=stopwords)
cv_output = vectorizer.fit_transform(train_df['url'])
cv_features = vectorizer.get_feature_names()

In [44]:
import joblib

In [45]:
joblib.dump(vectorizer, '../models/count_vectorizer_train.pkl')

['../models/count_vectorizer_train.pkl']

In [47]:
mnb.fit(cv_output, y_enc)
y_pred = mnb.predict(cv_output)

In [48]:
type(y_pred)

numpy.ndarray

In [49]:
from importlib import reload

In [50]:
reload(u)

<module 'util' from '/Users/kendra/Documents/data_science/Projects/phishing-urls/src/util.py'>

In [51]:
u.pickle_this(y_pred, '../data/processed/NLP_pred_train_ref09.pkl')