In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import math
import random
import itertools
import datetime
import pytz
from pytz import timezone 

In [17]:
df = pd.read_pickle('filtered_combined_day.pkl', compression='gzip')
df_pos = pd.read_pickle('filtered_pos.pkl', compression='gzip')

In [18]:
#Cast the features that are lists into strings. 
df_pos.ref_keywords = df_pos.ref_keywords.astype(str)
df_pos.url_keywords = df_pos.url_keywords.astype(str)
df.ref_keywords = df.ref_keywords.astype(str)
df.url_keywords = df.url_keywords.astype(str)

In [38]:
df_pos.shape

(9974, 40)

In [40]:
"""Split data into positive and negative classes. We are only going to sample
.02% of the negative classes. """
df_negatives = df[df['c_cnt'] == 0].sample(frac=0.02)
df_negatives.shape

(31385, 40)

In [41]:
"""Combine the positive and negative classes. """
df_combined = pd.concat([df_pos, df_negatives])
df_combined = df_combined[np.isfinite(df_combined['c_cnt'])]

In [42]:
#Shuffle the rows. 
df_combined = df_combined.sample(frac=1)
df_combined.shape

(39532, 40)

In [52]:
df_combined

Unnamed: 0,_host,ad_network_id,advertiser_id,c_cnt,c_flag_cnt,campaign_id,campaign_type,f_cnt,geo_city_name,geo_country_code3,...,ua_os_name,url,user_agent,uuid,vi_cnt,vi_flag_cnt,ref_domain,ref_keywords,url_domain,url_keywords
337859,an-prod-ralphie-frontline-gamic.us-east-1,893.0,3842.0,0.0,0.0,25419.0,private,0.0,Nacogdoches,USA,...,iOS,http://media.breitbart.com/media/if/newsmax.html,Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like...,ad86086a-56a9-44a8-99ef-97ddefc5f562,0.0,0.0,www.breitbart.com,"['big', 'government', 'turley', 'sessions', 'u...",media.breitbart.com,"['media', 'newsmax']"
138273,an-prod-ralphie-frontline-bulks.us-east-1,1403.0,3672.0,0.0,0.0,26174.0,private,0.0,Corrigan,USA,...,iOS,https://texasguntrader.com/index.php?a=4&b=8&c...,Mozilla/5.0 (iPad; CPU OS 11_2 like Mac OS X) ...,051c829a-d13c-45ff-bda9-f4f91d4f847c,0.0,0.0,www.breitbart.com,"['big', 'journalism', 'media', 'seek', 'destro...",media.breitbart.com,"['media', 'newsmax']"
919886,an-prod-ralphie-frontline-fleys.us-east-1,767.0,4735.0,0.0,0.0,26179.0,outside,0.0,Northville,USA,...,Windows 10,https://www.roughlyexplained.com/2017/12/four-...,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,7ec7c265-d911-4554-a9b2-b56c3f004ccd,0.0,0.0,www.breitbart.com,[],media.breitbart.com,"['media', 'newsmax']"
9418,an-prod-ralphie-frontline-rider.us-east-1,1839.0,4523.0,1.0,0.0,22365.0,private,0.0,Toronto,CAN,...,Linux,http://texasguntrader.com/index.php?a=5&b=548,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,945b2cf0-1baa-4b5e-af16-30b83a11a56f,1.0,0.0,,,,
1058988,an-prod-ralphie-frontline-orlon.us-east-1,893.0,3842.0,0.0,0.0,25419.0,private,0.0,La Grange,USA,...,iOS,http://www.armslist.com/posts/7160649/chicago-...,Mozilla/5.0 (iPhone; CPU iPhone OS 11_2_6 like...,3166e4da-bc5b-4f4f-a29f-0a5408a8c5f7,1.0,0.0,www.breitbart.com,"['big', 'government', 'usda', 'hires', 'integr...",media.breitbart.com,"['media', 'newsmax']"
2176879,an-prod-ralphie-frontline-goofs.us-east-1,767.0,4735.0,0.0,0.0,21827.0,outside,0.0,Baltic,USA,...,Mac OS X,http://thehill.com/blogs/blog-briefing-room/ne...,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6...,36207719-54bd-4631-bf7a-879dc02fc086,0.0,0.0,,,,
1906358,an-prod-ralphie-frontline-goofs.us-east-1,1349.0,3587.0,0.0,0.0,20523.0,private,0.0,Frisco,USA,...,iOS,http://media.breitbart.com/media/if/newsmax.html,Mozilla/5.0 (iPad; CPU OS 11_2_6 like Mac OS X...,e47f3650-696b-4a64-980b-8b624602265f,0.0,0.0,,,,
25642,an-prod-ralphie-frontline-bedim.us-east-1,893.0,3842.0,0.0,0.0,25419.0,private,0.0,Mansfield,USA,...,Android,http://media.breitbart.com/media/if/newsmax.html,Mozilla/5.0 (Linux; Android 7.0; SM-G930V Buil...,669dc570-d60f-4f27-b978-dc68963c3c5c,0.0,0.0,www.breitbart.com,"['big', 'hollywood', 'jim', 'carrey', 'shares'...",media.breitbart.com,"['media', 'newsmax']"
481174,an-prod-ralphie-frontline-bedim.us-east-1,767.0,4735.0,0.0,0.0,25937.0,outside,0.0,Thousand Oaks,USA,...,iOS,https://www.billoreilly.com/b/Shooting-Straigh...,Mozilla/5.0 (iPad; CPU OS 9_3_5 like Mac OS X)...,80b25d48-549d-439f-ba3b-79995236aee3,1.0,0.0,,[],www.armslist.com,"['posts', 'southeastern', 'pennsylvania', 'rif..."
1840402,an-prod-ralphie-frontline-mirin.eu-west-1,767.0,4735.0,0.0,0.0,26179.0,outside,0.0,Shrewsbury,GBR,...,Windows 7,https://www.roughlyexplained.com/viral-videos/,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,3d6256b5-6011-4295-9f86-3b7c7a8b6333,1.0,0.0,,,,


In [43]:
#Check how many unuique values there are in each column. 
df_combined.apply(pd.Series.nunique)

_host                    137
ad_network_id             40
advertiser_id             63
c_cnt                      2
c_flag_cnt                16
campaign_id              118
campaign_type              4
f_cnt                      2
geo_city_name           5947
geo_country_code3         58
geo_region_name          327
geo_timezone             100
i_cnt                      2
i_flag_cnt                79
i_timestamp               24
pub_network_id             6
r_cnt                      2
r_num_ads_requested        6
r_num_ads_returned         9
r_timestamp               24
rate_metric                2
referer                 6687
session_id             38867
site_id                   55
token                    247
ua                      4490
ua_device               1172
ua_device_type             4
ua_major                  77
ua_minor                  33
ua_os_name                19
url                     7044
user_agent              3870
uuid                   35481
vi_cnt        

Use One Hot Encoding for Categorical Variables

In [44]:
def transform_column(df, col, thresh=200):
    print(col)
    if df[col].nunique() > thresh:
        df_frequency = df[[col, 'c_cnt']].groupby(col).agg('count').sort_values('c_cnt',ascending=False)
        cat = [sorted(df_frequency[0:thresh].index.values)]
        dict2 = {}
        for i, item in enumerate(cat[0]):
            dict2[item] = i
        #enc = CategoricalEncoder(categories=[sorted(df_frequency[0:thresh].index.values)],handle_unknown='ignore')
    else:
        dict2 = {}
        i = 0
        for item in df[col].values:
            if item not in dict2:
                dict2[item] = i
                i+=1
        #enc = CategoricalEncoder(categories='auto',handle_unknown='ignore')
    return [[1 if j == i else 0 for j in dict2] for i in df[col].values]


In [45]:
numerical_features = ['c_flag_cnt', 'f_cnt', 'i_cnt', 'i_flag_cnt', 'r_cnt', 'r_num_ads_requested',
                      'r_num_ads_returned', 'ua_major', 'ua_minor', 'vi_cnt', 'vi_flag_cnt']

In [46]:
#we create a copy so that X will not include 'c_cnt'
df2 = df_combined.copy()
df2.drop('c_cnt',inplace=True,axis=1)
Y = df_combined['c_cnt'].values
X = np.hstack([transform_column(df_combined, col) if col not in numerical_features else df_combined[col].values.reshape(-1,1)
               for col in df2])

_host
ad_network_id
advertiser_id
campaign_id
campaign_type
geo_city_name
geo_country_code3
geo_region_name
geo_timezone
i_timestamp
pub_network_id
r_timestamp
rate_metric
referer
session_id
site_id
token
ua
ua_device
ua_device_type
ua_os_name
url
user_agent
uuid
ref_domain
ref_keywords
url_domain
url_keywords


In [47]:
#Train and Test Split 
ind_cutoff = int(0.7*len(X))
X_train, Y_train = X[0:ind_cutoff], Y[0:ind_cutoff]
X_test, Y_test = X[ind_cutoff:], Y[ind_cutoff:]

In [48]:
print("Positive samples in training: ", sum(Y_train))
print("Positive samples in testing: ", sum(Y_test))

Positive samples in training:  5646.0
Positive samples in testing:  2501.0


In [15]:
#Feature Selection and Training 
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X, Y)

print(model.feature_importances_)

# # select features using threshold
# selection = SelectFromModel(model, threshold=thresh, prefit=True)
# select_X_train = selection.transform(X_train)

# # train model
# selection_model = XGBClassifier()
# selection_model.fit(select_X_train, Y_train)

# # eval model
# select_X_test = selection.transform(X_test)
# y_pred = selection_model.predict(select_X_test)

# plot_importance(model)
# plt.show()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/Anjana/miniconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-e014b03c58a2>", line 5, in <module>
    model.fit(X, Y)
  File "/Users/Anjana/miniconda3/lib/python3.6/site-packages/xgboost/sklearn.py", line 506, in fit
    verbose_eval=verbose, xgb_model=None)
  File "/Users/Anjana/miniconda3/lib/python3.6/site-packages/xgboost/training.py", line 204, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/Users/Anjana/miniconda3/lib/python3.6/site-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/Users/Anjana/miniconda3/lib/python3.6/site-packages/xgboost/core.py", line 895, in update
    dtrain.handle))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/Anjana/miniconda3/lib/

KeyboardInterrupt: 

In [49]:
from sklearn.linear_model import LogisticRegression
#Train with Filtered Features Using Logistic Regression 
logreg = LogisticRegression(C=1)
logreg.fit(X_train, Y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
from sklearn.metrics import confusion_matrix

print("Training accuracy", 100*logreg.score(X_train,Y_train), "%")
print("Validation accuracy", 100*logreg.score(X_test,Y_test), "%")

train_prediction = logreg.predict(X_train)
test_prediction = logreg.predict(X_test)

train_confusion_matrix = confusion_matrix(Y_train, train_prediction)
test_confusion_matrix = confusion_matrix(Y_test, test_prediction)

train_recall = train_confusion_matrix[1][1]/sum(train_confusion_matrix[1])
test_recall = test_confusion_matrix[1][1]/sum(test_confusion_matrix[1])

# #What percent of the positive cases did we catch? 
print("Training Recall:", train_recall)
print("Validation Recall:", test_recall)

# #What percent of the positive predictions was correct?

train_precision = train_confusion_matrix[1][1]/(train_confusion_matrix[1][1] + train_confusion_matrix[0][1])
test_precision = test_confusion_matrix[1][1]/(test_confusion_matrix[1][1] + test_confusion_matrix[0][1])

print("Training Precision:", train_precision)
print("Validation Precision:", test_precision)

print("Confusion Matrix for Training Data")
print(train_confusion_matrix)

print("Confusion Matrix for Testing Data")
# #TN FP 
# #FN TP 
print(test_confusion_matrix)

Training accuracy 97.62575888985255 %
Validation accuracy 97.14165261382799 %
Training Recall: 0.899929153382926
Validation Recall: 0.8856457417033187
Training Precision: 0.9822153489271216
Validation Precision: 0.9766313932980599
Confusion Matrix for Training Data
[[21934    92]
 [  565  5081]]
Confusion Matrix for Testing Data
[[9306   53]
 [ 286 2215]]


In [51]:
(2* (test_precision*test_recall))/(test_precision+test_recall)

0.9289159152862235