In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import math
import random
import itertools
import datetime
import pytz
from pytz import timezone 

In [60]:
pos_1_store = pd.HDFStore('day1_positives_processed.h5')
pos_2_store = pd.HDFStore('day2_positives_processed.h5')
neg_1_store = pd.HDFStore('day1_negatives_processed.h5')
neg_2_store = pd.HDFStore('day2_negatives_processed.h5')

#Load out dataframes
pos_day1 = pos_1_store['df'] 
pos_day2 = pos_2_store['df']
neg_day1 = neg_1_store['df']
neg_day2 = neg_2_store['df']

#Close our hdf files
pos_1_store.close()
pos_2_store.close()
neg_1_store.close()
neg_2_store.close()

In [103]:
#Combine negative and positive data. 
neg = pd.concat([neg_day1, neg_day2])
pos = pd.concat([pos_day1, pos_day2])

In [104]:
neg = neg.sample(frac=0.2)
pos = pos.sample(frac=0.02)

In [105]:
df_combined = pd.concat([neg, pos])
df_combined = df_combined[np.isfinite(df_combined['c_cnt'])]
df_combined = df_combined.sample(frac=1)
df_combined.shape

(151372, 39)

In [106]:
#df_combined.ref_keywords = df_combined.ref_keywords.astype(str)
df_combined.keywords = df_combined.keywords.astype(str)

In [17]:
# df = pd.read_pickle('filtered_combined_day.pkl', compression='gzip')
# df_pos = pd.read_pickle('filtered_pos.pkl', compression='gzip')

In [18]:
#Cast the features that are lists into strings. 
# df_pos.ref_keywords = df_pos.ref_keywords.astype(str)
# df_pos.url_keywords = df_pos.url_keywords.astype(str)
# df.ref_keywords = df.ref_keywords.astype(str)
# df.url_keywords = df.url_keywords.astype(str)

In [40]:
"""Split data into positive and negative classes. We are only going to sample
.02% of the negative classes. """
# df_negatives = df[df['c_cnt'] == 0].sample(frac=0.02)
# df_negatives.shape

(31385, 40)

In [41]:
"""Combine the positive and negative classes. """
# df_combined = pd.concat([df_pos, df_negatives])
# df_combined = df_combined[np.isfinite(df_combined['c_cnt'])]

In [107]:
#Shuffle the rows. 
df_combined = df_combined.sample(frac=1)
df_combined.shape

(151372, 39)

In [118]:
#df_combined['']

In [102]:
#Check how many unuique values there are in each column. 
# df_combined.apply(pd.Series.nunique)

Use One Hot Encoding for Categorical Variables

In [108]:
numerical_features = ['c_flag_cnt', 'f_cnt', 'i_cnt', 'i_flag_cnt', 'r_cnt', 'r_num_ads_requested',
                      'r_num_ads_returned', 'ua_major', 'ua_minor', 'vi_cnt', 'vi_flag_cnt']

In [109]:
def transform_column(df, col, thresh=200):
    print(col)
    if df[col].nunique() > thresh:
        #print("frequency", df_frequency)
        df_frequency = df[[col, 'c_cnt']].groupby(col).agg('count').sort_values('c_cnt',ascending=False)
        #df_frequency.reset_index(inplace=True)
        df_frequency = df_frequency.reset_index()
        cat = [sorted(df_frequency[0:thresh].index.values)]
        dict2 = {}
        for i, item in enumerate(cat[0]):
            dict2[item] = i
        #enc = CategoricalEncoder(categories=[sorted(df_frequency[0:thresh].index.values)],handle_unknown='ignore')
    else:
        dict2 = {}
        i = 0
        for item in df[col].values:
            if item not in dict2:
                dict2[item] = i
                i+=1
        #enc = CategoricalEncoder(categories='auto',handle_unknown='ignore')
    return [[1 if j == i else 0 for j in dict2] for i in df[col].values]


In [110]:
#we create a copy so that X will not include 'c_cnt'
df2 = df_combined.copy()
df2.drop('c_cnt',inplace=True,axis=1)
Y = df_combined['c_cnt'].values
X = np.hstack([transform_column(df_combined, col) if col not in numerical_features else df_combined[col].values.reshape(-1,1)
               for col in df2])

_host
ad_network_id
advertiser_id
campaign_id
campaign_type
geo_city_name
geo_country_code3
geo_region_name
geo_timezone
i_timestamp
pub_network_id
r_timestamp
rate_metric
referer
session_id
site_id
token
ua
ua_device
ua_device_type
ua_os_name
url
user_agent
uuid
url_domain
red_domain
keywords


In [111]:
#Train and Test Split 
ind_cutoff = int(0.7*len(X))
X_train, Y_train = X[0:ind_cutoff], Y[0:ind_cutoff]
X_test, Y_test = X[ind_cutoff:], Y[ind_cutoff:]

In [112]:
print("Positive samples in training: ", sum(Y_train))
print("Positive samples in testing: ", sum(Y_test))

Positive samples in training:  17586.0
Positive samples in testing:  7591.0


In [116]:
#Feature Selection and Training 
# from xgboost import XGBClassifier

# model = XGBClassifier()
# model.fit(X, Y)

# print(model.feature_importances_)

# # select features using threshold
# selection = SelectFromModel(model, threshold=thresh, prefit=True)
# select_X_train = selection.transform(X_train)

# # train model
# selection_model = XGBClassifier()
# selection_model.fit(select_X_train, Y_train)

# # eval model
# select_X_test = selection.transform(X_test)
# y_pred = selection_model.predict(select_X_test)

# plot_importance(model)
# plt.show()

In [113]:
from sklearn.linear_model import LogisticRegression
#Train with Filtered Features Using Logistic Regression 
logreg = LogisticRegression(C=1)
logreg.fit(X_train, Y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [123]:
print("Probabilities for Testing Data")

predictions_test = pd.DataFrame(logreg.predict_proba(X_test), columns=['Probability of No Click', 'Probability of Click'])
predictions_test['Prediction'] = logreg.predict(X_test)
predictions_test['Actual'] = Y_test
predictions_test['Correct'] = (predictions_test['Prediction'] == predictions_test['Actual'])

predictions_test

Probabilities for Testing Data


Unnamed: 0,Probability of No Click,Probability of Click,Prediction,Actual,Correct
0,0.994730,0.005270,0.0,0.0,True
1,0.997376,0.002624,0.0,0.0,True
2,0.995259,0.004741,0.0,0.0,True
3,0.999988,0.000012,0.0,0.0,True
4,0.006568,0.993432,1.0,1.0,True
5,0.987748,0.012252,0.0,0.0,True
6,0.999367,0.000633,0.0,0.0,True
7,0.999608,0.000392,0.0,0.0,True
8,0.999568,0.000432,0.0,0.0,True
9,0.997549,0.002451,0.0,0.0,True


In [121]:
from sklearn.metrics import confusion_matrix

print("Training accuracy", 100*logreg.score(X_train,Y_train), "%")
print("Validation accuracy", 100*logreg.score(X_test,Y_test), "%")

train_prediction = logreg.predict(X_train)
test_prediction = logreg.predict(X_test)

train_confusion_matrix = confusion_matrix(Y_train, train_prediction)
test_confusion_matrix = confusion_matrix(Y_test, test_prediction)

train_recall = train_confusion_matrix[1][1]/sum(train_confusion_matrix[1])
test_recall = test_confusion_matrix[1][1]/sum(test_confusion_matrix[1])

# #What percent of the positive cases did we catch? 
print("Training Recall:", train_recall)
print("Validation Recall:", test_recall)

# #What percent of the positive predictions was correct?

train_precision = train_confusion_matrix[1][1]/(train_confusion_matrix[1][1] + train_confusion_matrix[0][1])
test_precision = test_confusion_matrix[1][1]/(test_confusion_matrix[1][1] + test_confusion_matrix[0][1])

print("Training Precision:", train_precision)
print("Validation Precision:", test_precision)

print("Confusion Matrix for Training Data")
print(train_confusion_matrix)

print("Confusion Matrix for Testing Data")
# #TN FP 
# #FN TP 
print(test_confusion_matrix)

Training accuracy 97.72838807097017 %
Validation accuracy 97.74068528142341 %
Training Recall: 0.9058341862845445
Validation Recall: 0.9137136082202608
Training Precision: 0.9549787183022601
Validation Precision: 0.9492267688517859
Confusion Matrix for Training Data
[[87623   751]
 [ 1656 15930]]
Confusion Matrix for Testing Data
[[37450   371]
 [  655  6936]]


In [119]:
from sklearn.metrics import log_loss

log_loss(Y_test, test_prediction)

0.7803461911350524

In [120]:
(2* (test_precision*test_recall))/(test_precision+test_recall)

0.9311316955296013