In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import math
import random
import itertools
import datetime
import pytz
import pickle
from pytz import timezone 

In [3]:
pos_1_store = pd.HDFStore('day1_positives_processed.h5')
pos_2_store = pd.HDFStore('day2_positives_processed.h5')
neg_1_store = pd.HDFStore('day1_negatives_processed.h5')
neg_2_store = pd.HDFStore('day2_negatives_processed.h5')

#Load out dataframes
pos_day1 = pos_1_store['df'] 
pos_day2 = pos_2_store['df']
neg_day1 = neg_1_store['df']
neg_day2 = neg_2_store['df']

#Close our hdf files
pos_1_store.close()
pos_2_store.close()
neg_1_store.close()
neg_2_store.close()

In [4]:
#Load separate filtered testing data
new_test_store = pd.HDFStore('combined_day1_processed.h5')

new_test = new_test_store['df']

new_test_store.close()

In [5]:
#Process testing data. 
sampled_test = new_test.sample(frac=0.03)
#sampled_test.keywords = sampled_test.keywords.astype(str)

In [6]:
len(new_test)

1599546

In [7]:
#Combine negative and positive data. 
neg = pd.concat([neg_day1, neg_day2])
pos = pd.concat([pos_day1, pos_day2])

In [8]:
neg = neg.sample(frac=0.2)
pos = pos.sample(frac=0.02)

In [9]:
df_combined = pd.concat([neg, pos])
df_combined = df_combined[np.isfinite(df_combined['c_cnt'])]
df_combined = df_combined.sample(frac=1)
df_combined.shape

(151372, 39)

In [10]:
#df_combined.ref_keywords = df_combined.ref_keywords.astype(str)
#df_combined.keywords = df_combined.keywords.astype(str)

Use One Hot Encoding for Categorical Variables

In [11]:
OHE_PATH = "ohe_labels.pkl"


#this is the helper function for preprocessing ohe labels - dont need to explicitly call this function
def generate_ohe_labels(df, c, thresh=200, k_most_freq=False):
    appears, oh_index = {}, {}
    if c == 'keywords':
        for val in df[c].values:
            for word in val:
                if word not in appears:
                    appears[word] = 0
                appears[word] += 1
        if k_most_freq:
            for v in sorted(appears)[0:thresh]:
                oh_index[v] = len(oh_index)
        else:
            for v in [k for k in appears.keys()]:
                if appears[v] < thresh:
                    del appears[v]
            for v in sorted(appears):
                oh_index[v] = len(oh_index)
    else:    
        for val in df[c].values:
            if val not in appears:
                appears[val] = 0
            appears[val] += 1
        if k_most_freq:
            for v in sorted(appears)[0:thresh]:
                oh_index[v] = len(oh_index)
        else:
            for v in [k for k in appears.keys()]:
                if appears[v] < thresh:
                    del appears[v]
            for v in sorted(appears):
                oh_index[v] = len(oh_index)
    return oh_index

#generate OHE labels to be used for batch learning - run this FIRST
def preprocess_ohe(df, thresh=200, path=OHE_PATH, k_most_freq=False):
    #create and save our ohe labels
    ohe_labels = {}
    for c in df:
        if c == 'c_cnt':
            continue
        else:
            ohe_labels[c] = generate_ohe_labels(df, c, thresh, k_most_freq=k_most_freq)

    with open(path, 'wb') as f:
        pickle.dump(ohe_labels, f, pickle.HIGHEST_PROTOCOL)
        
    return ohe_labels
        
#generates a small X and Y matrix by sampling from both negative and positive dataframes
def generate_batch(df_pos, df_neg, batch_size, path=OHE_PATH, pos_ratio=1, ohe_labels=None):
    #load our ohe labels
    if ohe_labels == None:
        with open(path, 'rb') as f:
            ohe_labels = pickle.load(path)
    
    #pick which indices to use for our batch training
    indices_touse_pos = np.random.permutation(len(df_pos))[0:batch_size]
    indices_touse_neg = np.random.permutation(len(df_neg))[0:int(batch_size*pos_ratio)]
    
    #generate X and Y matrices
    X, Y = [], []
    for i in range(batch_size):
        sample_x, sample_y = generate_one_sample(df_pos, ohe_labels)
        X.append(sample_x)
        Y.append(sample_y)
    for i in range(int(batch_size*pos_ratio)):
        sample_x, sample_y = generate_one_sample(df_neg, ohe_labels)
        X.append(sample_x)
        Y.append(sample_y)
    
    #shuffle X and Y matrices
    shuffled_indices = np.random.permutation(len(X))
    return [X[i] for i in shuffled_indices], [Y[i] for i in shuffled_indices]
    
#generates exactly one random sample from a dataframe using OHE. this is a helper function, shouldn't be explicitly called
#if index is not -1, will not generate random index
def generate_one_sample(df, ohe_labels, index=-1):
    if index == -1:
        index = np.random.randint(0,len(df))
    X = [[0 if ohe_labels[c][df[c].values[index]] != j else 1 for j in range(len(ohe_labels[c]))]
         if df[c].values[index] in ohe_labels[c] else [0 for j in range(len(ohe_labels[c]))]
                  for c in ohe_labels if c != 'c_cnt' and c != 'keywords']
    wordset = set([w for w in df['keywords'].values[index]])
    X.append([1 if v in wordset else 0 for v in ohe_labels['keywords']])
    X = np.array(X)
    X = np.hstack(X)
    return X, df['c_cnt'].values[index]

#takes in the dataframe, returns an X and Y matrix 
def transform_df(df, ohe_labels):
    X, Y = [], []
    for i in range(len(df)):
        xmini, ymini = generate_one_sample(df, ohe_labels, index=i)
        X.append(xmini)
        Y.append(ymini)
    return X, Y

In [12]:
ohe_labels = preprocess_ohe(df_combined, k_most_freq=True)

In [13]:
X, Y = transform_df(df_combined, ohe_labels)

In [14]:
X_filtered_test, Y_filtered_test = transform_df(sampled_test, ohe_labels)

In [15]:
X = np.array(X)
Y = np.array(Y)
X_filtered_test = np.array(X_filtered_test)
Y_filtered_test = np.array(Y_filtered_test)

In [16]:
#Train and Test Split 
ind_cutoff = int(0.7*len(X))
X_train, Y_train = X[0:ind_cutoff], Y[0:ind_cutoff]
X_test, Y_test = X[ind_cutoff:], Y[ind_cutoff:]

In [17]:
len(X_train)

105960

In [18]:
print("Positive samples in training: ", sum(Y_train))
print("Positive samples in testing: ", sum(Y_test))

Positive samples in training:  17602.0
Positive samples in testing:  7532.0


In [37]:
from sklearn.linear_model import LogisticRegression, ElasticNet
#Train with Filtered Features Using Logistic Regression 
logreg = LogisticRegression(penalty='l2', C=0.01)
logreg.fit(X_train, Y_train)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
print("Probabilities for Testing Data")

predictions_test = pd.DataFrame(logreg.predict_proba(X_test), columns=['Probability of No Click', 'Probability of Click'])
predictions_test['Prediction'] = logreg.predict(X_test)
predictions_test['Actual'] = Y_test
predictions_test['Correct'] = (predictions_test['Prediction'] == predictions_test['Actual'])

predictions_test

Probabilities for Testing Data


Unnamed: 0,Probability of No Click,Probability of Click,Prediction,Actual,Correct
0,0.980319,0.019681,0.0,0.0,True
1,0.992106,0.007894,0.0,0.0,True
2,0.968606,0.031394,0.0,0.0,True
3,0.996696,0.003304,0.0,0.0,True
4,0.840239,0.159761,0.0,0.0,True
5,0.997237,0.002763,0.0,0.0,True
6,0.981699,0.018301,0.0,0.0,True
7,0.995558,0.004442,0.0,0.0,True
8,0.928130,0.071870,0.0,0.0,True
9,0.999607,0.000393,0.0,0.0,True


In [42]:
from sklearn.metrics import confusion_matrix

print("Training accuracy", 100*logreg.score(X_train,Y_train), "%")
print("Validation accuracy", 100*logreg.score(X_test,Y_test), "%")

train_prediction = logreg.predict(X_train)
test_prediction = logreg.predict(X_test)

train_confusion_matrix = confusion_matrix(Y_train, train_prediction)
test_confusion_matrix = confusion_matrix(Y_test, test_prediction)

train_recall = train_confusion_matrix[1][1]/sum(train_confusion_matrix[1])
test_recall = test_confusion_matrix[1][1]/sum(test_confusion_matrix[1])

# #What percent of the positive cases did we catch? 
print("Training Recall:", train_recall)
print("Validation Recall:", test_recall)

# #What percent of the positive predictions was correct?

train_precision = train_confusion_matrix[1][1]/(train_confusion_matrix[1][1] + train_confusion_matrix[0][1])
test_precision = test_confusion_matrix[1][1]/(test_confusion_matrix[1][1] + test_confusion_matrix[0][1])

print("Training Precision:", train_precision)
print("Validation Precision:", test_precision)

print("Confusion Matrix for Training Data")
print(train_confusion_matrix)

print("Confusion Matrix for Testing Data")
# #TN FP 
# #FN TP 
print(test_confusion_matrix)

Training accuracy 96.96300490751227 %
Validation accuracy 96.72993922311284 %
Training Recall: 0.8616066356095898
Validation Recall: 0.8519649495485927
Training Precision: 0.9509656383245548
Validation Precision: 0.9454840135553264
Confusion Matrix for Training Data
[[87576   782]
 [ 2436 15166]]
Confusion Matrix for Testing Data
[[37510   370]
 [ 1115  6417]]


In [43]:
from sklearn.metrics import log_loss

print("Log Loss on Unfiltered Data")
log_loss(Y_test, logreg.predict_proba(X_test))

Log Loss on Unfiltered Data


0.0885386783229558

In [44]:
print("F1 Score for Unfiltered Data")
(2* (test_precision*test_recall))/(test_precision+test_recall)

F1 Score for Unfiltered Data


0.8962916404776871

In [45]:
print("Validation Accuracy", 100*logreg.score(X_filtered_test,Y_filtered_test), "%")

test_prediction = logreg.predict(X_filtered_test)

test_confusion_matrix = confusion_matrix(Y_filtered_test, test_prediction)

test_recall = test_confusion_matrix[1][1]/sum(test_confusion_matrix[1])

#What percent of the positive cases did we catch? 
print("Validation Recall:", test_recall)

#What percent of the positive predictions was correct?
test_precision = test_confusion_matrix[1][1]/(test_confusion_matrix[1][1] + test_confusion_matrix[0][1])
print("Validation Precision:", test_precision)


print("Confusion Matrix for Testing Data")
# #TN FP 
# #FN TP 
print(test_confusion_matrix)

Validation Accuracy 98.79756595673737 %
Validation Recall: 0.16666666666666666
Validation Precision: 0.008976660682226212
Confusion Matrix for Testing Data
[[47404   552]
 [   25     5]]


In [46]:
print("Log Loss on Filtered Data")
log_loss(Y_filtered_test, logreg.predict_proba(X_filtered_test))

Log Loss on Filtered Data


0.03696905290846593

In [47]:
print("F1 Score for Filtered Data")
(2* (test_precision*test_recall))/(test_precision+test_recall)

F1 Score for Filtered Data


0.017035775127768313

In [27]:
#Grid Search for Hyperparameter
# for i in np.arange(0.1, 0.5, 0.1): 
#     logreg = LogisticRegression(penalty='l2', C=i)
#     logreg.fit(X_train, Y_train)
    
#     loss = log_loss(Y_filtered_test, logreg.predict_proba(X_filtered_test))
#     print("Weight: ", i, "|", "Log Loss: ", loss) 
    

In [28]:
#Grid Search for Hyperparameter
# for i in np.arange(0.1, 1.1, 0.1): 
#     logreg = LogisticRegression(penalty='l1', C=i)
#     logreg.fit(X_train, Y_train)
    
#     loss = log_loss(Y_filtered_test, logreg.predict_proba(X_filtered_test))
#     print("Weight: ", i, "|", "Log Loss: ", loss) 
    

In [30]:
#Feature Selection and Training 
# from xgboost import XGBClassifier

# model = XGBClassifier()
# model.fit(X_filtered_test, Y_filtered_test)

# print(model.feature_importances_)

# # select features using threshold
# selection = SelectFromModel(model, threshold=thresh, prefit=True)
# select_X_train = selection.transform(X_train)

# # train model
# selection_model = XGBClassifier()
# selection_model.fit(select_X_train, Y_train)

# # eval model
# select_X_test = selection.transform(X_test)
# y_pred = selection_model.predict(select_X_test)

# plot_importance(model)
# plt.show()