In [2]:
import numpy as np
import xgboost as xgb
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
import json
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBT
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import tensorflow as tf
import json
import math
import random
import itertools
import pickle
import datetime
import pytz
from pytz import timezone
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
%matplotlib inline


OHE_PATH = "data/ohe_labels.pkl"
#from sklearn.preprocessing import CategoricalEncoder
#CategoricalEncoder is part of sklearn's developer version, which you can't just update with conda. If you have issues
#getting this version, try a hard code implementation of the library here - https://pastebin.com/qs1es9XE

In [3]:
#Load in data for train and test

df_store = pd.HDFStore('data/day2_negatives_processed.h5')
neg = df_store['df']
neg = neg[:50000]
df_store = pd.HDFStore('data/day2_positives_processed.h5')
pos = df_store['df']
pos = pos[:50000]
df = pd.concat([neg,pos])

In [77]:
df_removed = df.drop( [ "i_cnt", "vi_cnt", "r_num_ads_returned", "i_flag_cnt", "vi_flag_cnt"] , axis=1)

In [25]:
df_store = pd.HDFStore('data/combined_day1_processed.h5')
test = df_store['df']
test = test[:50000]

In [15]:
#this is the helper function for preprocessing ohe labels - dont need to explicitly call this function
def generate_ohe_labels(df, c, thresh=200, k_most_freq=False):
    appears, oh_index = {}, {}
    if c == 'keywords':
        for val in df[c].values:
            for word in val:
                if word not in appears:
                    appears[word] = 0
                appears[word] += 1
        if k_most_freq:
            for v in sorted(appears)[0:thresh]:
                oh_index[v] = len(oh_index)
        else:
            for v in [k for k in appears.keys()]:
                if appears[v] < thresh:
                    del appears[v]
            for v in sorted(appears):
                oh_index[v] = len(oh_index)
    else:    
        for val in df[c].values:
            if val not in appears:
                appears[val] = 0
            appears[val] += 1
        if k_most_freq:
            for v in sorted(appears)[0:thresh]:
                oh_index[v] = len(oh_index)
        else:
            for v in [k for k in appears.keys()]:
                if appears[v] < thresh:
                    del appears[v]
            for v in sorted(appears):
                oh_index[v] = len(oh_index)
    return oh_index

#generate OHE labels to be used for batch learning - run this FIRST
def preprocess_ohe(df, thresh=200, path=OHE_PATH, k_most_freq=False):
    #create and save our ohe labels
    ohe_labels = {}
    for c in df:
        if c == 'c_cnt':
            continue
        else:
            ohe_labels[c] = generate_ohe_labels(df, c, thresh, k_most_freq=k_most_freq)

    with open(path, 'wb') as f:
        pickle.dump(ohe_labels, f, pickle.HIGHEST_PROTOCOL)
        
    return ohe_labels
        
#generates a small X and Y matrix by sampling from both negative and positive dataframes
def generate_batch(df_pos, df_neg, batch_size, path=OHE_PATH, pos_ratio=1, ohe_labels=None):
    #load our ohe labels
    if ohe_labels == None:
        with open(path, 'rb') as f:
            ohe_labels = pickle.load(path)
    
    #pick which indices to use for our batch training
    indices_touse_pos = np.random.permutation(len(df_pos))[0:batch_size]
    indices_touse_neg = np.random.permutation(len(df_neg))[0:int(batch_size*pos_ratio)]
    
    #generate X and Y matrices
    X, Y = [], []
    for i in range(batch_size):
        sample_x, sample_y = generate_one_sample(df_pos, ohe_labels)
        X.append(sample_x)
        Y.append(sample_y)
    for i in range(int(batch_size*pos_ratio)):
        sample_x, sample_y = generate_one_sample(df_neg, ohe_labels)
        X.append(sample_x)
        Y.append(sample_y)
    
    #shuffle X and Y matrices
    shuffled_indices = np.random.permutation(len(X))
    return [X[i] for i in shuffled_indices], [Y[i] for i in shuffled_indices]
    
#generates exactly one random sample from a dataframe using OHE. this is a helper function, shouldn't be explicitly called
#if index is not -1, will not generate random index
def generate_one_sample(df, ohe_labels, index=-1):
    if index == -1:
        index = np.random.randint(0,len(df))
    X = [[0 if ohe_labels[c][df[c].values[index]] != j else 1 for j in range(len(ohe_labels[c]))]
         if df[c].values[index] in ohe_labels[c] else [0 for j in range(len(ohe_labels[c]))]
                  for c in ohe_labels if c != 'c_cnt' and c != 'keywords']
    wordset = set([w for w in df['keywords'].values[index]])
    X.append([1 if v in wordset else 0 for v in ohe_labels['keywords']])
    X = np.array(X)
    X = np.hstack(X)
    return X, df['c_cnt'].values[index]

#takes in the dataframe, returns an X and Y matrix 
def transform_df(df, ohe_labels):
    X, Y = [], []
    for i in range(len(df)):
        xmini, ymini = generate_one_sample(df, ohe_labels, index=i)
        X.append(xmini)
        Y.append(ymini)
    return X, Y



k = 20

#ohe_labels = preprocess_ohe(df, thresh=k, k_most_freq=True)

X, Y = transform_df(df, ohe_labels)


In [32]:
#Convert data to DataFrames
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

In [46]:
#Split data
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.3)
y_train = y_train[0].ravel()

In [49]:
#Caclulates scoring of model on data 
def score(y_pred , y_test):
    test = confusion_matrix(y_test , y_pred)
    prec = test[1][1] / (test[1][1] + test[0][1])
    rec = test[1][1] / (test[1][1] + test[1][0])
    print("Precision: ", prec)
    print("Recall" , rec)
    return f1_score(y_pred,y_test)

In [58]:
#train binary classifier models and get f1 score
model = xgb.XGBClassifier(gamma = 5 , min_child_weight = 3, objective = 'binary:logistic')
model.fit(X_train, y_train )
y_pred = model.predict(X_test)
score(y_pred ,y_test )

Precision:  0.941537071691
Recall 0.967380494597


0.95428384565075208

### Removing extra columns

In [84]:
#ohe_labels = preprocess_ohe(df_removed, thresh=k, k_most_freq=True)
#X_rem, Y_rem = transform_df(df_removed, ohe_labels)
X_rem = pd.DataFrame(X_rem)
Y_rem = pd.DataFrame(Y_rem)
X_train_rem, X_test_rem, y_train_rem, y_test_rem = train_test_split( X_rem, Y_rem, test_size=0.3)
y_train_rem = y_train_rem[0].ravel()

In [85]:
#train binary classifier models and get f1 score

model = xgb.XGBClassifier(gamma = 5 , min_child_weight = 3, objective = 'binary:logistic')
model.fit(X_train_rem, y_train_rem )
y_pred_rem = model.predict(X_test_rem)
score(y_pred_rem ,y_test_rem )

Precision:  0.919039318226
Recall 0.952365023082


0.93540544092522016

## XGBoost for CTR

In [51]:
#Create new Regressor model to 
model = xgb.XGBRegressor(objective = "binary:logitraw")
model.fit(X_train ,y_train )
y_pred_prob = model.predict(X_test)


In [52]:
#Normalize probabilities around average click rate and give a "strength" to how much from the mean the CTR can vary 
#(how much power you give to the model)
def normalize(arr , strength):
    avg = np.mean(arr)
    stddev = np.std(arr)
    arr = (arr-avg)/(stddev * 2 * strength)
    arr = arr + (1/2234)
    for i in range(len(arr)):
        arr[i] = max(0 , arr[i])
        arr[i] = min(1 , arr[i])      
    return arr
def evaluate(arr, corect):
    return abs(sum(abs(corect - arr)) / len(arr)) * 100
    
new_pred = normalize(y_pred_prob , 2000)


In [56]:
#New pred now has estimated click through rates for each ad.
new_pred

array([ 0.00015335,  0.00013226,  0.00012721, ...,  0.00022853,
        0.00041127,  0.00024896], dtype=float32)