# Outbrain Click Prediction

## 1. Data Preparation and Cleaning

### Import Python Libraries

In [1]:
import os
import numpy as np
import pandas as p
import pickle
import zipfile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

#### Get metadata and functions utils

In [108]:
path = os.getcwd() + "/"
utils_path = path + "utils/"
meta = utils_path + "meta.py"
functions = utils_path + "functions.py"
%run $meta
%run $functions

### Unzip source tables

In [3]:
# Extract every member of the source-tables archive into tables_path.
# The context manager closes the archive even when extraction raises,
# which the previous explicit close() call did not guarantee.
with zipfile.ZipFile(data_tables, 'r') as zip_ref:
    zip_ref.extractall(tables_path)

### Load Train & Test

In [13]:
# Read the raw train and test tables, then preview the first rows of train.
train, test = (p.read_csv(tables_path + csv_name) for csv_name in ("train.csv", "test.csv"))

print(train.head())

   display_id   ad_id clicked
0           1   42337   False
1           1  139684   False
2           1  144739    True
3           1  156824   False
4           1  279295   False


#### Choose the "fraction" of train & test to use for training & predicting

In [14]:
# Size of the fraction to take from train and test; use fraction = 0 for the full tables.
fraction = 0.1

if fraction:
    # A falsy fraction (0) skips the sampling step and keeps both tables whole.
    train, test = fractioned(train, test, fraction)
    for subset in (train, test):
        print(subset.shape)

(1311190, 3)
(433480, 3)


### Prepare platform table
#### 1. Impute missing values with median
#### 2. Create one-hot

In [15]:
skip_computation = True

if (not skip_computation):
    platform_prep = features_path + "platform_prep.py"
    %run $platform_prep

platform = p.read_csv(tables_path + "platform.csv")
print (platform.head())

   display_id  document_id  plat_1  plat_2  plat_3
0           1       379743       0       0       1
1           2      1794259       0       1       0
2           3      1179111       0       1       0
3           4      1777797       0       1       0
4           5       252458       0       1       0


### Prepare topics & categories table
#### 1. Merge categories, topics tables
#### 2. For each document - leave only topic/category with the highest confidence level
#### 3. Impute missing documents with value -1 (assign confidence 0) and cast id's back to integer

In [16]:
skip_computation = True

if (not skip_computation):
    topics_categories_prep = features_path + "topics_categories_prep.py"
    %run $topics_categories_prep
    
topics_categories = p.read_csv(tables_path + "topics_categories.csv")
print (topics_categories.head())

   document_id  topic_id  confi_top  category_id  confi_cat
0      1595802       140   0.073113         1611   0.920000
1      1524246       113   0.196450         1807   0.920000
2      1617787       113   0.216892         1807   0.920000
3      1615583        89   0.316306         1305   0.920000
4      1615460       260   0.097964         1613   0.540646


### Prepare Promoted Content table
#### 1. Merge with topics_categories table
#### 2. Impute missing documents with -1 (assign confidence 0) and cast id's back to integer

In [17]:
skip_computation = True

if (not skip_computation):
    promoted_prep = features_path + "promoted_content_prep.py"
    %run $promoted_prep

promoted = p.read_csv(tables_path + "promoted_content_prep.csv")
print (promoted.head())

   ad_id  ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0      1            6614            1              7        -1   0.000000   
1      2          471467            2              7        89   0.122998   
2      3            7692            3              7        26   0.104300   
3      4          471471            2              7       168   0.076367   
4      5          471472            2              7       143   0.089059   

   category_id  confi_cat  
0         1209   0.925264  
1         1505   0.920000  
2         1209   0.920000  
3         1205   0.920000  
4         1608   0.920000  


### Merge all tables with train & test

In [18]:
# Left-join the platform columns onto train and test by display_id,
# then drop the lookup table, which is not referenced again below.
train = p.merge(train, platform, how='left', on='display_id')
test = p.merge(test, platform, how='left', on='display_id')
del platform

print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3
0     1409220   26722   False      1049648       0       0       1
1     1409220   39675   False      1049648       0       0       1
2     1409220  129490    True      1049648       0       0       1
3     1409220  144096   False      1049648       0       0       1
4     1409220  179041   False      1049648       0       0       1


In [19]:
# Left-join the promoted-content columns onto train and test by ad_id,
# then drop the lookup table, which is not referenced again below.
train = p.merge(train, promoted, how='left', on='ad_id')
test = p.merge(test, promoted, how='left', on='ad_id')
del promoted

print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1409220   26722   False      1049648       0       0       1   
1     1409220   39675   False      1049648       0       0       1   
2     1409220  129490    True      1049648       0       0       1   
3     1409220  144096   False      1049648       0       0       1   
4     1409220  179041   False      1049648       0       0       1   

   ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0          840223         4035           1299       160   0.428540   
1          700044         2491           1299       198   0.323995   
2         1060089         9756           1593       227   0.334232   
3         1333780          606             90       151   0.076847   
4         1351158        21726           3190       138   0.065299   

   category_id  confi_cat  
0         1403   0.920000  
1         1403   0.920000  
2         1403   0.920000  
3         1210   0.447978  
4         1403   0

In [20]:
# Left-join the display document's topic/category onto train and test by document_id.
# Columns that clash with the ad-side ones already present get the '_ad' (existing)
# and '_doc' (newly merged) suffixes.
train = p.merge(train, topics_categories, how='left', on='document_id', suffixes=('_ad', '_doc'))
test = p.merge(test, topics_categories, how='left', on='document_id', suffixes=('_ad', '_doc'))
del topics_categories

print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1409220   26722   False      1049648       0       0       1   
1     1409220   39675   False      1049648       0       0       1   
2     1409220  129490    True      1049648       0       0       1   
3     1409220  144096   False      1049648       0       0       1   
4     1409220  179041   False      1049648       0       0       1   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0          840223         4035           1299          160      0.428540   
1          700044         2491           1299          198      0.323995   
2         1060089         9756           1593          227      0.334232   
3         1333780          606             90          151      0.076847   
4         1351158        21726           3190          138      0.065299   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1403      0.920000         2

#### Impute missing topics/categories with value -1 (assign confidence 0)

In [21]:
# Summarise missing values in train and show only the columns that have any.
# NOTE(review): the meaning of the second argument (1) is defined in functions.py — confirm.
missing_values = missing_values_table(train, 1)
has_missing = missing_values["% of Total Values"] != 0
print(missing_values.loc[has_missing])

                 Missing Values  % of Total Values
topic_id_doc              31587           2.409033
confi_top_doc             31587           2.409033
category_id_doc           31587           2.409033
confi_cat_doc             31587           2.409033


In [22]:
# Missing document-side confidences become 0 first, so that the blanket fill
# below only assigns -1 to the values that are still missing after this step.
for frame in (train, test):
    for confidence_col in ("confi_top_doc", "confi_cat_doc"):
        frame[confidence_col] = frame[confidence_col].fillna(0)

train = train.fillna(-1)
test = test.fillna(-1)

#### Add boolean for identity between ad and document topic & ad and document category

In [None]:
# Apply shared_subjects to train, then to test. The return values are discarded,
# so the helper presumably modifies each frame in place — confirm in functions.py.
for frame in (train, test):
    shared_subjects(frame)

In [47]:
# Preview train after data preparation.
# NOTE(review): the saved output of this cell carries execution count 47 and already
# shows columns (score_*, weekend, ...) that are only merged in section 2, so it was
# produced by an out-of-order run; re-execute with Restart & Run All to refresh it.
print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1409220   26722   False      1049648       0       0       1   
1     1409220   39675   False      1049648       0       0       1   
2     1409220  129490    True      1049648       0       0       1   
3     1409220  144096   False      1049648       0       0       1   
4     1409220  179041   False      1049648       0       0       1   

   ad_document_id  campaign_id  advertiser_id      ...        \
0          840223         4035           1299      ...         
1          700044         2491           1299      ...         
2         1060089         9756           1593      ...         
3         1333780          606             90      ...         
4         1351158        21726           3190      ...         

   score_docXad_doc  score_docXadv  score_docXcamp  weekend  morning  noon  \
0          1.100718       1.514402        1.106262        0        0     0   
1          1.673785       1.514402    

In [23]:
# Export the result of data preparation and cleaning (row index not written).
for frame, csv_name in ((train, 'train_prep.csv'), (test, 'test_prep.csv')):
    frame.to_csv(tables_path + csv_name, index=False)

## 2. Feature Engineering
### 2.1. Click Through Rate
### See the feature engineering section in the documentation

In [24]:
skip_computation = True

if (skip_computation):
    zip_ref = zipfile.ZipFile(ctr_tables, 'r')
    zip_ref.extractall(tables_path)
    zip_ref.close()

else:
    ctr_features = features_path + "ctr_features.py"
    %run $ctr_features

### 2.2 topics & categories correlations
#### 1. Build dictionaries holding, for each pair (topic, topic) or (category, category), the strength of their correlation (more in the documentation)
#### 2. confidence_cut_category, confidence_cut_topic, score_cut_category and score_cut_topic determine the minimum thresholds for confidence level and correlation score

In [26]:
skip_computation = True

if (not skip_computation):
    dictionaries_prep = features_path + "dictionaries_prep.py"
    %run $dictionaries_prep
    
    #threshold confidence level to keep
    confidence_cut_category = 0.5  
    confidence_cut_topic = 0.1
    
    #which portion of top scores to take, takes 1/parameter highest scores
    score_cut_category = 4
    score_cut_topic = 4
    
    category_dict_name, topic_dict_name = create_dicts(confidence_cut_category, confidence_cut_topic, 
                                                      score_cut_category, score_cut_topic)

else:    
    topic_dict_name = default_topic_dict_name
    category_dict_name = default_category_dict_name


#load dictionaries for correlation of topics and categories
with open(dicts_path + topic_dict_name, 'rb') as handle:
    top_dict = pickle.load(handle, encoding='iso-8859-1')
    
with open(dicts_path + category_dict_name, 'rb') as handle:
    cat_dict = pickle.load(handle, encoding='iso-8859-1')

#### 2. Fill NA values with median correlation
#### 3. Merge correlations values with train & test

In [27]:
# Rebind train and test to the frames returned by correlations(), which receives
# both tables plus the topic and category correlation dictionaries loaded above.
# NOTE(review): train/test are overwritten, so re-running this cell feeds the
# already-processed frames back in — confirm the helper tolerates that.
train, test = correlations(train, test, top_dict, cat_dict)

### 2.3. Click Time
### Requires packages: pygeocoder, geopy, pycountry, pytz, tzwhere, shapely
#### 1. Parse geo_location to country and state
#### 2. For each location compute offset from UTC
#### 3. Fill NA with offset 0
#### 4. Add offset to timestamp
#### 5. Convert timestamp to one-hot of: { morning, noon, evening, night, weekend }

In [28]:
# Preview train before the click-time features are merged in.
print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1409220   26722   False      1049648       0       0       1   
1     1409220   39675   False      1049648       0       0       1   
2     1409220  129490    True      1049648       0       0       1   
3     1409220  144096   False      1049648       0       0       1   
4     1409220  179041   False      1049648       0       0       1   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0          840223         4035           1299          160      0.428540   
1          700044         2491           1299          198      0.323995   
2         1060089         9756           1593          227      0.334232   
3         1333780          606             90          151      0.076847   
4         1351158        21726           3190          138      0.065299   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1403      0.920000         2

In [29]:
skip_computation = True

if (not skip_computation):
    timezone = features_path + "timezone.py"
    %run $timezone
    
time_table = p.read_csv(tables_path + "time_table.csv")
print (time_table.head())

   display_id  weekend  morning  noon  evening  night
0           1        0        0     0        1      0
1           2        0        0     1        0      0
2           3        0        0     0        1      0
3           4        0        0     0        1      0
4           5        0        1     0        0      0


### Merge features with train & test

In [30]:
# Rebind train and test to the frames returned by merge_ctrs_and_time().
# NOTE(review): only train and test are passed in, so the helper presumably reads
# the CTR tables and the time table itself (e.g. from tables_path) — confirm.
train, test = merge_ctrs_and_time(train, test)

In [46]:
# Preview train with the CTR and click-time features merged in.
print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0     1409220   26722   False      1049648       0       0       1   
1     1409220   39675   False      1049648       0       0       1   
2     1409220  129490    True      1049648       0       0       1   
3     1409220  144096   False      1049648       0       0       1   
4     1409220  179041   False      1049648       0       0       1   

   ad_document_id  campaign_id  advertiser_id      ...        \
0          840223         4035           1299      ...         
1          700044         2491           1299      ...         
2         1060089         9756           1593      ...         
3         1333780          606             90      ...         
4         1351158        21726           3190      ...         

   score_docXad_doc  score_docXadv  score_docXcamp  weekend  morning  noon  \
0          1.100718       1.514402        1.106262        0        0     0   
1          1.673785       1.514402    

In [34]:
# Save the final train & test feature tables (row index not written).
for frame, csv_name in ((train, 'train_features.csv'), (test, 'test_features.csv')):
    frame.to_csv(tables_path + csv_name, index=False)

In [None]:
# Load the final train & test feature tables - for repeated runs.
# The files are read from tables_path, the directory the save cell writes
# train_features.csv / test_features.csv to. The previous `path_b` prefix is
# not defined anywhere in this notebook.
train = p.read_csv(tables_path + 'train_features.csv')
test = p.read_csv(tables_path + 'test_features.csv')

### All pairs of display documents and ad attributes have an extremely high rate of nulls
### Impute missing values with mean values or median

In [35]:
# Summarise missing values in test and show only the columns that have any.
# NOTE(review): the meaning of the second argument (0) is defined in functions.py — confirm.
missing_values = missing_values_table(test, 0)
has_missing = missing_values["% of Total Values"] != 0
print(missing_values.loc[has_missing])

                  Missing Values  % of Total Values
score_ad                   16561           3.820476
score_ad_doc                8684           2.003322
score_adv                   1052           0.242687
score_camp                  4266           0.984128
score_docXad              264438          61.003507
score_docXad_doc          241571          55.728292
score_docXadv             203941          47.047384
score_docXcamp            237292          54.741165


In [36]:
# Imputation strategy passed to fill_na: choose "median" or "mean".
# NOTE(review): only test is passed to fill_na in this cell; confirm that train
# is imputed elsewhere before it is used for model fitting.
fill = "median"
fill_na(test,fill)

## 3. Model Selection
### 1. Split train to train and validation set

In [53]:
# Split train into a training part and a validation part.
# NOTE(review): train is rebound to the training part, so re-running this cell
# splits the already-reduced frame again.
train, validation = train_validation_split(train)

### 2. set predictors for model comparison

In [None]:
# Identifier, label and raw topic/category columns that must not be fed to the models.
excluded_columns = {
    'display_id', 'ad_id', 'clicked', 'document_id', 'ad_document_id',
    'campaign_id', 'advertiser_id', 'confi_top_ad', 'topic_id_ad', 'topic_id_doc',
    'category_id_ad', 'confi_cat_ad', 'confi_top_doc', 'category_id_doc', 'confi_cat_doc',
}

# Every remaining column, in its original order, is a candidate predictor.
predictors = [column for column in train.columns if column not in excluded_columns]

In [102]:
# Preview the candidate predictor columns only.
train.loc[:, predictors].head()

Unnamed: 0,plat_1,plat_2,plat_3,cor_top,cor_cat,score_ad,score_ad_doc,score_adv,score_camp,score_docXad,score_docXad_doc,score_docXadv,score_docXcamp,weekend,morning,noon,evening,night,same_topic,same_category
0,0,0,1,0.009336,1.056498,1.077518,1.059151,1.147862,1.060096,1.200817,1.100718,1.514402,1.106262,0,0,0,1,0,False,False
1,0,0,1,0.006586,1.056498,1.146762,1.139692,1.147862,1.140961,1.662041,1.673785,1.514402,1.678247,0,0,0,1,0,False,False
2,0,0,1,0.007801,1.056498,0.719273,0.662975,0.626853,0.72116,0.991616,0.999217,1.023067,1.002104,0,0,0,1,0,False,False
3,0,0,1,0.001573,0.493635,1.350336,1.350924,1.568107,1.302799,1.560653,1.575921,1.623828,1.58172,0,0,0,1,0,False,False
4,0,0,1,0.001369,0.505777,1.287411,1.215748,1.085697,1.218249,1.219842,1.232581,1.146237,1.237421,0,0,0,1,0,False,False


#### Run Logistic Regression with different combinations of features to get the best predictors
#### Logistic Regression was chosen because this computation takes a long time and it is the fastest model

In [109]:
# Replace the candidate list built above with the predictors chosen by
# feature_selection() from the train and validation sets.
predictors = feature_selection(train, validation)

best score is: 0.74708156797344183


In [110]:
# Show the predictors kept by feature selection.
print(predictors)

['plat_1', 'plat_2', 'plat_3', 'cor_cat', 'score_ad', 'score_ad_doc', 'score_adv', 'score_camp', 'score_docXad', 'score_docXad_doc', 'score_docXadv', 'score_docXcamp', 'same_topic', 'same_category']


### 3. Compare between Random Forest, Gradient Boost, Logistic Regression
#### Parameters Tuning - for each model run grid search on 2 selected parameters to get best values
#### Evaluate model by computing MAP @12 score on prediction made on validation

In [72]:
# Random Forest: candidate values for the two tuned parameters.
min_sample_split, n_estimators = [2, 4], [10, 25]
forest = grid_search(
    "randomforest", min_sample_split, n_estimators,
    train, validation, predictors,
)

parameters: min_samples_split = 2, n_estimators = 10, score: 0.78980125798627487
parameters: min_samples_split = 4, n_estimators = 10, score: 0.79377628264884736
parameters: min_samples_split = 2, n_estimators = 25, score: 0.80211288953850501
parameters: min_samples_split = 4, n_estimators = 25, score: 0.80561643390478666
best parameters: min_samples_split = 4, n_estimators = 25, score: 0.80561643390478666


In [62]:
# Gradient Boosting: candidate values for the two tuned parameters.
learning_rate, max_depth = [0.05, 0.1], [2, 4]
gradient = grid_search(
    "gradient", learning_rate, max_depth,
    train, validation, predictors,
)

parameters: learning_rate = 0.05, max_depth = 2, score: 0.82479658610921103
parameters: learning_rate = 0.1, max_depth = 2, score: 0.83340222588290447
parameters: learning_rate = 0.05, max_depth = 4, score: 0.83893429556470611
parameters: learning_rate = 0.1, max_depth = 4, score: 0.84261499589162014
best parameters: learning_rate = 0.1, max_depth = 4, score: 0.84261499589162014


In [74]:
# Logistic Regression: candidate values for the two tuned parameters.
C, solver = [1e-5, 1e-9], ["sag", "lbfgs"]
logistic = grid_search(
    "logistic", C, solver,
    train, validation, predictors,
)

parameters: C = 1e-05, solver = 'sag', score: 0.74706545940403568
parameters: C = 1e-09, solver = 'sag', score: 0.73966907846262897
parameters: C = 1e-05, solver = 'lbfgs', score: 0.74705311342018421
parameters: C = 1e-09, solver = 'lbfgs', score: 0.73966907846262897
best parameters: C = 1e-05, solver = 'sag', score: 0.74706545940403568


### 4. Make prediction on test with all models using selected features and optimal parameters found

In [119]:
# Refit each tuned model on train with the selected predictors and score it on test.
# The first element of each grid_search result is used as the estimator.
for tuned in (forest, gradient, logistic):
    estimator = tuned[0]
    estimator.fit(train[predictors], train["clicked"])
    predict(estimator, test, predictors)

MAP: 0.574430357937
PORTION: 0.348629257431
MAP: 0.617754417959
PORTION: 0.401143373997
MAP: 0.649552832968
PORTION: 0.445617652636
