# Outbrain Click Prediction

## 1. Data Preparation and Cleaning

### Import Python Libraries

In [86]:
import os
import pickle
import sys
import zipfile

import numpy as np
import pandas as p
from sklearn.linear_model import LogisticRegression

#### Get metadata and functions utils

In [None]:
path = os.getcwd() + "/"
utils_path = path + "utils/"
meta = utils_path + "meta.py"
functions = utils_path + "functions.py"
%run $meta
%run $functions

### Unzip source tables

In [3]:
# Guard flag: set to True only when the source tables still need extracting,
# so re-running the notebook does not unzip them a second time.
unzip = False
if unzip:
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(data_tables, 'r') as zip_ref:
        zip_ref.extractall(tables_path)

### Load Train & Test

In [4]:
# Read the train and test tables (in that order) from the tables directory.
train, test = (p.read_csv(tables_path + csv_name) for csv_name in ("train.csv", "test.csv"))

# Preview the first rows of the training table.
print(train.head())

   display_id   ad_id clicked
0           1   42337   False
1           1  139684   False
2           1  144739    True
3           1  156824   False
4           1  279295   False


#### Choose "fraction" of train & test for training  & predicting

In [5]:
# Size of the fraction to take from train and test; use fraction = 0 to keep the full tables.
fraction = 0.1

if fraction:
    train, test = fractioned(train, test, fraction)
    # Report the shapes of the reduced tables, train first.
    for subset in (train, test):
        print(subset.shape)

(1312040, 3)
(434169, 3)


### Prepare platform table
#### 1. Impute missing values with median
#### 2. Create one-hot

In [6]:
skip_computation = True

if (not skip_computation):
    platform_prep = features_path + "platform_prep.py"
    %run $platform_prep

platform = p.read_csv(tables_path + "platform.csv")
print (platform.head())

   display_id  document_id  plat_1  plat_2  plat_3
0           1       379743       0       0       1
1           2      1794259       0       1       0
2           3      1179111       0       1       0
3           4      1777797       0       1       0
4           5       252458       0       1       0


### Prepare topics & categories table
#### 1. Merge categories, topics tables
#### 2. For each document - leave only topic/category with the highest confidence level
#### 3. Impute missing documents with value -1 (assign confidence 0) and cast id's back to integer

In [7]:
skip_computation = True

if (not skip_computation):
    topics_categories_prep = features_path + "topics_categories_prep.py"
    %run $topics_categories_prep
    
topics_categories = p.read_csv(tables_path + "topics_categories.csv")
print (topics_categories.head())

   document_id  topic_id  confi_top  category_id  confi_cat
0      1595802       140   0.073113         1611   0.920000
1      1524246       113   0.196450         1807   0.920000
2      1617787       113   0.216892         1807   0.920000
3      1615583        89   0.316306         1305   0.920000
4      1615460       260   0.097964         1613   0.540646


### Prepare Promoted Content table
#### 1. Merge with topics_categories table
#### 2. Impute missing documents with -1 (assign confidence 0) and cast id's back to integer

In [8]:
skip_computation = True

if (not skip_computation):
    promoted_prep = features_path + "promoted_content_prep.py"
    %run $promoted_prep

promoted = p.read_csv(tables_path + "promoted_content_prep.csv")
print (promoted.head())

   ad_id  ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0      1            6614            1              7        -1   0.000000   
1      2          471467            2              7        89   0.122998   
2      3            7692            3              7        26   0.104300   
3      4          471471            2              7       168   0.076367   
4      5          471472            2              7       143   0.089059   

   category_id  confi_cat  
0         1209   0.925264  
1         1505   0.920000  
2         1209   0.920000  
3         1205   0.920000  
4         1608   0.920000  


### Merge all tables with train & test

In [9]:
# Left-join the platform columns onto every train/test row by display_id.
train = p.merge(train, platform, how='left', on='display_id')
test = p.merge(test, platform, how='left', on='display_id')

# The platform table is no longer needed once it has been merged.
del platform

print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3
0          35  180913   False      1271578       0       0       1
1          35   45024   False      1271578       0       0       1
2          35  289116    True      1271578       0       0       1
3          37  234713    True      1779285       0       1       0
4          37  235443   False      1779285       0       1       0


In [10]:
# Left-join the promoted-content columns onto every train/test row by ad_id.
train = p.merge(train, promoted, how='left', on='ad_id')
test = p.merge(test, promoted, how='left', on='ad_id')

# The promoted table is no longer needed once it has been merged.
del promoted

print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0          35  180913   False      1271578       0       0       1   
1          35   45024   False      1271578       0       0       1   
2          35  289116    True      1271578       0       0       1   
3          37  234713    True      1779285       0       1       0   
4          37  235443   False      1779285       0       1       0   

   ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0         1151028          674            769        16   0.172854   
1          923095         5762            551       136   0.464029   
2         1687165        28025           3899       125   0.041293   
3         1586431          245            232       163   0.369382   
4         1377696        11654            555       284   0.263319   

   category_id  confi_cat  
0         1702   0.920000  
1         1403   0.920000  
2         2100   0.356819  
3         1403   0.920000  
4         1206   0

In [11]:
# Left-join the display document's topic/category columns by document_id.
# Overlapping column names get '_ad' (already present, from the ad side) and '_doc' (newly merged).
train = p.merge(train, topics_categories, how='left', on='document_id', suffixes=('_ad', '_doc'))
test = p.merge(test, topics_categories, how='left', on='document_id', suffixes=('_ad', '_doc'))

# The topics/categories table is no longer needed once it has been merged.
del topics_categories

print(train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0          35  180913   False      1271578       0       0       1   
1          35   45024   False      1271578       0       0       1   
2          35  289116    True      1271578       0       0       1   
3          37  234713    True      1779285       0       1       0   
4          37  235443   False      1779285       0       1       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0         1151028          674            769           16      0.172854   
1          923095         5762            551          136      0.464029   
2         1687165        28025           3899          125      0.041293   
3         1586431          245            232          163      0.369382   
4         1377696        11654            555          284      0.263319   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1702      0.920000         1

#### Impute missing topics/categories with value -1 (assign confidence 0)

In [12]:
# Summarise missing values in train and show only the rows whose percentage is non-zero.
missing_values = missing_values_table(train,1)
has_missing = missing_values["% of Total Values"] != 0
print(missing_values[has_missing])

                 Missing Values  % of Total Values
topic_id_doc              31505           2.401223
confi_top_doc             31505           2.401223
category_id_doc           31505           2.401223
confi_cat_doc             31505           2.401223


In [13]:
# First give the document-side confidence columns a value of 0 where missing,
# then fill every remaining missing value (the missing ids) with -1.
train = train.fillna({'confi_top_doc': 0, 'confi_cat_doc': 0}).fillna(-1)
test = test.fillna({'confi_top_doc': 0, 'confi_cat_doc': 0}).fillna(-1)

#### Add boolean for identity between ad and document topic & ad and document category

In [316]:
# Checkpoint: write the prepared train/test tables (without the index) at the end of data preparation.
for prepared, csv_name in ((train, 'train_prep.csv'), (test, 'test_prep.csv')):
    prepared.to_csv(tables_path + csv_name, index=False)

## 2. Feature Engineering
### 2.1. Click Through Rate
### See documentation

In [None]:
skip_computation = True

if (skip_computation):
    zip_ref = zipfile.ZipFile(ctr_tables, 'r')
    zip_ref.extractall(tables_path)
    zip_ref.close()

else:
    ctr_features = features_path + "ctr_features.py"
    %run $ctr_features

### 2.2 topics & categories correlations
#### 1. Build dictionaries holding, for each pair (topic, topic) or (category, category), the strength of their correlation (more in the documentation)

In [14]:
skip_computation = True

if (not skip_computation):
    dictionaries_prep = features_path + "dictionaries_prep.py"
    %run $dictionaries_prep
    
    #threshold confidence level to keep
    confidence_cut_category = 0.5  
    confidence_cut_topic = 0.1
    
    #which portion of top scores to take, takes 1/parameter highest scores
    score_cut_category = 4
    score_cut_topic = 4
    
    category_dict_name, topic_dict_name = create_dicts(confidence_cut_category, confidence_cut_topic, 
                                                      score_cut_category, score_cut_topic)

else:    
    topic_dict_name = default_topic_dict_name
    category_dict_name = default_category_dict_name

Python3 = 0
if Python3:
    #load dictionaries we created into variables top_dict, cat_dict
    with open(dicts_path + topic_dict_name, 'rb') as handle:
        top_dict = pickle.load(handle, encoding='iso-8859-1')
    
    with open(dicts_path + category_dict_name, 'rb') as handle:
        cat_dict = pickle.load(handle, encoding='iso-8859-1')

        
#added because i couldnt load the dictionaries with 2.7... the dict files are different. 
#should ask amit which they are gonna use (i think it was 2.7, maybe was optional)
else:
    #load dictionaries for correlation of topics and categories
    with open(dicts_path + topic_dict_name, 'rb') as handle:
        top_dict = pickle.load(handle)
    with open(dicts_path + category_dict_name, 'rb') as handle:
        cat_dict = pickle.load(handle)

#### 2. Fill NA values with median correlation
#### 3. Merge correlations values with train & test

In [15]:
# Pass both tables and the loaded topic/category dictionaries to correlations() and rebind train/test to its result.
# NOTE(review): correlations() is defined outside this notebook; per the section headings it fills NA values with the
# median correlation and merges the correlation values - confirm against its definition.
train, test = correlations(train, test, top_dict, cat_dict)

In [16]:
# Preview the first rows of train after the correlations step.
print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0          35  180913   False      1271578       0       0       1   
1          35   45024   False      1271578       0       0       1   
2          35  289116    True      1271578       0       0       1   
3          37  234713    True      1779285       0       1       0   
4          37  235443   False      1779285       0       1       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0         1151028          674            769           16      0.172854   
1          923095         5762            551          136      0.464029   
2         1687165        28025           3899          125      0.041293   
3         1586431          245            232          163      0.369382   
4         1377696        11654            555          284      0.263319   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1702      0.920000         1

### 2.3. Click Time
### Requires packages: pygeocoder, geopy, pycountry, pytz, tzwhere, shapely
#### 1. Parse geo_location to country and state
#### 2. For each location compute offset from UTC
#### 3. Fill NA with offset 0
#### 4. Add offset to timestamp
#### 5. Convert timestamp to one-hot of: { morning, noon, evening, night, weekend }

In [17]:
skip_computation = True

if (not skip_computation):
    timezone = features_path + "timezone.py"
    %run $timezone
    
time_table = p.read_csv(tables_path + "time_table.csv")
print (time_table.head())

   display_id  weekend  morning  noon  evening  night
0           1        0        0     0        1      0
1           2        0        0     1        0      0
2           3        0        0     0        1      0
3           4        0        0     0        1      0
4           5        0        1     0        0      0


### Merge features with train & test

In [18]:
# Rebind train/test to the result of merge_ctrs_and_time().
# NOTE(review): the helper is defined outside this notebook; it presumably merges the CTR score tables and
# time_table into both frames - confirm against its definition.
train, test = merge_ctrs_and_time(train, test)

In [19]:
# Preview the first rows of train after the CTR and time features were merged.
print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0          35  180913   False      1271578       0       0       1   
1          35   45024   False      1271578       0       0       1   
2          35  289116    True      1271578       0       0       1   
3          37  234713    True      1779285       0       1       0   
4          37  235443   False      1779285       0       1       0   

   ad_document_id  campaign_id  advertiser_id  ...    score_camp  \
0         1151028          674            769  ...      0.168798   
1          923095         5762            551  ...      0.972884   
2         1687165        28025           3899  ...      1.802084   
3         1586431          245            232  ...      1.536780   
4         1377696        11654            555  ...      0.290885   

   score_docXad  score_docXad_doc  score_docXadv  score_docXcamp  weekend  \
0      0.333150          0.227551       0.230555        0.227915        0   
1      0.508195 

In [20]:
#add a feature flagging whether ad.topic == doc.topic, and likewise for category; takes quite some time, but adds to the score
#NOTE(review): the return value is discarded, so shared_subjects presumably adds the columns to the frame in place - confirm
shared_subjects(train)
shared_subjects(test)

In [29]:
# Preview the first rows of train after the shared-subject features were added.
# Written as a parenthesised call so the cell runs under both Python 2 and Python 3, like the other previews.
print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0          35  180913   False      1271578       0       0       1   
1          35   45024   False      1271578       0       0       1   
2          35  289116    True      1271578       0       0       1   
3          37  234713    True      1779285       0       1       0   
4          37  235443   False      1779285       0       1       0   

   ad_document_id  campaign_id  advertiser_id      ...        \
0         1151028          674            769      ...         
1          923095         5762            551      ...         
2         1687165        28025           3899      ...         
3         1586431          245            232      ...         
4         1377696        11654            555      ...         

   score_docXad_doc  score_docXadv  score_docXcamp  weekend  morning  noon  \
0          0.227551       0.230555        0.227915        0        0     0   
1          1.102761       1.444204    

In [None]:
#save: checkpoint the feature tables (without the index) in the tables directory
train_features_file = tables_path + 'train_features.csv'
test_features_file = tables_path + 'test_features.csv'
train.to_csv(train_features_file, index=False)
test.to_csv(test_features_file, index=False)

In [None]:
#load: restore the feature tables from the checkpoint written by the save cell.
#Reads from tables_path, the directory the save cell writes to; the previous `path_b` is not defined
#anywhere in this notebook.
train = p.read_csv(tables_path + 'train_features.csv')
test = p.read_csv(tables_path + 'test_features.csv')

### Impute missing values with mean values or median
### All pairs of display documents and ad attributes have an extremely high rate of nulls

In [23]:
# Summarise missing values in test and show only the rows whose percentage is non-zero.
missing_values = missing_values_table(test, 0)
has_missing = missing_values["% of Total Values"] != 0
print(missing_values[has_missing])

                  Missing Values  % of Total Values
score_ad                   16629           3.830075
score_ad_doc                8491           1.955690
score_adv                   1003           0.231016
score_camp                  4116           0.948018
score_docXad              263801          60.759981
score_docXad_doc          241025          55.514097
score_docXadv             203699          46.916984
score_docXcamp            236934          54.571837


In [24]:
#choose "median" or "mean" as the imputation strategy
#NOTE(review): only `test` is passed to fill_na here - confirm whether `train` needs the same imputation before fitting
#NOTE(review): the return value is discarded, so fill_na presumably modifies the frame in place - confirm
fill = "median"
fill_na(test,fill)

## 3. Model Selection
### 1. Feature Selection - Choose predictors for algorithm

In [25]:
# Columns that must not be used as model inputs: identifiers, the target, and the raw
# topic/category ids with their confidence levels.
non_predictors = [
    'display_id', 'ad_id', 'clicked', 'document_id', 'ad_document_id',
    'campaign_id', 'advertiser_id', 'confi_top_ad', 'topic_id_ad', 'topic_id_doc',
    'category_id_ad', 'confi_cat_ad', 'confi_top_doc', 'category_id_doc', 'confi_cat_doc',
]

# Every remaining column, in the table's column order, is a candidate predictor.
predictors = [column for column in train.columns if column not in non_predictors]

In [26]:
# Preview the first rows of the candidate predictor columns.
train.loc[:, predictors].head()

Unnamed: 0,plat_1,plat_2,plat_3,cor_top,cor_cat,score_ad,score_ad_doc,score_adv,score_camp,score_docXad,score_docXad_doc,score_docXadv,score_docXcamp,weekend,morning,noon,evening,night,same_topic,same_category
0,0,0,1,0.119042,1.241683,0.214123,0.182049,0.211121,0.168798,0.33315,0.227551,0.230555,0.227915,0,0,0,1,0,False,False
1,0,0,1,0.307566,1.227878,0.740417,0.888856,1.812682,0.972884,0.508195,1.102761,1.444204,1.109772,0,0,0,1,0,False,True
2,0,0,1,0.028369,0.475537,1.850501,1.801142,1.723684,1.802084,1.405909,1.41089,1.675267,1.415749,0,0,0,1,0,False,False
3,0,1,0,0.009743,0.454876,2.249686,1.993534,1.455647,1.53678,1.379369,1.393723,1.243846,1.250615,0,0,0,1,0,False,True
4,0,1,0,0.006974,0.450254,0.309719,0.278252,0.311351,0.290885,0.786492,0.956785,1.003753,0.811572,0,0,0,1,0,False,False


### 2. Build the model: Logistic Regression
#### 2.1. Parameter Tuning - use Grid Search to find optimal parameters

In [30]:
#hold out the last 1/5 of train for validation (using test for grid search and feature selection would not be sound)
#NOTE(review): `train` is rebound to the split result, so re-running this cell splits the already reduced train again
train, validation = train_validation_split(train)

In [None]:
#Fill in whichever values you want for each of the 3 algorithms below.
#XGBoost was left out because it has to be downloaded and installed separately, so I figured
#gradient boosting would also be OK and easier for them.
#Each algorithm uses the first list as the values for one specific parameter and the second list for another;
#I tried to pick the most crucial parameters I could find.
#NOTE: the current values are placeholders and have not been chosen carefully.
#TODO: revisit the parameter values together (maybe tomorrow - I'll consider staying home until
#around 13:00 so we can work on it).
#TODO: remove the prints from the scoring function for these 3 searches.

In [36]:
#Random Forest: grid search over the two value lists, scored on the validation split
#NOTE(review): judging by the assigned names, the first list holds min_samples_split candidates and the second
#min_samples_leaf candidates - confirm against grid_search
min_samples_split, min_samples_leaf = grid_search("randomforest", [3,4,5], [5,6,7], train, validation, predictors)

MAP: 0.708060382615
PORTION: 0.457830637216
MAP: 0.709378637148
PORTION: 0.464022019311
MAP: 0.474771379162
PORTION: 0.192342617887
MAP: 0.474771379162
PORTION: 0.192342617887
best parameters found were: min_samples_split = 4
min_samples_leaf = 0.3
with score: 0.70937863714807459


In [None]:
#Gradient Boosting: grid search over the two value lists, scored on the validation split
#NOTE(review): judging by the assigned names, the first list holds min-split candidates and the second
#max_depth candidates - confirm against grid_search
min_split, max_depth = grid_search("gradient", [4,5,6,7], [4,5,6], train, validation, predictors)

In [95]:
#Logistic Regression: grid search over candidate C values (first list) and solvers (second list);
#the chosen pair is kept in `c` and `solv` for the following cells
c, solv = grid_search("logistic", [10**-10,10**-9,10**-8,10**-7], ["sag", "lbfgs"], train, validation, predictors)

best parameters found were: C = 1e-09
solver = lbfgs
with score: 0.72571011396815366


In [82]:
#select features: replace the predictor list with the one returned by feature_selection,
#which is given the train/validation split and the logistic-regression parameters found above
predictors = feature_selection(train, validation, c, solv)

MAP: 0.742291056817
MAP: 0.727769617711
MAP: 0.730624010758
MAP: 0.713256385703
MAP: 0.726183288120
MAP: 0.708945173755
MAP: 0.712061943140
MAP: 0.692013951598
MAP: 0.725336820292
MAP: 0.708113152617
MAP: 0.711265807484
MAP: 0.691081226923
MAP: 0.706149258694
MAP: 0.685727612185
MAP: 0.689378604083
MAP: 0.663771905123
MAP: 0.758348783374
MAP: 0.743252243744
MAP: 0.746172630649
MAP: 0.726931521595
MAP: 0.741862994988
MAP: 0.721934934556
MAP: 0.725624256893
MAP: 0.700319575194
MAP: 0.740290280474
MAP: 0.720542228230
MAP: 0.724217723133
MAP: 0.699127821298
MAP: 0.718351772192
MAP: 0.692241470745
MAP: 0.697374858448
MAP: 0.663674282188
MAP: 0.753662725691
MAP: 0.739020523862
MAP: 0.741985716669
MAP: 0.723021783381
MAP: 0.737544915643
MAP: 0.718213261983
MAP: 0.721787057836
MAP: 0.698550447557
MAP: 0.736271521883
MAP: 0.717100741477
MAP: 0.720469375527
MAP: 0.697439518304
MAP: 0.714997765833
MAP: 0.691794955236
MAP: 0.695375100421
MAP: 0.664433043011
MAP: 0.775599580725
MAP: 0.759471572552


['cor_top',
 'cor_cat',
 'score_ad',
 'score_ad_doc',
 'score_adv',
 'score_camp',
 'score_docXad_doc',
 'score_docXadv',
 'score_docXcamp',
 'same_category']

#### 2.2. Make prediction using selected features and parameters

In [43]:
# Fit a logistic regression on the selected predictors, using the C and solver chosen by the grid search.
# NOTE(review): the recorded output below shows C=1e-07, solver='sag', max_iter=200, which does not match the
# recorded grid-search result (C = 1e-09, lbfgs) - re-run the notebook top to bottom to refresh it.
alg = LogisticRegression(C=c, solver=solv)
alg.fit(X=train[predictors], y=train["clicked"])

LogisticRegression(C=1e-07, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [85]:
# Score the fitted model on the test table with the selected predictors.
# The recorded output shows predict printing MAP and PORTION and returning them as a tuple.
predict(alg, test, predictors)

MAP: 0.650306256631
PORTION: 0.444746011805


(0.65030625663144359, 0.44474601180458434)