# Outbrain Click Prediction

## 1. Data Preparation and Cleaning

### Import Python Libraries

In [341]:
import os
import numpy as np
import pandas as p
import pickle
import zipfile
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

#### Get metadata and functions utils

In [342]:
path = os.getcwd() + "/"
utils_path = path + "utils/"
meta = utils_path + "meta.py"
functions = utils_path + "functions.py"
%run $meta
%run $functions

### Unzip source tables

In [343]:
zip_ref = zipfile.ZipFile(data_tables, 'r')
zip_ref.extractall(tables_path)
zip_ref.close()

### Load Train & Test

In [344]:
train = p.read_csv(tables_path + "train.csv")
test = p.read_csv(tables_path + "test.csv")

print (train.head())

   display_id   ad_id clicked
0           1   42337   False
1           1  139684   False
2           1  144739    True
3           1  156824   False
4           1  279295   False


#### Choose "fraction" of train & test for training  & predicting

In [345]:
#set size of fraction to take from train and test, for full tables, fraction = 0
fraction = 0.1

if fraction:
    train, test = fractioned(train,test,fraction)
    print (train.shape)
    print (test.shape)

(1310024, 3)
(432622, 3)


### Prepare platform table
#### 1. Impute missing values with median
#### 2. Create one-hot

In [346]:
skip_computation = True

if (not skip_computation):
    platform_prep = features_path + "platform_prep.py"
    %run $platform_prep

platform = p.read_csv(tables_path + "platform.csv")
print (platform.head())

platform.csv created in C:\Users\Jon\Submission\ludast_outbrain-master/tables/ directory
   display_id  document_id  plat_1  plat_2  plat_3
0           1       379743       0       0       1
1           2      1794259       0       1       0
2           3      1179111       0       1       0
3           4      1777797       0       1       0
4           5       252458       0       1       0


### Prepare topics & categories table
#### 1. Merge categories, topics tables
#### 2. For each document - leave only topic/category with the highest confidence level
#### 3. Impute missing documents with value -1 (assign confidence 0) and cast id's back to integer

In [347]:
skip_computation = True

if (not skip_computation):
    topics_categories_prep = features_path + "topics_categories_prep.py"
    %run $topics_categories_prep
    
topics_categories = p.read_csv(tables_path + "topics_categories.csv")
print (topics_categories.head())

topics_categories.csv created in C:\Users\Jon\Submission\ludast_outbrain-master/tables/ directory
   document_id  topic_id  confi_top  category_id  confi_cat
0      1595802       140   0.073113         1611   0.920000
1      1524246       113   0.196450         1807   0.920000
2      1617787       113   0.216892         1807   0.920000
3      1615583        89   0.316306         1305   0.920000
4      1615460       260   0.097964         1613   0.540646


### Prepare Promoted Content table
#### 1. Merge with topics_categories table
#### 2. Impute missing documents with -1 (assign confidence 0) and cast id's back to integer

In [348]:
skip_computation = True

if (not skip_computation):
    promoted_prep = features_path + "promoted_content_prep.py"
    %run $promoted_prep

promoted = p.read_csv(tables_path + "promoted_content_prep.csv")
print (promoted.head())

promoted_content_prep.csv created in C:\Users\Jon\Submission\ludast_outbrain-master/tables/ directory
   ad_id  ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0      1            6614            1              7        -1   0.000000   
1      2          471467            2              7        89   0.122998   
2      3            7692            3              7        26   0.104300   
3      4          471471            2              7       168   0.076367   
4      5          471472            2              7       143   0.089059   

   category_id  confi_cat  
0         1209   0.925264  
1         1505   0.920000  
2         1209   0.920000  
3         1205   0.920000  
4         1608   0.920000  


### Merge all tables with train & test

In [349]:
train = train.merge(platform, how='left', on='display_id')
test = test.merge(platform, how='left', on='display_id')
del platform

print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3
0      313736  120162   False      1267760       0       1       0
1      313736  141421   False      1267760       0       1       0
2      313736  274113    True      1267760       0       1       0
3      313736  305965   False      1267760       0       1       0
4      150716   26724   False      1717049       1       0       0


In [350]:
train = train.merge(promoted,how = 'left',on = 'ad_id')
test = test.merge(promoted,how = 'left',on = 'ad_id')
del promoted

print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0      313736  120162   False      1267760       0       1       0   
1      313736  141421   False      1267760       0       1       0   
2      313736  274113    True      1267760       0       1       0   
3      313736  305965   False      1267760       0       1       0   
4      150716   26724   False      1717049       1       0       0   

   ad_document_id  campaign_id  advertiser_id  topic_id  confi_top  \
0         1207420        15369           2844       116   0.009824   
1         1312695        17582           2151        68   0.356705   
2         1128195        26261           1135       299   0.093640   
3            6146         9859           2023       254   0.044364   
4          840223         4035           1299       160   0.428540   

   category_id  confi_cat  
0         1607   0.306107  
1         1613   0.920000  
2         1607   0.504424  
3         1808   1.000000  
4         1403   0

In [351]:
train = train.merge(topics_categories, how='left', on='document_id',suffixes=('_ad', '_doc'))
test = test.merge(topics_categories, how='left', on='document_id',suffixes=('_ad', '_doc'))
del topics_categories

print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0      313736  120162   False      1267760       0       1       0   
1      313736  141421   False      1267760       0       1       0   
2      313736  274113    True      1267760       0       1       0   
3      313736  305965   False      1267760       0       1       0   
4      150716   26724   False      1717049       1       0       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0         1207420        15369           2844          116      0.009824   
1         1312695        17582           2151           68      0.356705   
2         1128195        26261           1135          299      0.093640   
3            6146         9859           2023          254      0.044364   
4          840223         4035           1299          160      0.428540   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1607      0.306107         1

#### Impute missing topics/categories with value -1 (assign confidence 0)

In [352]:
missing_values = missing_values_table(train)
print (missing_values[missing_values["% of Total Values"] != 0])

                 Missing Values  % of Total Values
topic_id_doc              31078           2.372323
confi_top_doc             31078           2.372323
category_id_doc           31078           2.372323
confi_cat_doc             31078           2.372323


In [353]:
train.confi_top_doc = train.confi_top_doc.fillna(0)
train.confi_cat_doc = train.confi_cat_doc.fillna(0)
test.confi_top_doc = test.confi_top_doc.fillna(0)
test.confi_cat_doc = test.confi_cat_doc.fillna(0)
train = train.fillna(-1)
test = test.fillna(-1)

In [316]:
#export end of data preparation and cleaning
train.to_csv(tables_path + 'train_prep.csv', index=False)
test.to_csv(tables_path + 'test_prep.csv', index=False)

## 2. Feature Engineering
### 2.1. Click Through Rate
### See documetation

In [354]:
skip_computation = True

if (skip_computation):
    zip_ref = zipfile.ZipFile(ctr_tables, 'r')
    zip_ref.extractall(tables_path)
    zip_ref.close()

else:
    ctr_features = features_path + "ctr_features.py"
    %run $ctr_features

ad_ctr = p.read_csv(tables_path + "ad_ctr.csv")
ad_document_ctr = p.read_csv(tables_path + "ad_document_ctr.csv")
advertiser_ctr = p.read_csv(tables_path + "advertiser_ctr.csv")
campaign_ctr = p.read_csv(tables_path + "campaign_ctr.csv")
document_on_ad_ctr = p.read_csv(tables_path + "document_on_ad_ctr.csv")
document_on_ad_document_ctr = p.read_csv(tables_path + "document_on_ad_document_ctr.csv")
document_on_advertiser_ctr = p.read_csv(tables_path + "document_on_advertiser_ctr.csv")
document_on_campaign_ctr = p.read_csv(tables_path + "document_on_campaign_ctr.csv")

ctr tables created in C:\Users\Jon\Submission\ludast_outbrain-master/tables/ directory


### 2.2 topics & categories correlations
#### 1. Build dictionaries holding for each pair (topic, topic) or (category, category) the strength of their correlation (more on documentaion)

In [356]:
skip_computation = True

if (not skip_computation):
    dictionaries_prep = features_path + "dictionaries_prep.py"
    %run $dictionaries_prep
    
    #threshold confidence level to keep
    confidence_cut_category = 0.5  
    confidence_cut_topic = 0.1
    
    #which portion of top scores to take, takes 1/parameter highest scores
    score_cut_category = 4
    score_cut_topic = 4
    
    category_dict_name, topic_dict_name = create_dicts(confidence_cut_category, confidence_cut_topic, 
                                                      score_cut_category, score_cut_topic)

else:    
    topic_dict_name = default_topic_dict_name
    category_dict_name = default_category_dict_name

#load dictionaries we created into variables top_dict, cat_dict
with open(dicts_path + topic_dict_name, 'rb') as handle:
    top_dict = pickle.load(handle, encoding='iso-8859-1')
    
with open(dicts_path + category_dict_name, 'rb') as handle:
    cat_dict = pickle.load(handle, encoding='iso-8859-1')

#### 2. Fill NA values with median correlation
#### 3. Merge correlations values with train & test

In [357]:
train, test = correlations(train, test, top_dict, cat_dict)

In [358]:
print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0      313736  120162   False      1267760       0       1       0   
1      313736  141421   False      1267760       0       1       0   
2      313736  274113    True      1267760       0       1       0   
3      313736  305965   False      1267760       0       1       0   
4      150716   26724   False      1717049       1       0       0   

   ad_document_id  campaign_id  advertiser_id  topic_id_ad  confi_top_ad  \
0         1207420        15369           2844          116      0.009824   
1         1312695        17582           2151           68      0.356705   
2         1128195        26261           1135          299      0.093640   
3            6146         9859           2023          254      0.044364   
4          840223         4035           1299          160      0.428540   

   category_id_ad  confi_cat_ad  topic_id_doc  confi_top_doc  category_id_doc  \
0            1607      0.306107         1

### 2.3. Click Time
### Requires packages: pygecoders, geopy, pycountry, pytz, tzwhere, shapley
#### 1. Parse geo_location to country and state
#### 2. For each location compute offset from UTC
#### 3. Fill NA with offset 0
#### 4. Add offset to timestamp
#### 5. Convert timestamp to one-hot of: { morning, noon, evening, night, weekend }

In [359]:
skip_computation = True

if (not skip_computation):
    timezone = features_path + "timezone.py"
    %run $timezone
    
time_table = p.read_csv(tables_path + "time_table.csv")
print (time_table.head())

   display_id  weekend  morning  noon  evening  night
0           1        0        0     0        1      0
1           2        0        0     1        0      0
2           3        0        0     0        1      0
3           4        0        0     0        1      0
4           5        0        1     0        0      0


### Merge features with train & test

In [360]:
train, test = merge_ctrs_and_time(train, test)

In [361]:
print (train.head())

   display_id   ad_id clicked  document_id  plat_1  plat_2  plat_3  \
0      313736  120162   False      1267760       0       1       0   
1      313736  141421   False      1267760       0       1       0   
2      313736  274113    True      1267760       0       1       0   
3      313736  305965   False      1267760       0       1       0   
4      150716   26724   False      1717049       1       0       0   

   ad_document_id  campaign_id  advertiser_id  ...    score_camp  \
0         1207420        15369           2844  ...      2.227240   
1         1312695        17582           2151  ...      0.141040   
2         1128195        26261           1135  ...      1.920571   
3            6146         9859           2023  ...      1.164667   
4          840223         4035           1299  ...      1.060096   

   score_docXad  score_docXad_doc  score_docXadv  score_docXcamp  weekend  \
0      1.491305          1.500558       1.394483        1.504073        0   
1      0.328429 

In [None]:
#save
train.to_csv(tables_path + 'train_features.csv', index=False)
test.to_csv(tables_path + 'test_features.csv', index=False)

In [None]:
#load
train = p.read_csv(path_b + 'train_features.csv')
test = p.read_csv(path_b + 'test_features.csv')

### Impute missing values with mean values or median
### all pairs of display documents and ad attributes has extremely high rate of nulls

In [362]:
missing_values = missing_values_table(test)
print (missing_values[missing_values["% of Total Values"] != 0])

                  Missing Values  % of Total Values
score_ad                   16500           3.813953
score_ad_doc                8534           1.972623
score_adv                   1078           0.249178
score_camp                  4194           0.969438
score_docXad              262169          60.600016
score_docXad_doc          239297          55.313183
score_docXadv             202002          46.692494
score_docXcamp            235026          54.325947


In [363]:
fill = "mean"

if (fill == "median"):
    test.score_ad = test.score_ad.fillna(test.score_ad.median())
    test.score_ad_doc = test.score_ad_doc.fillna(test.score_ad_doc.median())
    test.score_adv = test.score_adv.fillna(test.score_adv.median())
    test.score_camp = test.score_camp.fillna(test.score_camp.median())
    test.score_docXad = test.score_docXad.fillna(test.score_docXad.median())
    test.score_docXad_doc = test.score_docXad_doc.fillna(test.score_docXad_doc.median())
    test.score_docXadv = test.score_docXadv.fillna(test.score_docXadv.median())
    test.score_docXcamp = test.score_docXcamp.fillna(test.score_docXcamp.median())

else:
    test.score_ad = test.score_ad.fillna(test.score_ad.mean())
    test.score_ad_doc = test.score_ad_doc.fillna(test.score_ad_doc.mean())
    test.score_adv = test.score_adv.fillna(test.score_adv.mean())
    test.score_camp = test.score_camp.fillna(test.score_camp.mean())
    test.score_docXad = test.score_docXad.fillna(test.score_docXad.mean())
    test.score_docXad_doc = test.score_docXad_doc.fillna(test.score_docXad_doc.mean())
    test.score_docXadv = test.score_docXadv.fillna(test.score_docXadv.mean())
    test.score_docXcamp = test.score_docXcamp.fillna(test.score_docXcamp.mean())

## 3. Model Selection
### 1. Feature Selection - Choose predictors for algorithm

In [364]:
predictors= [x for x in train.columns if x not in 
             ['display_id','ad_id','clicked','document_id','ad_document_id',
              'campaign_id','advertiser_id','confi_top_ad','topic_id_ad','topic_id_doc',
              'category_id_ad','confi_cat_ad','confi_top_doc','category_id_doc','confi_cat_doc']]

In [365]:
train[predictors].head()

Unnamed: 0,plat_1,plat_2,plat_3,cor_top,cor_cat,score_ad,score_ad_doc,score_adv,score_camp,score_docXad,score_docXad_doc,score_docXadv,score_docXcamp,weekend,morning,noon,evening,night
0,0,1,0,0.00015,0.405004,2.232185,2.232317,2.63495,2.22724,1.491305,1.500558,1.394483,1.504073,0,0,0,0,1
1,0,1,0,0.005773,1.233541,0.152031,0.139929,0.171076,0.14104,0.328429,0.194719,0.105442,0.19594,0,0,0,0,1
2,0,1,0,0.001552,0.667392,2.020071,1.889954,2.059613,1.920571,1.031069,1.033336,1.570852,1.034197,0,0,0,0,1
3,0,1,0,0.000753,1.338185,1.122321,1.122554,1.165057,1.164667,0.58717,0.587997,0.590196,0.587962,0,0,0,0,1
4,1,0,0,0.01261,0.669201,0.982355,1.059151,1.147862,1.060096,0.785145,1.153472,1.145201,1.154159,0,0,0,1,0


### 2. Build the model: Logistic Regression
#### 2.1. Parameter Tuning - use Grid Search to find optimal parameters

In [None]:
#multi_class, fit_intercept, intercept_scaling, dual, random_state, solver, verbose, pentaly, class weights
alg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='sag', max_iter=75, multi_class='ovr', verbose=0, n_jobs=1)
log_params = {'C' : [10 ** i for i in range(-12,2)], 'solver' : ['lbfgs', 'sag']}
grid_log = GridSearchCV(alg, log_params, fit_params=None, n_jobs=-1, iid=True, refit=True, verbose=1, pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True)
grid_log.fit(train[predictors], train["clicked"])

In [326]:
alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs')
alg.fit(train[predictors], train["clicked"])

LogisticRegression(C=1e-10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
#multi_class, fit_intercept, intercept_scaling, dual, random_state, solver, verbose, pentaly, class weights
alg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='sag', max_iter=75, multi_class='ovr', verbose=1, n_jobs=4)
log_params = {'C' : [10 ** i for i in range(-12,2)], 'solver' : ['lbfgs', 'sag']}
grid_log = GridSearchCV(alg, log_params, fit_params=None, n_jobs=-1, iid=True, refit=True, verbose=1, pre_dispatch='2*n_jobs', error_score='raise', return_train_score=True)
grid_log.fit(train[predictors], train["clicked"])

Fitting 3 folds for each of 28 candidates, totalling 84 fits


  **self._backend_args)
  **self._backend_args)
  **self._backend_args)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    6.1s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

  **self._backend_args)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...


  **self._backend_args)


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    6.2s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    6.5s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    5.2s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...


[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    5.2s finished


rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...
rescaling...

In [188]:
#[i -> score_ad, j -> score_ad_doc, k->score_adv, l->score_camp, t->score_docXad,n->score_docXad_doc,m->score_docXadv,p->score_docXcamp]
i_l = ['','score_ad']
j_l = ['','score_ad_doc']
k_l = ['','score_adv']
l_l = ['','score_camp']
t_l = ['','score_docXad']
n_l = ['','score_docXad_doc']
m_l = ['','score_docXadv']
p_l = ['','score_docXcamp']
results = [0] * 256
for i in range(2):
    for j in range(2):
        for k in range(2):
            for l in range(2):
                for t in range(2):
                    for n in range(2):
                        for m in range(2):
                            for p in range(2):
                                predictors=[x for x in train.columns if x not in ['display_id','ad_id','clicked','document_id','platform','ad_document_id','campaign_id','advertiser_id','confi_top_ad','topic_id_ad','topic_id_doc','category_id_ad','confi_cat_ad','confi_top_doc','category_id_doc','confi_cat_doc']]
                                to_reduce = i_l[i] + j_l[j] + k_l[k] + l_l[l] + t_l[t] + n_l[n] + m_l[m] + p_l[p]
                                predictors = [x for x in predictors if x not in to_reduce]
                                alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs')
                                alg.fit(train[predictors], train["clicked"])
                                predY = list(alg.predict_proba(test[predictors]).astype(float)[:,1])
                                predict = np.asarray(predY)
                                test_copy = test.copy()
                                test_copy['predict'] = predict
                                results[i + j * 2 + k * 4 + l * 8 + t * 16 + n * 32 + m * 64 + p * 128] = score_map(test_copy)

MAP: 0.649737819877
MAP: 0.650101485384
MAP: 0.649530906309
MAP: 0.648415965659
MAP: 0.649764846759
MAP: 0.648138087874
MAP: 0.648183828897
MAP: 0.644473403284
MAP: 0.649984732469
MAP: 0.649957822527
MAP: 0.649530906309
MAP: 0.648415965659
MAP: 0.649764846759
MAP: 0.648138087874
MAP: 0.648183828897
MAP: 0.644473403284
MAP: 0.648364339188
MAP: 0.649261721798
MAP: 0.649435152663
MAP: 0.648558998070
MAP: 0.649458393875
MAP: 0.648269543077
MAP: 0.648202286499
MAP: 0.643961837954
MAP: 0.649194778412
MAP: 0.649657418206
MAP: 0.649435152663
MAP: 0.648558998070
MAP: 0.649458393875
MAP: 0.648269543077
MAP: 0.648202286499
MAP: 0.643961837954
MAP: 0.646508970320
MAP: 0.648178032597
MAP: 0.648959976844
MAP: 0.648811248611
MAP: 0.649307606911
MAP: 0.649078135810
MAP: 0.648830938672
MAP: 0.643811842843
MAP: 0.648075269005
MAP: 0.649584920950
MAP: 0.648959976844
MAP: 0.648811248611
MAP: 0.649307606911
MAP: 0.649078135810
MAP: 0.648830938672
MAP: 0.643811842843
MAP: 0.639095776162
MAP: 0.642051811385


#### 2.2. Make prediction using selected features and parameters

In [366]:
alg = LogisticRegression(C = 0.0000000001, solver = 'lbfgs')
alg.fit(train[predictors], train["clicked"])

LogisticRegression(C=1e-10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [367]:
predict(alg, test, predictors)

MAP: 0.650939766384
PORTION: 0.446047002216


(0.650939766383688, 0.44604700221619126)