## Using Adveritiser ID as a proxy to Boost likelihood 

In [40]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

p = sns.color_palette()


## Where is our data?

In [41]:
dataDir = "data/"  # Directory where our data is stored

## Reads the training data from CSV file

In [42]:
dfTrain = pd.read_csv(dataDir + "promoted_content.csv")

In [43]:
dfTrain.head()

Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id
0,1,6614,1,7
1,2,471467,2,7
2,3,7692,3,7
3,4,471471,2,7
4,5,471472,2,7


In [44]:
dfTrain.shape

(559583, 4)

In [45]:
advertiserAdCounts = dfTrain.groupby('advertiser_id')['ad_id'].count()

In [46]:
totalAdCounts = sum(advertiserAdCounts)

In [47]:
advertiserAdCounts = advertiserAdCounts.reset_index()
advertiserAdCounts = advertiserAdCounts.rename(columns={'ad_id' : '# ads'})

In [48]:
advertiserAdCounts['likelihood'] = advertiserAdCounts['# ads'] / totalAdCounts
advertiserAdCounts.head()

Unnamed: 0,advertiser_id,# ads,likelihood
0,2,2,4e-06
1,3,12,2.1e-05
2,4,168,0.0003
3,5,45,8e-05
4,6,15,2.7e-05


## Now use this to boost the small click algorithm

In [49]:
ad_likelihood = dfTrain.merge(advertiserAdCounts, on='advertiser_id')

In [None]:
del dfTrain

dfTrain = pd.read_csv(dataDir + "clicks_train.csv", usecols=['ad_id','clicked'])

In [None]:
M = dfTrain.clicked.mean()
ad_likelihood2 = dfTrain.groupby('ad_id').clicked.agg(['count','sum','mean']).reset_index()
ad_likelihood2['likelihood2'] = (ad_likelihood2['sum'] + 12*M) / (12 + ad_likelihood2['count'])

In [55]:
ad_likelihood2 = ad_likelihood2[['ad_id', 'likelihood2']]

In [56]:
ad_likelihood = ad_likelihood.merge(ad_likelihood2, on='ad_id')

In [58]:
ad_likelihood['likelihood'] = ad_likelihood['likelihood'] + ad_likelihood['likelihood2']

## Read the test data

In [59]:
del dfTrain  # free up memory

In [60]:
dfTest = pd.read_csv(dataDir + "clicks_test.csv")

## Add the ad_likelihood to the test data

In [61]:
ad_likelihood = ad_likelihood.reset_index()  # Bring the ad_id back into the columns
dfTest = dfTest.merge(ad_likelihood, how='left', on='ad_id')

In [62]:
dfTest.head()

Unnamed: 0,display_id,ad_id,index,document_id,campaign_id,advertiser_id,# ads,likelihood,likelihood2
0,16874594,66758,216999.0,1051283.0,8949.0,555.0,310.0,0.066936,0.066382
1,16874594,150083,179225.0,1358132.0,19045.0,1913.0,2151.0,0.071735,0.067891
2,16874594,162754,262417.0,1292723.0,17770.0,2391.0,584.0,0.240063,0.239019
3,16874594,170392,251683.0,1083829.0,20943.0,1731.0,158.0,0.349633,0.349351
4,16874594,172888,176188.0,1433954.0,1384.0,16.0,1981.0,0.345707,0.342167


## Rearrange the table sorting the ads based on likelihood. This gives us the ad order.

In [63]:
dfTest.sort_values(['display_id','likelihood'], inplace=True, ascending=False)

## Now just combine rows of the dataframe so that the ads and listed after each display_id. 

In [64]:
subm = dfTest.groupby('display_id')['ad_id'].apply(lambda x: " ".join(map(str,x))).reset_index()

In [65]:
subm.head()

Unnamed: 0,display_id,ad_id
0,16874594,170392 172888 162754 150083 66758 180797
1,16874595,8846 143982 30609
2,16874596,289915 11430 289122 132820 57197 153260 173005...
3,16874597,305790 285834 143981 182039 155945 180965 3088...
4,16874598,145937 335632 67292 250082


In [66]:
subm.to_csv(dataDir + "submissionAdvertiser.csv", header=True, index=False)