## Using Adveritiser ID as a proxy to Boost likelihood 
## Try finding best model to predict itself

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

p = sns.color_palette()


## Where is our data?

In [2]:
dataDir = "data/"  # Directory where our data is stored

## Reads the training data from CSV file

In [3]:
dfTrain = pd.read_csv(dataDir + "promoted_content.csv")

In [4]:
dfTrain.head()

Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id
0,1,6614,1,7
1,2,471467,2,7
2,3,7692,3,7
3,4,471471,2,7
4,5,471472,2,7


In [5]:
dfTrain.shape

(559583, 4)

In [6]:
advertiserAdCounts = dfTrain.groupby('advertiser_id')['ad_id'].count()

In [7]:
totalAdCounts = sum(advertiserAdCounts)

In [8]:
advertiserAdCounts = advertiserAdCounts.reset_index()
advertiserAdCounts = advertiserAdCounts.rename(columns={'ad_id' : '# ads'})

In [9]:
advertiserAdCounts['likelihood'] = advertiserAdCounts['# ads'] / totalAdCounts
advertiserAdCounts.head()

Unnamed: 0,advertiser_id,# ads,likelihood
0,2,2,4e-06
1,3,12,2.1e-05
2,4,168,0.0003
3,5,45,8e-05
4,6,15,2.7e-05


## Now use this to boost the small click algorithm

In [10]:
ad_likelihood = dfTrain.merge(advertiserAdCounts, on='advertiser_id')

In [11]:
del dfTrain

dfTrain = pd.read_csv(dataDir + "clicks_train.csv")

In [12]:
M = dfTrain['clicked'].mean()
ad_likelihood2 = dfTrain.groupby('ad_id')['clicked'].agg(['count','sum','mean']).reset_index()
ad_likelihood2['likelihood2'] = (ad_likelihood2['sum'] + 12*M) / (12 + ad_likelihood2['count'])

In [13]:
ad_likelihood2 = ad_likelihood2[['ad_id', 'likelihood2']]

In [14]:
ad_likelihood = ad_likelihood.merge(ad_likelihood2, on='ad_id')

In [15]:
ad_likelihood.head()

Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id,# ads,likelihood,likelihood2
0,1,6614,1,7,95,0.00017,0.165982
1,2,471467,2,7,95,0.00017,0.097757
2,3,7692,3,7,95,0.00017,0.077016
3,4,471471,2,7,95,0.00017,0.143721
4,5,471472,2,7,95,0.00017,0.17875


In [16]:
ad_likelihood['likelihood'] = ad_likelihood['likelihood'] + ad_likelihood['likelihood2']

In [17]:
ad_likelihood.head()

Unnamed: 0,ad_id,document_id,campaign_id,advertiser_id,# ads,likelihood,likelihood2
0,1,6614,1,7,95,0.166152,0.165982
1,2,471467,2,7,95,0.097927,0.097757
2,3,7692,3,7,95,0.077186,0.077016
3,4,471471,2,7,95,0.143891,0.143721
4,5,471472,2,7,95,0.178919,0.17875


## Predict itself

In [18]:
dfResult = dfTrain.merge(ad_likelihood, how='left', on='ad_id')

## Rearrange the table sorting the ads based on likelihood. This gives us the ad order.

In [19]:
dfResult = dfResult.sort_values(['display_id'], ascending=False)

## Now just combine rows of the dataframe so that the ads and listed after each display_id. 

In [20]:
predictedAd = dfResult.groupby('display_id')['ad_id'].first().reset_index()

In [26]:
predictedAd.head()

Unnamed: 0,display_id,ad_id
0,1,156824
1,2,169564
2,3,250082
3,4,184709
4,5,326514


In [None]:
actualAd = dfTrain[dfTrain['clicked'] == 1]

In [25]:
actualAd.head()

Unnamed: 0,display_id,ad_id,clicked
2,1,144739,1
9,2,308455,1
14,3,228657,1
17,4,153623,1
25,5,326514,1


In [27]:
d1 = predictedAd['ad_id']
d2 = actualAd['ad_id']

In [31]:
print('Accuracy = {:.2f}'.format(100.0*sum(d1 == d2)/float(len(d1))))

Accuracy = 21.46
