## Got this online kernel from Kaggle. It scores 0.63714

In [1]:
import pandas as pd
import numpy as np

# Narrow column dtypes passed to the training read to keep memory use down.
# NOTE(review): ad_id is parsed as float32, which is exact only for integers up to 2**24 — confirm ids stay below that.
dtypes = dict(ad_id=np.float32, clicked=np.int8)

## Reads the training data and calculates the count, sum, and mean for each ad. Interesting: Completely strips the display_id here.

In [2]:
# Load only the two columns this notebook uses from the training clicks file.
train = pd.read_csv(
    "data/clicks_train.csv",
    usecols=['ad_id', 'clicked'],
    dtype=dtypes,
)

# Global mean of `clicked` over every training row.
M = train['clicked'].mean()

# Per-ad statistics of `clicked`: row count, sum and mean, with ad_id as a regular column.
ad_likelihood = (
    train.groupby('ad_id')['clicked']
         .agg(['count', 'sum', 'mean'])
         .reset_index()
)

# The raw rows are no longer needed once the aggregates exist; release the memory.
del train

## Possible replacement for likelihood

### Use Bayes Law as the measure. So likelihood of being clicked given ad = ad_id

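+ P(ad|+) = # clicked impressions of ad_id divided by the total number of clicked impressions
+ P(+) = clicks divided by (clicks plus not clicks)
+ P(ad|-) = # non-clicked impressions of ad_id divided by the total number of non-clicked impressions
+ P(-) = not clicks divided by (clicks plus not clicks), i.e. 1 - P(+)

$\huge P(+|{ad}) = \frac{P({ad}|+) \times P(+)}{ P({ad}|+) \times P(+) + P({ad}|-) \times P(-)}$

Note: the denominator is just P(ad), so this simplifies to (clicks on ad_id) / (impressions of ad_id), i.e. the `mean` column computed above.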

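## Good god, what is that thing? I've asked on the forum why the 12. Otherwise, it looks like the equation smooths each ad's click rate toward the global mean M, as if every ad had 12 extra impressions that were clicked at rate M. This handles the case where there are no clicks on an ad despite it being shown: instead of scoring 0, the ad is pulled toward M, and an ad with no impressions at all defaults to M exactly.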

In [3]:
# Smoothed click rate per ad: (clicks + 12*M) / (impressions + 12).
# The 12 acts as a number of pseudo-impressions at the global rate M, so ads with
# few rows stay close to M while ads with many rows stay close to their own mean.
ad_likelihood['likelihood'] = (
    ad_likelihood['sum'].add(12 * M)
    .div(ad_likelihood['count'].add(12))
)
ad_likelihood.head(n=5)

Unnamed: 0,ad_id,count,sum,mean,likelihood
0,1.0,2,0.0,0.0,0.165982
1,2.0,22,1.0,0.045455,0.097757
2,3.0,161,11.0,0.068323,0.077016
3,4.0,32,4.0,0.125,0.143721
4,5.0,1,0.0,0.0,0.17875


## Read the test data

In [5]:
# Load the test clicks file with all of its columns and default dtypes.
test = pd.read_csv(filepath_or_buffer="data/clicks_test.csv")

## Add the ad_likelihood to the test data

In [6]:
# Attach each ad's training statistics to the test rows.
# The join key is named explicitly instead of being inferred from whatever column
# names the two frames happen to share. how='left' keeps every test row (ads with
# no training rows get NaN statistics), and validate raises if ad_likelihood ever
# holds more than one row per ad_id, which would silently duplicate test rows.
test = test.merge(ad_likelihood, on='ad_id', how='left', validate='many_to_one')

In [7]:
# Preview the first five merged rows.
test.head(n=5)

Unnamed: 0,display_id,ad_id,count,sum,mean,likelihood
0,16874594,66758,5642.0,373.0,0.066111,0.067219
1,16874594,150083,77514.0,5261.0,0.067872,0.059845
2,16874594,162754,66599.0,15919.0,0.239028,0.229476
3,16874594,170392,15054.0,5261.0,0.349475,0.344313
4,16874594,172888,15081.0,5162.0,0.342285,0.33518


## Replace any NaNs with the global mean

In [8]:
# Test ads with no match in ad_likelihood have NaN likelihood after the left merge;
# give them the global rate M.
# The result is assigned back rather than calling fillna(inplace=True) on
# `test.likelihood`: that form mutates an intermediate Series and is not guaranteed
# to update `test` itself (it has no effect under pandas Copy-on-Write).
test['likelihood'] = test['likelihood'].fillna(M)

## Rearrange the table sorting the ads based on likelihood. This is a nice approach.

In [9]:
# Sort in place by display_id, then by likelihood, both descending, so that within
# each display the rows run from highest to lowest likelihood.
test.sort_values(
    by=['display_id', 'likelihood'],
    ascending=[False, False],
    inplace=True,
)

## Now just rearrange the dataframe so that the ads are listed after each display_id. Again, really elegant code.

In [16]:
# Collapse to one entry per display_id: its ad_ids joined into a single
# space-separated string. Rows inside each group keep the order they have in `test`.
subm = (
    test.groupby('display_id')['ad_id']
        .apply(lambda ad_ids: " ".join(str(ad) for ad in ad_ids))
)

In [17]:
# Preview the first five display_id -> ad list entries.
subm.head(n=5)

display_id
16874594             170392 172888 162754 66758 150083 180797
16874595                                    8846 143982 30609
16874596    289915 11430 289122 132820 57197 153260 173005...
16874597    305790 285834 143981 182039 180965 155945 3088...
16874598                           145937 335632 67292 250082
Name: ad_id, dtype: object

In [13]:
# Write the gzip-compressed submission with display_id as the first column.
# `subm` is a Series indexed by display_id, so index=False would drop the display_id
# and leave only the ad lists, with nothing tying a row to its display.
# header=True is passed explicitly rather than relying on the Series.to_csv default,
# so the file starts with a "display_id,ad_id" header row.
subm.to_csv("data/submission.csv.gz", index=True, header=True, compression='gzip')