## Uses the output of two notebooks, `Cutout Only Unclicked Ads.ipynb` and `Cutout Only Clicked Ads.ipynb`, which create two CSV files with the ads sorted in order of click likelihood. We load those two CSVs, concatenate them into a single pandas dataframe, and then use that order to rank the ads in the test CSV.

In [1]:
import pandas as pd
import numpy as np

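## Read the two pre-sorted ad lists (clicked ads and not-clicked ads). Note: the count, sum, and mean for each ad are not calculated in this notebook; presumably that happens in the two notebooks that produce these CSVs (to be confirmed). Interesting: the display_id is completely stripped here, so only ad_id remains.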

In [2]:
# Load the two ad orderings that were exported as CSV files.
clickedOrderPath = 'data/clickedOrder.csv'
notClickedOrderPath = 'data/notClickedOrder.csv'
dfClick = pd.read_csv(clickedOrderPath)
dfNotClick = pd.read_csv(notClickedOrderPath)

In [3]:
# Preview the first five rows of the clicked-ads ordering.
dfClick.head(n=5)

Unnamed: 0,ad_id
0,57191
1,117429
2,334650
3,8138
4,309026


## Concatenate the two lists (ignoring the original indexes and generating a new one), then call reset_index so that the index becomes a column we can query. The lower the index, the higher the likelihood that the ad will be clicked.

In [4]:
# Sanity check: no ad_id should appear in both lists. A non-empty result here
# means the two other notebooks produced inconsistent output.
set(dfNotClick['ad_id']) & set(dfClick['ad_id'])

set()

In [5]:
# Stack the clicked list on top of the not-clicked list under a fresh
# 0..n-1 index, then turn that position into a regular 'index' column so it
# can be merged on and sorted by later.
dfAll = pd.concat([dfClick, dfNotClick], ignore_index=True)
dfAll = dfAll.reset_index()

In [6]:
# Preview the first five rows of the combined ordering.
dfAll.head(n=5)

Unnamed: 0,index,ad_id
0,0,57191
1,1,117429
2,2,334650
3,3,8138
4,4,309026


## Read the test data

In [7]:
# Load the test rows that need a predicted ad ordering.
testPath = "data/clicks_test.csv"
dfTest = pd.read_csv(testPath)

## Add the ad likelihood rank (the `index` column of the combined list) to the test data

In [8]:
# Left join keeps every test row; an ad_id that is missing from dfAll gets
# NaN in the 'index' column.
dfTest = pd.merge(dfTest, dfAll, on='ad_id', how='left')

In [9]:
# Preview the first five test rows with their merged rank.
dfTest.head(n=5)

Unnamed: 0,display_id,ad_id,index
0,16874594,66758,225168.0
1,16874594,150083,224571.0
2,16874594,162754,58400.0
3,16874594,170392,15437.0
4,16874594,172888,18224.0


## Rearrange the table, sorting the ads within each display_id by likelihood. We sort ascending because the lower indexes are more likely to be clicked (ads without an index, i.e. NaN, end up last within their display_id).

In [None]:
# Order rows by display_id first, then by rank within each display_id.
# The sort is done in place, so dfTest itself is reordered.
sortKeys = ['display_id', 'index']
dfTest.sort_values(by=sortKeys, ascending=True, inplace=True)

## Now just rearrange the dataframe so that the ads are listed, as a space-separated string, after each display_id.

In [None]:
# Collapse each display_id's ad_ids, in their current row order, into one
# space-separated string, then turn display_id back into a regular column.
adsByDisplay = dfTest.groupby('display_id')['ad_id']
subm = adsByDisplay.apply(lambda ads: " ".join(str(ad) for ad in ads)).reset_index()

In [None]:
# Preview the first five rows of the submission table.
subm.head(n=5)

In [None]:
# Write the submission as a gzip-compressed CSV with a header row and
# without the dataframe index.
subm.to_csv(
    "data/submissionFinal.csv.gz",
    compression='gzip',
    index=False,
    header=True,
)