# Preliminary Model
- Sorting ADs by Display based on click rate probabilities

### Food for Thought:
- how many times has an ad appeared?
- how many times has an ad been clicked?
- % of time the ad was clicked vs appeared
- the more an ad appears the higher chance it has of being clicked (as a baseline truth, not thinking about locale, user diversity, etc)
- can I generate a weight to rank each ad based on the above properties?
- if so we can use this to sort the ads and group by display_id
- This could provide a fair baseline based solely on click-by-show popularity, not taking into account the other data.

In [117]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
plt.style.use('ggplot')
import collections
import json
from IPython.display import display, HTML
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Prelim Model based on Mini Dataset

In [118]:
# Load the mini sample of the click log into memory; later cells read its
# display_id, ad_id and clicked columns.
mini_train = pd.read_csv("data_mini/clicks_train_mini.csv") #read mini DF into memory

In [119]:
# Independent (deep) copy of the mini frame to experiment on, so that
# mini_train itself stays untouched.
mini_train_copy = mini_train.copy(deep=True)

In [120]:
def df_info(df):
    """Print summary statistics for a clicks DataFrame and show its first rows.

    Prints the frame's shape, the number of distinct display_id and ad_id
    values, and the percentage of rows whose 'clicked' value equals 1, then
    renders the first 8 rows with IPython's display().

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'display_id', 'ad_id' and 'clicked'.
    """
    n_rows = df.shape[0]
    # Count clicks with a boolean mask rather than value_counts()[1], which
    # raises KeyError when no row was clicked.
    n_clicked = int((df['clicked'] == 1).sum())
    # Guard the division so an empty frame reports 0.0 instead of raising.
    click_pct = round(float(n_clicked) / n_rows * 100, 3) if n_rows else 0.0

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('temp_train:  {}'.format(df.shape))
    print('Num of unique display_id:  {}'.format(len(df['display_id'].unique())))
    print('Num of unique ad_id:  {}'.format(len(df['ad_id'].unique())))
    print('Click ratio: {}% of ads clicked.'.format(click_pct))
    display(df.head(8))

df_info(mini_train_copy)

temp_train:  (49999, 3)
Num of unique display_id:  9797
Num of unique ad_id:  10740
Click ratio: 19.592% of ads clicked.


Unnamed: 0,display_id,ad_id,clicked
0,1,42337,0
1,1,139684,0
2,1,144739,1
3,1,156824,0
4,1,279295,0
5,1,296965,0
6,2,125211,0
7,2,156535,0


Perhaps, to speed things up, we can add a `display_rank` column below that holds a value 1-n within each display, ranking the ads by their probability of being clicked. From that we could simply group by display_id and create another column with the ad_ids sorted by their rank and joined with a ' '. If we could do this, we could skip all the steps I was doing to go from a pandas DataFrame --> Python dict data structure, which is computationally slow for arithmetic operations. Where we can, we should make use of NumPy data structures/operations, so keep things in pandas if we can.

## Good idea. I agree. -Tony

In [121]:
# Tony, you are right, these transform statements are powerful!
# Group the rows by ad once, then broadcast each per-ad statistic back onto
# every row of that ad: how many rows the ad has, and the mean of 'clicked'.
ads_grouped = mini_train_copy.groupby('ad_id')
appearances = ads_grouped['display_id'].transform('count')
click_ratio = ads_grouped['clicked'].transform('mean')
mini_train_copy['ad_appeared'] = appearances
mini_train_copy['ad_click_ratio'] = click_ratio
#mini_train_copy['ads_in_display'] = mini_train_copy.groupby('display_id')['ad_id'].transform('count')

mini_train_copy

Unnamed: 0,display_id,ad_id,clicked,ad_appeared,ad_click_ratio
0,1,42337,0,5,0.200000
1,1,139684,0,2,0.000000
2,1,144739,1,120,0.308333
3,1,156824,0,51,0.019608
4,1,279295,0,3,0.333333
5,1,296965,0,1,0.000000
6,2,125211,0,2,0.500000
7,2,156535,0,1,0.000000
8,2,169564,0,9,0.000000
9,2,308455,1,2,0.500000


In [122]:
# Keep only the per-ad statistics and key the frame by ad_id. Nothing here
# de-duplicates rows, so an ad_id that occurs on several rows stays repeated
# in the index. Could possibly be rolled into the operation above, together
# with the to_dict() below.
mini_train_copy = (
    mini_train_copy
    .drop(['display_id', 'clicked'], axis=1)
    .set_index('ad_id')
)

mini_train_copy

Unnamed: 0_level_0,ad_appeared,ad_click_ratio
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1
42337,5,0.200000
139684,2,0.000000
144739,120,0.308333
156824,51,0.019608
279295,3,0.333333
296965,1,0.000000
125211,2,0.500000
156535,1,0.000000
169564,9,0.000000
308455,2,0.500000


In [123]:
# %%time
# CPU times: user 3.49 s, sys: 134 ms, total: 3.63 s
# Wall time: 3.54 s

# Build {ad_id: {'ad_appeared': ..., 'ad_click_ratio': ...}} from the columns
# of the mini_train_copy DF.
# The frame is indexed by ad_id with one row per appearance, so the index is
# not unique. to_dict(orient='index') needs unique keys: current pandas raises
# ValueError on a duplicated index, while older versions silently overwrote
# earlier rows. The statistics are per-ad values, so every repeat of an ad_id
# carries the same numbers and keeping the first row per ad loses nothing.
unique_ads = mini_train_copy[~mini_train_copy.index.duplicated(keep='first')]
ad_count_dict = unique_ads.to_dict(orient='index')
display(ad_count_dict)

{163843: {'ad_appeared': 33.0, 'ad_click_ratio': 0.060606060606060608},
 98310: {'ad_appeared': 3.0, 'ad_click_ratio': 0.0},
 125480: {'ad_appeared': 1.0, 'ad_click_ratio': 0.0},
 317519: {'ad_appeared': 1.0, 'ad_click_ratio': 0.0},
 98315: {'ad_appeared': 4.0, 'ad_click_ratio': 0.0},
 32780: {'ad_appeared': 8.0, 'ad_click_ratio': 0.25},
 327696: {'ad_appeared': 1.0, 'ad_click_ratio': 0.0},
 98322: {'ad_appeared': 1.0, 'ad_click_ratio': 1.0},
 98307: {'ad_appeared': 20.0, 'ad_click_ratio': 0.050000000000000003},
 98324: {'ad_appeared': 5.0, 'ad_click_ratio': 0.0},
 98325: {'ad_appeared': 1.0, 'ad_click_ratio': 0.0},
 98326: {'ad_appeared': 13.0, 'ad_click_ratio': 0.076923076923076927},
 98327: {'ad_appeared': 1.0, 'ad_click_ratio': 0.0},
 98328: {'ad_appeared': 9.0, 'ad_click_ratio': 0.0},
 327710: {'ad_appeared': 2.0, 'ad_click_ratio': 0.0},
 229407: {'ad_appeared': 1.0, 'ad_click_ratio': 0.0},
 327712: {'ad_appeared': 3.0, 'ad_click_ratio': 0.33333333333333331},
 327713: {'ad_appeare

In [124]:
# %%time
# CPU times: user 7.1 s, sys: 16.2 ms, total: 7.12 s
# Wall time: 7.12 s
    
# Map every display_id to the list of ad_ids shown in it, in row order.
# One groupby pass replaces the original per-display boolean scan of the
# whole frame. It also avoids a loop variable named `display`, which rebound
# (and so broke) IPython's display() function for the rest of the session.
display_list = mini_train['display_id'].unique().tolist()
ads_by_display = (
    mini_train
    .groupby('display_id', sort=False)['ad_id']  # sort=False: keep first-seen order
    .apply(lambda ads: ads.tolist())
)
# .tolist() on both sides yields plain Python ints/lists as keys and values.
display_dict = dict(zip(ads_by_display.index.tolist(), ads_by_display.tolist()))
display_dict

{1: [42337, 139684, 144739, 156824, 279295, 296965],
 2: [125211, 156535, 169564, 308455],
 3: [71547, 95814, 152141, 183846, 228657, 250082],
 4: [149930, 153623, 184709, 186849, 233730],
 5: [54606, 162868, 184548, 282674, 326514],
 6: [85397, 160754, 173388, 175234, 180923, 234445],
 7: [105766, 215967, 300808],
 8: [95724, 175694, 280430, 329774],
 9: [19959, 104208, 140940, 151028],
 10: [241662, 332908],
 11: [51955, 96116, 139059, 149541, 156477, 159431],
 12: [35982, 41584, 75426, 97178, 105789, 211592],
 13: [121703, 143294, 149541],
 14: [98270, 143467, 224171, 288396],
 15: [52086, 153016, 191589, 335983],
 16: [104208, 172888, 235104, 273567],
 17: [27309, 37191, 81652, 250080],
 18: [130833, 164299, 232272, 285989],
 19: [39872, 111036],
 20: [117567, 151926, 177068, 282350],
 21: [5558, 28337, 89256, 137128],
 22: [171594, 225435, 232272, 288069],
 23: [105664, 192854, 213690, 223729, 224786, 284367, 284504, 294332],
 24: [48185, 64255, 80301, 87029, 137247, 193650],
 25:

In [126]:
# %%time
# CPU times: user 161 ms, sys: 22.3 ms, total: 184 ms
# Wall time: 167 ms

# For each display_id, order its ad_ids by click ratio (highest first; ties
# broken by the larger ad_id first) and store the pair
# (display_id, "ad ad ad ...") under that display_id.

display_dict_sorted = {}
for display_id in display_dict:
    ad_ids = display_dict[display_id]
    ranked = sorted(
        ad_ids,
        key=lambda ad: (ad_count_dict[ad]['ad_click_ratio'], ad),
        reverse=True,
    )
    # ad_ids become strings so they can be joined with a single space
    display_dict_sorted[display_id] = (display_id, ' '.join(str(ad) for ad in ranked))

display_dict_sorted

CPU times: user 161 ms, sys: 22.3 ms, total: 184 ms
Wall time: 167 ms


In [127]:
# %%time
# CPU times: user 24.5 ms, sys: 7.49 ms, total: 32 ms
# Wall time: 29.5 ms


# Build the submission file: dict --> DF --> CSV. Each dict value is a
# (display_id, "space separated ad_ids") pair, which becomes one row.
# (We can probably skip all of these data structure transformations...)
submit_df = pd.DataFrame.from_dict(display_dict_sorted, orient='index')
submit_df.columns = ['display_id', 'ad_id']
submit_df = submit_df.set_index('display_id')
# display(submit_df.head())
submit_df.to_csv('csv_submissions/prelim_model_v1.csv')

CPU times: user 24.5 ms, sys: 7.49 ms, total: 32 ms
Wall time: 29.5 ms


In [31]:
# Creates a dict via pandas groupby with (ad_id, clicked) as key and the row
# count as value. Great idea Tony!
# Counting only the 'display_id' column yields a single Series, so no
# ['display_id'] lookup on the to_dict() result is needed any more.
# unstack(fill_value=0) spreads the 'clicked' values into columns and fills
# the ad/outcome combinations that have no rows with 0; stack() folds them
# back. An ad that was never clicked (or always clicked) therefore still gets
# a key with a zero count. A 'clicked' value that occurs nowhere in the whole
# frame still produces no keys.
ad_count_dict = (
    mini_train_copy
    .groupby(['ad_id', 'clicked'])['display_id'].count()
    .unstack(fill_value=0)
    .stack()
    .to_dict()
)
# sample output: {(ad_id, clicked) : count}

### Kyle: The line below completely replaces the for loop and executes in seconds, not hours.

In [9]:
ad_id = 347571
# ad_count_dict is keyed by (ad_id, clicked). An ad that was never clicked
# (or always clicked) may have no key for the missing outcome, so fall back
# to 0 instead of raising KeyError.
clicked_count = ad_count_dict.get((ad_id, 1), 0)
not_clicked_count = ad_count_dict.get((ad_id, 0), 0)
print ('Ad id #{} was clicked {} times and not clicked {} times'.format(ad_id,
                                                                        clicked_count,
                                                                        not_clicked_count))

Ad id #347571 was clicked 4 times and not clicked 8 times


### Kyle: Note that we get the same result as the loop. The only caveat is that if an ad has no rows for one of the outcomes (e.g. it was never clicked, or it was always clicked), then that (ad_id, clicked) key won't show up in the result. There's probably a parameter I can set to return zero counts. Will need to investigate more.

In [7]:
# from timeit import default_timer as timer
# # could add here a normalizer value to account for ads with very low and very high occurances.

# # VERY SLOW... c'mon we can make this better.... took 20 sec

# #gives me a dict where the key is the ad_id and the value is a tuple with information about its click ratio
# unique_ads = temp_train['ad_id'].unique().tolist()
# ad_count_dict = {}
# i=0
# ct = len(unique_ads)

# start = timer()

# for ad in unique_ads:
# #     temp_tuple = () #3 elements for clicked, not clicked, and %
    
#     # KYLE: Next line just keeps the part of the table that we want
#     # Maybe it is faster??
#     temp = temp_train_copy.loc[temp_train_copy['ad_id'] == ad]
    
#     appeared = temp.shape[0]
#     clicked = temp.loc[temp_train_copy['clicked'] == 1].shape[0]
#     #not_clicked = temp.loc[temp_train_copy['clicked'] == 0].shape[0]
#     percent_clicked = float(clicked)/appeared
    
#     temp_tuple = (percent_clicked, appeared, clicked, appeared-clicked)
    
    
#     ad_count_dict[ad] = temp_tuple
    
#     if ((i % 100) == 0):
#         elapsed = (timer() - start) / 3600.0
#         rate = elapsed / (i+1)
#         print('Ad #{} of {}, Time left = {:,.2f} hours'.format(i+1, ct, (ct - i)*rate))
        
#     i += 1
    
# ad_count_dict
    

### Train on full train file

In [2]:
# Load the full click log; the cells below read its display_id, ad_id and
# clicked columns.
train = pd.read_csv("data/clicks_train.csv")

NameError: name 'pd' is not defined

In [1]:
# Preview the first rows of the full training frame.
train.head()

NameError: name 'train' is not defined

In [None]:
# {(ad_id, clicked): row count} for the full training set.
# unstack(fill_value=0) / stack() adds an explicit zero count for ads that
# have no rows for one of the outcomes (never clicked, or always clicked), so
# looking up either key for a known ad no longer raises KeyError.
ad_count_dictNew = (
    train
    .groupby(['ad_id', 'clicked'])['display_id'].count()
    .unstack(fill_value=0)
    .stack()
    .to_dict()
)


In [None]:
# from timeit import default_timer as timer

# unique_ads = train['ad_id'].unique().tolist()
# ad_count_dict = {}
# i = 0
# ct = len(unique_ads)

# start = timer()

# for ad in unique_ads:
# #     temp_tuple = () #3 elements for clicked, not clicked, and %
    
#     temp = train.loc[train['ad_id'] == ad]
    
#     appeared = temp.shape[0]
    
#     not_clicked = temp.groupby('clicked').count().values[0,0]
    
#     clicked = appeared - not_clicked
    
#     percent_clicked = float(clicked)/appeared
    
#     temp_tuple = (percent_clicked, appeared, clicked, not_clicked)
    
    
#     ad_count_dict[ad] = temp_tuple
    
#     if ((i % 1000) == 0):
#         elapsed = (timer() - start) / 3600.0
#         rate = elapsed / (i+1)
#         print('Ad #{} of {}, Time left = {:,.2f} hours'.format(i+1, ct, (ct - i)*rate))
        
#     i += 1
    


In [None]:
# step 1
# Per-ad click ratio (mean of 'clicked') computed directly from the full
# training set. The original looked up ad_count_dict[e][0], which only works
# with the tuple-valued dict built by the commented-out loop and fails against
# the dict-of-dicts / (ad_id, clicked)-keyed dicts built by the live cells.
ad_click_ratio = train.groupby('ad_id')['clicked'].mean().to_dict()

# step 2
# Map every display_id to its ad_ids with one groupby pass instead of one
# boolean scan of the whole frame per display. This also avoids a loop
# variable named `display`, which shadowed IPython's display() function.
display_list = train['display_id'].unique().tolist()
ads_by_display = (
    train
    .groupby('display_id', sort=False)['ad_id']  # sort=False: keep first-seen order
    .apply(lambda ads: ads.tolist())
)
display_dict = dict(zip(ads_by_display.index.tolist(), ads_by_display.tolist()))


# step 3
# Rank each display's ads by click ratio, highest first (ties: larger ad_id
# first), and store (display_id, "ad ad ad ...") under that display_id.
display_dict_sorted = {}
for display_id in display_dict:
    ranked = sorted(
        display_dict[display_id],
        key=lambda ad: (ad_click_ratio[ad], ad),
        reverse=True,
    )
    display_dict_sorted[display_id] = display_id, ' '.join(map(str, ranked))



#step 4
# dict --> DF --> CSV submission file
submit_df = pd.DataFrame.from_dict(display_dict_sorted, orient='index')
submit_df.columns = ['display_id', 'ad_id']
submit_df.set_index(['display_id'], inplace=True)
submit_df.to_csv('csv_submissions/prelim_model_v1.csv')