# Predicting Click
### Understanding cross-feature correlations

Ryan Koch, Sam Kahr, Julia Kang - AMLI 2019

### References
 - CTR: https://towardsdatascience.com/mobile-ads-click-through-rate-ctr-prediction-44fdac40c6ff

## Import Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import psutil
import random
import datetime as datetime
import matplotlib.pyplot as plt

In [3]:
# read in the Avazu - criteo labs - csv file
# rand_sample_csv is a randomized subset (1% the size) of the sample_csv which is ~400k instances 

df = pd.read_csv('rand_sample_eng.csv')


# Data Exploration

In [4]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,C15,C16,C17,C18,C19,C20,C21,new_date,new_time,day_of_week
0,0,10004510652136496837,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,...,320,50,2333,0,39,-1,157,2014-10-21,00:00:00,1
1,1,10007164336863914220,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,320,50,1722,0,35,-1,79,2014-10-21,00:00:00,1
2,2,10076859283156800622,0,14102100,1002,0,f17ebd97,c4e18dd6,50e219e0,ecad2386,...,216,36,2497,3,43,100151,42,2014-10-21,00:00:00,1
3,3,10078825124049580646,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,320,50,1722,0,35,-1,79,2014-10-21,00:00:00,1
4,4,10085233430943183912,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,320,50,1722,0,35,-1,79,2014-10-21,00:00:00,1


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,day_of_week
count,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0
mean,202204.5,9.213896e+18,0.169991,14102560.0,1004.967372,0.288638,1.014695,0.331688,18844.936193,318.795856,60.076581,2112.733881,1.431426,227.89114,53239.462209,83.304955,2.601259
std,116743.255518,5.319411e+18,0.375625,296.8127,1.090207,0.504033,0.523959,0.855877,4947.526554,20.667022,47.023691,607.929983,1.325359,351.686105,49955.39177,70.251537,1.727362
min,0.0,73068130000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,1.0,0.0
25%,101102.25,4.60724e+18,0.0,14102300.0,1005.0,0.0,1.0,0.0,16920.0,320.0,50.0,1863.0,0.0,35.0,-1.0,23.0,1.0
50%,202204.5,9.218443e+18,0.0,14102600.0,1005.0,0.0,1.0,0.0,20346.0,320.0,50.0,2323.0,2.0,39.0,100048.0,61.0,2.0
75%,303306.75,1.3822e+19,0.0,14102810.0,1005.0,1.0,1.0,0.0,21893.0,320.0,50.0,2526.0,3.0,171.0,100086.0,101.0,4.0
max,404409.0,1.844673e+19,1.0,14103020.0,1012.0,7.0,5.0,5.0,24043.0,1024.0,1024.0,2757.0,3.0,1839.0,100248.0,255.0,6.0


In [6]:
df.shape # this sample has 404,410 rows of data with 26 columns

(404410, 28)

In [7]:
df.dtypes
# avazu: "all integer features are categorical variables, all IDs, no numerical meaning"

Unnamed: 0           int64
id                  uint64
click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
new_date            object
new_time            object
day_of_week          int64
dtype: object

In [8]:
# what do the columns mean?

 - id: ad identifier
 - click: 0/1 for non-click/click
 - hour: format is YYMMDDHH
 - C1 — anonymized categorical variable
 - banner_pos
 - site_id
 - site_domain
 - site_category
 - app_id
 - app_domain
 - app_category
 - device_id
 - device_ip
 - device_model
 - device_type
 - device_conn_type
 - C14-C21 — anonymized categorical variables

In [9]:
# what are the range of unique values of each column
for col in df.columns.values:
    print( "{}".format(len(df[col].unique())))

404410
404410
2
240
7
7
2195
2172
21
2305
153
26
64913
262453
4369
5
4
2067
8
9
415
4
65
159
60
10
24
7


In [10]:
# lets look at an individual user

df[df.id == 13447361190641805430]

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,C15,C16,C17,C18,C19,C20,C21,new_date,new_time,day_of_week
259,259,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,ecad2386,...,320,50,1800,3,167,100075,23,2014-10-21,00:00:00,1


# Data Preprocessing

In [13]:
#  unnamed column are columns that are created when a dataframe is converted to a csv. 
# 'Unnamed: 0', 'Unnamed: 0.1' are row indexes which were tansposed into columns.
df_new = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1) 

KeyError: "['Unnamed: 0.1'] not found in axis"

In [18]:
df_new.shape

(4184, 24)

In [19]:
# how many unique values are in each col?

for col in df_new.columns.values:
    print(str(col)+ ": " +  "{}".format(len(df_new[col].unique())) )
    

id: 4184
click: 2
hour: 240
C1: 5
banner_pos: 5
site_id: 354
site_domain: 292
site_category: 14
app_id: 244
app_domain: 32
app_category: 14
device_id: 660
device_ip: 3908
device_model: 871
device_type: 4
device_conn_type: 4
C14: 675
C15: 4
C16: 5
C17: 279
C18: 4
C19: 59
C20: 105
C21: 54


In [20]:
# check for missing values

df_new.isnull().sum()


id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

In [21]:
# summed list of each column for df_new, looking for inconsistencies

for col in df_new.columns.values:
    total = len(df_new[col].unique())
    print(str(col) + " " + "total: " + str(total))

id total: 4184
click total: 2
hour total: 240
C1 total: 5
banner_pos total: 5
site_id total: 354
site_domain total: 292
site_category total: 14
app_id total: 244
app_domain total: 32
app_category total: 14
device_id total: 660
device_ip total: 3908
device_model total: 871
device_type total: 4
device_conn_type total: 4
C14 total: 675
C15 total: 4
C16 total: 5
C17 total: 279
C18 total: 4
C19 total: 59
C20 total: 105
C21 total: 54


In [22]:
# tried writing an algo to check for erroneous, inconsistent spelling or abbreviations, formatting issues (e.g., odd/unexpected characters or punctuation)
# couldn't work one out... moving on

Each instance of a column are hashed values of an original ID.
Hashing was done to anonymize the services contributing ad data to this dataset.
For illustrative/descriptive purposes we will treat each hashed value as names or in other fictional contexts (e.g.,'7801e8d9' = 'www.overstock.com'). (Thank you Naomi!)

# Feature Engineering
## Hour & Date 

In [23]:
# check hour column data type
df_new.hour.dtype

dtype('int64')

In [24]:
# separate the date and time
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')
df_new['new_hour'] = df_new.hour.astype(str).apply(parse_date)
df_new['new_hour']

0      2014-10-21 00:00:00
1      2014-10-21 00:00:00
2      2014-10-21 00:00:00
3      2014-10-21 00:00:00
4      2014-10-21 00:00:00
5      2014-10-21 00:00:00
6      2014-10-21 00:00:00
7      2014-10-21 00:00:00
8      2014-10-21 00:00:00
9      2014-10-21 00:00:00
10     2014-10-21 00:00:00
11     2014-10-21 01:00:00
12     2014-10-21 01:00:00
13     2014-10-21 01:00:00
14     2014-10-21 01:00:00
15     2014-10-21 01:00:00
16     2014-10-21 01:00:00
17     2014-10-21 01:00:00
18     2014-10-21 01:00:00
19     2014-10-21 01:00:00
20     2014-10-21 01:00:00
21     2014-10-21 01:00:00
22     2014-10-21 01:00:00
23     2014-10-21 01:00:00
24     2014-10-21 01:00:00
25     2014-10-21 01:00:00
26     2014-10-21 01:00:00
27     2014-10-21 01:00:00
28     2014-10-21 01:00:00
29     2014-10-21 01:00:00
               ...        
4154   2014-10-30 20:00:00
4155   2014-10-30 20:00:00
4156   2014-10-30 21:00:00
4157   2014-10-30 21:00:00
4158   2014-10-30 21:00:00
4159   2014-10-30 21:00:00
4

In [25]:
# check if column 'new_hour' was created and parsed to string
df_new.head(3)

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,new_hour
0,13120299559997056165,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,0,20362,320,50,2333,0,39,-1,157,2014-10-21
1,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,ecad2386,7801e8d9,...,0,19950,320,50,1800,3,167,100075,23,2014-10-21
2,14758321504714974000,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,febd1138,82e27996,...,0,19743,320,50,2264,3,427,100000,61,2014-10-21


In [26]:
#confirm dtype of new_hour
df_new.new_hour.dtype

dtype('<M8[ns]')

In [27]:
# create new_date & new_time columns from parsed new_hour column
df_new['date'] = [d.date() for d in df_new['new_hour']]
df_new['time'] = [d.time() for d in df_new['new_hour']]

In [28]:
#check if columns were established properly
df_new.head(3)

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,C15,C16,C17,C18,C19,C20,C21,new_hour,date,time
0,13120299559997056165,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,320,50,2333,0,39,-1,157,2014-10-21,2014-10-21,00:00:00
1,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,ecad2386,7801e8d9,...,320,50,1800,3,167,100075,23,2014-10-21,2014-10-21,00:00:00
2,14758321504714974000,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,febd1138,82e27996,...,320,50,2264,3,427,100000,61,2014-10-21,2014-10-21,00:00:00


In [29]:
df_new.dtypes

id                          uint64
click                        int64
hour                         int64
C1                           int64
banner_pos                   int64
site_id                     object
site_domain                 object
site_category               object
app_id                      object
app_domain                  object
app_category                object
device_id                   object
device_ip                   object
device_model                object
device_type                  int64
device_conn_type             int64
C14                          int64
C15                          int64
C16                          int64
C17                          int64
C18                          int64
C19                          int64
C20                          int64
C21                          int64
new_hour            datetime64[ns]
date                        object
time                        object
dtype: object

In [30]:
# drop redundant cols
df_tmp = df_new.drop(['new_hour', 'hour'], axis=1)

In [31]:
df_tmp.head(2)

Unnamed: 0,id,click,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,app_category,...,C14,C15,C16,C17,C18,C19,C20,C21,date,time
0,13120299559997056165,0,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,07d7df22,...,20362,320,50,2333,0,39,-1,157,2014-10-21,00:00:00
1,13447361190641805430,0,1005,1,17caea14,0dde25ec,f028772b,ecad2386,7801e8d9,07d7df22,...,19950,320,50,1800,3,167,100075,23,2014-10-21,00:00:00


In [110]:
# sannity check of summed columns for unique vals

for col in df_tmp.columns.values:
    total = len(df_tmp[col].unique())
    val = df_tmp[col].unique()
    print(str(col) + " " + "total: " + str(total))

id total: 404410
click total: 2
C1 total: 7
banner_pos total: 7
site_id total: 2195
site_domain total: 2172
site_category total: 21
app_id total: 2305
app_domain total: 153
app_category total: 26
device_id total: 64913
device_ip total: 262453
device_model total: 4369
device_type total: 5
device_conn_type total: 4
C14 total: 2067
C15 total: 8
C16 total: 9
C17 total: 415
C18 total: 4
C19 total: 65
C20 total: 159
C21 total: 60
new_date total: 10
new_time total: 24
day_of_week total: 7
date total: 10
time total: 24


In [101]:
# iterate through columns and print the unique values of each column
for col in df_tmp.columns.values:
    val = df_tmp[col].unique()
    print(str(col) + " " + ", val: " + str(val))

id , val: [10004510652136496837 10007164336863914220 10076859283156800622 ...
  9930625418032326788  9953588061726377330  9959058523366506236]
click , val: [0 1]
C1 , val: [1005 1002 1010 1007 1008 1012 1001]
banner_pos , val: [0 1 2 5 7 4 3]
site_id , val: ['543a539e' '1fbe01fe' 'f17ebd97' ... '9fd919ea' '1b72ccd8' '5a51436e']
site_domain , val: ['c7ca3108' 'f3845767' 'c4e18dd6' ... '0da06afc' '3e87e1c9' '645c06d3']
site_category , val: ['3e814130' '28905ebd' '50e219e0' '76b2941d' 'f028772b' 'f66779e6'
 '0569f928' '335d28a8' '72722551' '75fa27f6' 'c0dd3be3' 'a818d37a'
 '8fd0aea4' '70fb0e29' 'dedf689d' 'e787de0e' '5378d028' 'bcf865d9'
 '42a36e14' '9ccfa2ea' 'c706e647']
app_id , val: ['ecad2386' '1779deee' 'febd1138' ... '96f19b66' '5717fe5d' '404b2054']
app_domain , val: ['7801e8d9' '2347f47a' '82e27996' '45a51db4' '5c5a694b' 'afdf1f54'
 'aefc06bd' 'ae637522' 'd9b5648e' '828da833' '5b9c592b' '0654b444'
 '885c7f3f' 'b8d325c3' 'b5f3b24a' 'ad63ec9b' '33da2e74' '43cf4f06'
 '15ec7f39' '18eb

In [102]:
# use device_type as practice. There are 5 unique vals -- smaller number is easier to work with
df_tmp.device_type.nunique()

5

In [103]:
# store df_tmp.device_type as var for ease of re-use
dvc_type = df_tmp.device_type

In [104]:
# check instance of dvc_type
dvc_type[0]

1

In [105]:
# val counts gives me the count of each unique values 
dvc_type.value_counts()

1    373412
0     22074
4      7676
5      1247
2         1
Name: device_type, dtype: int64

In [112]:
# make var to hold col 'names' based off unique values stored as a list
col_names = df_tmp['device_type'].unique().tolist()
col_names

[1, 0, 4, 5, 2]

In [116]:
# check it
df_tmp.head(3)

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,C20,C21,new_date,new_time,day_of_week,1,0,4,5,2
0,0,10004510652136496837,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,...,-1,157,2014-10-21,00:00:00,1,0,1,0,0,0
1,1,10007164336863914220,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,-1,79,2014-10-21,00:00:00,1,0,1,0,0,0
2,2,10076859283156800622,0,14102100,1002,0,f17ebd97,c4e18dd6,50e219e0,ecad2386,...,100151,42,2014-10-21,00:00:00,1,1,0,0,0,0


In [117]:
df_tmp[col_names] = pd.get_dummies(df_tmp['device_type'])
df_tmp[col_names].describe()

Unnamed: 0,1,0,4,5,2
count,404410.0,404410.0,404410.0,404410.0,404410.0
mean,0.054583,0.92335,2e-06,0.018981,0.003084
std,0.227165,0.266036,0.001572,0.136457,0.055444
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0


In [120]:
# 2 unique numbers for column 1, this means it is either 1 or not 1?
df_tmp[col_names][1].nunique()

2

In [122]:
# check for missing values -- there are none, good.

df_tmp.isnull().sum()

id                  0
click               0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
new_date            0
new_time            0
day_of_week         0
date                0
time                0
1                   0
0                   0
4                   0
5                   0
2                   0
dtype: int64

In [123]:
# ya final rows are columns stratified by device type
df_tmp.shape

(404410, 33)

# Original features

 - Target feature : click
 - Site features : site_id, site_domain, site_category
 - App feature: app_id, app_domain, app_category
 - Device feature: device_id, device_ip, device_model, device_type, device_conn_type
 - Anonymized categorical features: C14-C21

# New Features

 - Target feature : click
 - Site features : site_id, site_domain, site_category
 - App feature: app_id, app_domain, app_category
 - Device feature: device_id, device_ip, device_model, device_type, device_conn_type
 - Anonymized categorical features: C14-C21