# Predicting Click
### Understanding cross-feature correlations

Ryan Koch, Sam Kahr, Julia Kang - AMLI 2019

### References
 - CTR: https://towardsdatascience.com/mobile-ads-click-through-rate-ctr-prediction-44fdac40c6ff

## Import Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import psutil
import random
import datetime as datetime
import matplotlib.pyplot as plt

In [2]:
# read in the Avazu - criteo labs - csv file
# rand_sample_csv is a randomized subset (1% the size) of the sample_csv which is ~400k instances 

df = pd.read_csv('rand_sample_eng.csv')


# Data Exploration

In [3]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,C15,C16,C17,C18,C19,C20,C21,new_date,new_time,day_of_week
0,0,10004510652136496837,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,...,320,50,2333,0,39,-1,157,2014-10-21,00:00:00,1
1,1,10007164336863914220,1,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,320,50,1722,0,35,-1,79,2014-10-21,00:00:00,1
2,2,10076859283156800622,0,14102100,1002,0,f17ebd97,c4e18dd6,50e219e0,ecad2386,...,216,36,2497,3,43,100151,42,2014-10-21,00:00:00,1
3,3,10078825124049580646,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,320,50,1722,0,35,-1,79,2014-10-21,00:00:00,1
4,4,10085233430943183912,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,...,320,50,1722,0,35,-1,79,2014-10-21,00:00:00,1


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21,day_of_week
count,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0,404410.0
mean,202204.5,9.213896e+18,0.169991,14102560.0,1004.967372,0.288638,1.014695,0.331688,18844.936193,318.795856,60.076581,2112.733881,1.431426,227.89114,53239.462209,83.304955,2.601259
std,116743.255518,5.319411e+18,0.375625,296.8127,1.090207,0.504033,0.523959,0.855877,4947.526554,20.667022,47.023691,607.929983,1.325359,351.686105,49955.39177,70.251537,1.727362
min,0.0,73068130000000.0,0.0,14102100.0,1001.0,0.0,0.0,0.0,375.0,120.0,20.0,112.0,0.0,33.0,-1.0,1.0,0.0
25%,101102.25,4.60724e+18,0.0,14102300.0,1005.0,0.0,1.0,0.0,16920.0,320.0,50.0,1863.0,0.0,35.0,-1.0,23.0,1.0
50%,202204.5,9.218443e+18,0.0,14102600.0,1005.0,0.0,1.0,0.0,20346.0,320.0,50.0,2323.0,2.0,39.0,100048.0,61.0,2.0
75%,303306.75,1.3822e+19,0.0,14102810.0,1005.0,1.0,1.0,0.0,21893.0,320.0,50.0,2526.0,3.0,171.0,100086.0,101.0,4.0
max,404409.0,1.844673e+19,1.0,14103020.0,1012.0,7.0,5.0,5.0,24043.0,1024.0,1024.0,2757.0,3.0,1839.0,100248.0,255.0,6.0


In [5]:
df.shape # this sample has 404,410 rows of data with 26 columns

(404410, 28)

In [6]:
df.dtypes
# avazu: "all integer features are categorical variables, all IDs, no numerical meaning"

Unnamed: 0           int64
id                  uint64
click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
new_date            object
new_time            object
day_of_week          int64
dtype: object

In [7]:
# what do the columns mean?

 - id: ad identifier
 - click: 0/1 for non-click/click
 - hour: format is YYMMDDHH
 - C1 — anonymized categorical variable
 - banner_pos
 - site_id
 - site_domain
 - site_category
 - app_id
 - app_domain
 - app_category
 - device_id
 - device_ip
 - device_model
 - device_type
 - device_conn_type
 - C14-C21 — anonymized categorical variables

In [8]:
# what are the range of unique values of each column
for col in df.columns.values:
    print( "{}".format(len(df[col].unique())))

404410
404410
2
240
7
7
2195
2172
21
2305
153
26
64913
262453
4369
5
4
2067
8
9
415
4
65
159
60
10
24
7


In [9]:
# lets look at an individual user

df[df.id == 13447361190641805430]

Unnamed: 0.1,Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,...,C15,C16,C17,C18,C19,C20,C21,new_date,new_time,day_of_week
259,259,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,ecad2386,...,320,50,1800,3,167,100075,23,2014-10-21,00:00:00,1


# Data Preprocessing

In [10]:
#  unnamed column are columns that are created when a dataframe is converted to a csv. 
# 'Unnamed: 0', 'Unnamed: 0.1' are row indexes which were tansposed into columns.
df_new = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1) 

KeyError: "['Unnamed: 0.1'] not found in axis"

In [None]:
df_new.shape

In [None]:
# how many unique values are in each col?

for col in df_new.columns.values:
    print(str(col)+ ": " +  "{}".format(len(df_new[col].unique())) )
    

In [None]:
# check for missing values

df_new.isnull().sum()


In [None]:
# summed list of each column for df_new, looking for inconsistencies

for col in df_new.columns.values:
    total = len(df_new[col].unique())
    print(str(col) + " " + "total: " + str(total))

In [None]:
# tried writing an algo to check for erroneous, inconsistent spelling or abbreviations, formatting issues (e.g., odd/unexpected characters or punctuation)
# couldn't work one out... moving on

Each instance of a column are hashed values of an original ID.
Hashing was done to anonymize the services contributing ad data to this dataset.
For illustrative/descriptive purposes we will treat each hashed value as names or in other fictional contexts (e.g.,'7801e8d9' = 'www.overstock.com'). (Thank you Naomi!)

# Feature Engineering
## Hour & Date 

In [None]:
# check hour column data type
df_new.hour.dtype

In [None]:
# separate the date and time
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')
df_new['new_hour'] = df_new.hour.astype(str).apply(parse_date)
df_new['new_hour']

In [None]:
# check if column 'new_hour' was created and parsed to string
df_new.head(3)

In [None]:
#confirm dtype of new_hour
df_new.new_hour.dtype

In [None]:
# create new_date & new_time columns from parsed new_hour column
df_new['date'] = [d.date() for d in df_new['new_hour']]
df_new['time'] = [d.time() for d in df_new['new_hour']]

In [None]:
#check if columns were established properly
df_new.head(3)

In [None]:
df_new.dtypes

In [None]:
# drop redundant cols
df_tmp = df_new.drop(['new_hour', 'hour'], axis=1)

In [None]:
df_tmp.head(2)

In [None]:
# sannity check of summed columns for unique vals

for col in df_tmp.columns.values:
    total = len(df_tmp[col].unique())
    val = df_tmp[col].unique()
    print(str(col) + " " + "total: " + str(total))

In [None]:
# iterate through columns and print the unique values of each column
for col in df_tmp.columns.values:
    val = df_tmp[col].unique()
    print(str(col) + " " + ", val: " + str(val))

In [None]:
# use device_type as practice. There are 5 unique vals -- smaller number is easier to work with
df_tmp.device_type.nunique()

In [None]:
# store df_tmp.device_type as var for ease of re-use
dvc_type = df_tmp.device_type

In [None]:
# check instance of dvc_type
dvc_type[0]

In [None]:
# val counts gives me the count of each unique values 
dvc_type.value_counts()

In [None]:
# make var to hold col 'names' based off unique values stored as a list
col_names = df_tmp['device_type'].unique().tolist()
col_names

In [None]:
# check it
df_tmp.head(3)

In [None]:
df_tmp[col_names] = pd.get_dummies(df_tmp['device_type'])
df_tmp[col_names].describe()

In [None]:
# 2 unique numbers for column 1, this means it is either 1 or not 1?
df_tmp[col_names][1].nunique()

In [None]:
# check for missing values -- there are none, good.

df_tmp.isnull().sum()

In [None]:
# ya final rows are columns stratified by device type
df_tmp.shape

In [None]:
# confirm successful selection of last 5 cols -- device types
df_tmp[df_tmp.columns[-5:]].head(3)

In [None]:
# rename device_type col name for readability / understanding
df_dvtype = df_tmp
#df_dvtype.columns = ['a', 'b']
df_dvtype

# Original features

 - Target feature : click
 - Site features : site_id, site_domain, site_category
 - App feature: app_id, app_domain, app_category
 - Device feature: device_id, device_ip, device_model, device_type, device_conn_type
 - Anonymized categorical features: C14-C21

# New Features

 - Target feature : click
 - Site features : site_id, site_domain, site_category
 - App feature: app_id, app_domain, app_category
 - Device feature: device_id, device_ip, device_model, device_type, device_conn_type
 - Anonymized categorical features: C14-C21