# Click-Through Rate Prediction
## Random Forest

Ryan Koch, Sam Kahr, Julia Kang - AMLI 2019

References: https://towardsdatascience.com/mobile-ads-click-through-rate-ctr-prediction-44fdac40c6ff

## Import Libraries and Data

In [2]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import psutil
import random
import datetime as datetime
import matplotlib.pyplot as plt

In [46]:
# Load the Avazu (criteo labs) click log sample.
# rand_sample_csv is a random 1% subset of sample_csv, which itself holds ~400k rows.
csv_path = '/home/data/rand_sample_csv.csv'
df = pd.read_csv(csv_path)



# Data Exploration

In [4]:
# Preview the first five rows of the raw sample (head() defaults to n=5).
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,0,236,13120299559997056165,0,14102100,1005,0,543a539e,c7ca3108,3e814130,...,1,0,20362,320,50,2333,0,39,-1,157
1,1,259,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,...,1,0,19950,320,50,1800,3,167,100075,23
2,2,357,14758321504714974000,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,...,1,0,19743,320,50,2264,3,427,100000,61
3,3,455,16246356889796608406,0,14102100,1005,0,5b08c53b,7687a86e,3e814130,...,1,0,17654,300,250,1994,2,39,100084,33
4,4,540,17569568361412548369,0,14102100,1005,0,543a539e,c7ca3108,3e814130,...,1,0,20362,320,50,2333,0,39,-1,157


In [5]:
# Summary statistics for the numeric columns only (object columns are omitted).
# NOTE: most integer columns here are categorical IDs, so their mean/std carry
# little meaning; the mean of `click` is the one directly interpretable figure.
df.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,click,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
count,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0,4184.0
mean,2091.5,202351.664675,9.113168e+18,0.170172,14102560.0,1004.977533,0.283222,1.012189,0.331501,18728.379302,318.803059,60.868069,2100.065488,1.412285,229.095841,53381.884082,84.231836
std,1207.961092,116384.632429,5.389524e+18,0.375829,296.3423,1.032567,0.489278,0.488054,0.856303,5058.361185,22.743031,48.729395,621.145767,1.324642,358.883751,49952.289351,70.673532
min,0.0,236.0,1513211000000000.0,0.0,14102100.0,1002.0,0.0,0.0,0.0,375.0,216.0,36.0,112.0,0.0,33.0,-1.0,13.0
25%,1045.75,99768.5,4.449284e+18,0.0,14102300.0,1005.0,0.0,1.0,0.0,16687.0,320.0,50.0,1800.0,0.0,35.0,-1.0,23.0
50%,2091.5,204607.5,9.042769e+18,0.0,14102600.0,1005.0,0.0,1.0,0.0,20312.0,320.0,50.0,2314.0,2.0,39.0,100053.5,61.0
75%,3137.25,301481.25,1.381808e+19,0.0,14102810.0,1005.0,1.0,1.0,0.0,21893.0,320.0,50.0,2526.0,3.0,169.0,100084.75,111.0
max,4183.0,404396.0,1.844673e+19,1.0,14103020.0,1012.0,7.0,5.0,5.0,24041.0,728.0,480.0,2756.0,3.0,1839.0,100248.0,255.0


In [47]:
# `click` is a binary label: 0 = no click, 1 = click.
# Its mean of 0.17 is therefore the fraction of rows that were clicked, i.e. the overall click-through rate of this sample (~17%).

In [7]:
# (rows, columns) of the raw sample: 4,184 rows of data with 26 columns.
df.shape

(4184, 26)

In [48]:
# Column dtypes of the raw sample.
# Avazu's data description: "all integer features are categorical variables they are all IDs and have no numerical meaning",
# so the int64 columns listed here are category codes rather than quantities.
df.dtypes

Unnamed: 0           int64
Unnamed: 0.1         int64
id                  uint64
click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
dtype: object

In [9]:
# what do the columns mean?

 - id: ad identifier
 - click: 0/1 for non-click/click
 - hour: format is YYMMDDHH
 - C1 — anonymized categorical variable
 - banner_pos
 - site_id
 - site_domain
 - site_category
 - app_id
 - app_domain
 - app_category
 - device_id
 - device_ip
 - device_model
 - device_type
 - device_conn_type
 - C14-C21 — anonymized categorical variables

In [10]:

# Print the number of distinct values per column, one count per line,
# in the same order as df.columns (NaN, if present, counts as a value).
for column_name in df.columns:
    distinct_count = df[column_name].nunique(dropna=False)
    print(distinct_count)

4184
4184
4184
2
240
5
5
354
292
14
244
32
14
660
3908
871
4
4
675
4
5
279
4
59
105
54


In [17]:
# Look up a single record by its `id`.
# `id` is the ad identifier and is unique per row in this sample
# (4,184 distinct ids for 4,184 rows), so this returns exactly one row
# rather than the history of a user.
df.loc[df['id'] == 13447361190641805430]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
1,1,259,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,...,1,0,19950,320,50,1800,3,167,100075,23


# Questions of the data

In [12]:
# At which times do people click ads most frequently?
# Where are most clicks coming from?
# Which unique correlations, from any or all combinations of columns, relate to click?
# What correlations exist between seemingly unrelated columns (banner_pos and device_type, etc.)?
# How are banner_pos, device_type, and device_conn_type encoded?

# Data Processing

In [13]:
# "Unnamed" columns are created when a DataFrame is written to CSV together with its index.
# 'Unnamed: 0' and 'Unnamed: 0.1' are leftover row indexes that were saved as columns,
# so they carry no feature information and are removed here.
leftover_index_columns = ['Unnamed: 0', 'Unnamed: 0.1']
df_new = df.drop(columns=leftover_index_columns)

In [14]:
# Confirm the leftover index columns are gone (head() defaults to n=5).
df_new.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,13120299559997056165,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157
1,13447361190641805430,0,14102100,1005,1,17caea14,0dde25ec,f028772b,ecad2386,7801e8d9,...,1,0,19950,320,50,1800,3,167,100075,23
2,14758321504714974000,0,14102100,1005,0,85f751fd,c4e18dd6,50e219e0,febd1138,82e27996,...,1,0,19743,320,50,2264,3,427,100000,61
3,16246356889796608406,0,14102100,1005,0,5b08c53b,7687a86e,3e814130,ecad2386,7801e8d9,...,1,0,17654,300,250,1994,2,39,100084,33
4,17569568361412548369,0,14102100,1005,0,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,...,1,0,20362,320,50,2333,0,39,-1,157


In [15]:
# Same 4,184 rows as before; 24 columns remain after dropping the two index columns from the original 26.
df_new.shape

(4184, 24)

In [45]:
# Number of distinct values in each column, printed as "<column>: <count>".
for column_name in df_new.columns:
    distinct_count = df_new[column_name].nunique(dropna=False)
    print("{}: {}".format(column_name, distinct_count))
    

id: 4184
click: 2
hour: 240
C1: 5
banner_pos: 5
site_id: 354
site_domain: 292
site_category: 14
app_id: 244
app_domain: 32
app_category: 14
device_id: 660
device_ip: 3908
device_model: 871
device_type: 4
device_conn_type: 4
C14: 675
C15: 4
C16: 5
C17: 279
C18: 4
C19: 59
C20: 105
C21: 54


In [26]:
# Column dtypes after dropping the index columns: object columns hold string values,
# the remaining columns are integers.
df_new.dtypes

id                  uint64
click                int64
hour                 int64
C1                   int64
banner_pos           int64
site_id             object
site_domain         object
site_category       object
app_id              object
app_domain          object
app_category        object
device_id           object
device_ip           object
device_model        object
device_type          int64
device_conn_type     int64
C14                  int64
C15                  int64
C16                  int64
C17                  int64
C18                  int64
C19                  int64
C20                  int64
C21                  int64
dtype: object

In [36]:
# Count missing values per column; a column with 0 has no NaN/None entries.
missing_per_column = df_new.isna().sum()
missing_per_column


id                  0
click               0
hour                0
C1                  0
banner_pos          0
site_id             0
site_domain         0
site_category       0
app_id              0
app_domain          0
app_category        0
device_id           0
device_ip           0
device_model        0
device_type         0
device_conn_type    0
C14                 0
C15                 0
C16                 0
C17                 0
C18                 0
C19                 0
C20                 0
C21                 0
dtype: int64

In [34]:
# No apparent erroneous or inconsistent spellings or abbreviations.
# No apparent formatting issues (e.g., odd/unexpected characters or punctuation).

In [37]:
# TODO: it is unclear what the values of features 5-12 represent.