# Generating predictions for click through rate (pCTR) via logistic regression

In [51]:
from util import *
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np

In [52]:
#Setting up variables
inPath = "../../make-ipinyou-data/"
outPath = "../data/"
campaigns = ["1458", "2261", "2997", "3386", "3476", "2259", "2821", "3358", "3427"]

In [4]:
camp = campaigns[0]

In [5]:
inTrainName = inPath + camp + "/" +  "train.yzx.txt"
outTrainPath = outPath + camp + "/" + "train.theta.txt"

inTestName = inPath + camp + "/" + "test.yzx.txt"
outTestPath = outPath + camp + "/" + "test.theta.txt"

In [6]:
def log(msg):
  if isLogging:
    print(msg)

In [7]:
isLogging = True

### Loading Data

Data format:

click(0,1)  winning_price(int)  features(featindex1:1 featindex2:1 featindex3:1 ...)

In [8]:
def loadData(filename):
  log("Loading data...")
  with open(filename) as f:
    lines = f.readlines()
    
  log("Reformatting data...")
  m = len(lines)
  clicks = np.zeros((m,), dtype=int)
  prices = np.zeros((m,), dtype=int)
  features = []
  for i, line in enumerate(lines):
    line = line.split(" ")
    clicks[i] = line[0]
    prices[i] = line[1]
    features.append({int(k):int(v) for k,v in (x.split(":") for x in line[2:])})

  return clicks, prices, features

In [9]:
%%time
d_train = loadData(inTrainName)
clicks_train, prices_train, features_train = d_train

Loading data...
Reformatting data...
CPU times: user 55 s, sys: 1.8 s, total: 56.8 s
Wall time: 56.9 s


In [11]:
%%time
d_test = loadData(inTestName)
clicks_test, prices_test, features_test = d_test

Loading data...
Reformatting data...
CPU times: user 11.2 s, sys: 369 ms, total: 11.6 s
Wall time: 11.6 s


In [24]:
m_train = len(clicks_train)
m_test = len(clicks_test)

nFeatures_train = max(max(i) for i in (feature for feature in features_train)) + 1
nFeatures_test = max(max(i) for i in (feature for feature in features_train)) + 1
nFeatures = max(nFeatures_train, nFeatures_test)

In [25]:
print(m_train)
print(m_test)
print(nFeatures)
print(clicks_train[0])
print(prices_train[0])
print(features_train[0])

3083056
614638
560870
0
51
{0: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1}


### Training logistic regression model on train data
Input: features

Output: P(click)

In [14]:
def getSparseMatrix(features, m, nFeatures):
  mat = sparse.dok_matrix((m, nFeatures), dtype=np.int8)
  
  for i in range(m):
    mat[i, [key for key in features[i]]] = 1

  return mat

In [18]:
%%time
phi_train = getSparseMatrix(features_train, m_train, nFeatures)

CPU times: user 5min 38s, sys: 37.2 s, total: 6min 15s
Wall time: 6min 21s


In [26]:
%%time
phi_test = getSparseMatrix(features_test, m_test, nFeatures)

CPU times: user 1min 11s, sys: 19.2 s, total: 1min 30s
Wall time: 1min 34s


In [27]:
%%time
LR = LogisticRegression()
LR.fit(phi_train, clicks_train)

CPU times: user 3min 29s, sys: 15.3 s, total: 3min 44s
Wall time: 2min 3s


### Using model to evaluate predicted click through rate (pCTR) on train and test datasets

In [30]:
%%time
acc_train = LR.score(phi_train, clicks_train)
acc_test = LR.score(phi_test, clicks_test)

CPU times: user 23.6 s, sys: 12.7 s, total: 36.3 s
Wall time: 38.8 s


In [31]:
print(acc_train)
print(acc_test)

0.999562122777
0.9994696065


In [47]:
def getPDF(probabilities):
  return probabilities / np.sum(probabilities)

In [48]:
%%time
pred_test = LR.predict_proba(phi_test)[:,1]
pCTR_test = getPDF(pred_test)

CPU times: user 3.87 s, sys: 158 ms, total: 4.03 s
Wall time: 4.03 s


In [49]:
print(clicks_test)
print(prices_test)
print(pCTR_test)
print(pred_test)

[0 0 0 ..., 0 0 0]
[118  66 110 ...,  70  81  70]
[  1.54217054e-07   1.49105821e-07   2.26184434e-07 ...,   2.85218279e-07
   3.49716300e-07   1.09496832e-06]
[ 0.00010886  0.00010525  0.00015965 ...,  0.00020132  0.00024685
  0.00077289]


In [53]:
print(roc_auc_score(clicks_test, pred_test))

0.985915156714
