# Generating predictions for click through rate (pCTR) via logistic regression

In [1]:
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
from config import *

In [2]:
campaigns

['1458', '2259', '2261', '2821', '2997', '3358', '3386', '3427', '3476']

In [119]:
camp = campaigns[0]

In [120]:
inTrainName = inPath + camp + "/" +  "train.yzx.txt"
outTrainPath = outPath + camp + "/train/"

inTestName = inPath + camp + "/" + "test.yzx.txt"
outTestPath = outPath + camp + "/test/"

In [121]:
def log(msg):
  if isLogging:
    print(msg)

In [122]:
isLogging = True

### Loading Data

Data format:

click(0,1)  winning_price(int)  features(featindex1:1 featindex2:1 featindex3:1 ...)

In [123]:
def loadData(filename):
  log("Loading data...")
  with open(filename) as f:
    lines = f.readlines()
    
  log("Reformatting data...")
  m = len(lines)
  clicks = np.zeros((m,), dtype=int)
  prices = np.zeros((m,), dtype=int)
  features = []
  for i, line in enumerate(lines):
    line = line.split(" ")
    clicks[i] = line[0]
    prices[i] = line[1]
    features.append({int(k):int(v) for k,v in (x.split(":") for x in line[2:])})

  return clicks, prices, features

In [124]:
%%time
d_train = loadData(inTrainName)
clicks_train, prices_train, features_train = d_train

Loading data...
Reformatting data...
CPU times: user 58.1 s, sys: 9.53 s, total: 1min 7s
Wall time: 1min 13s


In [125]:
%%time
d_test = loadData(inTestName)
clicks_test, prices_test, features_test = d_test

Loading data...
Reformatting data...
CPU times: user 11.8 s, sys: 1.71 s, total: 13.5 s
Wall time: 14.1 s


In [126]:
m_train = len(clicks_train)
m_test = len(clicks_test)

nFeatures_train = max(max(i) for i in (feature for feature in features_train)) + 1
nFeatures_test = max(max(i) for i in (feature for feature in features_train)) + 1
nFeatures = max(nFeatures_train, nFeatures_test)

In [127]:
print(m_train)
print(m_test)
print(nFeatures)
print(clicks_train[0])
print(prices_train[0])
print(features_train[0])

3083056
614638
560870
0
51
{0: 1, 27: 1, 28: 1, 29: 1, 30: 1, 31: 1, 32: 1, 33: 1, 34: 1, 35: 1, 36: 1, 37: 1, 38: 1, 39: 1, 40: 1, 41: 1, 42: 1, 43: 1, 44: 1}


In [128]:
#save data
#Save data
np.save(outTrainPath + "features", features_train)
np.save(outTrainPath + "clicks", clicks_train)
np.save(outTrainPath + "prices", prices_train)

np.save(outTestPath + "features", features_test)
np.save(outTestPath + "clicks", clicks_test)
np.save(outTestPath + "prices", prices_test)

### Training logistic regression model on train data
Input: features

Output: P(click)

In [129]:
def getSparseMatrix(features, m, nFeatures):
  mat = sparse.dok_matrix((m, nFeatures), dtype=np.int8)
  
  for i in range(m):
    mat[i, [key for key in features[i]]] = 1

  return mat

In [130]:
%%time
phi_train = getSparseMatrix(features_train, m_train, nFeatures)

CPU times: user 6min 16s, sys: 2min 59s, total: 9min 15s
Wall time: 9min 51s


In [131]:
%%time
phi_test = getSparseMatrix(features_test, m_test, nFeatures)

CPU times: user 1min 16s, sys: 34.3 s, total: 1min 51s
Wall time: 1min 58s


In [132]:
%%time
LR = LogisticRegression(penalty="l2", max_iter=1000)
LR.fit(phi_train, clicks_train)

CPU times: user 2min 36s, sys: 21.1 s, total: 2min 58s
Wall time: 2min 35s


### Using model to evaluate predicted click through rate (pCTR) on train and test datasets

In [133]:
%%time
acc_train = LR.score(phi_train, clicks_train)
acc_test = LR.score(phi_test, clicks_test)

CPU times: user 25.8 s, sys: 19.1 s, total: 44.8 s
Wall time: 48.7 s


In [134]:
print(acc_train)
print(acc_test)

0.999562122777
0.9994696065


In [135]:
%%time
pred_train = LR.predict_proba(phi_train)[:,1]
pred_test = LR.predict_proba(phi_test)[:,1]

CPU times: user 26.6 s, sys: 20.4 s, total: 47 s
Wall time: 51.7 s


In [136]:
print(clicks_test)
print(prices_test)
print(pred_test)

[0 0 0 ..., 0 0 0]
[118  66 110 ...,  70  81  70]
[ 0.00010886  0.00010525  0.00015965 ...,  0.00020132  0.00024685
  0.00077289]


In [137]:
print(roc_auc_score(clicks_train, pred_train))
print(roc_auc_score(clicks_test, pred_test))

0.987304871071
0.985915156714


In [138]:
#Save data
np.save(outTrainPath + "pCTR", pred_train)

np.save(outTestPath + "pCTR", pred_test)