In [20]:
import pandas as pd
import datetime as dt
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array

<div style="color:black;font-family: Arial; font-size:1.1em;line-height:65%">
<h1 style="font-family: Arial; font-size:1.5em;color:#2462C0">Step 1: Attribute Selection</h1>
</div>

In [21]:
adclicksDF = pd.read_csv('./flamingo-data/ad-clicks.csv', infer_datetime_format=True)
adclicksDF = adclicksDF.rename(columns=lambda x: x.strip()) #remove whitespaces from headers
adclicksDF.timestamp = pd.to_datetime(adclicksDF.timestamp)
adclicksDF['ad_hour'] = adclicksDF.timestamp.dt.hour
type(adclicksDF['timestamp'].iloc[0])

pandas.tslib.Timestamp

In [22]:
adclicksDF.head(n=5)

Unnamed: 0,timestamp,txId,userSessionId,teamId,userId,adId,adCategory,ad_hour
0,2016-05-26 15:13:22,5974,5809,27,611,2,electronics,15
1,2016-05-26 15:17:24,5976,5705,18,1874,21,movies,15
2,2016-05-26 15:22:52,5978,5791,53,2139,25,computers,15
3,2016-05-26 15:22:57,5973,5756,63,212,10,fashion,15
4,2016-05-26 15:22:58,5980,5920,9,1027,20,clothing,15


In [23]:
adclicksDF['adCount'] = 1

In [24]:
buyclicksDF = pd.read_csv('./flamingo-data/buy-clicks.csv')
buyclicksDF = buyclicksDF.rename(columns=lambda x: x.strip()) #removes whitespaces from headers
buyclicksDF.timestamp = pd.to_datetime(buyclicksDF.timestamp)
buyclicksDF['buy_hour'] = buyclicksDF.timestamp.dt.hour

In [25]:
buyclicksDF.head(n=5)

Unnamed: 0,timestamp,txId,userSessionId,team,userId,buyId,price,buy_hour
0,2016-05-26 15:36:54,6004,5820,9,1300,2,3.0,15
1,2016-05-26 15:36:54,6005,5775,35,868,4,10.0,15
2,2016-05-26 15:36:54,6006,5679,97,819,5,20.0,15
3,2016-05-26 16:36:54,6067,5665,18,121,2,3.0,16
4,2016-05-26 17:06:54,6093,5709,11,2222,5,20.0,17


In [26]:
hitclicksDF = pd.read_csv('./flamingo-data/game-clicks.csv')
hitclicksDF = hitclicksDF.rename(columns=lambda x: x.strip())
hitclicksDF.timestamp = pd.to_datetime(hitclicksDF.timestamp)
hitclicksDF['hit_hour'] = hitclicksDF.timestamp.dt.hour

In [27]:
hitclicksDF.head(n=5)

Unnamed: 0,timestamp,clickId,userId,userSessionId,isHit,teamId,teamLevel,hit_hour
0,2016-05-26 15:06:55,105,1038,5916,0,25,1,15
1,2016-05-26 15:07:09,154,1099,5898,0,44,1,15
2,2016-05-26 15:07:14,229,899,5757,0,71,1,15
3,2016-05-26 15:07:14,322,2197,5854,0,99,1,15
4,2016-05-26 15:07:20,22,1362,5739,0,13,1,15


<h1 style="font-family: Arial; font-size:1.5em;color:#2462C0">Feature Selection</h1>

In [46]:
userTimePurchases = buyclicksDF[['userId', 'price', 'buy_hour']] #select only userid and price
userTimePurchases.head(n=5)
buyPerUser = userTimePurchases.groupby('userId').agg({'price':'sum', 'buy_hour':'mean'}).reset_index()
buyPerUser.head(n=2)

Unnamed: 0,userId,buy_hour,price
0,1,6.333333,21.0
1,8,12.8,53.0


In [47]:
userTimeadClicks = adclicksDF[['userId', 'adCount', 'ad_hour']]
adsPerUser = userTimeadClicks.groupby('userId').agg({'adCount':'sum', 'ad_hour':'mean'}).reset_index()
adsPerUser.head(n=2)

Unnamed: 0,userId,adCount,ad_hour
0,1,44,12.727273
1,8,10,11.7


In [48]:
userTimeHits = hitclicksDF[['userId', 'isHit', 'hit_hour']]
hitsPerUser = userTimeHits.groupby('userId').agg({'isHit':'mean', 'hit_hour':'mean'}).reset_index()
hitsPerUser.head(n=2)

Unnamed: 0,userId,hit_hour,isHit
0,0,11.552768,0.105535
1,1,11.627095,0.134078


## Merge

In [49]:
adBuyDF = adsPerUser.merge(buyPerUser, on='userId') #userId, adCount, price

In [50]:
adBuyHitDF = adBuyDF.merge(hitsPerUser, on='userId') #userId, adCount, price, ishit, hours

In [56]:
adBuyHitDF.head(n=10)

Unnamed: 0,userId,adCount,ad_hour,buy_hour,price,hit_hour,isHit
0,1,44,12.727273,6.333333,21.0,11.627095,0.134078
1,8,10,11.7,12.8,53.0,10.955263,0.1
2,9,37,10.27027,11.5,80.0,11.326772,0.122047
3,10,19,10.947368,12.1,11.0,11.765047,0.10943
4,12,46,12.76087,13.230769,215.0,11.579545,0.130682
5,13,16,13.4375,12.333333,20.0,11.217484,0.102345
6,15,15,10.733333,6.0,2.0,11.144444,0.111111
7,21,52,10.788462,10.0,19.0,11.395623,0.097643
8,26,9,8.888889,17.0,2.0,11.159091,0.103147
9,27,10,10.9,10.5,4.0,11.69,0.07


<h1 style="font-family: Arial; font-size:1.5em;color:#2462C0">Create the final training dataset</h1>

In [52]:
trainingDF = adBuyHitDF[['adCount','ad_hour','price', 'buy_hour', 'isHit', 'hit_hour']]
trainingDF.shape

(543, 6)

In [53]:
sqlContext = SQLContext(sc)
pDF = sqlContext.createDataFrame(trainingDF)
# ['adCount','ad_hour','price', 'buy_hour', 'isHit', 'hit_hour'
parsedData = pDF.rdd.map(lambda line: array([line[0], line[1], line[2], line[3], line[4], line[5]])) 

<h1 style="font-family: Arial; font-size:1.5em;color:#2462C0">Train KMeans model</h1>

In [54]:
my_kmmodel = KMeans.train(parsedData, 3, maxIterations=10, runs=10, initializationMode="random")

  "Support for runs is deprecated in 1.6.0. This param will have no effect in 1.7.0.")


In [55]:
print(my_kmmodel.centers)

[array([ 34.13740458,  11.81431833,  64.29007634,  10.8114075 ,
         0.11948593,  11.47731656]), array([ 26.08539945,  11.56683137,  16.47933884,  11.35156097,
         0.11021882,  11.36533368]), array([  4.10000000e+01,   1.17641183e+01,   1.42102041e+02,
         1.13862755e+01,   1.27581546e-01,   1.15061366e+01])]


In [57]:
my_kmmodel.centers

[array([ 34.13740458,  11.81431833,  64.29007634,  10.8114075 ,
          0.11948593,  11.47731656]),
 array([ 26.08539945,  11.56683137,  16.47933884,  11.35156097,
          0.11021882,  11.36533368]),
 array([  4.10000000e+01,   1.17641183e+01,   1.42102041e+02,
          1.13862755e+01,   1.27581546e-01,   1.15061366e+01])]

In [60]:
trainingDF.describe()

Unnamed: 0,adCount,ad_hour,price,buy_hour,isHit,hit_hour
count,543.0,543.0,543.0,543.0,543.0,543.0
mean,29.373849,11.644341,39.349908,11.22438,0.114021,11.405056
std,15.216343,1.751375,41.221737,3.899722,0.017304,0.624851
min,1.0,3.25,1.0,0.0,0.044534,4.882353
25%,16.0,10.630952,10.0,9.0,0.103741,11.160032
50%,30.0,11.658537,25.0,11.0,0.114234,11.456701
75%,42.0,12.675821,55.0,13.333333,0.124165,11.701136
max,67.0,22.0,223.0,23.0,0.176471,13.370968
