# Google Analytics Challenge!!

In [27]:
import ast
import json
from itertools import chain

import numpy as np
import pandas as pd
import pylab as pl
import sklearn
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor

## Utility functions

In [2]:
def ProcDict(total, keyList=['bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue', 'visits'],defValue=0,defFormat=float):
    """Pull selected fields out of one serialized-dictionary cell.

    Parameters
    ----------
    total : str or dict
        Text of a dictionary, either JSON (``{"hits": "3"}``) or a Python
        literal (``{'hits': '3'}``). An already-parsed dict is used as is.
    keyList : list of str
        Keys to extract, in the order they should appear in the result.
        The default list is never mutated.
    defValue : object
        Value emitted for a key that is absent from the dictionary.
    defFormat : callable
        Converter applied to every value that is present.

    Returns
    -------
    list
        One entry per key in ``keyList``: ``defFormat(value)`` when the key
        is present, otherwise ``defValue``.

    Raises
    ------
    ValueError, SyntaxError
        If ``total`` is neither valid JSON nor a Python literal.
    """
    if isinstance(total, dict):
        tdict=total
    else:
        # The cell text is parsed, never executed: eval() would run arbitrary
        # code from the data file and chokes on JSON's true/false/null.
        try:
            tdict=json.loads(total)
        except ValueError:
            # Not JSON (e.g. single-quoted keys): accept Python literals only.
            tdict=ast.literal_eval(total)
    return [defFormat(tdict[key]) if key in tdict else defValue for key in keyList]

## Loading in data

In [42]:
nTrain=500000
nTest=10000
SEED=42  # fixed so the same visitors land in train/test on every run

# Loading data; fullVisitorId is read as text so the identifiers are not parsed as numbers
fdf=pd.read_csv("train.csv",dtype={'fullVisitorId':str})

# Selecting a random subset of users for train and test.
# Sampling is done on visitor ids (not rows), so all sessions of one visitor
# end up on the same side of the split.
allIds=fdf.fullVisitorId.unique()
if nTrain+nTest>len(allIds):
    raise ValueError("nTrain+nTest=%d exceeds the %d distinct visitors in the file"%(nTrain+nTest,len(allIds)))
ids=np.random.RandomState(SEED).choice(allIds,nTrain+nTest,replace=False)
trainids=set(ids[:nTrain])
testids=set(ids[nTrain:])
# Keep only the sampled visitors; the index is reset to a clean 0..n-1 range.
df=fdf[fdf.fullVisitorId.isin(ids)].reset_index(drop=True)

In [57]:
# Getting the different keytypes
#print(set(chain(*list([eval(tmp).keys() for tmp in df.geoNetwork]))))

#true=True
#print(set(chain(*list([eval(tmp).keys() for tmp in df.trafficSource.head(3)]))))

## Feature Engineering

In [46]:
# Expand the serialized `totals` column into one float column per key;
# a key missing from a row falls back to ProcDict's default value of 0.
keysTotal=['bounces', 'hits', 'newVisits', 'pageviews', 'transactionRevenue', 'visits']
totalRows=[ProcDict(rawTotals,keysTotal) for rawTotals in df.totals]
totals=pd.DataFrame(totalRows,columns=keysTotal)

# Same treatment for `geoNetwork`, keeping only the continent as a string
# ('' when the key is absent).
keysGeo=['continent']
geoRows=[ProcDict(rawGeo,keysGeo,'',str) for rawGeo in df.geoNetwork]
continents=pd.DataFrame(geoRows,columns=keysGeo)

# Attach the new columns next to the existing ones (concat with axis=1
# matches rows by index label).
df=pd.concat([df,totals,continents],axis=1)

In [48]:
# 0/1 indicator column for each continent used as a feature; rows from any
# other continent get 0 in all three.
for indicatorName,continentLabel in (('america','Americas'),('europe','Europe'),('asia','Asia')):
    df[indicatorName]=(df.continent==continentLabel).apply(int)

In [49]:
#df.head()
# Session rows belonging to the sampled train visitors / test visitors.
dftrain=df.loc[df.fullVisitorId.isin(trainids)]
dftest=df.loc[df.fullVisitorId.isin(testids)]

In [50]:
# One row per train visitor: features are the per-visitor means of the
# session-level columns.
trainByVisitor=dftrain.groupby('fullVisitorId')
xtrain=trainByVisitor[['bounces','hits','pageviews','visits','america','europe','asia']].mean()

# Regression target: log(1 + total revenue of the visitor).
trans=trainByVisitor['transactionRevenue'].sum()
yytrain=np.log(1+trans)
# Classification target: 1 when the visitor generated any revenue, else 0.
ytrain=(yytrain>0).astype(int)

In [51]:
# One row per test visitor: features are the per-visitor means of the
# session-level columns.
testByVisitor=dftest.groupby('fullVisitorId')
xtest=testByVisitor[['bounces','hits','pageviews','visits','america','europe','asia']].mean()

# Regression target: log(1 + total revenue of the visitor).
trans=testByVisitor['transactionRevenue'].sum()
yytest=np.log(1+trans)
# Classification target: 1 when the visitor generated any revenue, else 0.
ytest=(yytest>0).astype(int)

## Classification

In [52]:
# Classifier for "did this visitor generate any revenue".
# random_state is pinned so the fitted forest, and every metric computed from
# it, is the same on each Restart & Run All.
rf=RandomForestClassifier(n_estimators=200,random_state=0).fit(xtrain,ytrain)

In [53]:
# Predict once and reuse the result: scoring 200 trees over the whole test set
# three times only repeats identical work.
ytestPred=rf.predict(xtest)
print(sklearn.metrics.accuracy_score(ytest,ytestPred))
print(sklearn.metrics.precision_score(ytest,ytestPred))
print(sklearn.metrics.recall_score(ytest,ytestPred))

0.9867
0.5526315789473685
0.2978723404255319


## Regression

In [54]:
# Regressor for the per-visitor log-revenue target.
# random_state is pinned so the reported error is reproducible across runs.
rrf=RandomForestRegressor(n_estimators=300,random_state=0).fit(xtrain,yytrain)

In [55]:
# Root-mean-squared error of the regressor on the test visitors' log-revenue.
testMse=sklearn.metrics.mean_squared_error(yytest,rrf.predict(xtest))
np.sqrt(testMse)

1.742262609702321

In [56]:
# Baseline for the RMSE above: the standard deviation of the test target
# (np.std defaults to ddof=0), i.e. the RMSE obtained by always predicting
# the test-set mean.
np.std(yytest)

2.1006991483207598

In [16]:
#list(zip(xtrain.columns,rf.feature_importances_))

[('bounces', 0.0842248901972804),
 ('hits', 0.4057101509077649),
 ('pageviews', 0.44886436712937305),
 ('visits', 0.0),
 ('america', 0.03746501759339347),
 ('europe', 0.01299042106894805),
 ('asia', 0.010745153103240085)]