In [1]:
import gc
import re
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from livelossplot import PlotLossesKeras
from scipy.stats import norm, probplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, LSTM, CuDNNLSTM, Dropout, Activation, Bidirectional, TimeDistributed

Using TensorFlow backend.


In [2]:
IN_TRAIN = 'in/train-wrangled.csv'
IN_TEST = 'in/test-wrangled.csv'

LABEL = 'totals.transactionRevenue'

In [3]:
df = pd.read_csv(IN_TRAIN, dtype={'fullVisitorId': 'str'}, low_memory=False)
df.head()

Unnamed: 0,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.isMobile,totals.bounces,totals.hits,totals.newVisits,...,geoNetwork.subContinent Western Asia,geoNetwork.subContinent Western Europe,trafficSource.source (direct),trafficSource.source Other,trafficSource.source Partners,trafficSource.source analytics.google.com,trafficSource.source google,trafficSource.source mall.googleplex.com,trafficSource.source youtube.com,totals.transactionRevenue
0,0.0101,1131660440785968503,1131660440785968503_1472830385,0.088405,0.0,0.088405,0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,
1,0.0101,377306020877927890,377306020877927890_1472880147,0.089979,0.0,0.089979,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
2,0.0101,3895546263509774583,3895546263509774583_1472865386,0.089512,0.0,0.089512,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
3,0.0101,4763447161404445595,4763447161404445595_1472881213,0.090012,0.0,0.090012,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
4,0.0101,27294437909732085,27294437909732085_1472822600,0.088159,0.002538,0.088159,0,0.0,0.0,,...,0,0,0,0,0,0,1,0,0,


In [4]:
num_features = df.shape[1]

def gen_Xy(data):
    data = np.nan_to_num(data)
    num_features = data.shape[1]
    X = data[:, :num_features - 1]  # remove the last col (the label - remember we moved it to be the last col)
    y = data[:, num_features - 1].sum()
    return (X, np.log1p(y))

samples = []
time_sorted_df = df.sort_values(['date', 'visitStartTime'])
visitor_grouped_df = time_sorted_df.groupby('fullVisitorId', axis=0, sort=False)
for visitor_id, visitor_group in visitor_grouped_df:
    pruned_group = visitor_group.drop(['fullVisitorId', 'sessionId'], axis=1)
    sample = gen_Xy(pruned_group.values)
    samples.append(sample)
    
samples[0]

(array([[0.        , 0.        , 0.00507614, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0

In [5]:
train_set, test_set = train_test_split(samples, test_size=.2, random_state=1, shuffle=False)

train_set = sorted(train_set, key=lambda x: x[0].shape[0])
test_set = sorted(test_set, key=lambda x: x[0].shape[0])

num_features = samples[0][0].shape[1]
num_features

157

In [6]:
# We need to manually calculate the batches, since we are using variable length
# All the items in one batch have the same length
def calc_num_batches(data, batch_size):
    current_sequence_length = 1
    num_batches = 0
    current_batch_size = 0
    for i in range(len(data)):
        sequence_length = data[i][0].shape[0]
        if sequence_length > current_sequence_length:
            current_sequence_length = sequence_length
            current_batch_size = 0  # a new batch starts here, since the sequence length changes
        if current_batch_size % batch_size == 0:
            num_batches += 1
        current_batch_size += 1
    return num_batches
    
def batch_iter(data, batch_size):
    num_batches_per_epoch = calc_num_batches(data, batch_size)

    def data_generator():
        data_size = len(data)
        while True:
            current_sequence_length = 1
            index = 0
            while True:
                start_index = index
                end_index = min(start_index + batch_size, data_size)
                X = []
                y = []

                for i in range(start_index, end_index):
                    sequence_length = data[i][0].shape[0]
                    # one batch should have all examples of the same length
                    # when we reach a greater sequence length, we stop the iteration and return the batch as is
                    if sequence_length > current_sequence_length:
                        current_sequence_length = sequence_length
                        break
                    X.append(data[i][0])
                    y.append(1 if data[i][1] else 0)
                    index += 1

                # stack arrays to create 3d numpy arrays
                X_ndarr = np.dstack(X)
                y_ndarr = np.dstack(y)
                # move axes in the order required by the model
                X_ndarr = np.moveaxis(X_ndarr, 2, 0)
                y_ndarr = y_ndarr.reshape(y_ndarr.shape[2], y_ndarr.shape[0])
                yield X_ndarr, y_ndarr

                if index == data_size:
                    break

    return data_generator(), num_batches_per_epoch

In [7]:
batch_size = 128
train_batches, train_steps = batch_iter(train_set, batch_size)

# Just a simple validation of batch shapes to make sure our generator is fine
# The second dimension of the shape should be monotonically increasing up until the end
i = 0
for train_batch in train_batches:
    print(i, train_batch[0].shape)
    i += 1
    if i == train_steps:
        break

0 (128, 1, 157)
1 (128, 1, 157)
2 (128, 1, 157)
3 (128, 1, 157)
4 (128, 1, 157)
5 (128, 1, 157)
6 (128, 1, 157)
7 (128, 1, 157)
8 (128, 1, 157)
9 (128, 1, 157)
10 (128, 1, 157)
11 (128, 1, 157)
12 (128, 1, 157)
13 (128, 1, 157)
14 (128, 1, 157)
15 (128, 1, 157)
16 (128, 1, 157)
17 (128, 1, 157)
18 (128, 1, 157)
19 (128, 1, 157)
20 (128, 1, 157)
21 (128, 1, 157)
22 (128, 1, 157)
23 (128, 1, 157)
24 (128, 1, 157)
25 (128, 1, 157)
26 (128, 1, 157)
27 (128, 1, 157)
28 (128, 1, 157)
29 (128, 1, 157)
30 (128, 1, 157)
31 (128, 1, 157)
32 (128, 1, 157)
33 (128, 1, 157)
34 (128, 1, 157)
35 (128, 1, 157)
36 (128, 1, 157)
37 (128, 1, 157)
38 (128, 1, 157)
39 (128, 1, 157)
40 (128, 1, 157)
41 (128, 1, 157)
42 (128, 1, 157)
43 (128, 1, 157)
44 (128, 1, 157)
45 (128, 1, 157)
46 (128, 1, 157)
47 (128, 1, 157)
48 (128, 1, 157)
49 (128, 1, 157)
50 (128, 1, 157)
51 (128, 1, 157)
52 (128, 1, 157)
53 (128, 1, 157)
54 (128, 1, 157)
55 (128, 1, 157)
56 (128, 1, 157)
57 (128, 1, 157)
58 (128, 1, 157)
59 (128

568 (128, 1, 157)
569 (128, 1, 157)
570 (128, 1, 157)
571 (128, 1, 157)
572 (128, 1, 157)
573 (128, 1, 157)
574 (128, 1, 157)
575 (128, 1, 157)
576 (128, 1, 157)
577 (128, 1, 157)
578 (128, 1, 157)
579 (128, 1, 157)
580 (128, 1, 157)
581 (128, 1, 157)
582 (128, 1, 157)
583 (128, 1, 157)
584 (128, 1, 157)
585 (128, 1, 157)
586 (128, 1, 157)
587 (128, 1, 157)
588 (128, 1, 157)
589 (128, 1, 157)
590 (128, 1, 157)
591 (128, 1, 157)
592 (128, 1, 157)
593 (128, 1, 157)
594 (128, 1, 157)
595 (128, 1, 157)
596 (128, 1, 157)
597 (128, 1, 157)
598 (128, 1, 157)
599 (128, 1, 157)
600 (128, 1, 157)
601 (128, 1, 157)
602 (128, 1, 157)
603 (128, 1, 157)
604 (128, 1, 157)
605 (128, 1, 157)
606 (128, 1, 157)
607 (128, 1, 157)
608 (128, 1, 157)
609 (128, 1, 157)
610 (128, 1, 157)
611 (128, 1, 157)
612 (128, 1, 157)
613 (128, 1, 157)
614 (128, 1, 157)
615 (128, 1, 157)
616 (128, 1, 157)
617 (128, 1, 157)
618 (128, 1, 157)
619 (128, 1, 157)
620 (128, 1, 157)
621 (128, 1, 157)
622 (128, 1, 157)
623 (128, 

1143 (128, 1, 157)
1144 (128, 1, 157)
1145 (128, 1, 157)
1146 (128, 1, 157)
1147 (128, 1, 157)
1148 (128, 1, 157)
1149 (128, 1, 157)
1150 (128, 1, 157)
1151 (128, 1, 157)
1152 (128, 1, 157)
1153 (128, 1, 157)
1154 (128, 1, 157)
1155 (128, 1, 157)
1156 (128, 1, 157)
1157 (128, 1, 157)
1158 (128, 1, 157)
1159 (128, 1, 157)
1160 (128, 1, 157)
1161 (128, 1, 157)
1162 (128, 1, 157)
1163 (128, 1, 157)
1164 (128, 1, 157)
1165 (128, 1, 157)
1166 (128, 1, 157)
1167 (128, 1, 157)
1168 (128, 1, 157)
1169 (128, 1, 157)
1170 (128, 1, 157)
1171 (128, 1, 157)
1172 (128, 1, 157)
1173 (128, 1, 157)
1174 (128, 1, 157)
1175 (128, 1, 157)
1176 (128, 1, 157)
1177 (128, 1, 157)
1178 (128, 1, 157)
1179 (128, 1, 157)
1180 (128, 1, 157)
1181 (128, 1, 157)
1182 (128, 1, 157)
1183 (128, 1, 157)
1184 (128, 1, 157)
1185 (128, 1, 157)
1186 (128, 1, 157)
1187 (128, 1, 157)
1188 (128, 1, 157)
1189 (128, 1, 157)
1190 (128, 1, 157)
1191 (128, 1, 157)
1192 (128, 1, 157)
1193 (128, 1, 157)
1194 (128, 1, 157)
1195 (128, 1

1732 (128, 1, 157)
1733 (128, 1, 157)
1734 (128, 1, 157)
1735 (128, 1, 157)
1736 (128, 1, 157)
1737 (128, 1, 157)
1738 (128, 1, 157)
1739 (128, 1, 157)
1740 (128, 1, 157)
1741 (128, 1, 157)
1742 (128, 1, 157)
1743 (128, 1, 157)
1744 (128, 1, 157)
1745 (128, 1, 157)
1746 (128, 1, 157)
1747 (128, 1, 157)
1748 (128, 1, 157)
1749 (128, 1, 157)
1750 (128, 1, 157)
1751 (128, 1, 157)
1752 (128, 1, 157)
1753 (128, 1, 157)
1754 (128, 1, 157)
1755 (128, 1, 157)
1756 (128, 1, 157)
1757 (128, 1, 157)
1758 (128, 1, 157)
1759 (128, 1, 157)
1760 (128, 1, 157)
1761 (128, 1, 157)
1762 (128, 1, 157)
1763 (128, 1, 157)
1764 (128, 1, 157)
1765 (128, 1, 157)
1766 (128, 1, 157)
1767 (128, 1, 157)
1768 (128, 1, 157)
1769 (128, 1, 157)
1770 (128, 1, 157)
1771 (128, 1, 157)
1772 (128, 1, 157)
1773 (128, 1, 157)
1774 (128, 1, 157)
1775 (128, 1, 157)
1776 (128, 1, 157)
1777 (128, 1, 157)
1778 (128, 1, 157)
1779 (128, 1, 157)
1780 (128, 1, 157)
1781 (128, 1, 157)
1782 (128, 1, 157)
1783 (128, 1, 157)
1784 (128, 1

2326 (128, 1, 157)
2327 (128, 1, 157)
2328 (128, 1, 157)
2329 (128, 1, 157)
2330 (128, 1, 157)
2331 (128, 1, 157)
2332 (128, 1, 157)
2333 (128, 1, 157)
2334 (128, 1, 157)
2335 (128, 1, 157)
2336 (128, 1, 157)
2337 (128, 1, 157)
2338 (128, 1, 157)
2339 (128, 1, 157)
2340 (128, 1, 157)
2341 (128, 1, 157)
2342 (128, 1, 157)
2343 (128, 1, 157)
2344 (128, 1, 157)
2345 (128, 1, 157)
2346 (128, 1, 157)
2347 (128, 1, 157)
2348 (128, 1, 157)
2349 (128, 1, 157)
2350 (128, 1, 157)
2351 (128, 1, 157)
2352 (128, 1, 157)
2353 (128, 1, 157)
2354 (128, 1, 157)
2355 (128, 1, 157)
2356 (128, 1, 157)
2357 (128, 1, 157)
2358 (128, 1, 157)
2359 (128, 1, 157)
2360 (128, 1, 157)
2361 (128, 1, 157)
2362 (128, 1, 157)
2363 (128, 1, 157)
2364 (128, 1, 157)
2365 (128, 1, 157)
2366 (128, 1, 157)
2367 (128, 1, 157)
2368 (128, 1, 157)
2369 (128, 1, 157)
2370 (128, 1, 157)
2371 (128, 1, 157)
2372 (128, 1, 157)
2373 (128, 1, 157)
2374 (128, 1, 157)
2375 (128, 1, 157)
2376 (128, 1, 157)
2377 (128, 1, 157)
2378 (128, 1

2916 (128, 1, 157)
2917 (128, 1, 157)
2918 (128, 1, 157)
2919 (128, 1, 157)
2920 (128, 1, 157)
2921 (128, 1, 157)
2922 (128, 1, 157)
2923 (128, 1, 157)
2924 (128, 1, 157)
2925 (128, 1, 157)
2926 (128, 1, 157)
2927 (128, 1, 157)
2928 (128, 1, 157)
2929 (128, 1, 157)
2930 (128, 1, 157)
2931 (128, 1, 157)
2932 (128, 1, 157)
2933 (128, 1, 157)
2934 (128, 1, 157)
2935 (128, 1, 157)
2936 (128, 1, 157)
2937 (128, 1, 157)
2938 (128, 1, 157)
2939 (128, 1, 157)
2940 (128, 1, 157)
2941 (128, 1, 157)
2942 (128, 1, 157)
2943 (128, 1, 157)
2944 (128, 1, 157)
2945 (128, 1, 157)
2946 (128, 1, 157)
2947 (128, 1, 157)
2948 (128, 1, 157)
2949 (128, 1, 157)
2950 (128, 1, 157)
2951 (128, 1, 157)
2952 (128, 1, 157)
2953 (128, 1, 157)
2954 (128, 1, 157)
2955 (128, 1, 157)
2956 (128, 1, 157)
2957 (128, 1, 157)
2958 (128, 1, 157)
2959 (128, 1, 157)
2960 (128, 1, 157)
2961 (128, 1, 157)
2962 (128, 1, 157)
2963 (128, 1, 157)
2964 (128, 1, 157)
2965 (128, 1, 157)
2966 (128, 1, 157)
2967 (128, 1, 157)
2968 (128, 1

3507 (128, 1, 157)
3508 (128, 1, 157)
3509 (128, 1, 157)
3510 (128, 1, 157)
3511 (128, 1, 157)
3512 (128, 1, 157)
3513 (128, 1, 157)
3514 (128, 1, 157)
3515 (128, 1, 157)
3516 (128, 1, 157)
3517 (128, 1, 157)
3518 (128, 1, 157)
3519 (128, 1, 157)
3520 (128, 1, 157)
3521 (128, 1, 157)
3522 (128, 1, 157)
3523 (128, 1, 157)
3524 (128, 1, 157)
3525 (128, 1, 157)
3526 (128, 1, 157)
3527 (128, 1, 157)
3528 (128, 1, 157)
3529 (128, 1, 157)
3530 (128, 1, 157)
3531 (128, 1, 157)
3532 (128, 1, 157)
3533 (128, 1, 157)
3534 (128, 1, 157)
3535 (128, 1, 157)
3536 (128, 1, 157)
3537 (128, 1, 157)
3538 (128, 1, 157)
3539 (128, 1, 157)
3540 (128, 1, 157)
3541 (128, 1, 157)
3542 (128, 1, 157)
3543 (128, 1, 157)
3544 (128, 1, 157)
3545 (128, 1, 157)
3546 (128, 1, 157)
3547 (128, 1, 157)
3548 (128, 1, 157)
3549 (128, 1, 157)
3550 (128, 1, 157)
3551 (128, 1, 157)
3552 (128, 1, 157)
3553 (128, 1, 157)
3554 (128, 1, 157)
3555 (128, 1, 157)
3556 (128, 1, 157)
3557 (128, 1, 157)
3558 (128, 1, 157)
3559 (128, 1

4076 (128, 2, 157)
4077 (128, 2, 157)
4078 (128, 2, 157)
4079 (128, 2, 157)
4080 (128, 2, 157)
4081 (128, 2, 157)
4082 (128, 2, 157)
4083 (128, 2, 157)
4084 (128, 2, 157)
4085 (128, 2, 157)
4086 (128, 2, 157)
4087 (128, 2, 157)
4088 (128, 2, 157)
4089 (128, 2, 157)
4090 (128, 2, 157)
4091 (128, 2, 157)
4092 (128, 2, 157)
4093 (128, 2, 157)
4094 (128, 2, 157)
4095 (128, 2, 157)
4096 (128, 2, 157)
4097 (128, 2, 157)
4098 (128, 2, 157)
4099 (128, 2, 157)
4100 (128, 2, 157)
4101 (128, 2, 157)
4102 (128, 2, 157)
4103 (128, 2, 157)
4104 (128, 2, 157)
4105 (128, 2, 157)
4106 (128, 2, 157)
4107 (128, 2, 157)
4108 (128, 2, 157)
4109 (128, 2, 157)
4110 (128, 2, 157)
4111 (128, 2, 157)
4112 (128, 2, 157)
4113 (128, 2, 157)
4114 (128, 2, 157)
4115 (128, 2, 157)
4116 (128, 2, 157)
4117 (128, 2, 157)
4118 (128, 2, 157)
4119 (128, 2, 157)
4120 (128, 2, 157)
4121 (128, 2, 157)
4122 (128, 2, 157)
4123 (128, 2, 157)
4124 (128, 2, 157)
4125 (128, 2, 157)
4126 (128, 2, 157)
4127 (128, 2, 157)
4128 (128, 2

In [16]:
def build_model(neurons=128, activ_func='relu', dropout=.3, loss='mean_squared_error', optimizer='adam'):
    model = Sequential()

    model.add(CuDNNLSTM(neurons, return_sequences=True, input_shape=(None, num_features)))
    model.add(Dropout(dropout))
    model.add(CuDNNLSTM(neurons, return_sequences=False))
    model.add(Dropout(dropout))
#     model.add(LSTM(neurons, return_sequences=False, activation=activ_func))
#     model.add(Dropout(dropout))

    model.add(Dense(24, kernel_initializer='normal', activation=activ_func))
    model.add(Dense(1, kernel_initializer='normal'))

    model.compile(loss=loss, optimizer=optimizer, metrics=['mse'])
    return model

In [17]:
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_3 (CuDNNLSTM)     (None, None, 128)         146944    
_________________________________________________________________
dropout_5 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
cu_dnnlstm_4 (CuDNNLSTM)     (None, 128)               132096    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 24)                3096      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 25        
Total params: 282,161
Trainable params: 282,161
Non-trainable params: 0
_________________________________________________________________


In [None]:
gc.collect() # clean up the memory
num_epochs = 10

train_batches, train_steps = batch_iter(train_set, batch_size)
test_batches, test_steps = batch_iter(test_set, batch_size)

# train model on data
model.fit_generator(train_batches, train_steps,
          epochs=num_epochs,
          validation_data=test_batches, validation_steps=test_steps)

Epoch 1/10

In [None]:
predicted_revenues = {}
i = 0
for visitor_id, visitor_group in visitor_grouped_df:
    X = samples[i][0]
    num_timesteps = X.shape[0]
    predicted_log_revenue = np.expm1(model.predict(X.reshape(1, num_timesteps, num_features))).sum()
    predicted_revenues[visitor_id] = predicted_log_revenue
    true_revenue = visitor_group[LABEL].sum()
    i = i+1
    
    print('predict', prediction)
    print('true', true_revenue)

In [None]:
from sklearn import metrics

val_pred_df = pd.DataFrame({"fullVisitorId": val_df["fullVisitorId"].values})

val_pred_df["transactionRevenue"] = val_df["totals.transactionRevenue"].values
val_pred_df["predictedRevenue"] = np.expm1(pred_val)
val_pred_df = val_pred_df.groupby("fullVisitorId")["transactionRevenue", "PredictedRevenue"].sum().reset_index()
print(np.sqrt(metrics.mean_squared_error(np.log1p(val_pred_df["transactionRevenue"].values), np.log1p(val_pred_df["PredictedRevenue"].values))))