In [1]:
import gc
import re
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from livelossplot import PlotLossesKeras
from scipy.stats import norm, probplot
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, LSTM, CuDNNLSTM, Dropout, Activation, Bidirectional, TimeDistributed

Using TensorFlow backend.


In [2]:
IN_TRAIN = 'in/train-wrangled.csv'
IN_TEST = 'in/test-wrangled.csv'

LABEL = 'totals.transactionRevenue'

In [3]:
df = pd.read_csv(IN_TRAIN, dtype={'fullVisitorId': 'str'}, low_memory=False)
df.head()

Unnamed: 0,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,device.isMobile,totals.bounces,totals.hits,totals.newVisits,...,geoNetwork.subContinent Western Asia,geoNetwork.subContinent Western Europe,trafficSource.source (direct),trafficSource.source Other,trafficSource.source Partners,trafficSource.source analytics.google.com,trafficSource.source google,trafficSource.source mall.googleplex.com,trafficSource.source youtube.com,totals.transactionRevenue
0,0.0101,1131660440785968503,1131660440785968503_1472830385,0.088405,0.0,0.088405,0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,
1,0.0101,377306020877927890,377306020877927890_1472880147,0.089979,0.0,0.089979,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
2,0.0101,3895546263509774583,3895546263509774583_1472865386,0.089512,0.0,0.089512,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
3,0.0101,4763447161404445595,4763447161404445595_1472881213,0.090012,0.0,0.090012,0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,
4,0.0101,27294437909732085,27294437909732085_1472822600,0.088159,0.002538,0.088159,0,0.0,0.0,,...,0,0,0,0,0,0,1,0,0,


In [8]:
num_features = df.shape[1]

def gen_Xy(data, is_train=True):
    data = np.nan_to_num(data)
    num_features = data.shape[1]
    num_samples = data.shape[0]
    end = (num_features - 1) if is_train else num_features
    X = data[:, :num_features - 1]  # remove the last col (the label - remember we moved it to be the last col)
    y = data[:, num_features - 1].sum()
    return (X, np.log1p(y))

def gen_samples(df, is_train=True):
    samples = []
    time_sorted_df = df.sort_values(['date', 'visitStartTime'])
    visitor_grouped_df = time_sorted_df.groupby('fullVisitorId', axis=0, sort=False)
    for visitor_id, visitor_group in visitor_grouped_df:
        pruned_group = visitor_group.drop(['fullVisitorId', 'sessionId'], axis=1)
        sample = gen_Xy(pruned_group.values, is_train)
        samples.append(sample)
    return samples, visitor_grouped_df

In [9]:
samples, visitor_grouped_df = gen_samples(df)
samples[0]

(array([[0.        , 0.        , 0.00507614, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 1.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         1.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 1.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 1.        ,
         0.        , 0.        , 0

In [10]:
train_set, test_set = train_test_split(samples, test_size=.2, random_state=1, shuffle=False)

train_set = sorted(train_set, key=lambda x: x[0].shape[0])
test_set = sorted(test_set, key=lambda x: x[0].shape[0])

num_features = samples[0][0].shape[1]
num_features

157

In [24]:
# We need to manually calculate the batches, since we are using variable length
# All the items in one batch have the same length
def calc_num_batches(data, batch_size):
    current_sequence_length = 1
    num_batches = 0
    current_batch_size = 0
    for i in range(len(data)):
        sequence_length = data[i][0].shape[0]
        if sequence_length > current_sequence_length:
            current_sequence_length = sequence_length
            current_batch_size = 0  # a new batch starts here, since the sequence length changes
        if current_batch_size % batch_size == 0:
            num_batches += 1
        current_batch_size += 1
    return num_batches
    
def batch_iter(data, batch_size):
    num_batches_per_epoch = calc_num_batches(data, batch_size)

    def data_generator():
        data_size = len(data)
        while True:
            current_sequence_length = 1
            index = 0
            while True:
                start_index = index
                end_index = min(start_index + batch_size, data_size)
                X = []
                y = []

                for i in range(start_index, end_index):
                    sequence_length = data[i][0].shape[0]
                    # one batch should have all examples of the same length
                    # when we reach a greater sequence length, we stop the iteration and return the batch as is
                    if sequence_length > current_sequence_length:
                        current_sequence_length = sequence_length
                        break
                    X.append(data[i][0])
                    y.append(data[i][1])
                    index += 1

                # stack arrays to create 3d numpy arrays
                X_ndarr = np.dstack(X)
                y_ndarr = np.dstack(y)
                # move axes in the order required by the model
                X_ndarr = np.moveaxis(X_ndarr, 2, 0)
                y_ndarr = y_ndarr.reshape(y_ndarr.shape[2], y_ndarr.shape[0])
                yield X_ndarr, y_ndarr

                if index == data_size:
                    break

    return data_generator(), num_batches_per_epoch

In [21]:
batch_size = 128
train_batches, train_steps = batch_iter(train_set, batch_size)

# Just a simple validation of batch shapes to make sure our generator is fine
# The second dimension of the shape should be monotonically increasing up until the end
i = 0
for train_batch in train_batches:
    print(i, train_batch[0].shape)
    i += 1
    if i == train_steps:
        break

0 (128, 1, 157)
1 (128, 1, 157)
2 (128, 1, 157)
3 (128, 1, 157)
4 (128, 1, 157)
5 (128, 1, 157)
6 (128, 1, 157)
7 (128, 1, 157)
8 (128, 1, 157)
9 (128, 1, 157)
10 (128, 1, 157)
11 (128, 1, 157)
12 (128, 1, 157)
13 (128, 1, 157)
14 (128, 1, 157)
15 (128, 1, 157)
16 (128, 1, 157)
17 (128, 1, 157)
18 (128, 1, 157)
19 (128, 1, 157)
20 (128, 1, 157)
21 (128, 1, 157)
22 (128, 1, 157)
23 (128, 1, 157)
24 (128, 1, 157)
25 (128, 1, 157)
26 (128, 1, 157)
27 (128, 1, 157)
28 (128, 1, 157)
29 (128, 1, 157)
30 (128, 1, 157)
31 (128, 1, 157)
32 (128, 1, 157)
33 (128, 1, 157)
34 (128, 1, 157)
35 (128, 1, 157)
36 (128, 1, 157)
37 (128, 1, 157)
38 (128, 1, 157)
39 (128, 1, 157)
40 (128, 1, 157)
41 (128, 1, 157)
42 (128, 1, 157)
43 (128, 1, 157)
44 (128, 1, 157)
45 (128, 1, 157)
46 (128, 1, 157)
47 (128, 1, 157)
48 (128, 1, 157)
49 (128, 1, 157)
50 (128, 1, 157)
51 (128, 1, 157)
52 (128, 1, 157)
53 (128, 1, 157)
54 (128, 1, 157)
55 (128, 1, 157)
56 (128, 1, 157)
57 (128, 1, 157)
58 (128, 1, 157)
59 (128

544 (128, 1, 157)
545 (128, 1, 157)
546 (128, 1, 157)
547 (128, 1, 157)
548 (128, 1, 157)
549 (128, 1, 157)
550 (128, 1, 157)
551 (128, 1, 157)
552 (128, 1, 157)
553 (128, 1, 157)
554 (128, 1, 157)
555 (128, 1, 157)
556 (128, 1, 157)
557 (128, 1, 157)
558 (128, 1, 157)
559 (128, 1, 157)
560 (128, 1, 157)
561 (128, 1, 157)
562 (128, 1, 157)
563 (128, 1, 157)
564 (128, 1, 157)
565 (128, 1, 157)
566 (128, 1, 157)
567 (128, 1, 157)
568 (128, 1, 157)
569 (128, 1, 157)
570 (128, 1, 157)
571 (128, 1, 157)
572 (128, 1, 157)
573 (128, 1, 157)
574 (128, 1, 157)
575 (128, 1, 157)
576 (128, 1, 157)
577 (128, 1, 157)
578 (128, 1, 157)
579 (128, 1, 157)
580 (128, 1, 157)
581 (128, 1, 157)
582 (128, 1, 157)
583 (128, 1, 157)
584 (128, 1, 157)
585 (128, 1, 157)
586 (128, 1, 157)
587 (128, 1, 157)
588 (128, 1, 157)
589 (128, 1, 157)
590 (128, 1, 157)
591 (128, 1, 157)
592 (128, 1, 157)
593 (128, 1, 157)
594 (128, 1, 157)
595 (128, 1, 157)
596 (128, 1, 157)
597 (128, 1, 157)
598 (128, 1, 157)
599 (128, 

1082 (128, 1, 157)
1083 (128, 1, 157)
1084 (128, 1, 157)
1085 (128, 1, 157)
1086 (128, 1, 157)
1087 (128, 1, 157)
1088 (128, 1, 157)
1089 (128, 1, 157)
1090 (128, 1, 157)
1091 (128, 1, 157)
1092 (128, 1, 157)
1093 (128, 1, 157)
1094 (128, 1, 157)
1095 (128, 1, 157)
1096 (128, 1, 157)
1097 (128, 1, 157)
1098 (128, 1, 157)
1099 (128, 1, 157)
1100 (128, 1, 157)
1101 (128, 1, 157)
1102 (128, 1, 157)
1103 (128, 1, 157)
1104 (128, 1, 157)
1105 (128, 1, 157)
1106 (128, 1, 157)
1107 (128, 1, 157)
1108 (128, 1, 157)
1109 (128, 1, 157)
1110 (128, 1, 157)
1111 (128, 1, 157)
1112 (128, 1, 157)
1113 (128, 1, 157)
1114 (128, 1, 157)
1115 (128, 1, 157)
1116 (128, 1, 157)
1117 (128, 1, 157)
1118 (128, 1, 157)
1119 (128, 1, 157)
1120 (128, 1, 157)
1121 (128, 1, 157)
1122 (128, 1, 157)
1123 (128, 1, 157)
1124 (128, 1, 157)
1125 (128, 1, 157)
1126 (128, 1, 157)
1127 (128, 1, 157)
1128 (128, 1, 157)
1129 (128, 1, 157)
1130 (128, 1, 157)
1131 (128, 1, 157)
1132 (128, 1, 157)
1133 (128, 1, 157)
1134 (128, 1

1642 (128, 1, 157)
1643 (128, 1, 157)
1644 (128, 1, 157)
1645 (128, 1, 157)
1646 (128, 1, 157)
1647 (128, 1, 157)
1648 (128, 1, 157)
1649 (128, 1, 157)
1650 (128, 1, 157)
1651 (128, 1, 157)
1652 (128, 1, 157)
1653 (128, 1, 157)
1654 (128, 1, 157)
1655 (128, 1, 157)
1656 (128, 1, 157)
1657 (128, 1, 157)
1658 (128, 1, 157)
1659 (128, 1, 157)
1660 (128, 1, 157)
1661 (128, 1, 157)
1662 (128, 1, 157)
1663 (128, 1, 157)
1664 (128, 1, 157)
1665 (128, 1, 157)
1666 (128, 1, 157)
1667 (128, 1, 157)
1668 (128, 1, 157)
1669 (128, 1, 157)
1670 (128, 1, 157)
1671 (128, 1, 157)
1672 (128, 1, 157)
1673 (128, 1, 157)
1674 (128, 1, 157)
1675 (128, 1, 157)
1676 (128, 1, 157)
1677 (128, 1, 157)
1678 (128, 1, 157)
1679 (128, 1, 157)
1680 (128, 1, 157)
1681 (128, 1, 157)
1682 (128, 1, 157)
1683 (128, 1, 157)
1684 (128, 1, 157)
1685 (128, 1, 157)
1686 (128, 1, 157)
1687 (128, 1, 157)
1688 (128, 1, 157)
1689 (128, 1, 157)
1690 (128, 1, 157)
1691 (128, 1, 157)
1692 (128, 1, 157)
1693 (128, 1, 157)
1694 (128, 1

2202 (128, 1, 157)
2203 (128, 1, 157)
2204 (128, 1, 157)
2205 (128, 1, 157)
2206 (128, 1, 157)
2207 (128, 1, 157)
2208 (128, 1, 157)
2209 (128, 1, 157)
2210 (128, 1, 157)
2211 (128, 1, 157)
2212 (128, 1, 157)
2213 (128, 1, 157)
2214 (128, 1, 157)
2215 (128, 1, 157)
2216 (128, 1, 157)
2217 (128, 1, 157)
2218 (128, 1, 157)
2219 (128, 1, 157)
2220 (128, 1, 157)
2221 (128, 1, 157)
2222 (128, 1, 157)
2223 (128, 1, 157)
2224 (128, 1, 157)
2225 (128, 1, 157)
2226 (128, 1, 157)
2227 (128, 1, 157)
2228 (128, 1, 157)
2229 (128, 1, 157)
2230 (128, 1, 157)
2231 (128, 1, 157)
2232 (128, 1, 157)
2233 (128, 1, 157)
2234 (128, 1, 157)
2235 (128, 1, 157)
2236 (128, 1, 157)
2237 (128, 1, 157)
2238 (128, 1, 157)
2239 (128, 1, 157)
2240 (128, 1, 157)
2241 (128, 1, 157)
2242 (128, 1, 157)
2243 (128, 1, 157)
2244 (128, 1, 157)
2245 (128, 1, 157)
2246 (128, 1, 157)
2247 (128, 1, 157)
2248 (128, 1, 157)
2249 (128, 1, 157)
2250 (128, 1, 157)
2251 (128, 1, 157)
2252 (128, 1, 157)
2253 (128, 1, 157)
2254 (128, 1

2744 (128, 1, 157)
2745 (128, 1, 157)
2746 (128, 1, 157)
2747 (128, 1, 157)
2748 (128, 1, 157)
2749 (128, 1, 157)
2750 (128, 1, 157)
2751 (128, 1, 157)
2752 (128, 1, 157)
2753 (128, 1, 157)
2754 (128, 1, 157)
2755 (128, 1, 157)
2756 (128, 1, 157)
2757 (128, 1, 157)
2758 (128, 1, 157)
2759 (128, 1, 157)
2760 (128, 1, 157)
2761 (128, 1, 157)
2762 (128, 1, 157)
2763 (128, 1, 157)
2764 (128, 1, 157)
2765 (128, 1, 157)
2766 (128, 1, 157)
2767 (128, 1, 157)
2768 (128, 1, 157)
2769 (128, 1, 157)
2770 (128, 1, 157)
2771 (128, 1, 157)
2772 (128, 1, 157)
2773 (128, 1, 157)
2774 (128, 1, 157)
2775 (128, 1, 157)
2776 (128, 1, 157)
2777 (128, 1, 157)
2778 (128, 1, 157)
2779 (128, 1, 157)
2780 (128, 1, 157)
2781 (128, 1, 157)
2782 (128, 1, 157)
2783 (128, 1, 157)
2784 (128, 1, 157)
2785 (128, 1, 157)
2786 (128, 1, 157)
2787 (128, 1, 157)
2788 (128, 1, 157)
2789 (128, 1, 157)
2790 (128, 1, 157)
2791 (128, 1, 157)
2792 (128, 1, 157)
2793 (128, 1, 157)
2794 (128, 1, 157)
2795 (128, 1, 157)
2796 (128, 1

3294 (128, 1, 157)
3295 (128, 1, 157)
3296 (128, 1, 157)
3297 (128, 1, 157)
3298 (128, 1, 157)
3299 (128, 1, 157)
3300 (128, 1, 157)
3301 (128, 1, 157)
3302 (128, 1, 157)
3303 (128, 1, 157)
3304 (128, 1, 157)
3305 (128, 1, 157)
3306 (128, 1, 157)
3307 (128, 1, 157)
3308 (128, 1, 157)
3309 (128, 1, 157)
3310 (128, 1, 157)
3311 (128, 1, 157)
3312 (128, 1, 157)
3313 (128, 1, 157)
3314 (128, 1, 157)
3315 (128, 1, 157)
3316 (128, 1, 157)
3317 (128, 1, 157)
3318 (128, 1, 157)
3319 (128, 1, 157)
3320 (128, 1, 157)
3321 (128, 1, 157)
3322 (128, 1, 157)
3323 (128, 1, 157)
3324 (128, 1, 157)
3325 (128, 1, 157)
3326 (128, 1, 157)
3327 (128, 1, 157)
3328 (128, 1, 157)
3329 (128, 1, 157)
3330 (128, 1, 157)
3331 (128, 1, 157)
3332 (128, 1, 157)
3333 (128, 1, 157)
3334 (128, 1, 157)
3335 (128, 1, 157)
3336 (128, 1, 157)
3337 (128, 1, 157)
3338 (128, 1, 157)
3339 (128, 1, 157)
3340 (128, 1, 157)
3341 (128, 1, 157)
3342 (128, 1, 157)
3343 (128, 1, 157)
3344 (128, 1, 157)
3345 (128, 1, 157)
3346 (128, 1

3838 (128, 1, 157)
3839 (128, 1, 157)
3840 (128, 1, 157)
3841 (128, 1, 157)
3842 (128, 1, 157)
3843 (128, 1, 157)
3844 (128, 1, 157)
3845 (128, 1, 157)
3846 (128, 1, 157)
3847 (128, 1, 157)
3848 (128, 1, 157)
3849 (128, 1, 157)
3850 (128, 1, 157)
3851 (128, 1, 157)
3852 (128, 1, 157)
3853 (128, 1, 157)
3854 (128, 1, 157)
3855 (128, 1, 157)
3856 (128, 1, 157)
3857 (128, 1, 157)
3858 (128, 1, 157)
3859 (128, 1, 157)
3860 (128, 1, 157)
3861 (128, 1, 157)
3862 (128, 1, 157)
3863 (128, 1, 157)
3864 (128, 1, 157)
3865 (128, 1, 157)
3866 (128, 1, 157)
3867 (128, 1, 157)
3868 (128, 1, 157)
3869 (128, 1, 157)
3870 (128, 1, 157)
3871 (128, 1, 157)
3872 (128, 1, 157)
3873 (128, 1, 157)
3874 (128, 1, 157)
3875 (128, 1, 157)
3876 (128, 1, 157)
3877 (128, 1, 157)
3878 (128, 1, 157)
3879 (128, 1, 157)
3880 (128, 1, 157)
3881 (128, 1, 157)
3882 (128, 1, 157)
3883 (128, 1, 157)
3884 (128, 1, 157)
3885 (128, 1, 157)
3886 (128, 1, 157)
3887 (128, 1, 157)
3888 (47, 1, 157)
3889 (128, 2, 157)
3890 (128, 2,

4313 (128, 3, 157)
4314 (128, 3, 157)
4315 (128, 3, 157)
4316 (128, 3, 157)
4317 (128, 3, 157)
4318 (128, 3, 157)
4319 (128, 3, 157)
4320 (128, 3, 157)
4321 (128, 3, 157)
4322 (128, 3, 157)
4323 (128, 3, 157)
4324 (128, 3, 157)
4325 (128, 3, 157)
4326 (128, 3, 157)
4327 (128, 3, 157)
4328 (128, 3, 157)
4329 (128, 3, 157)
4330 (128, 3, 157)
4331 (128, 3, 157)
4332 (128, 3, 157)
4333 (128, 3, 157)
4334 (128, 3, 157)
4335 (128, 3, 157)
4336 (128, 3, 157)
4337 (128, 3, 157)
4338 (128, 3, 157)
4339 (128, 3, 157)
4340 (128, 3, 157)
4341 (128, 3, 157)
4342 (128, 3, 157)
4343 (128, 3, 157)
4344 (128, 3, 157)
4345 (128, 3, 157)
4346 (128, 3, 157)
4347 (128, 3, 157)
4348 (128, 3, 157)
4349 (128, 3, 157)
4350 (128, 3, 157)
4351 (43, 3, 157)
4352 (128, 4, 157)
4353 (128, 4, 157)
4354 (128, 4, 157)
4355 (128, 4, 157)
4356 (128, 4, 157)
4357 (128, 4, 157)
4358 (128, 4, 157)
4359 (128, 4, 157)
4360 (128, 4, 157)
4361 (128, 4, 157)
4362 (128, 4, 157)
4363 (128, 4, 157)
4364 (128, 4, 157)
4365 (128, 4,

In [28]:
def build_model(neurons=128, activ_func='relu', dropout=.4, loss='mean_squared_error', optimizer='adam'):
    model = Sequential()

    model.add(CuDNNLSTM(neurons, return_sequences=True, input_shape=(None, num_features)))
    model.add(Dropout(dropout))
    model.add(CuDNNLSTM(neurons, return_sequences=True))
    model.add(Dropout(dropout))
    model.add(CuDNNLSTM(neurons, return_sequences=False))
    model.add(Dropout(dropout))

    model.add(Dense(neurons, kernel_initializer='normal', activation=activ_func))
    model.add(Dense(24, kernel_initializer='normal', activation=activ_func))
    model.add(Dense(1, kernel_initializer='normal'))

    model.compile(loss=loss, optimizer=optimizer, metrics=['mse'])
    return model

In [29]:
model = build_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_10 (CuDNNLSTM)    (None, None, 128)         146944    
_________________________________________________________________
dropout_10 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
cu_dnnlstm_11 (CuDNNLSTM)    (None, None, 128)         132096    
_________________________________________________________________
dropout_11 (Dropout)         (None, None, 128)         0         
_________________________________________________________________
cu_dnnlstm_12 (CuDNNLSTM)    (None, 128)               132096    
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               16512     
__________

In [31]:
gc.collect() # clean up the memory
num_epochs = 7

train_batches, train_steps = batch_iter(train_set, batch_size)
test_batches, test_steps = batch_iter(test_set, batch_size)

# train model on data
model.fit_generator(train_batches, train_steps,
          epochs=num_epochs,
          validation_data=test_batches, validation_steps=test_steps)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x25eb9f3f710>

In [60]:
def gen_predicted_revenues(samples, visitor_grouped_df, is_train=True):
    num_visitor_ids = len(visitor_grouped_df)
    predicted_revenues = {}
    i = 0
    for visitor_id, visitor_group in visitor_grouped_df:
        X = samples[i][0]
        num_timesteps = X.shape[0]
        num_features = X.shape[1]
        predicted_revenue = np.expm1(model.predict(X.reshape(1, num_timesteps, num_features))).sum()
        i = i+1
        if is_train:
            true_revenue = visitor_group[LABEL].sum()
            predicted_revenues[visitor_id] = (predicted_revenue, true_revenue)
        else:
            predicted_revenues[visitor_id] = predicted_revenue
    return predicted_revenues

In [17]:
%%time
predicted_revenues = gen_predicted_revenues(samples, visitor_grouped_df)

Wall time: 33min 7s


In [59]:
from sklearn import metrics

pred = [rev[0] for rev in list(predicted_revenues.values())]
pred_norm = [0 if p < 9 else np.expm1(p) for p in pred]
true = [rev[1] for rev in list(predicted_revenues.values())]

print(np.sqrt(metrics.mean_squared_error(np.log1p(pred_norm), np.log1p(true))))

1.8482112532128303


In [33]:
df_test = pd.read_csv(IN_TEST, dtype={'fullVisitorId': 'str'}, low_memory=False)
samples_test, visitor_grouped_df_test = gen_samples(df_test, is_train=False)
samples_test[0][0].shape

(2, 156)

In [34]:
predicted_revenues_test = gen_predicted_revenues(samples_test, visitor_grouped_df_test, is_train=False)

ValueError: Error when checking input: expected cu_dnnlstm_10_input to have shape (None, 157) but got array with shape (2, 156)

In [None]:
pred_test_norm = {}
for visitor_id, p in predicted_revenues_test.items():
    pred_test_norm[visitor_id] = 0 if p < .4 else p
pred_test_norm

In [None]:
out_df = pd.DataFrame.from_dict(pred_test_norm, orient='index', columns=['PredictedLogRevenue'])
out_df.to_csv('trainrecurrent.csv')