In [0]:
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, LSTM, GRU, TimeDistributed, Input
from keras.optimizers import SGD
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder


import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import datetime

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import cross_validate,GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.utils.multiclass import unique_labels

## Load training and testing datasets
train: 2016-01-01 : 2017-04-22

test: 2017-04-23 : 2017-05-31

In [0]:
# train_df = pd.read_csv("../input/finaldata/sam_train_final.csv")
# test = pd.read_csv("../input/finaldata/sam_test_final.csv")
train_df = pd.read_csv("../input/finaltable/train_final.csv")
test = pd.read_csv("../input/finaltable/test_final.csv")

In [0]:
train_df = train_df.drop(columns=[ 'population', 'reserve_visitors', 'days_diff', 'day', 'season'])
test = test.drop(columns=['population', 'reserve_visitors','days_diff', 'day', 'season'])

In [0]:
# refine column names
train_df = train_df.rename({'visitors_x': 'visitors'}, axis = 1)
train_df = train_df.rename({'day_of_week_y': 'day_of_week'}, axis = 1)
train_df = train_df.rename({'month_y': 'month'}, axis = 1)
train_df = train_df.rename({'longitude_y': 'longitude'}, axis = 1)
train_df = train_df.rename({'latitude_y': 'latitude'}, axis = 1)
test = test.rename({'latitude_y': 'latitude'}, axis = 1)
test = test.rename({'longitude_y': 'longitude'}, axis = 1)
test = test.rename({'month_y': 'month'}, axis = 1)
test = test.rename({'day_of_week_y': 'day_of_week'}, axis = 1)


In [0]:
train_df = train_df.loc[:, ~train_df.columns.str.contains('^Unnamed')]
train_df.columns

Index(['visit_date', 'air_store_id', 'Food_Type', 'is_holiday', 'next_day',
       'prev_day', 'visitors', 'week', 'daysToPrev25th', 'prev_visitors',
       'year', 'month', 'min_visitors', 'mean_visitors', 'median_visitors',
       'max_visitors', 'count_observations', 'latitude', 'longitude',
       'day_of_week', 'Consecutive_holidays'],
      dtype='object')

In [0]:
test = test.loc[:, ~test.columns.str.contains('^Unnamed')]
test.columns

Index(['visit_date', 'air_store_id', 'Food_Type', 'is_holiday', 'next_day',
       'prev_day', 'week', 'daysToPrev25th', 'prev_visitors', 'year', 'month',
       'min_visitors', 'mean_visitors', 'median_visitors', 'max_visitors',
       'count_observations', 'latitude', 'longitude', 'day_of_week',
       'Consecutive_holidays'],
      dtype='object')

## Encode categorical columns
Categorical columns: 'Food_Type','season', 'day_of_week', 'air_store_id'

One-hot encoding may provide better result, but I applied labels encoding to avoid high dimensional feature space.

In [0]:
# Weekday
le_weekday = LabelEncoder()
le_weekday.fit(train_df['day_of_week'])
train_df['day_of_week'] = le_weekday.transform(train_df['day_of_week'])
test['day_of_week'] = le_weekday.transform(test['day_of_week'])

# id
le_id = LabelEncoder()
le_id.fit(train_df['air_store_id'])
train_df['air_store_id'] = le_id.transform(train_df['air_store_id'])
test['air_store_id'] = le_id.transform(test['air_store_id'])

# food type
le_ftype = LabelEncoder()
le_ftype.fit(train_df['Food_Type'])
train_df['Food_Type'] = le_ftype.transform(train_df['Food_Type'])
test['Food_Type'] = le_ftype.transform(test['Food_Type'])

# Season
# le_season = LabelEncoder()
# le_season.fit(train_df['season'])
# train_df['season'] = le_season.transform(train_df['season'])
# test['season'] = le_season.transform(test['season'])

# lbl = LabelEncoder()
# # Adjust categorical columns in training set
# train_df['Food_Type'] = lbl.fit_transform(train_df['Food_Type'])
# train_df['season'] = lbl.fit_transform(train_df['season'])
# train_df['day_of_week'] = lbl.fit_transform(train_df['day_of_week'])
# train_df['air_store_id'] = lbl.fit_transform(train_df['air_store_id'])
# # Adjust categorical columns in test set
# test['Food_Type'] = lbl.fit_transform(test['Food_Type'])
# test['season'] = lbl.fit_transform(test['season'])
# test['day_of_week'] = lbl.fit_transform(test['day_of_week'])
# test['air_store_id'] = lbl.fit_transform(test['air_store_id'])

In [0]:
train_df['air_store_id'].nunique()

829

## Normalization 

All features (including one-hot encoded) are normalized to zero mean and unit variance. Each pageviews series normalized independently.

## Fill the cells of missing values

replace with 0 or 1 ??？

In [0]:
train_df = train_df.fillna(0)
test = test.fillna(0)

## Simutaneous transformation of Train and test sets

Time-independent features (autocorrelations, country, etc) are "stretched" to timeseries length.

In [0]:
# combine train and test sets
X_all = train_df.append(test)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [0]:
X_all.shape

(284127, 21)

In [0]:
# date table (includes all dates for training and test period)
dates = np.arange(np.datetime64(X_all.visit_date.min()),
                  np.datetime64(X_all.visit_date.max()) + 1,
                  datetime.timedelta(days=1))
ids = X_all['air_store_id'].unique()
dates_all = dates.tolist()*len(ids)
ids_all = np.repeat(ids, len(dates.tolist())).tolist()
df_all = pd.DataFrame({"air_store_id": ids_all, "visit_date": dates_all})
df_all['visit_date'] = df_all['visit_date'].copy().apply(lambda x: str(x)[:10])

In [0]:
df_all

Unnamed: 0,air_store_id,visit_date
0,603,2016-01-01
1,603,2016-01-02
2,603,2016-01-03
3,603,2016-01-04
4,603,2016-01-05
...,...,...
428588,98,2017-05-27
428589,98,2017-05-28
428590,98,2017-05-29
428591,98,2017-05-30


Data relevant to 'visit_date'

In [0]:
# create copy of X_all with data relevant to 'visit_date'
X_dates = X_all[['visit_date', 'year','month','week', 'is_holiday','next_day','prev_day',\
                 'daysToPrev25th','day_of_week','Consecutive_holidays']].copy()

# remove duplicates to avoid memory issues
X_dates = X_dates.drop_duplicates('visit_date')

# merge dataframe that represents all dates per each restaurant with information about each date
df_to_reshape = df_all.merge(X_dates,
                             how = "left",
                             left_on = 'visit_date',
                             right_on = 'visit_date')
print(df_to_reshape.columns)
print(df_to_reshape.shape)

Index(['air_store_id', 'visit_date', 'year', 'month', 'week', 'is_holiday',
       'next_day', 'prev_day', 'daysToPrev25th', 'day_of_week',
       'Consecutive_holidays'],
      dtype='object')
(428593, 11)


Data relevant to 'air_store_id'

In [0]:
# create copy of X_all with data relevant to 'air_store_id'
X_stores = X_all[['air_store_id', 'Food_Type', 'latitude','longitude']].copy()       

# remove duplicates to avoid memory issues
X_stores = X_stores.drop_duplicates('air_store_id')

# merge dataframe that represents all dates per each restaurant with information about each restaurant
df_to_reshape = df_to_reshape.merge(X_stores,
                                    how = "left",
                                    left_on = 'air_store_id',
                                    right_on = 'air_store_id')
print(df_to_reshape.columns)
print(df_to_reshape.shape)

Index(['air_store_id', 'visit_date', 'year', 'month', 'week', 'is_holiday',
       'next_day', 'prev_day', 'daysToPrev25th', 'day_of_week',
       'Consecutive_holidays', 'Food_Type', 'latitude', 'longitude'],
      dtype='object')
(428593, 14)


Data relevant to 'air_store_id'

In [0]:
# merge dataframe that represents all dates per each restaurant with inf. about each restaurant per specific date
df_to_reshape = df_to_reshape.merge(X_all[['air_store_id', 'visit_date','prev_visitors', 'mean_visitors', 
                                       'median_visitors', 'max_visitors', 'min_visitors', 'count_observations','visitors']],
                                    how = "left",
                                    left_on = ['air_store_id', 'visit_date'],
                                    right_on = ['air_store_id', 'visit_date'])
print(df_to_reshape.columns)
print(df_to_reshape.shape)

Index(['air_store_id', 'visit_date', 'year', 'month', 'week', 'is_holiday',
       'next_day', 'prev_day', 'daysToPrev25th', 'day_of_week',
       'Consecutive_holidays', 'Food_Type', 'latitude', 'longitude',
       'prev_visitors', 'mean_visitors', 'median_visitors', 'max_visitors',
       'min_visitors', 'count_observations', 'visitors'],
      dtype='object')
(428593, 21)


In [0]:
# separate 'visitors' into output array
Y_lstm_df = df_to_reshape[['visit_date', 'air_store_id', 'visitors']].copy().fillna(0)

# take log(y+1)
Y_lstm_df['visitors'] = np.log1p(Y_lstm_df['visitors'].values)

# add flag for days when a restaurant was closed
df_to_reshape['closed_flag'] = np.where(df_to_reshape['visitors'].isnull() &
                                        df_to_reshape['visit_date'].isin(train_df['visit_date']).values,1,0)

# drop 'visitors' and from dataset
df_to_reshape = df_to_reshape.drop(['visitors'], axis = 1)

# fill in NaN values
df_to_reshape = df_to_reshape.fillna(-1)

# list of df_to_reshape columns without 'air_store_id' and 'visit_date'
columns_list = [x for x in list(df_to_reshape.iloc[:,2:])]

# bound all numerical values between -1 and 1
# note: to avoid data leakage 'fit' should be made on traid data and 'transform' on train and test data
# in this case all data in test set is taken from train set, thus fit/transform on all data 
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(df_to_reshape[columns_list])
df_to_reshape[columns_list] = scaler.transform(df_to_reshape[columns_list])

In [0]:
# SPECIFIC PREPARATION FOR NEURAL NETWORK AND ENCODER/DECODER ---------------
# reshape X into (samples, timesteps, features)
X_all_lstm = df_to_reshape.values[:,2:].reshape(len(ids),
                                                len(dates),
                                                df_to_reshape.shape[1]-2)

# isolate output for train set and reshape it for time series
Y_lstm_df = Y_lstm_df.loc[Y_lstm_df['visit_date'].isin(train_df['visit_date'].values) &
                          Y_lstm_df['air_store_id'].isin(train_df['air_store_id'].values),]
Y_lstm = Y_lstm_df.values[:,2].reshape(len(train_df['air_store_id'].unique()),
                                       len(train_df['visit_date'].unique()),
                                       1)


In [0]:
# test dates
n_test_dates = len(test['visit_date'].unique())

# make additional features for number of visitors in t-1, t-2, ... t-7
t_minus = np.ones([Y_lstm.shape[0],Y_lstm.shape[1],1])
for i in range(1,8):
    temp = Y_lstm.copy()
    temp[:,i:,:] = Y_lstm[:,0:-i,:].copy()
    t_minus = np.concatenate((t_minus[...], temp[...]), axis = 2)
t_minus = t_minus[:,:,1:]
print ("t_minus shape", t_minus.shape)


# split X_all into training and test data
X_lstm = X_all_lstm[:,:-n_test_dates,:]
X_lstm_test = X_all_lstm[:,-n_test_dates:,:]

# add t-1, t-2 ... t-7 visitors to feature vector
X_lstm = np.concatenate((X_lstm[...], t_minus[...]), axis = 2)

# split training set into train and validation sets
X_tr = X_lstm[:,39:-140,:]
Y_tr = Y_lstm[:,39:-140,:]

X_val = X_lstm[:,-140:,:]
Y_val = Y_lstm[:,-140:,:]

t_minus shape (829, 478, 7)


## Specific Preparation for Neural Network

This sampling works as effective data augmentation mechanism: training code randomly chooses starting point for each timeseries on each step, generating endless stream of almost non-repeating data.

X train dataset: X_lstm

Y train dataset: Y_lstm

X test dataset: X_lstm_test



#### All features for both training and test time periods
batch size: 829 unique stores

timestep: from 2016-01-01 to 2017-05-31 => 517 days

features: 20 features

In [0]:
X_all_lstm.shape

(829, 517, 19)

#### Target variable for training time periods:
batch size: 829 unique stores

timestep: from 2016-01-01 to 2017-04-22 => 478 days

features: 1 target variable

In [0]:
Y_lstm.shape

(829, 478, 1)

#### All features for training time periods:
batch size: 829 unique stores

timestep: from 2016-01-01 to 2017-04-22 => 478 days

features: 20 features

In [0]:
X_lstm.shape

(829, 478, 26)

#### All features for test time periods:
batch size: 829 unique stores

timestep: from 2016-04-23 to 2017-05-31 => 39 days

features: 20 features

In [0]:
X_lstm_test.shape

(829, 39, 19)

In [0]:
X_tr.shape

(829, 299, 26)

## LSTM

-------------------------------------------------- Plan A-----------------------------------------------------------

In [0]:
use_dropout = True

In [0]:
model = Sequential()
# First LSTM layer with Dropout regularization
model.add(LSTM(200, input_shape=(X_lstm.shape[1],X_lstm.shape[2]), return_sequences=True))
# if use_dropout:
#     model.add(Dropout(0.2))
# # Second LSTM layer
# model.add(Dense(100, activation='relu'))
# if use_dropout:
#     model.add(Dropout(0.2))
# The output layer
model.add(TimeDistributed(Dense(1))) # apply that Dense function across every output over time

model.compile(loss='mean_squared_error', optimizer='adam')


In [0]:
model.fit(X_lstm, Y_lstm, epochs=6, batch_size=100, verbose=1)

In [0]:
print(model.summary())

In [0]:
predictions = model.predict(X_lstm_test, batch_size=478)
pred_visitors = scaler.inverse_transform(predictions.reshape(-1, 1))
testY_inverse = scaler.inverse_transform(y_test.reshape(-1, 1))

def RMSE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5

rmsle = RMSE(testY_inverse, pred_visitors)
print('Test RMSLE: %.3f' % rmsle)

-----------------------------------------------------------Plan B-------------------------------------------------------------------

#### Working with long timeseries
LSTM/GRU is a great solution for relatively short sequences, up to 100-300 items. On longer sequences LSTM/GRU still works, but can gradually forget information from the oldest items.  Competition timeseries is up to 478 days long, so I have to find some method to "strengthen" GRU memory. 

The encoder takes input features of 39 days (t1, t2 … t39) and encode their hidden states through LSTM neural network. Then it pass the hidden states to decoder. Decoder use them with the features of 39 days shifted 1 day forward (t2, t3 … T40) to predict number of visitors per each of 829 restaurants in t_40.



#### Training and validation
There are two ways to split timeseries into training and validation datasets:

1. Walk-forward split. This is not actually a split: we train on full dataset and validate on full dataset, using different timeframes. Timeframe for validation is shifted forward by one prediction interval relative to timeframe for training.

2. Side-by-side split. This is traditional split model for mainstream machine learning. Dataset splits into independent parts, one part used strictly for training and another part used strictly for validation.

Walk-forward is preferable, because it directly relates to the competition goal: predict future values using historical values. But this split consumes datapoints at the end of timeseries, thus making hard to train model to precisely predict the future. 

I used validation (with walk-forward split) only for model tuning. Final model to predict future values was trained in blind mode, without any validation.

#### Losses and regularization


In [0]:
# ENCODER-DECODER MODEL ===================================================
# many thanks to the following resources:
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
# https://blog.keras.io/building-autoencoders-in-keras.html
# http://cs231n.stanford.edu/slides/2017/cs231n_2017_lecture13.pdf
# https://machinelearningmastery.com/define-encoder-decoder-sequence-sequence-model-neural-machine-translation-keras/
# https://github.com/Arturus/kaggle-web-traffic

# MODEL FOR ENCODER AND DECODER -------------------------------------------
num_encoder_tokens = X_lstm.shape[2]
latent_dim = 64 # to avoid "kernel run out of time" situation. I used 256.

# encoder training
encoder_inputs = Input(shape = (None, num_encoder_tokens))
encoder = LSTM(latent_dim, 
               batch_input_shape = (1, None, num_encoder_tokens),
               stateful = False,
               return_sequences = True,
               return_state = True,
               recurrent_initializer = 'glorot_uniform')

encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c] # 'encoder_outputs' are ignored and only states are kept.

# Decoder training, using 'encoder_states' as initial state.
decoder_inputs = Input(shape=(None, num_encoder_tokens))

decoder_lstm_1 = GRU(latent_dim,
                      batch_input_shape = (1, None, num_encoder_tokens),
                      stateful = False,
                      return_sequences = True,
                      return_state = False,
                      dropout = 0.2,
                      recurrent_dropout = 0.2) # True

decoder_lstm_2 = GRU(32, # to avoid "kernel run out of time" situation. I used 128.
                     stateful = False,
                     return_sequences = True,
                     return_state = True,
                     dropout = 0.2,
                     recurrent_dropout = 0.2)

decoder_outputs, _, _ = decoder_lstm_2(decoder_lstm_1(decoder_inputs, initial_state = encoder_states))
decoder_dense = TimeDistributed(Dense(Y_lstm.shape[2], activation = 'relu'))
decoder_outputs = decoder_dense(decoder_outputs)

# training model
training_model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
training_model.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [0]:
# useful for understanding the model architecture
training_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, None, 26)     0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, None, 26)     0                                            
__________________________________________________________________________________________________
lstm_4 (LSTM)                   [(None, None, 64), ( 23296       input_5[0][0]                    
__________________________________________________________________________________________________
lstm_5 (LSTM)                   (None, None, 64)     23296       input_6[0][0]                    
                                                                 lstm_4[0][1]               

In [0]:
# GENERATOR APPLIED TO FEED ENCODER AND DECODER ---------------------------
# generator that randomly creates times series of 39 consecutive days
# theses time series has following 3d shape: 829 restaurants * 39 days * num_features 
def dec_enc_n_days_gen(X_3d, Y_3d, length):
    while 1:
        decoder_boundary = X_3d.shape[1] - length - 1
        
        encoder_start = np.random.randint(0, decoder_boundary)
        encoder_end = encoder_start + length
        
        decoder_start = encoder_start + 1
        decoder_end = encoder_end + 1
        
        X_to_conc = X_3d[:, encoder_start:encoder_end, :]
        Y_to_conc = Y_3d[:, encoder_start:encoder_end, :]
        X_to_decode = X_3d[:, decoder_start:decoder_end, :]
        Y_decoder = Y_3d[:, decoder_start:decoder_end, :]
        
        yield([X_to_conc,
               X_to_decode],
               Y_decoder)


In [0]:
# TRAINING -------------------------------------------------------------
# Training on X_tr/Y_tr and validate with X_val/Y_val
# To perform validation training on validation data should be
# made instead of training on full data set.
# Then validation check is made on period outside of training data
# (included in code below).
'''
training_model.fit_generator(dec_enc_n_days_gen(X_tr, Y_tr, 39),
                             validation_data = dec_enc_n_days_gen(X_val, Y_val, 39),
                             steps_per_epoch = X_lstm.shape[0],
                             validation_steps = X_val.shape[0],
                             verbose = 1,
                             epochs = 1)
'''

# Training on full dataset
training_model.fit_generator(dec_enc_n_days_gen(X_lstm[:,:,:], Y_lstm[:,:,:], 39),
                            steps_per_epoch = X_lstm[:,:,:].shape[0],
                            verbose = 1,
                            epochs = 1)

Epoch 1/1


<keras.callbacks.callbacks.History at 0x7f92a067cba8>

In [0]:
# PREDICTION FUNCTION --------------------------------------------------

# function takes 39 days before first prediction day (input_seq)
# then using encoder to identify hidden states for these 39 days.
# Next, decoder takes hidden states provided by encoder
# and predicts number of visitors from day 2 to day 40.
# Day 40 is the first day of target_seq.

# Predicted value for day 40 is appended to features of day 41.
# Then function takes period from day 2 to day 40 and repeat the process
# unil all days in target sequence get their predictions. 

# The output of the function is the vector with predictions that has
# following shape: 820 restaurants * 39 days * 1 predicted visitors amount

def predict_sequence(inf_enc, inf_dec, input_seq, Y_input_seq, target_seq):
    # state of input sequence produced by encoder
    state = inf_enc.predict(input_seq)
    
    # restrict target sequence to the same shape as X_lstm_test
    target_seq = target_seq[:,:, :X_lstm_test.shape[2]]
    
    
    # create vector that contains y for previous 7 days
    t_minus_seq = np.concatenate((Y_input_seq[:,-1:,:], input_seq[:,-1:, X_lstm_test.shape[2]:-1]), axis = 2)
    
    # current sequence that is going to be modified each iteration of the prediction loop
    current_seq = input_seq.copy()
    
    
    # predicting outputs
    output = np.ones([target_seq.shape[0],1,1])
    for i in range(target_seq.shape[1]):
        # add visitors for previous 7 days into features of a new day
        new_day_features = np.concatenate((target_seq[:,i:i+1,:], t_minus_seq[...]), axis = 2)
        
        # move prediction window one day forward
        current_seq = np.concatenate((current_seq[:,1:,:], new_day_features[:,]), axis = 1)
        
        
        # predict visitors amount
        pred = inf_dec.predict([current_seq] + state)
        
        # update t_minus_seq
        t_minus_seq = np.concatenate((pred[:,-1:,:], t_minus_seq[...]), axis = 2)
        t_minus_seq = t_minus_seq[:,:,:-1]        
        
        # update predicitons list
        output = np.concatenate((output[...], pred[:,-1:,:]), axis = 1)
        
        # update state
        state = inf_enc.predict(current_seq)
    
    return output[:,1:,:]

In [0]:
# INFERENCE ENCODER AND DECODER -----------------------------------------    
# inference encoder
encoder_model = Model(encoder_inputs, encoder_states)

# inference decoder
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs,_,_ = decoder_lstm_2(decoder_lstm_1(decoder_inputs,
                                                    initial_state = decoder_states_inputs))
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs])

In [0]:
# Predicting test values
enc_dec_pred = predict_sequence(encoder_model,
                                decoder_model,
                                X_lstm[:,-X_lstm_test.shape[1]:,:],
                                Y_lstm[:,-X_lstm_test.shape[1]:,:],
                                X_lstm_test[:,:,:])


In [0]:
# test data
sample_sub = pd.read_csv('../input/recruit-restaurant-visitor-forecasting/sample_submission.csv')
# transform test data
air_test = sample_sub.copy()
air_test['air_store_id'] = air_test['id'].apply(lambda x: str(x)[:-11])
air_test['visit_date'] = air_test['id'].apply(lambda x: str(x)[-10:])

# dataframe for predictions
submission_lstm = air_test.copy()
submission_lstm.head()

Unnamed: 0,id,visitors,air_store_id,visit_date
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25
3,air_00a91d42b08b08d9_2017-04-26,0,air_00a91d42b08b08d9,2017-04-26
4,air_00a91d42b08b08d9_2017-04-27,0,air_00a91d42b08b08d9,2017-04-27


In [0]:
# Add predicted test values to submission dataset ---------------------

# Note: it is important to preserve the order of time series.
# Thus, test set will contain all 829 lines in the same order as train set.
# To make this 'air_store_id' is taken as in X and not in X_test (second line of 'test' variable below).
# Only relevant results will be merged for submission dataframe
test_df = df_to_reshape.loc[df_to_reshape['visit_date'].isin(test['visit_date'].values) &
                         df_to_reshape['air_store_id'].isin(train_df['air_store_id'].values),]


# reshape predicted values to initial shape
test_pred = enc_dec_pred.reshape(test_df.shape[0], 1)
test_pred_exp = np.exp(test_pred) - 1.0
test_pred_exp[test_pred_exp<0] = 0

# add predictions to dataframe with 'air_store_id' and 'visit_date'
test_df_pred = test_df[['air_store_id', 'visit_date']].copy()
test_df_pred['predicted'] = test_pred_exp

In [0]:
# reverse transform of 'air_store_id'
test_df_pred['air_store_id'] = le_id.inverse_transform(test_df_pred['air_store_id'])

# finalizing submission csv file
submission_df = submission_lstm.merge(test_df_pred,
                                     how = 'left',
                                     left_on = ['air_store_id', 'visit_date'],
                                     right_on = ['air_store_id', 'visit_date'])

submission_df['visitors'] = submission_df['predicted']
submission_df = submission_df.drop(['air_store_id', 'visit_date', 'predicted'], axis = 1)
submission_df.to_csv('submission.csv', index = False)