In [13]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [14]:

# Prepare scalars to normalize data
input_scaler = MinMaxScaler()
output_scaler = StandardScaler()

In [17]:
calendar_df = pd.read_csv('calendar.csv')
prices_df = pd.read_csv('sell_prices.csv')
sales_df = pd.read_csv('sales_train_validation.csv')
submission_df = pd.read_csv('sample_submission.csv')

In [18]:
sales_df
#submission_df[submission_df['id'] == 'FOODS_3_823_WI_3_validation']

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [19]:
def train_test(df, train_steps, test_steps):
    num_steps = train_steps + test_steps
    cols = len(df.columns)
    iterations = cols//num_steps
    
    test_range = []
    train_range = []
    
    for i in range(iterations):
        train_start = cols - (i+1)*num_steps
        train_end = train_start + train_steps
        test_start = train_end
        test_end = test_start + test_steps
        
        train_data = df.iloc[:, train_start:train_end].to_numpy()
        test_data = df.iloc[:, test_start:test_end].to_numpy()
        train_range.extend(train_data)
        test_range.extend(test_data)
    
    return train_range, test_range

In [20]:
X_data, y_data = train_test(sales_df.iloc[:, 6:], 100, 28)

In [21]:
print('Train data shape: ', len(X_data))
print('Text data shape: ', len(y_data))

Train data shape:  426860
Text data shape:  426860


In [22]:
X_tensors = np.array(X_data)
input_scaler.fit(X_tensors)
X_tensors = input_scaler.transform(X_tensors)
X_tensors = X_tensors.reshape((X_tensors.shape[0], X_tensors.shape[1], 1))

In [23]:
Y_tensors = np.array(y_data)
output_scaler.fit(Y_tensors)
Y_tensors = output_scaler.transform(Y_tensors)

In [24]:

train_perc = 0.8
n = int(X_tensors.shape[0]*train_perc)
train_index = np.random.choice(X_tensors.shape[0], n, replace=False)
test_index = np.asarray(list(set(range(n)) - set(train_index)))

In [25]:
X_train = X_tensors[train_index]
X_test = X_tensors[test_index]
y_train = Y_tensors[train_index]
y_test = Y_tensors[test_index]
print('Train data input samples: ', len(X_train))
print('Test data input samples: ', len(X_test))
print('Train data output samples: ', len(y_train))
print('Test data output samples: ', len(y_test))

Train data input samples:  341488
Test data input samples:  68346
Train data output samples:  341488
Test data output samples:  68346


In [26]:
from keras import backend as K

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Reshape
from keras.layers import LSTM
from tensorflow.compat.v1.keras.layers import CuDNNLSTM 
from keras.layers import Conv1D
from keras.utils import to_categorical
from keras.layers import MaxPooling1D
from keras.layers import  GlobalAveragePooling1D
from keras.utils import to_categorical

import tensorflow as tf

In [27]:
def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

def rmse(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

In [28]:
steps_in = X_train.shape[1]
n_features = X_train.shape[2]
steps_out = y_train.shape[1]

In [30]:
model = Sequential()
model.add(LSTM(25, return_sequences=True, input_shape=(steps_in, n_features)))
model.add(LSTM(25))

In [31]:
model.add(Dense(steps_out))
model.compile(optimizer='adam', loss=root_mean_squared_error)

In [32]:
model.fit(X_train, y_train, epochs=1, verbose=1)

Epoch 1/1


<keras.callbacks.callbacks.History at 0x20511fead88>

In [35]:
raw_preds = model.predict(X_test)

In [36]:
all_pred = output_scaler.inverse_transform(raw_preds)
all_pred = np.round(np.abs(all_pred))

In [37]:
def rmse(predictions, targets):
    return np.sqrt(np.mean((predictions-targets)**2))

In [38]:
rmse(all_pred, y_test)

2.56796485069535

In [39]:
def submission_set(df, steps):
    
    cols = len(df.columns)
    submit_data = df.iloc[:, (cols - steps):cols].to_numpy()
    
    return submit_data

In [40]:
# Set the number of steps to be the same as the training data
submission_set = submission_set(sales_df, 100)

In [41]:
# Scale the predictions
submission_tensors = input_scaler.transform(submission_set)

In [42]:
# Reshape the input tensors into the same dimensions as the training data
submission_tensors = submission_tensors.reshape((submission_tensors.shape[0], submission_tensors.shape[1], 1))

In [43]:
# Make our predictions
submission_preds = model.predict(submission_tensors)

In [44]:
# Scale and round our predictions
submission_preds = output_scaler.inverse_transform(submission_preds)
submission_preds = np.round(np.abs(submission_preds))

In [45]:
# Create a dataframe for the predictions
submission_data = pd.DataFrame(data=submission_preds,
                              columns=[F for F in submission_df.columns[1:]])
submission_data.insert(0, 'id', submission_df['id'][:30490])
submission_data.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,HOBBIES_1_002_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,HOBBIES_1_004_CA_1_validation,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,HOBBIES_1_005_CA_1_validation,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [46]:
# Concatenate this dataframe with the evaluation observations 
validation_df = pd.concat([submission_data, submission_df[30490:]])
validation_df.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,HOBBIES_1_002_CA_1_validation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_validation,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,HOBBIES_1_004_CA_1_validation,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
4,HOBBIES_1_005_CA_1_validation,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [47]:
# Export to Excel
validation_df.to_csv('jituk.csv', index=False)