In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import datetime
import seaborn as sns
import keras

In [7]:
shops = pd.read_csv("../data/shops.csv")
items = pd.read_csv("../data/items.csv")
item_categories = pd.read_csv("../data/item_categories.csv")
test = pd.read_csv("../data/test.csv")
sales_train = pd.read_csv("../data/sales_train.csv")
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [8]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


In [9]:
test.head()

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320
2,2,5,5233
3,3,5,5232
4,4,5,5268


In [10]:
sales_train.shape

(2935849, 6)

In [11]:
# removing shop id and item id which are not in test
test_shop_ids = test['shop_id'].unique()
test_item_ids = test['item_id'].unique()
# Only shops that exist in test set.
sales_train = sales_train[sales_train['shop_id'].isin(test_shop_ids)]
# Only items that exist in test set.
sales_train = sales_train[sales_train['item_id'].isin(test_item_ids)]

In [12]:
sales_train.shape

(1224439, 6)

In [13]:
sales_train['date'] = pd.to_datetime(sales_train['date'], format='%d.%m.%Y')
#now we will create a pivot tabel by going so we get our data in desired form 
#we want get total count value of an item over the whole month for a shop 
# That why we made shop_id and item_id our indices and date_block_num our column 
# the value we want is item_cnt_day and used sum as aggregating function 
dataset = sales_train.pivot_table(index = ['shop_id','item_id'],values = ['item_cnt_day'],columns = ['date_block_num'],fill_value = 0,aggfunc='sum')

In [14]:
dataset.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
Unnamed: 0_level_1,date_block_num,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
shop_id,item_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
2,30,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,31,0,4,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,32,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,33,1,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,1,0
2,53,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
# lets reset our indices, so that data should be in way we can easily manipulate
dataset.reset_index(inplace = True)

In [16]:
# lets check on our pivot table
dataset.head()

Unnamed: 0_level_0,shop_id,item_id,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day,item_cnt_day
date_block_num,Unnamed: 1_level_1,Unnamed: 2_level_1,0,1,2,3,4,5,6,7,...,24,25,26,27,28,29,30,31,32,33
0,2,30,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,31,0,4,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,32,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,2,33,1,0,0,0,0,0,0,0,...,0,1,0,1,1,0,1,0,1,0
4,2,53,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [18]:
# Now we will merge our pivot table with the test_data because we want to keep the data of items we have
# predict
dataset = pd.merge(test,dataset,on = ['item_id','shop_id'],how = 'left')

In [19]:
# lets fill all NaN values with 0
# dataset.fillna(0,inplace = True)
dataset.fillna(0, inplace=True)
# lets check our data now 
dataset.head()

Unnamed: 0,ID_x,shop_id,item_id,ID_y,"(item_cnt_day, 0)","(item_cnt_day, 1)","(item_cnt_day, 2)","(item_cnt_day, 3)","(item_cnt_day, 4)","(item_cnt_day, 5)",...,"(item_cnt_day, 24)","(item_cnt_day, 25)","(item_cnt_day, 26)","(item_cnt_day, 27)","(item_cnt_day, 28)","(item_cnt_day, 29)","(item_cnt_day, 30)","(item_cnt_day, 31)","(item_cnt_day, 32)","(item_cnt_day, 33)"
0,0,5,5037,0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,1,5,5320,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,5,5233,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,3,5,5232,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4,5,5268,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# we will drop shop_id and item_id because we do not need them
# we are teaching our model how to generate the next sequence 
dataset.drop(['shop_id','item_id','ID'],inplace = True, axis = 1)
dataset.head()

KeyError: "['ID'] not found in axis"

In [15]:
temp = dataset.clip(0, )

In [16]:
temp.max()

(item_cnt_day, 0)      169.0
(item_cnt_day, 1)      117.0
(item_cnt_day, 2)      259.0
(item_cnt_day, 3)      151.0
(item_cnt_day, 4)      504.0
(item_cnt_day, 5)      766.0
(item_cnt_day, 6)      799.0
(item_cnt_day, 7)      820.0
(item_cnt_day, 8)      950.0
(item_cnt_day, 9)      978.0
(item_cnt_day, 10)     989.0
(item_cnt_day, 11)    1305.0
(item_cnt_day, 12)     899.0
(item_cnt_day, 13)     941.0
(item_cnt_day, 14)     776.0
(item_cnt_day, 15)     597.0
(item_cnt_day, 16)     602.0
(item_cnt_day, 17)     771.0
(item_cnt_day, 18)     563.0
(item_cnt_day, 19)     591.0
(item_cnt_day, 20)     639.0
(item_cnt_day, 21)     634.0
(item_cnt_day, 22)     772.0
(item_cnt_day, 23)    1209.0
(item_cnt_day, 24)    1000.0
(item_cnt_day, 25)     257.0
(item_cnt_day, 26)     174.0
(item_cnt_day, 27)     813.0
(item_cnt_day, 28)     742.0
(item_cnt_day, 29)     444.0
(item_cnt_day, 30)     482.0
(item_cnt_day, 31)     436.0
(item_cnt_day, 32)     473.0
(item_cnt_day, 33)    2253.0
dtype: float64

In [3]:
data = pd.read_csv("data/cleaned_sales.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_label,item_category_id,main_category_id,sub_category_id,item_cnt_month_lag_1,...,shop_month_mean_lag_3,shop_month_mean_lag_6,shop_month_mean_lag_12,shop_category_month_mean_lag_1,shop_category_month_mean_lag_2,main_category_month_mean_lag_1,sub_category_month_mean_lag_1,month,holidays_in_month,moex_value
0,0,2,19,0,0.0,0,40,11,4,,...,,,,,,,,0,6,
1,1,2,27,0,1.0,0,19,5,10,,...,,,,,,,,0,6,
2,2,2,28,0,0.0,0,30,8,55,,...,,,,,,,,0,6,
3,3,2,29,0,0.0,0,23,5,16,,...,,,,,,,,0,6,
4,4,2,32,0,0.0,0,40,11,4,,...,,,,,,,,0,6,


In [5]:
data.fillna(0, inplace=True)
data.head()

Unnamed: 0.1,Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_label,item_category_id,main_category_id,sub_category_id,item_cnt_month_lag_1,...,shop_month_mean_lag_3,shop_month_mean_lag_6,shop_month_mean_lag_12,shop_category_month_mean_lag_1,shop_category_month_mean_lag_2,main_category_month_mean_lag_1,sub_category_month_mean_lag_1,month,holidays_in_month,moex_value
0,0,2,19,0,0.0,0,40,11,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6,0.0
1,1,2,27,0,1.0,0,19,5,10,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6,0.0
2,2,2,28,0,0.0,0,30,8,55,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6,0.0
3,3,2,29,0,0.0,0,23,5,16,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6,0.0
4,4,2,32,0,0.0,0,40,11,4,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,6,0.0


In [17]:
# X we will keep all columns execpt the last one 
X_train = np.expand_dims(dataset.values[:,:-1],axis = 2)
# the last column is our label
y_train = dataset.values[:,-1:]

# for test we keep all the columns execpt the first one
X_test = np.expand_dims(dataset.values[:,1:],axis = 2)

# lets have a look on the shape 
print(X_train.shape,y_train.shape,X_test.shape)

(214200, 33, 1) (214200, 1) (214200, 33, 1)


In [18]:
X_test.shape

(214200, 33, 1)

In [17]:
# importing libraries required for our model
from keras.models import Sequential
from keras.layers import LSTM,Dense,Dropout
from keras.optimizers import Adam

In [20]:
epochs = 100
# our defining our model 
my_model = Sequential()
my_model.add(LSTM(units = 64, activation='tanh', input_shape = (33,1), return_sequences=True))
my_model.add(Dropout(0.5))
my_model.add(LSTM(units= 32, activation='tanh'))
my_model.add(Dropout(0.5))
my_model.add(Dense(1))
# opt = Adam(lr=1e-3, decay=1e-3/epochs)
my_model.compile(loss = 'mse',optimizer = 'rmsprop', metrics = ['mean_squared_error'])
my_model.summary()

from keras.utils import plot_model
plot_model(my_model, to_file='lstm_model.png')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 33, 64)            16896     
_________________________________________________________________
dropout_1 (Dropout)          (None, 33, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 29,345
Trainable params: 29,345
Non-trainable params: 0
_________________________________________________________________


ImportError: Failed to import `pydot`. Please install `pydot`. For example with `pip install pydot`.

In [None]:
hist = my_model.fit(X_train,y_train,validation_split=0.2, batch_size = 4096,epochs = epochs)

In [None]:
plt.plot(hist.history['loss'])
plt.title('train loss')
plt.ylabel('mse')
plt.xlabel('epoch')
plt.savefig('train_loss.png')

In [None]:
plt.plot(ypre.history['loss'])
plt.title('train loss')
plt.ylabel('mse')
plt.xlabel('epoch')
plt.show()

In [None]:
# creating submission file 
submission_pfs = my_model.predict(X_test)
# we will keep every value between 0 and 20
submission_pfs = submission_pfs.clip(0,20)
# creating dataframe with required columns 
submission = pd.DataFrame({'ID':test['ID'],'item_cnt_month':submission_pfs.ravel()})
# creating csv file from dataframe
submission.to_csv('sub_pfs.csv',index = False)

In [2]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= 'train_loss.png')
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [11]:
encoder_decoder = Sequential()
encoder_decoder.add(LSTM(12, activation='relu', input_shape=(12, 1), return_sequences=True))
encoder_decoder.add(LSTM(6, activation='relu', return_sequences=True))
encoder_decoder.add(LSTM(1, activation='relu'))
encoder_decoder.add(RepeatVector(12))
encoder_decoder.add(LSTM(12, activation='relu', return_sequences=True))
encoder_decoder.add(LSTM(6, activation='relu', return_sequences=True))
encoder_decoder.add(TimeDistributed(Dense(1)))
encoder_decoder.summary()

plot_model(encoder_decoder, to_file='encoder_decoder_model.png', show_shapes=True, show_layer_names=True)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_17 (LSTM)               (None, 12, 12)            672       
_________________________________________________________________
lstm_18 (LSTM)               (None, 12, 6)             456       
_________________________________________________________________
lstm_19 (LSTM)               (None, 1)                 32        
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 12, 1)             0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 12, 12)            672       
_________________________________________________________________
lstm_21 (LSTM)               (None, 12, 6)             456       
_________________________________________________________________
time_distributed_3 (TimeDist (None, 12, 1)            

OSError: `pydot` failed to call GraphViz.Please install GraphViz (https://www.graphviz.org/) and ensure that its executables are in the $PATH.

In [3]:
data = pd.read_csv("data/cleaned_sales.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11128004 entries, 0 to 11128003
Data columns (total 33 columns):
Unnamed: 0                        int64
shop_id                           int64
item_id                           int64
date_block_num                    int64
item_cnt_month                    float64
city_label                        int64
item_category_id                  int64
main_category_id                  int64
sub_category_id                   int64
item_cnt_month_lag_1              float64
item_cnt_month_lag_2              float64
item_cnt_month_lag_3              float64
item_cnt_month_lag_4              float64
item_cnt_month_lag_5              float64
item_cnt_month_lag_6              float64
item_cnt_month_lag_12             float64
item_month_mean_lag_1             float64
item_month_mean_lag_2             float64
item_month_mean_lag_3             float64
item_month_mean_lag_6             float64
item_month_mean_lag_12            float64
shop_month_mean_lag

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,city_label,item_category_id,main_category_id,sub_category_id,item_cnt_month_lag_1,...,shop_month_mean_lag_3,shop_month_mean_lag_6,shop_month_mean_lag_12,shop_category_month_mean_lag_1,shop_category_month_mean_lag_2,main_category_month_mean_lag_1,sub_category_month_mean_lag_1,month,holidays_in_month,moex_value
0,0,2,19,0,0.0,0,40,11,4,,...,,,,,,,,0,6,
1,1,2,27,0,1.0,0,19,5,10,,...,,,,,,,,0,6,
2,2,2,28,0,0.0,0,30,8,55,,...,,,,,,,,0,6,
3,3,2,29,0,0.0,0,23,5,16,,...,,,,,,,,0,6,
4,4,2,32,0,0.0,0,40,11,4,,...,,,,,,,,0,6,


In [10]:
data = data[['shop_id', 'item_id', 'date_block_num', 'item_cnt_month']]

In [12]:
data.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,2,19,0,0.0
1,2,27,0,1.0
2,2,28,0,0.0
3,2,29,0,0.0
4,2,32,0,0.0


In [30]:
def generate_lag(train, months, lag_column):
    for month in months:
        # Speed up by grabbing only the useful bits
        train_shift = train[['date_block_num', 'shop_id', 'item_id', lag_column]].copy()
        train_shift.columns = ['date_block_num', 'shop_id', 'item_id', lag_column+'_lag_'+ str(month)]
        train_shift['date_block_num'] += month
        train = pd.merge(train, train_shift, on=['date_block_num', 'shop_id', 'item_id'], how='left')
    return train

In [32]:
%%time
new_train = generate_lag(data, [1,2,3,4,5,6,7,8,9,10,11,12], 'item_cnt_month')

CPU times: user 56.4 s, sys: 30.9 s, total: 1min 27s
Wall time: 1min 39s


In [35]:
new_train = new_train.dropna()
new_train.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,item_cnt_month_lag_1,item_cnt_month_lag_2,item_cnt_month_lag_3,item_cnt_month_lag_4,item_cnt_month_lag_5,item_cnt_month_lag_6,item_cnt_month_lag_7,item_cnt_month_lag_8,item_cnt_month_lag_9,item_cnt_month_lag_10,item_cnt_month_lag_11,item_cnt_month_lag_12
4488710,2,27,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4488713,2,32,12,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4488714,2,33,12,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4488715,2,34,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4488719,2,40,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
import statsmodels.formula.api as smf

model = smf.ols(formula='item_cnt_month ~ item_cnt_month_lag_1', data=new_train)

model_fit = model.fit()

regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.4612893416108381


In [16]:
#create dataframe for transformation from time series to supervised
df_supervised = df_diff.drop(['prev_item_cnt_month'],axis=1)#adding lags
for inc in range(1,13):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff_item_cnt_month'].shift(inc)#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)
df_supervised.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month,diff_item_cnt_month,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12
0,2,45,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,-1.0,1.0
1,2,46,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,-1.0
2,2,47,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0
3,2,48,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0
4,2,49,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0


In [17]:
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler
df_model = df_supervised.drop(['shop_id','item_id','date_block_num','item_cnt_month'],axis=1)#split train and test set
train_set, test_set = df_model[0:-1].values, df_model[-1:].values

In [19]:
train_set

array([[ 0.,  0.,  0., ...,  0., -1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [20]:
test_set

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [21]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

In [23]:
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [25]:
X_train.shape

(11127990, 1, 12)

In [26]:
y_train.shape

(11127990, 1)

In [27]:
X_test.shape

(1, 1, 12)

In [28]:
y_test.shape

(1, 1)

In [29]:
model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, nb_epoch=100, batch_size=1, verbose=1, shuffle=False)


The `nb_epoch` argument in `fit` has been renamed `epochs`.



Epoch 1/100
   59126/11127990 [..............................] - ETA: 5:19:04 - loss: 0.0021

KeyboardInterrupt: 

In [22]:
# Adapted from https://www.kaggle.com/sebask/keras-2-0

import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, ThresholdedReLU, MaxPooling2D, Embedding, Dropout
from keras.optimizers import Adam, SGD, RMSprop
from keras import backend as K
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import gc

# Viz
import matplotlib.pyplot as plt

# Import data
sales = pd.read_csv('data/sales_train.csv', parse_dates=['date'], infer_datetime_format=True, dayfirst=True)
shops = pd.read_csv('data/shops.csv')
items = pd.read_csv('data/items.csv')
cats = pd.read_csv('data/item_categories.csv')
val = pd.read_csv('data/test.csv')

In [32]:
# Rearrange the raw data to be monthly sales by item-shop
df = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')),'item_id','shop_id']).sum().reset_index()
df = df[['date','item_id','shop_id','item_cnt_day']]
df["item_cnt_day"].clip(0.,20.,inplace=True)
df = df.pivot_table(index=['item_id','shop_id'], columns='date',values='item_cnt_day',fill_value=0).reset_index()
df.head()

date,item_id,shop_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,55,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,54,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Merge data from monthly sales to specific item-shops in test data
test = pd.merge(val,df,on=['item_id','shop_id'], how='left').fillna(0)

# Strip categorical data so keras only sees raw timeseries
test = test.drop(labels=['ID','item_id','shop_id'],axis=1)
test.head()

Unnamed: 0,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Rearrange the raw data to be monthly average price by item-shop
# Scale Price
scaler = MinMaxScaler(feature_range=(0, 1))
sales["item_price"] = scaler.fit_transform(sales["item_price"].values.reshape(-1,1))
df2 = sales.groupby([sales.date.apply(lambda x: x.strftime('%Y-%m')),'item_id','shop_id']).mean().reset_index()
df2 = df2[['date','item_id','shop_id','item_price']].pivot_table(index=['item_id','shop_id'], columns='date',values='item_price',fill_value=0).reset_index()
df2.head()

date,item_id,shop_id,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Merge data from average prices to specific item-shops in test data
price = pd.merge(val,df2,on=['item_id','shop_id'], how='left').fillna(0)
price = price.drop(labels=['ID','item_id','shop_id'],axis=1)
price.head()

Unnamed: 0,2013-01,2013-02,2013-03,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,...,2015-01,2015-02,2015-03,2015-04,2015-05,2015-06,2015-07,2015-08,2015-09,2015-10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006494,0.0,0.0,0.0,0.004221,0.00487,0.00487,0.003247,0.002437,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002922,0.001948,0.0,0.001948,0.003247,0.003896
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001948,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Create x and y training sets from oldest data points
y_train = test['2015-10']
x_sales = test.drop(labels=['2015-10'],axis=1)
x_sales = x_sales.values.reshape((x_sales.shape[0], x_sales.shape[1], 1))
x_prices = price.drop(labels=['2015-10'],axis=1)
x_prices= x_prices.values.reshape((x_prices.shape[0], x_prices.shape[1], 1))
X = np.append(x_sales,x_prices,axis=2)

In [39]:
y = y_train.values.reshape((214200, 1))
print("Training Predictor Shape: ",X.shape)
print("Training Predictee Shape: ",y.shape)
del y_train, x_sales; gc.collect()

Training Predictor Shape:  (214200, 33, 2)
Training Predictee Shape:  (214200, 1)


54

In [40]:
# Transform test set into numpy matrix
test = test.drop(labels=['2013-01'],axis=1)
x_test_sales = test.values.reshape((test.shape[0], test.shape[1], 1))
x_test_prices = price.drop(labels=['2013-01'],axis=1)
x_test_prices = x_test_prices.values.reshape((x_test_prices.shape[0], x_test_prices.shape[1], 1))

In [41]:
# Combine Price and Sales Df
test = np.append(x_test_sales,x_test_prices,axis=2)
del x_test_sales,x_test_prices, price; gc.collect()
print("Test Predictor Shape: ",test.shape)

Test Predictor Shape:  (214200, 33, 2)
