# Importing Necessary Libraries

In [47]:
import pandas as pd
import numpy as np
import keras
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Reading the data

In [48]:
data = pd.read_csv('train.csv')
x_data = data.iloc[:,:5]
y_data = data.iloc[:,5]
data_t = pd.read_csv('test.csv')

# One Hot Encoding and Fetaure Engineering

In [49]:
def preprocessor(data):
    data.date = pd.to_datetime(data.date)

    data['month'] = data.date.dt.month
    data['day'] = data.date.dt.day
    data.drop(columns=['date'],inplace=True)

    row_id = data.row_id
    ohe = OneHotEncoder()
    features_transformed = ohe.fit_transform(data[['country','store','product']]).toarray()
    feature_labels = ohe.categories_
    feature_labels = np.hstack(feature_labels)

    data = pd.DataFrame(features_transformed,columns=feature_labels).merge(data[['month','day']],left_index=True,right_index=True,how='inner')
     
    return data, row_id

# Train Test Split

In [50]:
x_data_p, row_id = preprocessor(x_data)

In [8]:
x_train_full,x_test,y_train_full,y_test = train_test_split(x_data_p,y_data, test_size=0.2, random_state=42)
x_train,x_valid,y_train,y_valid = train_test_split(x_train_full,y_train_full,test_size=0.3,random_state=42)

In [9]:
print(x_train.shape,x_test.shape,x_valid.shape)

(39271, 14) (14026, 14) (16831, 14)


In [10]:
print(((x_valid['KaggleMart'].value_counts())/x_valid.shape[0]),((x_test['KaggleMart'].value_counts())/x_test.shape[0]),((x_train['KaggleMart'].value_counts())/x_train.shape[0]))

1.0    0.501574
0.0    0.498426
Name: KaggleMart, dtype: float64 1.0    0.501854
0.0    0.498146
Name: KaggleMart, dtype: float64 0.0    0.501337
1.0    0.498663
Name: KaggleMart, dtype: float64


# Preprocessing

In [11]:
ss = StandardScaler()
x_train= ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)
x_valid =ss.fit_transform(x_valid)

# Basic Sequential Neural Network

In [22]:
model = keras.models.Sequential([
    keras.layers.Input(shape = x_train.shape[1:]),
    keras.layers.Dense(300, activation='selu',kernel_initializer='lecun_normal'),
    keras.layers.Dense(1)
])

In [23]:
model.compile(optimizer=keras.optimizers.SGD(momentum=0.9, nesterov=True),loss='mae')

In [24]:
history = model.fit(x_train, y_train, validation_data=(x_valid,y_valid),epochs=30,callbacks=[keras.callbacks.EarlyStopping(patience=10)])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [27]:
y_pred = model.predict(x_test)



In [28]:
mean_squared_error(y_test,y_pred)

2444.4461213249774

# Testing the model on test set provided by kaggle

In [57]:
test_data, row_id_t = preprocessor(data_t)

In [60]:
final_result = pd.DataFrame(row_id_t).merge(pd.DataFrame(model.predict(test_data)), left_index=True, right_index=True, how='inner')



In [62]:
final_result.to_csv('result.csv',index=False)