In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import calendar
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
import warnings
warnings.filterwarnings('ignore')

#### Training Pipeline

In [78]:
dp = pd.read_csv('itemdatewise.csv')

In [80]:
dp = dp.rename(columns={'district':'area'})
dp.area = dp.area.str.strip()
dp.area = dp.area.apply(lambda x : '24 PARGANAS(S)' if x == '24PGS(S)' else x)
dp.area = dp.area.apply(lambda x : '24 PARGANAS(N)' if x == 'NORTH 24 PGS' else x)
dp = dp[(dp.area!='-') & (dp.area!='')]

In [82]:
le_area = LabelEncoder()
dp.area = le_area.fit_transform(dp.area)
area_mapping = dict(zip(le_area.classes_, sorted(dp.area.unique())))
le_item = LabelEncoder()
dp.itemcode = le_item.fit_transform(dp.itemcode)
item_mapping = dict(zip(le_item.classes_, sorted(dp.itemcode.unique())))

In [88]:
dp['ddate'] = pd.to_datetime(dp.ddate, format='%d-%m-%Y')
dp['day'] = dp.ddate.dt.day
dp['month'] = dp.ddate.dt.month
dp['year'] = dp.ddate.dt.year
dp['weekday'] = dp.ddate.dt.weekday
dp['weekend'] = dp.weekday.apply(lambda x: 1 if x > 4 else 0)

In [90]:
festival = pd.read_csv('FestivalDates.csv')
festival['ddate'] = pd.to_datetime(festival.ddate, format='%d-%m-%Y %H:%M')
festival['festival'] = 1
dp = pd.merge(left=dp,right=festival,left_on='ddate',right_on='ddate',how='left')
dp.festival = dp.festival.fillna(0)

In [92]:
dp['c1'] = np.sin(dp.month * (2 * np.pi / 12))
dp['c2'] = np.cos(dp.month * (2 * np.pi / 12))
#dp.loc[(dp.sales==1),'salegrp']=0
#dp.loc[(dp.sales>=2) & (dp.sales<=5),'salegrp']=1
#dp.loc[(dp.sales>5),'salegrp']=2
dp.drop('ddate', axis=1, inplace=True)

In [94]:
dp = dp[dp.sales <= 100]

In [20]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

In [96]:
features = dp.drop(['year','sales'],axis=1)
Y = dp.sales
ss = StandardScaler()
X = ss.fit_transform(features)
X = pd.DataFrame(X)
X.columns = features.columns
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size=0.20,random_state=42)

In [24]:
nn = Sequential()
nn.add(Dense(8,input_shape=(xtrain.shape[1],),kernel_initializer='normal',activation='relu'))
nn.add(Dense(16,kernel_initializer='normal',activation='relu'))
nn.add(Dropout(0.2))
nn.add(Dense(8,kernel_initializer='normal',activation='relu'))
nn.add(Dense(1,kernel_initializer='normal'))              #,activation='linear'

nn.compile(optimizer='adam',loss='mean_squared_error',metrics=['mae'])

In [26]:
nn.fit(xtrain,ytrain,epochs=10,batch_size=32)
nn_pred = nn.predict(xtest)

Epoch 1/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 1ms/step - loss: 3.7410 - mae: 0.8867
Epoch 2/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.6421 - mae: 0.8666
Epoch 3/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 3.7254 - mae: 0.8631
Epoch 4/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.2057 - mae: 0.8059
Epoch 5/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.3310 - mae: 0.8107
Epoch 6/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.3149 - mae: 0.8137
Epoch 7/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.2321 - mae: 0.8113
Epoch 8/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 1ms/step - loss: 3.4121 - mae: 0.8185
Epoch 9/10
[1m12826/12826[0m [32m━━━━

In [28]:
nn.fit(xtrain,ytrain,epochs=10,batch_size=32)
nn_pred = nn.predict(xtest)

Epoch 1/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - loss: 3.3981 - mae: 0.8115
Epoch 2/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - loss: 3.2450 - mae: 0.8117
Epoch 3/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - loss: 3.3827 - mae: 0.8131
Epoch 4/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 1ms/step - loss: 3.1684 - mae: 0.7964
Epoch 5/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.2169 - mae: 0.7959
Epoch 6/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.3529 - mae: 0.8045
Epoch 7/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 2ms/step - loss: 3.2576 - mae: 0.7949
Epoch 8/10
[1m12826/12826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step - loss: 3.3617 - mae: 0.7995
Epoch 9/10
[1m12826/12826[0m [32m━━━━

In [30]:
from sklearn.metrics import mean_absolute_percentage_error
nn_pred = np.round(nn_pred)
mape = mean_absolute_percentage_error(ytest,nn_pred)*100
mape

40.05658329937904

In [32]:
pred_compare = pd.DataFrame(ytest.reset_index(drop=True))
predictions = pd.Series(nn_pred.tolist()).apply(lambda x: x[0])
pred_compare = pd.concat([pred_compare,predictions],axis=1)
pred_compare.columns = ['Sales','Predicted']
pred_compare

Unnamed: 0,Sales,Predicted
0,1,1.0
1,1,2.0
2,1,3.0
3,1,1.0
4,1,1.0
...,...,...
102602,1,2.0
102603,1,2.0
102604,1,1.0
102605,2,1.0


In [34]:
import pickle

In [98]:
pickle.dump(nn,open('./item_model.pkl','wb'))
pickle.dump(area_mapping,open('./item_encoder_area.pkl','wb'))
pickle.dump(item_mapping,open('./item_encoder_item.pkl','wb'))
pickle.dump(ss,open('./item_scaler.pkl','wb'))

In [38]:
festival.to_pickle('./festivals.pkl')

#### Prediction Pipeline

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import pickle
import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('itemdatewise.csv',skiprows=300000,nrows=10000,header=None)
data = data.iloc[0:,0:-1]
data.columns = ['ddate','area','itemcode']

In [9]:
data.area = data.area.str.strip()
data.area = data.area.apply(lambda x : '24 PARGANAS(S)' if x == '24PGS(S)' else x)
data.area = data.area.apply(lambda x : '24 PARGANAS(N)' if x == 'NORTH 24 PGS' else x)
data = data[(data.area!='-') & (data.area!='')]

In [11]:
area_encoder = pickle.load(open('./item_encoder_area.pkl','rb'))
item_encoder = pickle.load(open('./item_encoder_item.pkl','rb'))

In [13]:
data.area = data.area.apply(lambda x : area_encoder[x])
data.itemcode = data.itemcode.apply(lambda x : item_encoder[x])

In [11]:
data['ddate'] = pd.to_datetime(data.ddate, format='%d-%m-%Y')
data['day'] = data.ddate.dt.day
data['month'] = data.ddate.dt.month
data['year'] = data.ddate.dt.year
data['weekday'] = data.ddate.dt.weekday
data['weekend'] = data.weekday.apply(lambda x: 1 if x > 4 else 0)

In [19]:
festival = pd.read_pickle('./festivals.pkl')  #pd.read_csv('FestivalDates.csv')
festival['ddate'] = pd.to_datetime(festival.ddate, format='%d-%m-%Y')
festival['festival'] = 1
data = pd.merge(left=data,right=festival,left_on='ddate',right_on='ddate',how='left')
data.festival = data.festival.fillna(0)

In [23]:
data['c1'] = np.sin(data.month * (2 * np.pi / 12))
data['c2'] = np.cos(data.month * (2 * np.pi / 12))
data.drop('ddate', axis=1, inplace=True)
features = data.drop(['year'],axis=1)

In [25]:
scaler = pickle.load(open('./item_scaler.pkl','rb'))
X = scaler.transform(features)
X = pd.DataFrame(X)
X.columns = features.columns

In [27]:
model = pickle.load(open('./item_model.pkl','rb'))
prediction = model.predict(X)

[1m304/304[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 929us/step


In [28]:
np.round(prediction)

array([[2.],
       [1.],
       [2.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)