# FUTURE SALES PREDICTION #    
#  Libraries Used

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
# import pmdarima as pm
# from pmdarima.arima import auto_arima

In [None]:
# Load the competition tables: the item and item-category lookups, the shop
# lookup, and the training sales records.
items = pd.read_csv('../input/competitive-data-science-predict-future-sales/items.csv')
items_cat = pd.read_csv('../input/competitive-data-science-predict-future-sales/item_categories.csv')
shops = pd.read_csv('../input/competitive-data-science-predict-future-sales/shops.csv')
sales_train = pd.read_csv('../input/competitive-data-science-predict-future-sales/sales_train.csv')


In [None]:
# Preview the first two rows of the training sales records.
sales_train.head(2)

In [None]:
# Preview the first two rows of the items table.
items.head(2)

In [None]:
# Preview the first two rows of the item-categories table.
items_cat.head(2)

In [None]:
# Preview the first two rows of the shops table.
shops.head(2)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the sales table.
sales_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns in the sales table.
sales_train.describe()

**First, merge these tables using joins.**

In [None]:
# Attach category details to every item. No key is given, so pandas joins on
# the column(s) the two frames have in common.
item_merged = items.merge(items_cat, how='inner')

# Attach shop details to every sales record.
shop_merged = sales_train.merge(shops, on='shop_id')

# Combine item/category details with the shop-enriched sales records.
train_data = item_merged.merge(shop_merged, on='item_id')

In [None]:
# Preview the first five rows of the merged training table.
train_data.head()

# Remove Duplicated Rows

In [None]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
# The surviving rows keep their original index labels.
train_data.drop_duplicates(inplace=True)

**Group the data by date_block_num, shop_id and item_id, and aggregate item_cnt_day to get the monthly item count for each shop–item pair.**

In [None]:
# Monthly totals per (month block, shop, item), used for the exploratory plots.
# item_cnt_day is summed rather than counted: 'count' only tallies how many
# sales records exist, whereas the sum is the quantity actually sold. This
# matches the monthly aggregation used for modelling further down.
temp_data = (
    train_data
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({'item_cnt_day': 'sum'})
    .reset_index()
    # Keep the month block as the index; shop_id and item_id stay as columns.
    .set_index('date_block_num')
)

In [None]:
# Preview the first five rows of the grouped table.
temp_data.head()

**Check which items sold the most**

In [None]:
# Bar chart of the 20 items with the largest total item_cnt_day.
plt.figure(figsize=(16, 10))
totals_per_item = temp_data.groupby('item_id')['item_cnt_day'].sum()
top_items = totals_per_item.sort_values(ascending=False).head(20)
top_items.plot(kind='bar', legend=True, color='blue')

**Check which shops sold the most items**

In [None]:
# Bar chart of the 20 shops with the largest total item_cnt_day.
plt.figure(figsize=(16, 10))
totals_per_shop = temp_data.groupby('shop_id')['item_cnt_day'].sum()
top_shops = totals_per_shop.sort_values(ascending=False).head(20)
top_shops.plot(kind='bar', legend=True, color='tomato')

**The date column is stored as a string, so first convert it to datetime and derive a year-month column from it**

In [None]:
# Parse the day.month.year strings into real timestamps.
train_data['date'] = pd.to_datetime(train_data['date'], format='%d.%m.%Y')

# Reduce each timestamp to its monthly period and store it as a plain string.
monthly_period = train_data['date'].dt.to_period('M')
train_data['Month'] = monthly_period.astype(str)

**Now that we have a year-month column named Month, group by shop, item and Month and sum item_cnt_day to get the total for each whole month**

In [None]:
# Total item_cnt_day for every (shop, item, month) combination, with Month as
# the index and shop_id / item_id kept as ordinary columns.
monthly_totals = train_data.groupby(
    ['shop_id', 'item_id', 'Month'], as_index=False
)['item_cnt_day'].sum()
temp_data = monthly_totals.set_index('Month')

In [None]:
# The values are now monthly totals, so give the column a matching name.
temp_data = temp_data.rename(columns={'item_cnt_day': 'item_cnt_month'})
temp_data.head(5)

**Check which year-month had the highest sales**

In [None]:
# Bar chart of the 26 months with the largest total sales, best month first.
plt.figure(figsize=(18, 10))
sales_per_month = temp_data.groupby('Month')['item_cnt_month'].sum()
best_months = sales_per_month.sort_values(ascending=False).head(26)
best_months.plot(kind='bar', color='tab:cyan')

In [None]:
# Line chart of total sales per month, in the group keys' sorted order.
plt.figure(figsize=(18, 10))
monthly_sales = temp_data.groupby('Month')['item_cnt_month'].sum()
monthly_sales.plot(legend=True, color='forestgreen')

**So we can see that the last months of each year (November and December) have the highest sales**

In [None]:
# Move Month out of the index so it becomes the first regular column.
temp_data = temp_data.reset_index()
temp_data.tail()

In [None]:
# Features are every column except the last; the target is the last column,
# kept as a one-column DataFrame.
# .copy() gives X its own data, so the Month assignment below does not write
# into a slice of temp_data (which raises pandas' SettingWithCopyWarning).
X = temp_data.iloc[:, :-1].copy()
y = temp_data.iloc[:, -1:]

# Encode the 'YYYY-MM' strings as integers. LabelEncoder sorts its classes,
# so the codes follow calendar order.
le = LabelEncoder()
X['Month'] = le.fit_transform(X['Month'])
X

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the split,
# and therefore the reported score and the submission, reproducible.
# NOTE(review): this is a random split over all months, so the hold-out score
# does not measure how well the model forecasts a future month.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
x_train.head()

**Train an XGBRegressor with manually chosen hyperparameters**

In [None]:
# Hyperparameters for the gradient-boosted tree regressor.
xgb_params = {
    'base_score': 0.5,
    'n_estimators': 400,
    'min_child_weight': 1,
    'max_depth': 7,
    'learning_rate': 0.1,
    'booster': 'gbtree',
    'tree_method': 'exact',
    'reg_alpha': 0,
    'subsample': 0.5,
    'validate_parameters': 1,
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'colsample_bytree': 1,
    'gamma': 0,
}
xg = xgb.XGBRegressor(**xgb_params)

# Fit on the training split, then report the score on the held-out split
# (R^2 for scikit-learn style regressors).
xg.fit(x_train, y_train)
xg.score(x_test, y_test)

In [None]:
# Load the rows to predict, indexed by their submission ID.
test = pd.read_csv(
    '../input/competitive-data-science-predict-future-sales/test.csv',
    index_col='ID',
)
# Every test row gets the same month code.
# NOTE(review): 34 is presumably the code that follows the last encoded
# training month; verify against the fitted LabelEncoder's classes.
test = test.assign(Month=34)

In [None]:
# Present the features in the column order the model was trained on.
# test.csv carries them in a different order, which xgboost either rejects
# (feature-name mismatch) or silently misreads. Selecting only the feature
# columns also keeps this cell safe to re-run.
pred = xg.predict(test[x_train.columns])

# Store one prediction per row. The previous expression,
# (pred[0] * len(test)) / len(test), reduced to pred[0] and wrote the first
# prediction into every row.
# NOTE(review): the competition scores against targets clipped to [0, 20],
# so predictions are clipped to that range; confirm against the rules.
test['item_cnt_month'] = pred.clip(0, 20)

In [None]:
# Build the submission frame: one row per test ID with its predicted count.
# The columns are picked by name from a reset copy, so `test` itself is not
# modified and the cell can be re-run without accumulating index columns.
# The earlier read of sample_submission.csv was dropped: its result was
# overwritten immediately and never used.
sub = test.reset_index()[['ID', 'item_cnt_month']]

In [None]:
# Write the submission without the DataFrame index, then read the file back
# and show its first rows as a check of what was written.
sub.to_csv('submission.csv', index=False)
subb = pd.read_csv('./submission.csv')
subb.head(5)

In [None]:
# new_train_data

In [None]:
# new_train_data = new_train_data.groupby(['Month']).agg({'item_cnt_day': 'sum'})
# new_train_data.rename(columns={'item_cnt_day' : 'item_cnt_month'},inplace=True)
# new_train_data['Month'] = new_train_data.index

In [None]:
# fig = plt.figure(figsize=(20,8))
# plt.plot('Month','item_cnt_month',data=new_train_data[0:25],color='tomato')

In [None]:
# new_train_dff = train_data.groupby('Month').agg({'item_cnt_day' : 'sum'})
# new_train_dff['Month'] = new_train_dff.index


In [None]:
# new_train_dff = new_train_data.drop('Month',axis=1)

In [None]:
# model = auto_arima(y=new_train_dff,seasonal=True,start_p = 0, max_p =5,start_q =0, max_q =5,d = 1,alpha=0.05,m=12)
# print(model.summary())

In [None]:
# prediction , confint = model.predict(n_periods=6,return_conf_int=True)
# prediction

In [None]:
# # new_train_dff.set_index('Month',inplace=True)
# period_ind = pd.period_range(start = new_train_dff.index[-1],periods = 6, freq = 'M')
# output = pd.DataFrame({'Month' : period_ind,'prediction' : prediction.round(2)})
# len(output)

In [None]:
# new_train_dff.reset_index('Month',inplace=True)
# new_train_dff

In [None]:
# output['Month'] = output['Month'].astype(str)
# output.info()

In [None]:
# plt.figure(figsize=(20,12))
# cf = pd.DataFrame(confint)
# plt.plot('Month','item_cnt_month',data=new_train_data[15:],label='Past Output',color='blue')
# plt.plot('Month','prediction',data=output,label='Future Output',color='orange')
# output.set_index('Month',inplace=True)
# plt.fill_between(output.index, cf[0], cf[1],color='grey',alpha=.2, label='Confidence Intervals Area')
# plt.legend()