In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc #garbage collect
import random as rd # generating random numbers
import datetime # manipulating date formats
from IPython.core.interactiveshell import InteractiveShell
from multiprocessing import Pool
from itertools import product
from sklearn import preprocessing
import xgboost as xgb
InteractiveShell.ast_node_interactivity = "all"
import matplotlib.pyplot as plt # basic plotting
import seaborn as sns 
import lightgbm as lgb
from sklearn import feature_extraction
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from tqdm import tqdm, tqdm_notebook

%matplotlib inline

# Function to downcast data types and reduce memory consumption
def downcast_dtypes(df):
    """Shrink the 64-bit numeric columns of ``df`` to 32 bits.

    Every ``float64`` column is converted to ``float32`` and every ``int64``
    column to ``int32``; columns of any other dtype are left untouched.
    The frame is modified in place and the same object is returned.
    """

    def _columns_of(dtype_name):
        # Names of the columns whose dtype equals the given dtype name.
        return [name for name in df if df[name].dtype == dtype_name]

    # Both selections are made before any conversion takes place.
    wide_floats = _columns_of("float64")
    wide_ints = _columns_of("int64")

    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

import os

# Show which datasets are mounted under the input directory.
input_dir_entries = os.listdir("../input")
print(input_dir_entries)


**EDA** -
Here is a summary of the EDA performed on the datasets and the insights derived from it. The code summarizes some of the key analyses I performed and how they were used to create features. The idea was to look for trends in the data and, given that this is a time-series problem, to check whether it needs a dedicated time-series model (ARIMA etc.) or whether it can be solved using traditional statistical methods

In [None]:
# Load the five raw competition tables
train, test, items, item_cats, shops = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/competitive-data-science-predict-future-sales/sales_train.csv",
        "../input/competitive-data-science-predict-future-sales/test.csv",
        "../input/competitive-data-science-predict-future-sales/items.csv",
        "../input/competitive-data-science-predict-future-sales/item_categories.csv",
        "../input/competitive-data-science-predict-future-sales/shops.csv",
    )
)


# Attach shop, item and category attributes to every sale, then add the
# parsed date and the revenue (price x quantity) of each row
train = (
    train
    .merge(shops, on='shop_id', how='left')
    .merge(items, on='item_id', how='left')
    .merge(item_cats, on='item_category_id', how='left')
)
train['date_format'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
train['total_sales'] = train['item_price'].mul(train['item_cnt_day'])

# Convert datetime to date, month, year and weekday columns for EDA
def get_year(x):
    """Return the ``year`` attribute of the date-like value ``x``."""
    year = x.year
    return year
def get_month(x):
    """Return the ``month`` attribute of the date-like value ``x``."""
    month = x.month
    return month
def get_day(x):
    """Return the ``day`` attribute of the date-like value ``x``."""
    day = x.day
    return day
def get_weekday(x):
    """Return the ``dayofweek`` attribute of the timestamp ``x``."""
    weekday = x.dayofweek
    return weekday

# `date_format` is a datetime column (built with pd.to_datetime), so the
# vectorised `.dt` accessors give the same calendar parts as calling a Python
# function on every row with `.apply`, without the per-row overhead.
train["year"] = train["date_format"].dt.year
train["month"] = train["date_format"].dt.month
train["day"] = train["date_format"].dt.day
train['dayofweek'] = train["date_format"].dt.dayofweek  # Monday=0 ... Sunday=6

train.head()

In [None]:
# Size and cardinality overview of the two datasets
train_summary = [
    ("#Rows, #columns: \t\t\t", train.shape),
    ("#NaNs: \t\t\t", train.isna().sum().sum()),
    ("#Shops: \t\t", train.shop_id.nunique()),
    ("#Item Categories: \t", train.item_category_id.nunique()),
    ("#Items: \t\t", train.item_id.nunique()),
    ("#Months: \t\t", train.date_block_num.nunique()),
]
test_summary = [
    ("#Rows, #columns: \t\t\t", test.shape),
    ("#Shops: \t\t", test.shop_id.nunique()),
    ("#Items: \t\t", test.item_id.nunique()),
]

print("Description of training set")
for label, value in train_summary:
    print(label, value)
print("Description of test set")
for label, value in test_summary:
    print(label, value)

# Items that occur in test but never in train
train_items = train.item_id.unique()
test_items_not_in_train = test.loc[~test.item_id.isin(train_items), 'item_id'].unique()
print('%d items in test data not found in train data' % len(test_items_not_in_train))

# Shops that occur in test but never in train
train_shops = train.shop_id.unique()
test_shops_not_in_train = test.loc[~test.shop_id.isin(train_shops), 'shop_id'].unique()
print('%d shops in test data not found in train data' % len(test_shops_not_in_train))

*  Above confirms that there are no NaNs in the train dataset
* Also, while all the shops are captured in both test and train, some of the items in test aren't captured in train; therefore, we would need to create a comprehensive train dataset capturing all shop-item pairs

In [None]:
# The target is a monthly item count, so daily revenue and counts are rolled
# up to one row per (month, year).
# The two value columns are selected with a list: selecting several GroupBy
# columns with a bare tuple (`["a", "b"]` without the inner list) is rejected
# by current pandas releases.
MonthlyRev = train.groupby(["month", "year"], as_index=False)[["total_sales", "item_cnt_day"]].sum()
MonthlyRev.head()
# One line per year, month on the x-axis: revenue first, then units sold
RevPlot = sns.FacetGrid(data = MonthlyRev.sort_values(by="month"), hue = "year", height = 4, legend_out=True)
RevPlot_1 = RevPlot.map(plt.plot, "month", "total_sales")
RevPlot_1.add_legend()
RevPlot_1;
CountPlot = sns.FacetGrid(data = MonthlyRev.sort_values(by="month"), hue = "year", height = 4, legend_out=True)
CountPlot = CountPlot.map(plt.plot, "month", "item_cnt_day")
CountPlot.add_legend()
CountPlot;

Reviewing the monthly count and sales plot we observe the following:
* The revenue and the count plot seems to suggest that 2015 sales are lower than 2013 and 2014 sales. Also, the sales seem to spike in Nov-Dec period in the previous years
* Also, the average count in 2015 is considerably lower than in 2013 and 2014, yet the difference in revenue over the same period is not as pronounced. This suggests that price also has an effect and varies over time

In [None]:
# Next level of detail: total revenue per day of the week
# (0 = Monday ... 6 = Sunday)
WeeklyRevenue = train.groupby("dayofweek", as_index=False)["total_sales"].sum()
sns.barplot(data=WeeklyRevenue, x='dayofweek', y='total_sales')

* Drilling down to the day-of-week level, we observe that sales spike on the weekend (Fri-Sun); perhaps this can be used as a feature

In [None]:
# By reviewing item_category level sales we observe that some of the item categories are clear winners with respect to sales/ revenue
# (value columns are selected with a list; tuple-style multi-column selection
# on a GroupBy is rejected by current pandas releases)
MonthlyRev_Cat = train.groupby(
    ["month", "year", "item_category_id", "item_category_name"], as_index=False
)[["total_sales", "item_cnt_day"]].sum()
MonthlyRev_Cat.nlargest(10, 'total_sales')
MonthlyRev_Cat.nlargest(10, 'item_cnt_day')
# The highest revenue generating category seems to be games/ gaming consoles while the item with the most sales seems to be Cinema DVDs
# Therefore, we can say that some of the item categories seem to have more sales than others

In [None]:
# Same roll-up one level deeper: monthly revenue and units sold per
# (shop, item category) combination
# (value columns are selected with a list; tuple-style multi-column selection
# on a GroupBy is rejected by current pandas releases)
MonthlyRev_ShopCat = train.groupby(
    ["month", "year", "shop_id", "item_category_id", "item_category_name"], as_index=False
)[["total_sales", "item_cnt_day"]].sum()
MonthlyRev_ShopCat.nlargest(10, 'total_sales')
MonthlyRev_ShopCat.nlargest(10, 'item_cnt_day')
# We can say that some of the shop item categories combinations seem to have more sales than others

* Per the above two item-category and shop-item category level sales, we observe that some of the categories/ shops/ items are consistently showing high sales
* Also, one more thing to note is that if one reviews the item category name it seems that the first word captures the item-master category as it is the same across categories

In [None]:
# Outlier check on price: plot every recorded price, then list the rows
# priced above 100000
train['item_price'].plot()
train.loc[train['item_price'] > 100000]
# There is a clear outlier in terms of item_price Radmin 3 which has only one record, so perhaps this can be removed

In [None]:
# Outlier check on quantity: plot every daily count, then list the rows
# with more than 900 units
train['item_cnt_day'].plot()
train.loc[train['item_cnt_day'] > 900]
# There are a few outliers in terms of item counts so these would need to be handled

* By visual inspection of the price and count plots, we have observed a few outliers above and we would need to handle these cases as a part of data cleaning

In [None]:
# The data is a time series, so we analyze it and check it for stationarity
# with the Augmented Dickey-Fuller (ADF) test.
# Confirming stationarity tells us whether regular statistical models can be
# used on the dataset.
# `ts` holds the total number of items sold in each month block, as floats.
ts = train.groupby("date_block_num")["item_cnt_day"].sum().astype('float')
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')
def test_stationarity(timeseries):
    """Run the ADF test on ``timeseries`` and print a labelled summary.

    The printed Series holds the first four values returned by ``adfuller``
    (labelled test statistic, p-value, lags used, number of observations)
    followed by one "Critical Value (...)" entry per key of the fifth
    returned element. Nothing is returned.
    """
    print('ADF Test Result:')
    adf_result = adfuller(timeseries, autolag='AIC')
    labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    report = dict(zip(labels, adf_result[0:4]))
    for level, threshold in adf_result[4].items():
        report['Critical Value (%s)' % level] = threshold
    print(pd.Series(report))

test_stationarity(ts)

# The p-value here is above the 5% significance level, so stationarity cannot be assumed; therefore, we de-seasonalize the series and rerun the stationarity test

In [None]:
# Remove seasonal trend
from pandas import Series as Series
# create a differenced series
def difference(dataset, interval=1):
    """Return the lag-``interval`` differences of ``dataset`` as a new Series.

    Element ``k`` of the result is the value at position ``k + interval``
    minus the value at position ``k``; the result therefore has
    ``len(dataset) - interval`` elements and a fresh default index.

    Parameters
    ----------
    dataset : sequence of numbers (list, array or pandas Series)
    interval : int, default 1
        Distance, in positions, between the two values being subtracted.
    """
    # Read the values positionally. On a pandas Series `dataset[i]` is a
    # *label* lookup, which only coincides with the position when the index
    # happens to be 0..n-1 and raises KeyError otherwise.
    values = np.asarray(dataset)
    diff = [values[i] - values[i - interval] for i in range(interval, len(values))]
    return Series(diff)

# invert differenced forecast
def inverse_difference(last_ob, value):
    """Undo one differencing step by adding ``value`` back onto ``last_ob``."""
    restored = value + last_ob
    return restored

# Plot total monthly sales three ways: the original series, after lag-1
# differencing (de-trending) and after lag-12 differencing (de-seasonalization).
ts = train.groupby(["date_block_num"])["item_cnt_day"].sum()
ts = ts.astype('float')

# Each differenced series gets its own name; `new_ts` stays bound to the
# lag-12 result because the stationarity re-test in the next cell reads it.
detrended_ts = difference(ts)
new_ts = difference(ts, 12)       # assuming the seasonality is 12 months long

panels = [
    ('Original', ts),
    ('After De-trend', detrended_ts),
    ('After De-seasonalization', new_ts),
]

# One loop over the panels replaces three copy-pasted subplot sections and
# the argument-less `plt.plot()` calls, which drew nothing.
fig, axes = plt.subplots(3, 1, figsize=(16, 16))
for ax, (panel_title, series) in zip(axes, panels):
    ax.plot(series)
    ax.set_title(panel_title)
    ax.set_xlabel('Time')
    ax.set_ylabel('Sales')

In [None]:
# Rerun stationarity test after de-seasonality
# `new_ts` is the lag-12 differenced monthly sales series built in the plotting cell above.
test_stationarity(new_ts)

* Since the p-value is now below the 5% significance level, we can treat the de-seasonalized sales series as stationary and proceed with standard statistical modeling methods
* We shall be attempting to transform the given time-series problem into a problem which can be solved using standard statistical modelling (regression) by the sliding window method

In [None]:
# Delete all dataframes created and release memory
# The names of the live DataFrames are read straight from globals() instead
# of calling eval() on every name returned by dir().
alldfs = sorted(name for name, obj in list(globals().items()) if isinstance(obj, pd.DataFrame))
# MonthlyRev was also created during the EDA, so it is released with the rest.
del MonthlyRev, MonthlyRev_Cat, MonthlyRev_ShopCat, WeeklyRevenue, item_cats, items, shops, test, train
gc.collect()

In [None]:
# List the names of the DataFrames that are still alive. They are read from
# globals() directly instead of calling eval() on every name returned by dir().
alldfs = sorted(name for name, obj in list(globals().items()) if isinstance(obj, pd.DataFrame))
alldfs

**Data Processing and Feature Engineering** - Here is a summary of the data processing steps and feature engineering incorporated in the data

In [None]:
#Re-import all the data files (as we cleared the memory in the above step)
train, test, items, item_cats, shops = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/competitive-data-science-predict-future-sales/sales_train.csv",
        "../input/competitive-data-science-predict-future-sales/test.csv",
        "../input/competitive-data-science-predict-future-sales/items.csv",
        "../input/competitive-data-science-predict-future-sales/item_categories.csv",
        "../input/competitive-data-science-predict-future-sales/shops.csv",
    )
)
#A CSV input used for easy import of the number of Fridays, Saturdays and Sundays in each month
calendar = pd.read_csv("../input/calendar/Calendar.csv")

calendar.head()

In [None]:
# As it was observed that not all shop-item combinations are present in train, we'll make a combined dataset with all shop-item-date combinations
# For every month block the grid is the cross product of the shops and the
# items that recorded at least one row in that month. It is built from the
# unfiltered data, i.e. before the outlier rows are removed below.
index_col = ['shop_id', 'item_id', 'date_block_num']
all_combi = []
for block in train['date_block_num'].unique():
    # Filter the month once and reuse it for both unique() calls
    month_sales = train.loc[train['date_block_num'] == block]
    shops_list = month_sales['shop_id'].unique()
    items_list = month_sales['item_id'].unique()
    all_combi.append(np.array(list(product(shops_list, items_list, [block])), dtype='int32'))
all_combi = pd.DataFrame(np.vstack(all_combi), columns=index_col, dtype=np.int32)

#Remove outliers from the dataset
train = train[train.item_price < 100000]
train = train[train.item_cnt_day <= 900]

#Calculate Monthly-Sales Train data as all computation is required to be done at monthly level
# The string aggregator names select pandas' own group-wise sum and mean.
sales_data = (
    train
    .groupby(['date_block_num', 'shop_id', 'item_id'])
    .agg({'item_cnt_day': 'sum', 'item_price': 'mean'})
    .reset_index()
)

# Bring in the totals computed above against the all shop-item combination dataset
# Grid rows without any sale get 0 for both the count and the price.
sales_data = pd.merge(all_combi, sales_data, on=['date_block_num', 'shop_id', 'item_id'], how='left').fillna(0)

In [None]:
#Group the item categories into higher-level ones as we observed that the first word in the item category column indicates a master category
# Each entry is (first row, one past the last row, master category label);
# rows refer to positions in item_cats.
master_category_spans = [
    (0, 1, 'PC Headsets / Headphones'),
    (1, 8, 'Accessories'),
    (8, 9, 'Tickets'),
    (9, 10, 'Delivery of goods'),
    (10, 18, 'Consoles'),
    (18, 25, 'Console Games'),
    (25, 26, 'Accessories for Games'),
    (26, 28, 'Phone Games'),
    (28, 32, 'CD Games'),
    (32, 37, 'Cards'),
    (37, 43, 'Movies'),
    (43, 55, 'Books'),
    (55, 61, 'Music'),
    (61, 73, 'Gifts'),
    (73, 79, 'Programs'),
    (79, 81, 'Services'),
    (81, 83, 'Clean Media'),
    (83, 84, 'Batteries'),
]

l_cat = list(item_cats.item_category_name)
for first_row, end_row, master_label in master_category_spans:
    for ind in range(first_row, end_row):
        l_cat[ind] = master_label

# Encode the master category labels as integers
lb = preprocessing.LabelEncoder()

item_cats['cat_type'] = lb.fit_transform(l_cat)

In [None]:
# Enrich every shop-item-month row with the item attributes first and with
# the item category attributes second
sales_data = (
    sales_data
    .merge(items, on=['item_id'], how='left')
    .merge(item_cats, on=['item_category_id'], how='left')
)

sales_data.shape

In [None]:
# Create features related to item-month, shop-month,and item category-month for item price and count. These features will be later used to create lag variables
# Each spec is (source column, aggregation, tag used in the feature name).
agg_specs = [('item_price', 'mean', 'avg'), ('item_cnt_day', 'sum', 'sum'), ('item_cnt_day', 'mean', 'avg')]

for feat_type in ['item_id','shop_id','item_category_id','cat_type']:
    for column, func_desc, funcname in agg_specs:
        feat_name = feat_type + '_' + funcname + '_' + column
        # Select the single source column before aggregating. Aggregating
        # the whole frame also runs mean/sum over the text columns and over
        # every feature added so far, only to throw those results away.
        featdf = (
            sales_data
            .groupby([feat_type, 'date_block_num'])[column]
            .agg(func_desc)
            .reset_index()
            .rename(columns={column: feat_name})
        )

        sales_data = pd.merge(sales_data, featdf, on=['date_block_num', feat_type], how='left')

#Similar to the above add shop-item category-month related feature which would be later used as a lag variable
for column, func_desc, funcname in agg_specs:
    feat_name = 'shop_itemcat' + '_' + funcname + '_' + column
    featdf = (
        sales_data
        .groupby(['shop_id', 'item_category_id', 'date_block_num'])[column]
        .agg(func_desc)
        .reset_index()
        .rename(columns={column: feat_name})
    )

    sales_data = pd.merge(sales_data, featdf, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')

In [None]:
# Expanding Mean-encoding for target variable (Rationale for using this technique is explained after a few snippets)
#This feature would be later used as a lag variable
Target = 'item_cnt_day'
global_mean =  sales_data[Target].mean() # global mean value used for imputation in case of NaNs
y_tr = sales_data[Target].values

mean_encoded_col = ['shop_id', 'item_id', 'item_category_id', 'cat_type']

# Correlation of every encoded feature with the target, kept for inspection
corrcoefs = pd.DataFrame(columns = ['Cor'])

for col in tqdm(mean_encoded_col):
    feature_name = col + '_cnt_day_mean_Expanding'

    # Work on an explicit copy so that adding a column never writes into a
    # slice of sales_data.
    col_tr = sales_data[[col, Target]].copy()
    target_by_group = col_tr.groupby(col)[Target]

    # For each row: sum and number of the target values of the *earlier*
    # rows (in frame order) that share the same category value.
    prior_sum = target_by_group.cumsum() - col_tr[Target]
    prior_cnt = target_by_group.cumcount()

    # The first row of every category has no earlier rows (0 / 0 -> NaN) and
    # falls back to the global mean. The filled result is assigned back
    # rather than relying on an in-place fillna of a selected column.
    col_tr[feature_name] = (prior_sum / prior_cnt).fillna(global_mean)

    corrcoefs.loc[feature_name] = np.corrcoef(y_tr, col_tr[feature_name])[0][1]

    # Index-aligned assignment; unlike pd.concat it does not append a second
    # copy of the column when the cell is run again.
    sales_data[feature_name] = col_tr[feature_name]

In [None]:
#Create test data and append it with sales data (train)
# The test rows become month block 34 and receive the same item / category
# attributes as the training rows; ID and item_name are not carried over.
temp_test = (
    test
    .assign(date_block_num=34)
    .drop('ID', axis=1)
    .merge(items, how='left', on='item_id')
    .merge(item_cats, how='left', on='item_category_id')
    .drop('item_name', axis=1)
)

sales_data = sales_data.drop('item_name', axis=1)
sales_final = pd.concat([sales_data, temp_test], axis=0, ignore_index=True)

sales_final.shape
sales_final.head()

In [None]:
#Delete dataframes and clear memory
del featdf, all_combi, col_tr, corrcoefs, temp_test
# The names of the remaining DataFrames are read from globals() directly
# instead of calling eval() on every name returned by dir().
alldfs = sorted(name for name, obj in list(globals().items()) if isinstance(obj, pd.DataFrame))
alldfs

In [None]:
#Had previously added simple mean-encoded features for item, shop, category and item-shop combinations
#However, it was observed that due to simple mean encoding some amount of overfitting was happening. Therefore, mean encoding with regularization (Expanding mean-encoding) was implemented instead
#for type_ids in [['item_id'], ['shop_id'], ['cat_type'], ['item_id', 'shop_id']]:
#    for column_id in ['item_price', 'item_cnt_day']:
#        mean_df = sales_data[type_ids + [column_id]].groupby(type_ids).agg(np.mean).reset_index()
#        mean_df.rename(
#            {column_id: "mean_of_"+column_id+"_groupby_"+"_".join(type_ids)},
#            axis='columns', inplace=True
#        )
#        sales_final = pd.merge(sales_final, mean_df, on=type_ids, how='left')

In [None]:
#Create lag features for selected months
lag_variables  = ['item_id_avg_item_price',
'item_id_sum_item_cnt_day',
'item_id_avg_item_cnt_day',
'cat_type_avg_item_cnt_day',
'cat_type_avg_item_price',
'cat_type_sum_item_cnt_day',
'shop_id_avg_item_price',
'shop_id_sum_item_cnt_day',
'shop_id_avg_item_cnt_day',
'shop_itemcat_avg_item_cnt_day',
'shop_itemcat_avg_item_price',
'shop_itemcat_sum_item_cnt_day',
'item_category_id_avg_item_price',
'item_category_id_sum_item_cnt_day',
'item_category_id_avg_item_cnt_day',
'shop_id_cnt_day_mean_Expanding',
'item_id_cnt_day_mean_Expanding',
'item_category_id_cnt_day_mean_Expanding',
'cat_type_cnt_day_mean_Expanding',
'item_cnt_day']

#Declare months for which lag features need to be created
lags = [1,2,3,4,5,6,12]

key_cols = ['date_block_num', 'shop_id', 'item_id']

# Take the key and source columns once, up front. Copying the whole of
# sales_final inside the loop would also copy every lag column added by the
# earlier iterations, although only these columns are ever used.
lag_source = sales_final[key_cols + lag_variables]

for lag in lags:
    sales_new_df = lag_source.copy()
    # Shifting the month forward by `lag` makes the values of month m line up
    # with the rows of month m + lag in the merge below.
    sales_new_df['date_block_num'] += lag
    sales_new_df.columns = key_cols + [lag_feat + '_lag_' + str(lag) for lag_feat in lag_variables]
    sales_final = pd.merge(sales_final, sales_new_df, on=key_cols, how='left')

del lag_source

# The longest lag is 12 months, so the earliest months are dropped.
# NOTE(review): `> 12` keeps month blocks 13 and later, so block 12 is dropped
# as well - confirm whether `>= 12` was intended.
sales_final = sales_final[sales_final['date_block_num']>12]

#fill missing values
# Count-like columns are filled with 0, price-like columns with their median;
# columns whose name contains neither substring are left as they are.
for feat in sales_final.columns:
    if 'item_cnt' in feat:
        sales_final[feat]=sales_final[feat].fillna(0)
    elif 'item_price' in feat:
        sales_final[feat]=sales_final[feat].fillna(sales_final[feat].median())

# Bring in the days related features from the calendar file
sales_final = pd.merge(sales_final,calendar[['date_block_num','MonthNo','Fridays','Saturdays','Sundays','NormalDays','TotalDays']],on='date_block_num', how='left')

#Drop extra columns
# Only the lagged versions of the aggregates are kept; the target
# `item_cnt_day` (last entry of lag_variables) is not dropped.
cols_to_drop = lag_variables[:-1] + ['item_price']
sales_final = sales_final.drop(cols_to_drop, axis=1)

In [None]:
# Month-over-month delta features: lag 1 minus lag 2 for every item, shop,
# item category and shop-item category aggregate. The last five entries of
# lag_variables (the four expanding mean encodings and the target) are skipped.
numeric_cols = lag_variables[:-5]
for base_feat in numeric_cols:
    previous_month = sales_final[base_feat + '_lag_1']
    month_before = sales_final[base_feat + '_lag_2']
    sales_final[base_feat + '_1m_diff'] = previous_month - month_before

In [None]:
# The plan is to use holdout scheme for cross validation
# 13 to 32 month data would be used for training (earlier months were dropped
# when the lag features were built)
# 33 month data would be used for CV
# 34 month would be the test month

# Explicit copies, so that clipping the target below changes independent
# frames instead of writing into slices of sales_final.
X_train = sales_final[sales_final['date_block_num']<33].copy()
X_cv =  sales_final[(sales_final['date_block_num']==33)].copy()
X_test = sales_final[sales_final['date_block_num']==34].copy()

#Values are clipped at 20 considering the distribution of the target variable
# Series.clip(lower, upper) does the job of the former clip_lower/clip_upper
# pair; the result is assigned back instead of clipping in place.
for split in (X_train, X_cv, X_test):
    split['item_cnt_day'] = split['item_cnt_day'].clip(lower=0, upper=20)

#Commented out: write the dataframes to disk
#X_train.to_csv('X_train.csv', index=False)
#X_cv.to_csv('X_cv.csv', index=False)
#X_test.to_csv('X_test.csv', index=False)

**Model 1** - Simple Linear Regression (LB - 1.15) *(Commented out)*
The idea was to first try out a simple regression model, see how it was faring on the LB, and then try more complex models

In [None]:
# First trying a simple linear regression model on the dataset
# all_data = sales_final
# dates = all_data['date_block_num']
# dates_train  = dates[dates <  34]
# dates_test  = dates[dates == 34]

# target_range = [0, 20]
# cols = [c for c in X_train.columns if c not in ['date_block_num', 'item_cnt_day','item_category_name']]
# colsa = [c for c in X_train.columns if c not in ['item_category_name']]

# all_data = all_data[colsa]

# X_train_1 = all_data[all_data['date_block_num']<34]
# X_test_1 = all_data[all_data['date_block_num']==34]

# X_train = X_train_1[cols]
# X_test = X_test_1[cols]

# y_train = X_train_1['item_cnt_day']
# y_test =  X_test_1['item_cnt_day']

# lr = LinearRegression()
# lr.fit(X_train.values, y_train)

# pred_lr = lr.predict(X_test.values).clip(*target_range)

# submission = pd.DataFrame({
#    "ID": test.index, 
#    "item_cnt_month": pred_lr
# })
# submission.to_csv('lr_submission.csv', index=False)

**Model 2** - LGBM (LB = 1.1) *(Commented out)*
* The next attempt was an LGBM model, with some parameters tried by trial and error; however, training/CV was taking a lot of time and the results didn't seem substantially better than simple linear regression
* The plan was also to use a linear regression and LGBM ensemble; however, the results from a quick simple convex mix solution (LB = 1.05) weren't a significant improvement
* Therefore, I decided to go for an XGBoost solution

In [None]:
# X_train_1 = X_train.drop(['item_cnt_day','item_category_name'], axis=1)
# Y_train = X_train['item_cnt_day']
# X_valid_1 = X_cv.drop(['item_cnt_day','item_category_name'], axis=1)
# Y_valid = X_cv['item_cnt_day']
# X_test = X_test.drop(['item_category_name'], axis=1)

# train_data = lgb.Dataset(data=X_train_1, label=Y_train)
# valid_data = lgb.Dataset(data=X_valid_1, label=Y_valid)

# params = {
#               'feature_fraction': 0.75,
#               'metric': 'rmse',
#               'nthread':1, 
#               'min_data_in_leaf': 2**7, 
#               'bagging_fraction': 0.75, 
#               'learning_rate': 0.03, 
#               'objective': 'mse', 
#               'bagging_seed': 2**7, 
#               'num_leaves': 2**7,
#               'bagging_freq':1,
#               'verbose':0 
#              }
#    
# lgb_model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], verbose_eval=1000) 
# Y_test = lgb_model.predict(X_test).clip(0, 20)

# submission = pd.DataFrame({
#    "ID": test.index, 
#    "item_cnt_month": Y_test
# })
# submission.to_csv('lgbm_submission.csv', index=False)

**Model 3**  - XGBoost (LB = 0.90661)
*   Per the points stated above, I decided to implement XGBoost for the problem. My decision was also motivated by the fact that XGBoost is the most widely used algorithm in Kaggle challenges
* The parameters for XGBoost were optimized using RandomizedSearchCV rather than the native XGBoost API or GridSearchCV, as those were too time-consuming


In [None]:
# Commented out as it takes considerable time to complete all iterations
# Hyperparameter optimization using RandomizedSearchCV
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.metrics import mean_squared_error

#X_train_1 = X_train
#X_cv = X_cv
#X_test_1 =X_test

#cols = [c for c in X_train_1.columns if c not in ['date_block_num', 'item_cnt_day','item_category_name']]

#x_train = X_train_1[cols]
#y_train = X_train_1['item_cnt_day']
#x_valid = X_cv[cols]
#y_valid = X_cv['item_cnt_day']
#x_test = X_test_1[cols]

#clf = xgb.XGBRegressor()

#param_grid = {
#        'silent': [False],
#        'max_depth': [6, 10, 15, 20],
#        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
#        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
#        'gamma': [0, 0.25, 0.5, 1.0],
#        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
#        'n_estimators': [100]}
#
#fit_params = {'eval_metric': 'rmse',
#              'early_stopping_rounds': 10,
#              'eval_set': [(x_valid, y_valid)]}
#
#rs_clf = RandomizedSearchCV(clf, param_grid, n_iter=20,
#                            n_jobs=1, verbose=2, cv=2,
#                            fit_params=fit_params,
#                            scoring='neg_mean_squared_error',random_state=42)
#
#rs_clf.fit(x_train, y_train)
#
#best_score = rs_clf.best_score_
#best_params = rs_clf.best_params_
#print("Best score: {}".format(best_score))
#print("Best params: ")
#for param_name in sorted(best_params.keys()):
#    print('%s: %r' % (param_name, best_params[param_name]))   

In [None]:
# XGBoost Model Training and Fit
# 'n_estimators' is not listed here: with the native xgb.train API the number
# of boosting rounds is the third positional argument (3500 below), capped by
# early stopping.
# NOTE(review): 'silent', 'reg:linear' and predict(ntree_limit=...) belong to
# older xgboost releases - confirm against the installed version before upgrading.
params = {
'colsample_bytree': 0.9, 'eta': 0.2, 'eval_metric': 'rmse', 'gamma': 1.0,'lambda': 50.0,
'max_depth': 8, 'min_child_weight': 300.0, 'objective': 'reg:linear',
'seed': 50, 'silent': 1, 'subsample': 0.7, 'tree_method': 'exact', 'colsample_bylevel':0.4
}

# Every column except the month index, the target and the category name is a feature
cols = [c for c in X_train.columns if c not in ['date_block_num', 'item_cnt_day','item_category_name']]

x1 = X_train[cols]
y1 = X_train['item_cnt_day']
x2 = X_cv[cols]
y2 = X_cv['item_cnt_day']

# Build each DMatrix once and share the training matrix between the watchlist
# and the training call, instead of constructing it twice.
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 3500,  watchlist, maximize=False, verbose_eval=50, early_stopping_rounds=50)

pred = model.predict(xgb.DMatrix(X_test[cols]), ntree_limit=model.best_ntree_limit)

# The submission is written from a new frame, so `test` is left unmodified and
# the cell can be re-run. Predictions are attached by position, so there must
# be exactly one prediction per test row.
assert len(pred) == len(test), "number of predictions does not match the number of test rows"
submission = pd.DataFrame({'ID': test['ID'].values, 'item_cnt_month': pred.clip(0,20)})
submission.to_csv('xgb_submission.csv', index=False)

In [None]:
# Plot the feature importance
from xgboost import plot_importance
def plot_features(booster, figsize):
    """Draw the feature importance chart of ``booster`` on a new figure.

    ``figsize`` is passed to ``plt.subplots``; the value returned by
    ``plot_importance`` is handed back to the caller.
    """
    figure, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_axes = plot_importance(booster=booster, ax=axis)
    return importance_axes
plot_features(model, (10,20))

* Lags 1-3 seem to be the most important lag features suggesting a relation between recent month sales on future sales
* Item count aggregates and lag features seem to be the most important features
* Also, item id and item category id are important features which is in-line with the observation of some categories/ items showing high sales consistently
* Item count delta price lag is also one of the important features suggesting that previous dip/ increase in sales impact future sales
* As expected month number is an important feature in-line with the seasonal nature of the data
* The feature capturing number of weekends/ weekdays isn’t an important feature; therefore, the peak in sales during weekends may not be generalizable