In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings


warnings.filterwarnings('ignore')

from scipy import stats
from xgboost.sklearn import XGBRegressor
from itertools import product
from fbprophet import Prophet
import sklearn

In [7]:
# Load the raw sales records, then discard the serial-number column,
# which is not used by anything that follows.
train = pd.read_csv('yds_train2018.csv')
del train['S_No']

In [11]:
# Preview the first rows of the raw sales frame.
train.head()

Unnamed: 0,Year,Month,Week,Merchant_ID,Product_ID,Country,Sales
0,2013,1,1,ar00001,1,Argentina,157500.0
1,2013,1,1,ar00003,1,Argentina,39375.0
2,2013,1,1,ar00004,1,Argentina,15750.0
3,2013,1,1,ar00007,1,Argentina,47250.0
4,2013,1,1,ar00008,1,Argentina,283500.0


### Preprocessing Sales Dataframe

In [12]:
def grouper(df):
    """Summarise one group of sales rows.

    Parameters
    ----------
    df : DataFrame
        Rows of a single group; must contain 'Sales' and 'Merchant_ID'.

    Returns
    -------
    list
        ``[total_sales, distinct_merchant_count]``.
    """
    total_sales = df['Sales'].sum()
    merchant_count = df['Merchant_ID'].nunique()
    return [total_sales, merchant_count]

In [13]:
# Aggregate the raw rows to one record per (Country, Product_ID, Year, Month).
# grouper() returns [total sales, distinct merchant count] for each group, so
# the result is a single column of two-element lists.
#
# * The columns are selected with a list: tuple-style selection
#   (['Sales','Merchant_ID'] without the inner brackets) is rejected by
#   pandas >= 2.0.
# * The group keys are deliberately left in the index (no as_index=False):
#   the following cells rename the single result column and then call
#   reset_index() to turn the keys back into columns.
df_1 = pd.DataFrame(
    train.groupby(['Country', 'Product_ID', 'Year', 'Month'])[['Sales', 'Merchant_ID']]
         .apply(grouper)
)

In [14]:
# Name the single column produced by the grouped apply; each cell of it holds
# the two-element list returned by grouper().
df_1.columns = ['Sales_merchant']

In [15]:
# Split the [total sales, merchant count] pairs into two scalar columns.
sales_merchant_pairs = df_1['Sales_merchant']
df_1['Sales'] = sales_merchant_pairs.map(lambda pair: pair[0])
df_1['Merchant_n'] = sales_merchant_pairs.map(lambda pair: pair[1])

In [16]:
# The packed helper column has been unpacked and is no longer needed.
del df_1['Sales_merchant']

In [17]:
# Move the group keys out of the index and back into ordinary columns.
df_1 = df_1.reset_index()

In [19]:
# Preview the monthly aggregate: one row per country / product / year / month.
df_1.head()

Unnamed: 0,Country,Product_ID,Year,Month,Sales,Merchant_n
0,Argentina,1,2013,1,34346025.0,63
1,Argentina,1,2013,2,32005575.0,66
2,Argentina,1,2013,3,32530050.0,52
3,Argentina,1,2013,4,35588700.0,51
4,Argentina,1,2013,5,38789100.0,51


### Preprocessing the holiday dataframe to merge into the sales dataframe

In [20]:
# Load the holiday calendar and derive the calendar month and year of each
# entry using the vectorised .dt accessors (no per-row Python lambda).
holiday = pd.read_csv('holidays.csv')
holiday['Date'] = pd.to_datetime(holiday['Date'])
holiday['Month'] = holiday['Date'].dt.month
holiday['Year'] = holiday['Date'].dt.year

# Count the holiday entries per country and calendar month.
# Resulting columns, in order: Country, Year, Month, Holiday (the count).
holiday_n = (
    holiday.groupby(['Country', 'Year', 'Month'])['Holiday']
    .count()
    .reset_index()
)

In [21]:
# Attach the monthly holiday count to every matching (country, year, month)
# row of the aggregated sales frame; rows without a match keep 0.
df_1['holiday'] = 0

# itertuples() yields each row once as plain Python values. This avoids
# building a row Series four times per iteration and avoids positional
# integer look-ups on a label-indexed Series, which pandas has deprecated.
holiday_rows = holiday_n[['Country', 'Year', 'Month', 'Holiday']]
for country, year, month, n_holidays in holiday_rows.itertuples(index=False, name=None):
    matches = (df_1.Country == country) & (df_1.Year == year) & (df_1.Month == month)
    df_1.loc[matches, 'holiday'] = n_holidays

In [22]:
# Attach the promotional expense to every matching
# (country, year, month, product) row; rows without a match keep 0.
# The column starts as float because the values written into it are floats
# (see the rendered preview); writing floats into an int column relies on an
# implicit upcast that recent pandas versions deprecate.
df_1['exp'] = 0.0

exp = pd.read_csv('promotional_expense.csv')

# The expense file is read positionally, exactly as before: the first five
# columns are taken to be year, month, country, product id and amount.
# NOTE(review): column names of the CSV are not visible here - confirm order.
for year, month, country, product_id, amount in exp.iloc[:, :5].itertuples(index=False, name=None):
    matches = (
        (df_1.Country == country)
        & (df_1.Year == year)
        & (df_1.Month == month)
        & (df_1.Product_ID == product_id)
    )
    df_1.loc[matches, 'exp'] = amount

In [23]:
# Preview the aggregate again, now with the 'holiday' and 'exp' columns added.
df_1.head()

Unnamed: 0,Country,Product_ID,Year,Month,Sales,Merchant_n,holiday,exp
0,Argentina,1,2013,1,34346025.0,63,1,14749.307
1,Argentina,1,2013,2,32005575.0,66,2,12187.566
2,Argentina,1,2013,3,32530050.0,52,4,13076.579
3,Argentina,1,2013,4,35588700.0,51,1,14377.199
4,Argentina,1,2013,5,38789100.0,51,2,15652.861


### Preprocessing test dataframe accordingly

In [24]:
# Build the same 'holiday' and 'exp' features on the test frame.
test = pd.read_csv('yds_test2018.csv')

# Make sure both feature columns exist even if no row matches below, so later
# feature selection cannot fail on a missing column.
for feature_column in ('holiday', 'exp'):
    if feature_column not in test.columns:
        test[feature_column] = 0.0

# Monthly holiday count per (country, year, month). Rows are iterated once as
# plain tuples instead of repeated positional look-ups on holiday_n.loc[i].
holiday_rows = holiday_n[['Country', 'Year', 'Month', 'Holiday']]
for country, year, month, n_holidays in holiday_rows.itertuples(index=False, name=None):
    matches = (test.Country == country) & (test.Year == year) & (test.Month == month)
    test.loc[matches, 'holiday'] = n_holidays

# Promotional expense per (country, year, month, product). The expense frame
# is read positionally, as before: year, month, country, product id, amount.
for year, month, country, product_id, amount in exp.iloc[:, :5].itertuples(index=False, name=None):
    matches = (
        (test.Country == country)
        & (test.Year == year)
        & (test.Month == month)
        & (test.Product_ID == product_id)
    )
    test.loc[matches, 'exp'] = amount

# Replaces every remaining NaN in the frame (all columns, not only the two
# feature columns) with 0.
test.fillna(0, inplace=True)

In [25]:
# Placeholders for the per-model predictions. They are created as floats
# because the model outputs written into them later are floats; starting from
# an int column would rely on an implicit upcast that recent pandas versions
# deprecate. Rows that never receive a prediction keep 0.
test['pred_xgb'] = 0.0
test['pred_fbp'] = 0.0

### Modeling

### Training an XGBRegressor tuned using SMAPE as the metric

In [26]:
# NOTE: from this point on `train` is a copy of the monthly aggregate df_1;
# it no longer refers to the raw frame read from yds_train2018.csv.
train = df_1.copy()

In [27]:
# Distinct countries (A) and distinct product IDs (B) in the aggregated data.
A = df_1.Country.unique()
B = df_1.Product_ID.unique()

In [28]:
# Columns used as model inputs by the XGBoost regressor below.
features = ['Year','Month','exp','holiday']

In [29]:
# Every (country, product) pairing of the distinct values; a pairing may have
# no rows in the data, which the training loops check for.
comb = list(product(A,B))

In [30]:
def SMAPE(actual, pred, normalize=False):
    """Symmetric absolute percentage error between two sequences.

    Each element contributes ``|pred - actual| / ((|pred| + |actual|) / 2)``.
    An element where both values are zero contributes 0 (a perfect match)
    instead of the NaN that 0/0 would produce, so one such row no longer
    turns the whole score into NaN.

    Parameters
    ----------
    actual : array-like
        Observed values; converted to float.
    pred : array-like
        Predicted values; converted to float.
    normalize : bool, default False
        When False (the default) the SUM of the per-element terms is
        returned, so the value grows with the number of rows. When True the
        conventional percentage form is returned: ``100 * mean(terms)``.

    Returns
    -------
    float
        Sum of the per-element terms, or their mean times 100 when
        ``normalize`` is True (0.0 for empty input in that case).
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)

    abs_error = np.abs(pred - actual)
    half_scale = (np.abs(pred) + np.abs(actual)) / 2

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0 they were initialised with.
    terms = np.divide(
        abs_error,
        half_scale,
        out=np.zeros_like(abs_error),
        where=half_scale != 0,
    )

    total = np.sum(terms)
    if normalize:
        return 100.0 * total / terms.size if terms.size else 0.0
    return total

In [31]:
# Fit one gradient-boosted regressor per (country, product) series, then store
# its predictions for the test rows and its in-sample fit for the train rows.

# The last rows of each series get a higher sample weight. df_1 was produced
# by a sorted groupby on (Country, Product_ID, Year, Month), so the last rows
# of a series should be its most recent months - TODO confirm if the frame is
# ever re-sorted.
recent_rows = 9
recent_weight = 1.25

for country, product_id in comb:
    train_mask = (train.Country == country) & (train.Product_ID == product_id)
    check = train[train_mask]
    if check.empty:
        # This country/product pairing has no history to learn from.
        continue

    weight = np.ones(shape=check.shape[0])
    weight[-recent_rows:] = recent_weight

    clf = XGBRegressor(n_estimators=40, max_depth=3, learning_rate=0.13)
    clf.fit(check[features].values, check['Sales'].values, sample_weight=weight)

    # Predict only when the test set actually contains this series, so the
    # model is never asked to score an empty matrix.
    test_mask = (test.Country == country) & (test.Product_ID == product_id)
    if test_mask.any():
        test.loc[test_mask, 'pred_xgb'] = clf.predict(test.loc[test_mask, features].values)

    # In-sample fit, used later to score and blend the models.
    train.loc[check.index, 'pred_xgb'] = clf.predict(check[features].values)

In [32]:
# Prophet expects a datetime column 'ds' and a target column 'y'.
# The timestamp (first day of each month) is assembled directly from the
# Year/Month columns; this is vectorised and avoids building 'YYYY-M' strings
# row by row and relying on the parser to guess their format.
train['ds'] = pd.to_datetime(
    pd.DataFrame({'year': train['Year'], 'month': train['Month'], 'day': 1})
)
train['y'] = train['Sales']

In [33]:
# Datetime column 'ds' for the test rows (first day of each month), assembled
# directly from the Year/Month columns rather than from row-by-row strings.
test['ds'] = pd.to_datetime(
    pd.DataFrame({'year': test['Year'], 'month': test['Month'], 'day': 1})
)

### Training the FBProphet model

In [34]:
# Fit one Prophet model per (country, product) series with 'exp' and 'holiday'
# as extra regressors, then store its forecast for the test rows and its
# in-sample fit for the train rows.
#
# Each slice is sorted by 'ds' before it is handed to Prophet and results are
# written back through that sorted slice's index. Prophet is understood to
# return its forecast ordered by 'ds' (NOTE(review): confirm against the
# installed version), so writing yhat back by position is only safe when the
# input has the same order.
for country, product_id in comb:
    train_mask = (train.Country == country) & (train.Product_ID == product_id)
    check = train.loc[train_mask, ['ds', 'y', 'exp', 'holiday']].sort_values('ds', kind='mergesort')
    if check.empty:
        # No history for this country/product pairing.
        continue

    m2 = Prophet()
    m2.add_regressor(name='exp')
    m2.add_regressor(name='holiday')
    m2.fit(check)

    # Forecast only when the test set contains this series; the model is not
    # asked to predict on an empty frame.
    test_mask = (test.Country == country) & (test.Product_ID == product_id)
    if test_mask.any():
        pred_df = test.loc[test_mask, ['ds', 'exp', 'holiday']].sort_values('ds', kind='mergesort')
        Z = m2.predict(pred_df)
        test.loc[pred_df.index, 'pred_fbp'] = Z['yhat'].values

    # In-sample fit, used later to score and blend the models.
    Y = m2.predict(check[['ds', 'exp', 'holiday']])
    train.loc[check.index, 'pred_fbp'] = Y['yhat'].values

INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet.forecaster:n_changepoints greater than number of observations.Using 18.0.
INFO:fbprophet.forecaster:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override thi

In [35]:
# In-sample score of the Prophet fit against actual sales. SMAPE() returns the
# sum of the per-row terms, so the value depends on the number of rows.
SMAPE(train['Sales'].values,train['pred_fbp'].values)

46.660412124226426

### Combining the XGBoost and fbprophet results

In [37]:
# In-sample score of the blend: 80% Prophet fit + 20% XGBoost fit.
SMAPE(train['Sales'].values,(0.8*train['pred_fbp'].values+0.2*train['pred_xgb'].values))

42.533781323406586

In [38]:
# Final forecast for the test rows, using the same 0.8 / 0.2 weights that were
# scored in-sample above.
test['Sales'] = (0.8*test['pred_fbp']+0.2*test['pred_xgb'])

In [39]:
# Keep only the columns that go into the submission file.
subm = test[['S_No','Year','Month','Product_ID','Country','Sales']]

In [40]:
# Write the submission CSV without the DataFrame index column.
subm.to_csv('yds_submission2018.csv',index=False)