## Demand Prediction using NestedCV for Time Series

#### Importing Necessary Libraries

In [3]:
import pandas as pd
import numpy as np
from nestedcv import NestedCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Load the raw training set; the last expression displays (rows, columns).
df = pd.read_csv(filepath_or_buffer=r"data/train.csv")
df.shape

(7560, 12)

In [5]:
# Discard every row containing at least one missing value, then renumber the index from 0.
df = df.dropna(axis=0, how="any").reset_index(drop=True)
df.shape

(6376, 12)

In [6]:
# Parse the date strings into datetimes and remove the columns not used downstream.
df['date'] = pd.to_datetime(df['date'])
df = df.drop(columns=['long', 'lat', 'pop'])
df.shape

(6376, 9)

#### Preprocessing

In [7]:
# Work on an independent copy so the cleaned frame `df` stays untouched.
df1 = df.copy(deep=True)

One-hot encode the categorical features (city, shop, brand and container), and map capacity to an ordinal code, where:

330ml = 0
500ml = 1
1.5lt = 2

In [8]:
def get_one_hot_encoded(df: pd.DataFrame):
    """Return a new frame with the categorical columns one-hot encoded.

    The ``city``, ``shop``, ``brand`` and ``container`` columns are each
    expanded into 0/1 integer indicator columns named ``<column>_<value>``.
    The indicator columns are appended after the remaining columns, in that
    column order, and the four original columns are removed. The frame
    passed in is not modified.
    """
    categorical_cols = ['city', 'shop', 'brand', 'container']

    # One block of indicator columns per categorical column, in the order above.
    indicator_frames = [
        pd.get_dummies(df[col], prefix=col).astype(int)
        for col in categorical_cols
    ]

    encoded = pd.concat([df, *indicator_frames], axis=1)
    return encoded.drop(columns=categorical_cols)

In [9]:
df_new = get_one_hot_encoded(df1)
# Ordinal code for bottle size, smallest to largest; labels missing from the mapping become NaN.
df_new['capacity'] = df_new['capacity'].map({'330ml': 0, '500ml': 1, '1.5lt': 2})
df_new.head()

Unnamed: 0,id,date,capacity,price,quantity,city_Athens,city_Irakleion,city_Larisa,city_Patra,city_Thessaloniki,...,shop_shop_5,shop_shop_6,brand_adult-cola,brand_gazoza,brand_kinder-cola,brand_lemon-boost,brand_orange-power,container_can,container_glass,container_plastic
0,0.0,2012-01-31,1,0.96,13280.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,1.0,2012-01-31,2,2.86,6727.0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,2.0,2012-01-31,0,0.87,9848.0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,3.0,2012-01-31,1,1.0,20050.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,4.0,2012-01-31,0,0.39,25696.0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


Defining different baseline models to run the first iteration.

In [10]:
# Baseline regressors compared in the first iteration, keyed by display name.
# Every estimator that accepts a seed gets the same one so that re-runs are
# comparable; LinearRegression takes no seed.
models = {
    'Linear_Regressor' : LinearRegression(),
    'Ridge_Regressor' : Ridge(random_state=42),
    'Linear_SVR': LinearSVR(random_state=42),
    'Decision_Tree_Regressor' : DecisionTreeRegressor(random_state=42),
    'Random_Forest_Regressor' : RandomForestRegressor(random_state=42),
    'Gradient_Boosting_Regressor' : GradientBoostingRegressor(random_state=42),
    'XGBoost_Regressor' : XGBRegressor(random_state=42)
}

#### Model Training and Cross Validation

In [11]:
# Show every column of the encoded frame, to choose the model inputs from.
df_new.columns

Index(['id', 'date', 'capacity', 'price', 'quantity', 'city_Athens',
       'city_Irakleion', 'city_Larisa', 'city_Patra', 'city_Thessaloniki',
       'shop_shop_1', 'shop_shop_2', 'shop_shop_3', 'shop_shop_4',
       'shop_shop_5', 'shop_shop_6', 'brand_adult-cola', 'brand_gazoza',
       'brand_kinder-cola', 'brand_lemon-boost', 'brand_orange-power',
       'container_can', 'container_glass', 'container_plastic'],
      dtype='object')

In [12]:
# Model inputs: every encoded column except the identifier, the date and the target.
features = [
    'capacity', 'price',
    'city_Athens', 'city_Irakleion', 'city_Larisa', 'city_Patra', 'city_Thessaloniki',
    'shop_shop_1', 'shop_shop_2', 'shop_shop_3', 'shop_shop_4', 'shop_shop_5', 'shop_shop_6',
    'brand_adult-cola', 'brand_gazoza', 'brand_kinder-cola', 'brand_lemon-boost', 'brand_orange-power',
    'container_can', 'container_glass', 'container_plastic',
]
target = 'quantity'

Using custom NestedCV class for 5 folds.

In [30]:
# Number of folds requested from the custom splitter.
k=5
cv = NestedCV(k=k)
# The loop below unpacks each item of `splits` as a (train, val) pair of frames.
# NOTE(review): presumably the pairs are ordered by `date` with a growing
# training window — confirm against nestedcv.NestedCV.
splits = cv.split(data=df_new, date_column='date')

Running each model and generating R2 Score for each fold.

In [31]:
# Fit every baseline model on each fold and record its validation R2.
check = []    # one record per fold: train / validation feature-matrix shapes
result = []   # one record per (fold, model): rounded validation R2

print(f"Running for {k} folds...")

for i, (train, val) in enumerate(splits):
    X_train = train[features].copy()
    y_train = train[target]

    X_val = val[features].copy()
    y_val = val[target]

    check.append({
        'Fold':i,
        'X_train_shape':X_train.shape,
        'X_val_shape':X_val.shape
    })

    # The estimators in `models` are refit in place, so once the loop ends
    # they hold the fit from the last fold (later cells rely on this).
    for name, model in models.items():

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        result.append({
            'Fold':i,
            'Model':name,
            'R2_score':round(r2_score(y_val, y_pred),2)
        })

    # Report completion only after every model has been fit and scored.
    print("------------------")
    print(f"Fold {i} Completed ")

# Build the summary tables once, after all folds; the explicit columns keep
# both frames defined (and well-formed) even if no fold was produced.
check_df = pd.DataFrame(check, columns=['Fold', 'X_train_shape', 'X_val_shape'])
result_df = pd.DataFrame(result, columns=['Fold', 'Model', 'R2_score'])

Running for 5 folds...
---------------------------------------------------------------------------------------------
Fold 0: 
---------------------------------------------------------------------------------------------
Fold 1: 
---------------------------------------------------------------------------------------------
Fold 2: 
---------------------------------------------------------------------------------------------
Fold 3: 
---------------------------------------------------------------------------------------------
Fold 4: 


In [32]:
# Sanity check: train / validation shapes recorded for each fold.
check_df

Unnamed: 0,Fold,X_train_shape,X_val_shape
0,0,"(1064, 21)","(1063, 21)"
1,1,"(2127, 21)","(1068, 21)"
2,2,"(3195, 21)","(1061, 21)"
3,3,"(4256, 21)","(1058, 21)"
4,4,"(5314, 21)","(1062, 21)"


In [33]:
# Validation R2 for every (fold, model) pair.
result_df

Unnamed: 0,Fold,Model,R2_score
0,0,Linear_Regressor,0.56
1,0,Ridge_Regressor,0.56
2,0,Linear_SVR,-1.99
3,0,Decision_Tree_Regressor,0.57
4,0,Random_Forest_Regressor,0.7
5,0,Gradient_Boosting_Regressor,0.71
6,0,XGBoost_Regressor,0.67
7,1,Linear_Regressor,0.55
8,1,Ridge_Regressor,0.55
9,1,Linear_SVR,-1.68


In [34]:
# X_val / y_val and the fitted estimators are whatever the cross-validation
# loop left behind, i.e. these scores are for the last fold only.
for name, model in models.items():
    last_fold_r2 = model.score(X_val, y_val)
    print(name + " R^2: {:.5f}".format(last_fold_r2))

Linear_Regressor R^2: 0.53226
Ridge_Regressor R^2: 0.53272
Linear_SVR R^2: -1.28309
Decision_Tree_Regressor R^2: 0.51500
Random_Forest_Regressor R^2: 0.62538
Gradient_Boosting_Regressor R^2: 0.69603
XGBoost_Regressor R^2: 0.66484


In [35]:
# Display the estimators and their parameters.
models

{'Linear_Regressor': LinearRegression(),
 'Ridge_Regressor': Ridge(random_state=42),
 'Linear_SVR': LinearSVR(random_state=42),
 'Decision_Tree_Regressor': DecisionTreeRegressor(random_state=42),
 'Random_Forest_Regressor': RandomForestRegressor(random_state=42),
 'Gradient_Boosting_Regressor': GradientBoostingRegressor(random_state=42),
 'XGBoost_Regressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
          

Using Gradient Boosting Regressor as final model to predict on test data 

In [36]:
# Same estimator object that the cross-validation loop fit in place, so it
# carries the fit from the last fold.
gbr = models['Gradient_Boosting_Regressor']

In [37]:
# Pair each feature with the fitted model's importance score, highest first.
fea_imp_gbr = pd.DataFrame({"Features": features, "Importance": gbr.feature_importances_})
fea_imp_gbr = fea_imp_gbr.sort_values(by='Importance', ascending=False).reset_index(drop=True)
fea_imp_gbr

Unnamed: 0,Features,Importance
0,price,0.551478
1,city_Patra,0.066684
2,brand_gazoza,0.056437
3,shop_shop_6,0.054183
4,brand_kinder-cola,0.042386
5,brand_orange-power,0.040165
6,city_Athens,0.030842
7,shop_shop_3,0.025016
8,capacity,0.021919
9,brand_lemon-boost,0.019084


In [38]:
# Unrounded R2 of the gradient-boosting model on the validation split left over from the loop.
y_pred = gbr.predict(X_val)
r2_score(y_true=y_val, y_pred=y_pred)

0.6960252470861327

#### Iteration 2 - Using only the top 14 features (importance >= 0.012)

In [39]:
# Names of the features whose importance is at least 0.012, most important first.
fea_imp_gbr.query("Importance>=0.012")['Features'].tolist()

['price',
 'city_Patra',
 'brand_gazoza',
 'shop_shop_6',
 'brand_kinder-cola',
 'brand_orange-power',
 'city_Athens',
 'shop_shop_3',
 'capacity',
 'brand_lemon-boost',
 'city_Larisa',
 'shop_shop_5',
 'container_can',
 'container_glass']

In [47]:
# Reduced input set: the features that passed the importance cutoff, in ranked order.
features = [
    'price',
    'city_Patra',
    'brand_gazoza',
    'shop_shop_6',
    'brand_kinder-cola',
    'brand_orange-power',
    'city_Athens',
    'shop_shop_3',
    'capacity',
    'brand_lemon-boost',
    'city_Larisa',
    'shop_shop_5',
    'container_can',
    'container_glass',
]

target = 'quantity'

In [48]:
# Recreate the folds for the second iteration.
# NOTE(review): presumably needed because the earlier `splits` iterable was
# consumed by the first loop — confirm against nestedcv.NestedCV.
k=5
cv = NestedCV(k=k)
splits = cv.split(data=df_new, date_column='date')

In [49]:
# Re-run the cross-validation with only the gradient-boosting model and the
# reduced feature set.
check = []    # one record per fold: train / validation feature-matrix shapes
result = []   # one record per fold: rounded validation R2

print(f"Running for {k} folds...")

for i, (train, val) in enumerate(splits):
    X_train = train[features].copy()
    y_train = train[target]

    X_val = val[features].copy()
    y_val = val[target]

    check.append({
        'Fold':i,
        'X_train_shape':X_train.shape,
        'X_val_shape':X_val.shape
    })

    # A fresh estimator per fold; after the loop `gbr1` is the model from the
    # last fold, which is what the test-set prediction further down uses.
    gbr1 = GradientBoostingRegressor(random_state=42)
    gbr1.fit(X_train, y_train)
    y_pred = gbr1.predict(X_val)

    result.append({
        'Fold':i,
        'Model':'Gradient Boosting Regressor',
        'R2_score':round(r2_score(y_val, y_pred),2)
    })

    # Report completion only after the model has been fit and scored.
    print("------------------")
    print(f"Fold {i} Completed ")

# Build the summary tables once, after all folds; the explicit columns keep
# both frames defined (and well-formed) even if no fold was produced.
check_df = pd.DataFrame(check, columns=['Fold', 'X_train_shape', 'X_val_shape'])
result_df = pd.DataFrame(result, columns=['Fold', 'Model', 'R2_score'])

Running for 5 folds...
------------------
Fold 0 Completed 
------------------
Fold 1 Completed 
------------------
Fold 2 Completed 
------------------
Fold 3 Completed 
------------------
Fold 4 Completed 


In [50]:
# Sanity check: per-fold shapes with the reduced feature set.
check_df

Unnamed: 0,Fold,X_train_shape,X_val_shape
0,0,"(1064, 14)","(1063, 14)"
1,1,"(2127, 14)","(1068, 14)"
2,2,"(3195, 14)","(1061, 14)"
3,3,"(4256, 14)","(1058, 14)"
4,4,"(5314, 14)","(1062, 14)"


In [51]:
# Per-fold validation R2 of the gradient-boosting model.
result_df

Unnamed: 0,Fold,Model,R2_score
0,0,Gradient Boosting Regressor,0.71
1,1,Gradient Boosting Regressor,0.71
2,2,Gradient Boosting Regressor,0.7
3,3,Gradient Boosting Regressor,0.68
4,4,Gradient Boosting Regressor,0.69


## Prediction on test set

In [52]:
# Load the held-out test set; the last expression displays (rows, columns).
test = pd.read_csv(filepath_or_buffer=r'data/test.csv')
test.shape

(1080, 12)

In [53]:
# Discard every test row containing at least one missing value, then renumber the index from 0.
test = test.dropna(axis=0, how="any").reset_index(drop=True)
test.shape

(1058, 12)

In [54]:
# handling date column and dropping unnecessary columns
test['date'] = pd.to_datetime(test['date'])
test.drop(['long','lat','pop'], axis=1, inplace=True)
test.shape

(1058, 9)

In [55]:
test_new = get_one_hot_encoded(test)

# Ordinal code for bottle size, identical to the mapping used for the training data.
test_new['capacity'] = test_new['capacity'].map({'330ml':0,
                        '500ml':1,
                        '1.5lt':2})

# The test set is encoded on its own, so a category that is absent from it
# would leave an indicator column missing, and a category unseen in training
# would add one. Align to the training columns: missing indicators are added
# as all-zero columns and unknown ones are dropped.
test_new = test_new.reindex(columns=df_new.columns, fill_value=0)

test_new.head()

Unnamed: 0,id,date,capacity,price,quantity,city_Athens,city_Irakleion,city_Larisa,city_Patra,city_Thessaloniki,...,shop_shop_5,shop_shop_6,brand_adult-cola,brand_gazoza,brand_kinder-cola,brand_lemon-boost,brand_orange-power,container_can,container_glass,container_plastic
0,6480,2018-01-31,2,3.1,7056,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,6481,2018-01-31,0,0.85,12490,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,6482,2018-01-31,1,0.83,26640,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,6483,2018-01-31,1,0.54,41892,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,6484,2018-01-31,2,0.83,22923,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [56]:
# Same reduced feature list used for the second cross-validation run.
features = [
    'price',
    'city_Patra',
    'brand_gazoza',
    'shop_shop_6',
    'brand_kinder-cola',
    'brand_orange-power',
    'city_Athens',
    'shop_shop_3',
    'capacity',
    'brand_lemon-boost',
    'city_Larisa',
    'shop_shop_5',
    'container_can',
    'container_glass',
]

target = 'quantity'

In [57]:
# Split the encoded test frame into model inputs and the true target.
X_test, y_test = test_new[features].copy(), test_new[target]

In [58]:
# `gbr1` is the estimator left over from the last cross-validation fold, so it
# was fit on that fold's training split rather than on the full training data.
y_pred = gbr1.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.6713760864421687