In [30]:
import os
import gc
import pandas as pd
import numpy as np
from datetime import date
import matplotlib.pyplot as plt
import seaborn as sns

import time
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings("ignore")

np.random.seed(0)

In [31]:
train = pd.read_csv("train_fwYjLYX.csv", parse_dates=['application_date'])
test = pd.read_csv("test_1eLl9Yf.csv", parse_dates=['application_date'])
submission = pd.read_csv("sample_submission_IIzFVsf.csv", parse_dates=['application_date'])
train.shape,test.shape,submission.shape

((80402, 6), (180, 3), (180, 4))

In [32]:
test.drop(['id'], axis=1, inplace=True)
train = train.sort_values('application_date').reset_index(drop = True)
test = test.sort_values('application_date').reset_index(drop = True)

In [33]:
train.application_date.min(), train.application_date.max()

(Timestamp('2017-04-01 00:00:00'), Timestamp('2019-07-23 00:00:00'))

In [34]:
test.application_date.min(), test.application_date.max()

(Timestamp('2019-07-06 00:00:00'), Timestamp('2019-10-24 00:00:00'))

In [35]:
agg_func = {'case_count': ['sum']}
agg_name = train.groupby(['segment','application_date']).agg(agg_func)
agg_name.columns = [ 'SA_' + ('_'.join(col).strip()) for col in agg_name.columns.values]
agg_name.reset_index(inplace=True)
train = train.merge(agg_name, on=['segment','application_date'], how='left')
del agg_name
train.drop(['branch_id','state','zone','case_count'], axis=1, inplace=True)
train = train.rename(columns={'SA_case_count_sum': 'case_count'})
train.drop_duplicates(keep='first', inplace=True)
train = train.sort_values('application_date').reset_index(drop = True)
# df = train.append(test, ignore_index=True,sort=False)

train['train_or_test'] = 'train'
test['train_or_test'] = 'test'
df = pd.concat([train,test], sort=False)
print('Combined df shape:{}'.format(df.shape))
del train, test
gc.collect()

Combined df shape:(1830, 4)


125

In [36]:
df.head()

Unnamed: 0,application_date,segment,case_count,train_or_test
0,2017-04-01,1,299.0,train
1,2017-04-01,2,897.0,train
2,2017-04-02,2,605.0,train
3,2017-04-03,1,42.0,train
4,2017-04-03,2,2016.0,train


In [37]:
# Extracting application_date features
df['dayofmonth'] = df.application_date.dt.day
df['dayofyear'] = df.application_date.dt.dayofyear
df['dayofweek'] = df.application_date.dt.dayofweek
df['month'] = df.application_date.dt.month
df['year'] = df.application_date.dt.year
df['weekofyear'] = df.application_date.dt.weekofyear
df['is_month_start'] = (df.application_date.dt.is_month_start).astype(int)
df['is_month_end'] = (df.application_date.dt.is_month_end).astype(int)
df.head()

Unnamed: 0,application_date,segment,case_count,train_or_test,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2017-04-01,1,299.0,train,1,91,5,4,2017,13,1,0
1,2017-04-01,2,897.0,train,1,91,5,4,2017,13,1,0
2,2017-04-02,2,605.0,train,2,92,6,4,2017,13,0,0
3,2017-04-03,1,42.0,train,3,93,0,4,2017,14,0,0
4,2017-04-03,2,2016.0,train,3,93,0,4,2017,14,0,0


In [38]:
df.sort_values(by=['segment','application_date'], axis=0, inplace=True)
df.head(2)

Unnamed: 0,application_date,segment,case_count,train_or_test,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2017-04-01,1,299.0,train,1,91,5,4,2017,13,1,0
3,2017-04-03,1,42.0,train,3,93,0,4,2017,14,0,0


In [39]:
# Features constructed from previous case_count values

# Creating case_count lag features
def create_case_count_lag_feats(df, gpby_cols, target_col, lags):
    gpby = df.groupby(gpby_cols)
    for i in lags:
        df['_'.join([target_col, 'lag', str(i)])] = \
                gpby[target_col].shift(i).values + np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating case_count rolling mean features
def create_case_count_rmean_feats(df, gpby_cols, target_col, windows, min_periods=2, 
                             shift=1, win_type=None):
    gpby = df.groupby(gpby_cols)
    for w in windows:
        df['_'.join([target_col, 'rmean',str(shift), str(w)])] = \
            gpby[target_col].shift(shift).rolling(window=w, 
                                                  min_periods=min_periods,
                                                  win_type=win_type).mean().values + np.random.normal(scale=1.6, size=(len(df),))
    return df

# Creating case_count exponentially weighted mean features
def create_case_count_ewm_feats(df, gpby_cols, target_col, alpha=[0.9], shift=[1]):
    gpby = df.groupby(gpby_cols)
    for a in alpha:
        for s in shift:
            df['_'.join([target_col, 'lag', str(s), 'ewm', str(a)])] = \
                gpby[target_col].shift(s).ewm(alpha=a).mean().values + np.random.normal(scale=1.6, size=(len(df),))
    return df

In [40]:
# Converting case_count to log(1+case_count)
df['case_count'] = np.log1p(df.case_count.values)
df.head(2)

Unnamed: 0,application_date,segment,case_count,train_or_test,dayofmonth,dayofyear,dayofweek,month,year,weekofyear,is_month_start,is_month_end
0,2017-04-01,1,5.703782,train,1,91,5,4,2017,13,1,0
3,2017-04-03,1,3.7612,train,3,93,0,4,2017,14,0,0


In [41]:
# Time-based Validation set

# For validation to keep months also identical to test set we can choose period (same of 2018) as the validation set.

masked_series = (df['application_date'] >= '2018-07-06') & (df['application_date'] <= '2018-10-24')
masked_series2 = (df['application_date'] < '2018-07-06') & (df['application_date'] > '2018-10-24')
df.loc[(masked_series), 'train_or_test'] = 'val'
df.loc[(masked_series2), 'train_or_test'] = 'no_train'
print('Train shape: {}'.format(df.loc[df.train_or_test=='train',:].shape))
print('Validation shape: {}'.format(df.loc[df.train_or_test=='val',:].shape))
print('No train shape: {}'.format(df.loc[df.train_or_test=='no_train',:].shape))
print('Test shape: {}'.format(df.loc[df.train_or_test=='test',:].shape))

Train shape: (1428, 12)
Validation shape: (222, 12)
No train shape: (0, 12)
Test shape: (180, 12)


In [42]:
# Model Validation

# Converting case_count of validation period to nan so as to resemble test period
train = df.loc[df.train_or_test.isin(['train','val']), :]
Y_val = train.loc[train.train_or_test=='val', 'case_count'].values.reshape((-1))
Y_train = train.loc[train.train_or_test=='train', 'case_count'].values.reshape((-1))
train.loc[train.train_or_test=='val', 'case_count'] = np.nan

# # Creating case_count lag, rolling mean, rolling median, ohe features of the above train set
train = create_case_count_lag_feats(train, gpby_cols=['segment'], target_col='case_count', 
                                    lags=[91,98,105,112,119,126,182,364,546,728])

train = create_case_count_rmean_feats(train, gpby_cols=['segment'], 
                                 target_col='case_count', windows=[364,546], 
                                 min_periods=10, win_type='triang')

train = create_case_count_ewm_feats(train, gpby_cols=['segment'], 
                               target_col='case_count', 
                               alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5], 
                               shift=[91,98,105,112,119,126,182,364,546,728])

# Final train and val datasets
val = train.loc[train.train_or_test=='val', :]
train = train.loc[train.train_or_test=='train', :]
print('Train shape:{}, Val shape:{}'.format(train.shape, val.shape))

Train shape:(1428, 84), Val shape:(222, 84)


In [43]:
avoid_cols = ['application_date', 'case_count', 'train_or_test', 'id', 'year','is_month_start']
cols = [col for col in train.columns if col not in avoid_cols]
print('No of training features: {} \nAnd they are:{}'.format(len(cols), cols))

No of training features: 79 
And they are:['segment', 'dayofmonth', 'dayofyear', 'dayofweek', 'month', 'weekofyear', 'is_month_end', 'case_count_lag_91', 'case_count_lag_98', 'case_count_lag_105', 'case_count_lag_112', 'case_count_lag_119', 'case_count_lag_126', 'case_count_lag_182', 'case_count_lag_364', 'case_count_lag_546', 'case_count_lag_728', 'case_count_rmean_1_364', 'case_count_rmean_1_546', 'case_count_lag_91_ewm_0.95', 'case_count_lag_98_ewm_0.95', 'case_count_lag_105_ewm_0.95', 'case_count_lag_112_ewm_0.95', 'case_count_lag_119_ewm_0.95', 'case_count_lag_126_ewm_0.95', 'case_count_lag_182_ewm_0.95', 'case_count_lag_364_ewm_0.95', 'case_count_lag_546_ewm_0.95', 'case_count_lag_728_ewm_0.95', 'case_count_lag_91_ewm_0.9', 'case_count_lag_98_ewm_0.9', 'case_count_lag_105_ewm_0.9', 'case_count_lag_112_ewm_0.9', 'case_count_lag_119_ewm_0.9', 'case_count_lag_126_ewm_0.9', 'case_count_lag_182_ewm_0.9', 'case_count_lag_364_ewm_0.9', 'case_count_lag_546_ewm_0.9', 'case_count_lag_728_e

In [44]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [45]:
train_x=train[cols]
test_x=val[cols]
train_y=Y_train
test_y=Y_val
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1428, 79), (222, 79), (1428,), (222,))

In [46]:
train_x.fillna(0,inplace=True)
test_x.fillna(0,inplace=True)

In [47]:
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
m=RandomForestRegressor(n_estimators=100,random_state=0,max_depth= 10,criterion='mae',n_jobs=-1,oob_score= True, min_samples_leaf = 13)
m.fit(train_x,train_y)
preds=m.predict(test_x)
predrf=np.expm1(preds)
print("MAPE: ",(mean_absolute_percentage_error(np.expm1(test_y),predrf)))

MAPE:  48.93419333450568


In [48]:
# Creating case_count lag, rolling mean, rolling median, ohe features of the above train set
df_whole = create_case_count_lag_feats(df, gpby_cols=['segment'], target_col='case_count', 
                                  lags=[91,98,105,112,119,126,182,364,546,728])

df_whole = create_case_count_rmean_feats(df_whole, gpby_cols=['segment'], 
                                    target_col='case_count', windows=[364,546], 
                                    min_periods=10, win_type='triang')

df_whole = create_case_count_ewm_feats(df_whole, gpby_cols=['segment'], target_col='case_count', 
                                  alpha=[0.95, 0.9, 0.8, 0.7, 0.6, 0.5], 
                               shift=[91,98,105,112,119,126,182,364,546,728])

# Final train and test datasets
test = df_whole.loc[df_whole.train_or_test=='test', :]
train = df_whole.loc[~(df_whole.train_or_test=='test'), :]
print('Train shape:{}, Test shape:{}'.format(train.shape, test.shape))

Train shape:(1650, 84), Test shape:(180, 84)


In [50]:
train.shape, test.shape, train['case_count'].shape

((1650, 84), (180, 84), (1650,))

In [51]:
train_segment_1=train[train.segment==1]
train_segment_1 = train_segment_1.sort_values('application_date').reset_index(drop = True)
del train_segment_1['segment']

test_segment_1=test[test.segment==1]
test_segment_1 = test_segment_1.sort_values('application_date').reset_index(drop = True)
del test_segment_1['segment']

train_segment_2=train[train.segment==2]
train_segment_2 = train_segment_2.sort_values('application_date').reset_index(drop = True)
del train_segment_2['segment']

test_segment_2=test[test.segment==2]
test_segment_2 = test_segment_2.sort_values('application_date').reset_index(drop = True)
del test_segment_2['segment']

In [52]:
avoid_cols = ['application_date', 'case_count', 'train_or_test', 'id', 'year','is_month_start','segment']
cols = [col for col in train.columns if col not in avoid_cols]

In [53]:
train_x_segment_1=train_segment_1[cols]
test_x_segment_1=test_segment_1[cols]
train_y_segment_1=train_segment_1['case_count']
train_x_segment_1.shape, test_x_segment_1.shape, train_y_segment_1.shape

((806, 78), (87, 78), (806,))

In [54]:
train_x_segment_2=train_segment_2[cols]
test_x_segment_2=test_segment_2[cols]
train_y_segment_2=train_segment_2['case_count']
train_x_segment_2.shape, test_x_segment_2.shape, train_y_segment_2.shape

((844, 78), (93, 78), (844,))

In [55]:
train_x_segment_1.fillna(0,inplace=True)
test_x_segment_1.fillna(0,inplace=True)
train_x_segment_2.fillna(0,inplace=True)
test_x_segment_2.fillna(0,inplace=True)

In [261]:
%%time

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 19)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 100, num = 100)]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap
              }

rf = RandomForestRegressor(criterion='mae',oob_score= True)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 300, cv = 10, verbose=2, random_state=0, n_jobs = -1)
rf_random.fit(train_x_segment_1,train_y_segment_1)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 26.7min finished


Wall time: 26min 48s


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                             

In [262]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 13,
 'max_features': 'auto',
 'max_depth': 20,
 'bootstrap': True}

In [263]:
%%time

from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 19)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [int(x) for x in np.linspace(start = 1, stop = 100, num = 100)]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap
              }

rf = RandomForestRegressor(criterion='mae',oob_score= True)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 300, cv = 10, verbose=2, random_state=0, n_jobs = -1)
rf_random.fit(train_x_segment_2,train_y_segment_2)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed: 17.9min finished


Wall time: 17min 53s


RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                             

In [264]:
rf_random.best_params_

{'n_estimators': 50,
 'min_samples_split': 5,
 'min_samples_leaf': 3,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [56]:
m1=RandomForestRegressor(n_estimators=1000,min_samples_split=5,min_samples_leaf = 13,max_depth= 20,n_jobs=-1,random_state=0,criterion='mae',oob_score= True)
m2=RandomForestRegressor(n_estimators=1000,min_samples_split=5,min_samples_leaf = 3,max_depth= 100,n_jobs=-1,random_state=0,criterion='mae',oob_score= True)

m1.fit(train_x_segment_1,train_y_segment_1)
predrf_segment_1=m1.predict(test_x_segment_1)

m2.fit(train_x_segment_2,train_y_segment_2)
predrf_segment_2=m2.predict(test_x_segment_2)

predrf_segment_1.shape, predrf_segment_2.shape

((87,), (93,))

In [57]:
predrf = np.concatenate((predrf_segment_1, predrf_segment_2), axis=None)
predrf.shape

(180,)

In [59]:
submission['case_count'] = np.expm1(predrf)
submission.to_csv('rf_V32.csv', index=False)