# Random Forest improved

### To Do:
- improve performance  
- add features from feature team  
- use proper clusters  
- include (all) zero data in training data  
- normalize promotion dates

In [1]:
# change used width of browser window
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
# import packages
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.holtwinters import ExponentialSmoothing,SimpleExpSmoothing, Holt
import datetime
import re
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from scipy.interpolate import interpn
from collections import defaultdict

In [3]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [4]:
# load data
df_train = pd.read_csv('data/orders0206_train.csv', sep='|', parse_dates=['time'])
df_train['date'] = [d.date() for d in df_train['time']]
df_test = pd.read_csv('data/orders0206_test.csv', sep='|', parse_dates=['time'])
df_test['date'] = [d.date() for d in df_test['time']]
df_items = pd.read_csv('data/items.csv', sep='|')

# Helper Functions

In [5]:
def add_datepart(df, fldname, drop=True):
#     https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1
    """Add additional date information to dataframe"""
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, 
                                     infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 
            'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 
            'Is_year_start'):
        df[targ_pre+n] = getattr(fld.dt,n.lower())
        
    df[targ_pre+'Elapsed'] = fld.astype(np.int64) // 10**9 
    if drop: df.drop(fldname, axis=1, inplace=True)

In [6]:
def evaluate_result(y: dict, y_pred: dict):
    monetary_value = 0
    y_pred = defaultdict(int, y_pred)  # return prediction of 0 for items without prediction
    
    for item in set(y_pred).difference(set(y)):
        y[item] = 0  # make sure that all items for which a demand has been predicted are contained in the actual demands
    
    for item, demand in y.items():
        predicted_demand = y_pred[item]
        price = product_prices[item]
        monetary_value += price * min(demand, predicted_demand)
        if predicted_demand > demand:
            monetary_value -= .6 * price * (predicted_demand - demand)
            
    return monetary_value

In [7]:
def table2lags(table, max_lag, min_lag=0, separator='_'):
#     https://datascience.stackexchange.com/questions/24108/multiple-time-series-predictions-with-random-forests-in-python
    """ Given a dataframe, return a dataframe with different lags of all its columns """
    values=[]
    for i in range(min_lag, max_lag + 1):
        values.append(table.shift(i).copy())
        values[-1].columns = [i for c in table.columns]#[c + separator + str(i) for c in table.columns]
    return pd.concat(values, axis=1)

In [8]:
def density_scatter( x , y, ax = None, sort = True, bins = 20, **kwargs )   :
    """
    Scatter plot colored by 2d histogram
    """
    if ax is None :
        fig , ax = plt.subplots()
    data , x_e, y_e = np.histogram2d( x, y, bins = bins)
    z = interpn( ( 0.5*(x_e[1:] + x_e[:-1]) , 0.5*(y_e[1:]+y_e[:-1]) ) , data , np.vstack([x,y]).T , method = "splinef2d", bounds_error = False )

    # Sort the points by density, so that the densest points are plotted last
    if sort :
        idx = z.argsort()
        x, y, z = x[idx], y[idx], z[idx]

    ax.scatter( x, y, c=z, **kwargs )
    return ax


In [9]:
from sklearn.model_selection import RandomizedSearchCV
def get_best_settings(mode, df):
    if mode == 'small':
        # parameter optimization

        # Number of trees in random forest
        n_estimators = [int(x) for x in np.linspace(start = 5, stop = 200, num = 10)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(3, 55, num = 11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = range(2,20)#, 7, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = range(2,20)# 6, 8]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'bootstrap': bootstrap}
    else:
        # Number of trees in random forest
        n_estimators = [int(x) for x in np.linspace(start = 10, stop = 400, num = 20)]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(5, 55, num = 11)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4, 6, 8]
        # Method of selecting samples for training each tree
        bootstrap = [True, False]
        # Create the random grid
        random_grid = {'n_estimators': n_estimators,
                       'max_features': max_features,
                       'max_depth': max_depth,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'bootstrap': bootstrap}

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    # Fit the random search model
    rf_random.fit(df.drop(['count', 'itemID',"WeekGroup"], axis=1), df["count"])

    return rf_random.best_params_

# Time Based Features
https://towardsdatascience.com/multivariate-time-series-forecasting-using-random-forest-2372f3ecbad1

## 1. Prepare train data

In [10]:
train = df_train
test = df_test

In [11]:
test.order.unique()

array([ 1,  4,  2,  3,  7, 10,  8,  6,  5, 15,  9, 60], dtype=int64)

In [12]:
# aggregate sales per day
train_aggregated = train.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})
test_aggregated = test.groupby(['itemID','date']).sum()["order"].to_frame().reset_index().rename(columns={'order':'count'})

In [13]:
# add additional date information
add_datepart(train_aggregated, 'date', drop = False)
add_datepart(test_aggregated, 'date', drop = False)

In [14]:
train_aggregated["WeekGroup"] = train_aggregated["Dayofyear"] // 14
test_aggregated["WeekGroup"] = test_aggregated["Dayofyear"] // 14

In [15]:
train_aggregated.head(5)

Unnamed: 0,itemID,date,count,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,WeekGroup
0,1,2018-01-23,1,2018,1,4,23,1,23,False,False,False,False,False,False,1516665600,1
1,1,2018-01-25,1,2018,1,4,25,3,25,False,False,False,False,False,False,1516838400,1
2,1,2018-01-29,307,2018,1,5,29,0,29,False,False,False,False,False,False,1517184000,2
3,1,2018-01-30,3,2018,1,5,30,1,30,False,False,False,False,False,False,1517270400,2
4,1,2018-01-31,1,2018,1,5,31,2,31,True,False,False,False,False,False,1517356800,2


In [16]:
# aggregate sales  for 14 days
train_aggregated_w = train_aggregated.groupby(['itemID', "WeekGroup"]).sum()["count"].to_frame().reset_index()#.rename(columns={'order':'count'})
# test_aggregated_w = test_aggregated.groupby(['itemID', "WeekGroup"]).sum()["count"].to_frame().reset_index()#.rename(columns={'order':'count'})

In [17]:
# scale data between -1 & 1
scaler = MinMaxScaler()
scaler.fit(train_aggregated_w[['count']])
train_aggregated_w["count"] = pd.DataFrame(scaler.transform(train_aggregated_w[["count"]]))[0]
# test_aggregated_w["count"] = pd.DataFrame(scaler.transform(test_aggregated_w[['count']]))[0]

In [18]:
# add Is_month_end Is_month_start Is_quarter_end Is_quarter_start back
df_weekgroup = pd.DataFrame(train_aggregated.WeekGroup.unique()).rename(columns={0:'WeekGroup'})
df_weekgroup["Is_month_end"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_month_end.unique() else 0)
df_weekgroup["Is_month_start"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_month_start.unique() else 0)
df_weekgroup["Is_quarter_end"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_quarter_end.unique() else 0)
df_weekgroup["Is_quarter_start"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_quarter_start.unique() else 0)
df_weekgroup = df_weekgroup.set_index("WeekGroup")
df_weekgroup.head(3)

Unnamed: 0_level_0,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start
WeekGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0,0,0
2,1,1,0,0
3,0,0,0,0


In [19]:
train_aggregated_w = train_aggregated_w.join(df_weekgroup, on='WeekGroup')
train_aggregated_w.head(3)

Unnamed: 0,itemID,WeekGroup,count,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start
0,1,1,0.000207,0,0,0,0
1,1,2,0.06483,1,1,0,0
2,1,3,0.007042,0,0,0,0


In [20]:
# Removing trends in input variables / turn time series into stationary one
# tbd if needed

In [21]:
# add lagged variables
for prod in train_aggregated_w.itemID.unique():
    for index, row in train_aggregated_w.loc[train_aggregated_w['itemID'] == prod].iterrows():
        for n in range(1,6):
            diff_count = train_aggregated_w[train_aggregated_w['itemID'] == prod][train_aggregated_w['WeekGroup'] == row.WeekGroup-n]["count"]
            train_aggregated_w.loc[index,'count_diff' + str(n)] = diff_count.iloc[0] if len(diff_count) == 1 else None #train_aggregated_w.loc[train_aggregated_w['itemID'] == prod]['count'] - train_aggregated_w.loc[train_aggregated_w['itemID'] == prod]['count'].shift(n)

In [22]:
train_aggregated_w.head(5)

Unnamed: 0,itemID,WeekGroup,count,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,count_diff1,count_diff2,count_diff3,count_diff4,count_diff5
0,1,1,0.000207,0,0,0,0,,,,,
1,1,2,0.06483,1,1,0,0,0.000207,,,,
2,1,3,0.007042,0,0,0,0,0.06483,0.000207,,,
3,1,4,0.000207,1,1,0,0,0.007042,0.06483,0.000207,,
4,1,5,0.0,0,0,0,0,0.000207,0.007042,0.06483,0.000207,


In [23]:
# drop entries where lagged variables are missing (beginning of time series)
train_aggregated_w = train_aggregated_w.dropna()

In [24]:
# prepare lagged variables in test data
test_aggregated_w = pd.DataFrame(columns= ["itemID", "WeekGroup", "count"])
test_aggregated_w.itemID = df_items.itemID.unique()
test_aggregated_w.WeekGroup = train_aggregated_w.WeekGroup.max() + 1
for prod in test_aggregated_w.itemID.unique():
    for n in range(1,6):
        diff_count = train_aggregated_w[train_aggregated_w['itemID'] == prod][train_aggregated_w['WeekGroup'] == test_aggregated_w.WeekGroup.max()-n]["count"]
        test_aggregated_w.loc[test_aggregated_w['itemID'] == prod,'count_diff' + str(n)] = diff_count.iloc[0] if len(diff_count) == 1 else 0

In [25]:
df_weekgroup = pd.DataFrame(test_aggregated_w.WeekGroup.unique()).rename(columns={0:'WeekGroup'})
df_weekgroup["Is_month_end"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_month_end.unique() else 0)
df_weekgroup["Is_month_start"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_month_start.unique() else 0)
df_weekgroup["Is_quarter_end"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_quarter_end.unique() else 0)
df_weekgroup["Is_quarter_start"] = df_weekgroup.WeekGroup.apply(lambda x: 1 if True in train_aggregated[train_aggregated["WeekGroup"] == x].Is_quarter_start.unique() else 0)
df_weekgroup = df_weekgroup.set_index("WeekGroup")
df_weekgroup.head(3)

Unnamed: 0_level_0,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start
WeekGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11,0,0,0,0


In [26]:
test_aggregated_w = test_aggregated_w.join(df_weekgroup, on='WeekGroup')

In [27]:
test_aggregated_w.head(5)

Unnamed: 0,itemID,WeekGroup,count,count_diff1,count_diff2,count_diff3,count_diff4,count_diff5,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start
0,1,11,,0.006007,0.000621,0.061723,0.000207,0.0,0,0,0,0
1,2,11,,0.0,0.0,0.0,0.0,0.0,0,0,0,0
2,3,11,,0.0,0.0,0.0,0.0,0.0,0,0,0,0
3,4,11,,0.0,0.0,0.0,0.0,0.0,0,0,0,0
4,5,11,,0.0,0.0,0.0,0.0,0.0,0,0,0,0


In [97]:
# actual demand
y = df_test.groupby(by='itemID')['order'].sum().to_dict()

# baseline 1 (average demand of previous 14 days)
y_baseline1 = df_train[df_train['time'] >= '2018-05-19'].groupby(by='itemID')['order'].sum().to_dict()

# baseline 2 (average demand of previous half year)
total_orders = df_train.groupby(by='itemID')['order'].sum().to_dict()
total_observed_days = (df_train['time'].dt.normalize().max() - df_train['time'].dt.normalize().min()).days
y_baseline2 = {item: orders / total_observed_days * 14 for item, orders in total_orders.items()}  # 14-day avg. demand

df_info = pd.read_csv('data/infos.csv', sep='|', index_col='itemID')
df_items = pd.read_csv('data/items.csv', sep='|', index_col='itemID')
product_prices = df_info['simulationPrice'].to_dict()

In [99]:
train_aggregated_w = train_aggregated_w.join(df_info["simulationPrice"], on="itemID")
train_aggregated_w = train_aggregated_w.join(df_items, on="itemID")

In [101]:
test_aggregated_w = test_aggregated_w.join(df_info["simulationPrice"], on="itemID")
test_aggregated_w = test_aggregated_w.join(df_items, on="itemID")

## A Time Based Features: One for All 

### 2. Parameter Tuning and Model Training
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [102]:
settings = get_best_settings('other', train_aggregated_w)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.1min finished


In [103]:
settings

{'n_estimators': 338,
 'min_samples_split': 2,
 'min_samples_leaf': 8,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': True}

In [104]:
# train model for all items
rf = RandomForestRegressor(**settings).fit(train_aggregated_w.drop(['count', 'itemID',"WeekGroup"], axis=1), train_aggregated_w["count"])

## Apply to unseen data

In [105]:
# t = pd.DataFrame(rf.predict(test_aggregated_w[test_aggregated_w.itemID == prod].drop(['count', 'itemID',"WeekGroup"], axis=1))).rename(columns={0:'count'})
# # test[test.itemID == prod]["order"].sum()
# scaler.inverse_transform(t)[:, [0]]
# t                       

In [106]:
# apply to unseen data
y_randomforest = dict()
for prod in test_aggregated_w.itemID.unique():
#     train_predict = pd.DataFrame(pd.DataFrame(rf.predict(test_aggregated_w[test_aggregated_w.itemID == prod].drop(['count', 'itemID',"WeekGroup"], axis=1)))).rename(columns={0:'predicted_count'})
    train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(rf.predict(test_aggregated_w[test_aggregated_w.itemID == prod].drop(['count', 'itemID',"WeekGroup"], axis=1))))).rename(columns={0:'predicted_count'})
    train_predict["actual_count"] = test[test.itemID == prod]["order"].sum()
    # rescale
#     scaler.inverse_transform
    if prod in train_aggregated_w.itemID.unique():
        y_randomforest[prod] = int(train_predict["predicted_count"].sum().round())
    else:
        y_randomforest[prod] = 0

  

In [107]:
# perfect result
print(f'Perfect Result: {evaluate_result(y, y):.2f}')

# baseline 1
print(f'Baseline 1: {evaluate_result(y, y_baseline1):.2f}')

# baseline 2
print(f'Baseline 2: {evaluate_result(y, y_baseline2):.2f}')

# random forest
print(f'Random Forest: {evaluate_result(y, y_randomforest):.2f}')
# mit scaler: -235623.76

Perfect Result: 7895975.87
Baseline 1: -3727365.60
Baseline 2: -1672504.21
Random Forest: -414600.38


# Time Based Features: One for Each Dummy Cluster

In [108]:
df_cluster = pd.DataFrame(train_aggregated_w.itemID.unique()).rename(columns={0:"itemID"})
for index, row in df_cluster.iterrows():
    # add number of days with sales
    saledays = train_aggregated[train_aggregated.itemID == row.itemID]
    df_cluster.loc[index,'cluster'] = 1 if len(saledays) < 5 else 2 if len(saledays) < 20  else 4 if len(saledays) < 50 else 5
df_cluster

Unnamed: 0,itemID,cluster
0,1,4.0
1,10,2.0
2,11,4.0
3,18,4.0
4,19,2.0
...,...,...
948,9422,2.0
949,9512,2.0
950,9602,2.0
951,9661,2.0


In [109]:
# train_aggregated_w = train_aggregated_w.drop(columns="cluster")
# test_aggregated_w = test_aggregated_w.drop(columns="cluster")

In [112]:
train_aggregated_w = train_aggregated_w.join(df_cluster.set_index("itemID"), on="itemID")
test_aggregated_w = test_aggregated_w.join(df_cluster.set_index("itemID"), on="itemID")

In [113]:
test_aggregated_w.cluster = test_aggregated_w.cluster.fillna(0)

In [114]:
train_aggregated_w.groupby("cluster").count()

Unnamed: 0_level_0,itemID,WeekGroup,count,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,count_diff1,count_diff2,count_diff3,count_diff4,count_diff5,simulationPrice,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2.0,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513,513
4.0,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509,1509
5.0,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480,480


In [115]:
test_aggregated_w.groupby("cluster").count()

Unnamed: 0_level_0,itemID,WeekGroup,count,count_diff1,count_diff2,count_diff3,count_diff4,count_diff5,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,simulationPrice,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0.0,9510,9510,0,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510,9510
2.0,316,316,0,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316,316
4.0,538,538,0,538,538,538,538,538,538,538,538,538,538,538,538,538,538,538,538,538
5.0,99,99,0,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99


In [116]:
len(test_aggregated_w.itemID.unique())

10463

In [117]:
y_randomforest = dict()
for cluster in train_aggregated_w.cluster.unique():
    rf = RandomForestRegressor(**get_best_settings('small', 
                                                   train_aggregated_w[train_aggregated_w.cluster == cluster])).fit(train_aggregated_w[train_aggregated_w.cluster == cluster].drop(['count', 'itemID',"WeekGroup"], axis=1), 
                                                                  train_aggregated_w[train_aggregated_w.cluster == cluster]["count"])
    
    for prod in test_aggregated_w[test_aggregated_w.cluster == cluster].itemID.unique():
#         train_predict = pd.DataFrame(pd.DataFrame(rf.predict(test_aggregated_w[test_aggregated_w.itemID == prod].drop(['count', 'itemID',"WeekGroup"], 
#                                                                                                                                            axis=1)))).rename(columns={0:'predicted_count'}) 
        if prod in test_aggregated_w["itemID"]:
            train_predict = pd.DataFrame(scaler.inverse_transform(pd.DataFrame(rf.predict(test_aggregated_w[test_aggregated_w.itemID == prod].drop(['count', 'itemID',"WeekGroup"], 
                                                                                                                                               axis=1))))).rename(columns={0:'predicted_count'}) 
        # rescale
    #     scaler.inverse_transform
            y_randomforest[prod] = int(train_predict["predicted_count"].sum().round())
        else:
            y_randomforest[prod] = 0
        train_predict["actual_count"] = test[test.itemID == prod]["order"].sum() if len(test[test.itemID == prod]) > 0 else 0



Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   31.5s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   55.3s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   24.6s finished


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   25.5s finished


In [118]:
# perfect result
print(f'Perfect Result: {evaluate_result(y, y):.2f}')

# baseline 1
print(f'Baseline 1: {evaluate_result(y, y_baseline1):.2f}')

# baseline 2
print(f'Baseline 2: {evaluate_result(y, y_baseline2):.2f}')

# random forest
print(f'Random Forest: {evaluate_result(y, y_randomforest):.2f}')
# -390900.26

Perfect Result: 7895975.87
Baseline 1: -3727365.60
Baseline 2: -1672504.21
Random Forest: -455423.44


# For which items did the RF perform well?

In [119]:
df_result = pd.DataFrame.from_dict(y_randomforest, orient='index')
len(df_result)
# df_result

953

In [120]:
for index, row in df_result.iterrows():
    df_result.loc[index, "cluster"] = test_aggregated_w[test_aggregated_w.itemID == index]["cluster"].iloc[0]
    df_result.loc[index, 'actual'] = y[index]
    df_result.loc[index, 'diff'] = abs(y[index] - row[0])
    df_result.loc[index, 'diff_perc'] = ((abs(row[0]-y[index])/((row[0]+y[index])/2))*100).round()

In [131]:
bins = [1, 5, 10, 15, 25, 50, 75, 100, 150, 200, 500, 1000, 10000]
display(df_result.groupby([ pd.cut(df_result['diff'], bins)])[0].count())
display(df_result.groupby([ pd.cut(df_result['diff_perc'], bins)])[0].count())
df_result.groupby([ pd.cut(df_result['diff'], bins),"cluster"])[0].count()
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_result.groupby("diff_perc").count()[[0]])

diff
(1, 5]            19
(5, 10]           85
(10, 15]          62
(15, 25]         143
(25, 50]         245
(50, 75]          80
(75, 100]         80
(100, 150]        99
(150, 200]        55
(200, 500]        64
(500, 1000]       12
(1000, 10000]      5
Name: 0, dtype: int64

diff_perc
(1, 5]             3
(5, 10]            4
(10, 15]           2
(15, 25]           6
(25, 50]          38
(50, 75]          35
(75, 100]         37
(100, 150]       108
(150, 200]       719
(200, 500]         0
(500, 1000]        0
(1000, 10000]      0
Name: 0, dtype: int64

diff           cluster
(1, 5]         2.0         17
               4.0          2
               5.0          0
(5, 10]        2.0         79
               4.0          6
               5.0          0
(10, 15]       2.0         52
               4.0         10
               5.0          0
(15, 25]       2.0         75
               4.0         67
               5.0          1
(25, 50]       2.0         61
               4.0        183
               5.0          1
(50, 75]       2.0          9
               4.0         70
               5.0          1
(75, 100]      2.0         11
               4.0         66
               5.0          3
(100, 150]     2.0          1
               4.0         89
               5.0          9
(150, 200]     2.0          4
               4.0         19
               5.0         32
(200, 500]     2.0          5
               4.0         16
               5.0         43
(500, 1000]    2.0          0
               4.0          5
               5.

In [129]:
df_result.groupby([ pd.cut(df_result['diff_perc'], bins),"cluster"])[0].count()

diff_perc      cluster
(1, 5]         2.0          0
               4.0          3
               5.0          0
(5, 10]        2.0          2
               4.0          1
               5.0          1
(10, 15]       2.0          1
               4.0          1
               5.0          0
(15, 25]       2.0          1
               4.0          4
               5.0          1
(25, 50]       2.0         19
               4.0         13
               5.0          6
(50, 75]       2.0          7
               4.0         18
               5.0         10
(75, 100]      2.0         14
               4.0         20
               5.0          3
(100, 150]     2.0         38
               4.0         58
               5.0         12
(150, 200]     2.0        234
               4.0        419
               5.0         66
(200, 500]     2.0          0
               4.0          0
               5.0          0
(500, 1000]    2.0          0
               4.0          0
               5.

In [123]:
test_aggregated_w.to_csv('lag_agg_testdata.csv', index = False)
train_aggregated_w.to_csv('lag_agg_traindata.csv', index = False)

In [132]:
import os
# delete file before we can store it 
os.remove("abraca-data.csv")
# store results
pd.DataFrame(y_randomforest.items()).rename(columns={0:'itemID', 1:"demandPrediction"}).to_csv('abraca-data.csv',index=False, sep='|')