<a href="https://colab.research.google.com/github/lennart194/thesis-code/blob/main/separate_rf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ddop

In [None]:
import pandas as pd
import numpy as np

from ddop.newsvendor import RandomForestWeightedNewsvendor
from ddop.newsvendor import SampleAverageApproximationNewsvendor

from sklearn.model_selection import RandomizedSearchCV


from ddop.metrics import make_scorer
from ddop.metrics import average_costs
from ddop.metrics import prescriptiveness_score

In [None]:
# Mount Google Drive into the Colab runtime; the CSV paths used further down
# (input data and result files) all live below /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## loading data-set


---




*   load the whole data set (`ultimative_set.csv`)
*   sort the rows first by item, then by date, and set a multi-index on (`item_id`, `date`)
**now every item's time series can be selected via the index**

In [None]:
# Read the data set, discard the stray 'Unnamed: 0' column, then order the rows
# by item and date and use that pair as a two-level row index.
data = (
    pd.read_csv('/content/drive/MyDrive/M5/ultimative_set.csv')
    .drop(columns=['Unnamed: 0'])
    .sort_values(by=['item_id', 'date'])
    .set_index(['item_id', 'date'])
)

## split in feature and target matrix


---



In [None]:
# Target: the 'demand' column. Features: every other column of `data`.
Y = data['demand']
X = data.drop('demand', axis=1)

## train_test_split


---



In [None]:
# Row selectors on the (item_id, date) index: all items, sliced by date label.
# NOTE(review): .loc label slices include both endpoints and "2015" is the upper
# bound of the training slice as well as the lower bound of the test slice.
# Whether any row can match both depends on how the `date` level is stored --
# TODO confirm that the two sets are disjoint.
rows = pd.IndexSlice
train_rows = rows[:, :"2015"]
test_rows = rows[:, "2015":'2017']

X_train, Y_train = X.loc[train_rows, :], Y.loc[train_rows]
X_test, Y_test = X.loc[test_rows, :], Y.loc[test_rows]

## defining the cost parameters for each service level and the product range



---



*   cu is the mean of the 'sell_price' column over the whole training data
*   depending on the corresponding service level α, co is calculated as (cu - α * cu) / α
*   all the costs are rounded to 2 digits

In [None]:
# Item ids processed in the loops below: 0, 1, ..., 24.
prods = list(range(25))

# Underage cost: mean of 'sell_price' over all training rows, rounded to 2 digits.
cu = round(X_train['sell_price'].mean(), 2)

# Overage cost for each service level alpha: (cu - alpha * cu) / alpha,
# rounded to 2 digits.
co_90, co_75, co_50 = (
    round((cu - (alpha * cu)) / alpha, 2) for alpha in (0.90, 0.75, 0.50)
)

## determining some parameters


---


*   the candidate ranges of the hyperparameters for the randomized search are defined
*   they are collected in `param_random`, and the scorer (based on the average costs) is defined

In [None]:
# Candidate values for the randomized search (np.arange excludes the stop value).
max_depth = np.arange(7, 24)          # 7 .. 23
n_estimators = np.arange(65, 101)     # 65 .. 100
min_samples_leaf = np.arange(2, 6)    # 2 .. 5
param_random = {
    'max_depth': max_depth,
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
}

# Average costs are a loss (lower is better), hence greater_is_better=False.
scorer_avc = make_scorer(average_costs, greater_is_better=False)

# rf for 90% service level


---



## randomized search
*   an empty list is created for every type of hyperparameter
*   in a for-loop the randomized search is executed for every item with cu and co of alpha = 0.9
*   the best hyperparams are calculated and added to the corresponding list

In [None]:
# Per-item hyperparameter search for the 90% service level (cu, co_90).
# The three lists are filled in the order of `prods`: entry i belongs to prods[i].
best_param_maxd_90 = []
best_param_nest_90 = []
best_param_minsl_90 = []

for prod in prods:
  rf = RandomForestWeightedNewsvendor(cu = cu, co = co_90, random_state = 42, criterion = 'mse')

  # random_state fixes which candidates are drawn from param_random, so that a
  # fresh run of the notebook selects the same hyperparameters again.
  # NOTE(review): cv = 5 relies on scikit-learn's default splitter for an integer
  # cv; confirm that it is appropriate for these date-ordered series.
  search = RandomizedSearchCV(rf, param_random, cv = 5, scoring = scorer_avc, random_state = 42)
  search.fit(X_train.loc[prod], Y_train.loc[prod])

  # Direct key access: a missing key should fail loudly instead of storing None.
  best_param_maxd_90.append(search.best_params_['max_depth'])
  best_param_nest_90.append(search.best_params_['n_estimators'])
  best_param_minsl_90.append(search.best_params_['min_samples_leaf'])

## newsvendor-model


---


*   the data-driven newsvendor model is executed in a loop for every single item (with optimal hyperparameters)
*   the average_costs for every item are calculated (later the prescriptiveness score is calculated)
*   save the results as pandas frame



In [None]:
# Out-of-sample average costs per item at the 90% service level.
result_rf_90_avc = []

# The tuned hyperparameters were appended in the order of `prods`, so they are
# looked up by position in that list rather than by the item id itself.
for pos, prod in enumerate(prods):
  rf = RandomForestWeightedNewsvendor(
      cu = cu, co = co_90, random_state = 42, criterion = 'mse',
      max_depth = best_param_maxd_90[pos],
      n_estimators = best_param_nest_90[pos],
      min_samples_leaf = best_param_minsl_90[pos])
  rf.fit(X_train.loc[prod], Y_train.loc[prod])
  preds = rf.predict(X_test.loc[prod])

  # Average costs on the item's test rows, rounded to 2 digits.
  avc = round(average_costs(Y_test.loc[prod], preds, cu = cu, co = co_90), 2)
  result_rf_90_avc.append(avc)

In [None]:
# One-column frame of the per-item average costs (90% service level); the
# default 0..n-1 row index is labelled 'item_id'.
RF_separate_avc_90 = pd.DataFrame({'RF_avc_90': result_rf_90_avc}).rename_axis('item_id')

In [None]:
# Prescriptiveness score per item at the 90% service level: the random forest
# decisions are scored against the SAA decisions on the same test rows.
result_rf_90_pscr = []

# The tuned hyperparameters were appended in the order of `prods`, so they are
# looked up by position in that list rather than by the item id itself.
for pos, prod in enumerate(prods):
  y_test_prod = Y_test.loc[prod]

  rf = RandomForestWeightedNewsvendor(
      cu = cu, co = co_90, random_state = 42, criterion = 'mse',
      max_depth = best_param_maxd_90[pos],
      n_estimators = best_param_nest_90[pos],
      min_samples_leaf = best_param_minsl_90[pos])
  rf.fit(X_train.loc[prod], Y_train.loc[prod])
  preds_rf = rf.predict(X_test.loc[prod])

  # SAA is fitted on the training demand only; request exactly one decision per
  # test row instead of a hard-coded number of steps.
  saa = SampleAverageApproximationNewsvendor(cu = cu, co = co_90)
  saa.fit(Y_train.loc[prod])
  preds_saa = saa.predict(n_steps = len(y_test_prod))

  pscr = prescriptiveness_score(y_test_prod, preds_rf, preds_saa, cu = cu, co = co_90)
  result_rf_90_pscr.append(pscr)

In [None]:
# One-column frame of the per-item prescriptiveness scores (90% service level);
# the default 0..n-1 row index is labelled 'item_id'.
RF_separate_pscr_90 = pd.DataFrame({'RF_pscr_90': result_rf_90_pscr}).rename_axis('item_id')

# rf for 75% service level


---



## randomized search
*   an empty list is created for every type of hyperparameter
*   in a for-loop the randomized search is executed for every item with cu and co of alpha = 0.75
*   the best hyperparams are calculated and added to the corresponding list

In [None]:
# Per-item hyperparameter search for the 75% service level (cu, co_75).
# The three lists are filled in the order of `prods`: entry i belongs to prods[i].
best_param_maxd_75 = []
best_param_nest_75 = []
best_param_minsl_75 = []

for prod in prods:
  rf = RandomForestWeightedNewsvendor(cu = cu, co = co_75, random_state = 42, criterion = 'mse')

  # random_state fixes which candidates are drawn from param_random, so that a
  # fresh run of the notebook selects the same hyperparameters again.
  # NOTE(review): cv = 5 relies on scikit-learn's default splitter for an integer
  # cv; confirm that it is appropriate for these date-ordered series.
  search = RandomizedSearchCV(rf, param_random, cv = 5, scoring = scorer_avc, random_state = 42)
  search.fit(X_train.loc[prod], Y_train.loc[prod])

  # Direct key access: a missing key should fail loudly instead of storing None.
  best_param_maxd_75.append(search.best_params_['max_depth'])
  best_param_nest_75.append(search.best_params_['n_estimators'])
  best_param_minsl_75.append(search.best_params_['min_samples_leaf'])

## newsvendor-model


---


*   the data-driven newsvendor model is executed in a loop for every single item (with optimal hyperparameters)
*   the average_costs for every item are calculated (later the prescriptiveness score is calculated)
*   save the results as pandas frame

In [None]:
# Out-of-sample average costs per item at the 75% service level.
result_rf_75_avc = []

# The tuned hyperparameters were appended in the order of `prods`, so they are
# looked up by position in that list rather than by the item id itself.
for pos, prod in enumerate(prods):
  rf = RandomForestWeightedNewsvendor(
      cu = cu, co = co_75, random_state = 42, criterion = 'mse',
      max_depth = best_param_maxd_75[pos],
      n_estimators = best_param_nest_75[pos],
      min_samples_leaf = best_param_minsl_75[pos])
  rf.fit(X_train.loc[prod], Y_train.loc[prod])
  preds = rf.predict(X_test.loc[prod])

  # Average costs on the item's test rows, rounded to 2 digits.
  avc = round(average_costs(Y_test.loc[prod], preds, cu = cu, co = co_75), 2)
  result_rf_75_avc.append(avc)

In [None]:
# One-column frame of the per-item average costs (75% service level); the
# default 0..n-1 row index is labelled 'item_id'.
RF_separate_avc_75 = pd.DataFrame({'RF_avc_75': result_rf_75_avc}).rename_axis('item_id')

In [None]:
# Prescriptiveness score per item at the 75% service level: the random forest
# decisions are scored against the SAA decisions on the same test rows.
result_rf_75_pscr = []

# The tuned hyperparameters were appended in the order of `prods`, so they are
# looked up by position in that list rather than by the item id itself.
for pos, prod in enumerate(prods):
  y_test_prod = Y_test.loc[prod]

  rf = RandomForestWeightedNewsvendor(
      cu = cu, co = co_75, random_state = 42, criterion = 'mse',
      max_depth = best_param_maxd_75[pos],
      n_estimators = best_param_nest_75[pos],
      min_samples_leaf = best_param_minsl_75[pos])
  rf.fit(X_train.loc[prod], Y_train.loc[prod])
  preds_rf = rf.predict(X_test.loc[prod])

  # SAA is fitted on the training demand only; request exactly one decision per
  # test row instead of a hard-coded number of steps.
  saa = SampleAverageApproximationNewsvendor(cu = cu, co = co_75)
  saa.fit(Y_train.loc[prod])
  preds_saa = saa.predict(n_steps = len(y_test_prod))

  pscr = prescriptiveness_score(y_test_prod, preds_rf, preds_saa, cu = cu, co = co_75)
  result_rf_75_pscr.append(pscr)

In [None]:
# One-column frame of the per-item prescriptiveness scores (75% service level);
# the default 0..n-1 row index is labelled 'item_id'.
RF_separate_pscr_75 = pd.DataFrame({'RF_pscr_75': result_rf_75_pscr}).rename_axis('item_id')

# rf for 50% service level



---





## randomized search
*   an empty list is created for every type of hyperparameter
*   in a for-loop the randomized search is executed for every item with cu and co of alpha = 0.5
*   the best hyperparams are calculated and added to the corresponding list

In [None]:
# Per-item hyperparameter search for the 50% service level (cu, co_50).
# The three lists are filled in the order of `prods`: entry i belongs to prods[i].
best_param_maxd_50 = []
best_param_nest_50 = []
best_param_minsl_50 = []

for prod in prods:
  rf = RandomForestWeightedNewsvendor(cu = cu, co = co_50, random_state = 42, criterion = 'mse')

  # random_state fixes which candidates are drawn from param_random, so that a
  # fresh run of the notebook selects the same hyperparameters again.
  # NOTE(review): cv = 5 relies on scikit-learn's default splitter for an integer
  # cv; confirm that it is appropriate for these date-ordered series.
  search = RandomizedSearchCV(rf, param_random, cv = 5, scoring = scorer_avc, random_state = 42)
  search.fit(X_train.loc[prod], Y_train.loc[prod])

  # Direct key access: a missing key should fail loudly instead of storing None.
  best_param_maxd_50.append(search.best_params_['max_depth'])
  best_param_nest_50.append(search.best_params_['n_estimators'])
  best_param_minsl_50.append(search.best_params_['min_samples_leaf'])

## newsvendor-model


---


*   the data-driven newsvendor model is executed in a loop for every single item (with optimal hyperparameters)
*   the average_costs for every item are calculated (later the prescriptiveness score is calculated)
*   save the results as pandas frame

In [None]:
# Out-of-sample average costs per item at the 50% service level.
result_rf_50_avc = []

# The tuned hyperparameters were appended in the order of `prods`, so they are
# looked up by position in that list rather than by the item id itself.
for pos, prod in enumerate(prods):
  rf = RandomForestWeightedNewsvendor(
      cu = cu, co = co_50, random_state = 42, criterion = 'mse',
      max_depth = best_param_maxd_50[pos],
      n_estimators = best_param_nest_50[pos],
      min_samples_leaf = best_param_minsl_50[pos])
  rf.fit(X_train.loc[prod], Y_train.loc[prod])
  preds = rf.predict(X_test.loc[prod])

  # Average costs on the item's test rows, rounded to 2 digits.
  avc = round(average_costs(Y_test.loc[prod], preds, cu = cu, co = co_50), 2)
  result_rf_50_avc.append(avc)

In [None]:
# One-column frame of the per-item average costs (50% service level); the
# default 0..n-1 row index is labelled 'item_id'.
RF_separate_avc_50 = pd.DataFrame({'RF_avc_50': result_rf_50_avc}).rename_axis('item_id')

In [None]:
# Prescriptiveness score per item at the 50% service level: the random forest
# decisions are scored against the SAA decisions on the same test rows.
result_rf_50_pscr = []

# The tuned hyperparameters were appended in the order of `prods`, so they are
# looked up by position in that list rather than by the item id itself.
for pos, prod in enumerate(prods):
  y_test_prod = Y_test.loc[prod]

  rf = RandomForestWeightedNewsvendor(
      cu = cu, co = co_50, random_state = 42, criterion = 'mse',
      max_depth = best_param_maxd_50[pos],
      n_estimators = best_param_nest_50[pos],
      min_samples_leaf = best_param_minsl_50[pos])
  rf.fit(X_train.loc[prod], Y_train.loc[prod])
  preds_rf = rf.predict(X_test.loc[prod])

  # SAA is fitted on the training demand only; request exactly one decision per
  # test row instead of a hard-coded number of steps.
  saa = SampleAverageApproximationNewsvendor(cu = cu, co = co_50)
  saa.fit(Y_train.loc[prod])
  preds_saa = saa.predict(n_steps = len(y_test_prod))

  pscr = prescriptiveness_score(y_test_prod, preds_rf, preds_saa, cu = cu, co = co_50)
  result_rf_50_pscr.append(pscr)

In [None]:
# One-column frame of the per-item prescriptiveness scores (50% service level);
# the default 0..n-1 row index is labelled 'item_id'.
RF_separate_pscr_50 = pd.DataFrame({'RF_pscr_50': result_rf_50_pscr}).rename_axis('item_id')

# merge the pandas frames


---

In [None]:
# Average costs side by side: one row per item_id, one column per service level.
RF_separated_avc = pd.concat(
    [RF_separate_avc_90, RF_separate_avc_75, RF_separate_avc_50],
    axis='columns',
)
RF_separated_avc

Unnamed: 0_level_0,RF_avc_90,RF_avc_75,RF_avc_50
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.26,5.05,8.74
1,0.96,2.05,3.54
2,2.3,4.76,8.06
3,1.74,3.71,7.05
4,2.08,4.54,8.72
5,1.23,2.64,4.82
6,1.77,3.86,6.95
7,1.59,3.51,6.53
8,1.18,2.54,4.78
9,1.22,2.61,4.77


In [None]:
# Prescriptiveness scores side by side: one row per item_id, one column per
# service level.
RF_separated_pscr = pd.concat(
    [RF_separate_pscr_90, RF_separate_pscr_75, RF_separate_pscr_50],
    axis='columns',
)
RF_separated_pscr

Unnamed: 0_level_0,RF_pscr_90,RF_pscr_75,RF_pscr_50
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.395798,0.326306,0.306258
1,0.46636,0.48051,0.43883
2,0.211946,0.150934,0.13071
3,0.671298,0.634676,0.542336
4,0.358198,0.265477,0.217244
5,0.411035,0.375497,0.248366
6,0.469421,0.456693,0.416549
7,0.419791,0.270294,0.186095
8,0.390767,0.313984,0.255069
9,0.333419,0.342958,0.279747


# save the results


---

In [None]:
# Write both result tables as CSV files to the mounted Drive folder.
for frame, target in (
    (RF_separated_avc, '/content/drive/MyDrive/M5/RF_separated_avc.csv'),
    (RF_separated_pscr, '/content/drive/MyDrive/M5/RF_separated_pscr.csv'),
):
  frame.to_csv(target)