<a href="https://colab.research.google.com/github/lennart194/thesis-code/blob/main/pooling_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ddop

In [2]:
import numpy as np
import pandas as pd

from ddop.newsvendor import KNeighborsWeightedNewsvendor
from ddop.newsvendor import SampleAverageApproximationNewsvendor

from sklearn.model_selection import RandomizedSearchCV

from ddop.metrics import average_costs
from ddop.metrics import prescriptiveness_score
from ddop.metrics import make_scorer

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the CSV files
# under /content/drive/MyDrive/M5 used by the following cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## loading data-set


---



*   load the complete data set (`ultimative_set.csv`)
*   sort the rows first by item, then by date, and set a multi-index
**now every item's time series can be selected via the index**

In [4]:
# Read the full data set, discard the unnamed leading column (presumably a
# row index saved with the CSV), and key every row by (item_id, date) in
# sorted order.
data = (
    pd.read_csv('/content/drive/MyDrive/M5/ultimative_set.csv')
    .drop(columns=['Unnamed: 0'])
    .sort_values(by=['item_id', 'date'])
    .set_index(['item_id', 'date'])
)

## split in feature and target matrix


---



In [5]:
# Target vector: the 'demand' column; feature matrix: every remaining column.
Y = data['demand']
X = data.drop(columns=['demand'])

## train_test_split


---

In [6]:
# Label-based selection on the (item_id, date) index: keep every item and cut
# on the date level only.
# NOTE(review): the CSV is read without date parsing, so these bounds are
# presumably compared as plain strings — confirm that no date ends up in both
# the training and the test period.
train_period = (slice(None), slice(None, "2015"))
test_period = (slice(None), slice("2015", '2017'))

X_train, Y_train = X.loc[train_period, :], Y.loc[train_period]
X_test, Y_test = X.loc[test_period, :], Y.loc[test_period]

## new order after the split


---



*   for the product pooling, the whole data set must be sorted by date, independent of the 'item_id'
*   therefore the values are sorted by 'date' first and by 'item_id' second
*   these two levels form the multi-index of the target matrix (Y)
*   as all items are in one set now, the 'item_id' becomes a feature in the feature matrix (X); its only index is the date

In [7]:
# Pooled layout: all frames are ordered by date first, then by item.
pooled_order = ['date', 'item_id']

# Features: 'item_id' turns into an ordinary column, 'date' alone is the index.
X_train = (
    X_train.reset_index()
    .sort_values(by=pooled_order)
    .set_index(['date'])
)
X_test = (
    X_test.reset_index()
    .sort_values(by=pooled_order)
    .set_index(['date'])
)

# Targets: same row order, but indexed by (date, item_id).
Y_train = (
    Y_train.reset_index()
    .sort_values(by=pooled_order)
    .set_index(['date', 'item_id'])
)
Y_test = (
    Y_test.reset_index()
    .sort_values(by=pooled_order)
    .set_index(['date', 'item_id'])
)

## defining the cost parameters for each service level and the product range



---



*   cu (underage cost) is the mean of the 'sell_price' column over the pooled training data
*   depending on the service level $\alpha$, co (overage cost) is calculated as $co = (cu - \alpha \cdot cu) / \alpha$
*   all costs are rounded to 2 decimal places

In [8]:
# Item ids (first index level of the comparison frames) that are scored one by one.
prods = list(range(25))

# cu: mean sell price over all training rows, rounded to 2 decimals.
cu = round(X_train['sell_price'].mean(), 2)

# co for each target service level alpha: (cu - alpha * cu) / alpha, rounded to 2 decimals.
co_90, co_75, co_50 = (
    round((cu - alpha * cu) / alpha, 2) for alpha in (0.90, 0.75, 0.50)
)

## determining some parameters


---


*   the candidate range of the parameter for the randomized search is defined
*   the scorer and the parameter dictionary `param_random` are defined (the estimator itself is created in the search cells below)

In [None]:
# Candidate values for n_neighbors: 25 up to and including 65
# (np.arange excludes the stop value).
k = np.arange(25, 66)
param_random = {'n_neighbors': k}

# Scorer built from average_costs; greater_is_better=False declares it a
# quantity where lower values are better.
scorer_avc = make_scorer(average_costs, greater_is_better=False)

# knn for 90% service level


---






## randomized search




*   randomized search is done once for the entire data-set with the corresponding service level costs
*   this step was done with the server of the University of Würzburg because of the high resource consumption 
*   the results for the best params are: n_neighbors = 46 




In [None]:
# Hyper-parameter search for the 90% service level: candidates for n_neighbors
# are drawn from param_random and scored with scorer_avc in 5-fold CV.
knn = KNeighborsWeightedNewsvendor(cu = cu, co = co_90)

# random_state fixes which candidates are sampled, so a re-run evaluates the
# same n_neighbors values and the reported best parameter can be reproduced.
# NOTE(review): the name `random` would shadow the stdlib module if it were
# ever imported in this notebook.
random = RandomizedSearchCV(knn, param_random, cv=5, scoring = scorer_avc, n_jobs=8,
                            random_state=42)
random.fit(X_train, Y_train)
print(random.best_params_)

## newsvendor-model 90%



---



*   the model is trained and tested once for entire data-set with the best hyperparameters
*   the prediction is the output



In [9]:
# Pooled model for the 90% service level with the n_neighbors value reported
# by the randomized search.
knn = KNeighborsWeightedNewsvendor(n_neighbors=46, cu=cu, co=co_90)
knn.fit(X_train, Y_train)

# Model output for every test row, in the row order of X_test.
preds_90 = knn.predict(X_test)

## reorder the data




---



*   create a pandas frame 'comp_90' where the real values, the predictions and the predictions from the saa model are in columns next to each other

In [10]:
# Wrap the raw predictions so they carry a column name.
pred_90 = pd.DataFrame(preds_90, columns=['pred_knn_90'])

# Actual demand of the test period, sorted by (date, item_id) — the same
# ordering that was applied to X_test — with a fresh positional index.
comp_90 = (
    Y.loc[(slice(None), slice("2015", "2017"))]
    .reset_index()
    .sort_values(by=['date', 'item_id'])
    .reset_index(drop=True)
)

# Attach the predictions by position. The length check makes a mismatch fail
# loudly instead of being padded with NaN by index alignment.
assert len(pred_90) == len(comp_90), "number of predictions differs from number of test rows"
comp_90['pred_knn_90'] = pred_90['pred_knn_90'].to_numpy()

# Back to item-major order so each item's series can be selected with .loc[item_id].
comp_90 = comp_90.sort_values(by=['item_id', 'date']).set_index(['item_id', 'date'])

# SAA benchmark predictions, joined on the shared (item_id, date) index.
saa_90 = pd.read_csv('/content/drive/MyDrive/M5/SAA_pred_90.csv').set_index(['item_id', 'date'])

comp_90 = pd.concat([comp_90, saa_90], axis = 1)

## average costs and prescriptiveness score


---



*   create a loop in which the average costs (and the prescriptiveness score) of each item are calculated

*   save the results into a pandas frame


In [29]:
# Average cost per item at the 90% service level, rounded to 2 decimals:
# actual demand vs. the KNN predictions of that item.
result_knn_90_avc = [
    round(
        average_costs(
            comp_90['demand'].loc[prod],
            comp_90['pred_knn_90'].loc[prod],
            cu=cu,
            co=co_90,
        ),
        2,
    )
    for prod in prods
]

In [30]:
# One row per entry of the result list; the positional index is labelled 'item_id'.
KNN_pooled_avc_90 = pd.DataFrame({'KNN_avc_90': result_knn_90_avc}).rename_axis('item_id')

In [40]:
# Prescriptiveness score per item at the 90% service level, rounded to
# 2 decimals: computed from actual demand, the KNN predictions and the SAA
# predictions of that item.
result_knn_90_pscr = [
    round(
        prescriptiveness_score(
            comp_90['demand'].loc[prod],
            comp_90['pred_knn_90'].loc[prod],
            comp_90['SAA_pred_90'].loc[prod],
            cu=cu,
            co=co_90,
        ),
        2,
    )
    for prod in prods
]

In [41]:
# One row per entry of the result list; the positional index is labelled 'item_id'.
KNN_pooled_pscr_90 = pd.DataFrame({'KNN_pscr_90': result_knn_90_pscr}).rename_axis('item_id')

# knn for 75% service level


---






## randomized search




*   randomized search is done once for the entire data-set with the corresponding service level costs
*   this step was done with the server of the University of Würzburg because of the high resource consumption 
*   the results for the best params are: n_neighbors = 65


In [None]:
# Hyper-parameter search for the 75% service level: candidates for n_neighbors
# are drawn from param_random and scored with scorer_avc in 5-fold CV.
knn = KNeighborsWeightedNewsvendor(cu = cu, co = co_75)

# random_state fixes which candidates are sampled, so a re-run evaluates the
# same n_neighbors values and the reported best parameter can be reproduced.
# NOTE(review): the name `random` would shadow the stdlib module if it were
# ever imported in this notebook.
random = RandomizedSearchCV(knn, param_random, cv=5, scoring = scorer_avc, n_jobs=8,
                            random_state=42)
random.fit(X_train, Y_train)
print(random.best_params_)

## newsvendor-model 75%



---



*   the model is trained and tested once for entire data-set with the best hyperparameters
*   the prediction is the output

In [44]:
# Pooled model for the 75% service level with the n_neighbors value reported
# by the randomized search.
knn = KNeighborsWeightedNewsvendor(n_neighbors=65, cu=cu, co=co_75)
knn.fit(X_train, Y_train)

# Model output for every test row, in the row order of X_test.
preds_75 = knn.predict(X_test)

## reorder the data




---



*   create a pandas frame 'comp_75' where the real values, the predictions and the predictions from the saa model are in columns next to each other

In [47]:
# Wrap the raw predictions so they carry a column name.
pred_75 = pd.DataFrame(preds_75, columns=['pred_knn_75'])

# Actual demand of the test period, sorted by (date, item_id) — the same
# ordering that was applied to X_test — with a fresh positional index.
comp_75 = (
    Y.loc[(slice(None), slice("2015", "2017"))]
    .reset_index()
    .sort_values(by=['date', 'item_id'])
    .reset_index(drop=True)
)

# Attach the predictions by position. The length check makes a mismatch fail
# loudly instead of being padded with NaN by index alignment.
assert len(pred_75) == len(comp_75), "number of predictions differs from number of test rows"
comp_75['pred_knn_75'] = pred_75['pred_knn_75'].to_numpy()

# Back to item-major order (as in the 90% cell) so the (item_id, date) index is
# sorted and each item's series can be selected with .loc[item_id].
comp_75 = comp_75.sort_values(by=['item_id', 'date']).set_index(['item_id', 'date'])

# SAA benchmark predictions, joined on the shared (item_id, date) index.
saa_75 = pd.read_csv('/content/drive/MyDrive/M5/SAA_pred_75.csv').set_index(['item_id', 'date'])

comp_75 = pd.concat([comp_75, saa_75], axis = 1)

## average costs and prescriptiveness score


---



*   create a loop in which the average costs (and the prescriptiveness score) of each item are calculated

*   save the results into a pandas frame


In [50]:
# Average cost per item at the 75% service level, rounded to 2 decimals:
# actual demand vs. the KNN predictions of that item.
result_knn_75_avc = [
    round(
        average_costs(
            comp_75['demand'].loc[prod],
            comp_75['pred_knn_75'].loc[prod],
            cu=cu,
            co=co_75,
        ),
        2,
    )
    for prod in prods
]

In [51]:
# One row per entry of the result list; the positional index is labelled 'item_id'.
KNN_pooled_avc_75 = pd.DataFrame({'KNN_avc_75': result_knn_75_avc}).rename_axis('item_id')

In [53]:
# Prescriptiveness score per item at the 75% service level, rounded to
# 2 decimals: computed from actual demand, the KNN predictions and the SAA
# predictions of that item.
result_knn_75_pscr = [
    round(
        prescriptiveness_score(
            comp_75['demand'].loc[prod],
            comp_75['pred_knn_75'].loc[prod],
            comp_75['SAA_pred_75'].loc[prod],
            cu=cu,
            co=co_75,
        ),
        2,
    )
    for prod in prods
]

In [54]:
# One row per entry of the result list; the positional index is labelled 'item_id'.
KNN_pooled_pscr_75 = pd.DataFrame({'KNN_pscr_75': result_knn_75_pscr}).rename_axis('item_id')

# knn for 50% service level


---






## randomized search




*   randomized search is done once for the entire data-set with the corresponding service level costs
*   this step was done with the server of the University of Würzburg because of the high resource consumption 
*   the results for the best params are: n_neighbors = 61

In [None]:
knn = KNeighborsWeightedNewsvendor(cu = cu, co = co_50)
random = RandomizedSearchCV(knn, param_random, cv=5, scoring = scorer_avc, n_jobs=8)
random.fit(X_train, Y_train)
print(random.best_params_)

## newsvendor-model 50%



---



*   the model is trained and tested once for entire data-set with the best hyperparameters
*   the prediction is the output

In [56]:
# Pooled model for the 50% service level with the n_neighbors value reported
# by the randomized search.
knn = KNeighborsWeightedNewsvendor(n_neighbors=61, cu=cu, co=co_50)
knn.fit(X_train, Y_train)

# Model output for every test row, in the row order of X_test.
preds_50 = knn.predict(X_test)

## reorder the data




---



*   create a pandas frame 'comp_50' where the real values, the predictions and the predictions from the saa model are in columns next to each other

In [57]:
# Wrap the raw predictions so they carry a column name.
pred_50 = pd.DataFrame(preds_50, columns=['pred_knn_50'])

# Actual demand of the test period, sorted by (date, item_id) — the same
# ordering that was applied to X_test — with a fresh positional index.
comp_50 = (
    Y.loc[(slice(None), slice("2015", "2017"))]
    .reset_index()
    .sort_values(by=['date', 'item_id'])
    .reset_index(drop=True)
)

# Attach the predictions by position. The length check makes a mismatch fail
# loudly instead of being padded with NaN by index alignment.
assert len(pred_50) == len(comp_50), "number of predictions differs from number of test rows"
comp_50['pred_knn_50'] = pred_50['pred_knn_50'].to_numpy()

# Back to item-major order (as in the 90% cell) so the (item_id, date) index is
# sorted and each item's series can be selected with .loc[item_id].
comp_50 = comp_50.sort_values(by=['item_id', 'date']).set_index(['item_id', 'date'])

# SAA benchmark predictions, joined on the shared (item_id, date) index.
saa_50 = pd.read_csv('/content/drive/MyDrive/M5/SAA_pred_50.csv').set_index(['item_id', 'date'])

comp_50 = pd.concat([comp_50, saa_50], axis = 1)

## average costs and prescriptiveness score


---



*   create a loop in which the average costs (and the prescriptiveness score) of each item are calculated

*   save the results into a pandas frame


In [58]:
# Average cost per item at the 50% service level, rounded to 2 decimals:
# actual demand vs. the KNN predictions of that item.
result_knn_50_avc = [
    round(
        average_costs(
            comp_50['demand'].loc[prod],
            comp_50['pred_knn_50'].loc[prod],
            cu=cu,
            co=co_50,
        ),
        2,
    )
    for prod in prods
]

In [59]:
# One row per entry of the result list; the positional index is labelled 'item_id'.
KNN_pooled_avc_50 = pd.DataFrame({'KNN_avc_50': result_knn_50_avc}).rename_axis('item_id')

In [60]:
# Prescriptiveness score per item at the 50% service level, rounded to
# 2 decimals: computed from actual demand, the KNN predictions and the SAA
# predictions of that item.
result_knn_50_pscr = [
    round(
        prescriptiveness_score(
            comp_50['demand'].loc[prod],
            comp_50['pred_knn_50'].loc[prod],
            comp_50['SAA_pred_50'].loc[prod],
            cu=cu,
            co=co_50,
        ),
        2,
    )
    for prod in prods
]

In [61]:
# One row per entry of the result list; the positional index is labelled 'item_id'.
KNN_pooled_pscr_50 = pd.DataFrame({'KNN_pscr_50': result_knn_50_pscr}).rename_axis('item_id')

# merge the pandas frames


---



In [62]:
# Average costs of all three service levels side by side, one column per level.
KNN_pooled_avc = pd.concat(
    [KNN_pooled_avc_90, KNN_pooled_avc_75, KNN_pooled_avc_50],
    axis='columns',
)
KNN_pooled_avc

Unnamed: 0_level_0,KNN_avc_90,KNN_avc_75,KNN_avc_50
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.39,5.26,9.02
1,0.99,2.11,3.62
2,2.18,4.69,7.76
3,1.75,4.04,7.4
4,2.2,4.87,9.29
5,1.31,2.92,4.94
6,1.65,3.78,6.86
7,1.54,3.56,6.8
8,1.23,2.7,4.98
9,1.21,2.71,4.95


In [63]:
# Prescriptiveness scores of all three service levels side by side, one column per level.
KNN_pooled_pscr = pd.concat(
    [KNN_pooled_pscr_90, KNN_pooled_pscr_75, KNN_pooled_pscr_50],
    axis='columns',
)
KNN_pooled_pscr

Unnamed: 0_level_0,KNN_pscr_90,KNN_pscr_75,KNN_pscr_50
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.36,0.3,0.28
1,0.45,0.47,0.43
2,0.25,0.16,0.16
3,0.67,0.6,0.52
4,0.32,0.21,0.17
5,0.37,0.31,0.23
6,0.5,0.47,0.42
7,0.44,0.26,0.15
8,0.37,0.27,0.22
9,0.34,0.32,0.25


# save the results


---



In [64]:
# Write both result tables (index included) to the M5 folder on Drive.
for frame, target in (
  (KNN_pooled_avc, '/content/drive/MyDrive/M5/KNN_pooled_avc.csv'),
  (KNN_pooled_pscr, '/content/drive/MyDrive/M5/KNN_pooled_pscr.csv'),
):
  frame.to_csv(target)