In [None]:
###Product Demand Forecast
# Investigate one company's data and see if we can build a model to predict a product's demand!
# Multiple R2-squared measures resembling different criteria used for training your decision trees regression.
# The R2-squared measure resembling SVR output.
! pip install sklearn=1.*

In [6]:
## Load libraries

import pandas as pd
import numpy as np
import optuna
import time

from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score

In [3]:
# Step2 Read the dataset
df = pd.read_csv('HistoricalProductDemand.csv')

## Remove Missing Value
# Check any number of data points with NaN
print("Data points NA: ", df.isnull().any(axis=1).sum(), ' / ', len(df), "\n")

print (df.isna().sum(), "\n")

df.dropna(axis=0, inplace=True) #remove all rows with na's
df.reset_index(drop=True)

print("Dimensions of df: ", df.shape, "\n")
print("DataTypes: ", df.dtypes)

df.sort_values('Order_Demand')[0:10] ## some targets (Order_Demand)  has "()" -> remove these

Data points NA:  11239  /  1048575 

Product_Code            0
Warehouse               0
Product_Category        0
Date                11239
Order_Demand            0
dtype: int64 

Dimensions of df:  (1037336, 5) 

DataTypes:  Product_Code        object
Warehouse           object
Product_Category    object
Date                object
Order_Demand        object
dtype: object


Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
981476,Product_2165,Whse_A,Category_024,2016/4/13,(1)
614427,Product_1862,Whse_J,Category_001,2014/8/19,(1)
546436,Product_0352,Whse_A,Category_021,2014/7/14,(1)
843435,Product_1863,Whse_J,Category_001,2015/1/22,(1)
384234,Product_0396,Whse_J,Category_007,2013/9/4,(1)
611858,Product_0341,Whse_J,Category_021,2014/9/4,(1)
991365,Product_2119,Whse_A,Category_009,2016/6/8,(1)
546437,Product_0352,Whse_A,Category_021,2014/7/15,(1)
623307,Product_1024,Whse_J,Category_008,2014/6/12,(1)
187255,Product_1722,Whse_J,Category_003,2012/3/15,(1)


In [4]:
df['Order_Demand'] = df['Order_Demand'].str.replace('(',"")
df['Order_Demand'] = df['Order_Demand'].str.replace(')',"")
df.sort_values('Order_Demand')[0:10]



Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
333293,Product_1541,Whse_A,Category_019,2013/5/10,0
323500,Product_2054,Whse_A,Category_021,2013/10/28,0
614229,Product_0187,Whse_J,Category_007,2014/4/2,0
323499,Product_2054,Whse_A,Category_021,2013/10/26,0
332760,Product_1007,Whse_A,Category_006,2013/1/7,0
332761,Product_1007,Whse_A,Category_006,2013/9/27,0
332762,Product_1007,Whse_A,Category_006,2013/10/25,0
559927,Product_0020,Whse_A,Category_005,2014/8/29,0
989837,Product_1416,Whse_A,Category_019,2016/10/14,0
989838,Product_1422,Whse_A,Category_019,2016/10/14,0


In [5]:
## Step3 and 4 Feature Selection and preprocessing
X=df[["Product_Code", "Warehouse", "Product_Category", "Date"]]
y=df.Order_Demand


### Encode Date : Cyclical Feature, Xsin Xcos (https://www.kaggle.com/code/avanwyk/encoding-cyclical-features-for-deep-learning)
def encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data


df["month"] = pd.to_datetime(df['Date']).dt.month
df["day"] = pd.to_datetime(df['Date']).dt.day
data = encode(df, 'month', 12)
data = encode(df, 'day', 31)
data


Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,month,day,month_sin,month_cos,day_sin,day_cos
0,Product_0993,Whse_J,Category_028,2012/7/27,100,7,27,-5.000000e-01,-8.660254e-01,-0.724793,0.688967
1,Product_0979,Whse_J,Category_028,2012/1/19,500,1,19,5.000000e-01,8.660254e-01,-0.651372,-0.758758
2,Product_0979,Whse_J,Category_028,2012/2/3,500,2,3,8.660254e-01,5.000000e-01,0.571268,0.820763
3,Product_0979,Whse_J,Category_028,2012/2/9,500,2,9,8.660254e-01,5.000000e-01,0.968077,-0.250653
4,Product_0979,Whse_J,Category_028,2012/3/2,500,3,2,1.000000e+00,6.123234e-17,0.394356,0.918958
...,...,...,...,...,...,...,...,...,...,...,...
1048570,Product_1791,Whse_J,Category_006,2016/4/27,1000,4,27,8.660254e-01,-5.000000e-01,-0.724793,0.688967
1048571,Product_1974,Whse_J,Category_006,2016/4/27,1,4,27,8.660254e-01,-5.000000e-01,-0.724793,0.688967
1048572,Product_1787,Whse_J,Category_006,2016/4/28,2500,4,28,8.660254e-01,-5.000000e-01,-0.571268,0.820763
1048573,Product_0901,Whse_J,Category_023,2016/10/7,50,10,7,-8.660254e-01,5.000000e-01,0.988468,0.151428


In [90]:
### Encode Product_Code, Warehouse, Product_Category
df_oh = pd.get_dummies(df[['Product_Code','Warehouse', 'Product_Category']])

## Combine date and one-hot encoded columns
X= pd.concat([df_oh, data[["month_sin", "month_cos", "day_sin", "day_cos"]]], axis=1)
print("shape :", X.shape)
X


shape : (1037336, 2201)


Unnamed: 0,Product_Code_Product_0001,Product_Code_Product_0002,Product_Code_Product_0003,Product_Code_Product_0004,Product_Code_Product_0005,Product_Code_Product_0006,Product_Code_Product_0007,Product_Code_Product_0008,Product_Code_Product_0009,Product_Code_Product_0010,...,Product_Category_Category_028,Product_Category_Category_029,Product_Category_Category_030,Product_Category_Category_031,Product_Category_Category_032,Product_Category_Category_033,month_sin,month_cos,day_sin,day_cos
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,-5.000000e-01,-8.660254e-01,-0.724793,0.688967
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,5.000000e-01,8.660254e-01,-0.651372,-0.758758
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,8.660254e-01,5.000000e-01,0.571268,0.820763
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,8.660254e-01,5.000000e-01,0.968077,-0.250653
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1.000000e+00,6.123234e-17,0.394356,0.918958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,8.660254e-01,-5.000000e-01,-0.724793,0.688967
1048571,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,8.660254e-01,-5.000000e-01,-0.724793,0.688967
1048572,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,8.660254e-01,-5.000000e-01,-0.571268,0.820763
1048573,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,-8.660254e-01,5.000000e-01,0.988468,0.151428


In [91]:
## Step 5: Split Data: Training, Test, validation

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_valid, y_train_valid, test_size=0.125, random_state=0) # 0.125 x 0.8 = 0.1

print(X_train_valid.shape)
print(X_test.shape)
print(X_train.shape)
print(X_val.shape)
print(y_test.shape)
print(y_train.shape)
print(y_val.shape)

(933602, 2201)
(103734, 2201)
(816901, 2201)
(116701, 2201)
(103734,)
(816901,)
(116701,)


In [100]:
## Step 6 Tree bsed Clustering

def objective(trial):
    params = {
    "n_estimators": trial.suggest_int("n_estimators", 10, 150, step = 10), ## 100 to 600, step 100 
    "max_depth": trial.suggest_int("max_depth", 1, 15), ## 4, 15
    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),    
    "random_state": 10,
    }
    
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, pred)
    r2 = r2_score(y_val ,pred)
    return r2 

# "criterion": trial.suggest_categorical("criterion",["squared_error", "friedman_mse", "poisson"]),

In [101]:
start_time = time.time()

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=50)

total_time = time.time()-start_time

[32m[I 2024-01-02 11:53:23,128][0m A new study created in memory with name: no-name-f386e340-fdd1-4041-a8d2-5bab3fd44151[0m
[32m[I 2024-01-02 12:34:02,005][0m Trial 0 finished with value: 0.19331081669168182 and parameters: {'n_estimators': 90, 'max_depth': 6, 'min_samples_split': 9, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.19331081669168182.[0m
[32m[I 2024-01-02 12:37:45,573][0m Trial 1 finished with value: 0.019113155182011177 and parameters: {'n_estimators': 40, 'max_depth': 1, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.19331081669168182.[0m
[32m[I 2024-01-02 12:48:45,832][0m Trial 2 finished with value: 0.020428544694928608 and parameters: {'n_estimators': 130, 'max_depth': 1, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.19331081669168182.[0m
[32m[I 2024-01-02 13:09:56,476][0m Trial 3 finished with value: 0.21463017682448027 and parameters: {'n_estimators': 30, 'max_depth': 10, 'min_samp

In [102]:
sklearn.__version__
total_time ## ~ 20 hrs

75491.72225499153

In [106]:
## Rerun using modified hyperparameters

def objective(trial):
    params = {
    "n_estimators": trial.suggest_int("n_estimators", 100, 600, step = 100), ## 100 to 600, step 100 
    "max_depth": trial.suggest_int("max_depth", 4, 15), ## 4, 15
    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),    
    "random_state": 10,
    }
    
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, pred)
    r2 = r2_score(y_val ,pred)
    return r2 

start_time = time.time()

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=20)

print("total_time =", time.time()-start_time)

[32m[I 2024-01-03 15:50:04,072][0m A new study created in memory with name: no-name-f928cae4-77b8-476f-9af2-5b3af0a36e48[0m
[32m[I 2024-01-03 17:37:53,622][0m Trial 0 finished with value: 0.21065276642927155 and parameters: {'n_estimators': 200, 'max_depth': 8, 'min_samples_split': 4, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.21065276642927155.[0m
[32m[I 2024-01-04 00:55:10,523][0m Trial 1 finished with value: 0.20418681978443787 and parameters: {'n_estimators': 600, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.21065276642927155.[0m
[32m[I 2024-01-04 04:09:05,655][0m Trial 2 finished with value: 0.1640501837368179 and parameters: {'n_estimators': 600, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.21065276642927155.[0m
[32m[I 2024-01-04 11:18:43,563][0m Trial 3 finished with value: 0.2143062433216446 and parameters: {'n_estimators': 600, 'max_depth': 13, 'min_sam

total_time = 208553.54524803162


In [None]:
print("best optuna parameters are: ", study.best_params)
print("best r2 value is: ", study.best_value)
        
best_n_estimators = study.best_params["n_estimators"]
best_max_depth = study.best_params["max_depth"]
best_min_samples_split = study.best_params["min_samples_split"]
best_min_samples_leaf = study.best_params["min_samples_leaf"]

model = RandomForestRegressor(n_estimators = best_n_estimators, max_depth = best_max_depth, min_samples_split = best_min_samples_split, min_samples_leaf = best_min_samples_leaf)
model.fit(X_train_valid, y_train_valid)
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test ,pred)
print("\n""r2 is: ", r2, "\n mae is: ", mae)


best optuna parameters are:  {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 10}
best r2 value is:  0.21558576788984107


### Rescale; Label encode  Product_Code, Warehouse ### Label Encode Product_Code, Warehouse, Product_Category

In [7]:
## label enocde and rerun Random forest

le = LabelEncoder()
df_labelEnc = df[['Product_Code','Warehouse', 'Product_Category']].apply(le.fit_transform)

#df_oh = pd.get_dummies(df[['Product_Code','Warehouse', 'Product_Category']])

## Combine date and one-hot encoded columns
X= pd.concat([df_labelEnc, data[["month_sin", "month_cos", "day_sin", "day_cos"]]], axis=1)
print("shape :", X.shape)
X



shape : (1037336, 7)


Unnamed: 0,Product_Code,Warehouse,Product_Category,month_sin,month_cos,day_sin,day_cos
0,982,2,27,-5.000000e-01,-8.660254e-01,-0.724793,0.688967
1,968,2,27,5.000000e-01,8.660254e-01,-0.651372,-0.758758
2,968,2,27,8.660254e-01,5.000000e-01,0.571268,0.820763
3,968,2,27,8.660254e-01,5.000000e-01,0.968077,-0.250653
4,968,2,27,1.000000e+00,6.123234e-17,0.394356,0.918958
...,...,...,...,...,...,...,...
1048570,1780,2,5,8.660254e-01,-5.000000e-01,-0.724793,0.688967
1048571,1962,2,5,8.660254e-01,-5.000000e-01,-0.724793,0.688967
1048572,1776,2,5,8.660254e-01,-5.000000e-01,-0.571268,0.820763
1048573,890,2,22,-8.660254e-01,5.000000e-01,0.988468,0.151428


In [8]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_valid, y_train_valid, test_size=0.125, random_state=0) # 0.125 x 0.8 = 0.1

print(X_train_valid.shape)
print(X_test.shape)
print(X_train.shape)
print(X_val.shape)
print(y_test.shape)
print(y_train.shape)
print(y_val.shape)

(933602, 7)
(103734, 7)
(816901, 7)
(116701, 7)
(103734,)
(816901,)
(116701,)


In [9]:
## Step 6 Random Forest Regression

def objective(trial):
    params = {
    "n_estimators": trial.suggest_int("n_estimators", 10, 150, step = 10), ## 100 to 600, step 100 
    "max_depth": trial.suggest_int("max_depth", 1, 15), ## 4, 15
    "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),    
    "random_state": 10,
    }
    
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, pred)
    r2 = r2_score(y_val ,pred)
    return r2 

start_time = time.time()

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=50)

total_time = time.time()-start_time

[32m[I 2024-01-07 01:23:44,122][0m A new study created in memory with name: no-name-a07d1b54-ec1c-4bfc-893d-f9c30e9b123e[0m
[32m[I 2024-01-07 01:25:00,996][0m Trial 0 finished with value: 0.21899936191071956 and parameters: {'n_estimators': 70, 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.21899936191071956.[0m
[32m[I 2024-01-07 01:27:47,722][0m Trial 1 finished with value: 0.226602848078157 and parameters: {'n_estimators': 150, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.226602848078157.[0m
[32m[I 2024-01-07 01:27:51,674][0m Trial 2 finished with value: 0.04379294329736194 and parameters: {'n_estimators': 10, 'max_depth': 3, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.226602848078157.[0m
[32m[I 2024-01-07 01:29:06,835][0m Trial 3 finished with value: 0.1743579700365634 and parameters: {'n_estimators': 90, 'max_depth': 7, 'min_samples_split'

In [10]:
print("best optuna parameters are: ", study.best_params)
print("best r2 value is: ", study.best_value)
        
best_n_estimators = study.best_params["n_estimators"]
best_max_depth = study.best_params["max_depth"]
best_min_samples_split = study.best_params["min_samples_split"]
best_min_samples_leaf = study.best_params["min_samples_leaf"]

model = RandomForestRegressor(n_estimators = best_n_estimators, max_depth = best_max_depth, min_samples_split = best_min_samples_split, min_samples_leaf = best_min_samples_leaf)
model.fit(X_train_valid, y_train_valid)
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test ,pred)
print("\n""r2 is: ", r2, "\n mae is: ", mae)

best optuna parameters are:  {'n_estimators': 30, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 10}
best r2 value is:  0.2364929249527402

r2 is:  0.1846671412071853 
 mae is:  4975.04247377324


In [None]:
### Try support vector regression
### Minmax scale encodedFeatures 

scaler = MinMaxScaler(feature_range=(0,1))
X_minmax = pd.DataFrame(scaler.fit_transform(X.values), columns=X.columns, index=X.index)
X_minmax

In [None]:
X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train_valid, y_train_valid, test_size=0.125, random_state=0) # 0.125 x 0.8 = 0.1

print(X_train_valid.shape)
print(X_test.shape)
print(X_train.shape)
print(X_val.shape)
print(y_test.shape)
print(y_train.shape)
print(y_val.shape)


## Different regression model: SVR
def objective(trial):
    params = {
    "kernel": trial.suggest_categorical("kernel", ['linear', 'poly', 'rbf', 'sigmoid']),
    "gamma": trial.suggest_categorical("gamma", ["scale", "auto"]),
    "C": trial.suggest_uniform("C", 0.01, 10),
    "degree": trial.suggest_discrete_uniform("degree", 1, 5, 1),
    }
    
    model = SVR(**params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, pred)
    r2 = r2_score(y_val ,pred)
    return r2 

start_time = time.time()

study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=10)

