In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pycaret.regression import *
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
df = pd.read_csv(r"C:\Users\fabian\Downloads\parklane_combined_edited.csv")
print('Unfiltered row of dataframe is {}'.format(len(df)))

Unfiltered row of dataframe is 727657


In [3]:
# filter where hr or hb is  not 0
df = df[(df['sys_hr'] != 0) | (df['sys_hb'] != 0)]
df = df[(df['h_chwst'] >=5) & (df['h_chwst'] <=20)]
df = df[(df['h_chwrt'] >=5) & (df['h_chwrt'] <=25)]
df = df[(df['pchwp_1_kw'] >=1.2) & (df['pchwp_1_kw'] <=5)]
# unsure of pchwp_2 kw filtering
#df = df[(df['pchwp_2_kw'] >=1.2) & (df['pchwp_2_kw'] <=5)]
df = df[(df['h_chwf'] >= 0) & (df['h_chwf'] <= 1500)]
df = df[(df['cwp_1_kw'] > 0) & (df['cwp_1_kw'] <=25)]
df = df[(df['cwp_2_kw'] > 0) & (df['cwp_2_kw'] <=25)]
df = df[(df['h_chwf'] >0) & (df['h_chwf'] <=1500)]
df = df[(df['h_cwst']>=20) & (df['h_cwst']<=32)]
df = df[(df['h_cwrt']>=20) & (df['h_cwrt']<=40)]
df = df[(df['ch_1_kwe']>=0) & (df['ch_1_kwe']<=350)]
df = df[(df['ch_2_kwe']>=0) & (df['ch_2_kwe']<=350)]
df = df[(df['ct_1_kw']>=0) & (df['ct_1_kw']<=15)]
df = df[(df['ct_2_kw']>=0) & (df['ct_2_kw']<=15)]
df = df[(df['sys_cl']>=200) & (df['sys_cl']<=500)]
df = df[(df['sys_hr']>0) & (df['sys_hr']<=1000)]
df = df[(df['sys_kw']>0) & (df['sys_kw']<=400)]
fil_df = df[(df['ch_sysef']>=0.4) & (df['ch_sysef']<=0.7) & (df['sys_eff']>=0.4) & (df['sys_eff']<=0.9)]

# efficiency column reduce by air system efficiency
fil_df['sys_eff'] = fil_df['sys_eff'] - fil_df['air_eff']
# replace NaN values with 0? median imputation
fil_df = fil_df.fillna(0)

# creating Lift Column, Lift: [h_cwrt-h_chwst]
fil_df['lift'] = fil_df['h_cwrt'] - fil_df['h_chwst']
fil_df['ct_tot_kw'] = fil_df['ct_1_kw'] + fil_df['ct_2_kw']
# both chillers running
fil_df['ch_run'] = (fil_df['ch_1_kwe']>=2) & (fil_df['ch_2_kwe']>=2)
fil_df['ch_run'] = fil_df['ch_run'].astype(int)

# filtering additional columns
fil_df = fil_df[(fil_df['ct_tot_kw']>=2) & (fil_df['ct_tot_kw']<=30)]
fil_df = fil_df[(fil_df['lift']>=18) & (fil_df['lift']<=30)]

print("total number of rows after filter is {}".format(len(fil_df)))

# manipulation of features
fil_df['ct_tot_kw'] = np.log(fil_df['ct_tot_kw']) / np.log(5)
fil_df['sys_cl'] = np.log(fil_df['sys_cl']) / np.log(50)
fil_df['lift'] = np.log(fil_df['lift']) / np.log(5)

# selecting related features only
ch_sysef_df = fil_df[['ch_sysef', 'lift', 'sys_cl', 'ct_tot_kw', 'ch_run', 'h_cwst']]
# reset
ch_sysef_df = ch_sysef_df.reset_index()

total number of rows after filter is 210699


## Pycaret

In [4]:
## lift based on ct_tot_kw and cwst
lift_df = ch_sysef_df[['lift', 'ct_tot_kw', 'h_cwst']]

reg1 = setup(lift_df, target = 'lift', session_id=1, log_experiment=True, experiment_name='lift prediction')
best_model = compare_models(fold=5)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,lift
2,Target type,Regression
3,Original data shape,"(210699, 3)"
4,Transformed data shape,"(210699, 3)"
5,Transformed train set shape,"(147489, 3)"
6,Transformed test set shape,"(63210, 3)"
7,Numeric features,2
8,Preprocess,True
9,Imputation type,simple


2024/06/25 17:56:41 INFO mlflow.tracking.fluent: Experiment with name 'lift prediction' does not exist. Creating a new experiment.


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0125,0.0003,0.0178,0.5722,0.006,0.0064,4.62
lightgbm,Light Gradient Boosting Machine,0.0125,0.0003,0.0178,0.5714,0.006,0.0064,0.2
gbr,Gradient Boosting Regressor,0.0129,0.0003,0.0181,0.5559,0.0061,0.0066,1.32
knn,K Neighbors Regressor,0.0136,0.0004,0.0193,0.4968,0.0065,0.0069,0.078
rf,Random Forest Regressor,0.0141,0.0004,0.02,0.4578,0.0068,0.0072,2.854
et,Extra Trees Regressor,0.0149,0.0005,0.0213,0.3818,0.0072,0.0076,1.866
lar,Least Angle Regression,0.0167,0.0005,0.0215,0.3756,0.0073,0.0085,0.04
br,Bayesian Ridge,0.0167,0.0005,0.0215,0.3756,0.0073,0.0085,0.04
ridge,Ridge Regression,0.0167,0.0005,0.0215,0.3756,0.0073,0.0085,0.634
lr,Linear Regression,0.0167,0.0005,0.0215,0.3756,0.0073,0.0085,0.848


In [5]:
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(lightgbm, n_iter=10, optimize = 'RMSE')
save_model(tuned_lightgbm, 'lift_prediction_log')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0127,0.0003,0.0181,0.5632,0.0061,0.0065
1,0.0127,0.0003,0.0183,0.5528,0.0062,0.0065
2,0.0124,0.0003,0.0176,0.5729,0.006,0.0063
3,0.0123,0.0003,0.0173,0.5853,0.0058,0.0063
4,0.0124,0.0003,0.0176,0.5745,0.006,0.0063
5,0.0126,0.0003,0.0181,0.5647,0.0061,0.0064
6,0.0123,0.0003,0.0175,0.5781,0.0059,0.0063
7,0.0126,0.0003,0.018,0.565,0.0061,0.0064
8,0.0124,0.0003,0.0174,0.5854,0.0059,0.0063
9,0.0126,0.0003,0.0178,0.5796,0.006,0.0064


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0151,0.0004,0.0205,0.4382,0.007,0.0077
1,0.0151,0.0004,0.0205,0.4378,0.0069,0.0077
2,0.0148,0.0004,0.0199,0.4523,0.0067,0.0075
3,0.0147,0.0004,0.0196,0.4651,0.0066,0.0075
4,0.0147,0.0004,0.0199,0.4573,0.0067,0.0075
5,0.0149,0.0004,0.0204,0.4469,0.0069,0.0076
6,0.0147,0.0004,0.0199,0.457,0.0067,0.0075
7,0.0149,0.0004,0.0203,0.4502,0.0069,0.0076
8,0.0149,0.0004,0.0198,0.4588,0.0067,0.0076
9,0.015,0.0004,0.0203,0.4544,0.0069,0.0077


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['ct_tot_kw', 'h_cwst'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model', LGBMRegressor(n_jobs=-1, random_state=1))]),
 'lift_prediction_log.pkl')

In [6]:
pred_holdouts = predict_model(lightgbm)
pred_holdouts.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,0.0126,0.0003,0.0178,0.5744,0.006,0.0064


Unnamed: 0,ct_tot_kw,h_cwst,lift,prediction_label
204408,1.344434,29.015131,1.979929,1.976603
18022,1.208351,27.798399,1.973514,1.970102
141,1.211189,27.143965,1.95683,1.952107
99779,1.614374,26.277998,1.916853,1.909493
57624,1.846259,28.405087,1.972643,1.981662


In [4]:
## sysef based on ct_tot_kw, cwst, lift, config

reg2 = setup(ch_sysef_df, target = 'ch_sysef', session_id=2, log_experiment=True, experiment_name='sysef prediction')
best_model = compare_models(fold=5)

Unnamed: 0,Description,Value
0,Session id,2
1,Target,ch_sysef
2,Target type,Regression
3,Original data shape,"(210699, 7)"
4,Transformed data shape,"(210699, 7)"
5,Transformed train set shape,"(147489, 7)"
6,Transformed test set shape,"(63210, 7)"
7,Numeric features,6
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,0.0101,0.0003,0.0159,0.8167,0.0102,0.0185,3.86
rf,Random Forest Regressor,0.0102,0.0003,0.016,0.813,0.0103,0.0186,9.094
catboost,CatBoost Regressor,0.0112,0.0003,0.0166,0.8,0.0107,0.0204,5.272
lightgbm,Light Gradient Boosting Machine,0.012,0.0003,0.0175,0.7768,0.0113,0.022,0.262
gbr,Gradient Boosting Regressor,0.0139,0.0004,0.0196,0.719,0.0127,0.0254,4.708
knn,K Neighbors Regressor,0.0136,0.0004,0.0211,0.6766,0.0136,0.0249,0.108
dt,Decision Tree Regressor,0.0139,0.0005,0.0222,0.6425,0.0143,0.0255,0.316
br,Bayesian Ridge,0.0196,0.0006,0.0252,0.538,0.0163,0.0359,0.062
lar,Least Angle Regression,0.0196,0.0006,0.0252,0.538,0.0163,0.0359,0.06
ridge,Ridge Regression,0.0196,0.0006,0.0252,0.538,0.0163,0.0359,0.756


In [5]:

save_model(best_model, 'sysef_prediction_log')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['index', 'lift', 'sys_cl',
                                              'ct_tot_kw', 'ch_run', 'h_cwst'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model',
                  ExtraTreesRegressor(n_jobs=-1, random_state=2))]),
 'sysef_prediction_log.pkl')

In [6]:
pred_holdouts = predict_model(best_model)
pred_holdouts.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Extra Trees Regressor,0.0099,0.0002,0.0154,0.8266,0.0099,0.0182


Unnamed: 0,index,lift,sys_cl,ct_tot_kw,ch_run,h_cwst,ch_sysef,prediction_label
124751,439078,1.947158,1.535494,1.977941,1,28.716831,0.587793,0.581855
194067,679704,1.975183,1.511488,1.611409,1,29.246607,0.561075,0.578252
128104,463344,1.984645,1.513354,1.989372,0,28.390808,0.595719,0.594175
145886,549694,1.996245,1.556494,1.966296,1,29.207752,0.633542,0.625386
202876,704317,1.937107,1.417711,0.995009,1,28.499374,0.502085,0.544002
