In [46]:
from snowflake.snowpark.session import Session
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.window import Window
from sklearn import preprocessing # https://github.com/Snowflake-Labs/snowpark-python-demos/tree/main/sp4py_utilities
from snowflake.snowpark.functions import col

import getpass
import pandas as pd
import matplotlib.pyplot as plt

In [47]:
accountname = 'nj07294.ap-southeast-1'
#accountname = getpass.getpass() # ORGNAME-ACCOUNTNAME (separated by minus sign)

In [48]:
username = getpass.getpass()    # SNOWFLAKE-USERNAME

In [49]:
password = getpass.getpass()    # SNOWFLAKE-PASSWORD

In [50]:
connection_parameters = {
    "account": accountname,
    "user": username,
    "password": password,
    "role": "ACCOUNTADMIN",
    "database": "FROSTBYTE_TASTY_BYTES",

    "warehouse": "HOL_WH"
}

session = Session.builder.configs(connection_parameters).create()

In [51]:
session.use_schema("ANALYTICS")
X_final_scaled=session.sql("Select * from Sales_Forecast_Training_Data").to_pandas()

In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

# Split the dataset into features (X) and target (y)
X = X_final_scaled.drop("PROFIT",axis=1)
y = X_final_scaled["PROFIT"]

# Split the dataset into training and testing datasets
X_train, X_holdout_test, y_train, y_holdout_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_holdout, X_test, y_holdout, y_test = train_test_split(X_holdout_test, y_holdout_test, test_size=0.2, random_state=42)

In [53]:
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)

In [54]:
print('Train MSE is: ', mean_squared_error(lgbm.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(lgbm.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(lgbm.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(lgbm.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(lgbm.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(lgbm.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(lgbm.predict(X_train), y_train))
print('Test R2 is: ', r2_score(lgbm.predict(X_test), y_test))

Train MSE is:  2014136.3855472703
Test MSE is:  2289623.987367086

Train RMSE is:  1419.2027288401296
Test RMSE is:  1513.150351870919

Train MAE is:  1063.968459036136
Test MAE is:  1105.815378228909

Train R2 is:  0.9527253423956941
Test R2 is:  0.9448532150166624


In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import seaborn as sns
import scipy.stats as stats

lgbm = LGBMRegressor()
param_grid = {'learning_rate': [0.02,0.03,0.04,0.05],
                  'subsample'    : [0.9, 0.5, 0.2],
                  'n_estimators' : [500,1000, 1500],
                  'max_depth'    : [4,6,8,10,12]
                 }

grid_lgbm = GridSearchCV(estimator=lgbm, param_grid = param_grid, cv = 2, n_jobs=-1)
grid_lgbm.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_lgbm.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_lgbm.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_lgbm.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 LGBMRegressor(learning_rate=0.05, max_depth=10, n_estimators=1500,
              subsample=0.9)

 The best score across ALL searched params:
 0.967533176646814

 The best parameters across ALL searched params:
 {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 1500, 'subsample': 0.9}


In [66]:
lgbm = LGBMRegressor(learning_rate= 0.05, max_depth= 10, n_estimators= 1500, subsample= 0.9)
lgbm.fit(X_train, y_train)
print('Train MSE is: ', mean_squared_error(lgbm.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(lgbm.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(lgbm.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(lgbm.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(lgbm.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(lgbm.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(lgbm.predict(X_train), y_train))
print('Test R2 is: ', r2_score(lgbm.predict(X_test), y_test))

Train MSE is:  389772.8279311479
Test MSE is:  715862.4365737954

Train RMSE is:  624.317890125814
Test RMSE is:  846.0865420119832

Train MAE is:  434.1677298911734
Test MAE is:  573.5350853583703

Train R2 is:  0.9913604554995827
Test R2 is:  0.9836946244696078


In [57]:
from xgboost import XGBRegressor
xgb = XGBRegressor()
xgb.fit(X_train, y_train)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [58]:
print('Train MSE is: ', mean_squared_error(xgb.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(xgb.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(xgb.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(xgb.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(xgb.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(xgb.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(xgb.predict(X_train), y_train))
print('Test R2 is: ', r2_score(xgb.predict(X_test), y_test))

Train MSE is:  771526.7620708488
Test MSE is:  1236772.561729723

Train RMSE is:  878.3659613571377
Test RMSE is:  1112.102765813359

Train MAE is:  641.8070704684887
Test MAE is:  777.6534032164298

Train R2 is:  0.9827021720783029
Test R2 is:  0.9715130780191024


In [59]:
xgb = XGBRegressor()
param_grid = {'learning_rate': [0.01,0.05,0.1,0.15,0.2],
                  'subsample'    : [0.6, 0.8, 1.0],
                  'n_estimators' : [100,200,300,400],
                  'max_depth'    : [4,6,8,10,12,14]
                 }

grid_xgb = GridSearchCV(estimator=xgb, param_grid = param_grid, cv = 2, n_jobs=-1)
grid_xgb.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_xgb.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_xgb.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_xgb.best_params_)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


 Results from Grid Search 

 The best estimator across ALL searched params:
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=400, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

 The best score across ALL searched params:
 0.9757766644663006

 The best parameters across ALL searched params:
 {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 400, 'subsample': 0.8}


In [60]:
xgb = XGBRegressor(learning_rate= 0.1, max_depth= 10, n_estimators= 400, subsample= 0.8)
xgb.fit(X_train, y_train)
print('Train MSE is: ', mean_squared_error(xgb.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(xgb.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(xgb.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(xgb.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(xgb.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(xgb.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(xgb.predict(X_train), y_train))
print('Test R2 is: ', r2_score(xgb.predict(X_test), y_test))

Train MSE is:  1815.4656023779064
Test MSE is:  146385.32354208705

Train RMSE is:  42.608280913196985
Test RMSE is:  382.60335014488186

Train MAE is:  28.535682923807787
Test MAE is:  164.51858717007198

Train R2 is:  0.9999608631922685
Test R2 is:  0.9967669455478382


In [61]:
xgb = XGBRegressor()
param_grid = {'learning_rate': [0.02,0.03,0.04,0.05],
                  'subsample'    : [0.9, 0.5, 0.2],
                  'n_estimators' : [500,1000, 1500],
                  'max_depth'    : [4,6,8,10,12]
                 }

grid_xgb = GridSearchCV(estimator=xgb, param_grid = param_grid, cv = 2, n_jobs=-1)
grid_xgb.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_xgb.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_xgb.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_xgb.best_params_)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


 Results from Grid Search 

 The best estimator across ALL searched params:
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.04, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1500, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

 The best score across ALL searched params:
 0.9765229712336982

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 10, 'n_estimators': 1500, 'subsample': 0.9}


In [64]:
xgb = XGBRegressor(learning_rate= 0.04, max_depth= 10, n_estimators= 1500, subsample= 0.9)
xgb.fit(X_train, y_train)
print('Train MSE is: ', mean_squared_error(xgb.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(xgb.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(xgb.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(xgb.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(xgb.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(xgb.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(xgb.predict(X_train), y_train))
print('Test R2 is: ', r2_score(xgb.predict(X_test), y_test))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train MSE is:  269.675436680646
Test MSE is:  121178.14241816645

Train RMSE is:  16.421797608077075
Test RMSE is:  348.1065101634361

Train MAE is:  10.917472028198471
Test MAE is:  139.92044363446604

Train R2 is:  0.9999941906116675
Test R2 is:  0.9973240042565259


In [65]:
xgb = XGBRegressor()
param_grid = {'learning_rate': [0.04],
                  'subsample'    : [0.9],
                  'n_estimators' : [1500,1600,1700],
                  'max_depth'    : [10]
                 }

grid_xgb = GridSearchCV(estimator=xgb, param_grid = param_grid, cv = 2, n_jobs=-1)
grid_xgb.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_xgb.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_xgb.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_xgb.best_params_)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


 Results from Grid Search 

 The best estimator across ALL searched params:
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.04, max_delta_step=0,
             max_depth=10, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1700, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='exact', validate_parameters=1, verbosity=None)

 The best score across ALL searched params:
 0.9765622664428726

 The best parameters across ALL searched params:
 {'learning_rate': 0.04, 'max_depth': 10, 'n_estimators': 1700, 'subsample': 0.9}


In [67]:
xgb = XGBRegressor(learning_rate= 0.04, max_depth= 10, n_estimators= 1700, subsample= 0.9)
xgb.fit(X_train, y_train)
print('Train MSE is: ', mean_squared_error(xgb.predict(X_train), y_train))
print('Test MSE is: ', mean_squared_error(xgb.predict(X_test), y_test))
print()
print('Train RMSE is: ',  math.sqrt(mean_squared_error(xgb.predict(X_train), y_train)))
print('Test RMSE is: ', math.sqrt(mean_squared_error(xgb.predict(X_test), y_test)))
print()
print('Train MAE is: ', mean_absolute_error(xgb.predict(X_train), y_train))
print('Test MAE is: ', mean_absolute_error(xgb.predict(X_test), y_test))
print()
print('Train R2 is: ', r2_score(xgb.predict(X_train), y_train))
print('Test R2 is: ', r2_score(xgb.predict(X_test), y_test))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Train MSE is:  160.7802588711393
Test MSE is:  120026.35909454244

Train RMSE is:  12.679915570347434
Test RMSE is:  346.4482055005372

Train MAE is:  8.476168548338265
Test MAE is:  137.24512322204365

Train R2 is:  0.9999965367888077
Test R2 is:  0.9973498725796527
