# Import Libraries

In [164]:
import numpy as np
import pandas as pd
import psycopg2
import os
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import plotly.express as px

from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.compose import make_reduction
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error, r2_score)
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.model_selection import ForecastingGridSearchCV, ForecastingRandomizedSearchCV, SlidingWindowSplitter, ExpandingWindowSplitter, SingleWindowSplitter
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError
from sklearn.model_selection import GridSearchCV

from xgboost import XGBRegressor
from sktime.forecasting.fbprophet import Prophet

# Set Project Path

In [2]:
# getting the name of the directory
# where the this file is present.
current = os.path.dirname(os.path.abspath("__file__"))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# Getting the parent directory name
gr_parent = os.path.dirname(parent)

# adding the parent directory to
# the sys.path.
sys.path.append(gr_parent)

sys.path.insert(0, "..//skk_analytics")

In [3]:
from connection import *
from utils import *

In [4]:
file_config = gr_parent + "\\database.ini"
print(file_config)

sql_file = os.path.join(parent, 'sql\\lng_prod_tangguh_data_query.sql')
print(sql_file)

d:\Users\kusumy\Documents\Code\Python\skk\skk_analytics\database.ini
d:\Users\kusumy\Documents\Code\Python\skk\skk_analytics\gas_prod\sql\lng_prod_tangguh_data_query.sql


# Get Data from Database

In [5]:
conn = create_db_connection(filename=file_config, section='postgresql_ml_lng_skk')
if conn == None:
    exit()

#Load Data from Database
query_1 = open(sql_file, mode="rt").read()
data = get_sql_data(query_1, conn)

data['date'] = pd.DatetimeIndex(data['date'], freq='D')
data = data.reset_index()

  data = pd.read_sql_query(sql, conn)


In [6]:
ds = 'date'
y = 'lng_production' 

df = data[[ds,y]]
df = df.set_index(ds)
df.index = pd.DatetimeIndex(df.index, freq='D')

# Time Series Split

In [7]:
# Test size
test_size = 0.2
# Split data (original data)
y_train, y_test = temporal_train_test_split(df, test_size=test_size)

# Horizon
fh = ForecastingHorizon(y_test.index, is_relative=False)
fh_int = np.arange(1, len(fh))

# Create Additional Regressor (Exogenous)

In [8]:
## Create Exogenous Variable
df['month'] = [i.month for i in df.index]
df['planned_shutdown'] = data['planned_shutdown'].values
df['day'] = [i.day for i in df.index]

 # Split into train and test
X_train, X_test = temporal_train_test_split(df.iloc[:,1:], test_size=test_size)
exogenous_features = ["month", "day", "planned_shutdown"]

# Build Model

In [54]:
# Model scoring for Cross Validation
mape = MeanAbsolutePercentageError(symmetric=False)

## Random Forest

### Define Model Parameters

In [32]:
rf_n_estimators = 200
rf_lags = 27 #1, 6, 27
rf_random_state = 0
rf_criterion = "squared_error"
rf_strategy = "recursive"
#n_estimators_param_grid = {"n_estimators": [100, 150, 200, 300]}
forecaster_param_grid = {"window_length": [1, 7, 14, 21, 30], 
                         "estimator__n_estimators": [200, 300]}

# Create regressor object
#rf_regressor = RandomForestRegressor(n_estimators = rf_n_estimators, random_state = rf_random_state, criterion = rf_criterion)
rf_regressor = RandomForestRegressor(random_state = rf_random_state, criterion = rf_criterion, n_jobs=-1)
#rf_forecaster = make_reduction(rf_regressor, window_length = rf_lags, strategy = rf_strategy)
rf_forecaster = make_reduction(rf_regressor, strategy = rf_strategy)

# Define Cross Validation object
#cv = ExpandingWindowSplitter(fh=int(len(fh)), initial_window=365*2, step_length=30)
#cv = SlidingWindowSplitter(window_length=365*2, step_length=7, fh=fh_int)
cv = SingleWindowSplitter(fh=fh_int)
gscv = ForecastingGridSearchCV(rf_forecaster, cv=cv, param_grid=forecaster_param_grid, n_jobs=-1, scoring=mape)


In [None]:
# Check splitter data split
list(cv.split_series(y_train))

### Perform Cross Validation

In [43]:
X_train = X_train.asfreq('D')

# Perform Cross Validation Model
print("Creating Random Forest Model ...")
#rf_forecaster.fit(y_train) #, X_train
gscv.fit(y_train, X_train) #, X_train

Creating Random Forest Model ...


In [45]:
# Show top 10 best models based on scoring function
gscv.cv_results_

Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
0,1.474057e+18,1.246639,19.883052,"{'estimator__n_estimators': 200, 'window_lengt...",6.0
1,1.461527e+18,0.584557,20.572203,"{'estimator__n_estimators': 200, 'window_lengt...",5.0
2,1.2716e+18,1.548274,19.601172,"{'estimator__n_estimators': 200, 'window_lengt...",1.0
3,1.476423e+18,4.101357,17.208071,"{'estimator__n_estimators': 200, 'window_lengt...",7.0
4,1.290622e+18,4.716171,16.438466,"{'estimator__n_estimators': 200, 'window_lengt...",3.0
5,1.795196e+18,1.692149,23.839889,"{'estimator__n_estimators': 300, 'window_lengt...",9.0
6,1.77641e+18,2.993686,22.535591,"{'estimator__n_estimators': 300, 'window_lengt...",8.0
7,1.285848e+18,3.774955,21.509632,"{'estimator__n_estimators': 300, 'window_lengt...",2.0
8,1.79596e+18,4.657068,20.973312,"{'estimator__n_estimators': 300, 'window_lengt...",10.0
9,1.295214e+18,5.081216,20.450197,"{'estimator__n_estimators': 300, 'window_lengt...",4.0


In [47]:
# Show best model parameters
gscv.best_params_

{'estimator__n_estimators': 200, 'window_length': 14}

### Perform Prediction Based on Best Model

In [49]:
print("Random Forest Model Prediction ...")
#rf_forecast = rf_forecaster.predict(fh) #, X=X_test
rf_forecast = gscv.best_forecaster_.predict(fh, X=X_test)#, X=X_test

Random Forest Model Prediction ...


### Model Performance

In [51]:
# Create MAPE
y_pred_rf = pd.DataFrame(rf_forecast).applymap('{:.2f}'.format)
rf_mape = mean_absolute_percentage_error(y_test['lng_production'], y_pred_rf)
ranfor_mape_str = str('MAPE: %.4f' % rf_mape)
print("Random Forest Model "+ranfor_mape_str)

Random Forest Model MAPE: 0.2273


In [None]:
#Get Parameters
#rf_param = str(rf_forecaster.get_params())
rf_param = str(gscv.get_params())
print("Random Forest Model Parameters "+rf_param)

## XGBoost

### Define Model Parameters

In [154]:
xgb_objective = 'reg:squarederror'
xgb_lags = 6 #1, 6, 27
xgb_strategy = "recursive"
xgb_forecaster_param_grid = {"window_length": [1, 6, 7, 14, 21, 27, 32]
                            ,"estimator__n_estimators": [100, 200, 300]
                            #,"estimator__max_depth": [3,6,10],
                            #,"estimator__learning_rate": [0.01, 0.05, 0.1, 0.3],
                            #,"estimator__colsample_bytree": [0.3, 0.5, 0.7]
                            }

xgb_regressor = XGBRegressor(objective=xgb_objective, n_jobs=-1, seed = 42)
xgb_forecaster = make_reduction(xgb_regressor, strategy=xgb_strategy)

cv_xgb = SingleWindowSplitter(fh=fh_int)
#gscv_xgb = ForecastingRandomizedSearchCV(xgb_forecaster, cv=cv_xgb, param_grid=xgb_forecaster_param_grid, n_jobs=-1, scoring=mape)
gscv_xgb = ForecastingGridSearchCV(xgb_forecaster, cv=cv_xgb, param_grid=xgb_forecaster_param_grid, n_jobs=-1, scoring=mape)


In [163]:
# Check splitter data split
#list(cv_xgb.split_series(y_train))

### Perform Cross Validation

In [156]:
X_train = X_train.asfreq('D')

# Perform Cross Validation Model
print("Creating XGBoost Model ...")
#rf_forecaster.fit(y_train) #, X_train
gscv_xgb.fit(y_train, X_train) #, X_train

Creating XGBoost Model ...


In [157]:
# Show top 10 best models based on scoring function
gscv_xgb.cv_results_.sort_values(by='rank_test_MeanAbsolutePercentageError', ascending=True)

Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
0,1.31869e+18,0.1542,0.667362,"{'estimator__n_estimators': 100, 'window_lengt...",1.0
12,1.419927e+18,3.033859,0.457291,"{'estimator__n_estimators': 200, 'window_lengt...",2.0
8,1.54994e+18,0.896029,0.653143,"{'estimator__n_estimators': 200, 'window_lengt...",3.0
9,1.552514e+18,0.915611,0.598554,"{'estimator__n_estimators': 200, 'window_lengt...",4.0
10,1.613354e+18,1.830381,0.574796,"{'estimator__n_estimators': 200, 'window_lengt...",5.0
17,1.61342e+18,2.144989,0.42777,"{'estimator__n_estimators': 300, 'window_lengt...",6.0
11,1.632736e+18,2.372885,0.419667,"{'estimator__n_estimators': 200, 'window_lengt...",7.0
5,1.650174e+18,1.512715,0.587686,"{'estimator__n_estimators': 100, 'window_lengt...",8.0
3,1.662578e+18,0.910999,0.627126,"{'estimator__n_estimators': 100, 'window_lengt...",9.0
19,1.676795e+18,3.414725,0.310947,"{'estimator__n_estimators': 300, 'window_lengt...",10.0


In [159]:
# Show best model parameters
gscv_xgb.best_params_

{'estimator__n_estimators': 100, 'window_length': 1}

### Perform Prediction Based on Best Model

In [160]:
print("XGBoost Model Prediction ...")
#rf_forecast = rf_forecaster.predict(fh) #, X=X_test
xgb_forecast = gscv_xgb.best_forecaster_.predict(fh, X=X_test)#, X=X_test

XGBoost Model Prediction ...


### Model Performance

In [161]:
# Create MAPE
y_pred_xgb = pd.DataFrame(xgb_forecast).applymap('{:.2f}'.format)
#y_pred_xgb
xgb_mape = mean_absolute_percentage_error(y_test['lng_production'], y_pred_xgb)
xgb_mape_str = str('MAPE: %.4f' % xgb_mape)
print("XGBoost Model "+ xgb_mape_str)

XGBoost Model MAPE: 0.1328


In [None]:
#Get Parameters
#rf_param = str(rf_forecaster.get_params())
xgb_param = str(gscv_xgb.get_params())
print("Random Forest Model Parameters "+xgb_param)

## Prophet

### Define Model Parameters

In [202]:
seasonality_mode = 'additive'
n_changepoints = 27 #1, 6, 27
seasonality_prior_scale = 0.05
changepoint_prior_scale = 0.1
holidays_prior_scale = 8
daily_seasonality = 8
weekly_seasonality = 1
yearly_seasonality = 10

prophet_param_grid = {'n_changepoints':[1,5]
                      ,'seasonality_prior_scale':[0.05, 0.1]
                      ,'changepoint_prior_scale':[0.1, 0.5]
                      ,'daily_seasonality':[8,10]
                      ,'weekly_seasonality':[8,10]
                      ,'yearly_seasonality':[8,10]
                     }


# create regressor object
prophet_forecaster = Prophet()

cv_prophet = SingleWindowSplitter(fh=fh_int)
#gscv_xgb = ForecastingRandomizedSearchCV(xgb_forecaster, cv=cv_xgb, param_grid=xgb_forecaster_param_grid, n_jobs=-1, scoring=mape)
gscv_prophet = ForecastingGridSearchCV(prophet_forecaster, cv=cv_prophet, param_grid=prophet_param_grid, n_jobs=-1, scoring=mape)


In [171]:
gscv_prophet.get_params().keys()

dict_keys(['backend', 'cv__fh', 'cv__window_length', 'cv', 'error_score', 'forecaster__add_country_holidays', 'forecaster__add_seasonality', 'forecaster__alpha', 'forecaster__changepoint_prior_scale', 'forecaster__changepoint_range', 'forecaster__changepoints', 'forecaster__daily_seasonality', 'forecaster__freq', 'forecaster__growth', 'forecaster__growth_cap', 'forecaster__growth_floor', 'forecaster__holidays', 'forecaster__holidays_prior_scale', 'forecaster__mcmc_samples', 'forecaster__n_changepoints', 'forecaster__seasonality_mode', 'forecaster__seasonality_prior_scale', 'forecaster__stan_backend', 'forecaster__uncertainty_samples', 'forecaster__verbose', 'forecaster__weekly_seasonality', 'forecaster__yearly_seasonality', 'forecaster', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_n_best_forecasters', 'scoring__multilevel', 'scoring__multioutput', 'scoring__symmetric', 'scoring', 'strategy', 'update_behaviour', 'verbose'])

### Perform Cross Validation

In [203]:
#X_train = X_train.asfreq('D')

# Perform Cross Validation Model
print("Creating Prophet Model ...")
gscv_prophet.fit(y_train, X_train) #, X_train

Creating Prophet Model ...


15:30:03 - cmdstanpy - INFO - Chain [1] start processing
15:30:03 - cmdstanpy - INFO - Chain [1] done processing
15:30:04 - cmdstanpy - INFO - Chain [1] start processing
15:30:04 - cmdstanpy - INFO - Chain [1] done processing


In [204]:
# Show top 10 best models based on scoring function
gscv_prophet.cv_results_.sort_values(by='rank_test_MeanAbsolutePercentageError', ascending=True)

Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
30,1.924904e+18,1.759478,1.004385,"{'changepoint_prior_scale': 0.1, 'daily_season...",1.0
11,1.927025e+18,1.254016,0.838892,"{'changepoint_prior_scale': 0.1, 'daily_season...",2.0
24,1.931790e+18,1.287318,0.864041,"{'changepoint_prior_scale': 0.1, 'daily_season...",3.0
25,1.932880e+18,1.457312,1.063224,"{'changepoint_prior_scale': 0.1, 'daily_season...",4.0
14,1.935080e+18,10.970710,0.970286,"{'changepoint_prior_scale': 0.1, 'daily_season...",5.0
...,...,...,...,...,...
63,2.105466e+18,0.931381,0.451875,"{'changepoint_prior_scale': 0.5, 'daily_season...",60.0
45,2.105820e+18,1.212615,0.680025,"{'changepoint_prior_scale': 0.5, 'daily_season...",61.0
47,2.107151e+18,1.259850,0.948070,"{'changepoint_prior_scale': 0.5, 'daily_season...",62.0
61,2.107197e+18,0.964414,0.492018,"{'changepoint_prior_scale': 0.5, 'daily_season...",63.0


In [205]:
# Show best model parameters
gscv_prophet.best_params_

{'changepoint_prior_scale': 0.1,
 'daily_seasonality': 10,
 'n_changepoints': 5,
 'seasonality_prior_scale': 0.1,
 'weekly_seasonality': 10,
 'yearly_seasonality': 8}

### Perform Prediction Based on Best Model

In [206]:
print("Prophet Model Prediction ...")
#rf_forecast = rf_forecaster.predict(fh) #, X=X_test
prophet_forecast = gscv_prophet.best_forecaster_.predict(fh, X=X_test)#, X=X_test

Prophet Model Prediction ...


### Model Performance

In [210]:
# Create MAPE
y_pred_prophet = pd.DataFrame(prophet_forecast).applymap('{:.2f}'.format)
#y_pred_xgb
prophet_mape = mean_absolute_percentage_error(y_test['lng_production'], y_pred_prophet)
prophet_mape_str = str('MAPE: %.4f' % prophet_mape)
print("Prophet Model "+ prophet_mape_str)

Prophet Model MAPE: 0.1323


In [None]:
#Get Parameters
#rf_param = str(rf_forecaster.get_params())
prophet_param = str(gscv_prophet.get_params())
print("Random Forest Model Parameters "+prophet_param)