In [40]:
import numpy as np
import pandas as pd
import psycopg2
import os
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import plotly.express as px

from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.compose import make_reduction
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error, r2_score)
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.model_selection import ForecastingGridSearchCV, SlidingWindowSplitter, ExpandingWindowSplitter, SingleWindowSplitter
from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError
from sklearn.model_selection import GridSearchCV

In [2]:
# getting the name of the directory
# where the this file is present.
current = os.path.dirname(os.path.abspath("__file__"))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# Getting the parent directory name
gr_parent = os.path.dirname(parent)

# adding the parent directory to
# the sys.path.
sys.path.append(gr_parent)

sys.path.insert(0, "..//skk_analytics")

In [3]:
from connection import *
from utils import *

In [4]:
file_config = gr_parent + "\\database.ini"
print(file_config)

sql_file = os.path.join(parent, 'sql\\lng_prod_tangguh_data_query.sql')
print(sql_file)

d:\Users\kusumy\Documents\Code\Python\skk\skk_analytics\database.ini
d:\Users\kusumy\Documents\Code\Python\skk\skk_analytics\gas_prod\sql\lng_prod_tangguh_data_query.sql


In [5]:
conn = create_db_connection(filename=file_config, section='postgresql_ml_lng_skk')
if conn == None:
    exit()

#Load Data from Database
query_1 = open(sql_file, mode="rt").read()
data = get_sql_data(query_1, conn)

data['date'] = pd.DatetimeIndex(data['date'], freq='D')
data = data.reset_index()

  data = pd.read_sql_query(sql, conn)


In [6]:
ds = 'date'
y = 'lng_production' 

df = data[[ds,y]]
df = df.set_index(ds)
df.index = pd.DatetimeIndex(df.index, freq='D')

In [7]:
# Test size
test_size = 0.2
# Split data (original data)
y_train, y_test = temporal_train_test_split(df, test_size=test_size)

# Horizon
fh = ForecastingHorizon(y_test.index, is_relative=False)
fh_int = np.arange(1, len(fh))

In [8]:
## Create Exogenous Variable
df['month'] = [i.month for i in df.index]
df['planned_shutdown'] = data['planned_shutdown'].values
df['day'] = [i.day for i in df.index]

 # Split into train and test
X_train, X_test = temporal_train_test_split(df.iloc[:,1:], test_size=test_size)
exogenous_features = ["month", "day", "planned_shutdown"]

In [32]:
rf_n_estimators = 200
rf_lags = 27 #1, 6, 27
rf_random_state = 0
rf_criterion = "squared_error"
rf_strategy = "recursive"
#n_estimators_param_grid = {"n_estimators": [100, 150, 200, 300]}
forecaster_param_grid = {"window_length": [1, 7, 14, 21, 30], 
                         "estimator__n_estimators": [200, 300]}

In [42]:

# create regressor object
#rf_regressor = RandomForestRegressor(n_estimators = rf_n_estimators, random_state = rf_random_state, criterion = rf_criterion)
rf_regressor = RandomForestRegressor(random_state = rf_random_state, criterion = rf_criterion, n_jobs=-1)
#rf_forecaster = make_reduction(rf_regressor, window_length = rf_lags, strategy = rf_strategy)
rf_forecaster = make_reduction(rf_regressor, strategy = rf_strategy)

mape = MeanAbsolutePercentageError(symmetric=False)
#cv = ExpandingWindowSplitter(fh=int(len(fh)), initial_window=365*2, step_length=30)
#cv = SlidingWindowSplitter(window_length=365*2, step_length=7, fh=fh_int)
cv = SingleWindowSplitter(fh=fh_int)
gscv = ForecastingGridSearchCV(rf_forecaster, cv=cv, param_grid=forecaster_param_grid, n_jobs=-1, scoring=mape)


In [None]:
list(cv.split_series(y_train))


In [43]:
X_train = X_train.asfreq('D')

print("Creating Random Forest Model ...")
#rf_forecaster.fit(y_train) #, X_train
gscv.fit(y_train, X_train) #, X_train


Creating Random Forest Model ...


In [45]:
gscv.cv_results_

Unnamed: 0,mean_test_MeanAbsolutePercentageError,mean_fit_time,mean_pred_time,params,rank_test_MeanAbsolutePercentageError
0,1.474057e+18,1.246639,19.883052,"{'estimator__n_estimators': 200, 'window_lengt...",6.0
1,1.461527e+18,0.584557,20.572203,"{'estimator__n_estimators': 200, 'window_lengt...",5.0
2,1.2716e+18,1.548274,19.601172,"{'estimator__n_estimators': 200, 'window_lengt...",1.0
3,1.476423e+18,4.101357,17.208071,"{'estimator__n_estimators': 200, 'window_lengt...",7.0
4,1.290622e+18,4.716171,16.438466,"{'estimator__n_estimators': 200, 'window_lengt...",3.0
5,1.795196e+18,1.692149,23.839889,"{'estimator__n_estimators': 300, 'window_lengt...",9.0
6,1.77641e+18,2.993686,22.535591,"{'estimator__n_estimators': 300, 'window_lengt...",8.0
7,1.285848e+18,3.774955,21.509632,"{'estimator__n_estimators': 300, 'window_lengt...",2.0
8,1.79596e+18,4.657068,20.973312,"{'estimator__n_estimators': 300, 'window_lengt...",10.0
9,1.295214e+18,5.081216,20.450197,"{'estimator__n_estimators': 300, 'window_lengt...",4.0


In [46]:
gscv.best_forecaster_

In [47]:
gscv.best_params_

{'estimator__n_estimators': 200, 'window_length': 14}

In [49]:
print("Random Forest Model Prediction ...")
#rf_forecast = rf_forecaster.predict(fh) #, X=X_test
rf_forecast = gscv.predict(fh, X=X_test)#, X=X_test

Random Forest Model Prediction ...


In [50]:
y_pred_rf = pd.DataFrame(rf_forecast).applymap('{:.2f}'.format)
y_pred_rf

Unnamed: 0,lng_production
2021-05-13,29452.97
2021-05-14,35374.24
2021-05-15,39095.75
2021-05-16,41188.54
2021-05-17,41608.67
...,...
2022-09-10,48666.26
2022-09-11,48700.03
2022-09-12,47911.36
2022-09-13,47694.78


In [51]:
#Create MAPE
rf_mape = mean_absolute_percentage_error(y_test['lng_production'], rf_forecast)
ranfor_mape_str = str('MAPE: %.4f' % rf_mape)
print("Random Forest Model "+ranfor_mape_str)

Random Forest Model MAPE: 0.2273


In [52]:
#Get Parameters
#rf_param = str(rf_forecaster.get_params())
rf_param = str(gscv.get_params())
print("Random Forest Model Parameters "+rf_param)

Random Forest Model Parameters {'backend': 'loky', 'cv__fh': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
   