In [21]:
import numpy as np
import pandas as pd
import psycopg2
import os
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cbook as cbook
import plotly.express as px

from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.compose import make_reduction
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error, r2_score)
from sklearn.ensemble import RandomForestRegressor
from sktime.forecasting.model_selection import ForecastingGridSearchCV, SlidingWindowSplitter, ExpandingWindowSplitter
from sklearn.model_selection import GridSearchCV

In [2]:
# getting the name of the directory
# where the this file is present.
current = os.path.dirname(os.path.abspath("__file__"))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# Getting the parent directory name
gr_parent = os.path.dirname(parent)

# adding the parent directory to
# the sys.path.
sys.path.append(gr_parent)

sys.path.insert(0, "..//skk_analytics")

In [3]:
from connection import *
from utils import *

In [4]:
file_config = gr_parent + "\\database.ini"
print(file_config)

sql_file = os.path.join(parent, 'sql\\lng_prod_tangguh_data_query.sql')
print(sql_file)

d:\Users\kusumy\Documents\Code\Python\skk\skk_analytics\database.ini
d:\Users\kusumy\Documents\Code\Python\skk\skk_analytics\gas_prod\sql\lng_prod_tangguh_data_query.sql


In [5]:
conn = create_db_connection(filename=file_config, section='postgresql_ml_lng_skk')
if conn == None:
    exit()

#Load Data from Database
query_1 = open(sql_file, mode="rt").read()
data = get_sql_data(query_1, conn)

data['date'] = pd.DatetimeIndex(data['date'], freq='D')
data = data.reset_index()

  data = pd.read_sql_query(sql, conn)


In [7]:
ds = 'date'
y = 'lng_production' 

df = data[[ds,y]]
df = df.set_index(ds)
df.index = pd.DatetimeIndex(df.index, freq='D')

In [10]:
# Test size
test_size = 0.2
# Split data (original data)
y_train, y_test = temporal_train_test_split(df, test_size=test_size)

# Horizon
fh = ForecastingHorizon(y_test.index, is_relative=False)

In [33]:
rf_n_estimators = 157
rf_lags = 27 #1, 6, 27
rf_random_state = 0
rf_criterion = "squared_error"
rf_strategy = "recursive"
#n_estimators_param_grid = {"n_estimators": [100, 150, 200, 300]}
forecaster_param_grid = {"window_length": [1, 7, 21], 
                         "estimator__n_estimators": [100, 200]}

In [34]:

# create regressor object
#rf_regressor = RandomForestRegressor(n_estimators = rf_n_estimators, random_state = rf_random_state, criterion = rf_criterion)
rf_regressor = RandomForestRegressor(random_state = rf_random_state, criterion = rf_criterion, n_jobs=-1)
#rf_forecaster = make_reduction(rf_regressor, window_length = rf_lags, strategy = rf_strategy)
rf_forecaster = make_reduction(rf_regressor, strategy = rf_strategy)

cv = ExpandingWindowSplitter(fh=len(fh), initial_window=365*2, step_length=14)
gscv = ForecastingGridSearchCV(rf_forecaster, cv=cv, param_grid=forecaster_param_grid, n_jobs=-1)


In [35]:

print("Creating Random Forest Model ...")
#rf_forecaster.fit(y_train) #, X_train
gscv.fit(y_train) #, X_train


Creating Random Forest Model ...


In [25]:

print("Random Forest Model Prediction ...")
#rf_forecast = rf_forecaster.predict(fh) #, X=X_test
rf_forecast = gscv.predict(fh) #, X=X_test

Random Forest Model Prediction ...


In [26]:
y_pred_rf = pd.DataFrame(rf_forecast).applymap('{:.2f}'.format)
y_pred_rf

Unnamed: 0,lng_production
2021-05-13,25761.07
2021-05-14,25934.26
2021-05-15,26009.39
2021-05-16,25931.88
2021-05-17,26009.39
...,...
2022-09-10,25931.88
2022-09-11,26009.39
2022-09-12,25931.88
2022-09-13,26009.39


In [27]:
#Create MAPE
rf_mape = mean_absolute_percentage_error(y_test['lng_production'], rf_forecast)
ranfor_mape_str = str('MAPE: %.4f' % rf_mape)
print("Random Forest Model "+ranfor_mape_str)

Random Forest Model MAPE: 0.4239


In [28]:
#Get Parameters
#rf_param = str(rf_forecaster.get_params())
rf_param = str(gscv.get_params())
print("Random Forest Model Parameters "+rf_param)

Random Forest Model Parameters {'backend': 'loky', 'cv__fh': 490, 'cv__initial_window': 730, 'cv__start_with_window': True, 'cv__step_length': 7, 'cv': ExpandingWindowSplitter(fh=490, initial_window=730, step_length=7), 'error_score': nan, 'forecaster__estimator__bootstrap': True, 'forecaster__estimator__ccp_alpha': 0.0, 'forecaster__estimator__criterion': 'squared_error', 'forecaster__estimator__max_depth': None, 'forecaster__estimator__max_features': 1.0, 'forecaster__estimator__max_leaf_nodes': None, 'forecaster__estimator__max_samples': None, 'forecaster__estimator__min_impurity_decrease': 0.0, 'forecaster__estimator__min_samples_leaf': 1, 'forecaster__estimator__min_samples_split': 2, 'forecaster__estimator__min_weight_fraction_leaf': 0.0, 'forecaster__estimator__n_estimators': 100, 'forecaster__estimator__n_jobs': None, 'forecaster__estimator__oob_score': False, 'forecaster__estimator__random_state': 0, 'forecaster__estimator__verbose': 0, 'forecaster__estimator__warm_start': Fal

In [29]:
gscv.best_params_

{'window_length': 1}