In [2]:
import pandas as pd
import numpy as np
import datetime
import requests
import json
import plotly.graph_objects as go
import plotly.express as px
import warnings
from functions import *

# Notebook-wide settings.
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Let pandas render up to 500 columns / 500 rows before truncating a frame.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

In [3]:
# Query window for the consumption service: a fixed ISO start date (string)
# up to the day the notebook is executed (datetime.date).
end_date = datetime.date.today()
start_date = "2016-01-01"

In [6]:
# Real-time consumption endpoint with the query window embedded as
# startDate / endDate query parameters.
url = (
    "https://seffaflik.epias.com.tr/transparency/service/consumption/real-time-consumption"
    f"?startDate={start_date}&endDate={end_date}"
)

In [14]:
# Download the hourly consumption series and load it into a DataFrame.
# NOTE(review): verify=False disables TLS certificate verification, so the
# payload is not authenticated. It is kept here only to avoid breaking an
# existing workaround -- confirm whether it is still needed and remove it.
# The timeout keeps a stalled connection from hanging the kernel forever.
response = requests.get(url, verify=False, timeout=120)
# Fail loudly on HTTP errors instead of a confusing KeyError/JSON error below.
response.raise_for_status()
json_data = response.json()
# The last record is dropped (presumably the still-incomplete current hour --
# TODO confirm against the service).
df = pd.DataFrame(json_data['body']['hourlyConsumptions']).iloc[:-1]
# Keep only the first 16 characters of the timestamp string before parsing,
# which discards everything after the minutes.
df['date'] = pd.to_datetime(df['date'].str[:16])

In [15]:
# Show the loaded consumption frame (columns: date, consumption).
df

Unnamed: 0,date,consumption
0,2016-01-01 00:00:00,26277.24
1,2016-01-01 01:00:00,24991.82
2,2016-01-01 02:00:00,23532.61
3,2016-01-01 03:00:00,22464.78
4,2016-01-01 04:00:00,22002.91
...,...,...
53033,2022-01-18 17:00:00,47006.64
53034,2022-01-18 18:00:00,46337.93
53035,2022-01-18 19:00:00,44961.57
53036,2022-01-18 20:00:00,42297.02


In [20]:
# Number of hourly timestamps to generate: one week (24*7) plus one.
fh_new = 24 * 7 + 1
# Hourly timestamps starting AT the last observed timestamp, so the first one
# overlaps with `df` and the remaining ones lie in the future.
date = pd.date_range(
    start=df['date'].iloc[-1],
    periods=fh_new,
    freq='H',
    name='date',
).to_frame(index=False)
# The outer merge on 'date' appends the future timestamps to the history;
# their consumption is missing until it is forecast.
df_fe = df.merge(date, how='outer', on='date')

In [23]:
def rolling_features(df, fh, window_offsets=(0, 3, 10, 15, 20, 25),
                     lag_offsets=(0, 5, 10, 15, 20, 30), shift=1,
                     target='consumption'):
    """Add rolling-statistic and lag features of ``target`` to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``target`` column, ordered in time.
    fh : int
        Base horizon in rows; every window and lag is ``fh`` plus an offset.
    window_offsets : iterable of int, optional
        Offsets added to ``fh`` to obtain the rolling window sizes.
    lag_offsets : iterable of int, optional
        Offsets added to ``fh`` to obtain the lag distances.
    shift : int, optional
        Number of rows the rolling statistics are shifted forward by.
    target : str, optional
        Column the features are computed from; also the prefix of the lag
        column names.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``rolling_{mean,std,min,max,var}_<window>`` and
        ``<target>_lag_<lag>`` columns appended; ``df`` itself is not modified.

    Notes
    -----
    NOTE(review): with the default ``shift=1`` a row's rolling statistics use
    values up to the previous row. If rows with a missing target (e.g. future
    rows awaiting a forecast) are appended, their windows contain fewer real
    observations than the windows of historical rows, so the features are
    distributed differently at training and forecast time. Passing
    ``shift=fh`` would avoid that -- confirm which behaviour is wanted.
    """
    df_c = df.copy()
    series = df_c[target]
    for offset in window_offsets:
        window = fh + offset
        # Build the rolling object once per window and reuse it for all stats.
        rolled = series.rolling(window, min_periods=1)
        for stat in ('mean', 'std', 'min', 'max', 'var'):
            df_c[f'rolling_{stat}_{window}'] = getattr(rolled, stat)().shift(shift)
    for offset in lag_offsets:
        lag = fh + offset
        df_c[f'{target}_lag_{lag}'] = series.shift(lag)
    return df_c

In [26]:
def date_features(df):
    """Add calendar features derived from the ``date`` column to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-typed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``month``, ``year``, ``hour``, ``quarter``,
        ``dayofweek``, ``dayofyear``, ``dayofmonth`` and ``weekofyear``
        columns appended; ``df`` itself is not modified.
    """
    df_c = df.copy()
    stamps = df_c['date'].dt
    df_c['month'] = stamps.month
    df_c['year'] = stamps.year
    df_c['hour'] = stamps.hour
    df_c['quarter'] = stamps.quarter
    df_c['dayofweek'] = stamps.dayofweek
    df_c['dayofyear'] = stamps.dayofyear
    df_c['dayofmonth'] = stamps.day
    # `Series.dt.weekofyear` is deprecated and removed in pandas 2.x; the ISO
    # calendar week is its replacement. isocalendar() yields a nullable
    # UInt32 column, so cast back to a plain numeric dtype (float only when
    # missing dates force NaN) to match the other feature columns.
    iso_week = stamps.isocalendar().week
    df_c['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    return df_c

In [24]:
# Feature engineering: rolling/lag statistics first, then calendar fields.
df_fe = (
    df_fe
    .pipe(rolling_features, fh_new)
    .pipe(date_features)
)

In [29]:
# Discard the first fh_new + 30 rows: that is the longest lag built by
# rolling_features (fh + 30), so those rows have incomplete lag features.
# NOTE: this cell is not idempotent -- every re-run trims another
# fh_new + 30 rows from df_fe.
warmup_rows = fh_new + 30
df_fe = df_fe.iloc[warmup_rows:].reset_index(drop=True)

In [35]:
# The first timestamp of the last fh_new rows is the boundary: rows up to and
# including it form the training history, later rows are the ones to forecast.
split_date = df_fe['date'].tail(fh_new).iloc[0]
historical = df_fe[df_fe['date'] <= split_date]

# Target (single-column frame) and feature matrix, both indexed by timestamp.
history_by_date = historical.set_index('date')
y = history_by_date[['consumption']]
X = history_by_date.drop(columns='consumption')

# Feature rows for the forecast window, laid out like X.
forecast_df = (
    df_fe[df_fe['date'] > split_date]
    .drop(columns='consumption')
    .set_index('date')
)

In [41]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from catboost import CatBoostRegressor

In [43]:
# Time-ordered cross-validation: 3 folds, each validated on fh_new*20 rows.
tscv = TimeSeriesSplit(n_splits=3,test_size=fh_new*20)
score_list = []    # validation MAE per fold
unseen_preds = []  # forecast-window predictions per fold
importance = []    # feature importances per fold

for fold, (train_index, test_index) in enumerate(tscv.split(X, y), start=1):
    X_train, X_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    print(X_train.shape,X_val.shape)

    cat = CatBoostRegressor(iterations = 1000, eval_metric='MAE', allow_writing_files=False)
    # NOTE(review): the validation fold drives early stopping here AND is
    # scored below, so the reported MAE is optimistic -- consider a separate
    # hold-out for the score.
    cat.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=150,verbose=50)

    # Each fold's model also predicts the forecast window.
    forecast_predicted = cat.predict(forecast_df)
    unseen_preds.append(forecast_predicted)
    score = mean_absolute_error(y_val, cat.predict(X_val))
    print(f"MAE FOLD-{fold}:{score}")
    score_list.append(score)
    importance.append(cat.get_feature_importance())
print("CV Mean Score:",np.mean(score_list))

(42699, 44) (3380, 44)
Learning rate set to 0.091806
0:	learn: 3861.4641196	test: 4510.6791698	best: 4510.6791698 (0)	total: 76.6ms	remaining: 1m 16s
50:	learn: 1166.6473044	test: 1340.3698808	best: 1340.3698808 (50)	total: 1.47s	remaining: 27.4s
100:	learn: 964.0838745	test: 1213.6377732	best: 1213.6377732 (100)	total: 2.91s	remaining: 25.9s
150:	learn: 845.8053342	test: 1179.4061062	best: 1178.1417695 (149)	total: 4.18s	remaining: 23.5s
200:	learn: 766.6968175	test: 1152.8799494	best: 1152.7168697 (197)	total: 5.19s	remaining: 20.6s
250:	learn: 709.7346142	test: 1139.8842991	best: 1138.0347728 (246)	total: 6.37s	remaining: 19s
300:	learn: 665.4480190	test: 1131.2693039	best: 1131.2596166 (299)	total: 7.52s	remaining: 17.5s
350:	learn: 628.6577001	test: 1144.7728306	best: 1129.2228915 (317)	total: 8.62s	remaining: 15.9s
400:	learn: 598.4157710	test: 1158.5788623	best: 1129.2228915 (317)	total: 9.67s	remaining: 14.5s
450:	learn: 572.9080047	test: 1188.2192799	best: 1129.2228915 (317)	t

In [47]:
# Use the predictions of the LAST cross-validation fold. Indexing with [-1]
# rather than a hard-coded [2] keeps this correct if n_splits changes.
# (TimeSeriesSplit training sets grow fold by fold, so the last model was fit
# on the most history.)
forecasted = pd.DataFrame({"forecasting": unseen_preds[-1]}, index=forecast_df.index)

In [58]:
# Plot the last fh_new*5 rows of the feature frame together with the forecast.
# Trace names are Turkish: 'Tarihsel Veri' = historical data, 'Öngörü' = forecast.
recent_rows = slice(-fh_new*5, None)
history_trace = go.Scatter(
    x=df_fe.date.iloc[recent_rows],
    y=df_fe.consumption.iloc[recent_rows],
    name='Tarihsel Veri',
    mode='lines',
)
forecast_trace = go.Scatter(
    x=forecasted.index,
    y=forecasted['forecasting'],
    name='Öngörü',
    mode='lines',
)
fig1 = go.Figure()
# add_traces returns the figure, so as the cell's last expression it is displayed.
fig1.add_traces([history_trace, forecast_trace])

In [55]:
# Feature importances of the LAST cross-validation fold, sorted ascending so
# that .tail(n) yields the n most important features. Indexing with [-1]
# rather than a hard-coded [2] keeps this correct if n_splits changes.
f_importance = pd.DataFrame(
    {'Feature': X.columns.to_list(), 'Importance': importance[-1]}
).sort_values(by='Importance', ascending=True)

In [57]:
import plotly.express as px
# Bar chart of the 20 highest-importance features (f_importance is sorted
# ascending, so they are its last 20 rows).
top_features = f_importance.tail(20)
fig2 = px.bar(top_features, x='Importance', y='Feature')
fig2.show()