# Modelling process based on one region

## Data

### Select region

In [2]:
region = 'AE'

### Load

In [9]:
import pandas as pd

In [10]:
path = f'../data/regions/{region}/train.csv'
df = pd.read_csv(path, index_col=0, parse_dates=True)
df

Unnamed: 0_level_0,AE
period,Unnamed: 1_level_1
2018-07-01 05:00:00,1301
2018-07-01 06:00:00,1314
...,...
2023-01-23 12:00:00,1136
2023-01-23 13:00:00,1194


## Preprocessing

### Scale data

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
train = scaler.fit_transform(df)

In [34]:
import joblib

joblib.dump(scaler, f'../data/regions/{region}/scaler.pkl')

['../data/regions/AE/scaler.pkl']

### Create Sequences

In [12]:
from us_energy_demmand_forecast.utils import create_sequences

n_windows = 24 # Use 24 hours prior to predict the following hour
X, y = create_sequences(train, n_windows)

In [13]:
X.shape

(38144, 24, 1)

In [15]:
y.shape

(38144, 1)

## Modelling

### Cross Validation with Time Series Split

In [16]:
from sklearn.model_selection import TimeSeriesSplit

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

In [33]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(
    max_depth=5,
    learning_rate=0.01,
    num_leaves=10
)

for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index,:,0], X[test_index,:,0]
    y_train, y_test = y[train_index,0], y[test_index,0]
    
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    
path_model = f'../data/regions/{region}/model.txt'
model.booster_.save_model(path_model)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6120
[LightGBM] [Info] Number of data points in the train set: 6359, number of used features: 24
[LightGBM] [Info] Start training from score 0.269243
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6120
[LightGBM] [Info] Number of data points in the train set: 12716, number of used features: 24
[LightGBM] [Info] Start training from score 0.304677
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6120
[LightGBM] [Info] Number of data points in the train set: 19073, number of used features: 24
[LightGBM] [Info] Start trai

<lightgbm.basic.Booster at 0x1aef19c7df0>