# Setup

Requires Python 3.10 and Poetry 1.6. Run the following commands in a terminal (not in a notebook cell) to install the dependencies and start JupyterLab:
```
poetry install
poetry run jupyter lab
```

Load libraries and set parameters

In [1]:
import os
import copy
import matplotlib.pyplot as plt
import pandas as pd
pd.options.display.max_columns = None

KeyboardInterrupt: 

Load raw data

In [None]:
# Each location folder under data/ holds the same four parquet files;
# map the short key used in this notebook to the file it is read from.
_DATASET_FILES = {
    "test_est": "X_test_estimated.parquet",
    "train_est": "X_train_estimated.parquet",
    "train_obs": "X_train_observed.parquet",
    "train_tar": "train_targets.parquet",
}

# raw_data[location][key] -> DataFrame read from data/<location>/<file>
raw_data = {
    location: {
        key: pd.read_parquet(os.path.join("data", location, filename))
        for key, filename in _DATASET_FILES.items()
    }
    for location in ("A", "B", "C")
}

View data

In [None]:
# Frame read from data/C/X_test_estimated.parquet
raw_data["C"]["test_est"]

In [None]:
# Frame read from data/C/X_train_estimated.parquet
raw_data["C"]["train_est"]

In [None]:
# Frame read from data/C/X_train_observed.parquet
raw_data["C"]["train_obs"]

In [None]:
# Frame read from data/C/train_targets.parquet
raw_data["C"]["train_tar"]

Preprocess data

In [None]:
# Per-location frames are collected here and concatenated after the loop
location_train_data = []
location_test_data = []

# Work on a deep copy so the frames in raw_data stay untouched and this cell can be re-run safely
raw_data_copy = copy.deepcopy(raw_data)
for k, frames in raw_data_copy.items():

    # The observed frame gets a constant lead time of 0 seconds
    frames["train_obs"]["forecast_seconds"] = 0

    # In the estimated frames, replace date_calc with the lead time in seconds (date_forecast - date_calc)
    for name in ("train_est", "test_est"):
        lead_time = frames[name]["date_forecast"] - frames[name]["date_calc"]
        frames[name]["forecast_seconds"] = lead_time.dt.total_seconds()
        del frames[name]["date_calc"]

    # Tag every row with the location key it came from
    for name in ("train_obs", "train_est", "test_est"):
        frames[name]["location"] = k

    # Stack observed on top of estimated, then inner-join the targets on "time"
    weather = pd.concat([
        frames["train_obs"].rename(columns={"date_forecast": "time"}),
        frames["train_est"].rename(columns={"date_forecast": "time"}),
    ])
    location_train_data.append(pd.merge(weather, frames["train_tar"], on="time"))

    # The test frame only needs the same column rename
    location_test_data.append(frames["test_est"].rename(columns={"date_forecast": "time"}))

# Concatenate the per-location frames (row order is kept; nothing is sorted) and drop rows without a target.
# drop=True discards the old per-location row numbers: without it reset_index() adds an "index" column
# to both frames, which would then be passed to the model as a feature.
train_data = pd.concat(location_train_data).dropna(subset=["pv_measurement"]).reset_index(drop=True)
test_data = pd.concat(location_test_data).reset_index(drop=True)

# Remove columns that hold at most one distinct non-null value in the training data
cols_to_remove = [col for col in train_data.columns if train_data[col].nunique() <= 1]
train_data = train_data.drop(columns=cols_to_remove)
# errors="ignore": a column that is constant in train may not exist in test at all
test_data = test_data.drop(columns=cols_to_remove, errors="ignore")

# Replace all ':' with '_' in the column names
train_data.columns = [col_name.replace(":", "_") for col_name in train_data.columns]
test_data.columns = [col_name.replace(":", "_") for col_name in test_data.columns]

In [None]:
# Display the training frame built by the preprocessing cell
train_data

In [None]:
# Display the test frame built by the preprocessing cell
test_data

Analyse train data

In [None]:
# Summary statistics of train_data
train_data.describe()

In [None]:
# Per-column histograms of train_data; the trailing ';' keeps the return value out of the cell output
train_data.hist(figsize=(20,20));

In [None]:
# Correlate numeric/boolean columns only: train_data also holds the string "location" column,
# which DataFrame.corr() either rejects or silently drops depending on the pandas version.
corr_matrix = train_data.select_dtypes(include=["number", "bool"]).corr()

# List exactly the columns that appear in the matrix, numbered from 0 so each printed
# number matches the tick position on the plot axes.
for i, column in enumerate(corr_matrix.columns):
    print(f"{i}. {column}")

plt.matshow(corr_matrix, cmap="PRGn", interpolation="none", vmin=-1, vmax=1)
plt.show()

# Pycaret regression

Import pycaret

In [None]:
from pycaret.regression import RegressionExperiment

# Knobs passed to compare_models further down
n_select = 5   # how many of the top-ranked models to keep
turbo = False  # passed as compare_models(turbo=...); False keeps the slow estimators in the comparison

# Experiment object that every PyCaret call below goes through
exp = RegressionExperiment()

Setup basic model

In [None]:
# Fixed seed for the experiment: without session_id PyCaret draws a new random seed on every run,
# so the results would differ between runs of this notebook.
SESSION_ID = 42

exp.setup(
    train_data,
    target = 'pv_measurement',
    session_id = SESSION_ID,
    create_date_columns = ['hour', 'day', 'month', 'year'],
    # Optional preprocessing steps, currently disabled:
    #imputation_type = 'iterative',
    #numeric_iterative_imputer = 'et',
    #normalize = True,
    #transformation = True,
    #pca = True,
    #pca_components = 'mle',
    #polynomial_features = True,
    #remove_multicollinearity = True,
    feature_selection = True,
    #n_features_to_select = 0.5,
)

List all models

In [None]:
# Show the models this experiment can train
exp.models()

Compare models

In [None]:
# Train and score the candidate models, keeping the n_select best ranked by MAE
basic_top_models = exp.compare_models(
    n_select=n_select,
    turbo=turbo,
    sort="MAE",
)

Choose best model

In [None]:
# compare_models returns a bare model instead of a list when n_select == 1,
# so only index into the result when a list came back.
basic_model_best = basic_top_models[0] if isinstance(basic_top_models, list) else basic_top_models

Visualize best model

In [None]:
# Residuals plot for basic_model_best
exp.plot_model(basic_model_best, plot = 'residuals')

In [None]:
# Prediction error plot for basic_model_best
exp.plot_model(basic_model_best, plot = 'error')

In [None]:
# Feature plot for basic_model_best
exp.plot_model(basic_model_best, plot = 'feature')

Combine top models with stacked and blended

In [None]:
# basic_model_blended = exp.blend_models(basic_top_models)

In [None]:
# basic_model_stacked = exp.stack_models(basic_top_models)

Finalize model

In [None]:
# Finalize the top-ranked model. It is taken from basic_top_models rather than basic_model_best so that
# re-running this cell never finalizes an already finalized model. compare_models returns a bare model
# instead of a list when n_select == 1, hence the isinstance check.
top_model = basic_top_models[0] if isinstance(basic_top_models, list) else basic_top_models
basic_model_best = exp.finalize_model(top_model)

In [None]:
#basic_model_blended = exp.finalize_model(basic_model_blended)

In [None]:
#basic_model_stacked = exp.finalize_model(basic_model_stacked)

Save model

In [None]:
# Save basic_model_best under the name 'basic_model_best'
exp.save_model(basic_model_best, 'basic_model_best')

In [None]:
#exp.save_model(basic_model_blended, 'basic_model_blended')

In [None]:
#exp.save_model(basic_model_stacked, 'basic_model_stacked')

Load model

In [None]:
# Reload the model saved under 'basic_model_best'; this replaces the in-memory basic_model_best
basic_model_best = exp.load_model('basic_model_best')

In [None]:
#basic_model_blended = exp.load_model('basic_model_blended')

In [None]:
#basic_model_stacked = exp.load_model('basic_model_stacked')

Model predictions

In [None]:
# predict_model must be called on the experiment object: only RegressionExperiment is imported,
# so the bare name predict_model is undefined and raised NameError.
best_predicted_data = exp.predict_model(basic_model_best, data=test_data)

In [None]:
#stacked_predicted_data = exp.predict_model(basic_model_stacked, data = test_data)

In [None]:
#blended_predicted_data = exp.predict_model(basic_model_blended, data = test_data)