In [84]:
#%pip install pycaret
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection._split import _BaseKFold
import pycaret
from pycaret.regression import * 
import pycaret.regression as pycr 
from sklearn.linear_model import LinearRegression
from pycaret.time_series import TSForecastingExperiment
from statsmodels.tsa.seasonal import seasonal_decompose
import pycaret.utils as pycu

In [85]:
x_train_a = pd.read_csv('cleaned_data/A/x_train_a.csv')
y_train_a = pd.read_csv('cleaned_data/A/train_a.csv')
x_test_a = pd.read_csv('cleaned_data/A/x_test_a.csv')

x_train_b = pd.read_csv('cleaned_data/B/x_train_b.csv')
y_train_b = pd.read_csv('cleaned_data/B/train_b.csv')
x_test_b = pd.read_csv('cleaned_data/B/x_test_b.csv')

x_train_c = pd.read_csv('cleaned_data/C/x_train_c.csv')
y_train_c = pd.read_csv('cleaned_data/C/train_c.csv')
x_test_c = pd.read_csv('cleaned_data/C/x_test_c.csv')

In [None]:
class CustomTimeSeriesSplit(_BaseKFold):
    def __init__(self, n_splits, train_size=None, test_size=None):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.train_size = train_size
        self.test_size = test_size

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        indices = np.arange(n_samples)

        # Define initial sizes if not provided
        train_size = self.train_size or n_samples // (self.n_splits + 1)
        test_size = self.test_size or n_samples // self.n_splits

        for test_start in range(train_size + test_size, n_samples, test_size):
            train_end = test_start - test_size
            train_start = max(train_end - train_size, 0)
            yield indices[train_start:train_end], indices[train_end:test_start]

# Example usage
tscv = CustomTimeSeriesSplit(n_splits=5, train_size=720, test_size=120)

for train_idx, test_idx in tscv.split(x_train_a):
    print("Train indices:", train_idx)
    print("Test indices:", test_idx)

In [86]:
x_train_a_combined = x_train_a.merge(y_train_a, left_on='date_forecast', right_on='time', how='left')
x_train_a_combined['observed'] = x_train_a_combined['calc_year'].isna().astype(int)
train_data_a = x_train_a_combined.drop(['time', 'calc_year', 'calc_month', 'calc_day', 'calc_hour'], inplace=True,axis = 1)
x_test_a['observed'] = x_test_a['calc_year'].isna().astype(int)
test_data_a = x_test_a.drop([ 'calc_year', 'calc_month', 'calc_day', 'calc_hour'], axis = 1)

x_train_b_combined = x_train_b.merge(y_train_b, left_on='date_forecast', right_on='time', how='left')
x_train_b_combined['observed'] = x_train_b_combined['calc_year'].isna().astype(int)
train_data_b = x_train_b_combined.drop(['time', 'calc_year', 'calc_month', 'calc_day', 'calc_hour'], inplace=True,axis = 1)
x_test_b['observed'] = x_test_b['calc_year'].isna().astype(int)
test_data_b = x_test_b.drop([ 'calc_year', 'calc_month', 'calc_day', 'calc_hour'], axis = 1)

x_train_c_combined = x_train_c.merge(y_train_c, left_on='date_forecast', right_on='time', how='left')
x_train_c_combined['observed'] = x_train_c_combined['calc_year'].isna().astype(int)
train_data_c = x_train_c_combined.drop(['time', 'calc_year', 'calc_month', 'calc_day', 'calc_hour'],inplace=True, axis = 1)
x_test_c['observed'] = x_test_c['calc_year'].isna().astype(int)
test_data_c = x_test_c.drop([ 'calc_year', 'calc_month', 'calc_day', 'calc_hour'], axis = 1)

In [87]:
#################### A ####################


In [115]:
def add_lag_feature_and_dropna(dataframe, column_name, lag_hours):
    '''
    Add a lag feature for a given column in a DataFrame with a DateTime index and drop rows with NaN in the lagged column.
    
    Parameters:
    - dataframe: The input DataFrame with a DateTime index.
    - column_name: The name of the column to create a lag feature for.
    - lag_hours: The number of hours to lag.
    
    Returns:
    - A DataFrame with the new lagged feature column added and NaN rows dropped.
    '''
    lag_column_name = f"{column_name}_lag_{lag_hours}"
    dataframe[lag_column_name] = dataframe[column_name].shift(lag_hours)
    dataframe.dropna(subset=[lag_column_name], inplace=True)
    return dataframe
def add_multiple_lag_features_and_dropna_indexproof(dataframe, column_name, lag_list):
    """
    Add multiple lag features for a given column based on datetime values.
    This version ensures that the initial dates where we can't fetch the lagged data will have NaN values.
    
    Parameters:
    - dataframe: The input DataFrame with a DateTime column.
    - column_name: The name of the column to create lag features for.
    - lag_list: A list of tuples, where the first value indicates the unit (e.g., "hours", "days", "months", "years", "monthshours", "yearshours", "yearmonthshours")
                and the second value indicates the number. For "monthshours", the number should be a tuple itself (e.g., (1, 1) for "1 month + 1 hour").
    
    Returns:
    - A DataFrame with the new lagged feature columns added, NaN rows dropped, and an average of the lag values for each row.
    """
    df_result = dataframe.copy()
    lag_columns = []
    
    for lag_unit, lag_value in lag_list:
        if lag_unit == "hours":
            target_dates = dataframe.index - pd.DateOffset(hours=lag_value)
            lag_column_name = f"{column_name}_lag_{lag_value}h"
            
        elif lag_unit == "days":
            target_dates = dataframe.index - pd.DateOffset(days=lag_value)
            lag_column_name = f"{column_name}_lag_{lag_value}d"
            
        elif lag_unit == "months":
            target_dates = dataframe.index - pd.DateOffset(months=lag_value)
            lag_column_name = f"{column_name}_lag_{lag_value}m"
            
        elif lag_unit == "years":
            target_dates = dataframe.index - pd.DateOffset(years=lag_value)
            lag_column_name = f"{column_name}_lag_{lag_value}y"
            
        elif lag_unit == "monthshours":
            months, hours = lag_value
            target_dates = dataframe.index - pd.DateOffset(months=months, hours=hours)
            lag_column_name = f"{column_name}_lag_{months}m{hours}h"
            
        elif lag_unit == "yearshours":
            years, hours = lag_value
            target_dates = dataframe.index - pd.DateOffset(years=years, hours=hours)
            lag_column_name = f"{column_name}_lag_{years}y{hours}h"
            
        elif lag_unit == "yearmonthshours":
            years, months, hours = lag_value
            target_dates = dataframe.index - pd.DateOffset(years=years, months=months, hours=hours)
            lag_column_name = f"{column_name}_lag_{years}y{months}m{hours}h"

        lagged_values = []
        for date in target_dates:
            lagged_values.append(dataframe.loc[date, column_name] if date in dataframe.index else np.nan)

        df_result[lag_column_name] = lagged_values
        lag_columns.append(lag_column_name)
    
    # Compute the average of the lag columns
    df_result['avg_lag'] = df_result[lag_columns].mean(axis=1)
    
    # Drop rows with NaN in any of the new lagged columns
    df_result.dropna(subset=lag_columns, inplace=True)
    
    return df_result

    

In [116]:
X_train_a = x_train_a_combined.copy()
X_train_a['date_forecast'] = pd.to_datetime(X_train_a['date_forecast'])
X_train_a.set_index('date_forecast', inplace=True)
X_train_a


Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,pv_measurement,observed
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-02 22:00:00,7.700,1.22825,1728.950,0.00,0.000,1728.950,0.0,280.300,0.000,0.0,...,3.600,-3.575,-0.500,0.0,2019,6,2,22,0.00,1
2019-06-02 23:00:00,7.700,1.22350,1689.825,0.00,0.000,1689.825,0.0,280.300,0.000,0.0,...,3.350,-3.350,0.275,0.0,2019,6,2,23,0.00,1
2019-06-03 00:00:00,7.875,1.21975,1563.225,0.00,0.000,1563.225,0.0,280.650,0.000,0.0,...,3.050,-2.950,0.750,0.0,2019,6,3,0,0.00,1
2019-06-03 01:00:00,8.425,1.21800,1283.425,834.60,0.750,1283.425,0.0,281.675,0.300,2107.1,...,2.725,-2.600,0.875,0.0,2019,6,3,1,0.00,1
2019-06-03 02:00:00,8.950,1.21800,1003.500,129872.60,23.100,1003.500,0.0,282.500,11.975,88275.8,...,2.550,-2.350,0.925,0.0,2019,6,3,2,19.36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.550,1.27650,1674.200,337859.78,4.225,542.700,0.0,272.425,2.825,225724.2,...,5.175,4.800,1.925,0.0,2023,4,30,19,9.02,0
2023-04-30 20:00:00,4.500,1.27975,1762.400,9083.50,0.000,546.400,0.0,272.300,0.000,20271.5,...,4.650,4.025,2.300,0.0,2023,4,30,20,0.00,0
2023-04-30 21:00:00,4.500,1.28100,1696.650,0.00,0.000,548.350,0.0,272.300,0.000,0.0,...,4.450,3.575,2.600,0.0,2023,4,30,21,0.00,0
2023-04-30 22:00:00,4.500,1.28100,1353.400,0.00,0.000,527.775,0.0,272.300,0.000,0.0,...,4.100,3.175,2.550,0.0,2023,4,30,22,0.00,0


In [117]:
lags_all = [("years", 1), ("yearshours", (1,2)),("yearshours", (1,-1)),
            
          ("yearmonthshours", (1,1,0)), ("yearmonthshours", (1,1,1)),("yearmonthshours", (1,1,-1)),
        ("yearmonthshours", (1,-1,0)), ("yearmonthshours", (1,-1,1)),("yearmonthshours", (1,-1,-1))]
            
    
         
         
         
df_pvm_lag_A = add_multiple_lag_features_and_dropna_indexproof(X_train_a, "pv_measurement",lags_all)
df_pvm_lag_A

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,pv_measurement_lag_1y,pv_measurement_lag_1y2h,pv_measurement_lag_1y-1h,pv_measurement_lag_1y1m0h,pv_measurement_lag_1y1m1h,pv_measurement_lag_1y1m-1h,pv_measurement_lag_1y-1m0h,pv_measurement_lag_1y-1m1h,pv_measurement_lag_1y-1m-1h,avg_lag
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-02 23:00:00,7.70,1.23225,4465.100,0.00,0.000,669.400,0.0,280.15002,0.000,0.0,...,0.00,1.10,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.122222
2020-07-03 00:00:00,7.60,1.23350,4445.875,0.00,0.000,493.350,0.0,280.00000,0.000,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.000000
2020-07-03 01:00:00,7.60,1.23400,4715.300,1505.30,1.350,417.325,0.0,279.90000,0.875,6268.5,...,0.00,0.00,36.08,0.00,0.00,19.36,0.00,0.00,0.00,6.160000
2020-07-03 02:00:00,7.60,1.23325,4801.750,153324.69,25.275,378.450,0.0,279.95000,15.725,119383.5,...,36.08,0.00,326.70,19.36,0.00,251.02,0.00,0.00,20.24,72.600000
2020-07-03 03:00:00,7.60,1.23175,5191.750,749727.50,86.350,784.200,0.0,280.10000,47.200,453031.5,...,326.70,0.00,456.06,251.02,19.36,263.78,20.24,0.00,158.84,166.222222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.55,1.27650,1674.200,337859.78,4.225,542.700,0.0,272.42500,2.825,225724.2,...,1.10,223.74,0.00,0.00,0.00,0.00,300.74,587.84,69.30,131.413333
2023-04-30 20:00:00,4.50,1.27975,1762.400,9083.50,0.000,546.400,0.0,272.30000,0.000,20271.5,...,0.00,84.70,0.00,0.00,0.00,0.00,69.30,300.74,0.00,50.526667
2023-04-30 21:00:00,4.50,1.28100,1696.650,0.00,0.000,548.350,0.0,272.30000,0.000,0.0,...,0.00,1.10,0.00,0.00,0.00,0.00,0.00,69.30,0.00,7.822222
2023-04-30 22:00:00,4.50,1.28100,1353.400,0.00,0.000,527.775,0.0,272.30000,0.000,0.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.000000


In [131]:
X_train_a

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,pv_measurement,observed
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-02 22:00:00,7.700,1.22825,1728.950,0.00,0.000,1728.950,0.0,280.300,0.000,0.0,...,3.600,-3.575,-0.500,0.0,2019,6,2,22,0.00,1
2019-06-02 23:00:00,7.700,1.22350,1689.825,0.00,0.000,1689.825,0.0,280.300,0.000,0.0,...,3.350,-3.350,0.275,0.0,2019,6,2,23,0.00,1
2019-06-03 00:00:00,7.875,1.21975,1563.225,0.00,0.000,1563.225,0.0,280.650,0.000,0.0,...,3.050,-2.950,0.750,0.0,2019,6,3,0,0.00,1
2019-06-03 01:00:00,8.425,1.21800,1283.425,834.60,0.750,1283.425,0.0,281.675,0.300,2107.1,...,2.725,-2.600,0.875,0.0,2019,6,3,1,0.00,1
2019-06-03 02:00:00,8.950,1.21800,1003.500,129872.60,23.100,1003.500,0.0,282.500,11.975,88275.8,...,2.550,-2.350,0.925,0.0,2019,6,3,2,19.36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.550,1.27650,1674.200,337859.78,4.225,542.700,0.0,272.425,2.825,225724.2,...,5.175,4.800,1.925,0.0,2023,4,30,19,9.02,0
2023-04-30 20:00:00,4.500,1.27975,1762.400,9083.50,0.000,546.400,0.0,272.300,0.000,20271.5,...,4.650,4.025,2.300,0.0,2023,4,30,20,0.00,0
2023-04-30 21:00:00,4.500,1.28100,1696.650,0.00,0.000,548.350,0.0,272.300,0.000,0.0,...,4.450,3.575,2.600,0.0,2023,4,30,21,0.00,0
2023-04-30 22:00:00,4.500,1.28100,1353.400,0.00,0.000,527.775,0.0,272.300,0.000,0.0,...,4.100,3.175,2.550,0.0,2023,4,30,22,0.00,0


In [132]:
#pycaret_ml_A = setup(data = dataframe_seasonal_lag_A, target = "pv_measurement")
pycaret_ml_A = setup(data = X_train_a, target = "pv_measurement")


Unnamed: 0,Description,Value
0,Session id,3415
1,Target,pv_measurement
2,Target type,Regression
3,Original data shape,"(34022, 51)"
4,Transformed data shape,"(34022, 51)"
5,Transformed train set shape,"(23815, 51)"
6,Transformed test set shape,"(10207, 51)"
7,Numeric features,50
8,Rows with missing values,97.4%
9,Preprocess,True


In [133]:
best_model_A = compare_models(sort='MAE')
#183 raw
#180 with lags [("monthshours", (1,1)),
          #("monthshours", (1,24)),("monthshours", (1,24-1)),("monthshours", (1,24+1)),
          #("monthshours", (1,24*2)),("monthshours", (1,24*2-1)),("monthshours", (1,24*2+1))]

#179 with lags [("monthshours", (1,1)),
          #("monthshours", (1,24)),("monthshours", (1,24-1)),("monthshours", (1,24+1)),
          #("monthshours", (1,24*2)),("monthshours", (1,24*2-1)),("monthshours", (1,24*2+1)),
          #("monthshours", (1,24*3)),("monthshours", (1,24*3-1)),("monthshours", (1,24*3+1)),
          #("monthshours", (1,24*4)),("monthshours", (1,24*4-1)),("monthshours", (1,24*4+1)),
          #("monthshours", (1,24*5)),("monthshours", (1,24*5-1)),("monthshours", (1,24*5+1)),
          #("monthshours", (1,24*6)),("monthshours", (1,24*6-1)),("monthshours", (1,24*6+1))
         


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,180.4462,163145.1097,403.6287,0.8805,0.5918,1.3443,5.402
lightgbm,Light Gradient Boosting Machine,180.5997,159951.6919,399.7166,0.8828,1.2016,1.448,0.351
catboost,CatBoost Regressor,182.7307,153109.8769,391.0114,0.8879,1.9269,2.3936,10.446
rf,Random Forest Regressor,185.5166,172109.958,414.6771,0.8739,0.6005,1.3731,18.312
xgboost,Extreme Gradient Boosting,188.7385,170018.5219,411.9999,0.8756,1.6167,1.941,4.264
gbr,Gradient Boosting Regressor,205.8934,189315.9366,434.8991,0.8613,1.8817,2.8711,6.526
huber,Huber Regressor,249.6172,289520.9494,537.9672,0.7879,0.9711,2.7302,0.387
en,Elastic Net,250.2326,228444.6987,477.7992,0.8326,2.773,5.3669,0.642
llar,Lasso Least Angle Regression,254.3526,226448.292,475.699,0.8341,2.8494,6.1512,0.056
lasso,Lasso Regression,254.4964,226611.3795,475.871,0.8339,2.8518,6.118,0.629


In [115]:
#################### B ####################


In [134]:
X_train_b = x_train_b_combined.copy()
X_train_b['date_forecast'] = pd.to_datetime(X_train_b['date_forecast'])
X_train_b.set_index('date_forecast', inplace=True)
X_train_b

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,pv_measurement,observed
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00,5.525,1.23975,1200.6750,0.0,0.000,1200.6750,0.0,275.150,0.000,0.00,...,2.800,1.175,2.550,0.0,2019,1,1,0,0.000000,1
2019-01-01 01:00:00,5.425,1.23975,1131.4249,0.0,0.000,1131.4249,0.0,274.825,0.000,0.00,...,3.550,1.525,3.200,0.0,2019,1,1,1,0.000000,1
2019-01-01 02:00:00,5.400,1.23850,1061.0000,0.0,0.000,1061.0000,0.0,274.800,0.000,0.00,...,4.025,2.150,3.425,0.0,2019,1,1,2,0.000000,1
2019-01-01 03:00:00,5.350,1.23975,1021.1500,0.0,0.000,1021.1500,0.0,274.675,0.000,0.00,...,4.900,3.425,3.500,0.0,2019,1,1,3,0.000000,1
2019-01-01 04:00:00,5.675,1.23750,1033.7000,0.0,0.000,1033.7000,0.0,275.500,0.000,0.00,...,6.050,5.575,2.200,0.0,2019,1,1,4,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.550,1.27650,1677.9500,337850.1,4.225,542.8500,0.0,272.425,2.825,225735.89,...,5.175,4.800,1.925,0.0,2023,4,30,19,0.828587,0
2023-04-30 20:00:00,4.500,1.27875,1766.5000,9083.1,0.000,546.3500,0.0,272.300,0.000,20268.10,...,4.650,4.025,2.300,0.0,2023,4,30,20,-0.000000,0
2023-04-30 21:00:00,4.500,1.27900,1698.9250,0.0,0.000,548.0500,0.0,272.300,0.000,0.00,...,4.450,3.600,2.600,0.0,2023,4,30,21,-0.000000,0
2023-04-30 22:00:00,4.500,1.27975,1354.8250,0.0,0.000,527.6000,0.0,272.275,0.000,0.00,...,4.100,3.200,2.550,0.0,2023,4,30,22,-0.000000,0


In [121]:

df_pvm_lag_B = add_multiple_lag_features_and_dropna_indexproof(X_train_b, "pv_measurement",lags_all)
df_pvm_lag_B

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,pv_measurement_lag_1y,pv_measurement_lag_1y2h,pv_measurement_lag_1y-1h,pv_measurement_lag_1y1m0h,pv_measurement_lag_1y1m1h,pv_measurement_lag_1y1m-1h,pv_measurement_lag_1y-1m0h,pv_measurement_lag_1y-1m1h,pv_measurement_lag_1y-1m-1h,avg_lag
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-01 01:00:00,4.250,1.22975,1578.575,0.0,0.000,1578.575,0.0,271.525,0.000,0.00,...,0.0000,0.0,0.0000,0.0,0.0,0.000,0.0000,0.0000,0.0000,0.000000
2020-02-01 02:00:00,4.100,1.22750,1496.975,0.0,0.000,1496.975,0.0,271.100,0.000,0.00,...,0.0000,0.0,0.0000,0.0,0.0,0.000,0.0000,0.0000,0.0000,0.000000
2020-02-01 03:00:00,4.175,1.22550,1375.675,0.0,0.000,1375.675,0.0,271.300,0.000,0.00,...,0.0000,0.0,0.0000,0.0,0.0,0.000,0.0000,0.0000,0.0000,0.000000
2020-02-01 04:00:00,4.450,1.22575,1188.700,0.0,0.000,1188.700,0.0,272.025,0.000,0.00,...,0.0000,0.0,0.0000,0.0,0.0,0.000,0.0000,0.0000,0.0000,0.000000
2020-02-01 05:00:00,4.725,1.22675,996.125,0.0,0.000,996.125,0.0,272.875,0.000,0.00,...,0.0000,0.0,0.0000,0.0,0.0,0.000,0.0000,0.0000,0.0000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-19 03:00:00,3.550,1.29600,7052.100,0.0,0.000,6678.800,0.0,269.100,0.000,0.00,...,0.0000,0.0,0.0000,0.0,0.0,0.000,0.0000,0.0000,26.7375,2.970833
2023-03-19 04:00:00,3.375,1.29625,6946.750,0.0,0.000,6441.075,0.0,268.450,0.000,0.00,...,0.0000,0.0,6.9000,0.0,0.0,0.000,26.7375,0.0000,126.7875,17.825000
2023-03-19 05:00:00,3.250,1.29750,6842.025,2706.4,2.425,6303.800,0.0,267.925,2.425,17505.50,...,6.9000,0.0,75.0375,0.0,0.0,0.000,126.7875,26.7375,331.2000,62.962500
2023-03-19 06:00:00,3.200,1.29675,6579.900,303097.3,51.325,6346.250,0.0,267.600,42.550,323779.30,...,75.0375,0.0,122.4750,0.0,0.0,0.000,331.2000,126.7875,526.9875,131.387500


In [135]:
X_train_b

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,pv_measurement,observed
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00,5.525,1.23975,1200.6750,0.0,0.000,1200.6750,0.0,275.150,0.000,0.00,...,2.800,1.175,2.550,0.0,2019,1,1,0,0.000000,1
2019-01-01 01:00:00,5.425,1.23975,1131.4249,0.0,0.000,1131.4249,0.0,274.825,0.000,0.00,...,3.550,1.525,3.200,0.0,2019,1,1,1,0.000000,1
2019-01-01 02:00:00,5.400,1.23850,1061.0000,0.0,0.000,1061.0000,0.0,274.800,0.000,0.00,...,4.025,2.150,3.425,0.0,2019,1,1,2,0.000000,1
2019-01-01 03:00:00,5.350,1.23975,1021.1500,0.0,0.000,1021.1500,0.0,274.675,0.000,0.00,...,4.900,3.425,3.500,0.0,2019,1,1,3,0.000000,1
2019-01-01 04:00:00,5.675,1.23750,1033.7000,0.0,0.000,1033.7000,0.0,275.500,0.000,0.00,...,6.050,5.575,2.200,0.0,2019,1,1,4,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.550,1.27650,1677.9500,337850.1,4.225,542.8500,0.0,272.425,2.825,225735.89,...,5.175,4.800,1.925,0.0,2023,4,30,19,0.828587,0
2023-04-30 20:00:00,4.500,1.27875,1766.5000,9083.1,0.000,546.3500,0.0,272.300,0.000,20268.10,...,4.650,4.025,2.300,0.0,2023,4,30,20,-0.000000,0
2023-04-30 21:00:00,4.500,1.27900,1698.9250,0.0,0.000,548.0500,0.0,272.300,0.000,0.00,...,4.450,3.600,2.600,0.0,2023,4,30,21,-0.000000,0
2023-04-30 22:00:00,4.500,1.27975,1354.8250,0.0,0.000,527.6000,0.0,272.275,0.000,0.00,...,4.100,3.200,2.550,0.0,2023,4,30,22,-0.000000,0


In [136]:
pycaret_ml_B = setup(data = X_train_b, target = "pv_measurement")

Unnamed: 0,Description,Value
0,Session id,6997
1,Target,pv_measurement
2,Target type,Regression
3,Original data shape,"(27053, 51)"
4,Transformed data shape,"(27053, 51)"
5,Transformed train set shape,"(18937, 51)"
6,Transformed test set shape,"(8116, 51)"
7,Numeric features,50
8,Rows with missing values,96.5%
9,Preprocess,True


In [137]:
best_model_B = compare_models(sort='MAE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,26.7937,4106.1906,64.0491,0.904,0.7155,1.876,4.219
catboost,CatBoost Regressor,26.8114,3755.4339,61.2465,0.9121,1.0786,5.6719,8.918
lightgbm,Light Gradient Boosting Machine,26.8702,3950.9797,62.8306,0.9075,0.8333,2.3892,0.297
rf,Random Forest Regressor,27.6909,4300.794,65.5613,0.8995,0.7303,1.6838,16.191
xgboost,Extreme Gradient Boosting,28.0631,4303.3974,65.5829,0.8993,0.9547,3.0181,3.523
gbr,Gradient Boosting Regressor,30.7131,4736.6804,68.8082,0.8893,1.1013,3.1823,5.26
dt,Decision Tree Regressor,36.4553,8196.6354,90.4409,0.8078,0.7919,1.6261,0.283
llar,Lasso Least Angle Regression,38.5912,5862.5023,76.5273,0.8631,1.7502,6.0204,0.047
lasso,Lasso Regression,38.5929,5862.1456,76.5251,0.8631,1.7505,6.0811,0.436
br,Bayesian Ridge,39.292,5808.0593,76.1745,0.8644,1.8338,10.4535,0.065


In [None]:
#################### C ####################


In [124]:
X_train_c = x_train_c_combined.copy()
X_train_c['date_forecast'] = pd.to_datetime(X_train_c['date_forecast'])
X_train_c.set_index('date_forecast', inplace=True)
X_train_c

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,pv_measurement,observed
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-04 08:00:00,6.625,1.22075,2287.250,5283377.5,421.225,2287.250,0.0,278.20000,67.975,932039.7,...,1.250,-0.825,0.825,0.0,2019,9,4,8,137.20,1
2019-09-04 09:00:00,6.275,1.21425,2679.075,6726922.0,508.125,2679.075,0.0,277.42500,76.625,1041404.3,...,1.500,-1.500,-0.200,0.0,2019,9,4,9,0.00,1
2019-09-04 10:00:00,5.900,1.20825,2983.750,7747199.5,561.875,2983.750,0.0,276.65002,112.575,1362371.1,...,2.000,-2.000,-0.225,0.0,2019,9,4,10,0.00,1
2019-09-04 11:00:00,5.875,1.20350,3286.550,8254105.0,578.025,3286.550,0.0,276.65000,195.150,2215590.5,...,1.950,-1.950,-0.250,0.0,2019,9,4,11,0.00,1
2019-09-04 12:00:00,6.150,1.20175,3453.425,8205280.0,555.175,3453.425,0.0,277.25000,243.375,3157120.2,...,1.525,-1.500,-0.300,0.0,2019,9,4,12,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.400,1.27550,1456.575,336040.6,4.175,551.225,0.0,272.02500,2.775,219096.0,...,4.075,3.600,1.875,0.0,2023,4,30,19,50.96,0
2023-04-30 20:00:00,4.400,1.27850,1476.350,8827.2,0.000,564.100,0.0,271.95000,0.000,19936.2,...,3.600,2.950,2.125,0.0,2023,4,30,20,2.94,0
2023-04-30 21:00:00,4.400,1.27900,1516.300,0.0,0.000,578.700,0.0,271.90000,0.000,0.0,...,3.600,2.625,2.400,0.0,2023,4,30,21,0.00,0
2023-04-30 22:00:00,4.400,1.27975,1240.600,0.0,0.000,551.500,0.0,271.95000,0.000,0.0,...,3.275,2.325,2.325,0.0,2023,4,30,22,-0.00,0


In [125]:

df_pvm_lag_C = add_multiple_lag_features_and_dropna_indexproof(X_train_c, "pv_measurement",lags_all)
df_pvm_lag_C

Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,pv_measurement_lag_1y,pv_measurement_lag_1y2h,pv_measurement_lag_1y-1h,pv_measurement_lag_1y1m0h,pv_measurement_lag_1y1m1h,pv_measurement_lag_1y1m-1h,pv_measurement_lag_1y-1m0h,pv_measurement_lag_1y-1m1h,pv_measurement_lag_1y-1m-1h,avg_lag
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-04 09:00:00,8.40,1.20575,,3777675.8,298.650,3287.975,0.0,281.80000,58.625000,804660.5,...,245.0,68.6,333.2,0.0,137.2,0.0,58.8,29.4,88.2,106.711111
2020-10-04 10:00:00,8.35,1.20150,,4670687.0,344.200,1167.750,0.0,281.72500,62.050000,869085.0,...,333.2,137.2,441.0,0.0,0.0,0.0,88.2,58.8,98.0,128.488889
2020-10-04 11:00:00,8.50,1.19900,,5063742.0,352.725,930.950,0.0,282.02500,65.524994,918814.2,...,441.0,245.0,382.2,0.0,0.0,0.0,98.0,88.2,78.4,148.088889
2020-10-04 12:00:00,8.65,1.19825,8149.1,4912074.5,323.300,991.025,0.0,282.27500,74.575000,1008793.7,...,382.2,333.2,274.4,0.0,0.0,0.0,78.4,98.0,39.2,133.933333
2020-10-04 13:00:00,8.65,1.19900,8149.1,4232460.0,259.250,3108.700,0.0,282.25000,73.800000,1068125.1,...,274.4,441.0,156.8,0.0,0.0,0.0,39.2,78.4,9.8,111.066667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-21 07:00:00,2.80,1.30250,,0.0,0.000,802.200,0.0,265.95000,0.000000,0.0,...,0.0,0.0,9.8,49.0,9.8,78.4,0.0,0.0,0.0,16.333333
2022-11-21 08:00:00,2.80,1.29950,,25546.1,7.450,802.200,0.0,265.90000,5.125000,36890.2,...,9.8,0.0,19.6,78.4,49.0,88.2,0.0,0.0,0.0,27.222222
2022-11-21 09:00:00,2.80,1.29475,,302288.3,35.575,,0.0,265.80000,21.500000,191699.2,...,19.6,0.0,29.4,88.2,78.4,78.4,0.0,0.0,9.8,33.755556
2022-11-21 10:00:00,2.75,1.28975,,702814.0,59.850,,0.0,265.65002,29.825000,369614.9,...,29.4,9.8,29.4,78.4,88.2,58.8,9.8,0.0,19.6,35.933333


In [None]:
X_train_c

In [126]:
pycaret_ml_C = setup(data = X_train_c, target = "pv_measurement")

Unnamed: 0,Description,Value
0,Session id,4062
1,Target,pv_measurement
2,Target type,Regression
3,Original data shape,"(7585, 61)"
4,Transformed data shape,"(7585, 61)"
5,Transformed train set shape,"(5309, 61)"
6,Transformed test set shape,"(2276, 61)"
7,Numeric features,60
8,Rows with missing values,99.7%
9,Preprocess,True


In [127]:
best_model_C = compare_models(sort='MAE')
#32.1126 raw
#26.48  with lags [("monthshours", (1,1)),
          #("monthshours", (1,24)),("monthshours", (1,24-1)),("monthshours", (1,24+1)),
          #("monthshours", (1,24*2)),("monthshours", (1,24*2-1)),("monthshours", (1,24*2+1))]

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,26.0773,2985.7418,54.4779,0.9195,0.8734,0.4127,6.979
et,Extra Trees Regressor,26.2844,3262.9475,56.9789,0.9119,0.6248,0.4722,1.106
lightgbm,Light Gradient Boosting Machine,26.2941,3136.3986,55.8842,0.9153,0.7133,0.4415,0.218
rf,Random Forest Regressor,27.2053,3440.9578,58.5344,0.9069,0.639,0.4952,3.577
xgboost,Extreme Gradient Boosting,28.4409,3740.2264,60.971,0.8988,0.7656,0.4649,1.248
gbr,Gradient Boosting Regressor,29.5661,3722.4626,60.8447,0.8996,0.9647,0.5033,1.651
lasso,Lasso Regression,35.7817,4397.2819,66.1463,0.8809,1.6105,0.6539,0.148
llar,Lasso Least Angle Regression,35.7847,4398.2691,66.1536,0.8809,1.6106,0.6539,0.037
huber,Huber Regressor,35.9047,5270.9637,72.5062,0.8577,1.1365,0.6864,0.108
en,Elastic Net,36.235,4486.8088,66.8348,0.8785,1.6316,0.6716,0.188


In [None]:
############ Add seasonal lag to test sets ##################

In [128]:
def set_datetime_index_corrected(dataframe, year_col='forecast_year', month_col='forecast_month', day_col='forecast_day', hour_col='forecast_hour'):

    # Combine the year, month, day, and hour columns to create a datetime series
    datetime_series = pd.to_datetime(dataframe[year_col].astype(str) + '-' + 
                                     dataframe[month_col].astype(str) + '-' + 
                                     dataframe[day_col].astype(str) + ' ' + 
                                     dataframe[hour_col].astype(str) + ':00:00')
    
    # Set this datetime series as the dataframe's index
    dataframe.set_index(datetime_series, inplace=True)
    
    # Return the modified dataframe
    return dataframe

In [130]:
##### A ####
#Check shape of both of train with seasonality and test data, find out when test data starts
#date time index
#x_train_combined_a_seasonal_feat

test_data_a_copy = test_data_a.copy().drop(columns=["location"])
X_train_a_copy = X_train_a.copy()
print("test_data_a_copy")
print(test_data_a_copy.shape)
print("X_train_a_copy")
print(X_train_a_copy.shape)

test_data_a_dtidx = set_datetime_index_corrected(test_data_a_copy,year_col='forecast_year', month_col='forecast_month', day_col='forecast_day', hour_col='forecast_hour')
print("test_data_a_dtidx")
print(test_data_a_dtidx.shape)


combined_test_train_a = pd.concat([X_train_a_copy,test_data_a_dtidx], axis =0)
print("combined_test_train_a")
print(combined_test_train_a.shape)

pvm_lag_test_train_A = add_multiple_lag_features_and_dropna_indexproof(combined_test_train_a, "pv_measurement",lags_all)
print("pvm_lag_test_train_A")

print(pvm_lag_test_train_A.shape)


test_lag_A = pvm_lag_test_train_A.loc["2023-05-01 00:00:00":].drop(columns=["pv_measurement"])
print("test_lag_A")
print(test_lag_A.shape)

#make seasonal lag feature

#cut where test started (check that it has the same shape as before)

test_data_a_copy
(720, 50)
X_train_a_copy
(34022, 51)
test_data_a_dtidx
(720, 50)
combined_test_train_a
(34742, 51)
pvm_lag_test_train_A
(24859, 61)
test_lag_A
(684, 60)


In [297]:
##### B ####
#Check shape of both of train with seasonality and test data, find out when test data starts
#date time index
#x_train_combined_a_seasonal_feat

test_data_b_copy = test_data_b.copy().drop(columns=["location"])
x_train_combined_b_seasonal_feat_copy = x_train_combined_b_seasonal_feat.copy()


test_data_b_dtidx = set_datetime_index_corrected(test_data_b_copy,year_col='forecast_year', month_col='forecast_month', day_col='forecast_day', hour_col='forecast_hour')
print(test_data_b_dtidx.shape)

display(x_train_combined_b_seasonal_feat)

combined_test_train_b = pd.concat([x_train_combined_b_seasonal_feat_copy,test_data_b_dtidx], axis =0)



combined_test_train_seas_lag_b = add_lag_feature_and_dropna(combined_test_train_b, "seasonal_yr", 24*365)


test_data_b_seasonal_lag = combined_test_train_seas_lag_b.loc["2023-05-01 00:00:00":]

test_data_b_seasonal_lag.drop(columns=["seasonal_yr","pv_measurement"], inplace= True)
display(test_data_b_seasonal_lag)
#train data with seasonality concat with test data --> nan values for seasonality


#make seasonal lag feature

#cut where test started (check that it has the same shape as before)


(720, 50)


Unnamed: 0_level_0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,pv_measurement,observed,seasonal_yr
date_forecast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 00:00:00,5.525,1.23975,1200.6750,0.0,0.000,1200.6750,0.0,275.150,0.000,0.00,...,1.175,2.550,0.0,2019,1,1,0,0.000000,1,-0.124526
2019-01-01 01:00:00,5.425,1.23975,1131.4249,0.0,0.000,1131.4249,0.0,274.825,0.000,0.00,...,1.525,3.200,0.0,2019,1,1,1,0.000000,1,-0.106586
2019-01-01 02:00:00,5.400,1.23850,1061.0000,0.0,0.000,1061.0000,0.0,274.800,0.000,0.00,...,2.150,3.425,0.0,2019,1,1,2,0.000000,1,-0.103324
2019-01-01 03:00:00,5.350,1.23975,1021.1500,0.0,0.000,1021.1500,0.0,274.675,0.000,0.00,...,3.425,3.500,0.0,2019,1,1,3,0.000000,1,-0.161220
2019-01-01 04:00:00,5.675,1.23750,1033.7000,0.0,0.000,1033.7000,0.0,275.500,0.000,0.00,...,5.575,2.200,0.0,2019,1,1,4,0.000000,1,-0.181605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-30 19:00:00,4.550,1.27650,1677.9500,337850.1,4.225,542.8500,0.0,272.425,2.825,225735.89,...,4.800,1.925,0.0,2023,4,30,19,0.828587,0,0.065367
2023-04-30 20:00:00,4.500,1.27875,1766.5000,9083.1,0.000,546.3500,0.0,272.300,0.000,20268.10,...,4.025,2.300,0.0,2023,4,30,20,-0.000000,0,0.058840
2023-04-30 21:00:00,4.500,1.27900,1698.9250,0.0,0.000,548.0500,0.0,272.300,0.000,0.00,...,3.600,2.600,0.0,2023,4,30,21,-0.000000,0,0.098795
2023-04-30 22:00:00,4.500,1.27975,1354.8250,0.0,0.000,527.6000,0.0,272.275,0.000,0.00,...,3.200,2.550,0.0,2023,4,30,22,-0.000000,0,0.022145


Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,observed,seasonal_yr_lag_8760
2023-05-01 00:00:00,4.300,1.28300,912.3000,0.0,0.000,1059.750,0.0,271.65002,0.000,0.000,...,3.950,2.100,3.375,0.0,2023,5,1,0,0,0.273297
2023-05-01 01:00:00,4.250,1.28300,1482.8002,0.0,0.000,1073.700,0.0,271.45000,0.000,0.000,...,3.825,1.925,3.300,0.0,2023,5,1,1,0,0.340977
2023-05-01 02:00:00,4.150,1.28275,1765.9000,0.0,0.000,1200.100,0.0,271.05000,0.000,0.000,...,3.650,1.750,3.225,0.0,2023,5,1,2,0,0.764185
2023-05-01 03:00:00,4.025,1.28225,2269.7500,40510.2,11.675,1179.000,0.0,270.65000,9.375,67382.305,...,3.500,1.475,3.150,0.0,2023,5,1,3,0,0.650029
2023-05-01 04:00:00,3.900,1.28200,2198.2250,567057.1,76.900,919.150,0.0,270.37500,47.400,408812.200,...,3.325,1.300,3.075,0.0,2023,5,1,4,0,0.305933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-03 19:00:00,8.350,1.19800,3640.1250,1908360.9,85.100,2015.750,0.0,281.57500,33.625,675011.000,...,2.475,2.075,-1.350,0.0,2023,7,3,19,0,-0.155935
2023-07-03 20:00:00,8.525,1.20075,3351.1000,737351.8,24.800,1613.375,0.0,281.85000,14.350,345239.800,...,2.450,2.100,-1.275,0.0,2023,7,3,20,0,-0.138019
2023-07-03 21:00:00,8.800,1.20375,2753.0250,149728.8,1.275,1624.450,0.0,282.30000,1.300,112669.700,...,2.575,2.150,-1.400,0.0,2023,7,3,21,0,-0.096462
2023-07-03 22:00:00,9.000,1.20600,2204.5000,1440.5,0.000,1768.325,0.0,282.67502,0.000,9413.900,...,2.250,1.800,-1.350,0.0,2023,7,3,22,0,-0.025552


In [300]:
##### C ####
#Check shape of both of train with seasonality and test data, find out when test data starts
#date time index
#x_train_combined_a_seasonal_feat
display(test_data_c)

test_data_c_copy = test_data_c.copy().drop(columns=["location"])
x_train_combined_c_seasonal_feat_copy = x_train_combined_c_seasonal_feat.copy()




test_data_c_dtidx = set_datetime_index_corrected(test_data_c_copy,year_col='forecast_year', month_col='forecast_month', day_col='forecast_day', hour_col='forecast_hour')
print(test_data_c_dtidx.shape)


combined_test_train_c = pd.concat([x_train_combined_c_seasonal_feat_copy,test_data_c_dtidx], axis =0)

combined_test_train_seas_lag_c = add_lag_feature_and_dropna(combined_test_train_c, "seasonal_yr", 24*365)


test_data_c_seasonal_lag = combined_test_train_seas_lag_c.loc["2023-05-01 00:00:00":]

test_data_c_seasonal_lag.drop(columns=["seasonal_yr","pv_measurement"], inplace= True)
display(test_data_c_seasonal_lag)
#train data with seasonality concat with test data --> nan values for seasonality


Unnamed: 0,location,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,...,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,observed
0,C,4.150,1.28600,,0.0,0.000000,1236.500,0.0,271.025,0.000,...,31241.975,3.425,1.450,3.100,0.0,2023,5,1,0,0
1,C,4.050,1.28550,1477.100,0.0,0.000000,1220.425,0.0,270.700,0.000,...,31019.550,3.325,1.400,3.025,0.0,2023,5,1,1,0
2,C,3.900,1.28375,1477.100,0.0,0.000000,1258.650,0.0,270.200,0.000,...,32372.800,3.225,1.225,3.000,0.0,2023,5,1,2,0
3,C,3.750,1.28275,,39890.0,11.549999,1655.175,0.0,269.700,9.300,...,35433.625,3.125,0.950,3.000,0.0,2023,5,1,3,0
4,C,3.700,1.28150,1990.000,564426.7,76.675000,1639.100,0.0,269.450,47.925,...,35455.574,3.025,0.700,2.950,0.0,2023,5,1,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,C,8.400,1.19675,3493.650,1903925.5,84.775000,1967.375,0.0,281.700,32.200,...,41007.900,2.175,1.900,-1.075,0.0,2023,7,3,19,0
716,C,8.600,1.20000,3078.750,733045.4,24.550000,1449.500,0.0,282.025,13.875,...,41315.950,2.200,2.000,-0.925,0.0,2023,7,3,20,0
717,C,8.875,1.20350,2308.400,147324.3,1.225000,1543.650,0.0,282.350,1.250,...,41665.900,2.250,2.050,-0.950,0.0,2023,7,3,21,0
718,C,9.000,1.20650,2000.450,1378.3,0.000000,1725.950,0.0,282.600,0.000,...,39007.600,1.875,1.700,-0.775,0.0,2023,7,3,22,0


(720, 50)


Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,...,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,forecast_year,forecast_month,forecast_day,forecast_hour,observed,seasonal_yr_lag_8760
2023-05-01 00:00:00,4.150,1.28600,,0.0,0.000000,1236.500,0.0,271.025,0.000,0.0,...,3.425,1.450,3.100,0.0,2023,5,1,0,0,0.035054
2023-05-01 01:00:00,4.050,1.28550,1477.100,0.0,0.000000,1220.425,0.0,270.700,0.000,0.0,...,3.325,1.400,3.025,0.0,2023,5,1,1,0,-0.080017
2023-05-01 02:00:00,3.900,1.28375,1477.100,0.0,0.000000,1258.650,0.0,270.200,0.000,0.0,...,3.225,1.225,3.000,0.0,2023,5,1,2,0,-0.103031
2023-05-01 03:00:00,3.750,1.28275,,39890.0,11.549999,1655.175,0.0,269.700,9.300,66888.9,...,3.125,0.950,3.000,0.0,2023,5,1,3,0,-0.114538
2023-05-01 04:00:00,3.700,1.28150,1990.000,564426.7,76.675000,1639.100,0.0,269.450,47.925,412029.7,...,3.025,0.700,2.950,0.0,2023,5,1,4,0,-0.114538
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-03 19:00:00,8.400,1.19675,3493.650,1903925.5,84.775000,1967.375,0.0,281.700,32.200,650270.1,...,2.175,1.900,-1.075,0.0,2023,7,3,19,0,0.332963
2023-07-03 20:00:00,8.600,1.20000,3078.750,733045.4,24.550000,1449.500,0.0,282.025,13.875,331501.4,...,2.200,2.000,-0.925,0.0,2023,7,3,20,0,0.252407
2023-07-03 21:00:00,8.875,1.20350,2308.400,147324.3,1.225000,1543.650,0.0,282.350,1.250,108841.0,...,2.250,2.050,-0.950,0.0,2023,7,3,21,0,0.206373
2023-07-03 22:00:00,9.000,1.20650,2000.450,1378.3,0.000000,1725.950,0.0,282.600,0.000,8968.6,...,1.875,1.700,-0.775,0.0,2023,7,3,22,0,0.235140


In [230]:
############ Predictions ################
test_data_a_seasonal_lag.columns

Index(['absolute_humidity_2m:gm3', 'air_density_2m:kgm3',
       'ceiling_height_agl:m', 'clear_sky_energy_1h:J', 'clear_sky_rad:W',
       'cloud_base_agl:m', 'dew_or_rime:idx', 'dew_point_2m:K',
       'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'elevation:m', 'fresh_snow_12h:cm',
       'fresh_snow_1h:cm', 'fresh_snow_24h:cm', 'fresh_snow_3h:cm',
       'fresh_snow_6h:cm', 'is_day:idx', 'is_in_shadow:idx',
       'msl_pressure:hPa', 'precip_5min:mm', 'precip_type_5min:idx',
       'pressure_100m:hPa', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_density:kgm3', 'snow_depth:cm', 'snow_drift:idx',
       'snow_melt_10min:mm', 'snow_water:kgm2', 'sun_azimuth:d',
       'sun_elevation:d', 'super_cooled_liquid_water:kgm2', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms', 'wind_speed_v_10

In [305]:
y_pred_a = predict_model(best_model_A, data=test_data_a_seasonal_lag)
df_y_pred_a = y_pred_a[['prediction_label']].copy()
df_y_pred_a.columns = ['pv_measurement']
print(df_y_pred_a) #print the DataFrame

                     pv_measurement
2023-05-01 00:00:00       -1.305224
2023-05-01 01:00:00       -1.305224
2023-05-01 02:00:00       -1.305224
2023-05-01 03:00:00       50.723522
2023-05-01 04:00:00      400.496231
...                             ...
2023-07-03 19:00:00      144.999344
2023-07-03 20:00:00      -37.893918
2023-07-03 21:00:00      -87.719269
2023-07-03 22:00:00      -87.661979
2023-07-03 23:00:00     -123.177398

[720 rows x 1 columns]


In [302]:
y_pred_b = predict_model(best_model_B, data=test_data_b_seasonal_lag)
df_y_pred_b = y_pred_b[['prediction_label']].copy()
df_y_pred_b.columns = ['pv_measurement']
df_y_pred_b #print the DataFrame

Unnamed: 0,pv_measurement
2023-05-01 00:00:00,0.000000
2023-05-01 01:00:00,0.000000
2023-05-01 02:00:00,0.000000
2023-05-01 03:00:00,4.027863
2023-05-01 04:00:00,50.811223
...,...
2023-07-03 19:00:00,38.730593
2023-07-03 20:00:00,14.568682
2023-07-03 21:00:00,16.768368
2023-07-03 22:00:00,4.355625


In [303]:
y_pred_c = predict_model(best_model_C, data=test_data_c_seasonal_lag)
df_y_pred_c = y_pred_c[['prediction_label']].copy()
df_y_pred_c.columns = ['pv_measurement']
df_y_pred_c #print the DataFrame


Unnamed: 0,pv_measurement
2023-05-01 00:00:00,0.438456
2023-05-01 01:00:00,0.438456
2023-05-01 02:00:00,0.438456
2023-05-01 03:00:00,2.517790
2023-05-01 04:00:00,13.706025
...,...
2023-07-03 19:00:00,45.463205
2023-07-03 20:00:00,10.738984
2023-07-03 21:00:00,2.196515
2023-07-03 22:00:00,-0.062311


In [306]:
predictions = pd.concat([df_y_pred_a,df_y_pred_b, df_y_pred_c], ignore_index= True)
predictions.tail()

Unnamed: 0,pv_measurement
2155,45.463205
2156,10.738984
2157,2.196515
2158,-0.062311
2159,-0.453361


In [307]:
predictions_df = predictions.rename(columns={'pv_measurement': 'prediction'})

sample_submission = pd.read_csv('sample_submission.csv')

# Convert all negative predictions to 0
predictions_df.loc[predictions_df['prediction'] < 0, 'prediction'] = 0

# Join the 'id' column from sample_submission with the predictions
sample_submission['prediction'] = predictions_df['prediction']

# Save to CSV
sample_submission.to_csv('pycaret_prediction.csv', index=False)
predictions_df

Unnamed: 0,prediction
0,0.000000
1,0.000000
2,0.000000
3,50.723522
4,400.496231
...,...
2155,45.463205
2156,10.738984
2157,2.196515
2158,0.000000
