🙏 This notebook builds off of the work of @greysky's [Enefit Generic Notebook](https://www.kaggle.com/code/greysky/enefit-generic-notebook) and @patrick0302's [Do you desire sun power](https://www.kaggle.com/code/patrick0302/do-you-desire-sun-power)

We will build two models, one for energy production and one for energy consumption, and build our prediction from the output of these two.

In [None]:
import holidays
import os
import gc
import pickle

import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_absolute_error
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import VotingRegressor

import lightgbm as lgb

import optuna

In [None]:
class MonthlyKFold:
    """Expanding-window splitter that holds out the most recent calendar months.

    Each fold tests on one of the last ``n_splits`` (year, month) periods found
    in ``X`` and trains on every row that belongs to a strictly earlier period.
    ``X`` must expose integer-like ``year`` and ``month`` columns.
    """

    def __init__(self, n_splits=3):
        self.n_splits = n_splits

    def split(self, X, y, groups=None):
        """Yield ``(train_positions, test_positions)`` for each held-out month.

        Positions are zero-based row offsets into ``X`` (its own index is
        ignored); ``y`` and ``groups`` are accepted for API compatibility only.
        """
        # Collapse (year, month) into a single, monotonically increasing code.
        month_code = (12 * X["year"] + X["month"]).values
        row_positions = X.reset_index().index
        held_out_months = sorted(set(month_code.tolist()))[-self.n_splits:]

        for current in held_out_months:
            earlier_rows = row_positions[month_code < current]
            current_rows = row_positions[month_code == current]
            yield earlier_rows, current_rows

    def get_n_splits(self, X, y, groups=None):
        """Return the configured number of folds (the arguments are unused)."""
        return self.n_splits

In [None]:
def feature_eng(df_data, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target):
    """Join all auxiliary tables onto ``df_data`` and derive calendar features.

    Parameters
    ----------
    df_data : pl.DataFrame
        Rows to featurise. Must contain ``datetime`` and the keys ``county``,
        ``is_business``, ``product_type`` and ``is_consumption``.
    df_client : pl.DataFrame
        Client table; its ``date`` is shifted forward by 2 days before joining.
    df_gas : pl.DataFrame
        Gas prices; ``forecast_date`` is renamed to ``date`` and shifted +1 day.
    df_electricity : pl.DataFrame
        Electricity prices; ``forecast_date`` is renamed to ``datetime`` and
        shifted +1 day.
    df_forecast : pl.DataFrame
        Weather forecasts; only rows with ``24 <= hours_ahead <= 48`` are kept.
    df_historical : pl.DataFrame
        Historical weather observations.
    df_location : pl.DataFrame
        Maps ``(longitude, latitude)`` to ``county``.
    df_target : pl.DataFrame
        Known targets, joined back as lag features ``target_2`` ... ``target_7``
        and ``target_14`` (lag in days).

    Returns
    -------
    pl.DataFrame
        ``df_data`` with all joined columns plus calendar / cyclic features;
        ``date``, ``datetime``, ``hour`` and ``dayofyear`` are dropped and all
        Float64 columns are cast to Float32.
    """
    client_keys = ["county", "is_business", "product_type", "date"]
    target_keys = ["county", "is_business", "product_type", "is_consumption", "datetime"]
    # Weather is joined once un-shifted (lag 0) and then shifted by these many days.
    weather_lag_days = (3, 7, 14)
    # Target lags in days. Lags of 1, 21 and 28 days are intentionally left out.
    target_lag_days = (2, 3, 4, 5, 6, 7, 14)

    df_data = df_data.with_columns(
        pl.col("datetime").cast(pl.Date).alias("date"),
    )

    df_client = df_client.with_columns(
        (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
    )

    df_gas = (
        df_gas
        .rename({"forecast_date": "date"})
        .with_columns(
            (pl.col("date") + pl.duration(days=1)).cast(pl.Date)
        )
    )

    df_electricity = (
        df_electricity
        .rename({"forecast_date": "datetime"})
        .with_columns(
            pl.col("datetime") + pl.duration(days=1)
        )
    )

    # Coordinates are cast to Float32 everywhere so the station -> county join keys match.
    df_location = df_location.with_columns(
        pl.col("latitude").cast(pl.datatypes.Float32),
        pl.col("longitude").cast(pl.datatypes.Float32)
    )

    df_forecast = (
        df_forecast
        .rename({"forecast_datetime": "datetime"})
        # Both comparisons must be parenthesised: `&` binds tighter than `<=`,
        # so `(a >= 24) & a <= 48` would compare `((a >= 24) & a)` against 48.
        .filter((pl.col("hours_ahead") >= 24) & (pl.col("hours_ahead") <= 48))
        .drop("hours_ahead")
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )
        .join(df_location, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )

    df_historical = (
        df_historical
        .with_columns(
            pl.col("latitude").cast(pl.datatypes.Float32),
            pl.col("longitude").cast(pl.datatypes.Float32),
        )
        .join(df_location, how="left", on=["longitude", "latitude"])
        .drop("longitude", "latitude")
    )

    # "*_date" frames average over every station per timestamp;
    # "*_local" frames average per county and skip stations without a county.
    df_forecast_date = df_forecast.group_by("datetime").mean().drop("county")
    df_forecast_local = (
        df_forecast
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )
    df_historical_date = df_historical.group_by("datetime").mean().drop("county")
    df_historical_local = (
        df_historical
        .filter(pl.col("county").is_not_null())
        .group_by("county", "datetime").mean()
    )

    df_data = (
        df_data
        .join(df_gas, on="date", how="left")
        .join(df_client, on=client_keys, how="left")
        .join(df_electricity, on="datetime", how="left")
    )

    # (frame, join keys, base suffix) - the order fixes both column order and
    # which colliding columns receive which suffix, so it must not be changed.
    weather_sources = [
        (df_forecast_date, "datetime", "_fd"),
        (df_forecast_local, ["county", "datetime"], "_fl"),
        (df_historical_date, "datetime", "_hd"),
        (df_historical_local, ["county", "datetime"], "_hl"),
    ]
    for lag in (0,) + weather_lag_days:
        for frame, keys, suffix in weather_sources:
            if lag:
                # Shift the weather forward so that row t sees the weather of t - lag days.
                frame = frame.with_columns(pl.col("datetime") + pl.duration(days=lag))
                suffix = f"{suffix}_{lag}"
            df_data = df_data.join(frame, on=keys, how="left", suffix=suffix)

    for lag in target_lag_days:
        lagged_target = (
            df_target
            .with_columns(pl.col("datetime") + pl.duration(days=lag))
            .rename({"target": f"target_{lag}"})
        )
        df_data = df_data.join(lagged_target, on=target_keys, how="left")

    df_data = (
        df_data
        .with_columns(
            pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
            pl.col("datetime").dt.hour().alias("hour"),
            pl.col("datetime").dt.day().alias("day"),
            pl.col("datetime").dt.weekday().alias("weekday"),
            pl.col("datetime").dt.month().alias("month"),
            pl.col("datetime").dt.year().alias("year"),
        )
        # One combined categorical identifying the individual time series.
        .with_columns(
            pl.concat_str("county", "is_business", "product_type", "is_consumption", separator="_").alias("category_1"),
        )
        # Cyclic encodings with periods of 366 days, 30 days and 24 hours.
        .with_columns(
            (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
            (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
            (np.pi * pl.col("day") / 15).sin().alias("sin(dayofmonth)"),
            (np.pi * pl.col("day") / 15).cos().alias("cos(dayofmonth)"),
            (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
            (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
        )
        .with_columns(
            pl.col(pl.Float64).cast(pl.Float32),
        )
        .drop("date", "datetime", "hour", "dayofyear")
    )

    return df_data

In [None]:
def to_pandas(X, y=None):
    """Convert engineered polars features to a model-ready pandas frame.

    Parameters
    ----------
    X : pl.DataFrame
        Output of ``feature_eng``; must contain ``row_id``, ``year``, ``month``,
        ``day`` and the ``target_2`` ... ``target_7`` / ``target_14`` lag columns.
    y : pl.DataFrame, optional
        Target column(s) appended to the right of ``X`` when given.

    Returns
    -------
    pd.DataFrame
        Indexed by ``row_id``, with an ``is_holiday`` flag, categorical dtypes
        for the identifier columns, and the aggregates ``target_mean`` /
        ``target_std`` (over ``target_2`` ... ``target_6``) and
        ``target_ratio_7`` (``target_7 / (target_14 + 1e-3)``).
    """
    cat_cols = ["county", "is_business", "product_type", "is_consumption", "category_1", "is_holiday"]
    recent_lag_cols = [f"target_{i}" for i in range(2, 7)]

    if y is not None:
        df = pd.concat([X.to_pandas(), y.to_pandas()], axis=1)
    else:
        df = X.to_pandas()

    # Identify Estonian public holidays as a binary feature.
    estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
    # Convert the `datetime.date` keys to datetime64 explicitly: matching raw date
    # objects against a datetime64 column relies on implicit casting inside `isin`,
    # which newer pandas versions deprecate.
    holiday_dates = pd.to_datetime(list(estonian_holidays.keys()))
    row_dates = pd.to_datetime(df[['year', 'month', 'day']])
    df['is_holiday'] = row_dates.isin(holiday_dates).astype(int)

    df = df.set_index("row_id")
    df[cat_cols] = df[cat_cols].astype("category")

    df["target_mean"] = df[recent_lag_cols].mean(1)
    df["target_std"] = df[recent_lag_cols].std(1)
    # The small constant guards against division by zero.
    df["target_ratio_7"] = df["target_7"] / (df["target_14"] + 1e-3)

    return df

In [None]:
def lgb_objective(trial):
    """Optuna objective: validation MAE of one LightGBM configuration.

    Samples hyper-parameters from ``trial``, fits an ``LGBMRegressor`` on the
    production rows (``is_consumption == 0``) of the global ``df_train`` and
    scores it on the single most recent month via ``MonthlyKFold(1)``.

    Returns the mean absolute error (lower is better).
    """
    fixed_params = {
        'n_iter'       : 1000,
        'verbose'      : -1,
        'random_state' : 42,
        'objective'    : 'l1',
    }
    # Sampling order is kept stable so seeded studies remain reproducible.
    sampled_params = {
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.1),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 4, 512),
        'max_depth'        : trial.suggest_int('max_depth', 5, 12),
        'max_bin'          : trial.suggest_int('max_bin', 32, 1024),
        'num_leaves'       : trial.suggest_int('num_leaves', 16, 512),
    }
    regressor = lgb.LGBMRegressor(**fixed_params, **sampled_params)

    production_rows = df_train[df_train['is_consumption']==0]
    features = production_rows.drop(columns=["target"])
    labels = production_rows["target"]

    fold_scores = cross_val_score(
        regressor, features, labels,
        cv=MonthlyKFold(1),
        scoring='neg_mean_absolute_error',
    )

    # sklearn reports negated MAE; flip the sign so Optuna can minimise it.
    return -1 * np.mean(fold_scores)

To kick things off, we begin by performing a pivoting operation on the training data to obtain distinct time series.

In [None]:
# NOTE: this hard-coded path points at the same directory as the `root` variable
# that is defined further down in the notebook.
train = pd.read_csv("/kaggle/input/predict-energy-behavior-of-prosumers/train.csv")

# Pivot the training data to have a cleaner DataFrame where we can analyze the mean target values
# organized by datetime and various categorical variables.
# Result: one row per datetime, one column per (county, product_type, is_business, is_consumption).
pivot_train = train.pivot_table(index='datetime',columns=['county','product_type','is_business','is_consumption'], values='target', aggfunc='mean')

# Renaming columns for easier access and interpretation
# (flattens the 4-level column index into a single descriptive string per series).
pivot_train.columns = ['county{}_productType{}_isBusiness{}_isConsumption{}'.format(*col) for col in pivot_train.columns.values]
# Parse the index as datetimes so the resampling / date slicing in the plotting cells works.
pivot_train.index = pd.to_datetime(pivot_train.index)

# Bare expression: displays the pivoted frame as the cell output.
pivot_train

Upon visualizing the data for the past year, with daily average values plotted, an intriguing pattern emerges among the 138 time series. They can be classified into two distinct groups based on their trends — either an upward trajectory or a downward one.

As you may already know, the upward-trending series predominantly represent energy consumption, while the downward-trending ones are mostly energy production, specifically solar power generation.

In [None]:
# Min-max scale every series (column) to [0, 1] so that series of very
# different magnitudes can be compared on a single axis.
df_plot = pivot_train.copy()
df_plot = (df_plot - df_plot.min())/(df_plot.max() - df_plot.min())
# Average the hourly values into one value per day.
df_plot_resampled_D = df_plot.resample('D').mean()

# Plot every series (consumption and production alike) from July 2022 onward
# as translucent gray lines (alpha=0.1).
df_plot_resampled_D.loc['2022-7':].plot(alpha=0.1, color='gray', figsize=(15, 6), legend=False)

If we color these time series using the values from the `is_consumption` variable, with 0 denoting green and 1 representing blue, it becomes evident that the **green lines consistently align with solar radiation**, as illustrated in the subsequent plot.

In [None]:
# Split the daily series by the is_consumption flag encoded in the column names
columns_consumption_0 = df_plot_resampled_D.columns[df_plot_resampled_D.columns.str.contains('isConsumption0')]
columns_consumption_1 = df_plot_resampled_D.columns[df_plot_resampled_D.columns.str.contains('isConsumption1')]

plt.figure(figsize=(15, 6))

# Empty proxy lines give the legend exactly one entry per group
plt.plot([], color='blue', label='is_Consumption = 1')
plt.plot([], color='green', label='is_Consumption = 0')
plt.legend()

# Draw is_Consumption = 0 in green first, then is_Consumption = 1 in blue,
# both restricted to July 2022 onward
for group_columns, group_color in ((columns_consumption_0, 'green'), (columns_consumption_1, 'blue')):
    for column in group_columns:
        df_plot_resampled_D.loc['2022-7':, column].plot(alpha=0.1, color=group_color, legend=False)

# Render the figure
plt.show()

This forms the foundation of our hypothesis that developing a separate model for energy production data could be beneficial, given the distinct characteristics of energy production when compared to consumption.

The subsequent code modifications involve the introduction of a new LightGBM model for production data (where `df_train["is_consumption"] == 0`), and the replacement of the production predictions with the new solar model's output in the final submission, while all other prediction values remain the same as in the original model.

# Do you feel the sun power? 😎

In [None]:
# Directory that holds all competition CSV files.
root = "/kaggle/input/predict-energy-behavior-of-prosumers"

# Columns read from each CSV; the same lists are reused in the prediction loop
# to select columns from the frames handed over by the competition API.
data_cols        = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'row_id']
client_cols      = ['product_type', 'county', 'eic_count', 'installed_capacity', 'is_business', 'date']
gas_cols         = ['forecast_date', 'lowest_price_per_mwh', 'highest_price_per_mwh']
electricity_cols = ['forecast_date', 'euros_per_mwh']
forecast_cols    = ['latitude', 'longitude', 'hours_ahead', 'temperature', 'dewpoint', 'cloudcover_high', 'cloudcover_low', 'cloudcover_mid', 'cloudcover_total', '10_metre_u_wind_component', '10_metre_v_wind_component', 'forecast_datetime', 'direct_solar_radiation', 'surface_solar_radiation_downwards', 'snowfall', 'total_precipitation']
historical_cols  = ['datetime', 'temperature', 'dewpoint', 'rain', 'snowfall', 'surface_pressure','cloudcover_total','cloudcover_low','cloudcover_mid','cloudcover_high','windspeed_10m','winddirection_10m','shortwave_radiation','direct_solar_radiation','diffuse_radiation','latitude','longitude']
location_cols    = ['longitude', 'latitude', 'county']
target_cols      = ['target', 'county', 'is_business', 'product_type', 'is_consumption', 'datetime']

# Optional model persistence (used by the training cell):
#   save_path - if not None, both fitted models are pickled (the solar model gets a "_solar" suffix).
#   load_path - if not None, models are unpickled from disk instead of being trained.
save_path = None
load_path = None

In [None]:
# Read only the columns listed in the config cell; try_parse_dates lets polars
# parse date/datetime-looking columns while reading.
df_data        = pl.read_csv(os.path.join(root, "train.csv"), columns=data_cols, try_parse_dates=True)
df_client      = pl.read_csv(os.path.join(root, "client.csv"), columns=client_cols, try_parse_dates=True)
df_gas         = pl.read_csv(os.path.join(root, "gas_prices.csv"), columns=gas_cols, try_parse_dates=True)
df_electricity = pl.read_csv(os.path.join(root, "electricity_prices.csv"), columns=electricity_cols, try_parse_dates=True)
df_forecast    = pl.read_csv(os.path.join(root, "forecast_weather.csv"), columns=forecast_cols, try_parse_dates=True)
df_historical  = pl.read_csv(os.path.join(root, "historical_weather.csv"), columns=historical_cols, try_parse_dates=True)
df_location    = pl.read_csv(os.path.join(root, "weather_station_to_county_mapping.csv"), columns=location_cols, try_parse_dates=True)
# The target history is a column subset of the training data; it feeds the lag features.
df_target      = df_data.select(target_cols)

# Remember the training schemas: the prediction loop passes them as
# `schema_overrides` so the frames built at inference time use the same dtypes.
schema_data        = df_data.schema
schema_client      = df_client.schema
schema_gas         = df_gas.schema
schema_electricity = df_electricity.schema
schema_forecast    = df_forecast.schema
schema_historical  = df_historical.schema
schema_target      = df_target.schema

### Feature Engineering

In [None]:
# Separate the features from the target before feature engineering.
X, y = df_data.drop("target"), df_data.select("target")

# Join prices, client info, weather and lagged targets onto every training row.
X = feature_eng(X, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)

# Convert to pandas (indexed by row_id) and re-attach the target column.
df_train = to_pandas(X, y)

In [None]:
# Keep only rows with a known target; the full history is used for training.
# An alternative that additionally restricts training to 2022 onward is left disabled:
# df_train = df_train[df_train["target"].notnull() & (df_train["year"].ge(2022))]
df_train = df_train[df_train["target"].notnull()]

### HyperParam Optimization

Note: each iteration of hyperparameter optimization takes about 3 min 30 s to run.

In [None]:
# study = optuna.create_study(direction='minimize', study_name='Regressor')
# study.optimize(lgb_objective, n_trials=10, show_progress_bar=True)

In [None]:
# Ensemble strategy: five Optuna-selected parameter sets per task, each used to
# train its own GBM regressor. Settings shared by every set live in one place.
_shared_lgb_params = {'n_iter': 1000, 'verbose': -1, 'objective': 'l1'}

# --- Parameter sets for the consumption models ---
c_params_1 = {
    **_shared_lgb_params,
    'learning_rate': 0.04821377423576645,
    'colsample_bytree': 0.5497196655514527,
    'colsample_bynode': 0.5274840071632579,
    'lambda_l1': 9.77879680143732,
    'lambda_l2': 5.791892001041021,
    'min_data_in_leaf': 481,
    'max_depth': 11,
    'max_bin': 511,
    'num_leaves': 129,
}

c_params_2 = {
    **_shared_lgb_params,
    'learning_rate': 0.0648413470041428,
    'colsample_bytree': 0.9238860133873772,
    'colsample_bynode': 0.9327729120937044,
    'lambda_l1': 1.0514727461524573,
    'lambda_l2': 5.8473607663399685,
    'min_data_in_leaf': 397,
    'max_depth': 9,
    'max_bin': 246,
    'num_leaves': 381,
}

c_params_3 = {
    **_shared_lgb_params,
    'learning_rate': 0.05315768053078176,
    'colsample_bytree': 0.6787574884682299,
    'colsample_bynode': 0.5943348870186838,
    'lambda_l1': 3.1690894965046614,
    'lambda_l2': 7.5744315883391735,
    'min_data_in_leaf': 143,
    'max_depth': 12,
    'max_bin': 670,
    'num_leaves': 489,
}

c_params_4 = {
    **_shared_lgb_params,
    'learning_rate': 0.023198448443733553,
    'colsample_bytree': 0.9963976012610497,
    'colsample_bynode': 0.632271439449325,
    'lambda_l1': 5.262597596637711,
    'lambda_l2': 1.990267982972063,
    'min_data_in_leaf': 431,
    'max_depth': 10,
    'max_bin': 612,
    'num_leaves': 367,
}

c_params_5 = {
    **_shared_lgb_params,
    'learning_rate': 0.0921786254132783,
    'colsample_bytree': 0.707024903411793,
    'colsample_bynode': 0.6296431177800739,
    'lambda_l1': 0.19057707138540486,
    'lambda_l2': 0.3693298584783809,
    'min_data_in_leaf': 469,
    'max_depth': 8,
    'max_bin': 318,
    'num_leaves': 290,
}

# --- Parameter sets for the energy production models ---
p_params_1 = {
    **_shared_lgb_params,
    'learning_rate': 0.0694601099159599,
    'colsample_bytree': 0.9402567874359262,
    'colsample_bynode': 0.5997215103835294,
    'lambda_l1': 8.324749667075178,
    'lambda_l2': 3.8152229017561003,
    'min_data_in_leaf': 45,
    'max_depth': 12,
    'max_bin': 885,
    'num_leaves': 315,
}

p_params_2 = {
    **_shared_lgb_params,
    'learning_rate': 0.0833938661632418,
    'colsample_bytree': 0.6143745897951222,
    'colsample_bynode': 0.6322055379594584,
    'lambda_l1': 8.419801316576436,
    'lambda_l2': 1.07301441651146,
    'min_data_in_leaf': 454,
    'max_depth': 12,
    'max_bin': 970,
    'num_leaves': 127,
}

p_params_3 = {
    **_shared_lgb_params,
    'learning_rate': 0.09398750601092679,
    'colsample_bytree': 0.6280596991836258,
    'colsample_bynode': 0.6379779219482682,
    'lambda_l1': 4.221770392358979,
    'lambda_l2': 5.713993750225508,
    'min_data_in_leaf': 180,
    'max_depth': 11,
    'max_bin': 544,
    'num_leaves': 266,
}

p_params_4 = {
    **_shared_lgb_params,
    'learning_rate': 0.05433580227049732,
    'colsample_bytree': 0.8386685084562422,
    'colsample_bynode': 0.9310187766170934,
    'lambda_l1': 7.3906618978965195,
    'lambda_l2': 6.073387950064185,
    'min_data_in_leaf': 177,
    'max_depth': 12,
    'max_bin': 577,
    'num_leaves': 487,
}

p_params_5 = {
    **_shared_lgb_params,
    'learning_rate': 0.09088669147433569,
    'colsample_bytree': 0.7020454250886452,
    'colsample_bynode': 0.9844810334453777,
    'lambda_l1': 6.553300533986207,
    'lambda_l2': 0.3035575347785749,
    'min_data_in_leaf': 151,
    'max_depth': 9,
    'max_bin': 980,
    'num_leaves': 429,
}

### Validation

In [None]:
# Validation is currently DISABLED: the code below is wrapped in a string literal,
# so none of it is executed (the cell merely evaluates to that string).
# Remove the surrounding triple quotes to cross-validate the first consumption
# parameter set (c_params_1) and the first production parameter set (p_params_1)
# on the most recent month via MonthlyKFold(1).
'''result = cross_validate(
    estimator=lgb.LGBMRegressor(**c_params_1, random_state=42),
    X=df_train[df_train['is_consumption']==1].drop(columns=["target"]), 
    y=df_train[df_train['is_consumption']==1]["target"],
    scoring="neg_mean_absolute_error",
    cv=MonthlyKFold(1),
)

result_solar = cross_validate(
    estimator=lgb.LGBMRegressor(**p_params_1, random_state=42),
    X=df_train[df_train['is_consumption']==0].drop(columns=["target"]), 
    y=df_train[df_train['is_consumption']==0]["target"],
    scoring="neg_mean_absolute_error",
    cv=MonthlyKFold(1),
)

print(f"Fit Time(s): {result['fit_time'].mean():.3f}")
print(f"Score Time(s): {result['score_time'].mean():.3f}")
print(f"Error(MAE): {-result['test_score'].mean():.3f}")

print(f"Fit Time(s): {result_solar['fit_time'].mean():.3f}")
print(f"Score Time(s): {result_solar['score_time'].mean():.3f}")
print(f"Error(MAE): {-result_solar['test_score'].mean():.3f}")'''

### Training

In [None]:
def _solar_path(path):
    """Return ``path`` with ``_solar`` inserted just before the file extension.

    ``os.path.splitext`` only splits at the final extension, so directory parts
    containing dots (e.g. ``./models/m.pkl``) are left intact and a path without
    any extension still yields a name different from ``path`` itself.
    """
    stem, extension = os.path.splitext(path)
    return f"{stem}_solar{extension}"


if load_path is not None:
    # Re-use previously fitted models instead of training.
    # NOTE: unpickling executes arbitrary code - only load files you created yourself.
    load_solar_path = _solar_path(load_path)
    with open(load_path, "rb") as f:
        model = pickle.load(f)
    with open(load_solar_path, "rb") as f:
        model_solar = pickle.load(f)
else:
    # Consumption ensemble: five LightGBM regressors averaged by VotingRegressor.
    model = VotingRegressor([
        ('lgb_1', lgb.LGBMRegressor(**c_params_1, random_state=100)),
        ('lgb_2', lgb.LGBMRegressor(**c_params_2, random_state=101)),
        ('lgb_3', lgb.LGBMRegressor(**c_params_3, random_state=102)),
        ('lgb_4', lgb.LGBMRegressor(**c_params_4, random_state=103)),
        ('lgb_5', lgb.LGBMRegressor(**c_params_5, random_state=104)),
    ])

    # Production ("solar") ensemble, built the same way from the p_params_* sets.
    model_solar = VotingRegressor([
        ('lgb_6', lgb.LGBMRegressor(**p_params_1, random_state=105)),
        ('lgb_7', lgb.LGBMRegressor(**p_params_2, random_state=106)),
        ('lgb_8', lgb.LGBMRegressor(**p_params_3, random_state=107)),
        ('lgb_9', lgb.LGBMRegressor(**p_params_4, random_state=108)),
        ('lgb_10', lgb.LGBMRegressor(**p_params_5, random_state=109)),
    ])

    # Each ensemble only ever sees the rows of its own task.
    consumption_rows = df_train[df_train['is_consumption']==1]
    production_rows = df_train[df_train['is_consumption']==0]

    model.fit(
        X=consumption_rows.drop(columns=["target"]),
        y=consumption_rows["target"]
    )

    model_solar.fit(
        X=production_rows.drop(columns=["target"]),
        y=production_rows["target"]
    )

if save_path is not None:
    # Persist both models; the solar model is stored next to the main one.
    save_solar_path = _solar_path(save_path)
    with open(save_path, "wb") as f:
        pickle.dump(model, f)
    with open(save_solar_path, "wb") as f:
        pickle.dump(model_solar, f)

### Model Analysis

In [None]:
# Top-20 feature importances of the first fitted member of the consumption ensemble.
lgb.plot_importance(model.estimators_[0], max_num_features=20, figsize=(8, 8))

In [None]:
# Top-20 feature importances of the first fitted member of the production (solar) ensemble.
lgb.plot_importance(model_solar.estimators_[0], max_num_features=20, figsize=(8, 8))

### Prediction

In [None]:
# NOTE(review): `enefit` is presumably the competition's inference API module that is
# only available inside the Kaggle environment, which would explain why it is imported
# here rather than with the other imports - confirm before moving it.
import enefit

# Create the environment and the iterator that the prediction loop consumes.
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
# Each iteration yields one batch of pandas frames: the rows to predict, newly revealed
# targets, fresh auxiliary tables, and the submission template to fill in.
for (test, revealed_targets, client, historical_weather,
        forecast_weather, electricity_prices, gas_prices, sample_prediction) in iter_test:
    
    # Use the same timestamp column name as the training data.
    test = test.rename(columns={"prediction_datetime": "datetime"})
    
    # Convert to polars with the training dtypes; data_cols[1:] skips 'target',
    # which the test rows do not have.
    df_test           = pl.from_pandas(test[data_cols[1:]], schema_overrides=schema_data)
    # NOTE: df_client, df_gas and df_electricity are rebound to this batch's tables
    # (not accumulated), replacing the training frames of the same name.
    df_client         = pl.from_pandas(client[client_cols], schema_overrides=schema_client)
    df_gas            = pl.from_pandas(gas_prices[gas_cols], schema_overrides=schema_gas)
    df_electricity    = pl.from_pandas(electricity_prices[electricity_cols], schema_overrides=schema_electricity)
    df_new_forecast   = pl.from_pandas(forecast_weather[forecast_cols], schema_overrides=schema_forecast)
    df_new_historical = pl.from_pandas(historical_weather[historical_cols], schema_overrides=schema_historical)
    df_new_target     = pl.from_pandas(revealed_targets[target_cols], schema_overrides=schema_target)
    
    # Weather and target history grow across iterations (needed for the lag features);
    # unique() drops rows that are exact duplicates of ones already stored.
    df_forecast       = pl.concat([df_forecast, df_new_forecast]).unique()
    df_historical     = pl.concat([df_historical, df_new_historical]).unique()
    df_target         = pl.concat([df_target, df_new_target]).unique()
    
    # Same feature pipeline as in training, without a target column.
    X_test = feature_eng(df_test, df_client, df_gas, df_electricity, df_forecast, df_historical, df_location, df_target)
    X_test = to_pandas(X_test)
    
    # Both ensembles predict every row; negative predictions are clipped to zero.
    # Predictions are assigned positionally, so X_test must keep the row order of `test`.
    test['target'] = model.predict(X_test).clip(0)
    test['target_solar'] = model_solar.predict(X_test).clip(0)
    # Production rows (is_consumption == 0) take the solar ensemble's prediction instead.
    test.loc[test['is_consumption']==0, "target"] = test.loc[test['is_consumption']==0, "target_solar"]    
    
    # NOTE(review): Series assignment aligns on the index - this assumes `test` and
    # `sample_prediction` share the same index; confirm against the API.
    sample_prediction["target"] = test['target']
    
    # Submit this batch's predictions to the environment.
    env.predict(sample_prediction)