**Baseline was taken from the beautiful** [Enefit Generic Notebook](https://www.kaggle.com/code/greysky/enefit-generic-notebook)  **and** [Enefit - lgb with regression_l1 objective](https://www.kaggle.com/code/davero/enefit-lgb-with-regression-l1-objective) 🙏

**Some baseline techniques and competition mechanics are demonstrated in the** [Explain Dataset and Baseline](https://www.kaggle.com/code/vitalykudelya/explain-dataset-and-baseline)

In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import gc
import holidays
import pickle

import numpy as np
import pandas as pd
import polars as pl
# import plotly.express as px

import optuna
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import cross_val_score, cross_validate


# Classes

### DataStorage

In [2]:
class DataStorage:
    """Holds every competition table as a polars DataFrame.

    The constructor reads the CSV files found under ``root``; afterwards
    ``update_with_new_data`` appends freshly delivered pandas frames and
    ``preprocess_test`` converts a pandas test frame to polars.
    """

    # root = "/kaggle/input/predict-energy-behavior-of-prosumers"
    root = "competition_data"

    # Columns kept from train.csv ("target" is deliberately first, see preprocess_test).
    data_cols = [
        "target", "county", "is_business", "product_type",
        "is_consumption", "datetime", "row_id",
    ]
    # Columns kept from client.csv.
    client_cols = [
        "product_type", "county", "eic_count",
        "installed_capacity", "is_business", "date",
    ]
    # Columns kept from gas_prices.csv and electricity_prices.csv.
    gas_prices_cols = ["forecast_date", "lowest_price_per_mwh", "highest_price_per_mwh"]
    electricity_prices_cols = ["forecast_date", "euros_per_mwh"]
    # Columns kept from forecast_weather.csv.
    forecast_weather_cols = [
        "latitude", "longitude", "hours_ahead",
        "temperature", "dewpoint",
        "cloudcover_high", "cloudcover_low", "cloudcover_mid", "cloudcover_total",
        "10_metre_u_wind_component", "10_metre_v_wind_component",
        "forecast_datetime",
        "direct_solar_radiation", "surface_solar_radiation_downwards",
        "snowfall", "total_precipitation",
    ]
    # Columns kept from historical_weather.csv.
    historical_weather_cols = [
        "datetime",
        "temperature", "dewpoint", "rain", "snowfall", "surface_pressure",
        "cloudcover_total", "cloudcover_low", "cloudcover_mid", "cloudcover_high",
        "windspeed_10m", "winddirection_10m",
        "shortwave_radiation", "direct_solar_radiation", "diffuse_radiation",
        "latitude", "longitude",
    ]
    # Columns kept from weather_station_to_county_mapping.csv.
    location_cols = ["longitude", "latitude", "county"]
    # Columns of the target history frame (data_cols without "row_id").
    target_cols = [
        "target", "county", "is_business",
        "product_type", "is_consumption", "datetime",
    ]

    def _read_csv(self, file_name, columns):
        """Read ``file_name`` from ``root``, keeping ``columns`` and parsing dates."""
        return pl.read_csv(
            os.path.join(self.root, file_name),
            columns=columns,
            try_parse_dates=True,
        )

    def __init__(self):
        """Load all CSV tables and remember their schemas for later updates."""
        self.df_data = self._read_csv("train.csv", self.data_cols)
        self.df_client = self._read_csv("client.csv", self.client_cols)
        self.df_gas_prices = self._read_csv("gas_prices.csv", self.gas_prices_cols)
        self.df_electricity_prices = self._read_csv(
            "electricity_prices.csv", self.electricity_prices_cols
        )
        self.df_forecast_weather = self._read_csv(
            "forecast_weather.csv", self.forecast_weather_cols
        )
        self.df_historical_weather = self._read_csv(
            "historical_weather.csv", self.historical_weather_cols
        )
        self.df_weather_station_to_county_mapping = self._read_csv(
            "weather_station_to_county_mapping.csv", self.location_cols
        )

        # Training rows before 2022-01-01 are discarded; the target history
        # is taken from the remaining rows.
        first_kept_timestamp = pd.to_datetime("2022-01-01")
        self.df_data = self.df_data.filter(pl.col("datetime") >= first_kept_timestamp)
        self.df_target = self.df_data.select(self.target_cols)

        # Schemas are reused as overrides when pandas frames are converted later.
        self.schema_data = self.df_data.schema
        self.schema_client = self.df_client.schema
        self.schema_gas_prices = self.df_gas_prices.schema
        self.schema_electricity_prices = self.df_electricity_prices.schema
        self.schema_forecast_weather = self.df_forecast_weather.schema
        self.schema_historical_weather = self.df_historical_weather.schema
        self.schema_target = self.df_target.schema

        # Station coordinates are stored as Float32.
        coordinate_casts = [
            pl.col(name).cast(pl.datatypes.Float32)
            for name in ("latitude", "longitude")
        ]
        self.df_weather_station_to_county_mapping = (
            self.df_weather_station_to_county_mapping.with_columns(*coordinate_casts)
        )

    def update_with_new_data(
        self,
        df_new_client,
        df_new_gas_prices,
        df_new_electricity_prices,
        df_new_forecast_weather,
        df_new_historical_weather,
        df_new_target,
    ):
        """Append new pandas frames to the stored tables and de-duplicate them.

        Each argument is a pandas DataFrame for the table its name refers to.
        All frames are converted first; only then are the stored tables replaced.
        """

        def as_polars(frame, columns, schema):
            # Keep the known columns and coerce them to the stored schema.
            return pl.from_pandas(frame[columns], schema_overrides=schema)

        def merged(stored, incoming, key_columns):
            # Stack old and new rows, then keep one row per key.
            return pl.concat([stored, incoming]).unique(key_columns)

        new_client = as_polars(df_new_client, self.client_cols, self.schema_client)
        new_gas = as_polars(
            df_new_gas_prices, self.gas_prices_cols, self.schema_gas_prices
        )
        new_electricity = as_polars(
            df_new_electricity_prices,
            self.electricity_prices_cols,
            self.schema_electricity_prices,
        )
        new_forecast = as_polars(
            df_new_forecast_weather,
            self.forecast_weather_cols,
            self.schema_forecast_weather,
        )
        new_historical = as_polars(
            df_new_historical_weather,
            self.historical_weather_cols,
            self.schema_historical_weather,
        )
        new_target = as_polars(df_new_target, self.target_cols, self.schema_target)

        self.df_client = merged(
            self.df_client,
            new_client,
            ["date", "county", "is_business", "product_type"],
        )
        self.df_gas_prices = merged(self.df_gas_prices, new_gas, ["forecast_date"])
        self.df_electricity_prices = merged(
            self.df_electricity_prices, new_electricity, ["forecast_date"]
        )
        self.df_forecast_weather = merged(
            self.df_forecast_weather,
            new_forecast,
            ["forecast_datetime", "latitude", "longitude", "hours_ahead"],
        )
        self.df_historical_weather = merged(
            self.df_historical_weather,
            new_historical,
            ["datetime", "latitude", "longitude"],
        )
        self.df_target = merged(
            self.df_target,
            new_target,
            ["datetime", "county", "is_business", "product_type", "is_consumption"],
        )

    def preprocess_test(self, df_test):
        """Convert a pandas test frame to polars using the training schema.

        ``prediction_datetime`` is renamed to ``datetime`` and every entry of
        ``data_cols`` except the first one ("target") is selected.
        """
        renamed = df_test.rename(columns={"prediction_datetime": "datetime"})
        columns_without_target = self.data_cols[1:]
        return pl.from_pandas(
            renamed[columns_without_target], schema_overrides=self.schema_data
        )


### FeaturesGenerator

In [3]:
class FeaturesGenerator:
    """Builds the model feature table from the frames held by ``data_storage``.

    ``generate_features`` is the entry point; the ``_add_*`` helpers each join
    one family of features onto the frame of prediction items.
    """

    def __init__(self, data_storage):
        """Keep a reference to the storage object whose frames are joined in."""
        self.data_storage = data_storage

    def _add_general_features(self, df_features):
        """Add calendar parts, the ``segment`` key and cyclic day/hour encodings."""
        df_features = (
            df_features.with_columns(
                pl.col("datetime").dt.ordinal_day().alias("dayofyear"),
                pl.col("datetime").dt.hour().alias("hour"),
                pl.col("datetime").dt.day().alias("day"),
                pl.col("datetime").dt.weekday().alias("weekday"),
                pl.col("datetime").dt.month().alias("month"),
                pl.col("datetime").dt.year().alias("year"),
            )
            .with_columns(
                # One string key per (county, is_business, product_type, is_consumption).
                pl.concat_str(
                    "county",
                    "is_business",
                    "product_type",
                    "is_consumption",
                    separator="_",
                ).alias("segment"),
            )
            .with_columns(
                (np.pi * pl.col("dayofyear") / 183).sin().alias("sin(dayofyear)"),
                (np.pi * pl.col("dayofyear") / 183).cos().alias("cos(dayofyear)"),
                (np.pi * pl.col("hour") / 12).sin().alias("sin(hour)"),
                (np.pi * pl.col("hour") / 12).cos().alias("cos(hour)"),
            )
        )
        return df_features

    def _add_client_features(self, df_features):
        """Left-join client rows whose ``date`` has been shifted forward by two days."""
        df_client = self.data_storage.df_client

        df_features = df_features.join(
            df_client.with_columns(
                # The expression keeps the name "date", so it replaces that column.
                (pl.col("date") + pl.duration(days=2)).cast(pl.Date)
            ),
            on=["county", "is_business", "product_type", "date"],
            how="left",
        )
        return df_features

    def _add_forecast_weather_features(self, df_features):
        """Join forecast-weather means (overall and per county) at 0h and 168h lags."""
        df_forecast_weather = self.data_storage.df_forecast_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_forecast_weather = (
            df_forecast_weather.rename({"forecast_datetime": "datetime"})
            # Both comparisons must be parenthesised: `&` binds tighter than
            # `<=`, so without the second pair of parentheses the expression
            # is `((hours_ahead >= 22) & hours_ahead) <= 45` and the window
            # is not applied.
            .filter(
                (pl.col("hours_ahead") >= 22) & (pl.col("hours_ahead") <= 45)
            )
            .drop("hours_ahead")
            .with_columns(
                # Same dtype as the mapping frame's coordinates so the join keys match.
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Mean over all grid points per timestamp.
        df_forecast_weather_date = (
            df_forecast_weather.group_by("datetime").mean().drop("county")
        )

        # Mean per county, using only grid points mapped to a county.
        df_forecast_weather_local = (
            df_forecast_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [0, 7 * 24]:
            df_features = df_features.join(
                df_forecast_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_forecast_{hours_lag}h",
            )
            df_features = df_features.join(
                df_forecast_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_forecast_local_{hours_lag}h",
            )

        return df_features

    def _add_historical_weather_features(self, df_features):
        """Join historical-weather means (overall and per county) at several lags."""
        df_historical_weather = self.data_storage.df_historical_weather
        df_weather_station_to_county_mapping = (
            self.data_storage.df_weather_station_to_county_mapping
        )

        df_historical_weather = (
            df_historical_weather.with_columns(
                pl.col("latitude").cast(pl.datatypes.Float32),
                pl.col("longitude").cast(pl.datatypes.Float32),
            )
            .join(
                df_weather_station_to_county_mapping,
                how="left",
                on=["longitude", "latitude"],
            )
            .drop("longitude", "latitude")
        )

        # Mean over all stations per timestamp.
        df_historical_weather_date = (
            df_historical_weather.group_by("datetime").mean().drop("county")
        )

        # Mean per county, using only stations mapped to a county.
        df_historical_weather_local = (
            df_historical_weather.filter(pl.col("county").is_not_null())
            .group_by("county", "datetime")
            .mean()
        )

        for hours_lag in [2 * 24, 7 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )
            df_features = df_features.join(
                df_historical_weather_local.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ),
                on=["county", "datetime"],
                how="left",
                suffix=f"_historical_local_{hours_lag}h",
            )

        # The 24h lag only uses observations whose hour of day is <= 10.
        for hours_lag in [1 * 24]:
            df_features = df_features.join(
                df_historical_weather_date.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag),
                    pl.col("datetime").dt.hour().alias("hour"),
                )
                .filter(pl.col("hour") <= 10)
                .drop("hour"),
                on="datetime",
                how="left",
                suffix=f"_historical_{hours_lag}h",
            )

        return df_features

    def _add_target_features(self, df_features):
        """Join lagged targets, lagged aggregate targets, row statistics and ratios."""
        df_target = self.data_storage.df_target

        # Target summed over product types.
        df_target_all_type_sum = (
            df_target.group_by(["datetime", "county", "is_business", "is_consumption"])
            .sum()
            .drop("product_type")
        )

        # Target summed over product types and counties.
        df_target_all_county_type_sum = (
            df_target.group_by(["datetime", "is_business", "is_consumption"])
            .sum()
            .drop("product_type", "county")
        )

        # Add log of target value to df_target.
        # NOTE(review): log() of a zero target is -inf, which then flows into
        # target_mean / target_std below; consider log1p if zeros occur — confirm.
        df_target = df_target.with_columns(
            pl.col("target").log().alias("target_log")
        )

        segment_keys = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "datetime",
        ]

        # Per-segment target (and its log) lagged by 2..14 whole days.
        for hours_lag in [days * 24 for days in range(2, 15)]:
            df_features = df_features.join(
                df_target.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename(
                    {
                        "target": f"target_{hours_lag}h",
                        "target_log": f"target_log_{hours_lag}h",
                    }
                ),
                on=segment_keys,
                how="left",
            )

        for hours_lag in [2 * 24, 3 * 24, 7 * 24, 14 * 24]:
            df_features = df_features.join(
                df_target_all_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_type_sum_{hours_lag}h"}),
                on=["county", "is_business", "is_consumption", "datetime"],
                how="left",
            )

            df_features = df_features.join(
                df_target_all_county_type_sum.with_columns(
                    pl.col("datetime") + pl.duration(hours=hours_lag)
                ).rename({"target": f"target_all_county_type_sum_{hours_lag}h"}),
                on=["is_business", "is_consumption", "datetime"],
                how="left",
                suffix=f"_all_county_type_sum_{hours_lag}h",
            )

        # Row-wise mean / std over the 2..5 day lags of target and target_log.
        stat_lags = [2 * 24, 3 * 24, 4 * 24, 5 * 24]
        cols_for_stats = [f"target_{hours_lag}h" for hours_lag in stat_lags] + [
            f"target_log_{hours_lag}h" for hours_lag in stat_lags
        ]
        df_features = df_features.with_columns(
            df_features.select(cols_for_stats).mean(axis=1).alias("target_mean"),
            df_features.select(cols_for_stats)
            .transpose()
            .std()
            .transpose()
            .to_series()
            .alias("target_std"),
        )

        # Ratios between two lags of the same series; the 1e-3 keeps the
        # denominator away from an exact zero.
        for target_prefix, lag_numerator, lag_denominator in [
            ("target", 24 * 7, 24 * 14),
            ("target", 24 * 2, 24 * 9),
            ("target", 24 * 3, 24 * 10),
            ("target", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 2, 24 * 3),
            ("target_all_type_sum", 24 * 7, 24 * 14),
            ("target_all_county_type_sum", 24 * 2, 24 * 3),
            ("target_all_county_type_sum", 24 * 7, 24 * 14),
        ]:
            df_features = df_features.with_columns(
                (
                    pl.col(f"{target_prefix}_{lag_numerator}h")
                    / (pl.col(f"{target_prefix}_{lag_denominator}h") + 1e-3)
                ).alias(f"{target_prefix}_ratio_{lag_numerator}_{lag_denominator}")
            )

        return df_features

    def _reduce_memory_usage(self, df_features):
        """Downcast every Float64 column to Float32."""
        df_features = df_features.with_columns(pl.col(pl.Float64).cast(pl.Float32))
        return df_features

    def _drop_columns(self, df_features):
        """Drop the helper columns that are not used as model inputs."""
        df_features = df_features.drop(
            "date", "datetime", "hour", "dayofyear"
        )
        return df_features

    def _to_pandas(self, df_features, y):
        """Convert to pandas, append ``y`` if given, add ``is_holiday``, set dtypes.

        Returns a pandas DataFrame indexed by ``row_id`` whose key columns are
        of ``category`` dtype.
        """
        cat_cols = [
            "county",
            "is_business",
            "product_type",
            "is_consumption",
            "segment",
            "weekday",
        ]

        if y is not None:
            df_features = pd.concat([df_features.to_pandas(), y.to_pandas()], axis=1)
        else:
            df_features = df_features.to_pandas()

        # Add holidays as a 0/1 feature, rebuilt from the year/month/day columns
        # because the original date columns were dropped earlier.
        estonian_holidays = holidays.country_holidays('EE', years=range(2021, 2026))
        estonian_holidays_keys = list(estonian_holidays.keys())
        row_dates = pd.to_datetime(df_features[['year', 'month', 'day']])
        df_features['is_holiday'] = row_dates.isin(estonian_holidays_keys).astype(int)

        df_features = df_features.set_index("row_id")
        df_features[cat_cols] = df_features[cat_cols].astype("category")

        return df_features

    def generate_features(self, df_prediction_items):
        """Return the pandas feature table for a polars frame of prediction items.

        If ``df_prediction_items`` has a ``target`` column it is split off before
        the joins and re-attached as the ``target`` column of the result.
        """
        if "target" in df_prediction_items.columns:
            df_prediction_items, y = (
                df_prediction_items.drop("target"),
                df_prediction_items.select("target"),
            )
        else:
            y = None

        df_features = df_prediction_items.with_columns(
            pl.col("datetime").cast(pl.Date).alias("date"),
        )

        for add_features in [
            self._add_general_features,
            self._add_client_features,
            self._add_forecast_weather_features,
            self._add_historical_weather_features,
            self._add_target_features,
            self._reduce_memory_usage,
            self._drop_columns,
        ]:
            df_features = add_features(df_features)

        df_features = self._to_pandas(df_features, y)

        return df_features


### Model

In [4]:
class MonthlyKFold:
    """Time-ordered splitter that validates on each of the last ``n_splits`` months.

    For every held-out month the training indices are all rows from strictly
    earlier months. ``X`` must provide ``year`` and ``month`` columns.
    """

    def __init__(self, n_splits=3):
        self.n_splits = n_splits

    def split(self, X, y, groups=None):
        """Yield ``(train_index, test_index)`` pairs of positional row indices."""
        # A single increasing integer per calendar month.
        month_number = 12 * X["year"] + X["month"]
        ordered_months = sorted(month_number.unique().tolist())
        month_values = month_number.values

        # After reset_index the row labels are 0..len(X)-1, i.e. positions.
        positional = X.reset_index()

        for held_out_month in ordered_months[-self.n_splits:]:
            earlier_rows = positional[month_values < held_out_month]
            held_out_rows = positional[month_values == held_out_month]
            yield earlier_rows.index, held_out_rows.index

    def get_n_splits(self, X, y, groups=None):
        """Return the configured number of splits."""
        return self.n_splits

In [5]:
class Model:
    """Two LightGBM voting ensembles: one for consumption rows, one for production rows."""

    def __init__(self, n_models=5):
        """Create both ensembles with ``n_models`` differently seeded regressors each."""
        self.model_consumption_parameters = dict(
            n_estimators=300,
            learning_rate=0.03,
            colsample_bytree=0.85,
            colsample_bynode=0.65,
            lambda_l1=1.6,
            lambda_l2=10.0,
            max_depth=12,
            num_leaves=750,
            min_data_in_leaf=25,
            objective="regression_l1",
            # device="gpu",
        )

        self.model_production_parameters = dict(
            n_estimators=300,
            learning_rate=0.04,
            colsample_bytree=0.8,
            colsample_bynode=0.55,
            lambda_l1=1.3,
            lambda_l2=9.5,
            max_depth=12,
            num_leaves=1000,
            min_data_in_leaf=20,
            objective="regression_l1",
            # device="gpu",
        )

        def seeded_ensemble(kind, parameters):
            # One regressor per seed 0..n_models-1, averaged by VotingRegressor.
            members = []
            for seed in range(n_models):
                regressor = lgb.LGBMRegressor(**parameters, random_state=seed)
                members.append((f"{kind}_lgb_{seed}", regressor))
            return VotingRegressor(members)

        self.model_consumption = seeded_ensemble(
            "consumption", self.model_consumption_parameters
        )
        self.model_production = seeded_ensemble(
            "production", self.model_production_parameters
        )

    def fit(self, df_train_features):
        """Fit each ensemble on its own rows; ``target`` is the label column."""
        for flag, ensemble in (
            (1, self.model_consumption),
            (0, self.model_production),
        ):
            selector = df_train_features["is_consumption"] == flag
            ensemble.fit(
                X=df_train_features[selector].drop(columns=["target"]),
                y=df_train_features[selector]["target"],
            )

    def predict(self, df_features):
        """Return one prediction per row, clipped at zero from below."""
        predictions = np.zeros(len(df_features))

        for flag, ensemble in (
            (1, self.model_consumption),
            (0, self.model_production),
        ):
            selector = df_features["is_consumption"] == flag
            raw = ensemble.predict(df_features[selector])
            predictions[selector.values] = raw.clip(0)

        return predictions


In [6]:
class ProductModel(Model):
    """One consumption and one production voting ensemble per product type.

    Hyper-parameters are stored per product type (0..3). ``fit`` trains every
    ensemble on the rows of its own product type, and ``predict`` routes each
    row to the ensemble matching its ``product_type`` and ``is_consumption``.
    """

    def __init__(self, n_models=5):
        """Build the per-product ensembles with ``n_models`` seeded regressors each."""
        super().__init__(n_models=n_models)

        self.model_consumption_product_parameters = {
            0: {
                "n_estimators": 300,
                'learning_rate': 0.06,
                'colsample_bytree': 0.70,
                'colsample_bynode': 0.60,
                'lambda_l1': 0.5,
                'lambda_l2': 10.0,
                'min_data_in_leaf': 125,
                'max_depth': 15,
                'num_leaves': 900,
                "objective": "regression_l1",
                # "device": "gpu"
            },
            1: {
                "n_estimators": 300,
                'learning_rate': 0.084,
                'colsample_bytree': 0.97,
                'colsample_bynode': 0.65,
                'lambda_l1': 0.2,
                'lambda_l2': 8.0,
                'min_data_in_leaf': 50,
                'max_depth': 13,
                'num_leaves': 400,
                "objective": "regression_l1",
                # "device": "gpu"
            },
            2: {
                "n_estimators": 300,
                'learning_rate': 0.03,
                'colsample_bytree': 0.99,
                'colsample_bynode': 0.85,
                'lambda_l1': 4.3,
                'lambda_l2': 2.1,
                'min_data_in_leaf': 45,
                'max_depth': 10,
                'num_leaves': 680,
                "objective": "regression_l1",
                # "device": "gpu"
            },
            3: {
                "n_estimators": 300,
                'learning_rate': 0.05,
                'colsample_bytree': 0.62,
                'colsample_bynode': 0.85,
                'lambda_l1': 5.8,
                'lambda_l2': 0.9,
                'min_data_in_leaf': 40,
                'max_depth': 11,
                'num_leaves': 300,
                "objective": "regression_l1",
                # "device": "gpu"
            },
        }

        self.model_production_product_parameters = {
            0: {
                "n_estimators": 300,
                'learning_rate': 0.08,
                'colsample_bytree': 0.65,
                'colsample_bynode': 0.975,
                'lambda_l1': 1.5,
                'lambda_l2': 10.0,
                'min_data_in_leaf': 100,
                'max_depth': 12,
                'num_leaves': 700,
                "objective": "regression_l1",
                # "device": "gpu"
            },
            1: {
                "n_estimators": 300,
                'learning_rate': 0.08,
                'colsample_bytree': 0.99,
                'colsample_bynode': 0.50,
                'lambda_l1': 1.8,
                'lambda_l2': 4.9,
                'min_data_in_leaf': 40,
                'max_depth': 11,
                'num_leaves': 400,
                "objective": "regression_l1",
                # "device": "gpu"
            },
            2: {
                "n_estimators": 300,
                'learning_rate': 0.05,
                'colsample_bytree': 0.70,
                'colsample_bynode': 0.99,
                'lambda_l1': 3.5,
                'lambda_l2': 4.6,
                'min_data_in_leaf': 100,
                'max_depth': 9,
                'num_leaves': 30,
                "objective": "regression_l1",
                # "device": "gpu"
            },
            3: {
                "n_estimators": 300,
                'learning_rate': 0.07,
                'colsample_bytree': 0.74,
                'colsample_bynode': 0.51,
                'lambda_l1': 7.3,
                'lambda_l2': 9.0,
                'min_data_in_leaf': 50,
                'max_depth': 14,
                'num_leaves': 600,
                "objective": "regression_l1",
                # "device": "gpu"
            },
        }

        # One ensemble per product type; product types are indexed 0 to 3.
        self.all_prod_types = list(range(4))

        def ensembles_per_product(kind, parameters_by_product):
            # Maps product type -> VotingRegressor of n_models seeded LightGBM models.
            return {
                product: VotingRegressor(
                    [
                        (
                            f"{kind}_lgb_{i}",
                            lgb.LGBMRegressor(
                                **parameters_by_product[product], random_state=i
                            ),
                        )
                        for i in range(n_models)
                    ]
                )
                for product in self.all_prod_types
            }

        self.model_consumption_products = ensembles_per_product(
            "consumption", self.model_consumption_product_parameters
        )
        self.model_production_products = ensembles_per_product(
            "production", self.model_production_product_parameters
        )

    def fit(self, df_train_features):
        """Fit every per-product ensemble on the rows of its own product type.

        The global ensembles created by the parent constructor are not fitted here.
        """
        # Fit global models
        # super().fit(df_train_features)

        # Fit product-type models
        for product in self.all_prod_types:
            product_df = df_train_features[df_train_features["product_type"] == product]

            mask = product_df["is_consumption"] == 1
            self.model_consumption_products[product].fit(
                X=product_df[mask].drop(columns=["target"]),
                y=product_df[mask]["target"]
            )

            mask = product_df["is_consumption"] == 0
            self.model_production_products[product].fit(
                X=product_df[mask].drop(columns=["target"]),
                y=product_df[mask]["target"]
            )

    def cross_validate(self, df_features):
        """Print fit time, score time and MAE of each per-product ensemble.

        Uses ``MonthlyKFold(1)``, i.e. a single split. Product types absent from
        ``df_features`` are skipped.
        """
        # Cross validate each product-type model
        prod_type_found = df_features["product_type"].unique().tolist()

        for product in self.all_prod_types:
            if product not in prod_type_found:
                continue

            product_df = df_features[df_features["product_type"] == product]
            print(f"Product Type: {product}")

            for label, flag, ensembles in (
                ("Consumption", 1, self.model_consumption_products),
                ("Production", 0, self.model_production_products),
            ):
                rows = product_df[product_df['is_consumption'] == flag]
                # `cross_validate` here is the module-level scikit-learn function,
                # not this method.
                result = cross_validate(
                    estimator=ensembles[product],
                    X=rows.drop(columns=["target"]),
                    y=rows["target"],
                    scoring="neg_mean_absolute_error",
                    cv=MonthlyKFold(1),
                )

                print(f"Fit Time(s): {result['fit_time'].mean():.3f}")
                print(f"Score Time(s): {result['score_time'].mean():.3f}")
                print(f"{label} Error (MAE): {-result['test_score'].mean():.3f}")
            print("")

    def predict(self, df_features):
        """Return one prediction per row, clipped at zero from below.

        Each row is scored by the ensemble of its own product type and
        direction. Rows whose product type is not in ``all_prod_types`` keep
        the initial value 0.
        """
        predictions = np.zeros(len(df_features))
        prod_types_found = df_features["product_type"].unique().tolist()

        for product in self.all_prod_types:
            # Skip product type if not found in this prediction dataset
            if product not in prod_types_found:
                continue

            is_product = (df_features["product_type"] == product).values

            for flag, ensembles in (
                (1, self.model_consumption_products),
                (0, self.model_production_products),
            ):
                # The mask must include the product type; selecting on
                # is_consumption alone lets every product's model overwrite
                # the predictions of all rows.
                mask = is_product & (df_features["is_consumption"] == flag).values
                if not mask.any():
                    # Nothing to score for this product/direction pair.
                    continue

                predictions[mask] = ensembles[product].predict(
                    df_features[mask]
                ).clip(0)

        return predictions


# Initialisation

In [7]:
# Load every competition table once; the feature generator reads its inputs
# from this shared storage object.
data_storage = DataStorage()
features_generator = FeaturesGenerator(data_storage=data_storage)

# Feature Generation

In [8]:
# Build the training feature table and keep only rows that have a target.
df_train_features = features_generator.generate_features(data_storage.df_data)
df_train_features = df_train_features.loc[df_train_features["target"].notna()]

# One training subset per product type (0..3), used by the tuning objectives.
product_df = dict.fromkeys(range(4))
for product in product_df:
    is_this_product = df_train_features["product_type"] == product
    product_df[product] = df_train_features.loc[is_this_product]

## Hyperparameter tuning

In [9]:
def lgb_p_objective(trial, product=3):
    """Optuna objective: MAE of a LightGBM regressor on production rows.

    Parameters
    ----------
    trial : optuna trial used to sample the hyper-parameters.
    product : key into ``product_df`` selecting the product type to tune;
        defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    Mean absolute error over the ``MonthlyKFold(1)`` split (lower is better).
    """
    params = {
        'n_iter'           : 300,
        'verbose'          : -1,
        'random_state'     : 42,
        'objective'        : 'l1',
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.1),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 4, 128),
        'max_depth'        : trial.suggest_int('max_depth', 5, 15),
        # 'max_bin'          : trial.suggest_int('max_bin', 32, 1024),
        'num_leaves'       : trial.suggest_int('num_leaves', 16, 1024),
    }

    model = lgb.LGBMRegressor(**params)

    # Production rows (is_consumption == 0) of the chosen product type.
    product_features = product_df[product]
    production_rows = product_features[product_features['is_consumption'] == 0]
    X = production_rows.drop(columns=["target"])
    y = production_rows["target"]

    cv = MonthlyKFold(1)
    scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_absolute_error')

    # The scorer returns negated MAE; flip the sign so the study can minimise it.
    return -1 * np.mean(scores)

In [10]:
def lgb_c_objective(trial):
    """Optuna objective for the consumption model (rows with ``is_consumption == 1``).

    Samples one LightGBM hyperparameter set from ``trial``, cross-validates an
    ``LGBMRegressor`` on the product-type-3 slice of ``product_df`` and returns
    the mean absolute error averaged over the folds (lower is better).
    """
    # Settings held constant across all trials.
    fixed_params = {
        'n_iter'       : 300,
        'verbose'      : -1,
        'random_state' : 42,
        'objective'    : 'l1',
    }
    # Search space sampled from the trial.
    tuned_params = {
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.1),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 4, 128),
        'max_depth'        : trial.suggest_int('max_depth', 5, 15),
        # 'max_bin'          : trial.suggest_int('max_bin', 32, 1024),
        'num_leaves'       : trial.suggest_int('num_leaves', 16, 1024),
    }
    regressor = lgb.LGBMRegressor(**fixed_params, **tuned_params)

    # Filter the product-3 frame once and derive both X and y from that slice.
    product_rows = product_df[3]
    consumption_rows = product_rows[product_rows['is_consumption'] == 1]
    features = consumption_rows.drop(columns=["target"])
    target = consumption_rows["target"]

    fold_scores = cross_val_score(
        regressor,
        features,
        target,
        cv=MonthlyKFold(1),
        scoring='neg_mean_absolute_error',
    )

    # The scorer reports negated MAE; flip the sign so the study can minimise it.
    return -np.mean(fold_scores)

In [11]:
# study_c = optuna.create_study(direction='minimize', study_name='Regressor_consumption')
# study_c.optimize(lgb_c_objective, n_trials=20, show_progress_bar=True)

In [12]:
# study_p = optuna.create_study(direction='minimize', study_name='Regressor_production')
# study_p.optimize(lgb_p_objective, n_trials=20, show_progress_bar=True)

# Train Model

In [13]:
# Alternative model variants, currently disabled: a single `Model` and a
# per-segment `SegmentModel`. Only `ProductModel` is trained in this run.
# model = Model()
# model.fit(df_train_features)

# all_segments = df_train_features["segment"].unique().tolist()
# segment_model = SegmentModel(all_segments)
# segment_model.fit(df_train_features)

# Fit the product-level model on the full training feature frame; the fitted
# `product_model` is what the submission loop uses for prediction.
product_model = ProductModel()
product_model.fit(df_train_features)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38240
[LightGBM] [Info] Number of data points in the train set: 72798, number of used features: 162
[LightGBM] [Info] Start training from score 328.540009
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38240
[LightGBM] [Info] Number of data points in the train set: 72798, number of used features: 162
[LightGBM] [Info] Start training from score 328.540009
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098580 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38240
[LightGBM] [Info] Number of data points in the train set: 72798, number of used features: 162
[LightGBM] [Info]

In [14]:
# Run ProductModel's cross-validation routine on the training feature frame.
product_model.cross_validate(df_train_features)

Product Type: 0
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38232
[LightGBM] [Info] Number of data points in the train set: 68334, number of used features: 162
[LightGBM] [Info] Start training from score 328.898987
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38232
[LightGBM] [Info] Number of data points in the train set: 68334, number of used features: 162
[LightGBM] [Info] Start training from score 328.898987
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38232
[LightGBM] [Info] Number of data points in the train set: 68334, number of used features: 162
[

# Submit API

In [None]:
# Competition submission API.
# NOTE(review): `enefit` is presumably only importable inside the competition
# runtime, which would explain why it is imported here and not in the top
# import cell; confirm before moving it.
import enefit

# `env` receives the predictions and `iter_test` yields the test batches
# consumed by the loop below.
env = enefit.make_env()
iter_test = env.iter_test()

In [None]:
for test_batch in iter_test:
    # Each batch carries eight frames; they are unpacked positionally in the
    # order the API yields them.
    (
        df_test,
        df_new_target,
        df_new_client,
        df_new_historical_weather,
        df_new_forecast_weather,
        df_new_electricity_prices,
        df_new_gas_prices,
        df_sample_prediction,
    ) = test_batch

    # Fold the newly delivered data into storage before building features.
    data_storage.update_with_new_data(
        df_new_client=df_new_client,
        df_new_gas_prices=df_new_gas_prices,
        df_new_electricity_prices=df_new_electricity_prices,
        df_new_forecast_weather=df_new_forecast_weather,
        df_new_historical_weather=df_new_historical_weather,
        df_new_target=df_new_target,
    )

    # Preprocess the test rows, build their features, then predict.
    df_test = data_storage.preprocess_test(df_test)
    df_test_features = features_generator.generate_features(df_test)
    batch_predictions = product_model.predict(df_test_features)
    df_sample_prediction["target"] = batch_predictions

    # Hand the filled-in sample frame back to the competition environment.
    env.predict(df_sample_prediction)