In [1]:
# import libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import lightgbm as lbg
from tqdm import tqdm
import polars as pl
import json
from abc import ABC, abstractmethod

from sklearn.ensemble import VotingRegressor
import lightgbm as lgb

In [2]:
class Config():
    """Central registry of dataset paths and static lookup tables.

    The class body executes at definition time: it reads the county-id JSON map
    and the weather-station mapping CSV located under ``ROOT``, so defining the
    class raises if those two files are missing.
    """

    # Dataset directory, relative to the notebook's working directory.
    ROOT = os.path.join("predict-energy-behavior-of-prosumers")

    # country id mapping
    COUNTRY_ID_TO_NAME_MAP_DIR = os.path.join(ROOT,"county_id_to_name_map.json")
    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's default locale encoding.
    with open(COUNTRY_ID_TO_NAME_MAP_DIR,'r',encoding = "utf-8") as country_id_to_name:
        COUNTRY_ID_TO_NAME = json.load(country_id_to_name)

    # loading the weather station to country mapping
    WEATHER_STATION_TO_COUNTRY_MAPPING_DIR = os.path.join(ROOT,"weather_station_to_county_mapping.csv")
    # Stations without a county are assigned the placeholder id 12 and the
    # name "UNKNOWN"; the result is converted to polars for the later joins.
    WEATHER_STATION_TO_COUNTRY_MAPPING = pl.from_pandas(
        pd.read_csv(WEATHER_STATION_TO_COUNTRY_MAPPING_DIR).fillna(
            {"county_name":"UNKNOWN","county":12}
        )
    )

    # paths of all the files for the training
    TRAIN_CLIENT = os.path.join(ROOT,"client.csv")
    TRAIN_ELECTRICITY_PRICES = os.path.join(ROOT,"electricity_prices.csv")
    TRAIN_FORCAST_WEATHER = os.path.join(ROOT,"forecast_weather.csv")
    TRAIN_GAS_PRICES = os.path.join(ROOT,"gas_prices.csv")
    TRAIN_HISTORICAL_WEATHER = os.path.join(ROOT,"historical_weather.csv")
    TRAIN_TRAIN = os.path.join(ROOT, "train.csv")
    # Same file as the mapping loaded above. The previous value spelled the
    # file name "..._to_country_mapping.csv", which does not match the file
    # that is actually read.
    TRAIN_WEATHER_STATION_TO_COUNTRY_MAPPING = WEATHER_STATION_TO_COUNTRY_MAPPING_DIR

    # paths of all the files for the testing
    TEST_CLIENT = os.path.join(ROOT,"example_test_files","client.csv")
    TEST_ELECTRICITY_PRICES = os.path.join(ROOT,"example_test_files","electricity_prices.csv")
    TEST_FORCAST_WEATHER = os.path.join(ROOT,"example_test_files","forecast_weather.csv")
    TEST_GAS_PRICES = os.path.join(ROOT,"example_test_files","gas_prices.csv")
    TEST_HISTORICAL_WEATHER = os.path.join(ROOT,"example_test_files","historical_weather.csv")
    TEST_TEST = os.path.join(ROOT,"example_test_files", "test.csv")
    # NOTE(review): file name aligned with the "county" spelling used by the
    # training mapping; confirm that example_test_files ships this file.
    TEST_WEATHER_STATION_TO_COUNTRY_MAPPING = os.path.join(ROOT,"example_test_files","weather_station_to_county_mapping.csv")



In [3]:
# The DataStorage class will be injected into the feature-engineering classes.
# This will use a lot of memory; instead, create the data storage class and add wrappers around it,
# just like adding decorators to a class.

In [4]:
class DataStorage:
    """Eagerly loads every train and example-test CSV referenced by ``Config``.

    Each file becomes a polars DataFrame stored on an attribute named
    ``train_df_*`` or ``test_df_*``.
    """

    @staticmethod
    def _read(path):
        """Read one CSV with polars, asking it to try parsing date-like columns."""
        return pl.read_csv(path,try_parse_dates = True)

    def __init__(self):
        read = self._read

        # training frames
        self.train_df_data = read(Config.TRAIN_TRAIN)
        self.train_df_client = read(Config.TRAIN_CLIENT)
        self.train_df_electricity_prices = read(Config.TRAIN_ELECTRICITY_PRICES)
        self.train_df_forcast_weather = read(Config.TRAIN_FORCAST_WEATHER)
        self.train_df_gas_prices = read(Config.TRAIN_GAS_PRICES)
        self.train_df_historical_weather = read(Config.TRAIN_HISTORICAL_WEATHER)

        # example test frames
        self.test_df_data = read(Config.TEST_TEST)
        self.test_df_client = read(Config.TEST_CLIENT)
        self.test_df_electricity_prices = read(Config.TEST_ELECTRICITY_PRICES)
        self.test_df_forcast_weather = read(Config.TEST_FORCAST_WEATHER)
        self.test_df_gas_prices = read(Config.TEST_GAS_PRICES)
        self.test_df_historical_weather = read(Config.TEST_HISTORICAL_WEATHER)
        

In [5]:
# the base class which is used in the decorator design pattern
class AddFeatures(ABC):
    """Abstract base for one step of the feature pipeline.

    Subclasses implement ``addFeature`` and may call ``super().addFeature(df)``
    to reuse the "an upstream frame is required" check defined here.
    """

    @abstractmethod
    def addFeature(self,df):
        """Validate that an upstream frame was supplied.

        Args:
            df: frame produced by the previous step.

        Raises:
            ValueError: if ``df`` is None, i.e. the step was run first.
        """
        if df is not None:
            return None
        raise ValueError("This features should not be first to be added")

    def cleanUp(self):
        """Optional hook for subclasses; does nothing by default."""
        return None

In [6]:
# add general features 
class AddGeneralFeatures(AddFeatures):
    """First pipeline step: derives calendar columns from ``datetime``.

    The frame given to the constructor is kept untouched, so ``addFeature``
    can be called repeatedly and always starts from the same input.
    """

    def __init__(self,df):
        # Frame that carries the ``datetime`` and ``data_block_id`` columns.
        self.df = df

    def addFeature(self,df = None):
        """Return the stored frame with calendar features added.

        Args:
            df: must be None; this step starts the pipeline and has nothing
                to join onto.

        Returns:
            A new frame with ``hour``, ``day``, ``weekday``, ``month``,
            ``year``, ``dayofyear`` and ``date`` added and ``data_block_id``
            removed.

        Raises:
            ValueError: if an upstream frame is passed in.
        """
        # Fail before doing any work when the step is used out of order.
        if df is not None:
            raise ValueError("First features should be General Features")

        timestamp = pl.col("datetime")
        # Adding basic features such as hours,day, weekday etc
        features = self.df.with_columns(
            timestamp.dt.hour().alias("hour"),
            timestamp.dt.day().alias("day"),
            timestamp.dt.weekday().alias("weekday"),
            timestamp.dt.month().alias("month"),
            timestamp.dt.year().alias("year"),
            timestamp.dt.ordinal_day().alias("dayofyear"),
            timestamp.dt.date().alias("date")
        )

        # drop data_block_id
        return features.drop(["data_block_id"])
          

In [7]:
class AddClientFeatures(AddFeatures):
    """Left-joins the client table onto the running feature frame."""

    def __init__(self,df):
        # Client frame; expected to carry ``data_block_id`` and the join keys.
        self.df = df

    def addFeature(self,df = None):
        """Attach client columns to ``df``.

        Args:
            df: frame produced by the previous step; must not be None.

        Returns:
            ``df`` left-joined with the client frame on ``county``,
            ``is_business``, ``date`` and ``product_type``.

        Raises:
            ValueError: if ``df`` is None.
        """
        super().addFeature(df)

        # Work on a local copy of the reference so the stored client frame
        # keeps ``data_block_id`` and the step can be run again.
        client = self.df.drop(["data_block_id"])

        return df.join(
            client,
            on = ["county","is_business","date","product_type"],
            how = "left",
        )

In [8]:
class AddElectricityPricesFeatures(AddFeatures):
    """Placeholder step for electricity-price features (currently a pass-through)."""

    def __init__(self, df):
        # Kept for later use; not read by ``addFeature`` yet.
        self.df = df

    def addFeature(self, df=None):
        """Check that ``df`` was supplied and hand it back unchanged.

        Raises:
            ValueError: if ``df`` is None.
        """
        super().addFeature(df)
        # TODO: no electricity-price features are joined in yet.
        return df

In [9]:
class AddForecastWeatherFeatures(AddFeatures):
    """Joins county-level averages of the weather forecast onto the feature frame."""

    def __init__(self,df):
        # Forecast frame keyed by ``latitude``/``longitude``.
        self.df = df

    def addFeature(self,df = None):
        """Attach per-county mean forecast values to ``df``.

        The stored forecast frame is not modified, so the step can be run again.

        Args:
            df: frame produced by the previous step; must not be None.

        Returns:
            ``df`` left-joined with the aggregated forecast on ``county`` and
            ``datetime``.

        Raises:
            ValueError: if ``df`` is None.
        """
        super().addFeature(df)

        # adding the county column via the station coordinates
        forecast = self.df.join(
            Config.WEATHER_STATION_TO_COUNTRY_MAPPING,
            on = ["latitude","longitude"],
            how = "left",
        )

        # Dropping the coordinate and bookkeeping columns
        forecast = forecast.drop(["latitude","longitude","data_block_id","county_name","origin_datetime"])

        # Change the name of the forecast_datetime to datetime
        forecast = forecast.rename({"forecast_datetime":"datetime"})

        # Perform aggregation
        # TODO: right now taking the mean per datetime and county
        # TODO: try other techniques such as exponential smoothing and Holt-Winters
        forecast = forecast.group_by(["county","datetime"]).mean()

        # cast county to Int64 before joining on it
        forecast = forecast.with_columns(
            pl.col("county").cast(pl.Int64).alias("county"),
        )

        # Join on county and datetime only (data_block_id was dropped above)
        return df.join(
            forecast,
            on = ["county","datetime"],
            how = "left",
        )

In [10]:
class AddHistoricalWeatherFeatures(AddFeatures):
    """Joins county-level averages of the historical weather onto the feature frame."""

    def __init__(self,df):
        # Historical weather frame keyed by ``latitude``/``longitude``.
        self.df = df

    def addFeature(self,df = None):
        """Attach per-county mean historical weather values to ``df``.

        The stored weather frame is not modified, so the step can be run again.
        Columns whose names already exist in ``df`` receive the suffix
        ``_historical``.

        Args:
            df: frame produced by the previous step; must not be None.

        Returns:
            ``df`` left-joined with the aggregated weather on ``county`` and
            ``datetime``.

        Raises:
            ValueError: if ``df`` is None.
        """
        super().addFeature(df)

        # adding the county column via the station coordinates
        weather = self.df.join(
            Config.WEATHER_STATION_TO_COUNTRY_MAPPING,
            on = ["latitude","longitude"],
            how = "left",
        )

        # Dropping the latitude,longitude and data_block_id columns
        weather = weather.drop(["latitude","longitude","data_block_id","county_name"])

        weather = weather.with_columns(
            pl.col("county").cast(pl.Int64).alias("county"),
        )

        weather = weather.group_by(["county","datetime"]).mean()

        # Join on county and datetime only (data_block_id was dropped above).
        # NOTE(review): this pairs each row with weather observed at the same
        # timestamp; confirm that is available at prediction time.
        return df.join(
            weather,
            on = ["county","datetime"],
            how = "left",
            suffix = "_historical"
        )

In [11]:
class AddGasPricesFeatures(AddFeatures):
    """Placeholder step for gas-price features (currently a pass-through)."""

    def __init__(self, df):
        # Kept for later use; not read by ``addFeature`` yet.
        self.df = df

    def addFeature(self, df=None):
        """Check that ``df`` was supplied and hand it back unchanged.

        Raises:
            ValueError: if ``df`` is None.
        """
        super().addFeature(df)
        # TODO: no gas-price features are joined in yet.
        return df

In [12]:
class AddFeatureManager():
    """Runs the feature steps in a fixed order and returns a pandas frame."""

    def __init__(self,dataStorage : DataStorage):
        # Every step is wired to the *training* frames of the storage object.
        self.addGeneralFeatures = AddGeneralFeatures(dataStorage.train_df_data)
        self.addClientFeatures = AddClientFeatures(dataStorage.train_df_client)
        self.addElectricityPricesFeatures = AddElectricityPricesFeatures(dataStorage.train_df_electricity_prices)
        self.addForcastWeatherFeatures = AddForecastWeatherFeatures(dataStorage.train_df_forcast_weather)
        self.addHistoricalWeatherFeatures = AddHistoricalWeatherFeatures(dataStorage.train_df_historical_weather)
        self.addGasPricesFeatures = AddGasPricesFeatures(dataStorage.train_df_gas_prices)

        # Result of the most recent ``addFeatures`` run (polars frame).
        self.df = None

    def safeDrop(self,column,df):
        """Return the pandas frame ``df`` without ``column``; no-op if it is absent."""
        if column in df.columns:
            df = df.drop(columns = [column])
        return df

    def to_pandas(self):
        """Convert ``self.df`` to pandas for modelling.

        Sets ``row_id`` as the index, casts the identifier-like columns to
        ``category`` and removes ``datetime`` and ``date`` when present.
        """
        categorical_cols = ["county","is_business","product_type","is_consumption"]
        df = self.df.to_pandas()
        df = df.set_index("row_id")
        df[categorical_cols] = df[categorical_cols].astype("category")

        # drop date and datetime column
        df = self.safeDrop("datetime",df)
        df = self.safeDrop("date",df)

        return df

    def addFeatures(self):
        """Run every step and return the result as a pandas frame.

        Each call starts from ``None`` instead of the frame left over from a
        previous call, because the first step rejects any upstream input.
        """
        self.allOperations = [self.addGeneralFeatures,self.addClientFeatures,
                             self.addElectricityPricesFeatures,self.addForcastWeatherFeatures,
                             self.addHistoricalWeatherFeatures,self.addGasPricesFeatures]

        # Accumulate locally so a failing step does not leave a half-built
        # frame in ``self.df``.
        features = None
        for operation in self.allOperations:
            features = operation.addFeature(features)

        self.df = features
        return self.to_pandas()

In [13]:
# Load every train and example-test CSV into memory once.
data_storage = DataStorage()

In [14]:
# Wire the feature steps to the training frames held by data_storage.
add_feature_manager = AddFeatureManager(data_storage)

In [15]:
# Run all feature steps; the result is a pandas frame indexed by row_id.
train_df = add_feature_manager.addFeatures()

In [16]:
# Inspect the column dtypes of the assembled training frame.
train_df.dtypes

county                               category
is_business                          category
product_type                         category
target                                float64
is_consumption                       category
prediction_unit_id                      int64
hour                                     int8
day                                      int8
weekday                                  int8
month                                    int8
year                                    int32
dayofyear                               int16
eic_count                             float64
installed_capacity                    float64
hours_ahead                           float64
temperature                           float64
dewpoint                              float64
cloudcover_high                       float64
cloudcover_low                        float64
cloudcover_mid                        float64
cloudcover_total                      float64
10_metre_u_wind_component         

## Model

In [19]:
class Model:
    """Voting ensemble of ten LightGBM regressors that differ only in their seed."""

    def __init__(self,optuna = False,):
        """Build the (unfitted) ensemble.

        Args:
            optuna: currently unused; accepted so existing callers keep working.
        """
        # Hyper-parameters shared by every ensemble member.
        self.model_parameters = {
            "n_estimators": 2500,
            "learning_rate": 0.06,
            "colsample_bytree": 0.9,
            "colsample_bynode": 0.6,
            "lambda_l1": 3.5,
            "lambda_l2": 1.5,
            "max_depth": 16,
            "num_leaves": 100,
            "min_data_in_leaf": 50,
            "objective": "regression_l1",
            "device": "cpu",
            "verbosity": 1
        }
        # One regressor per random_state in 0..9, combined by VotingRegressor.
        self.model = VotingRegressor(
            [
                (
                    f"consumption_lgb_{i}",
                    lgb.LGBMRegressor(**self.model_parameters, random_state=i,),
                )
                for i in range(10)
            ]
        )

    def fit(self,df):
        """Fit the ensemble on ``df``.

        Args:
            df: pandas frame holding the feature columns plus a ``target``
                column; every column other than ``target`` is used as a feature.
        """
        self.model.fit(
            X = df.drop(columns = ["target"]),
            y = df["target"]
        )

    def predict(self,df):
        """Return the ensemble's predictions for the feature frame ``df``."""
        # The previous version also allocated an unused zero array here.
        return self.model.predict(df)
        


In [None]:
# Build the voting ensemble and train it on the assembled training frame.
model = Model()
model.fit(train_df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042090 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7370
[LightGBM] [Info] Number of data points in the train set: 2018352, number of used features: 40
[LightGBM] [Info] Start training from score 31.101000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7374
[LightGBM] [Info] Number of data points in the train set: 2018352, number of used features: 40
[LightGBM] [Info] Start training from score 31.101000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.046387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not

## Submissions

## Exploratory Data Analysis

In [24]:
# Second name for the raw training frame (same object, no copy is made).
temp = data_storage.train_df_data

In [25]:
# Peek at the first rows of the raw training frame.
temp.head()

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id
i64,i64,i64,f64,i64,datetime[μs],i64,i64,i64
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0
0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0
0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1
0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1
0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2


In [26]:
# Preview the calendar columns derived from `datetime`; `temp` itself is not modified.
temp.with_columns(
    hour=pl.col("datetime").dt.hour(),
    day=pl.col("datetime").dt.day(),
    weekday=pl.col("datetime").dt.weekday(),
    month=pl.col("datetime").dt.month(),
    year=pl.col("datetime").dt.year(),
    dayofyear=pl.col("datetime").dt.ordinal_day(),
)

county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,hour,day,weekday,month,year,dayofyear
i64,i64,i64,f64,i64,datetime[μs],i64,i64,i64,i8,i8,i8,i8,i32,i16
0,0,1,0.713,0,2021-09-01 00:00:00,0,0,0,0,1,3,9,2021,244
0,0,1,96.59,1,2021-09-01 00:00:00,0,1,0,0,1,3,9,2021,244
0,0,2,0.0,0,2021-09-01 00:00:00,0,2,1,0,1,3,9,2021,244
0,0,2,17.314,1,2021-09-01 00:00:00,0,3,1,0,1,3,9,2021,244
0,0,3,2.904,0,2021-09-01 00:00:00,0,4,2,0,1,3,9,2021,244
0,0,3,656.859,1,2021-09-01 00:00:00,0,5,2,0,1,3,9,2021,244
0,1,0,0.0,0,2021-09-01 00:00:00,0,6,3,0,1,3,9,2021,244
0,1,0,59.0,1,2021-09-01 00:00:00,0,7,3,0,1,3,9,2021,244
0,1,1,0.0,0,2021-09-01 00:00:00,0,8,4,0,1,3,9,2021,244
0,1,1,501.76,1,2021-09-01 00:00:00,0,9,4,0,1,3,9,2021,244
