In [3]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from typing import Literal, List
from sklearn.metrics import mean_absolute_error

In [1]:
# Relative path of the CSV that is loaded into `past_knowledge` below.
cleaned_dataset_address = "../dataset/interim/past_dataset.csv"

In [4]:
# Load the historical observations; the "datetime" column is parsed to timestamps on read.
past_knowledge = pd.read_csv(cleaned_dataset_address, parse_dates=["datetime"])

In [5]:
# Keyword arguments forwarded to FeatureExtractor(**parameters).
# `cyclical_feature_names` maps a calendar column to the period used when
# building its Fourier terms.
parameters = dict(
    past_knowledge=past_knowledge,
    cyclical_feature_names=dict(
        month=12,
        day=31,
        day_of_year=365,
        week_of_year=52,
        quarter=4,
        # season=4,
        is_weekend=2,
    ),
    lag_size=30,
    window_size=30,
)

In [6]:
# Single-column frame holding every timestamp present in the historical data.
known_dates = DataFrame(past_knowledge["datetime"])

In [7]:
# Name of the column used as the prediction target throughout the notebook.
TARGET = "general_dam_occupancy_rate"

In [8]:
class FeatureExtractor:
    def __init__(
        self,
        past_knowledge: DataFrame,
        cyclical_feature_names: List[str],
        lag_size: int = 30,
        window_size: int = 30,
    ):
        self.PAST_KNOWLEDGE = past_knowledge.sort_values(by="datetime")
        self.cyclical_feature_names = cyclical_feature_names
        self.lag_size = lag_size
        self.window_size = window_size

    def transform(self, df: DataFrame) -> DataFrame:
        return (
            df.sort_values("datetime")
            .pipe(self._add_lag_features)
            .pipe(self._add_rolling_window_features)
            .pipe(self._add_exponential_moving_features)
            .pipe(self._drop_columns_with_same_values)
            .pipe(self._expand_datetime)
            .pipe(self._add_fourier_features)
            .pipe(
                lambda df: df.astype(
                    {col: "float32" for col in df.select_dtypes("number").columns}
                )
            )
            .bfill()
        )

    def _add_lag_features(
        self,
        df: DataFrame,
        fillna_with: Literal["ffill", "bfill"] | None = "bfill",
    ) -> DataFrame:
        df["datetime"] = pd.to_datetime(df["datetime"])

        df = df.sort_values("datetime")

        full_date_range = pd.date_range(
            start=self.PAST_KNOWLEDGE["datetime"].min(),
            end=df["datetime"].max(),
            freq="D",
        )
        full_df = pd.DataFrame({"datetime": full_date_range})
        full_df = full_df.merge(self.PAST_KNOWLEDGE, on="datetime", how="left")

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(
            include="number"
        ).columns.tolist()

        created_features = []
        for col in columns_to_use:
            for i in range(1, self.lag_size + 1):
                created_col_name = f"{col}_lag_{i}"
                created_features.append(full_df[col].shift(i).rename(created_col_name))

        lags_df = pd.concat([full_df["datetime"], *created_features], axis=1)

        df = df.merge(
            lags_df,
            on="datetime",
            how="left",
        )

        if fillna_with == "ffill":
            df = df.ffill()
        elif fillna_with == "bfill":
            df = df.bfill()

        return df

    def _add_rolling_window_features(
        self,
        df: DataFrame,
        fillna_with: Literal["ffill", "bfill"] | None = "ffill",
    ) -> DataFrame:
        df["datetime"] = pd.to_datetime(df["datetime"])

        df = df.sort_values("datetime")

        full_date_range = pd.date_range(
            start=self.PAST_KNOWLEDGE["datetime"].min(),
            end=df["datetime"].max(),
            freq="D",
        )
        full_df = pd.DataFrame({"datetime": full_date_range}).merge(
            self.PAST_KNOWLEDGE, on="datetime", how="left"
        )

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(
            include=["number"]
        ).columns.tolist()

        metrics = ["mean", "std", "min", "max", "median", "var"]

        created_features = []
        for col in columns_to_use:
            for size in range(2, self.window_size + 1):
                rolling_window_feature = (
                    full_df[col]
                    .rolling(window=size, min_periods=1)
                    .agg(metrics)
                    .rename(columns=lambda metric: f"{col}_rw{size}_{metric}")
                )
                created_features.append(rolling_window_feature)

        window_df = pd.concat([full_df["datetime"], *created_features], axis=1)

        df = df.merge(
            window_df,
            on="datetime",
            how="left",
        )

        if fillna_with == "ffill":
            df = df.ffill()
        elif fillna_with == "bfill":
            df = df.bfill()

        return df

    def _drop_columns_with_same_values(self, df: DataFrame, threshold=0.9) -> DataFrame:
        to_drop = [
            col
            for col in df.columns
            if df[col].value_counts(normalize=True, dropna=False).values[0] >= threshold
        ]
        return df.drop(columns=to_drop)

    def _add_exponential_moving_features(
        self, df: pd.DataFrame, up_to: int = 30
    ) -> pd.DataFrame:
        df["datetime"] = pd.to_datetime(df["datetime"])

        df = df.sort_values("datetime")

        full_date_range = pd.date_range(
            start=self.PAST_KNOWLEDGE["datetime"].min(),
            end=df["datetime"].max(),
            freq="D",
        )
        full_df = pd.DataFrame({"datetime": full_date_range}).merge(
            self.PAST_KNOWLEDGE, on="datetime", how="left"
        )

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(
            include=["number"]
        ).columns.tolist()

        columns_to_use = self.PAST_KNOWLEDGE.select_dtypes(include="number").columns
        metrics = ["mean"]
        created_features = []

        for col in columns_to_use:
            for span in range(2, up_to + 1):
                feature = (
                    full_df[col]
                    .ewm(span=span, adjust=False)
                    .agg(metrics)
                    .rename(columns=lambda metric: f"{col}_em_{span}_{metric}")
                )
                created_features.append(feature)

        pd.concat([df, *created_features], axis=1)

        exponential_moving_df = pd.concat(
            [full_df["datetime"], *created_features], axis=1
        )

        df = df.merge(
            exponential_moving_df,
            on="datetime",
            how="left",
        )
        return df

    def _expand_datetime(self, df: DataFrame, column: str = "datetime") -> DataFrame:
        return df.assign(
            **{
                "year": lambda a_df: a_df[column].dt.year,
                "month": lambda a_df: a_df[column].dt.month,
                "day": lambda a_df: a_df[column].dt.day,
                "hour": lambda a_df: a_df[column].dt.hour,
                "day_of_year": lambda a_df: a_df[column].dt.dayofyear,
                "week_of_year": lambda a_df: a_df[column].dt.isocalendar().week,
                "quarter": lambda a_df: a_df[column].dt.quarter,
                # "season": lambda a_df: a_df[column].dt.month % 12 // 3 + 1,
                "is_weekend": lambda a_df: (a_df[column].dt.weekday >= 5).map(
                    {True: 1, False: 0}
                ),
            }
        )

    def _add_fourier_features(self, df: pd.DataFrame, num_terms: int = 7) -> DataFrame:
        for col, max_val in self.cyclical_feature_names.items():
            source = self._get_column_source(df, col)

            for i in range(1, num_terms + 1):
                operation = 2 * np.pi * i * source[col] / max_val

                df[f"fourier_sin_{col}_{i}"] = np.sin(operation)
                df[f"fourier_cos_{col}_{i}"] = np.cos(operation)

        return df

    def _get_column_source(self, df: DataFrame, col: str) -> List[str]:
        if col in df.columns:
            source = df
        elif col in self.PAST_KNOWLEDGE.columns:
            source = self.PAST_KNOWLEDGE
        else:
            raise KeyError(f"{col} not found both in df and past knowledge.")
        return source


In [9]:
# Instantiate the extractor with the configuration dictionary defined above.
feature_extractor = FeatureExtractor(**parameters)

In [10]:
# Target values keyed by date: left-join the history onto the known dates and
# keep only the timestamp and the target column.
_target_columns = ["datetime", TARGET]
y_values = known_dates.merge(past_knowledge, on="datetime", how="left")[_target_columns]
del _target_columns

In [11]:
# Run the feature pipeline on the known dates, then keep only the "datetime" column.
# NOTE(review): the trailing `.loc[:, ["datetime"]]` discards every generated
# feature, so the X_* frames built below end up with no feature columns.
# Confirm this is intended for the dummy-baseline notebook.
X_values = feature_extractor.transform(known_dates).loc[:, ["datetime"]]

In [12]:
# Chronological split sizes: 70 % train, 15 % validation; the remainder is test.
train_size, val_size = (
    int(len(X_values) * fraction) for fraction in (0.7, 0.15)
)

In [13]:
# Slice the date-ordered rows into consecutive train / validation / test blocks
# and attach the target to each block by date.
train_df, val_df, test_df = (
    X_values.iloc[rows].merge(y_values, on="datetime", how="inner")
    for rows in (
        slice(None, train_size),
        slice(train_size, train_size + val_size),
        slice(train_size + val_size, None),
    )
)


In [14]:
# Separate features from the target for each split. The target column is
# referenced through the TARGET constant instead of repeating its name.
X_train, y_train = (
    train_df.drop(columns=["datetime", TARGET]),
    train_df[TARGET],
)

X_val, y_val = (
    val_df.drop(columns=["datetime", TARGET]),
    val_df[TARGET],
)

X_test, y_test = (
    test_df.drop(columns=["datetime", TARGET]),
    test_df[TARGET],
)


In [15]:
def dummy_forecaster(y, reference=None):
    """Constant baseline: predict one mean value for every element of ``y``.

    Args:
        y: Series whose length determines how many predictions are returned.
        reference: Optional series the constant is taken from (for example the
            training targets). When omitted, the mean of ``y`` itself is used,
            which matches the original single-argument behaviour.

    Returns:
        A NumPy array of ``len(y)`` copies of the chosen mean.
    """
    fitted_on = y if reference is None else reference
    return np.full(len(y), fitted_on.mean())

In [16]:
# MAE of the constant mean-of-y_train prediction, scored against y_train.
mean_absolute_error(y_train, dummy_forecaster(y_train))

21.46453160118681

In [17]:
# MAE of a constant prediction scored against y_val.
# NOTE(review): the constant is the mean of y_val itself, not of y_train —
# confirm this is the intended baseline.
mean_absolute_error(y_val, dummy_forecaster(y_val))

14.062895669528846

In [18]:
# MAE of a constant prediction scored against y_test.
# NOTE(review): the constant is the mean of y_test itself, not of y_train —
# confirm this is the intended baseline.
mean_absolute_error(y_test, dummy_forecaster(y_test))

16.998338269300987