In [1]:
import pandas as pd
from pandas import DataFrame
from typing import Literal, List
import numpy as np

In [2]:
cleaned_dataset_address = "../dataset/interim/past_dataset.csv"

In [3]:
# Load the interim dataset. "datetime" is parsed into timestamps, and
# "weather_code" is passed through ``str`` so it is kept as text instead of
# being inferred as a numeric column.
past_knowledge = pd.read_csv(cleaned_dataset_address, parse_dates=["datetime"], converters = {'weather_code': str})

In [4]:
past_knowledge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5939 entries, 0 to 5938
Data columns (total 34 columns):
 #   Column                                         Non-Null Count  Dtype         
---  ------                                         --------------  -----         
 0   general_dam_occupancy_rate                     5939 non-null   float64       
 1   datetime                                       5939 non-null   datetime64[ns]
 2   weather_code                                   5939 non-null   object        
 3   temperature_2m_max                             5939 non-null   float64       
 4   temperature_2m_min                             5939 non-null   float64       
 5   temperature_2m_mean                            5939 non-null   float64       
 6   apparent_temperature_max                       5939 non-null   float64       
 7   apparent_temperature_min                       5939 non-null   float64       
 8   apparent_temperature_mean                      5939 non-nu

In [5]:
def expand_datetime(df: DataFrame, column: str = "datetime") -> DataFrame:
    """Return a copy of ``df`` extended with calendar features of ``column``.

    Added columns, in order: ``year``, ``month``, ``day``, ``hour``,
    ``day_of_year``, ``week_of_year`` (ISO calendar week), ``quarter`` and
    ``is_weekend`` (1 when the weekday number is 5 or 6, otherwise 0).

    Args:
        df: Frame holding a datetime-like column.
        column: Name of that datetime-like column.

    Returns:
        A new frame; ``df`` itself is not modified.
    """

    def _dt(frame: DataFrame):
        # Looked up lazily on the frame that ``assign`` is currently building.
        return frame[column].dt

    # A "season" feature (month % 12 // 3 + 1) is currently disabled.
    return df.assign(
        year=lambda frame: _dt(frame).year,
        month=lambda frame: _dt(frame).month,
        day=lambda frame: _dt(frame).day,
        hour=lambda frame: _dt(frame).hour,
        day_of_year=lambda frame: _dt(frame).dayofyear,
        week_of_year=lambda frame: _dt(frame).isocalendar().week,
        quarter=lambda frame: _dt(frame).quarter,
        is_weekend=lambda frame: (_dt(frame).weekday >= 5).map({True: 1, False: 0}),
    )

In [6]:
def add_rolling_window_features(
    prediction_df: DataFrame,
    past_knowledge: DataFrame,
    window_size: int = 7,
    fillna_with: Literal["ffill", "bfill"] | None = "ffill",
) -> DataFrame:
    past_knowledge["datetime"] = pd.to_datetime(past_knowledge["datetime"])
    prediction_df["datetime"] = pd.to_datetime(prediction_df["datetime"])

    past_knowledge = past_knowledge.sort_values("datetime")
    prediction_df = prediction_df.sort_values("datetime")

    full_date_range = pd.date_range(
        start=past_knowledge["datetime"].min(),
        end=prediction_df["datetime"].max(),
        freq="D",
    )
    full_df = pd.DataFrame({"datetime": full_date_range}).merge(
        past_knowledge, on="datetime", how="left"
    )

    columns_to_use = past_knowledge.select_dtypes(include=["number"]).columns.tolist()

    metrics = ["mean", "std", "min", "max", "median", "var"]

    created_features = []
    for col in columns_to_use:
        for size in range(2, window_size + 1):
            rolling_window_feature = (
                full_df[col]
                .rolling(window=size, min_periods=1)
                .agg(metrics)
                .rename(columns=lambda metric: f"{col}_rw{size}_{metric}")
            )
            created_features.append(rolling_window_feature)

    window_df = pd.concat([full_df["datetime"], *created_features], axis=1)

    prediction_df = prediction_df.merge(
        window_df,
        on="datetime",
        how="left",
    )

    if fillna_with == "ffill":
        prediction_df = prediction_df.ffill()
    elif fillna_with == "bfill":
        prediction_df = prediction_df.bfill()

    return prediction_df


In [7]:
def add_lag_features(
    prediction_df: DataFrame,
    past_knowledge: DataFrame,
    lag_size: int = 30,
    fillna_with: Literal["ffill", "bfill"] | None = "ffill",
):
    past_knowledge["datetime"] = pd.to_datetime(past_knowledge["datetime"])
    prediction_df["datetime"] = pd.to_datetime(prediction_df["datetime"])

    past_knowledge = past_knowledge.sort_values("datetime")
    prediction_df = prediction_df.sort_values("datetime")

    full_date_range = pd.date_range(
        start=past_knowledge["datetime"].min(),
        end=prediction_df["datetime"].max(),
        freq="D",
    )
    full_df = pd.DataFrame({"datetime": full_date_range})
    full_df = full_df.merge(past_knowledge, on="datetime", how="left")

    past_knowledge_columns_to_use = past_knowledge.select_dtypes(
        include="number"
    ).columns.tolist()

    created_features = []
    for col in past_knowledge_columns_to_use:
        for i in range(1, lag_size + 1):
            created_col_name = f"{col}_lag_{i}"
            created_features.append(full_df[col].shift(i).rename(created_col_name))

    lags_df = pd.concat([full_df["datetime"], *created_features], axis=1)

    prediction_df = prediction_df.merge(
        lags_df,
        on="datetime",
        how="left",
    )

    if fillna_with == "ffill":
        prediction_df = prediction_df.ffill()
    elif fillna_with == "bfill":
        prediction_df = prediction_df.bfill()

    return prediction_df


In [8]:
def encode_cyclical(df: pd.DataFrame, cyclical_feature_names: dict[str, int]) -> DataFrame:
    """Return ``df`` with sine/cosine encodings of the given periodic columns.

    For each ``column -> period`` entry, two columns are added:
    ``{column}_sin = sin(2*pi*value/period)`` and
    ``{column}_cos = cos(2*pi*value/period)``.

    Args:
        df: Frame containing every column named in ``cyclical_feature_names``.
        cyclical_feature_names: Mapping from column name to the period used
            to scale that column onto the unit circle.

    Returns:
        A new frame with the encoded columns added; ``df`` is not modified.
    """
    encoded = {}
    for col, max_val in cyclical_feature_names.items():
        angle = 2 * np.pi * df[col] / max_val
        encoded[col + "_sin"] = np.sin(angle)
        encoded[col + "_cos"] = np.cos(angle)
    # ``assign`` works on a copy, so the caller's frame keeps its original columns.
    return df.assign(**encoded)

In [9]:
def drop_columns_with_same_values(df: DataFrame, threshold: float = 0.9) -> DataFrame:
    """Drop columns dominated by a single value.

    A column is removed when its most frequent value (missing values count
    as a value) makes up at least ``threshold`` of the rows.

    Args:
        df: Frame to filter.
        threshold: Minimum share of rows, between 0 and 1, that one value
            must occupy for the column to be dropped.

    Returns:
        A new frame without the near-constant columns. A frame with no rows
        is returned with all its columns, since no value share can be computed.
    """
    to_drop = []
    for col in df.columns:
        # value_counts sorts descending, so the first entry is the top share.
        shares = df[col].value_counts(normalize=True, dropna=False)
        # An empty column has no shares; indexing it used to raise IndexError.
        if not shares.empty and shares.iloc[0] >= threshold:
            to_drop.append(col)
    return df.drop(columns=to_drop)

In [10]:
# Period used for the sine/cosine encoding of each calendar feature.
# NOTE(review): day_of_year can reach 366 and ISO week_of_year can reach 53;
# with periods 365 and 52 those values wrap onto 1 — confirm this is acceptable.
# NOTE(review): is_weekend is 0/1, so with period 2 its sine is ~0 for both
# values — confirm the encoding is wanted for this feature.
# A "season" entry (period 4) is currently disabled.
cyclical_features = dict(
    month=12,
    day=31,
    day_of_year=365,
    week_of_year=52,
    quarter=4,
    is_weekend=2,
)

In [11]:
# Forecast horizon: one row per calendar day from 2021-04-07 to 2021-05-07
# (both inclusive), held in a single "datetime" column.
prediction_df = DataFrame(
    {"datetime": pd.date_range(start="2021-04-07", end="2021-05-07", freq="D")}
)


In [12]:
# Build the full feature matrix for the prediction dates.
# NOTE(review): the result is written back to `prediction_df`, so running this
# cell a second time feeds the already-expanded frame through the pipeline again.
prediction_df = (
    prediction_df
    .sort_values("datetime")
    # Lags 1..30 of every numeric column in past_knowledge.
    .pipe(add_lag_features, past_knowledge=past_knowledge, lag_size=30)
    # Rolling statistics for window lengths 2..30 of every numeric column.
    .pipe(add_rolling_window_features, past_knowledge=past_knowledge, window_size=30)
    # Remove columns where one value covers at least the default 90% of rows.
    .pipe(drop_columns_with_same_values)
    # Calendar features (year, month, day, ...) derived from "datetime".
    .pipe(expand_datetime)
    # Sine/cosine encodings of the calendar features listed in cyclical_features.
    .pipe(encode_cyclical, cyclical_feature_names=cyclical_features)
)

In [13]:
prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Columns: 5307 entries, datetime to is_weekend_cos
dtypes: Float64(2), UInt32(1), datetime64[ns](1), float64(5296), int32(6), int64(1)
memory usage: 1.3 MB


In [14]:
prediction_df_null_counts = prediction_df.isna().sum()
prediction_df_null_counts[prediction_df_null_counts > 0]

Series([], dtype: int64)

In [15]:
prediction_df.iloc[:, : 40].head(5)

Unnamed: 0,datetime,general_dam_occupancy_rate_lag_5,general_dam_occupancy_rate_lag_6,general_dam_occupancy_rate_lag_7,general_dam_occupancy_rate_lag_8,general_dam_occupancy_rate_lag_9,general_dam_occupancy_rate_lag_10,general_dam_occupancy_rate_lag_11,general_dam_occupancy_rate_lag_12,general_dam_occupancy_rate_lag_13,...,temperature_2m_max_lag_8,temperature_2m_max_lag_9,temperature_2m_max_lag_10,temperature_2m_max_lag_11,temperature_2m_max_lag_12,temperature_2m_max_lag_13,temperature_2m_max_lag_14,temperature_2m_max_lag_15,temperature_2m_max_lag_16,temperature_2m_max_lag_17
0,2021-04-07,73.46,73.18,72.83,72.24,71.96,71.56,71.14,70.63,69.83,...,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455,5.3455,7.3955,12.7455
1,2021-04-08,73.66,73.46,73.18,72.83,72.24,71.96,71.56,71.14,70.63,...,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455,5.3455,7.3955
2,2021-04-09,73.76,73.66,73.46,73.18,72.83,72.24,71.96,71.56,71.14,...,11.445499,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455,5.3455
3,2021-04-10,74.7,73.76,73.66,73.46,73.18,72.83,72.24,71.96,71.56,...,17.4955,11.445499,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455,6.7455
4,2021-04-11,75.88,74.7,73.76,73.66,73.46,73.18,72.83,72.24,71.96,...,14.8955,17.4955,11.445499,9.945499,10.195499,9.3455,10.7955,13.1455,10.9955,9.0455


In [16]:
series = pd.Series()

In [21]:
pd.date_range("2021-04-07", "2021-05-07", freq="D")

DatetimeIndex(['2021-04-07', '2021-04-08', '2021-04-09', '2021-04-10',
               '2021-04-11', '2021-04-12', '2021-04-13', '2021-04-14',
               '2021-04-15', '2021-04-16', '2021-04-17', '2021-04-18',
               '2021-04-19', '2021-04-20', '2021-04-21', '2021-04-22',
               '2021-04-23', '2021-04-24', '2021-04-25', '2021-04-26',
               '2021-04-27', '2021-04-28', '2021-04-29', '2021-04-30',
               '2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04',
               '2021-05-05', '2021-05-06', '2021-05-07'],
              dtype='datetime64[ns]', freq='D')

In [39]:
a = past_knowledge.copy()

In [40]:
a = a.set_index("datetime")

In [None]:
# For every timestamp in the index, slice a "train" window before it and a
# "valid" window starting at it.
# NOTE(review): train/valid are overwritten on each pass, so only the slices
# for the last index entry remain after the loop — confirm whether the splits
# were meant to be collected or used inside the loop body.
for date in a.index:
    # Start of the training window: `date` moved back by one MonthBegin offset.
    delta = date - pd.offsets.MonthBegin(1)
    # Training rows: labels from `delta` through the day before `date`.
    train = a.loc[delta:date-pd.offsets.Day(1)]
    # Validation rows: labels from `date` through `date` plus one MonthEnd offset.
    valid = a.loc[date:date+pd.offsets.MonthEnd(1)]

In [43]:
valid

Unnamed: 0_level_0,general_dam_occupancy_rate,weather_code,temperature_2m_max,temperature_2m_min,temperature_2m_mean,apparent_temperature_max,apparent_temperature_min,apparent_temperature_mean,daylight_duration,sunshine_duration,...,climate_change_wind_speed_10m_max,climate_change_relative_humidity_2m_mean,climate_change_relative_humidity_2m_max,climate_change_relative_humidity_2m_min,climate_change_dew_point_2m_mean,climate_change_dew_point_2m_min,climate_change_dew_point_2m_max,climate_change_precipitation_sum,climate_change_pressure_msl_mean,climate_change_et0_fao_evapotranspiration_sum
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-04-06,75.88,53.0,19.0455,8.1455,14.087169,17.12777,3.700723,11.177331,46667.355,42191.03,...,13.493123,74.78501,87.00573,62.43915,4.169358,1.621818,6.34774,0.220384,1029.3667,2.714185
