In [None]:
#| default_exp features

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.basics import patch
from pathlib import Path
from math import factorial

In [None]:
#| export
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

First, we load the example data (`hydro_example.csv`) into a DataFrame indexed by time.

In [None]:
# Directory (relative path) that holds the example datasets
DATA_PATH = Path("../testing_data")

# Read only the columns used below; `time` is converted with pd.to_datetime
# and used as the index of the resulting frame.
data = pd.read_csv(
    DATA_PATH / 'hydro_example.csv',
    index_col='time',
    usecols=['time', 'smoothed_rain', 'Q_mgb', 'Q_obs'],
    converters={"time": pd.to_datetime},
)

## Feature generator
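> Feature generator that turns a time series into model inputs: a sliding window of the previous `context_len` timesteps (optionally expanded with polynomial features) paired with targets for the next `target_len` timesteps, so that a model can learn temporal patterns and non-linear relationships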

In [None]:
#| export
class FeatureAndTargetGenerator:
    """
    Transforms time series data into feature matrices suitable for machine learning models.
    Creates lagged features using a sliding window and optionally generates polynomial features
    to capture non-linear relationships between variables. It also creates a target matrix with
    one column per timestep to predict.
    """
    def __init__(self, 
                 context_len: int = 10, # number of previous timesteps to use as features
                 target_len: int = 10, # number of timesteps to predict
                 poly_degree: int = 1 # degree of polynomial features
                 ):
        # Fail early on window sizes that cannot describe a sliding window;
        # otherwise the slicing in `_generate_sliding_window_data` misbehaves.
        if context_len < 1:
            raise ValueError(f"context_len must be at least 1, got {context_len}")
        if target_len < 0:
            raise ValueError(f"target_len must be non-negative, got {target_len}")
        self.context_len = context_len
        self.target_len = target_len
        self.poly_features = PolynomialFeatures(degree=poly_degree)
        
    def generate(self, df: pd.DataFrame, x_col: list[str], y_col: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Generates a feature matrix and target matrix from the input data.

        The columns `x_col` of `df` are used as inputs and the columns `y_col` as targets.
        When the polynomial degree is greater than 1 the inputs are first expanded with
        `PolynomialFeatures.fit_transform`; the expanded frame keeps the original index but
        its columns are relabelled with default integer labels. Both frames are then windowed
        by `_generate_sliding_window_data`.
        """
        X, y = df[x_col], df[y_col]
        if self.poly_features.degree > 1:
            X = pd.DataFrame(self.poly_features.fit_transform(X), index=X.index)
        return self._generate_sliding_window_data(X, y)


    def _generate_sliding_window_data(self, X: pd.DataFrame, y: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Creates a feature matrix by combining multiple input variables and their lagged values.

        Sample `i` flattens rows `i .. i+context_len-1` of `X` (row-major: all variables of the
        oldest timestep first) into one feature vector, and rows
        `i+context_len .. i+context_len+target_len-1` of `y` into one target vector. Both
        samples are labelled with the index of the last context row, so the target columns
        `t+1 .. t+target_len` are relative to that label.

        Returns a `(features, targets)` pair with `len(X) - context_len - target_len` rows, or
        two empty frames when the series is too short to hold a single window.
        """
        # Clamp at zero: with a negative count the index slice below would have a
        # negative end, wrap around from the end of the index, and label rows for
        # which no data was generated.
        # NOTE(review): one more complete window (starting at
        # len(X) - context_len - target_len) would fit but is not emitted; the
        # count is kept as-is because the notebook's sample-count test relies on it.
        n_samples = max(0, len(X) - self.context_len - self.target_len)

        # Convert to arrays once instead of slicing the frames on every iteration.
        x_values = X.to_numpy()
        y_values = y.to_numpy()

        features = []
        targets = []
        for start in range(n_samples):
            split = start + self.context_len  # first row that belongs to the target
            features.append(x_values[start:split].reshape(-1))
            targets.append(y_values[split:split + self.target_len].reshape(-1))

        # Label of sample i is the last row of its context window.
        first_label = self.context_len - 1
        last_label = first_label + n_samples

        features = pd.DataFrame(
            index=X.index[first_label:last_label],
            data=features)
        targets = pd.DataFrame(
            index=y.index[first_label:last_label],
            data=targets,
            columns=[f"t+{i+1}" for i in range(0, self.target_len)])
        return features, targets
    


First, create an instance of the generator, setting the context length, the target length, and the polynomial degree.

In [None]:
# One previous timestep as context, two timesteps ahead as targets, quadratic features
generator = FeatureAndTargetGenerator(
    context_len=1,
    target_len=2,
    poly_degree=2,
)

Then we can generate the feature and target matrices as follows:

In [None]:
show_doc(FeatureAndTargetGenerator.generate)

In [None]:
# Input variables and the variable to predict
x_col = ['smoothed_rain', 'Q_mgb']
y_col = ['Q_obs']

data_x, data_y = generator.generate(df=data, x_col=x_col, y_col=y_col)

The generated data will look as follows

In [None]:
data_x.head(3)

In [None]:
data_y.head(3)

In [None]:
#| test
# Fixture shared by the checks below: 3 context steps, 2 target steps, no polynomial expansion
gen_test = FeatureAndTargetGenerator(
    context_len=3,
    target_len=2,
    poly_degree=1,
)
test_x, test_y = gen_test.generate(df=data, x_col=x_col, y_col=y_col)

# Test the shape and the index matching for multistep features and prediction
def test_target_shape(gen_test, test_y):
    assert test_y.shape[1] == gen_test.target_len, f"Expected {gen_test.target_len} target columns but got {test_y.shape[1]}"
    assert all(f't+{i+1}' in test_y.columns for i in range(gen_test.target_len)), "Target columns not properly named"

def test_index_matching(test_x, test_y):
    assert test_x.index.equals(test_y.index), "Feature and target indices do not match"
    assert test_x.shape[0] == test_y.shape[0], f"Feature and target row counts don't match: {test_x.shape[0]} vs {test_y.shape[0]}"

def test_sample_count(gen_test, test_x, data):
    if not test_x.shape[0] == data.shape[0] - gen_test.context_len - gen_test.target_len:
        raise ValueError(f"Expected {data.shape[0] - gen_test.context_len - gen_test.target_len} samples but got {test_x.shape[0]}")

def test_values(test_x, test_y, data, gen_test, x_col, y_col):
    if not all(test_x.iloc[5].values == data.iloc[5:5+gen_test.context_len][x_col].values.reshape(-1)):
        raise ValueError("Feature values do not match expected values from source data")
    if not all(test_y.iloc[2].values == data.iloc[2+gen_test.context_len:2+gen_test.context_len+gen_test.target_len][y_col].values.reshape(-1)):
        raise ValueError("Target values do not match expected values from source data")

# Run every check against the fixture built at the top of this cell
test_target_shape(gen_test=gen_test, test_y=test_y)
test_index_matching(test_x=test_x, test_y=test_y)
test_sample_count(gen_test=gen_test, test_x=test_x, data=data)
test_values(test_x=test_x, test_y=test_y, data=data, gen_test=gen_test, x_col=x_col, y_col=y_col)

In [None]:
#| test
def test_polynomial_features(data, x_col, y_col):
    """
    Check that a degree-3 polynomial expansion yields the expected number of feature columns.

    Each of the `context_len` timesteps in a window contributes one column per polynomial
    term, so the feature matrix must have `context_len * poly_terms` columns.
    Raises ValueError when the count differs.
    """
    def poly_terms(n_vars, deg):
        # Number of monomials of total degree <= deg in n_vars variables, constant
        # term included: C(n_vars + deg, deg). This matches PolynomialFeatures with
        # its default settings. Integer division is exact here and keeps the count
        # an int (true division produced a float such as 40.0).
        return factorial(n_vars + deg) // (factorial(n_vars) * factorial(deg))
    
    gen_test = FeatureAndTargetGenerator(context_len=2, target_len=2, poly_degree=3)
    test_x, _ = gen_test.generate(data, x_col=x_col, y_col=y_col)
    expected_cols = gen_test.context_len * poly_terms(len(x_col), gen_test.poly_features.degree)
    if test_x.shape[1] != expected_cols:
        raise ValueError(f"Expected {expected_cols} polynomial feature columns but got {test_x.shape[1]}")
    
# Execute the polynomial-feature column-count check on the example data
test_polynomial_features(data=data, x_col=x_col, y_col=y_col)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()