# Imports

In [1]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt

from dstoolkit.feature_engine import GroupedLagTimeFeatureCreator

## Loading Dataset

In [2]:
# Load the global temperature time series from the local CSV copy.
# Data source: https://datahub.io/core/global-temp
df = pd.read_csv("../data/global_temperature_time_series.csv")

In [3]:
# Preview the first rows of the raw data (columns: Source, Year, Mean).
df.head()

Unnamed: 0,Source,Year,Mean
0,gcag,1850-01,-0.6746
1,gcag,1850-02,-0.3334
2,gcag,1850-03,-0.5913
3,gcag,1850-04,-0.5887
4,gcag,1850-05,-0.5088


In [4]:
# Rename the raw columns to the names used in the rest of the notebook.
df = df.rename({'Source': 'source', 'Year': 'time', 'Mean': 'temperature'}, axis=1)
# Parse the 'YYYY-MM' strings into datetimes so rows can be ordered chronologically.
df['time'] = pd.to_datetime(df['time'])

## Class GroupedLagTimeFeatureCreator

In [5]:
from typing import List, Union
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew
import warnings
from itertools import combinations


class GroupedLagTimeFeatureCreator:
    """Create lag-based features for a target column, computed separately per group.

    For every group the following columns are added (``<t>`` is the target name):

    * ``<t>_lag_<i>`` for ``i = 1 .. max(windows)``: the target shifted by ``i`` rows.
    * ``<t>_<func>_<win>_lags`` for every function/window pair: ``func`` applied
      over a rolling window of ``win`` rows of ``<t>_lag_1`` (at least two
      non-missing values are required, otherwise the result is NaN).
    * ``<t>_div_lag_<i>_vs_<j>`` and ``<t>_diff_lag_<i>_vs_<j>`` for every lag
      pair ``i < j``: ratio and difference of the two lags (optional).

    Because every feature is derived from shifted values, a row's features never
    include the row's own target value.

    Args:
        windows: Rolling window sizes. Defaults to ``[2, 3, 4]``. Any window of
            size 1 is dropped with a ``UserWarning``.
        functions: Names of the rolling statistics to compute. Supported names:
            ``mean``, ``median``, ``max``, ``min``, ``sum``, ``std``, ``kurt``,
            ``skew`` and ``slope``. Defaults to ``["mean", "median", "max", "min"]``.
        add_div: Whether to add the ratio between every pair of lags.
        add_diff: Whether to add the difference between every pair of lags.
    """

    def __init__(
            self,
            windows: Union[List[int], None] = None,
            functions: Union[List[str], None] = None,
            add_div: bool = True,
            add_diff: bool = True
        ):
        # Always work on private copies: a list default would be shared between
        # instances, and the caller's list must not be modified when window 1
        # is dropped below.
        self.windows = [2, 3, 4] if windows is None else list(windows)
        self.functions = (
            ["mean", "median", "max", "min"] if functions is None else list(functions)
        )
        self.add_div = add_div
        self.add_diff = add_diff

        # Every callable receives a 1-D NumPy array (rolling.apply with raw=True).
        self._function_map = {
            'mean': np.mean,
            'median': np.median,
            'max': np.max,
            'min': np.min,
            'sum': np.sum,
            'std': lambda x: np.std(x, ddof=1),
            'kurt': lambda x: kurtosis(x, bias=False),
            'skew': lambda x: skew(x, bias=False),
            'slope': self._calc_slope
        }

        if 1 in self.windows:
            warnings.warn(
                "Window size 1 was found. Windows with size equal to 1 are not recommended for lag/rolling calculations.",
                UserWarning,
                stacklevel=2
            )
            # Drop every occurrence, not only the first one.
            self.windows = [win for win in self.windows if win != 1]

    def _calc_slope(self, x: np.ndarray) -> float:
        """Return the slope of the least-squares line fitted to ``x`` against 0..len(x)-1."""
        return np.polyfit(np.arange(len(x)), x, 1)[0]

    def _create_group_features(self, group_df: pd.DataFrame, target: str) -> pd.DataFrame:
        """Return a copy of ``group_df`` with the lag features of ``target`` appended.

        ``group_df`` is expected to hold the rows of a single group, already in
        chronological order.
        """
        series = group_df[target]

        max_lag = max(self.windows)
        lagged_features = {f'{target}_lag_{i}': series.shift(i) for i in range(1, max_lag + 1)}

        # Rolling statistics are computed over lag 1, so the window ending at a
        # row covers the `win` previous target values and excludes the row's own.
        lag_1 = lagged_features[f'{target}_lag_1']
        for func in self.functions:
            func_operation = self._function_map[func]
            for win in self.windows:
                result = lag_1.rolling(window=win, min_periods=2).apply(func_operation, raw=True)
                lagged_features[f'{target}_{func}_{win}_lags'] = result

        # Ratio and difference between every pair of lags (i < j).
        if self.add_div or self.add_diff:
            for i, j in combinations(range(1, max_lag + 1), 2):
                lag_i = lagged_features[f'{target}_lag_{i}']
                lag_j = lagged_features[f'{target}_lag_{j}']

                if self.add_div:
                    # A zero denominator becomes NaN so the ratio is NaN instead of +/-inf.
                    lagged_features[f'{target}_div_lag_{i}_vs_{j}'] = lag_i / lag_j.replace({0: np.nan})
                if self.add_diff:
                    lagged_features[f'{target}_diff_lag_{i}_vs_{j}'] = lag_i - lag_j

        return group_df.assign(**lagged_features)

    def create(self, df: pd.DataFrame, group_cols: Union[str, List[str]], target: str, time: str) -> pd.DataFrame:
        """
        Create lag features computed independently inside each group.

        The input frame is not modified. Rows are sorted by ``group_cols`` and
        then ``time`` before the features are computed, so the input does not
        need to be pre-sorted.

        Args:
            df: Input DataFrame.
            group_cols: Grouping column name or list of names
                (e.g. a customer id or a product id).
            target: Column used as the base for the lags.
            time: Column that defines the chronological order inside each group.

        Returns:
            A new DataFrame sorted by ``group_cols`` and ``time``, with a fresh
            ``RangeIndex`` and the lag feature columns appended.

        Raises:
            ValueError: If ``target`` is not a column of ``df`` or if no window
                size is left to compute features with.
        """
        if isinstance(group_cols, str):
            group_cols = [group_cols]
        else:
            # Accept any sequence (e.g. a tuple) and avoid aliasing the caller's list.
            group_cols = list(group_cols)

        if target not in df.columns:
            raise ValueError(f"Coluna '{target}' não encontrada no DataFrame")

        if not self.windows:
            raise ValueError("At least one window size greater than 1 is required.")

        ordered = df.sort_values(by=group_cols + [time])

        # Iterate over the groups explicitly instead of using groupby.apply:
        # apply's habit of passing the grouping columns to the function is
        # deprecated, and without them the result would lose those columns.
        # A single key is passed as a scalar so group keys stay scalars on every
        # pandas version; sort=False keeps the order established by sort_values.
        by = group_cols[0] if len(group_cols) == 1 else group_cols
        pieces = [
            self._create_group_features(group, target)
            for _, group in ordered.groupby(by, sort=False)
        ]

        if not pieces:
            # No rows to group: still return the full set of feature columns.
            pieces = [self._create_group_features(ordered.iloc[:0], target)]

        return pd.concat(pieces).reset_index(drop=True)

In [6]:
# Scratch check: concatenating two lists, the same operation create() uses to
# build its sort key (`group_cols + [time]`).
['ui'] + ['time']

['ui', 'time']

In [7]:
# Shuffle all rows (no seed, so the order differs between runs). create() sorts
# by group and time itself, so the features below should not depend on this order.
df = df.sample(frac=1)
df.head()

Unnamed: 0,source,time,temperature
180,gcag,1865-01-01,-0.1761
2692,GISTEMP,1977-03-01,0.24
1688,GISTEMP,1935-05-01,-0.28
1522,GISTEMP,1928-06-01,-0.39
2660,GISTEMP,1975-11-01,-0.17


In [8]:
# Build lag features of 'temperature' per 'source' for windows 2..6, using the
# rolling statistics listed in `functions`. Ratio and difference features
# between lag pairs are left at their defaults (both enabled).
s = GroupedLagTimeFeatureCreator(windows=[2, 3, 4, 5, 6], functions=['mean', 'median', 'max', 'min', 'sum', 'std', 'slope'])
df = s.create(df, time='time', group_cols=['source'], target='temperature')

  .apply(lambda group: self._create_group_features(group, target))


In [9]:
# List the distinct sources; each one is processed as a separate group by create().
df['source'].unique()

array(['GISTEMP', 'gcag'], dtype=object)

In [10]:
# Inspect the last 10 rows of the 'gcag' group to eyeball the generated features.
df.loc[df['source'] == 'gcag', :].tail(10)

Unnamed: 0,source,time,temperature,temperature_lag_1,temperature_lag_2,temperature_lag_3,temperature_lag_4,temperature_lag_5,temperature_lag_6,temperature_mean_2_lags,...,temperature_div_lag_3_vs_5,temperature_diff_lag_3_vs_5,temperature_div_lag_3_vs_6,temperature_diff_lag_3_vs_6,temperature_div_lag_4_vs_5,temperature_diff_lag_4_vs_5,temperature_div_lag_4_vs_6,temperature_diff_lag_4_vs_6,temperature_div_lag_5_vs_6,temperature_diff_lag_5_vs_6
3813,gcag,2023-10-01,1.2866,1.3522,1.1993,1.15,1.0518,0.8714,0.9275,1.27575,...,1.319715,0.2786,1.239892,0.2225,1.207023,0.1804,1.134016,0.1243,0.939515,-0.0561
3814,gcag,2023-11-01,1.3338,1.2866,1.3522,1.1993,1.15,1.0518,0.8714,1.3194,...,1.140236,0.1475,1.376291,0.3279,1.093364,0.0982,1.319715,0.2786,1.207023,0.1804
3815,gcag,2023-12-01,1.2586,1.3338,1.2866,1.3522,1.1993,1.15,1.0518,1.3102,...,1.175826,0.2022,1.285606,0.3004,1.04287,0.0493,1.140236,0.1475,1.093364,0.0982
3816,gcag,2024-01-01,1.1516,1.2586,1.3338,1.2866,1.3522,1.1993,1.15,1.2962,...,1.072792,0.0873,1.118783,0.1366,1.127491,0.1529,1.175826,0.2022,1.04287,0.0493
3817,gcag,2024-02-01,1.2902,1.1516,1.2586,1.3338,1.2866,1.3522,1.1993,1.2051,...,0.986393,-0.0184,1.112149,0.1345,0.951486,-0.0656,1.072792,0.0873,1.127491,0.1529
3818,gcag,2024-03-01,1.2515,1.2902,1.1516,1.2586,1.3338,1.2866,1.3522,1.2209,...,0.978237,-0.028,0.930779,-0.0936,1.036686,0.0472,0.986393,-0.0184,0.951486,-0.0656
3819,gcag,2024-04-01,1.2053,1.2515,1.2902,1.1516,1.2586,1.3338,1.2866,1.27085,...,0.863398,-0.1822,0.895072,-0.135,0.94362,-0.0752,0.978237,-0.028,1.036686,0.0472
3820,gcag,2024-05-01,1.0745,1.2053,1.2515,1.2902,1.1516,1.2586,1.3338,1.2284,...,1.025107,0.0316,0.967311,-0.0436,0.914985,-0.107,0.863398,-0.1822,0.94362,-0.0752
3821,gcag,2024-06-01,1.1154,1.0745,1.2053,1.2515,1.2902,1.1516,1.2586,1.1399,...,1.086749,0.0999,0.994359,-0.0071,1.120354,0.1386,1.025107,0.0316,0.914985,-0.107
3822,gcag,2024-07-01,1.1398,1.1154,1.0745,1.2053,1.2515,1.2902,1.1516,1.09495,...,0.934196,-0.0849,1.046631,0.0537,0.970005,-0.0387,1.086749,0.0999,1.120354,0.1386


In [12]:
# Manual check of temperature_div_lag_3_vs_5 for the last gcag row (2024-07-01):
# lag_3 / lag_5 = 1.2053 / 1.2902, which should match the 0.934196 shown above.
1.2053 / 1.2902

0.9341962486436212

In [13]:
# Fraction of missing values per column. Lag columns are NaN for the first rows
# of each group; the div columns can hold additional NaNs because zero
# denominators are replaced with NaN before dividing.
df.isna().mean()

source                         0.000000
time                           0.000000
temperature                    0.000000
temperature_lag_1              0.000523
temperature_lag_2              0.001046
                                 ...   
temperature_diff_lag_4_vs_5    0.002616
temperature_div_lag_4_vs_6     0.005755
temperature_diff_lag_4_vs_6    0.003139
temperature_div_lag_5_vs_6     0.005755
temperature_diff_lag_5_vs_6    0.003139
Length: 74, dtype: float64