# Imports

In [1]:
import numpy  as np
import pandas as pd

import matplotlib.pyplot as plt

from dstoolkit.feature_engine import SimpleLagTimeFeatureCreator

## Loading Dataset

In [2]:
# Data source: https://datahub.io/core/global-temp
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv("../data/global_temperature_time_series.csv")

In [3]:
df.head()

Unnamed: 0,Source,Year,Mean
0,gcag,1850-01,-0.6746
1,gcag,1850-02,-0.3334
2,gcag,1850-03,-0.5913
3,gcag,1850-04,-0.5887
4,gcag,1850-05,-0.5088


In [4]:
# Keep only the timestamp and the measurement, renamed to 'time' and 'temperature'.
# NOTE(review): dropping 'Source' leaves the rows of every source in one series; the
# tail output further down shows 2023-12-01 twice, so consecutive lags may mix
# sources -- confirm whether the data should be filtered to a single source first.
df = df.drop(['Source'], axis=1).rename({'Year': 'time', 'Mean': 'temperature'}, axis=1)
# Parse the 'YYYY-MM' strings into timestamps (shown later as the first day of the month).
df['time'] = pd.to_datetime(df['time'])

## Class SimpleLagTimeFeatureCreator

In [5]:
import warnings

import numpy  as np
import pandas as pd

from typing import List, Union
from itertools import combinations

from scipy.stats import skew, kurtosis


class SimpleLagTimeFeatureCreator:
    """
    Build lag-based features for a single column of a time-ordered DataFrame.

    For a target column ``y`` the creator produces:

    * ``y_lag_i`` for ``i`` in ``1..max(windows)`` (plain shifts);
    * ``y_<func>_last_<win>_lags`` for every requested function and window
      (rolling statistics computed over ``y_lag_1``);
    * ``y_div_lag_i_vs_j`` (ratios) and ``y_diff_lag_i_vs_j`` (differences)
      for every pair ``i < j`` of lags, when enabled.
    """

    def __init__(
            self, 
            windows: Union[List[int], None] = None, 
            functions: Union[List[str], None] = None, 
            add_div: bool = True, 
            add_diff: bool = True
        ):
        """
        Initiate the lag feature creator.

        Args:
            windows: List of window sizes for calculating statistics.
                Defaults to ``[2, 3, 4]``. Entries equal to 1 are dropped
                with a ``UserWarning``.
            functions: List of function names to be applied. Supported names
                are the keys of ``self._function_map``: mean, median, max,
                min, sum, std, kurt, skew, slope.
                Defaults to ``["mean", "median", "max", "min"]``.
            add_div: Whether ``create`` adds the ratio features lag_i / lag_j.
            add_diff: Whether ``create`` adds the difference features
                lag_i - lag_j.
        """
        # Copy the inputs: the lists are stored on the instance and `windows`
        # may be filtered below, so neither the caller's list nor a shared
        # default object must ever be modified.
        self.windows = [2, 3, 4] if windows is None else list(windows)
        self.functions = (
            ["mean", "median", "max", "min"] if functions is None else list(functions)
        )
        self.add_div = add_div
        self.add_diff = add_diff

        self._function_map = {
            'mean': np.mean,
            'median': np.median,
            'max': np.max,
            'min': np.min,
            'sum': np.sum,
            'std': lambda x: np.std(x, ddof=1),  # sample standard deviation
            'kurt': lambda x: kurtosis(x, bias=False),
            'skew': lambda x: skew(x, bias=False),
            'slope': self._calc_slope
        }

        if 1 in self.windows:
            warnings.warn(
                "Window size 1 was found. Windows with size equal to 1 are not recommended for lag/rolling calculations, because they do not aggregate temporal information.",
                UserWarning
            )
            # Drop every occurrence of 1 (not just the first) on our own copy.
            self.windows = [win for win in self.windows if win != 1]

    def _calc_slope(self, x: np.ndarray) -> float:
        """Calculates the slope using least squares."""
        # Degree-1 polynomial fit against positions 0..len(x)-1; the first
        # coefficient returned by polyfit is the slope.
        return np.polyfit(np.arange(len(x)), x, 1)[0]

    def _create_lag_features(self, series: pd.Series) -> None:
        """
        Creates all lagged features for a time series.

        Resets ``self.features`` with the plain lags 1..max(windows) and then
        adds one rolling statistic per (function, window) pair.

        Raises:
            KeyError: If a name in ``self.functions`` is not a supported function.
        """
        self.features = {f'{series.name}_lag_{i}': series.shift(i) for i in range(1, max(self.windows) + 1)}

        # The rolling statistics are taken over lag_1 rather than the raw
        # series, so the current row's own value never enters its features.
        lag_one = self.features[f'{series.name}_lag_1']

        for func in self.functions:
            func_operation = self._function_map[func]
            for win in self.windows:
                feature_values = lag_one.rolling(window=win, min_periods=2).apply(func_operation, raw=True)
                self.features[f'{series.name}_{func}_last_{win}_lags'] = feature_values

    def _create_lag_div_features(self, series: pd.Series, max_lag: int = 4) -> None:
        """
        Creates div between all lag combinations: lag_i/lag_j for i < j

        Requires ``self.features`` to already hold lags 1..max_lag.
        """
        for i, j in combinations(range(1, max_lag + 1), 2):
            # Zeros in the denominator become NaN to avoid division by zero.
            denominator = self.features[f'{series.name}_lag_{j}'].replace({0: np.nan})
            self.features[f'{series.name}_div_lag_{i}_vs_{j}'] = self.features[f'{series.name}_lag_{i}'] / denominator

    def _create_lag_diff_features(self, series: pd.Series, max_lag: int = 4) -> None:
        """
        Creates differences between all lag combinations: lag_i - lag_j for i < j

        Requires ``self.features`` to already hold lags 1..max_lag.
        """
        for i, j in combinations(range(1, max_lag + 1), 2):
            diff = self.features[f'{series.name}_lag_{i}'] - self.features[f'{series.name}_lag_{j}']
            self.features[f'{series.name}_diff_lag_{i}_vs_{j}'] = diff

    def create(self, df: pd.DataFrame, target: str, time: str) -> pd.DataFrame:
        """
        Calculates all features with temporal lag for the target column.

        The input is sorted by ``time`` first, so the rows of the result are
        in temporal order regardless of the input order. The generated
        columns are also kept on ``self.features``.

        Args:
            df: Input DataFrame
            target: Name of the target column for feature calculation
            time: Name of the column with temporal data

        Returns:
            DataFrame with the new features added

        Raises:
            ValueError: If ``target`` is not a column of ``df``.
        """
        if target not in df.columns:
            raise ValueError(f"Coluna '{target}' não encontrada no DataFrame")

        # sort_values returns a new frame; the caller's DataFrame is untouched.
        df = df.sort_values(by=time)

        self._create_lag_features(df[target])

        # Pairwise features span every lag that was generated above.
        max_lag = max(self.windows)

        if self.add_div:
            self._create_lag_div_features(df[target], max_lag=max_lag)

        if self.add_diff:
            self._create_lag_diff_features(df[target], max_lag=max_lag)

        return df.assign(**self.features)

In [6]:
# Keep the two working columns and shuffle the rows: sample(frac=1) returns every
# row in random order. Presumably this is to show that create() re-sorts by time.
# NOTE(review): no random_state is passed, so the displayed order changes per run.
df = df.loc[:, ['time', 'temperature']].sample(frac=1)
df.head()

Unnamed: 0,time,temperature
269,1872-06-01,-0.2794
1802,1940-02-01,0.08
406,1881-12-01,-0.07
2663,1975-12-01,-0.2468
96,1858-01-01,-0.2955


In [7]:
# Windows 2-6 yield lags 1..6 plus one rolling statistic per (function, window) pair;
# add_div / add_diff keep their default of True, so pairwise ratios and differences are added too.
s = SimpleLagTimeFeatureCreator(windows=[2, 3, 4, 5, 6], functions=['mean', 'median', 'max', 'min', 'sum', 'std', 'slope'])
# create() sorts by 'time' and returns the frame with the feature columns appended.
df = s.create(df, 'temperature', time='time')

In [8]:
df.head(10)

Unnamed: 0,time,temperature,temperature_lag_1,temperature_lag_2,temperature_lag_3,temperature_lag_4,temperature_lag_5,temperature_lag_6,temperature_mean_last_2_lags,temperature_mean_last_3_lags,...,temperature_diff_lag_2_vs_3,temperature_diff_lag_2_vs_4,temperature_diff_lag_2_vs_5,temperature_diff_lag_2_vs_6,temperature_diff_lag_3_vs_4,temperature_diff_lag_3_vs_5,temperature_diff_lag_3_vs_6,temperature_diff_lag_4_vs_5,temperature_diff_lag_4_vs_6,temperature_diff_lag_5_vs_6
0,1850-01-01,-0.6746,,,,,,,,,...,,,,,,,,,,
1,1850-02-01,-0.3334,-0.6746,,,,,,,,...,,,,,,,,,,
2,1850-03-01,-0.5913,-0.3334,-0.6746,,,,,-0.504,,...,,,,,,,,,,
3,1850-04-01,-0.5887,-0.5913,-0.3334,-0.6746,,,,-0.46235,-0.5331,...,0.3412,,,,,,,,,
4,1850-05-01,-0.5088,-0.5887,-0.5913,-0.3334,-0.6746,,,-0.59,-0.504467,...,-0.2579,0.0833,,,0.3412,,,,,
5,1850-06-01,-0.3442,-0.5088,-0.5887,-0.5913,-0.3334,-0.6746,,-0.54875,-0.562933,...,0.0026,-0.2553,0.0859,,-0.2579,0.0833,,0.3412,,
6,1850-07-01,-0.1598,-0.3442,-0.5088,-0.5887,-0.5913,-0.3334,-0.6746,-0.4265,-0.480567,...,0.0799,0.0825,-0.1754,0.1658,0.0026,-0.2553,0.0859,-0.2579,0.0833,0.3412
7,1850-08-01,-0.2077,-0.1598,-0.3442,-0.5088,-0.5887,-0.5913,-0.3334,-0.252,-0.3376,...,0.1646,0.2445,0.2471,-0.0108,0.0799,0.0825,-0.1754,0.0026,-0.2553,-0.2579
8,1850-09-01,-0.3847,-0.2077,-0.1598,-0.3442,-0.5088,-0.5887,-0.5913,-0.18375,-0.237233,...,0.1844,0.349,0.4289,0.4315,0.1646,0.2445,0.2471,0.0799,0.0825,0.0026
9,1850-10-01,-0.5331,-0.3847,-0.2077,-0.1598,-0.3442,-0.5088,-0.5887,-0.2962,-0.250733,...,-0.0479,0.1365,0.3011,0.381,0.1844,0.349,0.4289,0.1646,0.2445,0.0799


In [9]:
df.tail(10)

Unnamed: 0,time,temperature,temperature_lag_1,temperature_lag_2,temperature_lag_3,temperature_lag_4,temperature_lag_5,temperature_lag_6,temperature_mean_last_2_lags,temperature_mean_last_3_lags,...,temperature_diff_lag_2_vs_3,temperature_diff_lag_2_vs_4,temperature_diff_lag_2_vs_5,temperature_diff_lag_2_vs_6,temperature_diff_lag_3_vs_4,temperature_diff_lag_3_vs_5,temperature_diff_lag_3_vs_6,temperature_diff_lag_4_vs_5,temperature_diff_lag_4_vs_6,temperature_diff_lag_5_vs_6
3813,2023-11-01,1.3338,1.42,1.2866,1.34,1.48,1.3522,1.1993,1.3533,1.348867,...,-0.0534,-0.1934,-0.0656,0.0873,-0.14,-0.0122,0.1407,0.1278,0.2807,0.1529
3814,2023-12-01,1.35,1.3338,1.42,1.2866,1.34,1.48,1.3522,1.3769,1.3468,...,0.1334,0.08,-0.06,0.0678,-0.0534,-0.1934,-0.0656,-0.14,-0.0122,0.1278
3815,2023-12-01,1.2586,1.35,1.3338,1.42,1.2866,1.34,1.48,1.3419,1.367933,...,-0.0862,0.0472,-0.0062,-0.1462,0.1334,0.08,-0.06,-0.0534,-0.1934,-0.14
3816,2024-01-01,1.1516,1.2586,1.35,1.3338,1.42,1.2866,1.34,1.3043,1.314133,...,0.0162,-0.07,0.0634,0.01,-0.0862,0.0472,-0.0062,0.1334,0.08,-0.0534
3817,2024-02-01,1.2902,1.1516,1.2586,1.35,1.3338,1.42,1.2866,1.2051,1.2534,...,-0.0914,-0.0752,-0.1614,-0.028,0.0162,-0.07,0.0634,-0.0862,0.0472,0.1334
3818,2024-03-01,1.2515,1.2902,1.1516,1.2586,1.35,1.3338,1.42,1.2209,1.233467,...,-0.107,-0.1984,-0.1822,-0.2684,-0.0914,-0.0752,-0.1614,0.0162,-0.07,-0.0862
3819,2024-04-01,1.2053,1.2515,1.2902,1.1516,1.2586,1.35,1.3338,1.27085,1.2311,...,0.1386,0.0316,-0.0598,-0.0436,-0.107,-0.1984,-0.1822,-0.0914,-0.0752,0.0162
3820,2024-05-01,1.0745,1.2053,1.2515,1.2902,1.1516,1.2586,1.35,1.2284,1.249,...,-0.0387,0.0999,-0.0071,-0.0985,0.1386,0.0316,-0.0598,-0.107,-0.1984,-0.0914
3821,2024-06-01,1.1154,1.0745,1.2053,1.2515,1.2902,1.1516,1.2586,1.1399,1.1771,...,-0.0462,-0.0849,0.0537,-0.0533,-0.0387,0.0999,-0.0071,0.1386,0.0316,-0.107
3822,2024-07-01,1.1398,1.1154,1.0745,1.2053,1.2515,1.2902,1.1516,1.09495,1.131733,...,-0.1308,-0.177,-0.2157,-0.0771,-0.0462,-0.0849,0.0537,-0.0387,0.0999,0.1386


In [11]:
df.columns.tolist()

['time',
 'temperature',
 'temperature_lag_1',
 'temperature_lag_2',
 'temperature_lag_3',
 'temperature_lag_4',
 'temperature_lag_5',
 'temperature_lag_6',
 'temperature_mean_last_2_lags',
 'temperature_mean_last_3_lags',
 'temperature_mean_last_4_lags',
 'temperature_mean_last_5_lags',
 'temperature_mean_last_6_lags',
 'temperature_median_last_2_lags',
 'temperature_median_last_3_lags',
 'temperature_median_last_4_lags',
 'temperature_median_last_5_lags',
 'temperature_median_last_6_lags',
 'temperature_max_last_2_lags',
 'temperature_max_last_3_lags',
 'temperature_max_last_4_lags',
 'temperature_max_last_5_lags',
 'temperature_max_last_6_lags',
 'temperature_min_last_2_lags',
 'temperature_min_last_3_lags',
 'temperature_min_last_4_lags',
 'temperature_min_last_5_lags',
 'temperature_min_last_6_lags',
 'temperature_sum_last_2_lags',
 'temperature_sum_last_3_lags',
 'temperature_sum_last_4_lags',
 'temperature_sum_last_5_lags',
 'temperature_sum_last_6_lags',
 'temperature_std_last_