In [6]:
import numpy as np
import pandas as pd
import uuid


def generate_lag_data(
    sample_size=10,
    feature_size=3,
    lag_size=3,
    low=0,
    high=100,
    pk: str = "id",
    seed=None,
):
    """
    Generate a dataframe of random integer lagged features with a UUID index.

    One column named ``feature{f}_l{k}m`` is created for every feature
    ``f = 1..feature_size`` and every lag ``k = 1..lag_size``; columns are
    ordered feature by feature, lag by lag.

    Parameters:
    - sample_size (int): Number of samples (rows).
    - feature_size (int): Number of features.
    - lag_size (int): Number of lagged versions per feature.
    - low (int): Lower bound (inclusive) of the random integers.
    - high (int): Upper bound (exclusive) of the random integers.
    - pk (str): Name given to the index (the primary key).
    - seed (int | None): When given, both the values and the UUIDs are drawn
      from a private ``np.random.RandomState(seed)`` so the whole frame is
      reproducible. When ``None`` (default) the global NumPy random state and
      ``uuid.uuid4()`` are used.

    Returns:
    - pd.DataFrame of shape ``(sample_size, feature_size * lag_size)``
      indexed by UUID strings.

    Example (values and ids are random, so the exact output will differ):
    >>> df = generate_lag_data(sample_size=3, feature_size=2, lag_size=2)
    >>> df
                                            feature1_l1m  feature1_l2m  feature2_l1m  feature2_l2m
    id
    f47a0d65-42d5-4c93-bb6b-3d38d8e24c6f      19          29          54          61
    62e76469-0e53-4347-91a4-5c39d1d1f089      72          15          33          48
    f418d4d7-7cb5-4e1f-9b6d-dcfb7d3281ef      38          48          13          88
    """
    if seed is None:
        # Keep using the global NumPy state so callers that already rely on
        # `np.random.seed(...)` get the same stream as before.
        rng = np.random
        uuids = [str(uuid.uuid4()) for _ in range(sample_size)]
    else:
        rng = np.random.RandomState(seed)
        # Derive the ids from the seeded generator too; `version=4` sets the
        # variant/version bits so the result is a well-formed random UUID.
        uuids = [
            str(uuid.UUID(bytes=rng.bytes(16), version=4))
            for _ in range(sample_size)
        ]

    columns = {
        f"feature{feature}_l{lag}m": rng.randint(low, high, size=sample_size)
        for feature in range(1, feature_size + 1)
        for lag in range(1, lag_size + 1)
    }

    # Passing the index up front also handles the zero-column case, where
    # assigning an index afterwards would fail with a length mismatch.
    return pd.DataFrame(columns, index=pd.Index(uuids, name=pk))


# Demo frame: 100 rows, a single feature with 12 lag columns.
data = generate_lag_data(
    sample_size=100,
    feature_size=1,
    lag_size=12,
)
data.head()

Unnamed: 0_level_0,feature1_l1m,feature1_l2m,feature1_l3m,feature1_l4m,feature1_l5m,feature1_l6m,feature1_l7m,feature1_l8m,feature1_l9m,feature1_l10m,feature1_l11m,feature1_l12m
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
a8fcbf86-bad9-43df-a506-5fe54e7caee9,61,6,78,84,55,4,74,68,84,47,55,56
bcbb85ec-a8b7-42b4-8c70-bd2c4fbd69f4,57,14,90,40,12,53,42,41,78,29,27,39
9db78cec-4ad9-4b2b-9c66-a23737041657,66,27,4,87,90,28,47,96,86,12,93,12
a8e7200d-1a14-43ff-889b-6efe05a5d1a8,96,52,43,40,7,95,8,29,4,42,82,26
4c41f361-d0a9-4c4b-8be4-2af28659ba36,14,63,1,37,75,64,13,3,68,75,67,13


In [65]:
import re
import pandas as pd


def rolling_avg_lag_features(
    data: pd.DataFrame, *, cols: list[str], group_size: int = 4, weight: int = 2
) -> pd.DataFrame:
    """
    Add grouped averages, narrowed averages and lag-to-lag ratios per feature.

    For every base feature name in ``cols`` the columns named exactly
    ``{col}_l{k}m`` are collected and ordered by their lag number ``k``.
    Derived column names are positional, i.e. they assume the lags are
    numbered consecutively starting at 1. The following columns are added to
    a copy of ``data``:

    * ``avg_{col}_l{a}-{b}m`` - mean of each consecutive block of
      ``group_size`` lag columns.
    * ``avg_{col}_l1-{size}m`` - mean of the first ``size`` lag columns for
      ``size = 2 * group_size, 3 * group_size, ...`` (the first block is
      already covered by the grouped average of the same name).
    * ``{col}_l{i}-{i+1}m_diff_prc`` - ratio ``lag(i+1) / lag(i)``, clipped
      to at most 10 (so division by zero, which yields ``inf``, becomes 10;
      ``0 / 0`` stays NaN).
    * ``avg_{col}_l1-{n}m_diff_prc`` / ``std_{col}_l1-{n}m_diff_prc`` - row
      wise mean / standard deviation of this feature's ratio columns, where
      ``n`` is the number of lag columns. Only added when at least one ratio
      exists.

    Args:
        data (pd.DataFrame): Input dataframe with lagged features. Not modified.
        cols (list[str]): List of base feature names (keyword-only).
        group_size (int): Number of lagged features to average over. Must
            divide the number of lagged features of every feature in ``cols``.
        weight (int): Currently unused; kept so existing callers keep working.

    Returns:
        pd.DataFrame: Copy of ``data`` with the derived columns appended.

    Raises:
        ValueError: If ``group_size`` is smaller than 1, if a feature has no
            lag columns, or if its lag count is not divisible by ``group_size``.

    Example:
        >>> df = pd.DataFrame({
        ...     'feature1_l1m': [1, 2, 3],
        ...     'feature1_l2m': [4, 5, 6],
        ...     'feature1_l3m': [7, 8, 9],
        ...     'feature1_l4m': [10, 11, 12]
        ... })
        >>> out = rolling_avg_lag_features(df, cols=['feature1'], group_size=4)
        >>> out['avg_feature1_l1-4m'].tolist()
        [5.5, 6.5, 7.5]
        >>> out['feature1_l1-2m_diff_prc'].tolist()
        [4.0, 2.5, 2.0]
    """
    if group_size < 1:
        raise ValueError("`group_size` must be a positive integer.")

    data = data.copy()

    for col in cols:
        # Escape the feature name and require a full match so that only the
        # raw lag columns of *this* feature are picked up.
        lag_pattern = re.compile(rf"{re.escape(col)}_l(\d+)m")
        lag_numbers = {}
        for name in data.columns:
            found = lag_pattern.fullmatch(str(name))
            if found:
                lag_numbers[name] = int(found.group(1))
        # Sort by the captured lag number (not by the first digit in the
        # name, which would be the digit inside e.g. "feature1").
        select_cols = sorted(lag_numbers, key=lag_numbers.get)
        n_lags = len(select_cols)

        if n_lags == 0:
            raise ValueError(f"No lagged columns found for feature `{col}`.")
        if n_lags % group_size != 0:
            raise ValueError(
                f"Number of lagged features must be divisible by group_size `{group_size}`."
            )

        # Snapshot of the raw lag columns, in lag order.
        lags = data[select_cols]

        # grouping average lagged features
        for start in range(0, n_lags, group_size):
            block = lags.iloc[:, start : start + group_size]
            data[f"avg_{col}_l{start + 1}-{start + group_size}m"] = block.mean(
                axis=1
            )

        # narrowing average lagged features (first `size` lags)
        for size in range(2 * group_size, n_lags + 1, group_size):
            data[f"avg_{col}_l1-{size}m"] = lags.iloc[:, :size].mean(axis=1)

        # ratio between consecutive lags, capped at 10
        ratio_cols = []
        for i in range(1, n_lags):
            ratio_col = f"{col}_l{i}-{i + 1}m_diff_prc"
            ratio = lags.iloc[:, i] / lags.iloc[:, i - 1]
            data[ratio_col] = ratio.clip(upper=10)
            ratio_cols.append(ratio_col)

        # average/std of this feature's ratios only
        if ratio_cols:
            data[f"avg_{col}_l1-{n_lags}m_diff_prc"] = data[ratio_cols].mean(
                axis=1
            )
            data[f"std_{col}_l1-{n_lags}m_diff_prc"] = data[ratio_cols].std(
                axis=1
            )

    return data


# Build the derived lag features for `feature1`, averaging lags in blocks of 3.
rolling_avg_lag_features(
    data,
    cols=["feature1"],
    group_size=3,
    weight=2,
)

Unnamed: 0_level_0,feature1_l1m,feature1_l2m,feature1_l3m,feature1_l4m,feature1_l5m,feature1_l6m,feature1_l7m,feature1_l8m,feature1_l9m,feature1_l10m,...,feature1_l4-5m_diff_prc,feature1_l5-6m_diff_prc,feature1_l6-7m_diff_prc,feature1_l7-8m_diff_prc,feature1_l8-9m_diff_prc,feature1_l9-10m_diff_prc,feature1_l10-11m_diff_prc,feature1_l11-12m_diff_prc,avg_feature1_l1-12m_diff_prc,std_feature1_l1-12m_diff_prc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3abb5c81-6f60-4e2f-aa92-f49bfcf32b9d,3,41,32,78,46,52,91,76,14,31,...,0.589744,1.130435,1.750000,0.835165,0.184211,2.214286,1.290323,2.325000,2.139741,2.712571
ffb23698-b762-45c4-a0e3-ecff65827856,24,9,1,81,89,89,66,14,55,78,...,1.098765,1.000000,0.741573,0.212121,3.928571,1.418182,0.833333,0.846154,1.869528,2.887283
939e2265-d0bc-4f26-9bc5-4e97e0c1d7ad,95,92,48,58,33,90,19,39,18,0,...,0.568966,2.727273,0.211111,2.052632,0.461538,0.000000,10.000000,10.000000,2.610910,3.740553
dad5b361-0f47-4116-b964-7dd9d87a7c60,17,72,30,78,10,65,53,9,73,22,...,0.128205,6.500000,0.815385,0.169811,8.111111,0.301370,2.727273,0.516667,2.411071,2.783369
86800a19-8adf-43f1-8a63-e5fdef7764db,99,62,68,98,70,91,46,7,78,52,...,0.714286,1.300000,0.505495,0.152174,10.000000,0.666667,1.788462,0.204301,1.681418,2.806010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b499830e-31f8-41fa-8e18-c8279e1122f0,67,67,26,84,83,54,49,74,20,5,...,0.988095,0.650602,0.907407,1.510204,0.270270,0.250000,10.000000,0.475000,1.788219,2.850936
463d527f-2f33-454e-a30d-8595adc5a449,40,80,22,26,32,8,81,66,5,94,...,1.230769,0.250000,10.000000,0.814815,0.075758,10.000000,0.170213,1.000000,2.454398,3.774699
bb6b57d6-2a55-4f37-940b-052485802b48,83,51,91,22,17,44,96,74,8,0,...,0.772727,2.588235,2.181818,0.770833,0.108108,0.000000,10.000000,1.951220,1.910316,2.827489
f147690c-08f7-4ded-9c3f-a4ad3c0c3411,85,64,64,93,24,1,44,64,77,8,...,0.258065,0.041667,10.000000,1.454545,1.203125,0.103896,7.500000,0.533333,2.209154,3.319206


In [55]:
import re
import pandas as pd


def compute_grouped_avg(
    data: pd.DataFrame, cols: list[str], group_size: int
) -> pd.DataFrame:
    """
    Compute averages over consecutive blocks of lagged features.

    For each base feature name in ``cols`` the columns named exactly
    ``{col}_l{k}m`` are collected, ordered by their lag number ``k`` and cut
    into consecutive blocks of ``group_size`` columns. The row-wise mean of
    each block is stored as ``avg_{col}_l{a}-{b}m``, where ``a``/``b`` are the
    1-based positions of the first/last column of the block (equal to the lag
    numbers when lags are numbered consecutively from 1). A feature with no
    matching lag columns is left untouched.

    Args:
        data (pd.DataFrame): Input dataframe with lagged features. Not modified.
        cols (list[str]): List of base feature names.
        group_size (int): Number of lagged features to average over. Must divide the total number of lagged features.

    Returns:
        pd.DataFrame: Copy of ``data`` with new columns for grouped averages.

    Raises:
        ValueError: If ``group_size`` is smaller than 1 or does not divide the
            number of lag columns of a feature.

    Example:
        >>> df = pd.DataFrame({
        ...     'feature1_l1m': [1, 2, 3],
        ...     'feature1_l2m': [4, 5, 6],
        ...     'feature1_l3m': [7, 8, 9],
        ...     'feature1_l4m': [10, 11, 12]
        ... })
        >>> compute_grouped_avg(df, ['feature1'], group_size=4)
           feature1_l1m  feature1_l2m  feature1_l3m  feature1_l4m  avg_feature1_l1-4m
        0            1            4            7           10                   5.5
        1            2            5            8           11                   6.5
        2            3            6            9           12                   7.5
    """
    if group_size < 1:
        raise ValueError("`group_size` must be a positive integer.")

    data = data.copy()

    for col in cols:
        # Escaped, fully anchored pattern: only raw lag columns of this feature.
        lag_pattern = re.compile(rf"{re.escape(col)}_l(\d+)m")
        lag_numbers = {}
        for name in data.columns:
            found = lag_pattern.fullmatch(str(name))
            if found:
                lag_numbers[name] = int(found.group(1))
        # Order by the captured lag number; taking the first digit run of the
        # name would pick up digits of the feature name itself ("feature1").
        select_cols = sorted(lag_numbers, key=lag_numbers.get)

        if len(select_cols) % group_size != 0:
            raise ValueError(
                f"Number of lagged features must be divisible by group_size `{group_size}`."
            )

        lags = data[select_cols]

        for start in range(0, len(select_cols), group_size):
            block = lags.iloc[:, start : start + group_size]
            data[f"avg_{col}_l{start + 1}-{start + group_size}m"] = block.mean(
                axis=1
            )

    return data


def compute_narrowed_avg(
    data: pd.DataFrame, cols: list[str], group_size: int
) -> pd.DataFrame:
    """
    Compute averages over the first ``size`` lagged features of each feature.

    For each base feature name in ``cols`` the columns named exactly
    ``{col}_l{k}m`` are collected and ordered by lag number. For every
    ``size`` in ``group_size, 2 * group_size, ...`` up to the number of lag
    columns, the row-wise mean of the first ``size`` lag columns is stored as
    ``avg_{col}_l1-{size}m`` (an existing column of that name is overwritten).

    The averages are computed directly from the raw lag columns, so the
    function does not depend on grouped ``avg_*`` columns being present or on
    the ``group_size`` they were built with.

    Args:
        data (pd.DataFrame): Input dataframe with lagged features. Not modified.
        cols (list[str]): List of base feature names.
        group_size (int): Step by which the averaging window grows.

    Returns:
        pd.DataFrame: Copy of ``data`` with new columns for narrowed averages.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.

    Example:
        >>> df = pd.DataFrame({
        ...     'feature1_l1m': [1, 2, 3],
        ...     'feature1_l2m': [4, 5, 6],
        ...     'feature1_l3m': [7, 8, 9],
        ...     'feature1_l4m': [10, 11, 12],
        ...     'avg_feature1_l1-4m': [5.5, 6.5, 7.5]
        ... })
        >>> compute_narrowed_avg(df, ['feature1'], group_size=2)
           feature1_l1m  feature1_l2m  feature1_l3m  feature1_l4m  avg_feature1_l1-4m  avg_feature1_l1-2m
        0            1            4            7           10                   5.5                  2.5
        1            2            5            8           11                   6.5                  3.5
        2            3            6            9           12                   7.5                  4.5
    """
    if group_size < 1:
        raise ValueError("`group_size` must be a positive integer.")

    data = data.copy()

    for col in cols:
        # Escaped, fully anchored pattern: only raw lag columns of this
        # feature (derived "avg_..." columns never match).
        lag_pattern = re.compile(rf"{re.escape(col)}_l(\d+)m")
        lag_numbers = {}
        for name in data.columns:
            found = lag_pattern.fullmatch(str(name))
            if found:
                lag_numbers[name] = int(found.group(1))
        lag_cols = sorted(lag_numbers, key=lag_numbers.get)

        # Snapshot so columns added below cannot leak into later windows.
        lags = data[lag_cols]

        for size in range(group_size, len(lag_cols) + 1, group_size):
            data[f"avg_{col}_l1-{size}m"] = lags.iloc[:, :size].mean(axis=1)

    return data

def compute_prc_change():
    """Placeholder with no implementation yet; takes no arguments and returns None."""
    return None
def compute_total_prc_change():
    """Placeholder with no implementation yet; takes no arguments and returns None."""
    return None

def rolling_avg_lag_features(
    data: pd.DataFrame, *, cols: list[str], group_size: int = 4,
) -> pd.DataFrame:
    """
    Chain the grouped and narrowed lag averages and round the result.

    The input is passed through ``compute_grouped_avg`` and then
    ``compute_narrowed_avg`` with the same ``cols`` / ``group_size``; the
    resulting frame (all of its columns) is rounded to 3 decimals.

    Args:
        data (pd.DataFrame): Input dataframe with lagged features.
        cols (list[str]): List of base feature names (keyword-only).
        group_size (int): Number of lagged features to average over. Must divide the total number of lagged features.

    Returns:
        pd.DataFrame: DataFrame with new columns for moving averages,
        rounded to 3 decimals.

    Example:
        >>> df = pd.DataFrame({
        ...     'feature1_l1m': [1, 2, 3],
        ...     'feature1_l2m': [4, 5, 6],
        ...     'feature1_l3m': [7, 8, 9],
        ...     'feature1_l4m': [10, 11, 12]
        ... })
        >>> rolling_avg_lag_features(df, cols=['feature1'], group_size=4)
           feature1_l1m  feature1_l2m  feature1_l3m  feature1_l4m  avg_feature1_l1-4m
        0            1            4            7           10                   5.5
        1            2            5            8           11                   6.5
        2            3            6            9           12                   7.5
    """
    shared_kwargs = {"cols": cols, "group_size": group_size}
    return (
        data
        .pipe(compute_grouped_avg, **shared_kwargs)
        .pipe(compute_narrowed_avg, **shared_kwargs)
        .round(3)
    )

# Grouped + narrowed averages for `feature1`, in blocks of 3 lags.
rolling_avg_lag_features(
    data,
    cols=["feature1"],
    group_size=3,
)

Unnamed: 0_level_0,feature1_l1m,feature1_l2m,feature1_l3m,feature1_l4m,feature1_l5m,feature1_l6m,feature1_l7m,feature1_l8m,feature1_l9m,feature1_l10m,feature1_l11m,feature1_l12m,avg_feature1_l1-3m,avg_feature1_l4-6m,avg_feature1_l7-9m,avg_feature1_l10-12m,avg_feature1_l1-6m,avg_feature1_l1-9m,avg_feature1_l1-12m
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3abb5c81-6f60-4e2f-aa92-f49bfcf32b9d,3,41,32,78,46,52,91,76,14,31,40,93,25.333,58.667,60.333,54.667,42.000,48.111,49.750
ffb23698-b762-45c4-a0e3-ecff65827856,24,9,1,81,89,89,66,14,55,78,65,55,11.333,86.333,45.000,66.000,48.833,47.556,52.167
939e2265-d0bc-4f26-9bc5-4e97e0c1d7ad,95,92,48,58,33,90,19,39,18,0,3,44,78.333,60.333,25.333,15.667,69.333,54.667,44.917
dad5b361-0f47-4116-b964-7dd9d87a7c60,17,72,30,78,10,65,53,9,73,22,60,31,39.667,51.000,45.000,37.667,45.333,45.222,43.333
86800a19-8adf-43f1-8a63-e5fdef7764db,99,62,68,98,70,91,46,7,78,52,93,19,76.333,86.333,43.667,54.667,81.333,68.778,65.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b499830e-31f8-41fa-8e18-c8279e1122f0,67,67,26,84,83,54,49,74,20,5,80,38,53.333,73.667,47.667,41.000,63.500,58.222,53.917
463d527f-2f33-454e-a30d-8595adc5a449,40,80,22,26,32,8,81,66,5,94,16,16,47.333,22.000,50.667,42.000,34.667,40.000,40.500
bb6b57d6-2a55-4f37-940b-052485802b48,83,51,91,22,17,44,96,74,8,0,41,80,75.000,27.667,59.333,40.333,51.333,54.000,50.583
f147690c-08f7-4ded-9c3f-a4ad3c0c3411,85,64,64,93,24,1,44,64,77,8,60,32,71.000,39.333,61.667,33.333,55.167,57.333,51.333


In [7]:
import re
import pandas as pd
import itertools
import functools
import more_itertools

def rolling_avg_lag_features(
    data: pd.DataFrame, *, cols: list[str], group_size: int = 4, weight: int = 2
) -> pd.DataFrame:
    """
    Add grouped averages, narrowed averages and lag-to-lag ratios per feature.

    For every base feature name in ``cols`` the columns named exactly
    ``{col}_l{k}m`` are collected and ordered by their lag number ``k``.
    Derived column names are positional, i.e. they assume the lags are
    numbered consecutively starting at 1. Per feature, in this order, the
    following columns are appended:

    * ``avg_{col}_l{a}-{b}m`` - mean of each consecutive block of
      ``group_size`` lag columns.
    * ``avg_{col}_l1-{size}m`` - mean of the first ``size`` lag columns for
      ``size = 2 * group_size, 3 * group_size, ...`` (the first block is
      already covered by the grouped average of the same name).
    * ``{col}_l{i}-{i+1}m_diff_prc`` - ratio ``lag(i+1) / lag(i)``, clipped
      to at most 10 (division by zero yields ``inf`` and therefore 10;
      ``0 / 0`` stays NaN).
    * ``avg_{col}_l1-{n}m_diff_prc`` / ``std_{col}_l1-{n}m_diff_prc`` - row
      wise mean / standard deviation of the ratios, ``n`` being the number of
      lag columns. Only added when at least one ratio exists.

    Args:
        data (pd.DataFrame): Input dataframe with lagged features. Not modified.
        cols (list[str]): List of base feature names (keyword-only).
        group_size (int): Number of lagged features to average over. Must divide the total number of lagged features.
        weight (int): Currently unused; kept so existing callers keep working.

    Returns:
        pd.DataFrame: New DataFrame holding the original columns followed by
        the derived ones. A derived column that already exists in ``data`` is
        replaced rather than duplicated.

    Raises:
        ValueError: If ``group_size`` is smaller than 1, if a feature has no
            lag columns, or if its lag count is not divisible by ``group_size``.
    """
    if group_size < 1:
        raise ValueError("`group_size` must be a positive integer.")

    def get_lagged_columns(col: str) -> list:
        """Return the raw lag columns of `col`, ordered by lag number."""
        pattern = re.compile(rf"{re.escape(col)}_l(\d+)m")
        lag_numbers = {}
        for name in data.columns:
            found = pattern.fullmatch(str(name))
            if found:
                lag_numbers[name] = int(found.group(1))
        return sorted(lag_numbers, key=lag_numbers.get)

    def compute_group_means(lags: pd.DataFrame, col: str) -> dict:
        """Mean of every consecutive block of `group_size` lag columns."""
        return {
            f"avg_{col}_l{start + 1}-{start + group_size}m": lags.iloc[
                :, start : start + group_size
            ].mean(axis=1)
            for start in range(0, lags.shape[1], group_size)
        }

    def compute_narrowed_means(lags: pd.DataFrame, col: str) -> dict:
        """Mean of the first `size` lag columns for growing window sizes."""
        return {
            f"avg_{col}_l1-{size}m": lags.iloc[:, :size].mean(axis=1)
            for size in range(2 * group_size, lags.shape[1] + 1, group_size)
        }

    def compute_percentage_changes(lags: pd.DataFrame, col: str) -> pd.DataFrame:
        """Ratio between each lag and the previous one, capped at 10."""
        changes = {
            f"{col}_l{i}-{i + 1}m_diff_prc": lags.iloc[:, i] / lags.iloc[:, i - 1]
            for i in range(1, lags.shape[1])
        }
        return pd.DataFrame(changes, index=lags.index).clip(upper=10)

    # Collect every derived column first and attach them in one concat, which
    # avoids growing the frame column by column.
    derived = {}

    for col in cols:
        lagged_cols = get_lagged_columns(col)
        if not lagged_cols:
            raise ValueError(f"No lagged columns found for feature `{col}`.")
        if len(lagged_cols) % group_size != 0:
            raise ValueError(f"Number of lagged features must be divisible by group_size `{group_size}`.")

        df_lags = data[lagged_cols]
        derived.update(compute_group_means(df_lags, col))
        derived.update(compute_narrowed_means(df_lags, col))

        prc_changes_df = compute_percentage_changes(df_lags, col)
        for name in prc_changes_df.columns:
            derived[name] = prc_changes_df[name]
        if not prc_changes_df.empty:
            derived[f"avg_{col}_l1-{len(lagged_cols)}m_diff_prc"] = prc_changes_df.mean(axis=1)
            derived[f"std_{col}_l1-{len(lagged_cols)}m_diff_prc"] = prc_changes_df.std(axis=1)

    if not derived:
        return data.copy()

    # Drop stale copies of derived columns so names stay unique after concat.
    stale = [name for name in derived if name in data.columns]
    return pd.concat(
        [data.drop(columns=stale), pd.DataFrame(derived, index=data.index)],
        axis=1,
    )

# Example usage: fresh 100-row frame (one feature, 12 lags), averaged in blocks of 4.
df = generate_lag_data(
    sample_size=100,
    feature_size=1,
    lag_size=12,
)
result = rolling_avg_lag_features(
    df,
    cols=["feature1"],
    group_size=4,
)
result.head()


Unnamed: 0_level_0,feature1_l1m,feature1_l2m,feature1_l3m,feature1_l4m,feature1_l5m,feature1_l6m,feature1_l7m,feature1_l8m,feature1_l9m,feature1_l10m,...,feature1_l4-5m_diff_prc,feature1_l5-6m_diff_prc,feature1_l6-7m_diff_prc,feature1_l7-8m_diff_prc,feature1_l8-9m_diff_prc,feature1_l9-10m_diff_prc,feature1_l10-11m_diff_prc,feature1_l11-12m_diff_prc,avg_feature1_l1-12m_diff_prc,std_feature1_l1-12m_diff_prc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d245a134-a02a-4a2f-b2ce-cefca3ef107d,14,26,76,16,96,68,29,26,79,81,...,6.0,0.708333,0.426471,0.896552,3.038462,1.025316,0.148148,6.75,2.180366,2.304324
fcba13a3-a9ee-4fb2-9f6f-548994e6e8c0,55,67,84,14,10,44,99,11,94,5,...,0.714286,4.4,2.25,0.111111,8.545455,0.053191,1.6,10.0,2.755693,3.465314
b428ffcc-f0f4-4f9d-bc7a-e6299ffb6847,10,40,86,86,87,4,14,28,17,14,...,1.011628,0.045977,3.5,2.0,0.607143,0.823529,0.0,10.0,2.285298,2.872218
c0cd377e-7695-43f8-aa8c-338c6abdadcf,57,79,51,86,7,20,7,59,35,60,...,0.081395,2.857143,0.35,8.428571,0.59322,1.714286,1.183333,1.28169,1.837041,2.317891
bb174f27-8d29-4881-aaef-fbeecbf8e8bb,70,71,70,99,88,81,0,78,2,36,...,0.888889,0.920455,0.0,10.0,0.025641,10.0,1.861111,1.223881,2.57586,3.70965
