In [2]:
import polars as pl
import pickle 
import typing as t
import numpy as np
import random

# Single seed used throughout the notebook: it seeds NumPy's and the stdlib's
# global RNGs here and is passed explicitly as `seed=` to the sampling cell below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)

In [None]:
def create_calendric_features(df: pl.DataFrame, date_column: str) -> pl.DataFrame:
    """
    Create calendric features for a given Polars DataFrame with a date column.

    The date column is cast to ``pl.Date`` and these columns are added:
    ``month``, ``day_of_week`` (ISO numbering, Monday=1 ... Sunday=7),
    ``week_of_year`` (ISO week number as Int32), ``year``, ``is_weekend``
    (True for Saturday and Sunday only) and ``quarter`` (1-4, derived from
    ``month``).

    Parameters:
        df (pl.DataFrame): Input Polars DataFrame containing a date column.
        date_column (str): Name of the column containing dates.

    Returns:
        pl.DataFrame: Polars DataFrame with added calendric features.
    """
    date = pl.col(date_column)

    # Normalise the column to a calendar date (pl.Date) so the .dt accessors work.
    df = df.with_columns(date.cast(pl.Date).alias(date_column))

    df = df.with_columns([
        date.dt.month().alias("month"),
        date.dt.weekday().alias("day_of_week"),  # ISO: Monday=1, ..., Sunday=7
        # "%V" is the ISO week number; note it can belong to the neighbouring
        # calendar year around New Year, while "year" below is the calendar year.
        date.dt.strftime("%V").cast(pl.Int32).alias("week_of_year"),
        date.dt.year().alias("year"),
        # Saturday=6 and Sunday=7. The previous threshold of 5 also flagged Friday.
        (date.dt.weekday() >= 6).alias("is_weekend"),
    ])

    # Months 1-3 -> quarter 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
    df = df.with_columns(((pl.col("month") - 1) // 3 + 1).alias("quarter"))

    return df

def add_lag_features(
    df: pl.DataFrame,
    lags: t.Union[int, t.List[int], range, None] = 1,
    group_by_cols: t.Optional[t.List[str]] = None,
    value_col: str = "value",
    date_col: str = "date",
) -> pl.DataFrame:
    """
    Add lag features to a DataFrame.

    The frame is sorted by ``group_by_cols + [date_col]`` and ``value_col`` is
    shifted by each requested lag within every group.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame containing time series data.
    lags : Union[int, List[int], range], default 1
        Lag periods to create. If int, creates lags from 1 to that number
        (inclusive). If List[int] or range, creates lags for those specific
        values (repeated values are used once). If None, defaults to range(1, 8).
    group_by_cols : List[str], optional
        List of columns to group by when creating lags. If None, defaults to
        ["skuID", "frequency"] if both exist, otherwise ["skuID"]. An empty
        list shifts over the whole frame without grouping.
    value_col : str, default "value"
        Name of the column to create lag features for.
    date_col : str, default "date"
        Name of the date column used for sorting.

    Returns
    -------
    pl.DataFrame
        Sorted DataFrame with added lag columns named "{value_col}_lag_{lag}".

    Raises
    ------
    ValueError
        If a required column is missing from ``df``, or if ``lags`` is an
        integer smaller than 1.
    """
    # Normalise `lags` to an ordered list of distinct shift sizes. A bare int
    # used to reach the comprehension below and fail with "int is not iterable".
    if lags is None:
        lag_values = list(range(1, 8))
    elif isinstance(lags, (int, np.integer)):
        if lags < 1:
            raise ValueError(f"lags must be >= 1 when given as an int, got {lags}")
        lag_values = list(range(1, int(lags) + 1))
    else:
        # dict.fromkeys drops duplicates (which would yield clashing column
        # names) while keeping the caller's order.
        lag_values = list(dict.fromkeys(lags))

    # Resolve the grouping columns as documented above.
    if group_by_cols is None:
        has_both = "skuID" in df.columns and "frequency" in df.columns
        group_cols = ["skuID", "frequency"] if has_both else ["skuID"]
    else:
        group_cols = list(group_by_cols)

    sort_cols = group_cols + [date_col]

    # Validate that required columns exist
    missing_cols = [col for col in sort_cols + [value_col] if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in DataFrame: {missing_cols}")

    # Shifting is positional, so rows must be in date order inside each group.
    df_sorted = df.sort(sort_cols)

    def _lag_expr(lag: int) -> pl.Expr:
        shifted = pl.col(value_col).shift(lag)
        if group_cols:
            shifted = shifted.over(group_cols)
        return shifted.alias(f"{value_col}_lag_{lag}")

    return df_sorted.with_columns([_lag_expr(lag) for lag in lag_values])

def add_trend_feature(df: pl.DataFrame, date_col: str = "date") -> pl.DataFrame:
    """
    Adds a 'trend' feature to the DataFrame, counting days from the earliest to the latest date.

    The earliest date in ``date_col`` gets trend 1, the following calendar day 2,
    and so on; calendar days missing from the data still advance the counter.

    Parameters
    ----------
    df : pl.DataFrame
        Input DataFrame containing a date column.
    date_col : str, default "date"
        Name of the date column.
        NOTE(review): the lookup table is built with ``pl.date_range``, so this
        presumably expects a ``pl.Date`` column - confirm for datetime inputs.

    Returns
    -------
    pl.DataFrame
        DataFrame with a new 'trend' column joined on the date. If the frame is
        empty or the date column holds only nulls, 'trend' is an all-null Int64
        column.
    """
    dates = df[date_col]
    earliest = dates.min()
    latest = dates.max()

    # min()/max() return None when there is no non-null date; date_range cannot
    # be built from None, so return an explicit null trend instead of failing.
    if earliest is None or latest is None:
        return df.with_columns(pl.lit(None, dtype=pl.Int64).alias("trend"))

    # One row per calendar day from earliest to latest (inclusive), numbered 1..N.
    calendar = pl.date_range(earliest, latest, "1d", eager=True)
    trend_lookup = pl.DataFrame({
        date_col: calendar,
        "trend": pl.int_range(1, len(calendar) + 1, eager=True),
    })

    # Left join keeps every input row and attaches the day counter for its date.
    return df.join(trend_lookup, on=date_col, how="left")


: 

In [None]:
# Load the CSV listing the SKUs that are not part of the transformer's training data.
unseed_skus_csv =  '../../data/db_snapshot_offsite/unseen_sku/unseen.csv'
unseen_skus = pl.read_csv(unseed_skus_csv)

: 

In [None]:
# Alternative (currently disabled) location of the feature file:
#features_path = '../../data/db_snapshot_offsite/processed/train_data_features.feather'
features_path = '../../data/db_snapshot_offsite/train_data/train_data_features.feather'
target_path = '../../data/db_snapshot_offsite/train_data/train_data_target.feather'

df = pl.read_ipc(features_path)  # Feather files are read with Polars' `read_ipc`
train_target = pl.read_ipc(target_path)



: 

In [None]:
# Reduce the unseen-SKU table to its name columns plus the (productID, storeID)
# key that the next cell joins on.
untrained_names = unseen_skus.select("name", "name-2", "productID", "storeID")

: 

In [None]:
# Inner join: keep only feature rows whose (storeID, productID) pair appears in the unseen-SKU list.
df_unprocessed = df.join(untrained_names, on=["storeID","productID"], how="inner")

: 

In [None]:
# Attach the target by bdID. The left join keeps every feature row; rows without a
# matching bdID in train_target get a null target.
df_joined = df_unprocessed.join(train_target.select("bdID","target"), on="bdID", how="left")

: 

In [None]:
# Add month, day_of_week, week_of_year, year, is_weekend and quarter derived from 'date'.
df_calendric = create_calendric_features(df_joined, 'date')

: 

In [None]:
# One-hot encode the calendric columns created in the previous step.
df_dummies = df_calendric.to_dummies(
    columns=["day_of_week", "month", "quarter", "week_of_year", "year", "is_weekend"])

: 

In [None]:
# Create lags 1..7 of the target within each (skuID, frequency) group; the new
# columns are named target_lag_1 ... target_lag_7.
df_lagged = add_lag_features(
    df_dummies,
    lags=range(1, 8),
    group_by_cols=["skuID", "frequency"],
    value_col="target", # the lagged column is the target variable itself
    date_col="date"
)

: 

In [None]:
# Add a running day counter ('trend') that starts at 1 on the earliest date in the frame.
df_trend = add_trend_feature(df_lagged, date_col="date")

: 

In [None]:
# Drop every row that contains a null in any column.
# NOTE(review): df_filtered_1 is not referenced by any later cell visible here - confirm
# whether the null filtering is meant to feed the next step.
df_filtered_1 = df_trend.drop_nulls()


: 

In [None]:
# Keep rows where not_for_sale is not 1.
# NOTE(review): this filters df_trend, not df_filtered_1, so the drop_nulls() result
# computed in the previous cell is not carried forward - confirm that this is intended.
df_filterd_2 = df_trend.filter(pl.col("not_for_sale") != 1)

: 

In [None]:
# Drop the raw target so that only feature columns remain.
# The lag columns produced by add_lag_features are named "target_lag_<n>"; the
# previously listed "lag_target_1" matches no column, which makes DataFrame.drop
# raise on Polars versions that check column names strictly.
# NOTE(review): if lag 1 should be removed as well, add "target_lag_1" here - but a
# later cell still reads that column from the cleaned frame.
df_filterd_3 = df_filterd_2.drop("target")

: 

In [None]:
# Final ordering of the cleaned feature frame: by SKU, then chronologically.
df_clean = df_filterd_3.sort(["skuID","date"])

: 

In [None]:
#df_clean.write_ipc("../../data/db_snapshot_offsite/train_data/processed/train_data_features.feather") 

: 

In [28]:
# Sample 3 random (productID, storeID) combinations directly out of the dataframe
# and return them as a list of tuples.
# unique() does not guarantee a row order, so the pairs are sorted before sampling;
# without that, the same seed can pick different rows from one run to the next.
sku_tuples = (
    df_clean
    .select("productID", "storeID")
    .unique()
    .sort(["productID", "storeID"])
    .sample(3, seed=RANDOM_STATE)
    .rows()  # list of (productID, storeID) tuples, in the selected column order
)

sku_tuples


[(1912, 7), (377, 1), (715, 7)]

In [1]:
# Shape of the distinct (productID, storeID) pairs in the cleaned frame.
# Requires df_clean, so the cells above must have been run first.
df_clean.select(pl.col("productID"), pl.col("storeID")).unique().shape

NameError: name 'df_clean' is not defined

In [19]:
# Display the sampled (productID, storeID) pairs.
sku_tuples

[(2419, 2), (1729, 9), (131, 10)]

In [32]:
# Spot check for the pair (productID=1912, storeID=7): sum of its target_lag_1 values.
(
    df_clean
    .filter((pl.col("productID") == 1912) & (pl.col("storeID") == 7))
    .select("target_lag_1")
    .sum()
)


target_lag_1
f64
81.0
