### Generate advanced user features
* Reference - https://www.kaggle.com/code/alexvishnevskiy/ranking-user-features/notebook

In [None]:
import sys
sys.path.append('/Users/jasonlin/Desktop/hm-competition')

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from src.advance_user_features import (
    UserFeaturesCollector,
    AggrFeatures, 
    CountFeatures, 
    CustomerFeatures,
    ArticlesFeatures,
)

In [None]:
# Directory that holds the raw competition CSV files.
base_pth = Path('/Users/jasonlin/Desktop/hm-competition/hm_datasets')

# Read the three source tables: transactions, customers and articles.
transactions_df = pd.read_csv(base_pth.joinpath('transactions_train.csv'))
customers_df = pd.read_csv(base_pth.joinpath('customers.csv'))
articles_df = pd.read_csv(base_pth.joinpath('articles.csv'))

# `t_dat` comes back from read_csv unparsed; convert it to datetimes so the
# date-range logic further down can compare it against real dates.
transactions_df['t_dat'] = pd.to_datetime(transactions_df['t_dat'])

In [None]:
%%time
# Build the user-level feature table over the full data set by handing four
# feature extractors to the collector:
#   - AggrFeatures / CountFeatures are built from the transactions table,
#   - CustomerFeatures from the customers table,
#   - ArticlesFeatures from transactions together with the articles table.
# NOTE(review): the meaning of the positional `3` passed to CountFeatures and
# ArticlesFeatures is defined in src.advance_user_features — presumably a
# top-N style setting; confirm there.
user_features = UserFeaturesCollector.collect([
    AggrFeatures(transactions_df),
    CountFeatures(transactions_df, 3),
    CustomerFeatures(customers_df),
    ArticlesFeatures(transactions_df, articles_df, 3),
])

In [None]:
# Persist the full-data user features next to the raw CSV files.
# `data_path` was never defined in this notebook (NameError); `base_pth` is the
# data directory defined in the loading cell, so write there instead.
user_features.to_parquet(base_pth/'user_features.parquet')

In [None]:
### Generate user features per age bin & season
def _add_season_column(transaction_df):
    """Add a ``season`` column derived from the month of ``t_dat``.

    Months map to seasons as follows: March-May -> ``"spring"``,
    June-August -> ``"summer"``, September-November -> ``"fall"``,
    December-February -> ``"winter"``. The mapping depends only on the
    calendar month, so it works for any year rather than for a fixed set of
    hard-coded date windows.

    Args:
        transaction_df: DataFrame with a datetime-typed ``t_dat`` column.

    Returns:
        The same DataFrame object that was passed in, with a string
        ``season`` column added (or overwritten). Rows whose ``t_dat`` is
        missing (NaT) get ``"unknown"``.

    Note:
        The frame is modified in place; pass a copy if the caller's frame
        must stay untouched.
    """
    month = transaction_df["t_dat"].dt.month
    conditions = [
        month.isin([3, 4, 5]),
        month.isin([6, 7, 8]),
        month.isin([9, 10, 11]),
        month.isin([12, 1, 2]),
    ]
    choices = ["spring", "summer", "fall", "winter"]
    # An explicit string default keeps the result a pure string array;
    # np.select's implicit default is the integer 0, which does not mix
    # cleanly with string choices.
    transaction_df["season"] = np.select(conditions, choices, default="unknown")
    return transaction_df

In [None]:
# Work on a copy of the customers table that carries an extra `age_bins`
# column: ages are cut into the intervals (-1, 19], (19, 29], (29, 39],
# (39, 49], (49, 69], (69, 119] and the interval is stored as its string form.
customers_df_with_age_bins = customers_df.assign(
    age_bins=pd.cut(customers_df['age'], [-1, 19, 29, 39, 49, 69, 119]).astype(str)
)

# Tag every transaction with its season. The helper writes the column into
# `transactions_df` itself and returns that same frame.
transactions_df_with_season = _add_season_column(transactions_df)

In [None]:
# Build and save one user-feature table for every (season, age bin) pair.
seasons = list(transactions_df_with_season['season'].unique())
age_bins = list(customers_df_with_age_bins['age_bins'].unique())

# Make sure the output directory exists before the first write.
out_dir = base_pth/'cust_features'
out_dir.mkdir(parents=True, exist_ok=True)

for season in tqdm(seasons):
    # The season slice does not depend on the age bin, so filter the (large)
    # transactions table once per season instead of once per pair.
    season_trans = transactions_df_with_season[
        transactions_df_with_season['season'] == season
    ].drop(columns=['season'])

    for age_bin in tqdm(age_bins):
        # Hand every collector run its own copy, as the original loop did.
        tmp_trans = season_trans.copy()

        tmp_cus = customers_df_with_age_bins[customers_df_with_age_bins['age_bins'] == age_bin].copy()
        tmp_cus = tmp_cus.drop(columns=['age_bins'])

        # NOTE(review): tmp_trans is not restricted to the customers in
        # tmp_cus; whether the collector aligns the two is decided inside
        # src.advance_user_features — confirm that this is intended.
        user_features = UserFeaturesCollector.collect([
            AggrFeatures(tmp_trans),
            CountFeatures(tmp_trans, 3),
            CustomerFeatures(tmp_cus, use_age=False),
            ArticlesFeatures(tmp_trans, articles_df, 3),
        ])
        user_features.to_parquet(out_dir/f'user_{season}_{age_bin}_features.parquet')
    