In [1]:
import os
import pandas as pd
from tqdm import tqdm

In [2]:
INTERIM_PATH = "../.data/interim"
PROCESSED_PATH = "../.data/processed"

In [3]:
import logging

logger = logging.getLogger(__name__)

In [42]:
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    handlers=[logging.StreamHandler()]
)

In [47]:
def build_user_features_from_chunk_main(df: pd.DataFrame) -> pd.DataFrame:
    logger.info("Converting event_time to datetime and extracting date")
    df['event_time'] = pd.to_datetime(df['event_time'])
    df['date'] = df['event_time'].dt.date

    logger.info("Filtering purchase events")
    purchase_df = df[df['event_type'] == 'purchase']

    logger.info("Grouping by user_id")
    grouped = df.groupby('user_id')

    logger.info("Aggregating user features")
    features = grouped.agg(
        count_view=('event_type', lambda x: (x == 'view').sum()),
        count_cart=('event_type', lambda x: (x == 'cart').sum()),
        count_purchase=('event_type', lambda x: (x == 'purchase').sum()),
        unique_sessions=('user_session', pd.Series.nunique),
        fav_main_category=('main_category', lambda x: x.mode(
        ).iloc[0] if not x.mode().empty else 'unknown'),
        fav_sub_category=('sub_category', lambda x: x.mode(
        ).iloc[0] if not x.mode().empty else 'unknown'),
        first_event=('event_time', 'min'),
        last_event=('event_time', 'max'),
        active_days=('date', 'nunique')
    )

    logger.info("Computing total and average purchase amounts")
    purchase_stats = purchase_df.groupby('user_id')['price'].agg(['sum', 'mean']).rename(
        columns={'sum': 'total_spent', 'mean': 'avg_purchase_price'}
    )

    logger.info("Merging features with purchase statistics")
    features = features.join(purchase_stats, how='left')

    logger.info("Calculating recency_days")
    latest_date = features['last_event'].max()
    features['recency_days'] = (latest_date - features['last_event']).dt.days

    features[["total_spent", "avg_purchase_price"]] = features[[
        "total_spent", "avg_purchase_price"]].fillna(0)

    logger.info("Reordering final feature columns")
    features = features[
        [
            "count_view", "count_cart", "count_purchase",
            "unique_sessions", "fav_main_category", "fav_sub_category",
            "active_days", "recency_days",
            "total_spent", "avg_purchase_price",
            "first_event", "last_event"
        ]
    ]

    logger.info("Feature extraction complete")
    return features.reset_index()

In [None]:
def build_all_features_main():
    logger.info("Starting user feature extraction for all chunks")
    os.makedirs(PROCESSED_PATH, exist_ok=True)

    files = sorted([f for f in os.listdir(
        INTERIM_PATH) if f.endswith(".parquet")])
    logger.info("Found %d chunks to process", len(files))

    for i, file in enumerate(tqdm(files, desc="Building user features from chunks")):
        chunk_path = os.path.join(INTERIM_PATH, file)
        logger.info("Reading chunk %s", file)
        df = pd.read_parquet(chunk_path)

        logger.info("Building features from chunk %s", i)
        user_features = build_user_features_from_chunk_main(df)

        output_path = os.path.join(
            PROCESSED_PATH, f"user_features_chunk_{i}.parquet")
        user_features.to_parquet(output_path, index=False)
        logger.info("Saved user features for chunk %s to %s", i, output_path)

    logger.info("All chunks processed and saved successfully.")

In [49]:
build_all_features_main()

2025-05-31 16:07:13,659 | INFO | __main__ | Starting user feature extraction for all chunks
2025-05-31 16:07:13,661 | INFO | __main__ | Found 17 chunks to process
Building user features from chunks:   0%|          | 0/17 [00:00<?, ?it/s]2025-05-31 16:07:13,733 | INFO | __main__ | Reading chunk cleaned_chunk_0.parquet
2025-05-31 16:07:24,840 | INFO | __main__ | Building features from chunk 0
2025-05-31 16:07:24,840 | INFO | __main__ | Converting event_time to datetime and extracting date
2025-05-31 16:07:31,780 | INFO | __main__ | Filtering purchase events
2025-05-31 16:07:33,280 | INFO | __main__ | Grouping by user_id
2025-05-31 16:07:33,281 | INFO | __main__ | Aggregating user features
2025-05-31 16:35:15,336 | INFO | __main__ | Computing total and average purchase amounts
2025-05-31 16:35:15,410 | INFO | __main__ | Merging features with purchase statistics
2025-05-31 16:35:15,594 | INFO | __main__ | Calculating recency_days
2025-05-31 16:35:15,715 | INFO | __main__ | Reordering final

In [50]:
def merge_all_user_feature_chunks_aggregated():
    logger.info("Merging all user feature chunks with user-level aggregation")

    all_chunks = []
    files = sorted([
        f for f in os.listdir(PROCESSED_PATH)
        if f.startswith("user_features_chunk_") and f.endswith(".parquet")
    ])

    for file in tqdm(files, desc="Loading feature chunks"):
        path = os.path.join(PROCESSED_PATH, file)
        df = pd.read_parquet(path)
        all_chunks.append(df)

    full_df = pd.concat(all_chunks, ignore_index=True)
    logger.info("Concatenated shape: %s", full_df.shape)

    # Group by user_id and aggregate numeric features
    logger.info("Aggregating user features for duplicate user_ids")
    aggregated = full_df.groupby("user_id").agg({
        "count_view": "sum",
        "count_cart": "sum",
        "count_purchase": "sum",
        "unique_sessions": "sum",
        "active_days": "sum",
        "recency_days": "min",  # most recent chunk is better
        "total_spent": "sum",
        "avg_purchase_price": "mean",  # simple avg across chunks
        "first_event": "min",
        "last_event": "max",
        "fav_main_category": lambda x: x.mode().iloc[0] if not x.mode().empty else "unknown",
        "fav_sub_category": lambda x: x.mode().iloc[0] if not x.mode().empty else "unknown"
    }).reset_index()

    output_path = os.path.join(PROCESSED_PATH, "user_features.parquet")
    aggregated.to_parquet(output_path, index=False)
    logger.info("Saved final aggregated user features to %s", output_path)

    return aggregated

In [51]:
merge_all_user_feature_chunks_aggregated()

2025-05-31 23:31:20,252 | INFO | __main__ | Merging all user feature chunks with user-level aggregation
Loading feature chunks: 100%|██████████| 17/17 [00:04<00:00,  3.54it/s]
2025-05-31 23:31:26,067 | INFO | __main__ | Concatenated shape: (36312031, 13)
2025-05-31 23:31:26,068 | INFO | __main__ | Aggregating user features for duplicate user_ids
2025-06-01 00:53:04,921 | INFO | __main__ | Saved final aggregated user features to ../.data/processed\user_features.parquet


Unnamed: 0,user_id,count_view,count_cart,count_purchase,unique_sessions,active_days,recency_days,total_spent,avg_purchase_price,first_event,last_event,fav_main_category,fav_sub_category
0,10300217,1,0,0,1,1,9,0.0,0.0,2019-11-06 06:51:52+00:00,2019-11-06 06:51:52+00:00,unknown,unknown
1,12511517,2,0,0,2,2,5,0.0,0.0,2020-02-24 05:52:05+00:00,2020-03-08 17:23:57+00:00,apparel,shoes.moccasins
2,22165363,14,0,0,13,9,0,0.0,0.0,2020-01-30 08:17:02+00:00,2020-03-13 05:04:50+00:00,computers,bedroom.bed
3,27396220,1,0,0,1,1,6,0.0,0.0,2020-04-10 04:21:00+00:00,2020-04-10 04:21:00+00:00,construction,components.faucet
4,29515875,23,0,0,14,9,2,0.0,0.0,2019-11-10 02:08:39+00:00,2020-04-23 05:03:44+00:00,furniture,bedroom.bed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15639798,649775813,1,0,0,1,1,0,0.0,0.0,2020-04-30 23:58:54+00:00,2020-04-30 23:58:54+00:00,apparel,shoes
15639799,649775850,1,0,0,1,1,0,0.0,0.0,2020-04-30 23:59:04+00:00,2020-04-30 23:59:04+00:00,furniture,universal.light
15639800,649775918,1,0,0,1,1,0,0.0,0.0,2020-04-30 23:59:26+00:00,2020-04-30 23:59:26+00:00,apparel,shoes
15639801,649775938,2,0,0,1,1,0,0.0,0.0,2020-04-30 23:59:33+00:00,2020-04-30 23:59:37+00:00,construction,tools.light


In [45]:
def build_user_features_from_chunk_main2(df: pd.DataFrame) -> pd.DataFrame:
    df['event_time'] = pd.to_datetime(df['event_time'])
    df['date'] = df['event_time'].dt.date
    print("from event_type to purchase")
    purchase_df = df[df['event_type'] == 'purchase']
    print("grouping by user_id")
    grouped = df.groupby('user_id')
    print("starting the aggregation")
    features = grouped.agg(
        count_view=('event_type', lambda x: (x == 'view').sum()),
        count_cart=('event_type', lambda x: (x == 'cart').sum()),
        count_purchase=('event_type', lambda x: (x == 'purchase').sum()),
        unique_sessions=('user_session', pd.Series.nunique),
        fav_main_category=('main_category', lambda x: x.mode(
        ).iloc[0] if not x.mode().empty else 'unknown'),
        fav_sub_category=('sub_category', lambda x: x.mode(
        ).iloc[0] if not x.mode().empty else 'unknown'),
        first_event=('event_time', 'min'),
        last_event=('event_time', 'max'),
        active_days=('date', 'nunique')
    )
    print("aggregation finished, starting total price and average")
    # Total & average purchase price
    purchase_stats = purchase_df.groupby('user_id')['price'].agg(['sum', 'mean']).rename(
        columns={'sum': 'total_spent', 'mean': 'avg_purchase_price'})

    # Merge
    print("merging")
    features = features.join(purchase_stats, how='left')
    latest_date = features['last_event'].max()
    features['recency_days'] = (latest_date - features['last_event']).dt.days
    
    features[["total_spent", "avg_purchase_price"]] = features[[
        "total_spent", "avg_purchase_price"]].fillna(0)
    
    features = features[
        [
            "count_view", "count_cart", "count_purchase",
            "unique_sessions", "fav_main_category", "fav_sub_category",
            "active_days", "recency_days",
            "total_spent", "avg_purchase_price",
            "first_event", "last_event"
        ]
    ]

    return features.reset_index()

In [46]:
def build_all_features_main2():
    """
    Process ONLY the first .parquet file in data/interim to test feature extraction.
    """
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    files = sorted([f for f in os.listdir(
        INTERIM_PATH) if f.endswith(".parquet")])
    print("Building user features from chunks")
    for i, file in enumerate(tqdm(files, desc="Building user features from chunks")):
        chunk_path = os.path.join(INTERIM_PATH, file)
        df = pd.read_parquet(chunk_path)

        user_features = build_user_features_from_chunk_main2(df)
        user_features.to_parquet(os.path.join(PROCESSED_PATH, f"user_features_chunk_{i}.parquet"), index=False)


In [None]:
def merge_all_user_features_main():
    """
    Merge all processed per-user parquet chunks into a single user_features.parquet file.
    """
    files = sorted([f for f in os.listdir(PROCESSED_PATH)
                   if f.startswith("user_features_chunk")])
    all_chunks = []

    for file in tqdm(files, desc="Merging all user features"):
        chunk = pd.read_parquet(os.path.join(PROCESSED_PATH, file))
        all_chunks.append(chunk)

    df_all = pd.concat(all_chunks).groupby("user_id").agg({
        **{col: 'sum' for col in ["count_view", "count_cart", "count_remove_from_cart", "count_purchase", "total_spent", "unique_sessions", "active_days"]},
        "fav_sub_category": lambda x: x.mode().iloc[0] if not x.mode().empty else "unknown"
    }).reset_index()

    df_all.to_parquet(os.path.join(
        PROCESSED_PATH, "user_features.parquet"), index=False)
    print("Final user_features.parquet saved.")

In [None]:
def build_user_features_from_chunk(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate user-level features from a chunk of cleaned data.
    """
    df['event_time'] = pd.to_datetime(df['event_time'])

    # Count events per type
    event_counts = df.pivot_table(index='user_id',
                                  columns='event_type',
                                  aggfunc='size',
                                  fill_value=0)

    event_counts.columns = [f"count_{col}" for col in event_counts.columns]

    # Total amount spent per user
    df_purchase = df[df['event_type'] == 'purchase']
    total_spent = df_purchase.groupby(
        'user_id')['price'].sum().rename("total_spent")

    # Number of sessions per user
    session_count = df.groupby(
        'user_id')['user_session'].nunique().rename("unique_sessions")

    # Most frequent sub-category
    top_sub_category = df.groupby('user_id')['sub_category'].agg(lambda x: x.mode(
    ).iloc[0] if not x.mode().empty else "unknown").rename("fav_sub_category")

    # First and last event time per user
    first_event = df.groupby('user_id')[
        'event_time'].min().rename("first_event")
    last_event = df.groupby('user_id')['event_time'].max().rename("last_event")

    # Combine all features
    features = pd.concat([event_counts, total_spent, session_count,
                         top_sub_category, first_event, last_event], axis=1).reset_index()

    # Add recency and activity length
    features["active_days"] = (
        features["last_event"] - features["first_event"]).dt.days + 1
    
    features[["total_spent", "avg_purchase_price"]] = features[[
        "total_spent", "avg_purchase_price"]].fillna(0)

    return features.drop(columns=["first_event", "last_event"])

In [None]:
def build_all_features_main():
    """
    Process ONLY the first .parquet file in data/interim to test feature extraction.
    """
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    files = sorted([f for f in os.listdir(
        INTERIM_PATH) if f.endswith(".parquet")])

    # Test only on the first file
    test_file = files[2]
    print(f"🧪 Testing with file: {test_file}")
    chunk_path = os.path.join(INTERIM_PATH, test_file)
    df = pd.read_parquet(chunk_path)
    print("Start building fatures")
    user_features = build_user_features_from_chunk_main(df)
    print("Features builded and saving new file")
    user_features.to_parquet(os.path.join(
        PROCESSED_PATH, "user_features_chunk_test.parquet"), index=False)
    print("Test features saved as user_features_chunk_test2.parquet")

In [24]:
build_all_features_test()

🧪 Testing with file: cleaned_chunk_1.parquet
Start building fatures
Features builded and saving new file
Test features saved as user_features_chunk_test1.parquet


In [27]:
build_all_features_test()

🧪 Testing with file: cleaned_chunk_10.parquet
Start building fatures
from event_type to purchase
grouping by user_id
starting the aggregation


KeyboardInterrupt: 

In [None]:
def build_all_features():
    """
    Process ONLY the first .parquet file in data/interim to test feature extraction.
    """
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    files = sorted([f for f in os.listdir(
        INTERIM_PATH) if f.endswith(".parquet")])

    # Test only on the first file
    test_file = files[1]
    print(f"🧪 Testing with file: {test_file}")
    chunk_path = os.path.join(INTERIM_PATH, test_file)
    df = pd.read_parquet(chunk_path)
    print("Start building fatures")
    user_features = build_user_features_from_chunk(df)
    print("Features builded and saving new file")
    user_features.to_parquet(os.path.join(
        PROCESSED_PATH, "user_features_chunk_test.parquet"), index=False)
    print("Test features saved as user_features_chunk_test.parquet")

In [16]:
chuncked = pd.read_parquet(os.path.join(INTERIM_PATH, "cleaned_chunk_0.parquet"))

In [17]:
chuncked.head(10)

Unnamed: 0,event_time,event_type,brand,price,user_id,user_session,main_category,sub_category
0,2019-12-01 00:00:00+00:00,view,apple,1302.48,556695836,ca5eefc5-11f9-450c-91ed-380285a0bc80,construction,tools.light
1,2019-12-01 00:00:00+00:00,view,force,102.96,577702456,de33debe-c7bf-44e8-8a12-3bf8421f842a,unknown,unknown
2,2019-12-01 00:00:01+00:00,view,bosch,313.52,539453785,5ee185a7-0689-4a33-923d-ba0130929a76,appliances,personal.massager
3,2019-12-01 00:00:02+00:00,purchase,unknown,132.31,535135317,61792a26-672f-4e61-9832-7b63bb1714db,computers,peripherals.printer
4,2019-12-01 00:00:02+00:00,view,nika,101.68,517987650,906c6ca8-ff5c-419a-bde9-967ba8e2233e,apparel,trousers
5,2019-12-01 00:00:02+00:00,view,ikea,163.56,542860793,a1bcb550-1065-4769-a80a-0ccb4bcee78d,accessories,umbrella
6,2019-12-01 00:00:02+00:00,view,unknown,88.81,538021416,e88f77cc-e75e-4e9f-9ef6-ef1a302ed50a,electronics,clocks
7,2019-12-01 00:00:03+00:00,view,xiaomi,256.38,525740700,370e8c88-3d07-41df-9aaa-2adf5a0bf312,construction,tools.light
8,2019-12-01 00:00:04+00:00,view,jet,20.57,512509221,4227259f-1c4c-41dc-84b5-9354d864eefa,computers,notebook
9,2019-12-01 00:00:04+00:00,view,unknown,179.16,553345124,58c692ff-c7a9-4e35-9ec4-58598f1940e0,construction,components.faucet


In [18]:
chuncked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000000 entries, 0 to 29999999
Data columns (total 8 columns):
 #   Column         Dtype              
---  ------         -----              
 0   event_time     datetime64[ns, UTC]
 1   event_type     object             
 2   brand          object             
 3   price          float64            
 4   user_id        int64              
 5   user_session   object             
 6   main_category  object             
 7   sub_category   object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(5)
memory usage: 1.8+ GB


In [8]:
build_all_features()

🧪 Testing with file: cleaned_chunk_0.parquet
Start building fatures
Features builded and saving new file
Test features saved as user_features_chunk_test.parquet


In [None]:
test_features_1 = pd.read_parquet(os.path.join(
    PROCESSED_PATH, "user_features_chunk_test.parquet"))

In [32]:
test_features_2 = pd.read_parquet(os.path.join(
    PROCESSED_PATH, "user_features_chunk_test.parquet"))

In [33]:
test_features_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2624800 entries, 0 to 2624799
Data columns (total 13 columns):
 #   Column              Dtype              
---  ------              -----              
 0   user_id             int64              
 1   count_view          int64              
 2   count_cart          int64              
 3   count_purchase      int64              
 4   unique_sessions     int64              
 5   fav_main_category   object             
 6   fav_sub_category    object             
 7   first_event         datetime64[ns, UTC]
 8   last_event          datetime64[ns, UTC]
 9   active_days         int64              
 10  total_spent         float64            
 11  avg_purchase_price  float64            
 12  recency_days        int64              
dtypes: datetime64[ns, UTC](2), float64(2), int64(7), object(2)
memory usage: 260.3+ MB


In [34]:
test_features_2.head(10)

Unnamed: 0,user_id,count_view,count_cart,count_purchase,unique_sessions,fav_main_category,fav_sub_category,first_event,last_event,active_days,total_spent,avg_purchase_price,recency_days
0,30493659,2,0,0,2,construction,tools.light,2019-12-22 18:49:44+00:00,2019-12-23 04:50:04+00:00,2,,,5
1,32836036,1,0,0,1,unknown,unknown,2019-12-26 02:49:31+00:00,2019-12-26 02:49:31+00:00,1,,,2
2,39480587,12,1,0,3,appliances,kitchen.hob,2019-12-22 16:56:26+00:00,2019-12-24 19:39:22+00:00,3,,,3
3,40484041,3,0,0,1,appliances,kitchen.dishwasher,2019-12-18 09:21:46+00:00,2019-12-18 09:22:26+00:00,1,,,10
4,49484535,3,0,0,3,accessories,wallet,2019-12-20 18:30:18+00:00,2019-12-23 05:49:37+00:00,3,,,5
5,58438489,1,0,0,1,apparel,shorts,2019-12-28 05:42:00+00:00,2019-12-28 05:42:00+00:00,1,,,0
6,62336140,1,0,0,1,unknown,unknown,2019-12-22 16:19:22+00:00,2019-12-22 16:19:22+00:00,1,,,5
7,63518127,17,0,0,2,apparel,bicycle,2019-12-24 05:32:38+00:00,2019-12-24 07:38:27+00:00,1,,,4
8,68576588,1,0,0,1,electronics,clocks,2019-12-27 14:37:59+00:00,2019-12-27 14:37:59+00:00,1,,,1
9,70829073,5,0,0,3,unknown,unknown,2019-12-20 16:45:02+00:00,2019-12-22 16:35:09+00:00,2,,,5


In [31]:
test_features_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2513903 entries, 0 to 2513902
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           int64  
 1   count_cart        int64  
 2   count_purchase    int64  
 3   count_view        int64  
 4   total_spent       float64
 5   unique_sessions   int64  
 6   fav_sub_category  object 
 7   active_days       int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 153.4+ MB


In [10]:
test_features_1.describe()

Unnamed: 0,user_id,count_cart,count_purchase,count_view,total_spent,unique_sessions,active_days
count,2513903.0,2513903.0,2513903.0,2513903.0,233501.0,2513903.0,2513903.0
mean,553341200.0,0.5614318,0.1873227,11.18488,604.777046,2.741512,3.272886
std,28346130.0,2.381645,1.200075,32.11536,1768.718047,19.51251,4.041068
min,29515880.0,0.0,0.0,0.0,0.85,1.0,1.0
25%,525465200.0,0.0,0.0,1.0,102.94,1.0,1.0
50%,557382900.0,0.0,0.0,4.0,230.64,1.0,1.0
75%,581345500.0,0.0,0.0,11.0,591.63,3.0,4.0
max,588084800.0,391.0,315.0,23388.0,236749.03,22938.0,17.0


In [14]:
test_features_1.head(10)

Unnamed: 0,user_id,count_cart,count_purchase,count_view,total_spent,unique_sessions,fav_sub_category,active_days
0,29515875,0,0,1,,1,bedroom.bed,1
1,31198833,0,0,3,,1,tools.light,1
2,34916060,0,0,1,,1,video.projector,1
3,38661019,0,0,2,,1,shoes.sandals,1
4,42896738,0,0,1,,1,tools.light,1
5,49484535,0,0,2,,2,unknown,8
6,56931866,0,0,5,,2,kitchen.refrigerators,8
7,62336140,0,0,1,,1,shoes,1
8,63518127,0,0,1,,1,kitchen.chair,1
9,65746813,0,0,10,,9,audio.headphone,1


In [35]:
print(test_features_2.isnull().sum())

user_id                     0
count_view                  0
count_cart                  0
count_purchase              0
unique_sessions             0
fav_main_category           0
fav_sub_category            0
first_event                 0
last_event                  0
active_days                 0
total_spent           2354202
avg_purchase_price    2354202
recency_days                0
dtype: int64


In [None]:
def build_all_features():
    """
    Process all .parquet files in data/interim and save user features in data/processed.
    """
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    files = sorted([f for f in os.listdir(
        INTERIM_PATH) if f.endswith(".parquet")])

    for i, file in enumerate(tqdm(files, desc="Building user features from chunks")):
        chunk_path = os.path.join(INTERIM_PATH, file)
        df = pd.read_parquet(chunk_path)

        user_features = build_user_features_from_chunk(df)
        user_features.to_parquet(os.path.join(
            PROCESSED_PATH, f"user_features_chunk_{i}.parquet"), index=False)


In [None]:
def merge_all_user_features():
    """
    Merge all processed per-user parquet chunks into a single user_features.parquet file.
    """
    files = sorted([f for f in os.listdir(PROCESSED_PATH)
                   if f.startswith("user_features_chunk")])
    all_chunks = []

    for file in tqdm(files, desc="Merging all user features"):
        chunk = pd.read_parquet(os.path.join(PROCESSED_PATH, file))
        all_chunks.append(chunk)

    df_all = pd.concat(all_chunks).groupby("user_id").agg({
        **{col: 'sum' for col in ["count_view", "count_cart", "count_remove_from_cart", "count_purchase", "total_spent", "unique_sessions", "active_days"]},
        "fav_sub_category": lambda x: x.mode().iloc[0] if not x.mode().empty else "unknown"
    }).reset_index()

    df_all.to_parquet(os.path.join(
        PROCESSED_PATH, "user_features.parquet"), index=False)
    print("Final user_features.parquet saved.")

In [None]:
if __name__ == "__main__":
    build_all_features()
    merge_all_user_features()