In [1]:
# notebooks/04_sessionize_and_prefix_target.ipynb
# --- Sessionization + Prefix->Target generation (1 hour gap) ---

import pandas as pd
from pathlib import Path
import numpy as np
import tqdm

# Parameters
TIME_GAP_SECONDS = 60 * 60   # 1 hour
MIN_SESSION_LEN = 2
MAX_SESSION_LEN = 200        # truncate or drop very long sessions
MAX_PREFIX_LEN = 50          # optional cap for prefix length when saving
DATA_DIR = Path("../data/processed")

def sessionize_df(df, user_col="user_id", time_col="timestamp", gap_seconds=TIME_GAP_SECONDS):
    """
    Input: df with columns [dataset, user_id, session_id (placeholder), item_id, timestamp, interaction_type]
    Returns: df with new column 'session_id_real' and session-level summary DF
    """
    # Ensure timestamp is datetime
    df = df.copy()
    df[time_col] = pd.to_datetime(df[time_col])
    df = df.sort_values([user_col, time_col])

    # compute time diff per user
    df['time_diff'] = df.groupby(user_col)[time_col].diff().dt.total_seconds()
    # start new session when time_diff is NaN (first event) or > gap
    df['new_session'] = (df['time_diff'].isna()) | (df['time_diff'] > gap_seconds)
    # cumulative sum per user to get session index
    df['session_idx'] = df.groupby(user_col)['new_session'].cumsum().astype(int)

    # build a real session id: f"{user_id}__s{session_idx}"
    df['session_id_real'] = df[user_col].astype(str) + "__s" + df['session_idx'].astype(str)

    # session summary
    session_summary = (
        df.groupby('session_id_real').agg(
            dataset=('dataset', 'first'),
            user_id=(user_col, 'first'),
            session_idx=('session_idx', 'first'),
            start_time=(time_col, 'min'),
            end_time=(time_col, 'max'),
            session_length=('item_id', 'size')
        ).reset_index()
    )

    # filter by length
    valid_sessions = session_summary[
        (session_summary['session_length'] >= MIN_SESSION_LEN) &
        (session_summary['session_length'] <= MAX_SESSION_LEN)
    ]['session_id_real']

    df = df[df['session_id_real'].isin(valid_sessions)].copy()
    df = df.drop(columns=['time_diff', 'new_session', 'session_idx'])

    return df, session_summary

def generate_prefix_target(df, item_col='item_id', session_col='session_id_real', max_prefix_len=MAX_PREFIX_LEN):
    """
    For each session (ordered by timestamp), generate prefix->target pairs:
      for seq [i1, i2, i3], generate:
        ([i1], i2), ([i1,i2], i3)
    Return dataframe with columns:
      dataset, user_id, session_id_real, prefix (list or str), prefix_len, target
    """
    rows = []
    # ensure ordering
    df = df.sort_values([session_col, 'timestamp'])
    grouped = df.groupby(session_col)
    for session_id, g in tqdm.tqdm(grouped, desc="Generating prefix-target"):
        items = g[item_col].astype(str).tolist()
        if len(items) < 2:
            continue
        # optional: cap session items to last N
        if len(items) > max_prefix_len + 1:
            items = items[-(max_prefix_len+1):]  # keep last (max_prefix_len + 1) items
        for t in range(1, len(items)):
            prefix = items[:t]
            target = items[t]
            rows.append({
                'dataset': g['dataset'].iloc[0],
                'user_id': g['user_id'].iloc[0],
                'session_id_real': session_id,
                'prefix_len': len(prefix),
                # store prefix as space-separated string to save space; you can parse back with .split()
                'prefix': " ".join(prefix),
                'target': target
            })
    out = pd.DataFrame(rows)
    return out

# ------- Run for a dataset function ----------
def process_dataset(dataset_name):
    print(f"\n--- Processing {dataset_name} ---")
    in_path = DATA_DIR / f"{dataset_name}_interactions.parquet"
    assert in_path.exists(), f"{in_path} not found"
    df = pd.read_parquet(in_path)

    # sessionize only if session_id currently equals user_id placeholder
    # (For yoochoose we already have real session ids; but it's safe to run — we'll detect that)
    # If there's already many distinct session ids that are not just equal to user_id, you can skip.
    df_sessionized, session_summary = sessionize_df(df, user_col='user_id', time_col='timestamp')

    # Save sessionized interactions
    out_sessions_path = DATA_DIR / f"{dataset_name}_sessions.parquet"
    df_sessionized.to_parquet(out_sessions_path, index=False)
    print(f"Saved sessionized interactions to: {out_sessions_path}")

    # Save session summary
    out_summary = DATA_DIR / f"{dataset_name}_session_summary.csv"
    session_summary.to_csv(out_summary, index=False)
    print(f"Saved session summary to: {out_summary}")

    # Generate prefix-target pairs
    pairs = generate_prefix_target(df_sessionized, item_col='item_id', session_col='session_id_real')
    out_pairs = DATA_DIR / f"{dataset_name}_prefix_target.parquet"
    pairs.to_parquet(out_pairs, index=False)
    print(f"Saved prefix->target pairs to: {out_pairs}")

    # Print quick stats
    print("Session summary stats (before filtering):")
    print(session_summary['session_length'].describe())
    print("After filtering sessions:", session_summary[session_summary['session_length'] >= MIN_SESSION_LEN].shape[0])
    print("Prefix-target pairs generated:", len(pairs))
    return df_sessionized, session_summary, pairs

# Datasets to process:
datasets = ["amazon_books_2023", "mars"]  # yoochoose already ok
results = {}
for ds_name in datasets:
    df_sess, sess_summary, pairs = process_dataset(ds_name)
    results[ds_name] = (df_sess, sess_summary, pairs)

# Quick sanity check prints
for ds_name, (df_sess, summary, pairs) in results.items():
    print(f"\n=== {ds_name} ===")
    print("Interactions after sessionization:", len(df_sess))
    print("Unique sessions:", summary.shape[0])
    print("Prefix-target pairs:", len(pairs))
    print("Sample session summary head:")
    print(summary.head())
    print("Sample prefix-target head:")
    print(pairs.head())



--- Processing amazon_books_2023 ---
Saved sessionized interactions to: ..\data\processed\amazon_books_2023_sessions.parquet
Saved session summary to: ..\data\processed\amazon_books_2023_session_summary.csv


Generating prefix-target: 100%|██████████| 2788253/2788253 [07:49<00:00, 5934.23it/s]


Saved prefix->target pairs to: ..\data\processed\amazon_books_2023_prefix_target.parquet
Session summary stats (before filtering):
count    2.162191e+07
mean     1.252362e+00
std      1.170748e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.150000e+02
Name: session_length, dtype: float64
After filtering sessions: 2788255
Prefix-target pairs generated: 5441026

--- Processing mars ---
Saved sessionized interactions to: ..\data\processed\mars_sessions.parquet
Saved session summary to: ..\data\processed\mars_session_summary.csv


Generating prefix-target: 100%|██████████| 549/549 [00:00<00:00, 5083.02it/s]

Saved prefix->target pairs to: ..\data\processed\mars_prefix_target.parquet
Session summary stats (before filtering):
count    1275.000000
mean        2.869804
std         4.694463
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max        50.000000
Name: session_length, dtype: float64
After filtering sessions: 549
Prefix-target pairs generated: 2384

=== amazon_books_2023 ===
Interactions after sessionization: 8244397
Unique sessions: 21621909
Prefix-target pairs: 5441026
Sample session summary head:





                     session_id_real            dataset  \
0   AE22236AFRRSMQIKGG7TPTB75QEA__s1  amazon_books_2023   
1  AE22236AFRRSMQIKGG7TPTB75QEA__s10  amazon_books_2023   
2  AE22236AFRRSMQIKGG7TPTB75QEA__s11  amazon_books_2023   
3  AE22236AFRRSMQIKGG7TPTB75QEA__s12  amazon_books_2023   
4  AE22236AFRRSMQIKGG7TPTB75QEA__s13  amazon_books_2023   

                        user_id  session_idx          start_time  \
0  AE22236AFRRSMQIKGG7TPTB75QEA            1 1999-07-30 03:53:39   
1  AE22236AFRRSMQIKGG7TPTB75QEA           10 2012-01-16 02:07:51   
2  AE22236AFRRSMQIKGG7TPTB75QEA           11 2012-01-21 16:09:22   
3  AE22236AFRRSMQIKGG7TPTB75QEA           12 2012-08-04 01:19:59   
4  AE22236AFRRSMQIKGG7TPTB75QEA           13 2012-12-09 02:35:09   

             end_time  session_length  
0 1999-07-30 03:53:39               1  
1 2012-01-16 02:07:51               1  
2 2012-01-21 16:09:22               1  
3 2012-08-04 01:19:59               1  
4 2012-12-09 02:35:09               