# EDA
Load datasets from `../data` and preview.


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd

# Collect every .csv / .npy file under ../data into a {file stem: DataFrame}
# mapping. Files with any other extension are skipped.
data_dir = Path('..') / 'data'
dataframes = {}

for path in sorted(data_dir.iterdir()):
    ext = path.suffix.lower()

    if ext == '.csv':
        dataframes[path.stem] = pd.read_csv(path)
        continue

    if ext != '.npy':
        continue

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
    # this is only safe as long as ../data contains trusted files.
    arr = np.load(path, allow_pickle=True)

    # A 1-D array becomes a single column named after the file; any other
    # array is passed to DataFrame with default column labels.
    column_names = [path.stem] if arr.ndim == 1 else None
    dataframes[path.stem] = pd.DataFrame(arr, columns=column_names)

In [None]:
from IPython.display import display

# Preview each loaded table: a banner with its name and shape, then its first rows.
for name in dataframes:
    df = dataframes[name]
    print(f'=== {name} | shape: {df.shape} ===')
    display(df.head())

In [None]:
### 1. Interaction Features: Negative + Click + Like + Comment + Gift
#### 1.1 Click
##### 1.1.1 Rename timestamp to imp_timestamp, Convert it to datetime feature 
##### 1.1.2  Derive is_click --> df_click

#### 1.2 like
##### 1.2.1 Rename timestamp to like_timestamp, Convert it to datetime feature
##### 1.2.2 Derive is_like
##### 1.2.3 Merge like -> latest prior click per user_id/live_id/streamer_id --> df_click_with_like


#### 1.3 comment
##### 1.3.1 Rename timestamp to comment_timestamp, Convert it to datetime feature 
##### 1.3.2 Derive is_comment
##### 1.3.3 Merge comment data -> latest prior click per user_id/live_id/streamer_id --> df_click_with_like_comment

#### 1.4 gift
##### 1.4.1 Rename timestamp to gift_timestamp, Convert it to datetime feature 
##### 1.4.2 Derive is_gift
##### 1.4.3 Merge gift data -> latest prior click per user_id/live_id/streamer_id --> df_click_with_like_comment_gift

#### 1.5 Negative
##### 1.5.1 Rename timestamp to imp_timestamp, Convert it to datetime feature 
##### 1.5.2 Derive is_click, watch_live_time, is_like, is_comment, is_gift, gift_price and set their values all as 0 
##### 1.5.3 Derive latest_like_ts, latest_comment_ts, latest_gift_ts and set as 0
##### 1.5.4 Vertically append with df_click_with_like_comment_gift to generate the final dataset "df_interactions"
##### 1.5.5 Sort df_interactions by imp_timestamp, user_id, live_id, streamer_id ---> df_interactions
##### 1.5.6 Derive imp_year/imp_month/imp_day/imp_hour/imp_is_weekend and reorder columns

### 2. User Features
#### 2.1 Convert reg_timestamp and first_watch_live_timestamp to datetime features
#### 2.2 rename features 
		# 1) age -->  user_age
		# 2) gender --> user_gender
		# 3) country --> user_country
		# 4) device_brand --> user_device_brand
		# 5) device_price --> user_device_price
		# 6) reg_timestamp --> user_reg_timestamp
        # 7) onehot_feat0 --- onehot_feat6 --> user_onehot_feat0 --- user_onehot_feat6
#### 2.3 Label encoding
		# 1) user_age --> user_age_le
		# 2) user_gender --> user_gender_le
		# 3) country --> user_country_le
		# 4) device_brand --> user_device_brand_le
		# 5) device_price --> user_device_price_le
		# 6) fans_num --> fans_num_le
        # 7) follow_num --> follow_num_le
		# 8) accu_watch_live_cnt --> accu_watch_live_cnt_le
		# 9) accu_watch_live_duration --> accu_watch_live_duration_le

### 3. Room features
#### 3.1 Convert p_date, start_timestamp and end_timestamp to datetime features
#### 3.2 Label encoding 
		# live_content_category --> see unique values and missingness ---> label encoding --> live_content_category_le
#### 3.3 New features 
		#xxx 1) time_since_live_start (ms) = imp_timestamp - start_timestamp (Pending after merge with interaction)
		# 2) live_start_year
		# 3) live_start_month
		# 4) live_start_day
		# 5) live_start_hour
		# 6) live_is_weekend
#### 3.4 merge title_embedding dataset by live_name_id
#### 3.5 Fill missing embedding with 0 and derive a flag variable title_emb_missing
#### 3.6 check missing of all variables




### 4. streamer features
#### 4.1 Convert reg_timestamp and first_live_timestamp to datetime features
#### 4.2 Rename
		# 1) age -->  streamer_age
		# 2) gender --> streamer_gender
		# 3) country --> streamer_country
		# 4) device_brand --> streamer_device_brand
		# 5) device_price --> streamer_device_price
		# 6) reg_timestamp --> streamer_reg_timestamp
		# 7) onehot_feat0 --- onehot_feat6 --> streamer_onehot_feat0 --- streamer_onehot_feat6
#### 4.3 Label Encoding
		# 1) streamer_age --> see unique values and missingness ---> label encoding --> streamer_age_le
		# 2) streamer_gender --> see unique values and missingness ---> label encoding --> streamer_gender_le (pending)
		# 3) streamer_country --> see unique values and missingness ---> label encoding --> streamer_country_le
		# 4) streamer_device_brand --> see unique values and missingness ---> streamer_device_brand_le
		# 5) streamer_device_price --> see unique values and missingness ---> streamer_device_price_le
		# 6) live_operation_tag --> see unique values and missingness ---> live_operation_tag_le
		# 7) fans_user_num --> see unique values and missingness ---> fans_user_num_le
		# 8) fans_group_fans_num --> see unique values and missingness ---> fans_group_fans_num_le
		# 9) follow_user_num --> see unique values and missingness ---> follow_user_num_le
		# 10) accu_live_cnt --> see unique values and missingness --->  accu_live_cnt_le
		# 11) accu_live_duration --> see unique values and missingness --->  accu_live_duration_le
		# 12) accu_play_cnt --> see unique values and missingness --->  accu_play_cnt_le
		# 13) accu_play_duration --> see unique values and missingness --->  accu_play_duration_le
#### 4.4 check missing of all variables



### 5. concatenation
#### 5.1 merge df_room + df_streamer by streamer_id --> df_room_streamer 
#### 5.2 merge df_interactions + df_user by user_id--> df_interaction_user (47?)
#### 5.3 merge df_interaction_user + df_room_streamer by streamer_id and live_id (223) 




### 6. Other derivation (Contextual / Temporal / Cross Features)
#### 6.1 User Features
##### 6.1.1 Basic
			# 1) user_account_age = imp_timestamp - user_reg_timestamp
			# 2) user_watch_live_age = imp_timestamp - first_watch_live_timestamp

##### 6.1.2 User CTR (pre-impression, denominator: impressions)
			# ctr_user_15min
			# ctr_user_3hr
			# ctr_user_1d
			# ctr_user_7d
##### 6.1.3 User exposure fatigue - Imp
			# num_imp_user_10min
			# num_imp_user_30min
			# num_imp_user_2hr
			# num_imp_user_12hr
			# num_imp_user_1d
			# num_imp_user_7d
##### 6.1.4 User click fatigue - click
			# num_click_user_15min
			# num_click_user_3hr
			# num_click_user_1d
			# num_click_user_7d
			# click_trend_user = log(num_click_user_15min + 1) - log(num_click_user_3hr + 1)
##### 6.1.5 User recency
			# time_since_last_impression_user
			# tsli_missing - 1 if the user has no prior impression (first impression); 0 - otherwise
			# time_since_last_click_user
			# tslc_missing - 1 if the user has no reliable prior click (never clicked or click masked by causality guard); 0 - otherwise
			# consecutive_skips_user: number of impressions since last click
##### 6.1.6 User dwell / engagement quality (from past clicks only)
			# avg_watch_time_user
			# avg_watch_time_user
			# median_watch_time_user
			# median_watch_time_user
			# pct_long_watch_user_30s
##### 6.1.7 User comment behavior (Denominator: clicks)
			# comment_rate_user = (num_comment_user + 1) / (num_click_user + 1)
			# has_comment_user_24h
			# num_comment_user_24h
##### 6.1.8 User like behavior (Denominator: clicks)
			# like_rate_user = (num_like_user + 1) / (num_click_user + 1)
			# has_like_user_24h
			# num_like_user_24h
##### 6.1.9 User gift behavior
			# has_gift_user_7d
			# num_gift_user_7d
			# amount_gift_user_7d


#### 6.2 Room (Live) Features
##### 6.2.1 Basic
			# time_since_live_start (ms) = imp_timestamp - start_timestamp
##### 6.2.2 Room CTR (pre-impression, denominator: impressions)
			# ctr_room_10min
			# ctr_room_30min
			# ctr_room_2hr
			# ctr_room_12hr
##### 6.2.3 Room exposure volume - imp
			# num_imp_room_10min
			# num_imp_room_30min
			# num_imp_room_2hr
			# num_imp_room_12hr
			# num_imp_room_1d
##### 6.2.4 Room click volume - click
			# num_click_room_10min
			# num_click_room_30min
			# num_click_room_2hr
			# num_click_room_12hr
			# num_click_room_1d
			# ctr_trend_room = log(ctr_room_10min + 1e-6) - log(ctr_room_2hr + 1e-6)
##### 6.2.5 Room freshness (leakage-safe)
			# time_since_start_live
			# time_since_start_live_bucket: (<5min, 5-20min, >20min)
##### 6.2.6 Room dwell / engagement quality (from past clicks only)
			# avg_watch_time_live
			# median_watch_time_live
			# watch_time_live_missing
			# avg_watch_time_live_30min
			# median_watch_time_live_30min
			# watch_time_live_30min_missing
			# pct_long_watch_live_60s_30min
##### 6.2.7 Room comment behavior (Denominator: impressions)
			# comment_rate_live
			# comment_rate_live_15min
			# comment_rate_live_1hr
			# comment_rate_live_3hr
			# num_comment_live
			# num_comment_live_15min
			# num_comment_live_1hr
			# num_comment_live_3hr
			# comment_trend_room = log(comment_rate_live_15min + 1e-6) - log(comment_rate_live_1hr + 1e-6)
##### 6.2.8 Room like behavior (Denominator: impressions)
			# like_rate_live
			# like_rate_live_15min
			# like_rate_live_1hr
			# like_rate_live_3hr
			# num_like_live
			# num_like_live_15min
			# num_like_live_1hr
			# num_like_live_3hr
			# like_trend_room = log(like_rate_live_15min + 1e-6) - log(like_rate_live_1hr + 1e-6)
##### 6.2.9 Room gift behavior (Denominator: impressions)
			# gift_rate_live
			# gift_rate_live_15min
			# gift_rate_live_1hr
			# gift_rate_live_3hr
			# num_gift_live
			# num_gift_live_15min
			# num_gift_live_1hr
			# num_gift_live_3hr
			# amount_gift_live
			# amount_gift_live_15min
			# amount_gift_live_1hr
			# amount_gift_live_3hr
			# gift_trend_room = log(amount_gift_live_15min + 1) - log(amount_gift_live_1hr + 1)


#### 6.3 Streamer Features
##### 6.3.1 Basic
			# streamer_account_age = imp_timestamp - streamer_reg_timestamp
			# streamer_live_age = imp_timestamp - first_live_timestamp
##### 6.3.2 Streamer CTR / volume
			# ctr_streamer_1d
			# ctr_streamer_7d
			# num_imp_streamer_7d
			# num_click_streamer_7d
			# num_lives_streamer_7d
##### 6.3.3 Streamer engagement quality - dwell time (from past clicks only)
			# avg_watch_time_streamer
			# median_watch_time_streamer
			# pct_long_watch_streamer_30s
			# watch_time_streamer_missing

##### 6.3.4 Streamer interaction volume
			# num_comment_streamer_7d
			# num_like_streamer_7d
			# amount_gift_streamer_7d


#### 6.4 Cross Features (High ROI)
##### 6.4.1 User x streamer
			# ctr_user_streamer_7d
			# num_click_user_streamer_7d
			# num_imp_user_streamer_7d
			# time_since_last_impression_user_streamer
			# time_since_last_click_user_streamer
##### 6.4.2 User x category
			# ctr_user_category_7d
			# num_click_user_category_7d
			# num_imp_user_category_7d

In [None]:
#### 1.1 Click
##### 1.1.1 Rename timestamp -> imp_timestamp and parse it as a datetime
##### 1.1.2 Derive is_click (constant 1) --> df_click
df_click = dataframes.get("click")

if df_click is not None:
    # Work on a copy so the raw table stored in `dataframes` is left untouched.
    df_click = df_click.copy()

    if "timestamp" in df_click.columns:
        # Values are parsed as epoch milliseconds; unparseable entries become NaT.
        df_click = df_click.rename(columns={"timestamp": "imp_timestamp"}).assign(
            imp_timestamp=lambda d: pd.to_datetime(d["imp_timestamp"], unit="ms", errors="coerce")
        )

    # Every row of the click table is a positive click example.
    df_click["is_click"] = 1

In [None]:
# Quick look at the click table: first rows, then its (rows, columns) shape.
click_preview = df_click.head()
display(click_preview)
print(df_click.shape)

In [None]:
#### 1.2 like
##### 1.2.1 Rename timestamp to like_timestamp, convert it to a datetime feature
##### 1.2.2 Derive is_like
df_like = dataframes.get("like")

if df_like is not None:
    df_like = df_like.copy()

    if "timestamp" in df_like.columns:
        df_like = df_like.rename(columns={"timestamp": "like_timestamp"})
        # Values are parsed as epoch milliseconds; unparseable entries become NaT.
        df_like["like_timestamp"] = pd.to_datetime(df_like["like_timestamp"], unit="ms", errors="coerce")

    df_like["is_like"] = 1

display(df_like.head())
print(df_like.shape)

##### 1.2.3 Merge like -> latest prior click per user_id/live_id/streamer_id
key_cols = ["user_id", "live_id", "streamer_id"]

# Normalize the key columns to numeric. Unparseable keys become NaN here and
# are dropped below; the int64 cast is deferred until after that drop because
# casting a column that contains NaN to int64 raises.
for c in key_cols:
    df_click[c] = pd.to_numeric(df_click[c], errors="coerce")
    df_like[c] = pd.to_numeric(df_like[c], errors="coerce")

df_click["imp_timestamp"] = pd.to_datetime(df_click["imp_timestamp"], errors="coerce")
df_like["like_timestamp"] = pd.to_datetime(df_like["like_timestamp"], errors="coerce")

# Keep only rows with complete keys and a valid event time, then cast keys.
int_keys = {c: "int64" for c in key_cols}
df_click_m = df_click.dropna(subset=key_cols + ["imp_timestamp"]).astype(int_keys)
df_like_m = df_like.dropna(subset=key_cols + ["like_timestamp"]).astype(int_keys)

# Give each click a unique id (internal only).
df_click_m = df_click_m.sort_values(key_cols + ["imp_timestamp"], kind="mergesort").reset_index(drop=True)
df_click_m["click_id"] = df_click_m.index

# Build a combined click + like timeline on a shared event_time column.
click_t = df_click_m.rename(columns={"imp_timestamp": "event_time"})
like_t = df_like_m.rename(columns={"like_timestamp": "event_time"})

click_t["_is_click"] = 1
like_t["_is_like"] = 1

# Clicks are concatenated first and the sort is stable (mergesort), so a like
# sharing a timestamp with a click sorts after it and is matched to that click.
combined = pd.concat([click_t, like_t], ignore_index=True, sort=False)
combined = combined.sort_values(key_cols + ["event_time"], kind="mergesort").reset_index(drop=True)

# Forward-fill the latest click_id within each key, so each like row picks up
# the id of the latest prior click. Likes with no prior click keep NaN.
combined["click_id"] = combined.groupby(key_cols, sort=False)["click_id"].ffill()

# Extract the like -> click mapping (likes without a prior click are dropped).
like_map = combined[combined["_is_like"].eq(1)].dropna(subset=["click_id"])[["click_id", "event_time"]]

# Aggregate likes per click (only latest_like_ts is needed; is_like follows from it).
like_agg = like_map.groupby("click_id").agg(
    latest_like_ts=("event_time", "max")
).reset_index()

# Join back to clicks (left join keeps ALL clicks).
df_click_with_like = df_click_m.merge(like_agg, on="click_id", how="left")
df_click_with_like["is_like"] = df_click_with_like["latest_like_ts"].notna().astype("int64")

# Drop the internal id.
df_click_with_like = df_click_with_like.drop(columns=["click_id"])

display(df_click_with_like.head())
print(df_click_with_like.shape)


In [None]:
#### 1.3 comment
##### 1.3.1 Rename timestamp to comment_timestamp, convert it to a datetime feature
##### 1.3.2 Derive is_comment
df_comment = dataframes.get("comment")

if df_comment is not None:
    df_comment = df_comment.copy()

    if "timestamp" in df_comment.columns:
        df_comment = df_comment.rename(columns={"timestamp": "comment_timestamp"})
        # Values are parsed as epoch milliseconds; unparseable entries become NaT.
        df_comment["comment_timestamp"] = pd.to_datetime(df_comment["comment_timestamp"], unit="ms", errors="coerce")

    df_comment["is_comment"] = 1

display(df_comment.head())
print(df_comment.shape)


##### 1.3.3 Merge comment data -> latest prior click per user_id/live_id/streamer_id
key_cols = ["user_id", "live_id", "streamer_id"]

# Normalize the key columns to numeric. Unparseable keys become NaN here and
# are dropped below; the int64 cast is deferred until after that drop because
# casting a column that contains NaN to int64 raises.
for c in key_cols:
    df_click_with_like[c] = pd.to_numeric(df_click_with_like[c], errors="coerce")
    df_comment[c] = pd.to_numeric(df_comment[c], errors="coerce")

df_click_with_like["imp_timestamp"] = pd.to_datetime(df_click_with_like["imp_timestamp"], errors="coerce")
df_comment["comment_timestamp"] = pd.to_datetime(df_comment["comment_timestamp"], errors="coerce")

# Keep only rows with complete keys and a valid event time, then cast keys.
int_keys = {c: "int64" for c in key_cols}
df_click_m = df_click_with_like.dropna(subset=key_cols + ["imp_timestamp"]).astype(int_keys)
df_comment_m = df_comment.dropna(subset=key_cols + ["comment_timestamp"]).astype(int_keys)

# Give each click a unique id (internal only).
df_click_m = df_click_m.sort_values(key_cols + ["imp_timestamp"], kind="mergesort").reset_index(drop=True)
df_click_m["click_id"] = df_click_m.index

# Build a combined click + comment timeline on a shared event_time column.
click_t = df_click_m.rename(columns={"imp_timestamp": "event_time"})
comment_t = df_comment_m.rename(columns={"comment_timestamp": "event_time"})

click_t["_is_click"] = 1
comment_t["_is_comment"] = 1

# Clicks are concatenated first and the sort is stable (mergesort), so a comment
# sharing a timestamp with a click sorts after it and is matched to that click.
combined = pd.concat([click_t, comment_t], ignore_index=True, sort=False)
combined = combined.sort_values(key_cols + ["event_time"], kind="mergesort").reset_index(drop=True)

# Forward-fill the latest click_id within each key, so each comment row picks up
# the id of the latest prior click. Comments with no prior click keep NaN.
combined["click_id"] = combined.groupby(key_cols, sort=False)["click_id"].ffill()

# Extract the comment -> click mapping (comments without a prior click are dropped).
comment_map = combined[combined["_is_comment"].eq(1)].dropna(subset=["click_id"])[["click_id", "event_time"]]

# Aggregate comments per click (only latest_comment_ts is needed; is_comment follows from it).
comment_agg = comment_map.groupby("click_id").agg(
    latest_comment_ts=("event_time", "max")
).reset_index()

# Join back to clicks (left join keeps ALL clicks).
df_click_with_like_comment = df_click_m.merge(comment_agg, on="click_id", how="left")
df_click_with_like_comment["is_comment"] = df_click_with_like_comment["latest_comment_ts"].notna().astype("int64")

# Drop the internal id.
df_click_with_like_comment = df_click_with_like_comment.drop(columns=["click_id"])

display(df_click_with_like_comment.head())
print(df_click_with_like_comment.shape)

In [None]:
# Sanity check: count clicks that carry a comment timestamp and compare with
# the total of the is_comment flag.
comment_ts_present = df_click_with_like_comment["latest_comment_ts"].notna()
print("non-missing latest_comment_ts:", comment_ts_present.sum())
comment_flag_total = df_click_with_like_comment["is_comment"].sum()
print("sum(is_comment):", comment_flag_total)

In [None]:
#### 1.4 gift
##### 1.4.1 Rename timestamp to gift_timestamp, convert it to a datetime feature
##### 1.4.2 Derive is_gift
df_gift = dataframes.get("gift")

if df_gift is not None:
    df_gift = df_gift.copy()

    if "timestamp" in df_gift.columns:
        df_gift = df_gift.rename(columns={"timestamp": "gift_timestamp"})
        # Values are parsed as epoch milliseconds; unparseable entries become NaT.
        df_gift["gift_timestamp"] = pd.to_datetime(df_gift["gift_timestamp"], unit="ms", errors="coerce")

    df_gift["is_gift"] = 1

display(df_gift.head())
print(df_gift.shape)


##### 1.4.3 Merge gift data -> latest prior click per user_id/live_id/streamer_id
key_cols = ["user_id", "live_id", "streamer_id"]

# Normalize the key columns to numeric. Unparseable keys become NaN here and
# are dropped below; the int64 cast is deferred until after that drop because
# casting a column that contains NaN to int64 raises.
for c in key_cols:
    df_click_with_like_comment[c] = pd.to_numeric(df_click_with_like_comment[c], errors="coerce")
    df_gift[c] = pd.to_numeric(df_gift[c], errors="coerce")

df_click_with_like_comment["imp_timestamp"] = pd.to_datetime(df_click_with_like_comment["imp_timestamp"], errors="coerce")
df_gift["gift_timestamp"] = pd.to_datetime(df_gift["gift_timestamp"], errors="coerce")

# Keep only rows with complete keys and a valid event time, then cast keys.
int_keys = {c: "int64" for c in key_cols}
df_click_m = df_click_with_like_comment.dropna(subset=key_cols + ["imp_timestamp"]).astype(int_keys)
df_gift_m = df_gift.dropna(subset=key_cols + ["gift_timestamp"]).astype(int_keys)

# Give each click a unique id (internal only).
df_click_m = df_click_m.sort_values(key_cols + ["imp_timestamp"], kind="mergesort").reset_index(drop=True)
df_click_m["click_id"] = df_click_m.index

# Build a combined click + gift timeline on a shared event_time column.
click_t = df_click_m.rename(columns={"imp_timestamp": "event_time"})
gift_t = df_gift_m.rename(columns={"gift_timestamp": "event_time"})

click_t["_is_click"] = 1
gift_t["_is_gift"] = 1

# Clicks are concatenated first and the sort is stable (mergesort), so a gift
# sharing a timestamp with a click sorts after it and is matched to that click.
combined = pd.concat([click_t, gift_t], ignore_index=True, sort=False)
combined = combined.sort_values(key_cols + ["event_time"], kind="mergesort").reset_index(drop=True)

# Forward-fill the latest click_id within each key, so each gift row picks up
# the id of the latest prior click. Gifts with no prior click keep NaN.
combined["click_id"] = combined.groupby(key_cols, sort=False)["click_id"].ffill()

# Extract the gift -> click mapping (keep gift_price).
gift_map = combined[combined["_is_gift"].eq(1)].dropna(subset=["click_id"])[
    ["click_id", "event_time", "gift_price"]
]

# Take the latest gift ROW per click (by event_time) and keep that row's price.
# drop_duplicates(keep="last") is used instead of GroupBy.last(), because
# last() skips NA per column and could pair the latest timestamp with the
# price of an earlier gift when the latest gift's price is missing.
gift_map = gift_map.sort_values(["click_id", "event_time"], kind="mergesort")
gift_agg = gift_map.drop_duplicates(subset="click_id", keep="last")

gift_agg = gift_agg.rename(columns={"event_time": "latest_gift_ts"})


# Join back to clicks (left join keeps ALL clicks).
df_click_with_like_comment_gift = df_click_m.merge(gift_agg, on="click_id", how="left")
df_click_with_like_comment_gift["is_gift"] = df_click_with_like_comment_gift["latest_gift_ts"].notna().astype("int64")

# Drop the internal id.
df_click_with_like_comment_gift = df_click_with_like_comment_gift.drop(columns=["click_id"])

display(df_click_with_like_comment_gift.head())
print(df_click_with_like_comment_gift.shape)


In [None]:
# Sanity check on the gift merge: rows with a gift timestamp, rows with a gift
# price, and the total of the is_gift flag.
gift_ts_present = df_click_with_like_comment_gift["latest_gift_ts"].notna()
gift_price_present = df_click_with_like_comment_gift["gift_price"].notna()
print("non-missing latest_gift_ts:", gift_ts_present.sum())
print("non-missing gift_price:", gift_price_present.sum())
gift_flag_total = df_click_with_like_comment_gift["is_gift"].sum()
print("sum(is_gift):", gift_flag_total)

In [None]:
#### 1.5 Negative
##### 1.5.1 Rename timestamp to imp_timestamp, convert it to a datetime feature
##### 1.5.2 Derive is_click, watch_live_time, is_like, is_comment, is_gift, gift_price and set them all to 0
##### 1.5.3 Derive latest_like_ts, latest_comment_ts, latest_gift_ts and set them to NaT (missing)
df_negative = dataframes.get("negative")

if df_negative is not None:
    df_negative = df_negative.copy()

    if "timestamp" in df_negative.columns:
        df_negative = df_negative.rename(columns={"timestamp": "imp_timestamp"})
        # Values are parsed as epoch milliseconds; unparseable entries become NaT.
        df_negative["imp_timestamp"] = pd.to_datetime(df_negative["imp_timestamp"], unit="ms", errors="coerce")

    # Rows from the negative table get every label / amount column set to 0.
    for zero_col in ("is_click", "watch_live_time", "is_like", "is_comment", "is_gift", "gift_price"):
        df_negative[zero_col] = 0

    # These rows have no like/comment/gift event. Use NaT rather than integer 0:
    # the same columns hold datetimes on the click side, and mixing in integer 0
    # would turn them into object columns after the concat below.
    for ts_col in ("latest_like_ts", "latest_comment_ts", "latest_gift_ts"):
        df_negative[ts_col] = pd.NaT

    display(df_negative.head())
    print(df_negative.shape)


##### 1.5.4 Vertically append with df_click_with_like_comment_gift to generate the final dataset "df_interactions"
df_interactions = pd.concat(
    [df_click_with_like_comment_gift, df_negative],
    ignore_index=True,
    sort=False
)

##### 1.5.5 Sort df_interactions by imp_timestamp, user_id, live_id, streamer_id
df_interactions = df_interactions.sort_values(
    ["imp_timestamp", "user_id", "live_id", "streamer_id"],
    kind="mergesort"
).reset_index(drop=True)


##### 1.5.6 Derive imp_year/imp_month/imp_day/imp_hour/imp_is_weekend and reorder columns
# Derive calendar features from the impression time.
df_interactions["imp_year"] = df_interactions["imp_timestamp"].dt.year
df_interactions["imp_month"] = df_interactions["imp_timestamp"].dt.month
df_interactions["imp_day"] = df_interactions["imp_timestamp"].dt.day
df_interactions["imp_hour"] = df_interactions["imp_timestamp"].dt.hour
# weekday: Monday=0 ... Sunday=6, so >= 5 marks Saturday/Sunday.
df_interactions["imp_is_weekend"] = df_interactions["imp_timestamp"].dt.weekday.ge(5).astype("int64")

# Clicks that were not matched to any gift have a missing gift_price; treat it as 0.
# Done before the reorder so the assignment never targets a column-subset
# frame, which can trigger SettingWithCopyWarning.
df_interactions["gift_price"] = df_interactions["gift_price"].fillna(0)

# Reorder columns: move the new calendar columns directly after imp_timestamp.
new_cols = ["imp_year", "imp_month", "imp_day", "imp_hour", "imp_is_weekend"]
cols = [c for c in df_interactions.columns if c not in new_cols]
imp_idx = cols.index("imp_timestamp")
cols = cols[:imp_idx + 1] + new_cols + cols[imp_idx + 1:]
df_interactions = df_interactions.reindex(columns=cols)


display(df_interactions.head())
print(df_interactions.shape)

In [None]:
# Number of rows in df_interactions labelled as non-clicks (is_click == 0).
non_click_mask = df_interactions["is_click"] == 0
print("count is_click == 0:", non_click_mask.sum())

In [None]:
# Missing-value count per column of df_interactions, largest first
# (the final expression is what the notebook cell displays).
missing_per_col = df_interactions.isna().sum()
missing_per_col.sort_values(ascending=False)

In [None]:
### 2. User Features
#### 2.1 Convert reg_timestamp and first_watch_live_timestamp to datetime features
#### 2.2 Rename features
# 1) age --> user_age
# 2) gender --> user_gender
# 3) country --> user_country
# 4) device_brand --> user_device_brand
# 5) device_price --> user_device_price
# 6) reg_timestamp --> user_reg_timestamp
# 7) onehot_feat0 ... onehot_feat6 --> user_onehot_feat0 ... user_onehot_feat6

df_user = dataframes.get("user")

if df_user is not None:
    # Work on a copy so the raw table stored in `dataframes` is left untouched.
    df_user = df_user.copy()

    # Report (but do not remove) rows that share a user_id.
    if "user_id" not in df_user.columns:
        print("df_user duplicate rows by user_id: user_id column not found")
    else:
        dup_mask = df_user.duplicated(subset=["user_id"], keep=False)
        dup_count = int(dup_mask.sum())
        print(f"df_user duplicate rows by user_id: {dup_count}")
        if dup_count > 0:
            display(df_user.loc[dup_mask].sort_values("user_id").head())

    # Parse both timestamp columns; values that cannot be parsed become NaT.
    for ts_col in ("reg_timestamp", "first_watch_live_timestamp"):
        df_user[ts_col] = pd.to_datetime(df_user[ts_col], errors="coerce")

    # Prefix the generic profile columns with "user_".
    rename_map = {
        "age": "user_age",
        "gender": "user_gender",
        "country": "user_country",
        "device_brand": "user_device_brand",
        "device_price": "user_device_price",
        "reg_timestamp": "user_reg_timestamp",
    }
    for i in range(7):
        rename_map[f"onehot_feat{i}"] = f"user_onehot_feat{i}"

    df_user = df_user.rename(columns=rename_map)

display(df_user.head())
print(df_user.shape)

In [None]:
# Missing-value count per column of df_user, largest first
# (the final expression is what the notebook cell displays).
user_missing = df_user.isna().sum()
user_missing.sort_values(ascending=False)

In [None]:
# For each raw user feature, print how many distinct non-null values it has
# and list them, separated by a ruler line.
cols = [
    "user_age",
    "user_gender",
    "user_country",
    "user_device_brand",
    "user_device_price",
    "fans_num",
    "follow_num",
    "accu_watch_live_cnt",
    "accu_watch_live_duration",
]

ruler = "-" * 60
for c in cols:
    non_null = df_user[c].dropna()
    uniq = non_null.unique()
    print(f"{c}: n_unique={len(uniq)}")
    print(uniq)
    print(ruler)

In [None]:
#### 2.3 Label encoding (adds a "<column>_le" integer-code column per feature)
# 1) user_age --> user_age_le
# 2) user_gender --> user_gender_le
# 3) user_country --> user_country_le
# 4) user_device_brand --> user_device_brand_le
# 5) user_device_price --> user_device_price_le
# 6) fans_num --> fans_num_le
# 7) follow_num --> follow_num_le
# 8) accu_watch_live_cnt --> accu_watch_live_cnt_le
# 9) accu_watch_live_duration --> accu_watch_live_duration_le

le_cols = [
    "user_age",
    "user_gender",
    "user_country",
    "user_device_brand",
    "user_device_price",
    "fans_num",
    "follow_num",
    "accu_watch_live_cnt",
    "accu_watch_live_duration",
]

for c in le_cols:
    # factorize(sort=True) numbers the sorted distinct values from 0 upward;
    # by pandas' default, missing values receive the sentinel code -1.
    df_user[f"{c}_le"] = pd.factorize(df_user[c], sort=True)[0]


# Show all columns (no truncation) when previewing the widened frame.
with pd.option_context("display.max_columns", None, "display.width", None):
    display(df_user.head())
print(df_user.shape)

In [None]:
# Re-check missing-value counts per df_user column after label encoding,
# largest first (the final expression is what the notebook cell displays).
user_missing = df_user.isna().sum()
user_missing.sort_values(ascending=False)

In [None]:
# For each label-encoded user feature, print how many distinct non-null codes
# it has and list them, separated by a ruler line.
cols = [
    "user_age_le",
    "user_country_le",
    "user_device_brand_le",
    "user_device_price_le",
    "fans_num_le",
    "follow_num_le",
    "accu_watch_live_cnt_le",
    "accu_watch_live_duration_le",
]

ruler = "-" * 60
for c in cols:
    non_null = df_user[c].dropna()
    uniq = non_null.unique()
    print(f"{c}: n_unique={len(uniq)}")
    print(uniq)
    print(ruler)

In [None]:
### 3. Room features (outline for this and the following cells)
#### 3.0 Check for duplicate (live_id, streamer_id) rows (report only; nothing is removed here)
#### 3.1 Convert p_date, start_timestamp and end_timestamp to datetime features
#### 3.2 Label encoding
# live_content_category --> see unique values and missingness ---> label encoding --> live_content_category_le
#### 3.3 New features
# xxxx 1) time_since_live_start (ms) = imp_timestamp - start_timestamp (Pending after merge with interaction)
# 2) live_start_year
# 3) live_start_month
# 4) live_start_day
# 5) live_start_hour
# 6) live_is_weekend
#### 3.4 merge title_embedding dataset by live_name_id
#### 3.5 Fill missing embedding with 0 and derive a flag variable title_emb_missing
#### 3.6 check missing of all variables

### This cell: steps 3.0 and 3.1
df_room = dataframes.get("room")

if df_room is not None:
    # Work on a copy so the raw table stored in `dataframes` is left untouched.
    df_room = df_room.copy()

    # Report rows that share the same live_id + streamer_id.
    key_cols = ["live_id", "streamer_id"]
    if set(key_cols) <= set(df_room.columns):
        dup_mask = df_room.duplicated(subset=key_cols, keep=False)
        dup_count = int(dup_mask.sum())
        print(f"df_room duplicate rows by {key_cols}: {dup_count}")
        if dup_count > 0:
            display(df_room.loc[dup_mask].sort_values(key_cols).head())
    else:
        print(f"df_room duplicate rows by {key_cols}: keys not found")

    # p_date is parsed from its string form as YYYYMMDD; values that do not
    # match become NaT.
    p_date_text = df_room["p_date"].astype(str)
    df_room["p_date"] = pd.to_datetime(p_date_text, format="%Y%m%d", errors="coerce")

    # start/end are parsed as epoch milliseconds; unparseable entries become NaT.
    for ts_col in ("start_timestamp", "end_timestamp"):
        df_room[ts_col] = pd.to_datetime(df_room[ts_col], unit="ms", errors="coerce")

display(df_room.head())
print(df_room.shape)
# Missing-value count per column of df_room, largest first (cell output).
df_room.isna().sum().sort_values(ascending=False)

In [None]:
#### 3.2 Label encoding
# live_content_category: report missingness and distinct values, then encode
# it into live_content_category_le.
category = df_room["live_content_category"]
print("live_content_category missing:", category.isna().sum())
print("live_content_category unique:", category.nunique(dropna=True))
print(category.dropna().unique())

# Codes follow the sorted order of the distinct categories.
df_room["live_content_category_le"] = pd.factorize(category, sort=True)[0]


#### 3.3 New features (calendar parts of the live start time)
# 1) live_start_year
# 2) live_start_month
# 3) live_start_day
# 4) live_start_hour
# 5) live_is_weekend
start_ts = df_room["start_timestamp"]
calendar_parts = {
    "live_start_year": start_ts.dt.year,
    "live_start_month": start_ts.dt.month,
    "live_start_day": start_ts.dt.day,
    "live_start_hour": start_ts.dt.hour,
}
for col_name, values in calendar_parts.items():
    df_room[col_name] = values

# weekday: Monday=0 ... Sunday=6, so >= 5 marks Saturday/Sunday.
df_room["live_is_weekend"] = (start_ts.dt.weekday >= 5).astype("int64")

display(df_room.head())
print(df_room.shape)


In [None]:
# Range of live_name_id, followed by the number of rows that use -1
# (the final expression is what the notebook cell displays).
live_name_ids = df_room["live_name_id"]
print("min live_name_id:", live_name_ids.min())
print("max live_name_id:", live_name_ids.max())
live_name_ids.eq(-1).sum()
# note about 1834913 (15%) records in df_room have live_name_id == -1, indicating they miss title_embedding

In [None]:
#### 3.4 merge title_embedding dataset by live_name_id
# Loads ../data/title_embeddings.npy, labels each embedding row with a live_name_id,
# and left-joins the resulting title_emb_* columns onto df_room.
from pathlib import Path
import numpy as np
import pandas as pd

# use existing df_room (do NOT reload or overwrite)
assert df_room is not None

# load embeddings
data_dir = Path("..") / "data"
emb = np.load(data_dir / "title_embeddings.npy")

# build live_name_id list aligned to embeddings (exclude -1 for alignment only)
# NOTE(review): this assumes row i of title_embeddings.npy belongs to the i-th smallest
# distinct live_name_id present in df_room (after dropping -1). The length check below
# only proves the counts agree, not the pairing; confirm against the dataset docs.
live_ids = (
    df_room["live_name_id"]
    .dropna()
    .astype("int64")
    .sort_values()
    .unique()
)
live_ids_for_emb = live_ids[live_ids != -1]

# ensure counts match (one embedding row per distinct non-placeholder live_name_id)
assert len(live_ids_for_emb) == emb.shape[0]

# one column per embedding dimension: title_emb_0 .. title_emb_{dim-1}
df_title_embedding = pd.DataFrame(
    emb, columns=[f"title_emb_{i}" for i in range(emb.shape[1])]
)
df_title_embedding["live_name_id"] = live_ids_for_emb

# merge into existing df_room (keeps derived columns)
# left join: rooms with no matching live_name_id (e.g. -1) get NaN in every title_emb_* column
df_room = df_room.merge(df_title_embedding, on="live_name_id", how="left")


#### 3.5 Fill missing embedding with 0 and derive a flag variable title_emb_missing
# emb_cols is collected BEFORE title_emb_missing exists, so the flag itself is not included
emb_cols = [c for c in df_room.columns if c.startswith("title_emb_")]

# flag first (1 = at least one embedding value is NaN), then zero-fill the values
df_room["title_emb_missing"] = df_room[emb_cols].isna().any(axis=1).astype("int64")
df_room[emb_cols] = df_room[emb_cols].fillna(0)


display("df_title_embedding:",df_title_embedding.head())
print("df_title_embedding:",df_title_embedding.shape)
display("df_room:", df_room.head())
print("df_room:",df_room.shape)


In [None]:
# Goal: keep exactly one row per live_id, carrying one live_name_id.
# Rule: most frequent live_name_id; if tied, pick the one with the latest start_timestamp.
# The surviving row is the one with the latest start_timestamp for that live_id.

tmp = df_room[["live_id", "live_name_id", "start_timestamp"]].dropna(subset=["live_id", "live_name_id"])

# frequency and most recent start time of every (live_id, live_name_id) pair
counts = (
    tmp.groupby(["live_id", "live_name_id"], as_index=False)
       .agg(freq=("live_name_id", "size"), latest_ts=("start_timestamp", "max"))
)

# first row per live_id after sorting = highest freq, then latest timestamp
best = (
    counts.sort_values(["live_id", "freq", "latest_ts"], ascending=[True, False, False])
          .drop_duplicates(subset=["live_id"], keep="first")
          .rename(columns={"live_name_id": "live_name_id_mostfreq"})
          [["live_id", "live_name_id_mostfreq"]]
)

# The title columns (embedding values + title_emb_missing) were joined on the ORIGINAL
# live_name_id. Snapshot them per live_name_id before that column is overwritten, so the
# surviving row can be given the title columns of the id it ends up with. Without this,
# a row could keep the embedding of a title it no longer refers to.
title_cols = [c for c in df_room.columns if c.startswith("title_emb_")]
title_lookup = (
    df_room[["live_name_id", *title_cols]]
    .dropna(subset=["live_name_id"])
    .drop_duplicates(subset=["live_name_id"], keep="first")
)

df_room = df_room.merge(best, on="live_id", how="left")

# overwrite live_name_id with chosen one
df_room["live_name_id"] = df_room["live_name_id_mostfreq"]

# now drop duplicate rows: keep the latest start_timestamp per live_id
df_room = (
    df_room.sort_values(["live_id", "start_timestamp"], ascending=[True, False])
           .drop_duplicates(subset=["live_id"], keep="first")
           .drop(columns=["live_name_id_mostfreq"], errors="ignore")
)

# re-attach the title columns that belong to the chosen live_name_id
if title_cols:
    df_room = df_room.drop(columns=title_cols).merge(title_lookup, on="live_name_id", how="left")
    # rows without a live_name_id have no title: flag = 1, embedding values = 0
    if "title_emb_missing" in title_cols:
        df_room["title_emb_missing"] = df_room["title_emb_missing"].fillna(1).astype("int64")
    emb_value_cols = [c for c in title_cols if c != "title_emb_missing"]
    if emb_value_cols:
        df_room[emb_value_cols] = df_room[emb_value_cols].fillna(0)

# cleanup of the intermediate frames
del tmp, counts, best, title_lookup

display(df_room.head())
print(df_room.shape)

In [None]:
# Sanity report for df_room: row count, rows flagged as lacking a title embedding,
# number and range of distinct live_name_id values, and per-column missingness.
room_name_ids = df_room["live_name_id"]
print("rows in df_room:", len(df_room))
print("title_emb_missing flagged:", df_room["title_emb_missing"].sum())
print("unique live_name_id:", room_name_ids.nunique(dropna=True))
# check live_name_id range
print("min live_name_id:", room_name_ids.min())
print("max live_name_id:", room_name_ids.max())

# per-column NaN count and percentage, worst columns first
room_na = df_room.isna()
missing_room = pd.DataFrame(
    {"missing_count": room_na.sum(), "missing_pct": room_na.mean().mul(100)}
).sort_values("missing_count", ascending=False)

display(missing_room)

In [None]:
# Preview df_room with every column visible; the display options are lifted
# only inside this `with` block.
with pd.option_context("display.max_columns", None, "display.width", None):
    display(df_room.head())
print("df_room:",df_room.shape)

In [None]:
### 4. streamer features
#### 4.1 Convert reg_timestamp and first_live_timestamp to datetime features
#### 4.2 Rename
# 1) age -->  streamer_age
# 2) gender --> streamer_gender
# 3) country --> streamer_country
# 4) device_brand --> streamer_device_brand
# 5) device_price --> streamer_device_price
# 6) reg_timestamp --> streamer_reg_timestamp
# 7) onehot_feat0 --- onehot_feat6 --> streamer_onehot_feat0 --- streamer_onehot_feat6
# Note: first_live_timestamp is converted to datetime but keeps its name.

# .get returns None when no "streamer" file was loaded into `dataframes`
df_streamer = dataframes.get("streamer")

if df_streamer is not None:
    # work on a copy so the raw frame stored in `dataframes` stays untouched
    df_streamer = df_streamer.copy()

    # check duplicates by streamer_id
    if "streamer_id" in df_streamer.columns:
        # keep=False marks every member of a duplicated group, not just the repeats
        dup_mask = df_streamer.duplicated(subset=["streamer_id"], keep=False)
        dup_count = int(dup_mask.sum())
        print(f"df_streamer duplicate rows by streamer_id: {dup_count}")
        if dup_count:
            display(df_streamer.loc[dup_mask].sort_values("streamer_id").head())
    else:
        print("df_streamer duplicate rows by streamer_id: streamer_id column not found")

    # NOTE(review): df_room timestamps are parsed with unit="ms"; here no unit is given.
    # If these columns hold integer epoch milliseconds they would be read as nanoseconds
    # (dates in 1970) - confirm the raw format of the streamer file.
    df_streamer["reg_timestamp"] = pd.to_datetime(df_streamer["reg_timestamp"], errors="coerce")
    df_streamer["first_live_timestamp"] = pd.to_datetime(df_streamer["first_live_timestamp"], errors="coerce")

    rename_map = {
        "age": "streamer_age",
        "gender": "streamer_gender",
        "country": "streamer_country",
        "device_brand": "streamer_device_brand",
        "device_price": "streamer_device_price",
        "reg_timestamp": "streamer_reg_timestamp",
    }
    # onehot_feat0..onehot_feat6 get the same streamer_ prefix
    rename_map.update({f"onehot_feat{i}": f"streamer_onehot_feat{i}" for i in range(7)})

    df_streamer = df_streamer.rename(columns=rename_map)


# NOTE(review): these two lines sit outside the `if` and fail when df_streamer is None
display("df_streamer:",df_streamer.head())
print("df_streamer:", df_streamer.shape)

#### 4.3 Label Encoding
# Columns to integer-encode; each one present in df_streamer gets a "<col>_le" twin.
le_cols = [
    "streamer_age",
    "streamer_gender",      # pending if you want special handling
    "streamer_country",
    "streamer_device_brand",
    "streamer_device_price",
    "live_operation_tag",
    "fans_user_num",
    "fans_group_fans_num",
    "follow_user_num",
    "accu_live_cnt",
    "accu_live_duration",
    "accu_play_cnt",
    "accu_play_duration",
]

# report missingness and cardinality, then encode (codes follow sorted value order)
for col in (name for name in le_cols if name in df_streamer.columns):
    values = df_streamer[col]
    print(f"{col}: missing={values.isna().sum()}, unique={values.nunique(dropna=True)}")
    df_streamer[f"{col}_le"] = pd.factorize(values, sort=True)[0]


#### 4.4 check missing of all variables
# per-column NaN count and percentage, worst columns first
streamer_na = df_streamer.isna()
missing_streamer = pd.DataFrame(
    {"missing_count": streamer_na.sum(), "missing_pct": streamer_na.mean().mul(100)}
).sort_values("missing_count", ascending=False)

display(missing_streamer)


In [None]:
# Preview the streamer table after renaming and label encoding.
display("df_streamer:",df_streamer.head())
print("df_streamer:", df_streamer.shape)

In [None]:
### 5. concatenation
# Each step is a left join that is only meant to ADD columns. A left join multiplies
# left-hand rows whenever the right-hand key is not unique, so every step compares the
# row count before and after and prints a warning if rows were duplicated.

#### 5.1 merge df_room + df_streamer by streamer_id --> df_room_streamer
df_room_streamer = df_room.merge(df_streamer, on="streamer_id", how="left")

print("df_room_streamer:", df_room_streamer.shape)
if len(df_room_streamer) != len(df_room):
    print(
        f"WARNING: df_room_streamer has {len(df_room_streamer)} rows but df_room has "
        f"{len(df_room)}; streamer_id is not unique in df_streamer"
    )

#### 5.2 merge df_interactions + df_user by user_id--> df_interaction_user
df_interaction_user = df_interactions.merge(df_user, on="user_id", how="left")

print("df_interaction_user:", df_interaction_user.shape)
if len(df_interaction_user) != len(df_interactions):
    print(
        f"WARNING: df_interaction_user has {len(df_interaction_user)} rows but df_interactions "
        f"has {len(df_interactions)}; user_id is not unique in df_user"
    )

#### 5.3 merge df_interaction_user + df_room_streamer by streamer_id and live_id
df_final = df_interaction_user.merge(
    df_room_streamer,
    on=["streamer_id", "live_id"],
    how="left"
)

print("df_final:", df_final.shape)
if len(df_final) != len(df_interaction_user):
    print(
        f"WARNING: df_final has {len(df_final)} rows but df_interaction_user has "
        f"{len(df_interaction_user)}; (streamer_id, live_id) is not unique in df_room_streamer"
    )

In [None]:
# Audit df_room_streamer for repeated (streamer_id, live_id) keys: report how many rows
# are involved, how many distinct keys repeat, and show the first 10 offending rows.
dup_keys = ["streamer_id", "live_id"]
dup_mask = df_room_streamer.duplicated(subset=dup_keys, keep=False)
dup_rows = df_room_streamer.loc[dup_mask]

print("duplicate row count:", int(dup_mask.sum()))
print("unique duplicate keys:", len(dup_rows[dup_keys].drop_duplicates()))

# full-width preview so no column is elided
with pd.option_context("display.max_columns", None, "display.width", None):
    display(dup_rows.head(10))


In [None]:
### 6. Other derivation (Contextual / Temporal / Cross Features)
#### 6.1 User Features
##### 6.1.1 Basic
			# 1) user_account_age = imp_timestamp - user_reg_timestamp
			# 2) user_watch_live_age = imp_timestamp - first_watch_live_timestamp
##### 6.1.2 User CTR (pre-impression, denominator: impressions)
			# ctr_user_15min
			# ctr_user_3hr
			# ctr_user_1d
			# ctr_user_7d
##### 6.1.3 User exposure fatigue - Imp
			# num_imp_user_10min
			# num_imp_user_30min
			# num_imp_user_2hr
			# num_imp_user_12hr
			# num_imp_user_1d
			# num_imp_user_7d
##### 6.1.4 User click fatigue - click
			# num_click_user_15min
			# num_click_user_3hr
			# num_click_user_1d
			# num_click_user_7d
			# click_trend_user = log(num_click_user_15min + 1) - log(num_click_user_3hr + 1)
##### 6.1.5 User recency
			# time_since_last_impression_user
			# tsli_missing - 1 if the user has no prior impression (first impression); 0 - otherwise
			# time_since_last_click_user
			# tslc_missing - 1 if the user has no reliable prior click (never clicked or click masked by causality guard); 0 - otherwise
			# consecutive_skips_user: number of impressions since last click
##### 6.1.6 User dwell / engagement quality (from past clicks only)
			# avg_watch_time_user
			# avg_watch_time_user_missing
			# median_watch_time_user
			# median_watch_time_user_missing
			# pct_long_watch_user_30s
##### 6.1.7 User comment behavior (Denominator: clicks)
			# comment_rate_user = (num_comment_user + 1) / (num_click_user + 1)
			# has_comment_user_24h
			# num_comment_user_24h
##### 6.1.8 User like behavior (Denominator: clicks)
			# like_rate_user = (num_like_user + 1) / (num_click_user + 1)
			# has_like_user_24h
			# num_like_user_24h
##### 6.1.9 User gift behavior
			# has_gift_user_7d
			# num_gift_user_7d
			# amount_gift_user_7d

In [None]:
# Reproducible 2% row sample of df_final (fixed seed), re-indexed 0..n-1.
# Window features computed later on this frame only see the sampled rows.
df_final_sample = df_final.sample(frac=0.02, random_state=42).reset_index(drop=True)
df_final_sample.shape

In [None]:
##### 6.1.1 Basic
# Elapsed time, in days, between the impression and two user milestones:
#   user_account_age    <- user_reg_timestamp
#   user_watch_live_age <- first_watch_live_timestamp
SECONDS_PER_DAY = 86400
for age_col, origin_col in (
    ("user_account_age", "user_reg_timestamp"),
    ("user_watch_live_age", "first_watch_live_timestamp"),
):
    elapsed = df_final_sample["imp_timestamp"] - df_final_sample[origin_col]
    df_final_sample[age_col] = elapsed.dt.total_seconds() / SECONDS_PER_DAY

In [None]:
# Eyeball 6.1.1: ids, the three source timestamps, and the two derived ages.
cols = ["user_id", "live_id", "streamer_id"]
cols += ["imp_timestamp", "user_reg_timestamp", "first_watch_live_timestamp"]
cols += ["user_account_age", "user_watch_live_age"]

display(df_final_sample[cols].head())

In [None]:
##### 6.1.2 User CTR (pre-impression, denominator: impressions)
# Stable sort by user then time, so the grouped rolling output lines up row-for-row
# with the frame when written back positionally via to_numpy().
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)
g = df_final_sample.groupby("user_id", group_keys=False)

# rolling window length -> output column
windows = {
    "15min": "ctr_user_15min",
    "3h": "ctr_user_3hr",
    "1d": "ctr_user_1d",
    "7d": "ctr_user_7d",
}

for w, col in windows.items():
    # closed="left" keeps the current impression out of its own window
    prior_window = g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
    clicks = prior_window.sum().reset_index(level=0, drop=True)
    imps = prior_window.count().reset_index(level=0, drop=True)
    # an empty window (0 impressions) yields NaN, which is reported as CTR 0
    df_final_sample[col] = clicks.div(imps.where(imps != 0)).fillna(0).to_numpy()

In [None]:
# check 6.1.2 user CTR features
cols = ["ctr_user_15min", "ctr_user_3hr", "ctr_user_1d", "ctr_user_7d"]

display(df_final_sample[["user_id", "live_id", "streamer_id", *cols]].head())

# missingness
ctr_na = df_final_sample[cols].isna()
missing = pd.DataFrame({"missing_count": ctr_na.sum(), "missing_pct": ctr_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.1.3 User exposure fatigue - Imp (on df_final_sample)
# Number of earlier impressions of the same user inside each trailing time window.

# ensure datetime + stable ordering
df_final_sample["imp_timestamp"] = pd.to_datetime(df_final_sample["imp_timestamp"], errors="coerce")
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("user_id", group_keys=False)

# rolling window length -> output column
imp_windows = {
    "10min": "num_imp_user_10min",
    "30min": "num_imp_user_30min",
    "2h": "num_imp_user_2hr",
    "12h": "num_imp_user_12hr",
    "1d": "num_imp_user_1d",
    "7d": "num_imp_user_7d",
}

for w, col in imp_windows.items():
    # counting the is_click column counts impressions; closed="left" excludes the current row
    df_final_sample[col] = (
        g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
        .count()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

# optional: fill missing with 0
imp_cols = list(imp_windows.values())
df_final_sample[imp_cols] = df_final_sample[imp_cols].fillna(0)


In [None]:
# check new 6.1.3 features (user exposure fatigue)
feat_cols = [
    "num_imp_user_10min",
    "num_imp_user_30min",
    "num_imp_user_2hr",
    "num_imp_user_12hr",
    "num_imp_user_1d",
    "num_imp_user_7d",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())
print(df_final_sample[cols].shape)

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
##### 6.1.4 User click fatigue - click (on df_final_sample)
# Number of earlier clicks of the same user inside each trailing window, plus a
# short-vs-long trend: log(clicks_15min + 1) - log(clicks_3hr + 1).
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("user_id", group_keys=False)

# rolling window length -> output column
click_windows = {
    "15min": "num_click_user_15min",
    "3h": "num_click_user_3hr",
    "1d": "num_click_user_1d",
    "7d": "num_click_user_7d",
}

for w, col in click_windows.items():
    # closed="left" excludes the current impression from its own window
    df_final_sample[col] = (
        g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
        .sum()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

# trend feature (computed before the fill, so rows without history are NaN here)
recent_clicks = df_final_sample["num_click_user_15min"]
longer_clicks = df_final_sample["num_click_user_3hr"]
df_final_sample["click_trend_user"] = np.log(recent_clicks + 1) - np.log(longer_clicks + 1)

# optional: fill missing with 0
click_feature_cols = [*click_windows.values(), "click_trend_user"]
df_final_sample[click_feature_cols] = df_final_sample[click_feature_cols].fillna(0)

In [None]:
# check 6.1.4 user click fatigue features
feat_cols = [
    "num_click_user_15min",
    "num_click_user_3hr",
    "num_click_user_1d",
    "num_click_user_7d",
    "click_trend_user",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.1.5 User recency (on df_final_sample) — milliseconds
# Every feature here looks only at EARLIER rows of the SAME user, in
# (user_id, imp_timestamp) order. All shifts are done per user so that a user's
# first impression never inherits values from the user sorted just before them.
df_final_sample = (
    df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort")
    .reset_index(drop=True)
)
user_key = df_final_sample["user_id"]
g = df_final_sample.groupby("user_id", group_keys=False)

# 1) time_since_last_impression_user (ms); NaN on each user's first impression
df_final_sample["time_since_last_impression_user"] = (
    g["imp_timestamp"].diff().dt.total_seconds().mul(1000)
)

# 2) time_since_last_click_user (ms) — prior click only, causality-guarded
# Timestamp of the row if it was clicked, else NaT ...
click_ts = df_final_sample["imp_timestamp"].where(df_final_sample["is_click"] == 1)
# ... shifted one row WITHIN the user (drops the current row's own click), then
# forward-filled WITHIN the user to carry the most recent earlier click forward.
last_click_time = (
    click_ts.groupby(user_key).shift(1)
    .groupby(user_key).ffill()
)

# guard: do not allow "last click" to be in the future
last_click_time = last_click_time.where(last_click_time <= df_final_sample["imp_timestamp"])

df_final_sample["time_since_last_click_user"] = (
    (df_final_sample["imp_timestamp"] - last_click_time)
    .dt.total_seconds()
    .mul(1000)
)

# 3) consecutive_skips_user: number of impressions since the user's last click,
# counted BEFORE the current impression. Grouping on the number of PRIOR clicks
# (cumulative clicks minus the current row's own click) keeps the current label out
# of the feature; grouping on the plain cumsum would make every clicked row 0.
click_group = g["is_click"].cumsum() - df_final_sample["is_click"]
df_final_sample["consecutive_skips_user"] = (
    df_final_sample.groupby(["user_id", click_group]).cumcount()
)

# -------------------------------
# Missing handling for NN / DCNv2
# -------------------------------
# Missing indicators (high-signal for cold-start / no history)
df_final_sample["tsli_missing"] = df_final_sample["time_since_last_impression_user"].isna().astype(np.int8)
df_final_sample["tslc_missing"] = df_final_sample["time_since_last_click_user"].isna().astype(np.int8)

# Sentinel fill (use a large "no/very old history" value)
TSLI_FILL_MS = 7 * 24 * 3600 * 1000     # 7 days
TSLC_FILL_MS = 30 * 24 * 3600 * 1000    # 30 days

df_final_sample["time_since_last_impression_user"] = (
    df_final_sample["time_since_last_impression_user"].fillna(TSLI_FILL_MS)
)
df_final_sample["time_since_last_click_user"] = (
    df_final_sample["time_since_last_click_user"].fillna(TSLC_FILL_MS)
)

# Optional but recommended: log transform to reduce scale / heavy tails
# df_final_sample["time_since_last_impression_user"] = np.log1p(df_final_sample["time_since_last_impression_user"])
# df_final_sample["time_since_last_click_user"] = np.log1p(df_final_sample["time_since_last_click_user"])

In [None]:
# check 6.1.5 user recency features
feat_cols = [
    "time_since_last_impression_user",
    "time_since_last_click_user",
    "consecutive_skips_user",
    "tsli_missing",
    "tslc_missing",
]
cols = ["user_id", "live_id", "streamer_id", "imp_timestamp", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# Checking code (to prove the problem + verify the fix)
# Relies on `last_click_time` as left behind by the 6.1.5 cell.
# A) How many “future last click” cases exist?
future_last_click = last_click_time > df_final_sample["imp_timestamp"]
print("future_last_click rows:", int(future_last_click.sum()))

# B) Any negatives after the guard?
neg = df_final_sample["time_since_last_click_user"] < 0
print("negative time_since_last_click_user rows:", int(neg.sum()))

# C) Show a bad user’s timeline (if any future cases still exist)
if future_last_click.any():
    uid = df_final_sample.loc[future_last_click, "user_id"].iloc[0]
    tmp = df_final_sample[df_final_sample["user_id"] == uid].copy()
    tmp["last_click_time_used"] = last_click_time.loc[tmp.index]
    tmp["future_last_click"] = tmp["last_click_time_used"] > tmp["imp_timestamp"]
    # a bare expression inside an `if` body is not rendered by the notebook,
    # so the timeline has to be displayed explicitly
    display(
        tmp[["imp_timestamp","is_click","last_click_time_used","future_last_click","time_since_last_click_user"]].tail(80)
    )

In [None]:
##### 6.1.6 User dwell / engagement quality (from past clicks only)
# Expanding statistics over the watch time of the user's EARLIER clicked impressions.
# The one-row shift that removes the current impression is done per user; shifting the
# concatenated per-user result instead would hand each user's first row the final
# statistic of the user sorted just before them.
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

user_key = df_final_sample["user_id"]
g = df_final_sample.groupby("user_id", group_keys=False)

# consider watch time only on clicks (NaN on non-clicked impressions)
watch_on_click = df_final_sample["watch_live_time"].where(df_final_sample["is_click"] == 1)

# watch time of the previous impression of the same user (NaN on the user's first row)
prev_watch = watch_on_click.groupby(user_key).shift(1)
prev_watch_history = prev_watch.groupby(user_key).expanding()

# expanding mean / median skip NaN, so only past clicked impressions contribute
df_final_sample["avg_watch_time_user"] = (
    prev_watch_history.mean().reset_index(level=0, drop=True)
)

df_final_sample["median_watch_time_user"] = (
    prev_watch_history.median().reset_index(level=0, drop=True)
)

# pct_long_watch_user_30s: past clicks with watch_live_time >= 30
# running count of earlier clicks, per user
past_clicks = g["is_click"].shift(1).groupby(user_key).cumsum()
# running count of earlier clicks watched for at least 30 (NaN >= 30 is False)
past_long = (prev_watch >= 30).astype("int64").groupby(user_key).cumsum()

# no earlier click -> denominator NaN -> share reported as 0
df_final_sample["pct_long_watch_user_30s"] = (past_long / past_clicks.replace(0, np.nan)).fillna(0)


# -------------------------------
# Missing handling for NN / DCNv2
# -------------------------------

# Missing indicators: no past clicked watch history
df_final_sample["avg_watch_time_user_missing"] = df_final_sample["avg_watch_time_user"].isna().astype(np.int8)
df_final_sample["median_watch_time_user_missing"] = df_final_sample["median_watch_time_user"].isna().astype(np.int8)

# Fill NaNs with 0 (paired with missing flags, so semantics are preserved)
df_final_sample["avg_watch_time_user"] = df_final_sample["avg_watch_time_user"].fillna(0.0)
df_final_sample["median_watch_time_user"] = df_final_sample["median_watch_time_user"].fillna(0.0)

In [None]:
# check 6.1.6 user dwell / engagement quality features
feat_cols = [
    "avg_watch_time_user",
    "median_watch_time_user",
    "pct_long_watch_user_30s",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.1.7 User comment behavior (denominator: clicks)
# comment_rate_user = (prior comments + 1) / (prior clicks + 1), over all earlier
# impressions of the same user, plus comment counts in the trailing 24h.
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

user_key = df_final_sample["user_id"]
g = df_final_sample.groupby("user_id", group_keys=False)

# The shift that drops the current row is done per user BEFORE accumulating, so a
# user's first impression is NaN ("no history") rather than the previous user's totals.

# past clicks (all history, pre-impression)
num_click_user = g["is_click"].shift(1).groupby(user_key).cumsum()

# past comments (all history, pre-impression)
num_comment_user = g["is_comment"].shift(1).groupby(user_key).cumsum()

df_final_sample["comment_rate_user"] = (num_comment_user + 1) / (num_click_user + 1)

# 24h window (closed="left" excludes the current impression)
num_comment_user_24h = (
    g.rolling("24h", on="imp_timestamp", closed="left")["is_comment"]
    .sum()
    .reset_index(level=0, drop=True)
)

df_final_sample["num_comment_user_24h"] = num_comment_user_24h.to_numpy()
# NaN > 0 is False, so rows without history get has_comment_user_24h = 0
df_final_sample["has_comment_user_24h"] = (df_final_sample["num_comment_user_24h"] > 0).astype("int64")
# rows with no history (each user's first impression) are reported as 0
df_final_sample["num_comment_user_24h"] = df_final_sample["num_comment_user_24h"].fillna(0)
df_final_sample["comment_rate_user"] = df_final_sample["comment_rate_user"].fillna(0)

In [None]:
# check 6.1.7 user comment behavior features
feat_cols = [
    "comment_rate_user",
    "num_comment_user_24h",
    "has_comment_user_24h",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.1.8 User like behavior (denominator: clicks)
# like_rate_user = (prior likes + 1) / (prior clicks + 1), over all earlier
# impressions of the same user, plus like counts in the trailing 24h.
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

user_key = df_final_sample["user_id"]
g = df_final_sample.groupby("user_id", group_keys=False)

# The shift that drops the current row is done per user BEFORE accumulating, so a
# user's first impression is NaN ("no history") rather than the previous user's totals.

# past clicks (all history, pre-impression)
num_click_user = g["is_click"].shift(1).groupby(user_key).cumsum()

# past likes (all history, pre-impression)
num_like_user = g["is_like"].shift(1).groupby(user_key).cumsum()

df_final_sample["like_rate_user"] = (num_like_user + 1) / (num_click_user + 1)

# 24h window (closed="left" excludes the current impression)
num_like_user_24h = (
    g.rolling("24h", on="imp_timestamp", closed="left")["is_like"]
    .sum()
    .reset_index(level=0, drop=True)
)

df_final_sample["num_like_user_24h"] = num_like_user_24h.to_numpy()
# NaN > 0 is False, so rows without history get has_like_user_24h = 0
df_final_sample["has_like_user_24h"] = (df_final_sample["num_like_user_24h"] > 0).astype("int64")

# fill missing: rows with no history (each user's first impression) are reported as 0
df_final_sample["num_like_user_24h"] = df_final_sample["num_like_user_24h"].fillna(0)
df_final_sample["like_rate_user"] = df_final_sample["like_rate_user"].fillna(0)

In [None]:
# check 6.1.8 user like behavior features
feat_cols = [
    "like_rate_user",
    "num_like_user_24h",
    "has_like_user_24h",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
##### 6.1.9 User gift behavior (7d window)
# Gift count, gift amount and a has-gift flag over the user's trailing 7 days.

df_final_sample["imp_timestamp"] = pd.to_datetime(df_final_sample["imp_timestamp"], errors="coerce")
df_final_sample = df_final_sample.sort_values(["user_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("user_id", group_keys=False)

# 7d window: gift count + amount (closed="left" excludes the current impression)
gift_window = g.rolling("7d", on="imp_timestamp", closed="left")
num_gift_user_7d = gift_window["is_gift"].sum().reset_index(level=0, drop=True)
amount_gift_user_7d = gift_window["gift_price"].sum().reset_index(level=0, drop=True)

df_final_sample["num_gift_user_7d"] = num_gift_user_7d.to_numpy()
df_final_sample["amount_gift_user_7d"] = amount_gift_user_7d.to_numpy()
# NaN > 0 is False, so rows without history get has_gift_user_7d = 0
df_final_sample["has_gift_user_7d"] = df_final_sample["num_gift_user_7d"].gt(0).astype("int64")

# fill missing
gift_value_cols = ["num_gift_user_7d", "amount_gift_user_7d"]
df_final_sample[gift_value_cols] = df_final_sample[gift_value_cols].fillna(0)

In [None]:
# check 6.1.9 user gift behavior features
feat_cols = [
    "num_gift_user_7d",
    "amount_gift_user_7d",
    "has_gift_user_7d",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
# (rows, columns) of the sample after the 6.1 user features were added
df_final_sample.shape

In [None]:
# Names of every engineered user feature from sections 6.1.1-6.1.9, in section order.
user_feat_6_1 = (
    # 6.1.1 basic ages
    ["user_account_age", "user_watch_live_age"]
    # 6.1.2 CTR windows
    + ["ctr_user_15min", "ctr_user_3hr", "ctr_user_1d", "ctr_user_7d"]
    # 6.1.3 impression counts
    + [
        "num_imp_user_10min", "num_imp_user_30min", "num_imp_user_2hr",
        "num_imp_user_12hr", "num_imp_user_1d", "num_imp_user_7d",
    ]
    # 6.1.4 click counts + trend
    + [
        "num_click_user_15min", "num_click_user_3hr", "num_click_user_1d",
        "num_click_user_7d", "click_trend_user",
    ]
    # 6.1.5 recency
    + [
        "time_since_last_impression_user", "tsli_missing",
        "time_since_last_click_user", "tslc_missing",
        "consecutive_skips_user",
    ]
    # 6.1.6 dwell
    + [
        "avg_watch_time_user", "avg_watch_time_user_missing",
        "median_watch_time_user", "median_watch_time_user_missing",
        "pct_long_watch_user_30s",
    ]
    # 6.1.7 comments
    + ["comment_rate_user", "has_comment_user_24h", "num_comment_user_24h"]
    # 6.1.8 likes
    + ["like_rate_user", "has_like_user_24h", "num_like_user_24h"]
    # 6.1.9 gifts
    + ["has_gift_user_7d", "num_gift_user_7d", "amount_gift_user_7d"]
)

print("Total engineered features (6.1.1–6.1.9):", len(user_feat_6_1))


In [None]:
# Preview the identifying columns next to all 6.1 user features.
cols = ["user_id", "streamer_id", "live_id", "imp_timestamp", *user_feat_6_1]
display(df_final_sample[cols].head())


In [None]:
#### 6.2 Room (Live) Features
##### 6.2.1 Basic
			# time_since_live_start (ms) = imp_timestamp - start_timestamp
##### 6.2.2 Room CTR (pre-impression, denominator: impressions)
			# ctr_room_10min
			# ctr_room_30min
			# ctr_room_2hr
			# ctr_room_12hr
##### 6.2.3 Room exposure volume - imp
			# num_imp_room_10min
			# num_imp_room_30min
			# num_imp_room_2hr
			# num_imp_room_12hr
			# num_imp_room_1d
##### 6.2.4 Room click volume - click
			# num_click_room_10min
			# num_click_room_30min
			# num_click_room_2hr
			# num_click_room_12hr
			# num_click_room_1d
			# ctr_trend_room = log(ctr_room_10min + 1e-6) - log(ctr_room_2hr + 1e-6)
##### 6.2.5 Room freshness (leakage-safe)
			# time_since_start_live
			# time_since_start_live_bucket: (<5min, 5-20min, >20min)
##### 6.2.6 Room dwell / engagement quality (from past clicks only)
			# avg_watch_time_live
			# median_watch_time_live
			# watch_time_live_missing
			# avg_watch_time_live_30min
			# median_watch_time_live_30min
			# watch_time_live_30min_missing
			# pct_long_watch_live_60s_30min
##### 6.2.7 Room comment behavior (Denominator: impressions)
			# comment_rate_live
			# comment_rate_live_15min
			# comment_rate_live_1hr
			# comment_rate_live_3hr
			# num_comment_live
			# num_comment_live_15min
			# num_comment_live_1hr
			# num_comment_live_3hr
			# comment_trend_room = log(comment_rate_live_15min + 1e-6) - log(comment_rate_live_1hr + 1e-6)
##### 6.2.8 Room like behavior (Denominator: impressions)
			# like_rate_live
			# like_rate_live_15min
			# like_rate_live_1hr
			# like_rate_live_3hr
			# num_like_live
			# num_like_live_15min
			# num_like_live_1hr
			# num_like_live_3hr
			# like_trend_room = log(like_rate_live_15min + 1e-6) - log(like_rate_live_1hr + 1e-6)
##### 6.2.9 Room gift behavior (Denominator: impressions)
			# gift_rate_live
			# gift_rate_live_15min
			# gift_rate_live_1hr
			# gift_rate_live_3hr
			# num_gift_live
			# num_gift_live_15min
			# num_gift_live_1hr
			# num_gift_live_3hr
			# amount_gift_live
			# amount_gift_live_15min
			# amount_gift_live_1hr
			# amount_gift_live_3hr
			# gift_trend_room = log(log_amount_gift_room_15min + 1) - log(log_amount_gift_room_1hr + 1)

In [None]:
##### 6.2.1 Basic
# time_since_live_start: milliseconds elapsed from the live's start_timestamp
# to the impression.
since_start = df_final_sample["imp_timestamp"] - df_final_sample["start_timestamp"]
df_final_sample["time_since_live_start"] = since_start.dt.total_seconds().mul(1000)

In [None]:
# check 6.2.1 time_since_live_start
feat_cols = ["time_since_live_start"]
cols = ["user_id", "live_id", "streamer_id", "imp_timestamp", "start_timestamp", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.2.2 Room CTR (pre-impression, denominator: impressions)
# Stable sort by room then time, so the grouped rolling output lines up row-for-row
# with the frame when written back positionally via to_numpy().
df_final_sample = df_final_sample.sort_values(["live_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("live_id", group_keys=False)

# rolling window length -> output column
room_windows = {
    "10min": "ctr_room_10min",
    "30min": "ctr_room_30min",
    "2h": "ctr_room_2hr",
    "12h": "ctr_room_12hr",
}

for w, col in room_windows.items():
    # closed="left" keeps the current impression out of its own window
    prior_window = g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
    clicks = prior_window.sum().reset_index(level=0, drop=True)
    imps = prior_window.count().reset_index(level=0, drop=True)
    # an empty window (0 impressions) yields NaN, which is reported as CTR 0
    df_final_sample[col] = clicks.div(imps.where(imps != 0)).fillna(0).to_numpy()

In [None]:
# check 6.2.2 room CTR features
feat_cols = [
    "ctr_room_10min",
    "ctr_room_30min",
    "ctr_room_2hr",
    "ctr_room_12hr",
]
cols = ["user_id", "live_id", "streamer_id", *feat_cols]

display(df_final_sample[cols].head())

# missingness
feat_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({"missing_count": feat_na.sum(), "missing_pct": feat_na.mean().mul(100)})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
##### 6.2.3 Room exposure volume - imp
# Number of earlier impressions of the same live room inside each trailing window.
df_final_sample = df_final_sample.sort_values(["live_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("live_id", group_keys=False)

# rolling window length -> output column
room_imp_windows = {
    "10min": "num_imp_room_10min",
    "30min": "num_imp_room_30min",
    "2h": "num_imp_room_2hr",
    "12h": "num_imp_room_12hr",
    "1d": "num_imp_room_1d",
}

for w, col in room_imp_windows.items():
    # counting the is_click column counts impressions; closed="left" excludes the current row
    df_final_sample[col] = (
        g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
        .count()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )


# for these newly added features, replace missing values with 0
room_imp_cols = [
    "num_imp_room_10min",
    "num_imp_room_30min",
    "num_imp_room_2hr",
    "num_imp_room_12hr",
    "num_imp_room_1d",
]

df_final_sample[room_imp_cols] = df_final_sample[room_imp_cols].fillna(0)


In [None]:
# check 6.2.3 room exposure volume features
feat_cols = [
    "num_imp_room_10min",
    "num_imp_room_30min",
    "num_imp_room_2hr",
    "num_imp_room_12hr",
    "num_imp_room_1d",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.2.4 Room click volume - click
# Time-based rolling windows need each room's rows in chronological order.
df_final_sample = (
    df_final_sample
    .sort_values(["live_id", "imp_timestamp"], kind="mergesort")
    .reset_index(drop=True)
)

g = df_final_sample.groupby("live_id", group_keys=False)

# rolling window length -> name of the click-count column it produces
room_click_windows = {
    "10min": "num_click_room_10min",
    "30min": "num_click_room_30min",
    "2h": "num_click_room_2hr",
    "12h": "num_click_room_12hr",
    "1d": "num_click_room_1d",
}

for w, col in room_click_windows.items():
    # Sum of is_click over rows strictly before the current timestamp.
    past_rows = g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
    clicks = past_rows.sum().reset_index(level=0, drop=True)
    df_final_sample[col] = clicks.to_numpy()

# ctr trend: log-ratio of short-window vs long-window CTR; the small epsilon
# keeps log() finite when a CTR is exactly 0.
log_ctr_short = np.log(df_final_sample["ctr_room_10min"] + 1e-6)
log_ctr_long = np.log(df_final_sample["ctr_room_2hr"] + 1e-6)
df_final_sample["ctr_trend_room"] = log_ctr_short - log_ctr_long

# fill missing
click_feat_cols = list(room_click_windows.values()) + ["ctr_trend_room"]
df_final_sample[click_feat_cols] = df_final_sample[click_feat_cols].fillna(0)

In [None]:
# check 6.2.4 room click volume features
feat_cols = [
    "num_click_room_10min",
    "num_click_room_30min",
    "num_click_room_2hr",
    "num_click_room_12hr",
    "num_click_room_1d",
    "ctr_trend_room",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
##### 6.2.5 Room freshness (leakage-safe)
# Unparseable start times become NaT instead of raising.
df_final_sample["start_timestamp"] = pd.to_datetime(
    df_final_sample["start_timestamp"], errors="coerce"
)

# time since live start (ms)
elapsed = df_final_sample["imp_timestamp"] - df_final_sample["start_timestamp"]
df_final_sample["time_since_start_live"] = elapsed.dt.total_seconds().mul(1000)

# buckets based on minutes: <5min, 5-20min, >20min
mins = df_final_sample["time_since_start_live"] / (60 * 1000)
bucket = pd.cut(
    mins,
    bins=[-float("inf"), 5, 20, float("inf")],
    labels=["<5min", "5-20min", ">20min"],
)

# label encoding for time_since_start_live_bucket: integer category codes in
# label order; rows with a missing elapsed time get code -1.
df_final_sample["time_since_start_live_bucket"] = bucket.cat.codes

In [None]:
# check 6.2.5 room freshness features
feat_cols = ["time_since_start_live"]
# identifier columns, the raw feature, and its encoded bucket
cols = ["user_id", "live_id", "streamer_id"] + feat_cols + ["time_since_start_live_bucket"]

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# bucket distribution
bucket_counts = df_final_sample["time_since_start_live_bucket"].value_counts(dropna=False)
display(bucket_counts)


In [None]:
##### 6.2.6 Room dwell / engagement quality (from past clicks only)
# Rolling/expanding statistics below need each room's rows in chronological order.
df_final_sample = df_final_sample.sort_values(["live_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

# watch time only on clicks (non-click rows become NaN, so mean/median/count skip them)
df_final_sample["_watch_on_click"] = df_final_sample["watch_live_time"].where(df_final_sample["is_click"] == 1)

g = df_final_sample.groupby("live_id", group_keys=False)

def _past_only(expanding_stat):
    """Lag a per-room expanding statistic by one row inside each room.

    ``expanding_stat`` is the result of ``g[col].expanding().<agg>()``, whose
    first index level is ``live_id``. Shifting per ``live_id`` keeps the last
    value of one room from spilling into the first row of the next room, which
    a plain ``.shift(1)`` over the concatenated result would do.
    Returns the lagged Series with the ``live_id`` level dropped.
    """
    return expanding_stat.groupby(level=0).shift(1).reset_index(level=0, drop=True)

# all-history (past only); the first row of every room has no history -> NaN
df_final_sample["avg_watch_time_live"] = _past_only(g["_watch_on_click"].expanding().mean())
df_final_sample["median_watch_time_live"] = _past_only(g["_watch_on_click"].expanding().median())

# 30min rolling (past only): closed="left" excludes the row's own timestamp
df_final_sample["avg_watch_time_live_30min"] = (
    g.rolling("30min", on="imp_timestamp", closed="left")["_watch_on_click"]
    .mean()
    .reset_index(level=0, drop=True)
    .to_numpy()
)

df_final_sample["median_watch_time_live_30min"] = (
    g.rolling("30min", on="imp_timestamp", closed="left")["_watch_on_click"]
    .median()
    .reset_index(level=0, drop=True)
    .to_numpy()
)


# pct_long_watch_live_60s_30min (past only)
# numerator: past clicks in the window with watch time >= 60000
# (presumably milliseconds, i.e. 60s -- confirm the unit of watch_live_time)
past_long_30 = (
    g.rolling("30min", on="imp_timestamp", closed="left")["_watch_on_click"]
    .apply(lambda s: (s >= 60000).sum(), raw=True)
    .reset_index(level=0, drop=True)
    .to_numpy()
)

# denominator: past clicks in the window (non-NaN watch times)
past_clicks_30 = (
    g.rolling("30min", on="imp_timestamp", closed="left")["_watch_on_click"]
    .count()
    .reset_index(level=0, drop=True)
    .to_numpy()
)

# zero past clicks -> NaN ratio -> reported as 0
df_final_sample["pct_long_watch_live_60s_30min"] = (
    past_long_30 / np.where(past_clicks_30 == 0, np.nan, past_clicks_30)
)
df_final_sample["pct_long_watch_live_60s_30min"] = df_final_sample["pct_long_watch_live_60s_30min"].fillna(0)


# missing flags for watch-time stats (avg/median share one flag per window);
# computed before the fill below so "no past click" stays distinguishable from 0
df_final_sample["watch_time_live_missing"] = (
    df_final_sample[["avg_watch_time_live", "median_watch_time_live"]].isna().any(axis=1).astype("int64")
)
df_final_sample["watch_time_live_30min_missing"] = (
    df_final_sample[["avg_watch_time_live_30min", "median_watch_time_live_30min"]].isna().any(axis=1).astype("int64")
)


# fill NaNs with 0
fill_cols = [
    "avg_watch_time_live",
    "median_watch_time_live",
    "avg_watch_time_live_30min",
    "median_watch_time_live_30min",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)

# cleanup helper
df_final_sample = df_final_sample.drop(columns=["_watch_on_click"])

In [None]:
# check 6.2.6 room dwell / engagement quality features
feat_cols = [
    "avg_watch_time_live",
    "median_watch_time_live",
    "avg_watch_time_live_30min",
    "median_watch_time_live_30min",
    "pct_long_watch_live_60s_30min",
    "watch_time_live_missing",
    "watch_time_live_30min_missing",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
##### 6.2.7 Room comment behavior (denominator: impressions)
# Rolling/expanding statistics below need each room's rows in chronological order.
df_final_sample = df_final_sample.sort_values(["live_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("live_id", group_keys=False)

def _past_only(expanding_stat):
    """Lag a per-room expanding statistic by one row inside each room.

    ``expanding_stat`` is the result of ``g[col].expanding().<agg>()``, whose
    first index level is ``live_id``. Shifting per ``live_id`` keeps the last
    value of one room from spilling into the first row of the next room, which
    a plain ``.shift(1)`` over the concatenated result would do.
    Returns the lagged Series with the ``live_id`` level dropped.
    """
    return expanding_stat.groupby(level=0).shift(1).reset_index(level=0, drop=True)

# all-history (past only); the first row of every room has no history -> NaN
num_comment_live = _past_only(g["is_comment"].expanding().sum())
# non-null is_comment rows seen so far, used as the impression count
num_imp_live = _past_only(g["is_comment"].expanding().count())
df_final_sample["num_comment_live"] = num_comment_live.to_numpy()
df_final_sample["comment_rate_live"] = (num_comment_live / num_imp_live.replace(0, np.nan)).fillna(0)

# rolling windows (past only): closed="left" excludes the row's own timestamp
def _roll_sum(w):
    """Sum of ``is_comment`` per room over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_comment"]
        .sum()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

def _roll_cnt(w):
    """Count of non-null ``is_comment`` rows per room over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_comment"]
        .count()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

df_final_sample["num_comment_live_15min"] = _roll_sum("15min")
df_final_sample["num_comment_live_1hr"]  = _roll_sum("1h")
df_final_sample["num_comment_live_3hr"]  = _roll_sum("3h")

imp_15 = _roll_cnt("15min")
imp_1h = _roll_cnt("1h")
imp_3h = _roll_cnt("3h")

# zero impressions in the window -> NaN rate (filled with 0 below)
df_final_sample["comment_rate_live_15min"] = (df_final_sample["num_comment_live_15min"] / np.where(imp_15 == 0, np.nan, imp_15))
df_final_sample["comment_rate_live_1hr"]   = (df_final_sample["num_comment_live_1hr"]  / np.where(imp_1h == 0, np.nan, imp_1h))
df_final_sample["comment_rate_live_3hr"]   = (df_final_sample["num_comment_live_3hr"]  / np.where(imp_3h == 0, np.nan, imp_3h))

# trend: log-ratio of the 15min rate to the 1h rate; epsilon keeps log() finite at rate 0
df_final_sample["comment_trend_room"] = (
    np.log(df_final_sample["comment_rate_live_15min"] + 1e-6)
    - np.log(df_final_sample["comment_rate_live_1hr"] + 1e-6)
)

# fill missing
fill_cols = [
    "num_comment_live", "num_comment_live_15min", "num_comment_live_1hr", "num_comment_live_3hr",
    "comment_rate_live", "comment_rate_live_15min", "comment_rate_live_1hr", "comment_rate_live_3hr",
    "comment_trend_room",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)

In [None]:
# check 6.2.7 room comment behavior features
feat_cols = [
    "num_comment_live",
    "num_comment_live_15min",
    "num_comment_live_1hr",
    "num_comment_live_3hr",
    "comment_rate_live",
    "comment_rate_live_15min",
    "comment_rate_live_1hr",
    "comment_rate_live_3hr",
    "comment_trend_room",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
##### 6.2.8 Room like behavior (denominator: impressions)
# Rolling/expanding statistics below need each room's rows in chronological order.
df_final_sample = df_final_sample.sort_values(["live_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("live_id", group_keys=False)

def _past_only(expanding_stat):
    """Lag a per-room expanding statistic by one row inside each room.

    ``expanding_stat`` is the result of ``g[col].expanding().<agg>()``, whose
    first index level is ``live_id``. Shifting per ``live_id`` keeps the last
    value of one room from spilling into the first row of the next room, which
    a plain ``.shift(1)`` over the concatenated result would do.
    Returns the lagged Series with the ``live_id`` level dropped.
    """
    return expanding_stat.groupby(level=0).shift(1).reset_index(level=0, drop=True)

# all-history (past only); the first row of every room has no history -> NaN
num_like_live = _past_only(g["is_like"].expanding().sum())
# non-null is_like rows seen so far, used as the impression count
num_imp_live = _past_only(g["is_like"].expanding().count())
df_final_sample["num_like_live"] = num_like_live.to_numpy()
df_final_sample["like_rate_live"] = (num_like_live / num_imp_live.replace(0, np.nan)).fillna(0)

# rolling windows (past only): closed="left" excludes the row's own timestamp
def _roll_sum(w):
    """Sum of ``is_like`` per room over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_like"]
        .sum()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

def _roll_cnt(w):
    """Count of non-null ``is_like`` rows per room over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_like"]
        .count()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

df_final_sample["num_like_live_15min"] = _roll_sum("15min")
df_final_sample["num_like_live_1hr"]  = _roll_sum("1h")
df_final_sample["num_like_live_3hr"]  = _roll_sum("3h")

imp_15 = _roll_cnt("15min")
imp_1h = _roll_cnt("1h")
imp_3h = _roll_cnt("3h")

# zero impressions in the window -> NaN rate (filled with 0 below)
df_final_sample["like_rate_live_15min"] = (df_final_sample["num_like_live_15min"] / np.where(imp_15 == 0, np.nan, imp_15))
df_final_sample["like_rate_live_1hr"]   = (df_final_sample["num_like_live_1hr"]  / np.where(imp_1h == 0, np.nan, imp_1h))
df_final_sample["like_rate_live_3hr"]   = (df_final_sample["num_like_live_3hr"]  / np.where(imp_3h == 0, np.nan, imp_3h))

# trend: log-ratio of the 15min rate to the 1h rate; epsilon keeps log() finite at rate 0
df_final_sample["like_trend_room"] = (
    np.log(df_final_sample["like_rate_live_15min"] + 1e-6)
    - np.log(df_final_sample["like_rate_live_1hr"] + 1e-6)
)

# fill missing
fill_cols = [
    "num_like_live", "num_like_live_15min", "num_like_live_1hr", "num_like_live_3hr",
    "like_rate_live", "like_rate_live_15min", "like_rate_live_1hr", "like_rate_live_3hr",
    "like_trend_room",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)

In [None]:
# check 6.2.8 room like behavior features
feat_cols = [
    "num_like_live",
    "num_like_live_15min",
    "num_like_live_1hr",
    "num_like_live_3hr",
    "like_rate_live",
    "like_rate_live_15min",
    "like_rate_live_1hr",
    "like_rate_live_3hr",
    "like_trend_room",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
##### 6.2.9 Room gift behavior (denominator: impressions)
# Rolling/expanding statistics below need each room's rows in chronological order.
df_final_sample = df_final_sample.sort_values(["live_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

g = df_final_sample.groupby("live_id", group_keys=False)

def _past_only(expanding_stat):
    """Lag a per-room expanding statistic by one row inside each room.

    ``expanding_stat`` is the result of ``g[col].expanding().<agg>()``, whose
    first index level is ``live_id``. Shifting per ``live_id`` keeps the last
    value of one room from spilling into the first row of the next room, which
    a plain ``.shift(1)`` over the concatenated result would do.
    Returns the lagged Series with the ``live_id`` level dropped.
    """
    return expanding_stat.groupby(level=0).shift(1).reset_index(level=0, drop=True)

# all-history (past only); the first row of every room has no history -> NaN
num_gift_live = _past_only(g["is_gift"].expanding().sum())
amount_gift_live = _past_only(g["gift_price"].expanding().sum())
# non-null is_gift rows seen so far, used as the impression count
num_imp_live = _past_only(g["is_gift"].expanding().count())

df_final_sample["num_gift_live"] = num_gift_live.to_numpy()
df_final_sample["amount_gift_live"] = amount_gift_live.to_numpy()
df_final_sample["gift_rate_live"] = (num_gift_live / num_imp_live.replace(0, np.nan)).fillna(0)

# rolling windows (past only): closed="left" excludes the row's own timestamp
def _roll_sum(col, w):
    """Sum of ``col`` per room over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")[col]
        .sum()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

def _roll_cnt(w):
    """Count of non-null ``is_gift`` rows per room over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_gift"]
        .count()
        .reset_index(level=0, drop=True)
        .to_numpy()
    )

df_final_sample["num_gift_live_15min"] = _roll_sum("is_gift", "15min")
df_final_sample["num_gift_live_1hr"]  = _roll_sum("is_gift", "1h")
df_final_sample["num_gift_live_3hr"]  = _roll_sum("is_gift", "3h")

df_final_sample["amount_gift_live_15min"] = _roll_sum("gift_price", "15min")
df_final_sample["amount_gift_live_1hr"]  = _roll_sum("gift_price", "1h")
df_final_sample["amount_gift_live_3hr"]  = _roll_sum("gift_price", "3h")

imp_15 = _roll_cnt("15min")
imp_1h = _roll_cnt("1h")
imp_3h = _roll_cnt("3h")

# zero impressions in the window -> NaN rate (filled with 0 below)
df_final_sample["gift_rate_live_15min"] = (df_final_sample["num_gift_live_15min"] / np.where(imp_15 == 0, np.nan, imp_15))
df_final_sample["gift_rate_live_1hr"]   = (df_final_sample["num_gift_live_1hr"]  / np.where(imp_1h == 0, np.nan, imp_1h))
df_final_sample["gift_rate_live_3hr"]   = (df_final_sample["num_gift_live_3hr"]  / np.where(imp_3h == 0, np.nan, imp_3h))

# trend (log amount): difference of log(log1p(amount) + 1) between the 15min
# and 1h windows; both terms are 0 when the amount is 0
df_final_sample["gift_trend_room"] = (
    np.log(np.log1p(df_final_sample["amount_gift_live_15min"]) + 1)
    - np.log(np.log1p(df_final_sample["amount_gift_live_1hr"]) + 1)
)

# fill missing
fill_cols = [
    "num_gift_live", "amount_gift_live", "gift_rate_live",
    "num_gift_live_15min", "num_gift_live_1hr", "num_gift_live_3hr",
    "amount_gift_live_15min", "amount_gift_live_1hr", "amount_gift_live_3hr",
    "gift_rate_live_15min", "gift_rate_live_1hr", "gift_rate_live_3hr",
    "gift_trend_room",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)

In [None]:
# check 6.2.9 room gift behavior features
feat_cols = [
    "num_gift_live",
    "amount_gift_live",
    "gift_rate_live",
    "num_gift_live_15min",
    "num_gift_live_1hr",
    "num_gift_live_3hr",
    "amount_gift_live_15min",
    "amount_gift_live_1hr",
    "amount_gift_live_3hr",
    "gift_rate_live_15min",
    "gift_rate_live_1hr",
    "gift_rate_live_3hr",
    "gift_trend_room",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

In [None]:
# Engineered room-level features, keyed by the notebook section that builds them.
room_feat_sections_6_2 = {
    "6.2.1": ["time_since_live_start"],
    "6.2.2": ["ctr_room_10min", "ctr_room_30min", "ctr_room_2hr", "ctr_room_12hr"],
    "6.2.3": [
        "num_imp_room_10min", "num_imp_room_30min", "num_imp_room_2hr",
        "num_imp_room_12hr", "num_imp_room_1d",
    ],
    "6.2.4": [
        "num_click_room_10min", "num_click_room_30min", "num_click_room_2hr",
        "num_click_room_12hr", "num_click_room_1d", "ctr_trend_room",
    ],
    "6.2.5": ["time_since_start_live", "time_since_start_live_bucket"],
    "6.2.6": [
        "avg_watch_time_live", "median_watch_time_live",
        "avg_watch_time_live_30min", "median_watch_time_live_30min",
        "pct_long_watch_live_60s_30min",
        "watch_time_live_missing", "watch_time_live_30min_missing",
    ],
    "6.2.7": [
        "comment_rate_live", "comment_rate_live_15min", "comment_rate_live_1hr", "comment_rate_live_3hr",
        "num_comment_live", "num_comment_live_15min", "num_comment_live_1hr", "num_comment_live_3hr",
        "comment_trend_room",
    ],
    "6.2.8": [
        "like_rate_live", "like_rate_live_15min", "like_rate_live_1hr", "like_rate_live_3hr",
        "num_like_live", "num_like_live_15min", "num_like_live_1hr", "num_like_live_3hr",
        "like_trend_room",
    ],
    "6.2.9": [
        "gift_rate_live", "gift_rate_live_15min", "gift_rate_live_1hr", "gift_rate_live_3hr",
        "num_gift_live", "num_gift_live_15min", "num_gift_live_1hr", "num_gift_live_3hr",
        "amount_gift_live", "amount_gift_live_15min", "amount_gift_live_1hr", "amount_gift_live_3hr",
        "gift_trend_room",
    ],
}

# Flatten in section order (dicts preserve insertion order).
room_feat_6_2 = [
    feat for section_feats in room_feat_sections_6_2.values() for feat in section_feats
]

print("Total engineered room features (6.2.1–6.2.9):", len(room_feat_6_2))

In [None]:
# Wide preview: row identifiers followed by every 6.2 room feature.
id_cols = ["user_id", "streamer_id", "live_id", "imp_timestamp"]
cols = id_cols + room_feat_6_2

# Lift the column-count and width display limits for this preview only.
with pd.option_context("display.max_columns", None, "display.width", None):
    preview = df_final_sample[cols].head()
    display(preview)

In [None]:
# (rows, columns) of the working frame after the 6.2 room features were added
df_final_sample.shape

In [None]:
#### 6.3 Streamer Features
##### 6.3.1 Basic
			# streamer_account_age = imp_timestamp - streamer_reg_timestamp
			# streamer_live_age = imp_timestamp - first_live_timestamp
##### 6.3.2 Streamer CTR / volume
			# ctr_streamer_1d
			# ctr_streamer_7d
			# num_imp_streamer_7d
			# num_click_streamer_7d
			# num_lives_streamer_7d
##### 6.3.3 Streamer engagement quality - dwell time (from past clicks only)
			# avg_watch_time_streamer
			# median_watch_time_streamer
			# pct_long_watch_streamer_30s
##### 6.3.4 Streamer interaction volume
			# num_comment_streamer_7d
			# num_like_streamer_7d
			# amount_gift_streamer_7d

In [None]:
# 6.3.1 Streamer basic features
# streamer_account_age (days) = imp_timestamp - streamer_reg_timestamp
# streamer_live_age (days) = imp_timestamp - first_live_timestamp

seconds_per_day = 86400

# output age column -> reference timestamp it is measured from
streamer_age_sources = {
    "streamer_account_age": "streamer_reg_timestamp",
    "streamer_live_age": "first_live_timestamp",
}

# compute ages (in days)
for age_col, ref_col in streamer_age_sources.items():
    elapsed = df_final_sample["imp_timestamp"] - df_final_sample[ref_col]
    df_final_sample[age_col] = elapsed.dt.total_seconds().div(seconds_per_day)

In [None]:
# checks for 6.3.1 streamer basic features
feat_cols = [
    "streamer_account_age",
    "streamer_live_age",
]
# identifiers, the raw timestamps the ages derive from, then the ages themselves
cols = [
    "user_id",
    "live_id",
    "streamer_id",
    "imp_timestamp",
    "streamer_reg_timestamp",
    "first_live_timestamp",
] + feat_cols

display(df_final_sample[cols].head())

# missingness
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))

# sanity checks: negative ages (impression earlier than the reference timestamp)
neg_account_mask = df_final_sample["streamer_account_age"] < 0
neg_live_mask = df_final_sample["streamer_live_age"] < 0
neg_account = neg_account_mask.sum()
neg_live = neg_live_mask.sum()

print("negative streamer_account_age:", int(neg_account))
print("negative streamer_live_age:", int(neg_live))

# show a few problematic rows if any
if neg_account > 0 or neg_live > 0:
    bad = df_final_sample.loc[neg_account_mask | neg_live_mask, cols]
    display(bad.head(10))


In [None]:
# 6.3.2 Streamer CTR / volume (past-only, leakage-safe)
# Time-based rolling windows need each streamer's rows in chronological order.
df_final_sample = (
    df_final_sample
    .sort_values(["streamer_id", "imp_timestamp"], kind="mergesort")
    .reset_index(drop=True)
)

g = df_final_sample.groupby("streamer_id", group_keys=False)

def _past_window(w):
    """Per-streamer time window of length ``w`` that excludes the row's own timestamp."""
    return g.rolling(w, on="imp_timestamp", closed="left")

# helper: rolling counts with time windows (past only)
def _roll_cnt(w):
    """Count of non-null ``is_click`` rows per streamer over window ``w``, as a NumPy array."""
    counts = _past_window(w)["is_click"].count()
    return counts.reset_index(level=0, drop=True).to_numpy()

def _roll_sum(w):
    """Sum of ``is_click`` per streamer over window ``w``, as a NumPy array."""
    totals = _past_window(w)["is_click"].sum()
    return totals.reset_index(level=0, drop=True).to_numpy()

# 7d window
imp_7d = _roll_cnt("7d")
click_7d = _roll_sum("7d")
# zero impressions -> NaN denominator, so the CTR is NaN until filled below
safe_imp_7d = np.where(imp_7d == 0, np.nan, imp_7d)

df_final_sample["num_imp_streamer_7d"] = imp_7d
df_final_sample["num_click_streamer_7d"] = click_7d
df_final_sample["ctr_streamer_7d"] = click_7d / safe_imp_7d

# 1d window
imp_1d = _roll_cnt("1d")
click_1d = _roll_sum("1d")
safe_imp_1d = np.where(imp_1d == 0, np.nan, imp_1d)

df_final_sample["ctr_streamer_1d"] = click_1d / safe_imp_1d

# num_lives_streamer_7d: unique live_id in past 7d
lives_7d = _past_window("7d")["live_id"].apply(pd.Series.nunique, raw=False)
df_final_sample["num_lives_streamer_7d"] = lives_7d.reset_index(level=0, drop=True).to_numpy()

# fill missing with 0
fill_cols = [
    "num_imp_streamer_7d",
    "num_click_streamer_7d",
    "ctr_streamer_7d",
    "ctr_streamer_1d",
    "num_lives_streamer_7d",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)


In [None]:
# check 6.3.2 streamer CTR / volume features
feat_cols = [
    "num_imp_streamer_7d",
    "num_click_streamer_7d",
    "ctr_streamer_7d",
    "ctr_streamer_1d",
    "num_lives_streamer_7d",
]
# identifier columns and the impression time first, then the features
cols = ["user_id", "live_id", "streamer_id", "imp_timestamp"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# 6.3.3 Streamer engagement quality - dwell time (past clicks only)
# Expanding statistics below need each streamer's rows in chronological order.
df_final_sample = df_final_sample.sort_values(["streamer_id", "imp_timestamp"], kind="mergesort").reset_index(drop=True)

# watch time only on clicks (non-click rows become NaN, so mean/median/count skip them)
df_final_sample["_watch_on_click"] = df_final_sample["watch_live_time"].where(df_final_sample["is_click"] == 1)

g = df_final_sample.groupby("streamer_id", group_keys=False)

def _past_only(expanding_stat):
    """Lag a per-streamer expanding statistic by one row inside each streamer.

    ``expanding_stat`` is the result of ``g[col].expanding().<agg>()``, whose
    first index level is ``streamer_id``. Shifting per ``streamer_id`` keeps
    the last value of one streamer from spilling into the first row of the
    next streamer, which a plain ``.shift(1)`` over the concatenated result
    would do. Returns the lagged Series with the ``streamer_id`` level dropped.
    """
    return expanding_stat.groupby(level=0).shift(1).reset_index(level=0, drop=True)

# all-history (past only); the first row of every streamer has no history -> NaN
df_final_sample["avg_watch_time_streamer"] = _past_only(g["_watch_on_click"].expanding().mean())
df_final_sample["median_watch_time_streamer"] = _past_only(g["_watch_on_click"].expanding().median())

# pct_long_watch_streamer_30s (past only)
# numerator: past clicks with watch time >= 30000
# (presumably milliseconds, i.e. 30s -- confirm the unit of watch_live_time)
past_long = _past_only(
    g["_watch_on_click"].expanding().apply(lambda s: (s >= 30000).sum(), raw=True)
)
# denominator: past clicks (non-NaN watch times)
past_clicks = _past_only(g["_watch_on_click"].expanding().count())
df_final_sample["pct_long_watch_streamer_30s"] = (
    past_long / np.where(past_clicks == 0, np.nan, past_clicks)
)

# missing flags: computed before the fill below so "no past click" stays
# distinguishable from a genuine 0
df_final_sample["watch_time_streamer_missing"] = (
    df_final_sample[["avg_watch_time_streamer", "median_watch_time_streamer"]].isna().any(axis=1).astype("int64")
)

# fill NaNs with 0 (paired with missing flags)
fill_cols = [
    "avg_watch_time_streamer",
    "median_watch_time_streamer",
    "pct_long_watch_streamer_30s",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)

# cleanup helper
df_final_sample = df_final_sample.drop(columns=["_watch_on_click"])


In [None]:
# check 6.3.3 streamer engagement quality features
feat_cols = [
    "avg_watch_time_streamer",
    "median_watch_time_streamer",
    "pct_long_watch_streamer_30s",
    "watch_time_streamer_missing",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# 6.3.4 Streamer interaction volume (past-only, 7d)
# Time-based rolling windows need each streamer's rows in chronological order.
df_final_sample = (
    df_final_sample
    .sort_values(["streamer_id", "imp_timestamp"], kind="mergesort")
    .reset_index(drop=True)
)

g = df_final_sample.groupby("streamer_id", group_keys=False)

def _roll_sum(col, w):
    """Sum of ``col`` per streamer over window ``w`` (own timestamp excluded), as a NumPy array."""
    window = g.rolling(w, on="imp_timestamp", closed="left")[col]
    return window.sum().reset_index(level=0, drop=True).to_numpy()

# output feature -> source column summed over the past 7 days
streamer_7d_sources = {
    "num_comment_streamer_7d": "is_comment",
    "num_like_streamer_7d": "is_like",
    "amount_gift_streamer_7d": "gift_price",
}

for out_col, src_col in streamer_7d_sources.items():
    df_final_sample[out_col] = _roll_sum(src_col, "7d")

# windows with no prior rows produce NaN sums; report them as 0
fill_cols = list(streamer_7d_sources)
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)


In [None]:
# check 6.3.4 streamer interaction volume features
feat_cols = [
    "num_comment_streamer_7d",
    "num_like_streamer_7d",
    "amount_gift_streamer_7d",
]
# identifier columns first, then the features under inspection
cols = ["user_id", "live_id", "streamer_id"] + feat_cols

display(df_final_sample[cols].head())

# per-feature NaN count and NaN percentage
is_na = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

# distribution summary including tail percentiles
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# 6.3 feature list (6.3.1–6.3.4)
streamer_feat_sections_6_3 = {
    "6.3.1": ["streamer_account_age", "streamer_live_age"],
    "6.3.2": [
        "ctr_streamer_1d",
        "ctr_streamer_7d",
        "num_imp_streamer_7d",
        "num_click_streamer_7d",
        "num_lives_streamer_7d",
    ],
    "6.3.3": [
        "avg_watch_time_streamer",
        "median_watch_time_streamer",
        "pct_long_watch_streamer_30s",
        "watch_time_streamer_missing",
    ],
    "6.3.4": [
        "num_comment_streamer_7d",
        "num_like_streamer_7d",
        "amount_gift_streamer_7d",
    ],
}
# Flatten in section order (dicts preserve insertion order).
streamer_feat_6_3 = [
    feat for section_feats in streamer_feat_sections_6_3.values() for feat in section_feats
]

print("Total engineered streamer features (6.3.1–6.3.4):", len(streamer_feat_6_3))

# handle dropped/missing columns: split the list by whether the frame has the column
existing_cols = set(df_final_sample.columns)
missing_cols = [c for c in streamer_feat_6_3 if c not in existing_cols]
present_cols = [c for c in streamer_feat_6_3 if c in existing_cols]

print("Missing columns:", missing_cols)

# preview
cols = ["user_id", "live_id", "streamer_id", "imp_timestamp"] + present_cols
with pd.option_context("display.max_columns", None, "display.width", None):
    display(df_final_sample[cols].head())

# missingness + summary
is_na = df_final_sample[present_cols].isna()
missing = pd.DataFrame({
    "missing_count": is_na.sum(),
    "missing_pct": is_na.mean().mul(100),
})
display(missing)

display(df_final_sample[present_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# Wide preview: row identifiers followed by every 6.3 streamer feature.
id_cols = ["user_id", "streamer_id", "live_id", "imp_timestamp"]
cols = id_cols + streamer_feat_6_3

# Lift the column-count and width display limits for this preview only.
with pd.option_context("display.max_columns", None, "display.width", None):
    preview = df_final_sample[cols].head()
    display(preview)

# (rows, columns) of the identifier + streamer-feature slice
df_final_sample[cols].shape

In [None]:
# (rows, columns) of the working frame after the 6.3 streamer features were added
df_final_sample.shape

In [None]:
#### 6.4 Cross Features (High ROI)
##### 6.4.1 User x streamer
			# ctr_user_streamer_7d
			# num_click_user_streamer_7d
			# num_imp_user_streamer_7d
			# time_since_last_impression_user_streamer
			# time_since_last_click_user_streamer
##### 6.4.2 User x category
			# ctr_user_category_7d
			# num_click_user_category_7d
			# num_imp_user_category_7d

In [None]:
# 6.4.1 User x Streamer features (past-only, leakage-safe)
# Rolling windows and row shifts below need each (user, streamer) pair's rows
# in chronological order.
df_final_sample = df_final_sample.sort_values(
    ["user_id", "streamer_id", "imp_timestamp"], kind="mergesort"
).reset_index(drop=True)

g = df_final_sample.groupby(["user_id", "streamer_id"], group_keys=False)

# helper: rolling counts with time windows (past only)
def _roll_cnt(w):
    """Count of non-null ``is_click`` rows per (user, streamer) over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
        .count()
        .reset_index(level=[0, 1], drop=True)
        .to_numpy()
    )

def _roll_sum(w):
    """Sum of ``is_click`` per (user, streamer) over window ``w``, as a NumPy array."""
    return (
        g.rolling(w, on="imp_timestamp", closed="left")["is_click"]
        .sum()
        .reset_index(level=[0, 1], drop=True)
        .to_numpy()
    )

# 7d window counts
imp_7d = _roll_cnt("7d")
click_7d = _roll_sum("7d")

df_final_sample["num_imp_user_streamer_7d"] = imp_7d
df_final_sample["num_click_user_streamer_7d"] = click_7d
# zero impressions in the window -> NaN CTR (filled with 0 below)
df_final_sample["ctr_user_streamer_7d"] = (click_7d / np.where(imp_7d == 0, np.nan, imp_7d))

# recency: time since last impression/click (in seconds)
last_imp = g["imp_timestamp"].shift(1)
df_final_sample["time_since_last_impression_user_streamer"] = (
    (df_final_sample["imp_timestamp"] - last_imp).dt.total_seconds()
)

# Timestamp of each row if it was a click, NaT otherwise.
click_ts = df_final_sample["imp_timestamp"].where(df_final_sample["is_click"] == 1)
pair_keys = [df_final_sample["user_id"], df_final_sample["streamer_id"]]
# Shift and forward-fill strictly inside each (user, streamer) pair: a global
# ffill/shift would carry the previous pair's last click into the next pair
# and yield bogus (even negative) recency values.
last_click_ts = click_ts.groupby(pair_keys).shift(1).groupby(pair_keys).ffill()
df_final_sample["time_since_last_click_user_streamer"] = (
    (df_final_sample["imp_timestamp"] - last_click_ts).dt.total_seconds()
)

# missing flags (first impression/click): set before the fill below so that
# "no earlier impression/click" stays distinguishable from a 0-second gap
df_final_sample["tsli_user_streamer_missing"] = df_final_sample["time_since_last_impression_user_streamer"].isna().astype("int64")
df_final_sample["tslc_user_streamer_missing"] = df_final_sample["time_since_last_click_user_streamer"].isna().astype("int64")

# fill NaNs with 0
fill_cols = [
    "num_imp_user_streamer_7d",
    "num_click_user_streamer_7d",
    "ctr_user_streamer_7d",
    "time_since_last_impression_user_streamer",
    "time_since_last_click_user_streamer",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)

In [None]:
# check 6.4.1 user x streamer features
# Engineered columns first; the preview adds the identifying keys in front.
feat_cols = [
    "num_imp_user_streamer_7d",
    "num_click_user_streamer_7d",
    "ctr_user_streamer_7d",
    "time_since_last_impression_user_streamer",
    "time_since_last_click_user_streamer",
    "tsli_user_streamer_missing",
    "tslc_user_streamer_missing",
]
cols = ["user_id", "streamer_id", "live_id", "imp_timestamp"] + feat_cols

display(df_final_sample[cols].head())

# Missingness per feature: absolute count and percentage of rows.
na_mask = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": na_mask.sum(),
    "missing_pct": na_mask.mean().mul(100),
})
display(missing)

# Distribution summary including the tail percentiles.
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# 6.4.2 User x Category features (past-only, 7d)
cat_col = "live_content_category_le"  # change if your category column name differs

# Stable sort so each (user, category) group is contiguous and chronological.
# The rolling results are written back positionally, so the order matters.
df_final_sample = df_final_sample.sort_values(
    ["user_id", cat_col, "imp_timestamp"], kind="mergesort"
).reset_index(drop=True)

g = df_final_sample.groupby(["user_id", cat_col], group_keys=False)

def _rolling_is_click(w):
    # Time window `w` over is_click; closed="left" leaves the current row out.
    return g.rolling(w, on="imp_timestamp", closed="left")["is_click"]

def _as_row_array(rolled):
    # Drop the two group-key index levels and return a plain positional array.
    return rolled.reset_index(level=[0, 1], drop=True).to_numpy()

def _roll_cnt(w):
    return _as_row_array(_rolling_is_click(w).count())

def _roll_sum(w):
    return _as_row_array(_rolling_is_click(w).sum())

imp_7d = _roll_cnt("7d")
click_7d = _roll_sum("7d")

df_final_sample["num_imp_user_category_7d"] = imp_7d
df_final_sample["num_click_user_category_7d"] = click_7d
# Zero impressions become a NaN denominator; the NaN CTR is filled below.
safe_imp_7d = np.where(imp_7d == 0, np.nan, imp_7d)
df_final_sample["ctr_user_category_7d"] = click_7d / safe_imp_7d

# fill missing with 0
fill_cols = [
    "num_imp_user_category_7d",
    "num_click_user_category_7d",
    "ctr_user_category_7d",
]
df_final_sample[fill_cols] = df_final_sample[fill_cols].fillna(0)


In [None]:
# check 6.4.2 user x category features
cat_col = "live_content_category_le"  # change if needed

# Engineered columns first; the preview adds the keys and the category in front.
feat_cols = [
    "num_imp_user_category_7d",
    "num_click_user_category_7d",
    "ctr_user_category_7d",
]
cols = ["user_id", "streamer_id", "live_id", "imp_timestamp", cat_col] + feat_cols

display(df_final_sample[cols].head())

# Missingness per feature: absolute count and percentage of rows.
na_mask = df_final_sample[feat_cols].isna()
missing = pd.DataFrame({
    "missing_count": na_mask.sum(),
    "missing_pct": na_mask.mean().mul(100),
})
display(missing)

# Distribution summary including the tail percentiles.
display(df_final_sample[feat_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# check 6.4.2 user x category features
# (runs the same preview / missingness / summary checks as the preceding cell)
cat_col = "live_content_category_le"  # change if needed

feat_cols = [
    "num_imp_user_category_7d",
    "num_click_user_category_7d",
    "ctr_user_category_7d",
]
id_cols = ["user_id", "streamer_id", "live_id", "imp_timestamp", cat_col]
cols = id_cols + feat_cols

display(df_final_sample[cols].head())

# Null count and null share (in percent) for each engineered column.
feature_frame = df_final_sample[feat_cols]
missing = pd.DataFrame({
    "missing_count": feature_frame.isna().sum(),
    "missing_pct": feature_frame.isna().mean().mul(100),
})
display(missing)

display(feature_frame.describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# 6.4.1–6.4.2 feature list
user_x_feat_6_4 = (
    # 6.4.1 user x streamer
    [
        "num_imp_user_streamer_7d",
        "num_click_user_streamer_7d",
        "ctr_user_streamer_7d",
        "time_since_last_impression_user_streamer",
        "time_since_last_click_user_streamer",
        "tsli_user_streamer_missing",
        "tslc_user_streamer_missing",
    ]
    # 6.4.2 user x category
    + [
        "num_imp_user_category_7d",
        "num_click_user_category_7d",
        "ctr_user_category_7d",
    ]
)

print("Total engineered user-x features (6.4.1–6.4.2):", len(user_x_feat_6_4))

# Split the expected names by whether the column exists in the frame.
present_cols, missing_cols = [], []
for feat in user_x_feat_6_4:
    bucket = present_cols if feat in df_final_sample.columns else missing_cols
    bucket.append(feat)

print("Missing columns:", missing_cols)

# Preview keys plus the features that are present.
cols = ["user_id", "streamer_id", "live_id", "imp_timestamp"] + present_cols
with pd.option_context("display.max_columns", None, "display.width", None):
    display(df_final_sample[cols].head())

# Missingness per feature: absolute count and percentage of rows.
na_mask = df_final_sample[present_cols].isna()
missing = pd.DataFrame({
    "missing_count": na_mask.sum(),
    "missing_pct": na_mask.mean().mul(100),
})
display(missing)

display(df_final_sample[present_cols].describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]))


In [None]:
# Show the key columns together with every 6.4 cross feature, then report
# the size of that slice.
cols = ["user_id", "streamer_id", "live_id", "imp_timestamp"] + user_x_feat_6_4
preview = df_final_sample[cols]

with pd.option_context("display.max_columns", None, "display.width", None):
    display(preview.head())

preview.shape

In [None]:
# Sanity check: (rows, columns) of df_final_sample at this point.
df_final_sample.shape

In [None]:
# count features created in 6.1–6.4

# 6.1 User features (as defined in your notebook)
feat_6_1 = (
    ["user_account_age", "user_watch_live_age"]
    + ["ctr_user_15min", "ctr_user_3hr", "ctr_user_1d", "ctr_user_7d"]
    + [
        "num_imp_user_10min", "num_imp_user_30min", "num_imp_user_2hr",
        "num_imp_user_12hr", "num_imp_user_1d", "num_imp_user_7d",
    ]
    + ["num_click_user_15min", "num_click_user_3hr", "num_click_user_1d", "num_click_user_7d"]
    + ["click_trend_user"]
    + ["time_since_last_impression_user", "tsli_missing"]
    + ["time_since_last_click_user", "tslc_missing"]
    + ["consecutive_skips_user"]
    + ["avg_watch_time_user", "median_watch_time_user", "pct_long_watch_user_30s"]
    + ["comment_rate_user", "has_comment_user_24h", "num_comment_user_24h"]
    + ["like_rate_user", "has_like_user_24h", "num_like_user_24h"]
    + ["has_gift_user_7d", "num_gift_user_7d", "amount_gift_user_7d"]
)

# 6.2 Room features: reuse the list built in the earlier block.
feat_6_2 = room_feat_6_2

feat_6_3 = (
    # 6.3.1
    [
        "streamer_account_age",
        "streamer_live_age",
        "streamer_account_age_missing",
        "streamer_live_age_missing",
    ]
    # 6.3.2
    + [
        "ctr_streamer_1d",
        "ctr_streamer_7d",
        "num_imp_streamer_7d",
        "num_click_streamer_7d",
        "num_lives_streamer_7d",
    ]
    # 6.3.3
    + [
        "avg_watch_time_streamer",
        "median_watch_time_streamer",
        "pct_long_watch_streamer_30s",
        "watch_time_streamer_missing",
    ]
    # 6.3.4
    + [
        "num_comment_streamer_7d",
        "num_like_streamer_7d",
        "amount_gift_streamer_7d",
    ]
)

feat_6_4 = (
    # 6.4.1 user x streamer
    [
        "num_imp_user_streamer_7d",
        "num_click_user_streamer_7d",
        "ctr_user_streamer_7d",
        "time_since_last_impression_user_streamer",
        "time_since_last_click_user_streamer",
        "tsli_user_streamer_missing",
        "tslc_user_streamer_missing",
    ]
    # 6.4.2 user x category
    + [
        "num_imp_user_category_7d",
        "num_click_user_category_7d",
        "ctr_user_category_7d",
    ]
    # 6.4.3 user x room (dropped if removed)
    + [
        "num_imp_user_room_1d",
        "num_click_user_room_7d",
        "time_since_last_impression_user_room",
        "tsli_user_room_missing",
    ]
)

all_feats = feat_6_1 + feat_6_2 + feat_6_3 + feat_6_4

print("Total features (6.1–6.4):", len(all_feats))

# Split the full inventory by whether each column exists in the frame.
present, missing = [], []
for feat in all_feats:
    bucket = present if feat in df_final_sample.columns else missing
    bucket.append(feat)

print("Present:", len(present))
print("Missing:", len(missing))
print("Missing columns:", missing)



In [None]:
# Work on a copy: the transformation cells below overwrite columns of
# df_final_sample_transformed, leaving df_final_sample itself untouched.
df_final_sample_transformed = df_final_sample.copy()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def explore_numeric_feature(df, col, bins=50, iqr_k=1.5, clip_q=None):
    """
    Explore one numeric feature:
    1) distribution + skewness (printed summary, histogram and boxplot)
    2) outliers (IQR rule, printed bounds and count)

    Problems (missing column, non-numeric column, all-NaN column) are
    reported with a printed message and an early return; nothing is raised.

    Params:
      df: DataFrame
      col: column name
      bins: histogram bins
      iqr_k: multiplier for IQR outlier bounds
      clip_q: optional quantile clip for plotting only (e.g., 0.995).
              Interpreted symmetrically: 0.995 and 0.005 both clip the
              plotted values to the [0.5%, 99.5%] quantile range.

    Returns:
      None
    """
    if col not in df.columns:
        print(f"[ERROR] Column not found: {col}")
        return
    if not pd.api.types.is_numeric_dtype(df[col]):
        print(f"[ERROR] Column is not numeric: {col}")
        return

    # All statistics below ignore missing values.
    s = df[col].dropna()
    if s.empty:
        print(f"[WARN] Column is all NaN: {col}")
        return

    # basic stats
    skew = s.skew()
    summary = s.describe(percentiles=[0.01, 0.05, 0.5, 0.95, 0.99])

    print(f"Feature: {col}")
    print(f"Count (non-null): {len(s)}")
    print(f"Skewness: {skew:.6f}")
    display(summary)

    # optional clipping for plotting only
    plot_s = s
    if clip_q is not None:
        # Order the two tail quantiles explicitly. Using quantile(clip_q) for
        # both bounds when clip_q <= 0.5 would collapse the plot to one value.
        lo_q, hi_q = sorted((clip_q, 1 - clip_q))
        plot_s = s.clip(lower=s.quantile(lo_q), upper=s.quantile(hi_q))

    # plots
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].hist(plot_s, bins=bins)
    axes[0].set_title(f"{col} Histogram")
    axes[1].boxplot(plot_s, vert=False, showfliers=True)
    axes[1].set_title(f"{col} Boxplot")
    plt.tight_layout()
    plt.show()

    # outlier check (IQR) — always on the unclipped values
    q1, q3 = np.percentile(s, [25, 75])
    iqr = q3 - q1
    if iqr == 0:
        print("Outlier check: IQR = 0 (no outliers by IQR rule).")
        return

    lo = q1 - iqr_k * iqr
    hi = q3 + iqr_k * iqr
    outliers = s[(s < lo) | (s > hi)]

    print(f"Outlier bounds: [{lo:.6f}, {hi:.6f}]")
    print(f"Outlier count: {len(outliers)} ({len(outliers) / len(s) * 100:.4f}%)")

In [None]:
import numpy as np
import pandas as pd

def normalize_numeric_feature(
    df,
    col,
    method="zscore",   # "zscore", "minmax", "robust"
    clip_q=None,       # e.g., 0.995 to clip tails before scaling
    eps=1e-9,
):
    """
    Scale one numeric column and report the parameters that were used.

    The column is cast to float, optionally clipped to the
    [1 - clip_q, clip_q] quantile range, and then scaled by `method`:
      - "zscore": (x - mean) / std
      - "minmax": (x - min) / (max - min)
      - "robust": (x - median) / IQR
    A denominator that is not greater than `eps` is replaced by `eps`.

    Returns (scaled_series, params_dict).
    Raises ValueError for an unknown column or method, TypeError for a
    non-numeric column.
    """
    if col not in df.columns:
        raise ValueError(f"Column not found: {col}")
    column = df[col]
    if not pd.api.types.is_numeric_dtype(column):
        raise TypeError(f"Column is not numeric: {col}")

    values = column.astype(float)

    # optional clipping for stability
    if clip_q is not None:
        lower_bound = values.quantile(1 - clip_q)
        upper_bound = values.quantile(clip_q)
        values = values.clip(lower=lower_bound, upper=upper_bound)

    def _floored(width):
        # Keep the divisor away from zero (also catches a NaN spread).
        return width if width > eps else eps

    if method == "robust":
        med = values.median()
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        iqr = _floored(q3 - q1)
        return (values - med) / iqr, {
            "method": method, "median": med, "q1": q1, "q3": q3, "iqr": iqr,
        }

    if method == "minmax":
        minv, maxv = values.min(), values.max()
        return (values - minv) / _floored(maxv - minv), {
            "method": method, "min": minv, "max": maxv,
        }

    if method == "zscore":
        mean = values.mean()
        std = _floored(values.std())
        return (values - mean) / std, {"method": method, "mean": mean, "std": std}

    raise ValueError("method must be one of: 'zscore', 'minmax', 'robust'")


In [None]:
import numpy as np
import pandas as pd

def log_transform_feature(df, col, method="log1p", add_shift_if_negative=True):
    """
    Apply a logarithmic transform to one numeric column.

    - method: "log1p" computes log(1 + x); "log" computes log(x) with zeros
      turned into NaN first so that log(0) is never evaluated.
    - add_shift_if_negative: when True and the column minimum is below 0,
      the whole column is shifted up by -min before the transform.

    Returns (transformed_series, params_dict) where params_dict holds the
    method and the shift that was applied (0.0 when none).
    Raises ValueError for an unknown column or method, TypeError for a
    non-numeric column.
    """
    if col not in df.columns:
        raise ValueError(f"Column not found: {col}")
    if not pd.api.types.is_numeric_dtype(df[col]):
        raise TypeError(f"Column is not numeric: {col}")

    values = df[col].astype(float)

    shift = 0.0
    if add_shift_if_negative and values.min() < 0:
        # Move the minimum to exactly 0 so the log is defined everywhere.
        shift = -values.min()
        values = values + shift

    transforms = {
        "log1p": np.log1p,
        # avoid log(0)
        "log": lambda v: np.log(v.mask(v == 0)),
    }
    if method not in transforms:
        raise ValueError("method must be 'log1p' or 'log'")

    return transforms[method](values), {"method": method, "shift": shift}


In [None]:
def logit_transform(x, eps=1e-6):
    """Return the log-odds log(p / (1 - p)) of x after clamping it to [eps, 1 - eps]."""
    # Clamping keeps both the ratio and the log finite at exactly 0 or 1.
    p = np.clip(x, eps, 1 - eps)
    odds = p / (1 - p)
    return np.log(odds)

In [None]:
# Exploration for 6.1 User features
# 6.1.1
# explore_numeric_feature(df_final_sample_transformed, "user_account_age")
# explore_numeric_feature(df_final_sample_transformed, "user_watch_live_age")

# 6.1.2
# for col in ["ctr_user_15min", "ctr_user_3hr", "ctr_user_1d", "ctr_user_7d"]:
#     explore_numeric_feature(df_final_sample, col)

# 6.1.3
# for col in [
#     "num_imp_user_10min",
#     "num_imp_user_30min",
#     "num_imp_user_2hr",
#     "num_imp_user_12hr",
#     "num_imp_user_1d",
#     "num_imp_user_7d",
# ]:
#     explore_numeric_feature(df_final_sample_transformed, col)

# 6.1.4
# for col in [
#     "num_click_user_15min",
#     "num_click_user_3hr",
#     "num_click_user_1d",
#     "num_click_user_7d",
#     "click_trend_user",
# ]:
#     explore_numeric_feature(df_final_sample_transformed, col)

# 6.1.5
# for col in [
#     "time_since_last_impression_user",
#     "time_since_last_click_user",
#     "consecutive_skips_user",
# ]:
#     explore_numeric_feature(df_final_sample_transformed, col)

# 6.1.6
# for col in ["avg_watch_time_user", "median_watch_time_user", "pct_long_watch_user_30s"]:
#     explore_numeric_feature(df_final_sample_transformed, col)

# 6.1.7
# # numeric
# for col in ["comment_rate_user", "num_comment_user_24h"]:
#     explore_numeric_feature(df_final_sample_transformed, col)


# 6.1.8 
# # numeric
# for col in ["like_rate_user", "num_like_user_24h"]:
#     explore_numeric_feature(df_final_sample_transformed, col)

# 6.1.9
# # numeric
# for col in ["num_gift_user_7d", "amount_gift_user_7d"]:
#     explore_numeric_feature(df_final_sample_transformed, col)




# 6.2.1
# explore_numeric_feature(df_final_sample_transformed, "time_since_live_start")



In [None]:
# 6.1 
# .............................................standardize 
# user_account_age
# user_watch_live_age

# .............................................clip_0, 1
# ctr_user_15min
# ctr_user_3hr
# ctr_user_1d
# ctr_user_7d
# comment_rate_user
# like_rate_user

#.............................................log1p + clip p99 + standardize
# num_imp_user_10min
# num_imp_user_30min
# num_imp_user_2hr
# num_imp_user_12hr
# num_imp_user_1d
# num_imp_user_7d

# num_click_user_15min
# num_click_user_3hr
# num_click_user_1d
# num_click_user_7d

# time_since_last_impression_user
# time_since_last_click_user
# consecutive_skips_user

# avg_watch_time_user
# median_watch_time_user
# num_comment_user_24h
# num_like_user_24h

# num_gift_user_7d
# amount_gift_user_7d

#............................................clip p1-p99 + standardize
# click_trend_user


#.............................................leave as is
# tsli_missing 
# tslc_missing
# avg_watch_time_user_missing
# median_watch_time_user_missing
# pct_long_watch_user_30s
# has_comment_user_24h
# has_like_user_24h
# has_gift_user_7d





# 6.2 
# .............................................log1p + standardize
# time_since_live_start
# time_since_start_live

# .............................................clip_0, 1
# ctr_room_10min
# ctr_room_30min
# ctr_room_2hr
# ctr_room_12hr

# comment_rate_live
# comment_rate_live_15min
# comment_rate_live_1hr
# comment_rate_live_3hr

# like_rate_live
# like_rate_live_15min
# like_rate_live_1hr
# like_rate_live_3hr

# gift_rate_live
# gift_rate_live_15min
# gift_rate_live_1hr
# gift_rate_live_3hr


# .............................................log1p + clip p99 + standardize
# num_imp_room_10min
# num_imp_room_30min
# num_imp_room_2hr
# num_imp_room_12hr
# num_imp_room_1d

# num_click_room_10min
# num_click_room_30min
# num_click_room_2hr
# num_click_room_12hr
# num_click_room_1d

# num_comment_live
# num_comment_live_15min
# num_comment_live_1hr
# num_comment_live_3hr

# num_like_live
# num_like_live_15min
# num_like_live_1hr
# num_like_live_3hr

# num_gift_live
# num_gift_live_15min
# num_gift_live_1hr
# num_gift_live_3hr

# amount_gift_live
# amount_gift_live_15min
# amount_gift_live_1hr
# amount_gift_live_3hr


# ............................................clip p1-p99 + standardize
# ctr_trend_room
# comment_trend_room
# like_trend_room
# gift_trend_room


# .............................................standardize
# avg_watch_time_live
# median_watch_time_live
# avg_watch_time_live_30min
# median_watch_time_live_30min

# .............................................leave as is
# time_since_start_live_bucket
# watch_time_live_missing
# watch_time_live_30min_missing
# pct_long_watch_live_60s_30min




# 6.3
# .............................................log1p + standardize
# streamer_account_age
# streamer_live_age

# .............................................clip_0, 1
# ctr_streamer_1d
# ctr_streamer_7d

# .............................................log1p + clip p99 + standardize
# num_imp_streamer_7d
# num_click_streamer_7d
# num_lives_streamer_7d

# num_comment_streamer_7d
# num_like_streamer_7d
# amount_gift_streamer_7d

# .............................................standardize
# avg_watch_time_streamer
# median_watch_time_streamer

# .............................................leave as is
# pct_long_watch_streamer_30s
# watch_time_streamer_missing



# 6.4
# .............................................clip_0, 1
# ctr_user_streamer_7d
# ctr_user_category_7d

# .............................................log1p + clip p99 + standardize
# num_click_user_streamer_7d
# num_imp_user_streamer_7d

# num_click_user_category_7d
# num_imp_user_category_7d

# time_since_last_impression_user_streamer
# time_since_last_click_user_streamer

# .............................................leave as is
# tsli_user_streamer_missing
# tslc_user_streamer_missing


In [None]:
# Transformation for 6.1 User features
# Every section follows the plan in the comment cell above: standardize,
# clip to [0, 1], or log1p + clip p99 + standardize. The shared steps live in
# small helpers so each section only names its columns.

def _log1p_clip_p99(series):
    """Return log1p of `series` floored at 0, capped at its own 99th percentile."""
    logged = np.log1p(series.clip(lower=0))
    return logged.clip(upper=logged.quantile(0.99))


def _standardize_columns(frame, columns):
    """Z-score `columns` of `frame` in place; a zero std is replaced by 1."""
    means = frame[columns].mean()
    stds = frame[columns].std().replace(0, 1.0)
    frame[columns] = (frame[columns] - means) / stds


def _standardize_series(series):
    """Return the z-scored series; a zero std falls back to 1."""
    std = series.std() or 1.0
    return (series - series.mean()) / std


# 6.1.1 standardize
for col in ["user_account_age", "user_watch_live_age"]:
    scaled, params = normalize_numeric_feature(df_final_sample_transformed, col, method="zscore")
    df_final_sample_transformed[col] = scaled

# 6.1.2 clip to [0, 1]
for col in ["ctr_user_15min", "ctr_user_3hr", "ctr_user_1d", "ctr_user_7d"]:
    df_final_sample_transformed[col] = df_final_sample_transformed[col].clip(lower=0, upper=1)

# 6.1.3 log1p + clip p99 + standardize
cols = [
    "num_imp_user_10min",
    "num_imp_user_30min",
    "num_imp_user_2hr",
    "num_imp_user_12hr",
    "num_imp_user_1d",
    "num_imp_user_7d",
]
for c in cols:
    df_final_sample_transformed[c] = _log1p_clip_p99(df_final_sample_transformed[c])
_standardize_columns(df_final_sample_transformed, cols)

# 6.1.4 log1p + clip p99 + standardize
cols = [
    "num_click_user_15min",
    "num_click_user_3hr",
    "num_click_user_1d",
    "num_click_user_7d",
]
for c in cols:
    df_final_sample_transformed[c] = _log1p_clip_p99(df_final_sample_transformed[c])
_standardize_columns(df_final_sample_transformed, cols)

# click_trend_user: clip p1-p99 + standardize (no log: the values are only
# clipped at both tails, not floored at 0)
col = "click_trend_user"
s = df_final_sample_transformed[col]
s = s.clip(lower=s.quantile(0.01), upper=s.quantile(0.99))
df_final_sample_transformed[col] = _standardize_series(s)

# 6.1.5 log1p + clip p99 + standardize
cols = [
    "time_since_last_impression_user",
    "time_since_last_click_user",
    "consecutive_skips_user",
]
for c in cols:
    df_final_sample_transformed[c] = _log1p_clip_p99(df_final_sample_transformed[c])
_standardize_columns(df_final_sample_transformed, cols)

# 6.1.6 log1p + clip p99 + standardize
cols = ["avg_watch_time_user", "median_watch_time_user"]
for c in cols:
    df_final_sample_transformed[c] = _log1p_clip_p99(df_final_sample_transformed[c])
_standardize_columns(df_final_sample_transformed, cols)

# 6.1.7
# 1) clip_0,1 for comment_rate_user
df_final_sample_transformed["comment_rate_user"] = (
    df_final_sample_transformed["comment_rate_user"].clip(0, 1)
)

# 2) log1p + clip p99 + standardize for num_comment_user_24h
col = "num_comment_user_24h"
df_final_sample_transformed[col] = _standardize_series(
    _log1p_clip_p99(df_final_sample_transformed[col])
)

# 6.1.8
# clip_0,1 for like_rate_user
df_final_sample_transformed["like_rate_user"] = (
    df_final_sample_transformed["like_rate_user"].clip(0, 1)
)

# log1p + clip p99 + standardize for num_like_user_24h
col = "num_like_user_24h"
df_final_sample_transformed[col] = _standardize_series(
    _log1p_clip_p99(df_final_sample_transformed[col])
)

# 6.1.9 log1p + clip p99 + standardize
# This list must be bound to `cols`. A misspelled name here left `cols`
# pointing at the 6.1.6 watch-time columns, which were then transformed twice
# while the gift columns stayed raw.
cols = ["num_gift_user_7d", "amount_gift_user_7d"]
for c in cols:
    df_final_sample_transformed[c] = _log1p_clip_p99(df_final_sample_transformed[c])
_standardize_columns(df_final_sample_transformed, cols)


In [None]:
# Transformation for 6.2 room features
# Window labels shared by the column families below.
room_windows = ("10min", "30min", "2hr", "12hr", "1d")
live_suffixes = ("", "_15min", "_1hr", "_3hr")

# -------- log1p + standardize --------
log_std_cols = ["time_since_live_start", "time_since_start_live"]
for c in log_std_cols:
    logged = np.log1p(df_final_sample_transformed[c].clip(lower=0))
    spread = logged.std() or 1.0  # guard against a zero std
    df_final_sample_transformed[c] = (logged - logged.mean()) / spread

# -------- clip_0,1 --------
# ctr_room_<window> for the first four windows, then the comment / like /
# gift rate families (overall, 15min, 1hr, 3hr).
clip_01_cols = [f"ctr_room_{w}" for w in room_windows[:4]]
for family in ("comment", "like", "gift"):
    clip_01_cols += [f"{family}_rate_live{sfx}" for sfx in live_suffixes]
for c in clip_01_cols:
    df_final_sample_transformed[c] = df_final_sample_transformed[c].clip(0, 1)

# -------- log1p + clip p99 + standardize --------
log_clip_std_cols = (
    [f"num_imp_room_{w}" for w in room_windows]
    + [f"num_click_room_{w}" for w in room_windows]
    + [
        f"{stem}_live{sfx}"
        for stem in ("num_comment", "num_like", "num_gift", "amount_gift")
        for sfx in live_suffixes
    ]
)
for c in log_clip_std_cols:
    logged = np.log1p(df_final_sample_transformed[c].clip(lower=0))
    capped = logged.clip(upper=logged.quantile(0.99))
    spread = capped.std() or 1.0
    df_final_sample_transformed[c] = (capped - capped.mean()) / spread

# -------- clip p1-p99 + standardize --------
trend_cols = ["ctr_trend_room", "comment_trend_room", "like_trend_room", "gift_trend_room"]
for c in trend_cols:
    raw = df_final_sample_transformed[c]
    trimmed = raw.clip(lower=raw.quantile(0.01), upper=raw.quantile(0.99))
    spread = trimmed.std() or 1.0
    df_final_sample_transformed[c] = (trimmed - trimmed.mean()) / spread

# -------- standardize --------
std_cols = [
    "avg_watch_time_live",
    "median_watch_time_live",
    "avg_watch_time_live_30min",
    "median_watch_time_live_30min",
]
for c in std_cols:
    raw = df_final_sample_transformed[c]
    spread = raw.std() or 1.0
    df_final_sample_transformed[c] = (raw - raw.mean()) / spread

# leave-as-is:
# time_since_start_live_bucket, watch_time_live_missing, watch_time_live_30min_missing, pct_long_watch_live_60s_30min

In [None]:
# The 52 numeric room features (6.2), grouped by the transformation that
# was applied to them.
features_52 = (
    # log1p + standardize
    ["time_since_live_start", "time_since_start_live"]
    # clip_0,1
    + ["ctr_room_10min", "ctr_room_30min", "ctr_room_2hr", "ctr_room_12hr"]
    + ["comment_rate_live", "comment_rate_live_15min", "comment_rate_live_1hr", "comment_rate_live_3hr"]
    + ["like_rate_live", "like_rate_live_15min", "like_rate_live_1hr", "like_rate_live_3hr"]
    + ["gift_rate_live", "gift_rate_live_15min", "gift_rate_live_1hr", "gift_rate_live_3hr"]
    # log1p + clip p99 + standardize
    + ["num_imp_room_10min", "num_imp_room_30min", "num_imp_room_2hr", "num_imp_room_12hr", "num_imp_room_1d"]
    + ["num_click_room_10min", "num_click_room_30min", "num_click_room_2hr", "num_click_room_12hr", "num_click_room_1d"]
    + ["num_comment_live", "num_comment_live_15min", "num_comment_live_1hr", "num_comment_live_3hr"]
    + ["num_like_live", "num_like_live_15min", "num_like_live_1hr", "num_like_live_3hr"]
    + ["num_gift_live", "num_gift_live_15min", "num_gift_live_1hr", "num_gift_live_3hr"]
    + ["amount_gift_live", "amount_gift_live_15min", "amount_gift_live_1hr", "amount_gift_live_3hr"]
    # clip p1–p99 + standardize
    + ["ctr_trend_room", "comment_trend_room", "like_trend_room", "gift_trend_room"]
    # standardize
    + ["avg_watch_time_live", "median_watch_time_live"]
    + ["avg_watch_time_live_30min", "median_watch_time_live_30min"]
)

# for col in features_52:
#     explore_numeric_feature(df_final_sample_transformed, col)


In [None]:
# Sanity check: (rows, columns) of the transformed frame at this point.
df_final_sample_transformed.shape

In [None]:
# Transformation for 6.3 streamer features

# -------- log1p + standardize --------
log_std_cols = ["streamer_account_age", "streamer_live_age"]
for c in log_std_cols:
    logged = np.log1p(df_final_sample_transformed[c].clip(lower=0))
    spread = logged.std() or 1.0  # guard against a zero std
    df_final_sample_transformed[c] = (logged - logged.mean()) / spread

# -------- clip_0,1 --------
clip_01_cols = ["ctr_streamer_1d", "ctr_streamer_7d"]
for c in clip_01_cols:
    df_final_sample_transformed[c] = df_final_sample_transformed[c].clip(0, 1)

# -------- log1p + clip p99 + standardize --------
log_clip_std_cols = [
    "num_imp_streamer_7d", "num_click_streamer_7d", "num_lives_streamer_7d",
    "num_comment_streamer_7d", "num_like_streamer_7d", "amount_gift_streamer_7d",
]
for c in log_clip_std_cols:
    logged = np.log1p(df_final_sample_transformed[c].clip(lower=0))
    capped = logged.clip(upper=logged.quantile(0.99))
    spread = capped.std() or 1.0
    df_final_sample_transformed[c] = (capped - capped.mean()) / spread

# -------- standardize --------
std_cols = ["avg_watch_time_streamer", "median_watch_time_streamer"]
for c in std_cols:
    raw = df_final_sample_transformed[c]
    spread = raw.std() or 1.0
    df_final_sample_transformed[c] = (raw - raw.mean()) / spread

In [None]:
# Sanity check: (rows, columns) of the transformed frame at this point.
df_final_sample_transformed.shape

In [None]:
# The 12 numeric streamer features (6.3), grouped by the transformation that
# was applied to them.
features_12 = (
    # log1p + standardize
    ["streamer_account_age", "streamer_live_age"]
    # clip_0,1
    + ["ctr_streamer_1d", "ctr_streamer_7d"]
    # log1p + clip p99 + standardize
    + ["num_imp_streamer_7d", "num_click_streamer_7d", "num_lives_streamer_7d"]
    + ["num_comment_streamer_7d", "num_like_streamer_7d", "amount_gift_streamer_7d"]
    # standardize
    + ["avg_watch_time_streamer", "median_watch_time_streamer"]
)

# for col in features_12:
#     explore_numeric_feature(df_final_sample_transformed, col)


In [None]:
# Transformation for 6.4 cross features (user x streamer, user x category)

# -------- clip_0,1 --------
clip_01_cols = ["ctr_user_streamer_7d", "ctr_user_category_7d"]
for c in clip_01_cols:
    df_final_sample_transformed[c] = df_final_sample_transformed[c].clip(0, 1)

# -------- log1p + clip p99 + standardize --------
log_clip_std_cols = [
    "num_click_user_streamer_7d", "num_imp_user_streamer_7d",
    "num_click_user_category_7d", "num_imp_user_category_7d",
    "time_since_last_impression_user_streamer", "time_since_last_click_user_streamer",
]
for c in log_clip_std_cols:
    logged = np.log1p(df_final_sample_transformed[c].clip(lower=0))
    capped = logged.clip(upper=logged.quantile(0.99))
    spread = capped.std() or 1.0  # guard against a zero std
    df_final_sample_transformed[c] = (capped - capped.mean()) / spread

# leave as is:
# tsli_user_streamer_missing, tslc_user_streamer_missing


In [None]:
# The 8 numeric cross features (6.4), grouped by the transformation that
# was applied to them.
features_8 = (
    # clip_0,1
    ["ctr_user_streamer_7d", "ctr_user_category_7d"]
    # log1p + clip p99 + standardize
    + ["num_click_user_streamer_7d", "num_imp_user_streamer_7d"]
    + ["num_click_user_category_7d", "num_imp_user_category_7d"]
    + ["time_since_last_impression_user_streamer", "time_since_last_click_user_streamer"]
)

# for col in features_8:
#     explore_numeric_feature(df_final_sample_transformed, col)


In [None]:
from pathlib import Path

# Write the transformed sample to ../data/draft_sample.csv without the index
# column, then echo the destination.
out_path = Path("..", "data", "draft_sample.csv")
df_final_sample_transformed.to_csv(out_path, index=False)
print(f"Wrote: {out_path}")


In [1]:
import pandas as pd

# Reload the exported sample, parsing imp_timestamp as datetimes, and print
# the row count and the time range it covers.
path = "/workspace/multi_stage_ranking_kuaishou/data/draft_sample.csv"
df = pd.read_csv(path, parse_dates=["imp_timestamp"])

imp_ts = df["imp_timestamp"]
print("rows:", len(df))
print("imp_timestamp min:", imp_ts.min())
print("imp_timestamp max:", imp_ts.max())


  df = pd.read_csv(path, parse_dates=["imp_timestamp"])


rows: 352307
imp_timestamp min: 2025-05-04 14:10:59.857000
imp_timestamp max: 2025-05-25 14:59:47.745000
