In [2]:
from pathlib import Path

import pandas as pd

# Path to the raw ratings file, given relative to the current working directory.
ratings_path = "../data/external/ml-1m/ml-1m/ratings.dat"

# Fields are delimited by "::". A separator longer than one character is
# interpreted as a regular expression, which requires the Python parsing engine.
# Column names are supplied explicitly, so the first line is read as data
# rather than as a header row.
ratings = pd.read_csv(
    ratings_path,
    sep="::",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"],
    engine="python",
)

# Proxy signals: number of ratings per movie and number of ratings per user.
item_popularity = (
    ratings.groupby("movie_id")
    .size()
    .reset_index(name="item_rating_count")
)
user_activity = (
    ratings.groupby("user_id")
    .size()
    .reset_index(name="user_rating_count")
)

# Attach both counts to every rating row; left joins keep each row of `ratings`.
proxy = (
    ratings
    .merge(item_popularity, on="movie_id", how="left")
    .merge(user_activity, on="user_id", how="left")
)

# Engagement proxy = (ratings received by the movie) x (ratings given by the user).
proxy["engagement_proxy"] = proxy["item_rating_count"].mul(proxy["user_rating_count"])

# Rows at or above the 80th percentile of the proxy are flagged as high-proxy.
threshold = proxy["engagement_proxy"].quantile(0.80)
proxy["high_proxy"] = proxy["engagement_proxy"].ge(threshold).astype(int)

# A row is "misaligned" when it is flagged high-proxy but its rating is 2 or lower.
proxy["misaligned"] = (
    proxy["high_proxy"].eq(1) & proxy["rating"].le(2)
).astype(int)

# Summary: frame dimensions, share of misaligned rows, and the 0/1 breakdown.
print("Proxy dataset shape:", proxy.shape)
print("Misaligned rate:", proxy["misaligned"].mean())
proxy["misaligned"].value_counts()

Proxy dataset shape: (1000209, 9)
Misaligned rate: 0.02295820173583721


misaligned
0    977246
1     22963
Name: count, dtype: int64

In [3]:
# Destination for the sampled rows, given relative to the current working directory.
sample_path = Path("../data/raw/proxy_engagement_sample.csv")

# DataFrame.to_csv does not create missing directories and raises OSError if
# the parent folder is absent, so make sure it exists before writing.
sample_path.parent.mkdir(parents=True, exist_ok=True)

# Draw 5000 rows without replacement; the fixed random_state makes the draw
# repeatable across runs on the same `proxy` frame.
proxy_sample = proxy.sample(5000, random_state=42)
proxy_sample.to_csv(sample_path, index=False)
print("Saved: data/raw/proxy_engagement_sample.csv")

Saved: data/raw/proxy_engagement_sample.csv
