In [None]:
import pandas as pd

# Interactions

In [None]:
# Read the raw interaction log; the relative path is resolved against the
# notebook's working directory.
interactions_path = "../../data/contentwise/data/contentwise/CW10M-CSV/interactions.csv.gz"
interactions = pd.read_csv(filepath_or_buffer=interactions_path)
# (rows, columns) of the raw frame.
interactions.shape

In [None]:
# Keep only the rows with interaction_type == 0 and the columns used downstream.
# NOTE(review): the meaning of interaction_type 0 is not visible here — confirm
# against the dataset documentation.
# This cell rebinds `interactions` and drops "interaction_type", so it cannot be
# re-run without reloading the raw frame first.
columns = ["utc_ts_milliseconds", "user_id", "series_id", "recommendation_id", "vision_factor"]
is_type_zero = interactions["interaction_type"] == 0
interactions = interactions.loc[is_type_zero, columns].reset_index(drop=True)
interactions

# Impressions (direct)

In [None]:
# Read the direct-link impressions; the relative path is resolved against the
# notebook's working directory.
impressions_dl_path = "../../data/contentwise/data/contentwise/CW10M-CSV/impressions-direct-link.csv.gz"
impressions_dl = pd.read_csv(filepath_or_buffer=impressions_dl_path)
# (rows, columns) of the raw frame.
impressions_dl.shape

In [None]:
# Inspect the raw direct-link impressions before the series lists are parsed.
impressions_dl

In [None]:
# Expand each impression into one row per recommended series id.
# NOTE(review): assumes "recommended_series_list" holds bracketed,
# whitespace-separated ids such as "[1 2 3]" — confirm against the raw file.
series_tokens = (
    impressions_dl["recommended_series_list"]
    .str.replace(r"(\[|\])", "", regex=True)  # strip the square brackets
    .str.split()  # split the remaining text on whitespace
)
impressions_dl = (
    impressions_dl.assign(recommended_series_list=series_tokens)
    .explode("recommended_series_list")
    .reset_index(drop=True)
)
impressions_dl

# Join

In [None]:
# Pair every interaction with each item of the recommendation list it belongs to;
# the inner join drops interactions whose recommendation_id has no impression row.
merged = pd.merge(interactions, impressions_dl, how="inner", on="recommendation_id")
# The exploded ids are still strings (they come from str.split), so convert them
# to numbers before they are compared with series_id.
merged = merged.assign(
    recommended_series_list=pd.to_numeric(merged["recommended_series_list"])
)
merged

In [None]:
# Label each (interaction, recommended item) row: 1.0 when the recommended item is
# the series the user interacted with, 0.0 otherwise. The column stays float here,
# as before; it is cast to int when the output table is built.
is_hit = merged["series_id"] == merged["recommended_series_list"]
merged["target"] = is_hit.astype(float)
merged

In [None]:
# Project the joined frame onto the (user, item, target, timestamp) schema.
# Built as one non-mutating chain: assigning into a column subset of `merged`
# (the previous `output["target"] = ...`) triggers SettingWithCopyWarning, whereas
# astype/rename each return a fresh frame.
output = (
    merged[["user_id", "recommended_series_list", "target", "utc_ts_milliseconds"]]
    .astype({"target": int})  # 0.0 / 1.0 -> 0 / 1
    .rename(
        columns={
            "user_id": "user",
            "recommended_series_list": "item",
            "utc_ts_milliseconds": "timestamp",
        }
    )
)
output

In [None]:
# Collapse to one row per (user, item): count the positive rows, then cap the
# count at 1 so the target is a binary "was ever positive" flag.
# The timestamp column is not aggregated and is therefore dropped here.
output = output.groupby(["user", "item"], as_index=False).agg({"target": "sum"})
output["target"] = output["target"].clip(upper=1)
output

In [None]:
# Class balance of the binary target (number of rows per target value).
output["target"].value_counts()

In [None]:
# Re-encode raw user and item ids as contiguous 0-based indices, numbered in
# order of first appearance in `output`.
unique_users = output["user"].unique()
unique_items = output["item"].unique()
user_to_idx = dict(zip(unique_users, range(len(unique_users))))
item_to_idx = dict(zip(unique_items, range(len(unique_items))))
output = output.assign(
    user=output["user"].map(user_to_idx),
    item=output["item"].map(item_to_idx),
)

## implicit

In [None]:
# Split the (user, item, target) table into train / validation / test by row count.
#
# `output` comes out of a groupby on (user, item) and is therefore sorted by user.
# Slicing it positionally would put the lowest-index users in train and the
# remaining users in validation/test, so evaluation would run almost entirely on
# users never seen during training. Shuffle once, with a fixed seed for
# reproducibility, before slicing.
TRAIN_END = 800_000  # rows [0, TRAIN_END) -> train
VAL_END = 1_000_000  # rows [TRAIN_END, VAL_END) -> validation, the rest -> test

shuffled_output = output.sample(frac=1, random_state=0).reset_index(drop=True)
train_data = shuffled_output.iloc[:TRAIN_END].reset_index(drop=True)
val_data = shuffled_output.iloc[TRAIN_END:VAL_END].reset_index(drop=True)
test_data = shuffled_output.iloc[VAL_END:].reset_index(drop=True)

In [None]:
# Persist the three splits as CSV files in the current working directory,
# without the index column.
implicit_splits = {
    "train_data_implicit.csv": train_data,
    "val_data_implicit.csv": val_data,
    "test_data_implicit.csv": test_data,
}
for csv_name, split_frame in implicit_splits.items():
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Number of distinct user indices in the full (pre-split) table.
output["user"].nunique()

In [None]:
# Number of distinct item indices in the full (pre-split) table.
output["item"].nunique()

## implicit_bpr

In [None]:
# Build (user, item_neg, item_pos) rows for the BPR variant of the training set:
# within each user, every target == 0 item is paired with every target == 1 item.
# This cell rebinds `train_data` to the pair table (which has no "target" column),
# so it cannot be re-run without re-running the split cell first.
negatives = train_data.loc[train_data["target"] == 0, ["user", "item"]]
positives = train_data.loc[train_data["target"] == 1, ["user", "item"]]
pairs = pd.merge(negatives, positives, how="inner", on="user", suffixes=("_neg", "_pos"))

# Keep a reproducible 20% sample of the pairs.
train_data = pairs.sample(frac=0.2, random_state=0).reset_index(drop=True)

In [None]:
# Persist the BPR training pairs together with the validation and test splits
# as CSV files in the current working directory, without the index column.
bpr_splits = {
    "train_data_implicit_bpr.csv": train_data,
    "val_data_implicit_bpr.csv": val_data,
    "test_data_implicit_bpr.csv": test_data,
}
for csv_name, split_frame in bpr_splits.items():
    split_frame.to_csv(csv_name, index=False)