In [None]:
from pathlib import Path
import datetime as dt
import numpy as np
import pandas as pd

In [None]:
def _read_parquet_dir(directory):
    """Load every ``*.parquet`` file in ``directory`` into a single DataFrame.

    Files are read in sorted name order so the row order of the result does
    not depend on the order in which the filesystem happens to list them.

    Args:
        directory: ``pathlib.Path`` of the folder holding the parquet parts.

    Returns:
        The concatenation of all parts. ``reset_index()`` is called without
        ``drop`` so the stored index becomes a regular column.

    Raises:
        FileNotFoundError: if ``directory`` contains no ``*.parquet`` files
            (e.g. the relative path is wrong for the current working dir).
    """
    parts = sorted(directory.glob("*.parquet"))
    if not parts:
        raise FileNotFoundError(f"No *.parquet files found in {directory.resolve()}")
    return pd.concat(pd.read_parquet(part) for part in parts).reset_index()


# NOTE(review): the preprocessing cell below reads "utc_ts_milliseconds" and
# "recommendation_id" as columns; presumably they come from the parquet index
# restored by reset_index() above -- confirm against the dataset schema.
interactions_path = Path("../data/contentwise/data/contentwise/CW10M/interactions")
interactions = _read_parquet_dir(interactions_path)

impressions_dl_path = Path("../data/contentwise/data/contentwise/CW10M/impressions-direct-link")
impressions_dl = _read_parquet_dir(impressions_dl_path)

In [None]:
# Build the (user, item, target, timestamp) dataset from the parquet tables
# and split it by time into train / validation / test.
#
# The loaded `interactions` and `impressions_dl` frames are only read here,
# never rebound, so this cell can be re-run without reloading the data.

# First instant that belongs to validation; everything earlier is train.
SPLIT_DATE = dt.datetime(2019, 4, 14)

# Keep interactions with interaction_type == 0 (the 'clicks' of this notebook)
clicks = interactions[interactions["interaction_type"] == 0].reset_index(drop=True)

# One row per (recommendation, recommended series)
impressions_long = impressions_dl.explode("recommended_series_list")
impressions_long["recommended_series_list"] = pd.to_numeric(
    impressions_long["recommended_series_list"]
)

# Pair every click with all series shown in the recommendation it came from
samples = clicks.merge(impressions_long, how="inner", on="recommendation_id")

# Positive (1) when the shown series is the clicked one, negative (0) otherwise
samples["target"] = (
    samples["series_id"] == samples["recommended_series_list"]
).astype("int32")

samples = samples[
    ["user_id", "recommended_series_list", "target", "utc_ts_milliseconds"]
].rename(
    columns={
        "user_id": "user",
        "recommended_series_list": "item",
        "utc_ts_milliseconds": "timestamp",
    }
)
samples["timestamp"] = pd.to_datetime(samples["timestamp"], unit="ms")

# Handle (user, item) duplicates: a pair is positive if it was ever positive
# and carries the latest timestamp at which it was seen.
pairs = (
    samples.groupby(["user", "item"])
    .agg({"target": "sum", "timestamp": "max"})
    .reset_index()
)
pairs["target"] = pairs["target"].clip(upper=1)  # summed labels -> binary

pairs = pairs.sort_values("timestamp").reset_index(drop=True)

# Split data
train_data = pairs[pairs["timestamp"] < SPLIT_DATE].reset_index(drop=True)
val_data = pairs[pairs["timestamp"] >= SPLIT_DATE].reset_index(drop=True)

# Prepare user/item to idx mappers based on train data
unique_users = np.sort(train_data["user"].unique())
unique_items = np.sort(train_data["item"].unique())
train_user_to_idx = pd.DataFrame({"user": unique_users, "user_idx": np.arange(unique_users.size)})
train_item_to_idx = pd.DataFrame({"item": unique_items, "item_idx": np.arange(unique_items.size)})


def _index_and_order(frame):
    """Replace raw user/item ids by their train indices and sort by time.

    The inner merges drop every row whose user or item does not occur in the
    train mappers; merging may reorder rows, hence the final re-sort.
    """
    indexed = (
        frame.merge(train_user_to_idx, on="user", how="inner")
        .merge(train_item_to_idx, on="item", how="inner")
        .sort_values("timestamp")
        .reset_index(drop=True)
    )
    return indexed[["user_idx", "item_idx", "target", "timestamp"]].rename(
        columns={"user_idx": "user", "item_idx": "item"}
    )


# Map user/item to idx and keep only the final columns
train_data = _index_and_order(train_data)
val_data = _index_and_order(val_data)

test_data = val_data.copy()  # test set == validation set (to change in the future!)

In [None]:
# (rows, columns) of the train, validation and test splits, in that order.
train_data.shape, val_data.shape, test_data.shape

In [None]:
# (rows, columns) of the user and item index mappers; each has one row per
# distinct train user / item.
train_user_to_idx.shape, train_item_to_idx.shape

In [None]:
# First rows of each split; the cell value is a tuple of three DataFrames.
train_data.head(), val_data.head(), test_data.head()

# Old - parsing CSV files

In [None]:
# Locations of the gzipped CSV exports (interactions and direct-link
# impressions) that the legacy cells below read.
data_path, data_path2 = (
    f"../data/contentwise/data/contentwise/CW10M-CSV/{table}.csv.gz"
    for table in ("interactions", "impressions-direct-link")
)

In [None]:
# Read the interactions CSV located at `data_path` and preview its first rows.
data = pd.read_csv(data_path)
data.head()

In [None]:
# Read the direct-link impressions CSV located at `data_path2` and preview
# its first rows.
data2 = pd.read_csv(data_path2)
data2.head()

In [None]:
# Legacy CSV variant of the preprocessing: builds `merged`, one row per
# (user, item) pair with a binary target, ordered by the pair's latest
# timestamp.
#
# `data` and `data2` are only read here, never rebound, so their earlier
# `.head()` previews stay accurate and this cell can be re-run safely.

columns = [
    "utc_ts_milliseconds",
    "user_id",
    "series_id",
    "recommendation_id",
]
# Interactions with interaction_type == 0, reduced to the columns used below
clicks_csv = data.loc[data["interaction_type"] == 0, columns].reset_index(drop=True)

# Strip the square brackets and split on whitespace to get one list of ids
# per row (presumably cells look like "[1 2 3]" -- confirm against the CSV),
# then expand to one row per (recommendation, recommended series).
impressions_csv = (
    data2.assign(
        recommended_series_list=data2["recommended_series_list"]
        .str.replace(r"(\[|\])", "", regex=True)
        .str.split()
    )
    .explode("recommended_series_list")
    .reset_index(drop=True)
)

# Pair every click with all series shown in the recommendation it came from
merged = clicks_csv.merge(impressions_csv, how="inner", on="recommendation_id")
merged["recommended_series_list"] = pd.to_numeric(
    merged["recommended_series_list"]
)

# Positive (1) when the shown series is the clicked one, negative (0) otherwise
merged["target"] = (
    merged["series_id"] == merged["recommended_series_list"]
).astype(int)

merged = merged[
    ["user_id", "recommended_series_list", "target", "utc_ts_milliseconds"]
].rename(columns={"user_id": "user", "recommended_series_list": "item"})

# Collapse duplicate (user, item) pairs: positive if ever positive, keeping
# the latest timestamp of the pair.
merged = (
    merged.groupby(["user", "item"])
    .agg({"target": "sum", "utc_ts_milliseconds": "max"})
    .reset_index()
)
merged["target"] = merged["target"].clip(upper=1)  # summed labels -> binary

# Order chronologically; the timestamp is only needed for this ordering
merged = merged.sort_values("utc_ts_milliseconds").reset_index(drop=True)
merged = merged.drop(columns=["utc_ts_milliseconds"])

In [None]:
# Column dtypes of the deduplicated (user, item, target) frame.
merged.dtypes

In [None]:
# Display the deduplicated (user, item, target) frame.
merged

In [None]:
# Positional train/val/test split of `merged`: the first 1,000,000 rows are
# train, the next 100,000 validation, the remainder test.
train_data, val_data, test_data = (
    merged.iloc[start:stop].reset_index(drop=True)
    for start, stop in ((None, 1_000_000), (1_000_000, 1_100_000), (1_100_000, None))
)

# Users and items present in train, in order of first appearance
train_users = train_data["user"].unique()
train_items = train_data["item"].unique()


def _keep_known(frame):
    """Drop rows whose user or item does not occur in the train split."""
    known = frame["user"].isin(train_users) & frame["item"].isin(train_items)
    return frame[known].reset_index(drop=True)


# Filter val/test data
val_data = _keep_known(val_data)
test_data = _keep_known(test_data)

# Contiguous indices assigned in order of first appearance in train
user_to_idx = {user: idx for idx, user in enumerate(train_users)}
item_to_idx = {item: idx for idx, item in enumerate(train_items)}


def _encode(frame):
    """Replace raw user/item ids by their train-based indices."""
    return frame.assign(
        user=frame["user"].map(user_to_idx),
        item=frame["item"].map(item_to_idx),
    )


# Map idx
train_data = _encode(train_data)
val_data = _encode(val_data)
test_data = _encode(test_data)

In [None]:
# (rows, columns) of the training split.
train_data.shape