In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import datetime as dt

import torch
from torchmetrics import RetrievalNormalizedDCG

# Positive signals - clicks, details, ratings, purchases

DataFrame which contains the interactions of users with items

In [4]:
prefix = Path("../../data/contentwise/data/contentwise/CW10M/")
interactions_path = prefix / Path("interactions")
interactions = pd.concat(
    pd.read_parquet(p) for p in interactions_path.glob("*.parquet")
).reset_index()

interactions["utc_ts_milliseconds"] = pd.to_datetime(
    interactions["utc_ts_milliseconds"], unit="ms"
)
interactions["date"] = interactions["utc_ts_milliseconds"].dt.date
interactions = interactions.sort_values("utc_ts_milliseconds").reset_index(drop=True)

interactions["item_type"] = interactions["item_type"].map(
    {
        0: "movies",
        1: "movies and clips in series",
        2: "TV movies or shows",
        3: "episodes of TV series",
    }
)
interactions["interaction_type"] = interactions["interaction_type"].map(
    {0: "views", 1: "details", 2: "ratings", 3: "purchases"}
)

## Keep movies only!

In [5]:
interactions = interactions[interactions["item_type"] == "movies"].reset_index(
    drop=True
)
interactions.shape

(987518, 12)

In [6]:
interactions

Unnamed: 0,utc_ts_milliseconds,user_id,item_id,series_id,episode_number,series_length,item_type,recommendation_id,interaction_type,vision_factor,explicit_rating,date
0,2019-01-07 09:00:04,17094,86576,16187,1,1,movies,272495,details,-1.00,-1.0,2019-01-07
1,2019-01-07 09:00:07,9011,9614,23832,1,1,movies,-1,details,-1.00,-1.0,2019-01-07
2,2019-01-07 09:00:22,17094,13926,27394,1,1,movies,8022,details,-1.00,-1.0,2019-01-07
3,2019-01-07 09:00:26,10478,37741,27655,1,1,movies,-1,details,-1.00,-1.0,2019-01-07
4,2019-01-07 09:00:48,17094,86576,16187,1,1,movies,272495,details,-1.00,-1.0,2019-01-07
...,...,...,...,...,...,...,...,...,...,...,...,...
987513,2019-04-15 08:56:53,29764,101182,1770,1,1,movies,-1,views,1.00,-1.0,2019-04-15
987514,2019-04-15 08:57:25,39325,13711,23882,1,1,movies,-1,views,0.02,-1.0,2019-04-15
987515,2019-04-15 08:58:58,5610,44853,26746,1,1,movies,-1,purchases,-1.00,-1.0,2019-04-15
987516,2019-04-15 08:59:51,26024,47226,14716,1,1,movies,-1,details,-1.00,-1.0,2019-04-15


# Negative signals - impressions (direct)

In [7]:
impressions_dl_path = prefix / Path("impressions-direct-link")
impressions_dl = pd.concat(
    pd.read_parquet(p) for p in impressions_dl_path.glob("*.parquet")
).reset_index()
impressions_dl

Unnamed: 0,recommendation_id,row_position,recommendation_list_length,recommended_series_list
0,0,0,10,"[20128, 6674, 4625, 19462, 19041, 23229, 5914,..."
1,1,0,10,"[7906, 1240, 1712, 8348, 3227, 7607, 24175, 15..."
2,2,0,10,"[13673, 15810, 16821, 3826, 26860, 22223, 1847..."
3,3,1,10,"[13673, 1272, 2293, 23996, 15810, 16821, 13737..."
4,4,0,6,"[21885, 22288, 7493, 17042, 18483, 9330]"
...,...,...,...,...
307448,307449,0,12,"[21261, 26515, 5544, 1393, 5678, 22552, 9101, ..."
307449,307450,1,10,"[20128, 4862, 6674, 28598, 27215, 4625, 19041,..."
307450,307451,0,30,"[9969, 17425, 9101, 14797, 5743, 4172, 17953, ..."
307451,307452,0,10,"[21079, 23099, 28598, 25404, 19462, 26304, 152..."


# Merged positive and negative interactions

In [134]:
merged = interactions.merge(impressions_dl, "left", "recommendation_id")

merged = merged.explode("recommended_series_list").reset_index(drop=True)
merged["recommended_series_list"] = pd.to_numeric(merged["recommended_series_list"])

merged["signal"] = 0  # Set negative signals as 0
merged.loc[
    merged["recommendation_id"] == -1, "signal"
] = 1  # Set positive signals as 1 ('details' action)
merged.loc[
    merged["series_id"] == merged["recommended_series_list"], "signal"
] = 1  # Set positive signals as 1 ('views' action)

merged.loc[~merged["recommended_series_list"].isna(), "series_id"] = merged.loc[
    ~merged["recommended_series_list"].isna(), "recommended_series_list"
]

merged.loc[merged["signal"] == 0, "interaction_type"] = "impressions"
merged.loc[merged["interaction_type"] == "impressions", "vision_factor"] = -1

merged = merged.drop(
    columns=[
        "item_id",
        "episode_number",
        "series_length",
        "item_type",
        "date",
        "row_position",
        "recommendation_list_length",
        "recommended_series_list",
    ]
)

## Prepare weights of each interaction

In [135]:
merged["interaction_weight"] = 1
# Weights based on vision_factor
merged.loc[merged["vision_factor"] != -1, "interaction_weight"] = (
    2 * merged["vision_factor"]
)
# Weights based on explicit_rating
merged.loc[merged["explicit_rating"] != -1, "interaction_weight"] = (
    2 * merged["explicit_rating"] / 5
)
# Weights based on purchases
merged.loc[merged["interaction_type"] == "purchases", "interaction_weight"] = 2
# Weights based on time (square root of normalized (0-1) value)
merged["days_ago"] = (
    merged["utc_ts_milliseconds"].max() - merged["utc_ts_milliseconds"]
).dt.days
merged["time_weight"] = 2 * np.sqrt(
    (merged["days_ago"].max() - merged["days_ago"]) / merged["days_ago"].max()
)

# Final signal weights
merged["final_signal_weight"] = (
    merged["interaction_weight"] + merged["time_weight"]
) / 2

## Get rid of inactive users and items (having <10 interactions)

In [136]:
# Users
tmp_u = (
    merged.groupby("user_id")
    .agg({"signal": "sum"})
    .rename(columns={"signal": "sum"})
    .reset_index()
)
tmp_u = tmp_u[tmp_u["sum"] >= 10]
merged = merged.merge(tmp_u, "inner", "user_id")

# Items
tmp_i = (
    merged.groupby("series_id")
    .agg({"signal": "sum"})
    .rename(columns={"signal": "sum"})
    .reset_index()
)
tmp_i = tmp_i[tmp_i["sum"] >= 10]
merged = merged.merge(tmp_i, "inner", "series_id")

In [137]:
merged

Unnamed: 0,utc_ts_milliseconds,user_id,series_id,recommendation_id,interaction_type,vision_factor,explicit_rating,signal,interaction_weight,days_ago,time_weight,final_signal_weight,sum_x,sum_y
0,2019-01-07 09:00:04,17094,6854,272495,impressions,-1.00,-1.0,0,1.00,97,0.000000,0.500000,86,262
1,2019-01-07 09:00:48,17094,6854,272495,impressions,-1.00,-1.0,0,1.00,97,0.000000,0.500000,86,262
2,2019-02-02 01:13:31,6273,6854,278455,impressions,-1.00,-1.0,0,1.00,72,1.015346,1.007673,260,262
3,2019-02-02 03:01:59,6273,6854,278455,impressions,-1.00,-1.0,0,1.00,72,1.015346,1.007673,260,262
4,2019-02-02 10:32:20,6273,6854,278455,impressions,-1.00,-1.0,0,1.00,71,1.035454,1.017727,260,262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2315590,2019-03-23 12:46:49,42064,14128,-1,views,0.47,-1.0,1,0.94,22,1.758631,1.349316,15,11
2315591,2019-03-23 13:32:46,42064,14128,-1,views,0.60,-1.0,1,1.20,22,1.758631,1.479316,15,11
2315592,2019-03-26 20:01:27,42064,14128,-1,views,0.23,-1.0,1,0.46,19,1.793459,1.126729,15,11
2315593,2019-03-26 20:31:00,42064,14128,-1,views,0.46,-1.0,1,0.92,19,1.793459,1.356729,15,11


# Split data

In [163]:
train_data = merged.loc[
    merged["utc_ts_milliseconds"] < dt.datetime(2019, 4, 14)
].reset_index(drop=True)
val_data = merged.loc[
    merged["utc_ts_milliseconds"] >= dt.datetime(2019, 4, 14)
].reset_index(drop=True)

In [164]:
train_data

Unnamed: 0,utc_ts_milliseconds,user_id,series_id,recommendation_id,interaction_type,vision_factor,explicit_rating,signal,interaction_weight,days_ago,time_weight,final_signal_weight,sum_x,sum_y
0,2019-01-07 09:00:04,17094,6854,272495,impressions,-1.00,-1.0,0,1.00,97,0.000000,0.500000,86,262
1,2019-01-07 09:00:48,17094,6854,272495,impressions,-1.00,-1.0,0,1.00,97,0.000000,0.500000,86,262
2,2019-02-02 01:13:31,6273,6854,278455,impressions,-1.00,-1.0,0,1.00,72,1.015346,1.007673,260,262
3,2019-02-02 03:01:59,6273,6854,278455,impressions,-1.00,-1.0,0,1.00,72,1.015346,1.007673,260,262
4,2019-02-02 10:32:20,6273,6854,278455,impressions,-1.00,-1.0,0,1.00,71,1.035454,1.017727,260,262
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2284495,2019-03-23 12:46:49,42064,14128,-1,views,0.47,-1.0,1,0.94,22,1.758631,1.349316,15,11
2284496,2019-03-23 13:32:46,42064,14128,-1,views,0.60,-1.0,1,1.20,22,1.758631,1.479316,15,11
2284497,2019-03-26 20:01:27,42064,14128,-1,views,0.23,-1.0,1,0.46,19,1.793459,1.126729,15,11
2284498,2019-03-26 20:31:00,42064,14128,-1,views,0.46,-1.0,1,0.92,19,1.793459,1.356729,15,11


In [165]:
val_data

Unnamed: 0,utc_ts_milliseconds,user_id,series_id,recommendation_id,interaction_type,vision_factor,explicit_rating,signal,interaction_weight,days_ago,time_weight,final_signal_weight,sum_x,sum_y
0,2019-04-14 12:09:58,22887,6854,-1,details,-1.00,-1.0,1,1.00,0,2.000000,1.500000,1175,262
1,2019-04-14 07:29:22,19776,6854,-1,views,0.97,-1.0,1,1.94,1,1.989664,1.964832,373,262
2,2019-04-14 17:04:41,4684,19911,127661,impressions,-1.00,-1.0,0,1.00,0,2.000000,1.500000,579,5163
3,2019-04-14 18:55:54,4684,19911,127661,impressions,-1.00,-1.0,0,1.00,0,2.000000,1.500000,579,5163
4,2019-04-14 23:41:52,4684,19911,28079,impressions,-1.00,-1.0,0,1.00,0,2.000000,1.500000,579,5163
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31090,2019-04-14 15:56:05,41587,15917,298028,impressions,-1.00,-1.0,0,1.00,0,2.000000,1.500000,55,17
31091,2019-04-14 23:48:15,29410,5645,-1,purchases,-1.00,-1.0,1,2.00,0,2.000000,2.000000,18,10
31092,2019-04-15 00:44:46,29410,5645,-1,views,0.55,-1.0,1,1.10,0,2.000000,1.550000,18,10
31093,2019-04-14 08:00:24,1001,2163,-1,views,0.63,-1.0,1,1.26,1,1.989664,1.624832,107,17


In [146]:
# Remove (user, item) duplicates
val_data = (
    val_data.groupby(["user_id", "series_id"]).agg({"signal": "sum"}).reset_index()
)
val_data.loc[val_data["signal"] > 0, "signal"] = 1

In [151]:
val_data

Unnamed: 0,user_id,series_id,signal
0,6,14999,1
1,25,1393,1
2,25,3178,1
3,25,12772,1
4,51,9630,1
...,...,...,...
17659,42151,12654,1
17660,42151,15505,0
17661,42151,20381,0
17662,42151,23586,1


# NDCG

In [152]:
# Best NDCG
indexes = torch.tensor(val_data["user_id"], dtype=torch.int64)
target = torch.tensor(val_data["signal"], dtype=torch.float32)

ndcg = RetrievalNormalizedDCG()
ndcg(target, target, indexes=indexes).item()

1.0

In [153]:
# Worst NDCG
indexes = torch.tensor(val_data["user_id"], dtype=torch.int64)
target = torch.tensor(val_data["signal"], dtype=torch.float32)
pred = torch.tensor((val_data["signal"] + 1) % 2, dtype=torch.float32)

ndcg = RetrievalNormalizedDCG()
ndcg(pred, target, indexes=indexes).item()

0.9154776930809021

In [154]:
# Random NDCG
indexes = torch.tensor(val_data["user_id"], dtype=torch.int64)
target = torch.tensor(val_data["signal"], dtype=torch.float32)
pred = torch.rand(val_data.shape[0])

ndcg = RetrievalNormalizedDCG()
ndcg(pred, target, indexes=indexes).item()

0.9397118091583252

In [155]:
tmp = merged[(merged["explicit_rating"] != -1) & (merged["time_weight"] != 0)]
wm = lambda x: np.average(x, weights=merged.loc[x.index, "time_weight"])

tmp = (
    tmp.groupby("series_id")
    .agg(
        n_ratings=("explicit_rating", "count"),
        mean_rating=("explicit_rating", "mean"),
        weighted_mean_rating=("explicit_rating", wm),
    )
    .sort_values("weighted_mean_rating", ascending=False)
)
val_data2 = val_data.merge(tmp, "inner", "series_id")

# NDCG based on explicit_rating
indexes = torch.tensor(val_data2["user_id"], dtype=torch.int64)
target = torch.tensor(val_data2["signal"], dtype=torch.float32)
pred = torch.tensor(val_data2["weighted_mean_rating"], dtype=torch.float32)

ndcg = RetrievalNormalizedDCG()
ndcg(pred, target, indexes=indexes).item()

0.9191480875015259

In [156]:
val_data.shape, val_data2.shape

((17664, 3), (13057, 6))

In [157]:
tmp = merged.groupby("series_id").agg(n_interactions=("signal", "sum"))
val_data2 = val_data.merge(tmp, "inner", "series_id")

# NDCG based on n_interactions
indexes = torch.tensor(val_data2["user_id"], dtype=torch.int64)
target = torch.tensor(val_data2["signal"], dtype=torch.float32)
pred = torch.tensor(val_data2["n_interactions"], dtype=torch.float32)

ndcg = RetrievalNormalizedDCG()
ndcg(pred, target, indexes=indexes).item()

0.9461121559143066

In [158]:
val_data.shape, val_data2.shape

((17664, 3), (17664, 4))

In [159]:
wm = lambda x: np.average(x, weights=merged.loc[x.index, "final_signal_weight"])

tmp = (
    merged.groupby("series_id")
    .agg(
        n_actions=("signal", "count"),
        ctr=("signal", "mean"),
        weighted_ctr=("signal", wm),
    )
    .sort_values("weighted_ctr", ascending=False)
)
val_data2 = val_data.merge(tmp, "inner", "series_id")

# NDCG based on explicit_rating
indexes = torch.tensor(val_data2["user_id"], dtype=torch.int64)
target = torch.tensor(val_data2["signal"], dtype=torch.float32)
pred = torch.tensor(val_data2["weighted_ctr"], dtype=torch.float32)

ndcg = RetrievalNormalizedDCG()
ndcg(pred, target, indexes=indexes).item()

0.9646114110946655

In [160]:
val_data.shape, val_data2.shape

((17664, 3), (17664, 6))

In [161]:
wm = lambda x: np.average(x, weights=merged.loc[x.index, "final_signal_weight"])

tmp = merged.groupby("series_id").agg(
    n_actions=("signal", "count"), ctr=("signal", "mean"), weighted_ctr=("signal", wm)
)
tmp = tmp[tmp["ctr"] != 1.0].sort_values("weighted_ctr", ascending=False)
val_data2 = val_data.merge(tmp, "inner", "series_id")

# NDCG based on explicit_rating
indexes = torch.tensor(val_data2["user_id"], dtype=torch.int64)
target = torch.tensor(val_data2["signal"], dtype=torch.float32)
pred = torch.tensor(val_data2["weighted_ctr"], dtype=torch.float32)

ndcg = RetrievalNormalizedDCG()
ndcg(pred, target, indexes=indexes).item()

0.9478943347930908

In [162]:
val_data.shape, val_data2.shape

((17664, 3), (14900, 6))