In [1]:
import pandas as pd
from pathlib import Path

# Interactions

In [26]:
# Root of the ContentWise CW10M dump, relative to this notebook.
prefix = Path("../../data/contentwise/data/contentwise/CW10M/")
interactions_path = prefix / "interactions"

# Path.glob yields files in filesystem order, which is not guaranteed to be stable;
# sorting keeps the concatenated row order reproducible across machines and runs.
interaction_files = sorted(interactions_path.glob("*.parquet"))
if not interaction_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No parquet files found in {interactions_path}")

# reset_index() moves the index stored in the parquet files into a regular column
# (presumably utc_ts_milliseconds -- confirm against the parquet schema).
interactions = pd.concat(pd.read_parquet(p) for p in interaction_files).reset_index()
interactions

Unnamed: 0,utc_ts_milliseconds,user_id,item_id,series_id,episode_number,series_length,item_type,recommendation_id,interaction_type,vision_factor,explicit_rating
0,1551045278000,3203,89985,16867,21,27,3,-1,0,1.00,-1.0
1,1551084937000,10257,89985,16867,21,27,3,-1,0,1.00,-1.0
2,1551086238000,13936,89985,16867,21,27,3,-1,1,-1.00,-1.0
3,1551086915000,13936,89985,16867,21,27,3,-1,0,1.00,-1.0
4,1551093249000,32185,89985,16867,21,27,3,-1,0,1.00,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
10457805,1549021133000,40763,90910,19532,1,1,0,132437,1,-1.00,-1.0
10457806,1549022895000,40763,90910,19532,1,1,0,132437,0,0.26,-1.0
10457807,1549021537000,7183,66535,1428,1,1,0,158480,1,-1.00,-1.0
10457808,1549021513000,33495,111990,23969,1,1,1,67304,1,-1.00,-1.0


In [27]:
# Convert epoch milliseconds into pandas datetimes. The result is timezone-naive;
# the column name indicates the values are UTC.
epoch_ms = interactions["utc_ts_milliseconds"]
interactions["utc_ts_milliseconds"] = pd.to_datetime(epoch_ms, unit="ms")

In [29]:
# Items mapping (item_type): {0: movies, 1: movies and clips in series, 2: TV movies or shows, 3: episodes of TV series}
# Count interactions per item type (most frequent first) before narrowing down to movies.
interactions["item_type"].value_counts()

item_type
3    9076428
0     987518
1     231290
2     162574
Name: count, dtype: int64

In [30]:
# Keep only interactions with movies (item_type == 0) and renumber the rows.
is_movie = interactions["item_type"].eq(0)
interactions = interactions.loc[is_movie].reset_index(drop=True)

# Interactions mapping (interaction_type): {0: views, 1: detail, 2: ratings, 3: purchases}
interactions["interaction_type"].value_counts()

interaction_type
0    519179
1    389614
3     71324
2      7401
Name: count, dtype: int64

In [31]:
# Keep only views (interaction_type == 0 in the mapping above); a view whose
# series appears in its recommendation list is what gets labelled as a click later on.
is_view = interactions["interaction_type"].eq(0)
interactions = interactions.loc[is_view].reset_index(drop=True)

In [32]:
# Preview the first rows of the filtered interactions (item_type == 0, interaction_type == 0).
interactions.head()

Unnamed: 0,utc_ts_milliseconds,user_id,item_id,series_id,episode_number,series_length,item_type,recommendation_id,interaction_type,vision_factor,explicit_rating
0,2019-02-24 21:54:39,3656,61034,23080,1,1,0,-1,0,0.97,-1.0
1,2019-02-24 21:57:41,2455,61034,23080,1,1,0,-1,0,0.1,-1.0
2,2019-02-25 04:05:38,2455,61034,23080,1,1,0,-1,0,0.98,-1.0
3,2019-02-25 02:06:04,23350,61034,23080,1,1,0,-1,0,0.85,-1.0
4,2019-02-24 22:05:01,33551,61034,23080,1,1,0,-1,0,0.95,-1.0


# Impressions (direct)

In [33]:
impressions_dl_path = prefix / "impressions-direct-link"

# Path.glob yields files in filesystem order, which is not guaranteed to be stable;
# sorting keeps the concatenated row order reproducible across machines and runs.
impression_files = sorted(impressions_dl_path.glob("*.parquet"))
if not impression_files:
    # Fail with an explicit message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError(f"No parquet files found in {impressions_dl_path}")

# reset_index() moves the index stored in the parquet files into a regular column
# (presumably recommendation_id -- confirm against the parquet schema).
impressions_dl = pd.concat(pd.read_parquet(p) for p in impression_files).reset_index()
impressions_dl

Unnamed: 0,recommendation_id,row_position,recommendation_list_length,recommended_series_list
0,0,0,10,"[20128, 6674, 4625, 19462, 19041, 23229, 5914,..."
1,1,0,10,"[7906, 1240, 1712, 8348, 3227, 7607, 24175, 15..."
2,2,0,10,"[13673, 15810, 16821, 3826, 26860, 22223, 1847..."
3,3,1,10,"[13673, 1272, 2293, 23996, 15810, 16821, 13737..."
4,4,0,6,"[21885, 22288, 7493, 17042, 18483, 9330]"
...,...,...,...,...
307448,307449,0,12,"[21261, 26515, 5544, 1393, 5678, 22552, 9101, ..."
307449,307450,1,10,"[20128, 4862, 6674, 28598, 27215, 4625, 19041,..."
307450,307451,0,30,"[9969, 17425, 9101, 14797, 5743, 4172, 17953, ..."
307451,307452,0,10,"[21079, 23099, 28598, 25404, 19462, 26304, 152..."


In [34]:
# Turn each recommendation list into one row per recommended series.
exploded = impressions_dl.explode("recommended_series_list").reset_index(drop=True)

# The exploded values come out of list cells, so cast them to a numeric dtype.
impressions_dl = exploded.assign(
    recommended_series_list=pd.to_numeric(exploded["recommended_series_list"])
)
impressions_dl

Unnamed: 0,recommendation_id,row_position,recommendation_list_length,recommended_series_list
0,0,0,10,20128
1,0,0,10,6674
2,0,0,10,4625
3,0,0,10,19462
4,0,0,10,19041
...,...,...,...,...
3555033,307453,0,6,28598
3555034,307453,0,6,10244
3555035,307453,0,6,4046
3555036,307453,0,6,17421


# Join relevant clicks and impressions

In [35]:
# Pair every kept interaction with each series shown in the recommendation it came from.
# The inner join drops interactions whose recommendation_id has no direct-link impression.
joined = interactions.merge(impressions_dl, how="inner", on="recommendation_id")

# target = 1 when the recommended series is the one the user interacted with (click),
# target = 0 when it was only shown (impression).
joined["target"] = (joined["series_id"] == joined["recommended_series_list"]).astype(int)

# Keep only the columns needed downstream and give them short, generic names.
merged = joined[["utc_ts_milliseconds", "user_id", "recommended_series_list", "target"]].rename(
    columns={"utc_ts_milliseconds": "timestamp", "user_id": "user", "recommended_series_list": "item"}
)
merged

Unnamed: 0,timestamp,user,item,target
0,2019-02-25 11:43:40,9147,16194,0
1,2019-02-25 11:43:40,9147,18689,0
2,2019-02-25 11:43:40,9147,9473,0
3,2019-02-25 11:43:40,9147,13943,0
4,2019-02-25 11:43:40,9147,17297,0
...,...,...,...,...
767073,2019-02-01 12:08:15,40763,24429,0
767074,2019-02-01 12:08:15,40763,4347,0
767075,2019-02-01 12:08:15,40763,2484,0
767076,2019-02-01 12:08:15,40763,1881,0


In [36]:
# Remove (user, item) duplicates
merged = merged.groupby(["user", "item"]).agg({"target": "sum"}).reset_index()
merged.loc[merged["target"] > 0, "target"] = 1
merged

Unnamed: 0,user,item,target
0,0,181,0
1,0,2607,0
2,0,2920,1
3,0,4281,0
4,0,9522,1
...,...,...,...
477335,42151,15505,0
477336,42151,20381,0
477337,42151,24644,0
477338,42151,26167,0


In [37]:
# Class balance of the (user, item) pairs: how many rows have target 0 vs. target 1.
merged["target"].value_counts()

target
0    431340
1     46000
Name: count, dtype: int64

In [38]:
# Number of distinct users and distinct items in the final table, as a (users, items) tuple.
merged["user"].nunique(), merged["item"].nunique()

(13252, 3659)