In [1]:
import pandas as pd

# Interactions

In [2]:
interactions_path = "../../data/contentwise/data/contentwise/CW10M-CSV/interactions.csv.gz"
interactions = pd.read_csv(interactions_path)
interactions.shape

(10457810, 11)

In [3]:
interactions = interactions[interactions["interaction_type"] == 0].reset_index(drop=True)

columns = ["utc_ts_milliseconds", "user_id", "series_id", "recommendation_id", "vision_factor"]
interactions = interactions[columns]
interactions

Unnamed: 0,utc_ts_milliseconds,user_id,series_id,recommendation_id,vision_factor
0,1546851602000,7285,434,56402,1.00
1,1546853253000,10811,434,-1,0.50
2,1546856158000,10811,434,-1,0.99
3,1546864637000,16091,434,-1,1.00
4,1546857754000,35133,434,-1,0.35
...,...,...,...,...,...
6122100,1555315990000,28773,7157,183697,1.00
6122101,1555316085000,28773,7157,183697,0.05
6122102,1555317987000,33296,11292,277497,0.60
6122103,1555317103000,20765,14006,275661,1.00


# Impressions (direct)

In [4]:
impressions_dl_path = "../../data/contentwise/data/contentwise/CW10M-CSV/impressions-direct-link.csv.gz"
impressions_dl = pd.read_csv(impressions_dl_path)
impressions_dl.shape

(307453, 4)

In [5]:
impressions_dl

Unnamed: 0,recommendation_id,row_position,recommendation_list_length,recommended_series_list
0,0,0,10,[20128 6674 4625 19462 19041 23229 5914 76...
1,1,0,10,[ 7906 1240 1712 8348 3227 7607 24175 152...
2,2,0,10,[13673 15810 16821 3826 26860 22223 18470 284...
3,3,1,10,[13673 1272 2293 23996 15810 16821 13737 124...
4,4,0,6,[21885 22288 7493 17042 18483 9330]
...,...,...,...,...
307448,307449,0,12,[21261 26515 5544 1393 5678 22552 9101 226...
307449,307450,1,10,[20128 4862 6674 28598 27215 4625 19041 232...
307450,307451,0,30,[ 9969 17425 9101 14797 5743 4172 17953 104...
307451,307452,0,10,[21079 23099 28598 25404 19462 26304 15256 158...


In [6]:
impressions_dl["recommended_series_list"] = impressions_dl["recommended_series_list"].str.replace(r"(\[|\])", "", regex=True).str.split()
impressions_dl = impressions_dl.explode("recommended_series_list").reset_index(drop=True)
impressions_dl

Unnamed: 0,recommendation_id,row_position,recommendation_list_length,recommended_series_list
0,0,0,10,20128
1,0,0,10,6674
2,0,0,10,4625
3,0,0,10,19462
4,0,0,10,19041
...,...,...,...,...
3555033,307453,0,6,28598
3555034,307453,0,6,10244
3555035,307453,0,6,4046
3555036,307453,0,6,17421


# Join

In [7]:
merged = interactions.merge(impressions_dl, "inner", "recommendation_id")
merged["recommended_series_list"] = pd.to_numeric(merged["recommended_series_list"])
merged

Unnamed: 0,utc_ts_milliseconds,user_id,series_id,recommendation_id,vision_factor,row_position,recommendation_list_length,recommended_series_list
0,1546851602000,7285,434,56402,1.00,0,3,22815
1,1546851602000,7285,434,56402,1.00,0,3,10432
2,1546851602000,7285,434,56402,1.00,0,3,434
3,1546855011000,40252,434,42565,0.45,0,10,21079
4,1546855011000,40252,434,42565,0.45,0,10,20128
...,...,...,...,...,...,...,...,...
3423680,1555318176000,32390,20314,292156,1.00,1,5,20314
3423681,1555318176000,32390,20314,292156,1.00,1,5,13273
3423682,1555318176000,32390,20314,292156,1.00,1,5,4260
3423683,1555318176000,32390,20314,292156,1.00,1,5,12892


In [8]:
merged.loc[merged["series_id"] == merged["recommended_series_list"], "target"] = 1
merged.loc[merged["series_id"] != merged["recommended_series_list"], "target"] = 0
merged

Unnamed: 0,utc_ts_milliseconds,user_id,series_id,recommendation_id,vision_factor,row_position,recommendation_list_length,recommended_series_list,target
0,1546851602000,7285,434,56402,1.00,0,3,22815,0.0
1,1546851602000,7285,434,56402,1.00,0,3,10432,0.0
2,1546851602000,7285,434,56402,1.00,0,3,434,1.0
3,1546855011000,40252,434,42565,0.45,0,10,21079,0.0
4,1546855011000,40252,434,42565,0.45,0,10,20128,0.0
...,...,...,...,...,...,...,...,...,...
3423680,1555318176000,32390,20314,292156,1.00,1,5,20314,1.0
3423681,1555318176000,32390,20314,292156,1.00,1,5,13273,0.0
3423682,1555318176000,32390,20314,292156,1.00,1,5,4260,0.0
3423683,1555318176000,32390,20314,292156,1.00,1,5,12892,0.0


In [9]:
output = merged[["user_id", "recommended_series_list", "target", "utc_ts_milliseconds"]]
output["target"] = output["target"].astype(int)
output.columns = ["user", "item", "target", "timestamp"]
output

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output["target"] = output["target"].astype(int)


Unnamed: 0,user,item,target,timestamp
0,7285,22815,0,1546851602000
1,7285,10432,0,1546851602000
2,7285,434,1,1546851602000
3,40252,21079,0,1546855011000
4,40252,20128,0,1546855011000
...,...,...,...,...
3423680,32390,20314,1,1555318176000
3423681,32390,13273,0,1555318176000
3423682,32390,4260,0,1555318176000
3423683,32390,12892,0,1555318176000


In [11]:
output = output.groupby(["user", "item"]).agg({"target": "sum"}).reset_index()
output.loc[output["target"] > 0, "target"] = 1
output

Unnamed: 0,user,item,target
0,0,181,0
1,0,221,0
2,0,1440,0
3,0,2607,0
4,0,2920,1
...,...,...,...
1202256,42152,26373,0
1202257,42152,26494,0
1202258,42152,27215,0
1202259,42152,28527,0


In [12]:
output["target"].value_counts()

0    1059621
1     142640
Name: target, dtype: int64

In [13]:
user_to_idx = {user: idx for idx, user in enumerate(output["user"].unique())}
item_to_idx = {item: idx for idx, item in enumerate(output["item"].unique())}
output["user"] = output["user"].map(user_to_idx)
output["item"] = output["item"].map(item_to_idx)

## implicit

In [31]:
train_data = output[:800_000].reset_index(drop=True)
val_data = output[800_000:1_000_000].reset_index(drop=True)
test_data = output[1_000_000:].reset_index(drop=True)

In [18]:
train_data.to_csv("train_data_implicit.csv", index=False)
val_data.to_csv("val_data_implicit.csv", index=False)
test_data.to_csv("test_data_implicit.csv", index=False)

In [19]:
output["user"].nunique()

28597

In [20]:
output["item"].nunique()

6733

## implicit_br

In [28]:
tmp0 = train_data.loc[train_data["target"] == 0, ["user", "item"]]
tmp1 = train_data.loc[train_data["target"] == 1, ["user", "item"]]

train_data = tmp0.merge(tmp1, "inner", "user", suffixes=("_neg", "_pos"))
train_data = train_data.sample(frac=0.2, random_state=0).reset_index(drop=True)

In [30]:
train_data.to_csv("train_data_implicit_bpr.csv", index=False)
val_data.to_csv("val_data_implicit_bpr.csv", index=False)
test_data.to_csv("test_data_implicit_bpr.csv", index=False)