# Getting started with the EB-NeRD

In [1]:
from pathlib import Path
import polars as pl

from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors,
    min_max_impression_time_history,
)
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL,
)
from exputils.const import DATA_DIR

## Load dataset:

In [2]:
# All EB-NeRD splits live under one common folder inside DATA_DIR.
ebnerd_root = DATA_DIR / "ebnerd"

# Map each split name to the directory holding its parquet files.
data_dirs = {
    "train": ebnerd_root / "train",
    "validation": ebnerd_root / "validation",
    "test": ebnerd_root / "ebnerd_testset" / "test",
}

# Split used by the cells below; it is used as a key into `data_dirs`.
data_split = "train"

In [3]:
# Resolve the directory of the selected split once, then register lazy scans
# of its two parquet files (`scan_parquet` returns a LazyFrame; the data is
# only materialised when `.collect()` is called further down).
split_dir = data_dirs[data_split]
df_behaviors = pl.scan_parquet(split_dir / "behaviors.parquet")
df_history = pl.scan_parquet(split_dir / "history.parquet")

### Check the min/max timestamps covered by the selected data split

In [4]:
# Materialise the earliest/latest impression time of the history frame and
# report it before moving on to the behaviors frame.
history_time_span = min_max_impression_time_history(df_history).collect()
print(f"History: {history_time_span}")

behaviors_time_span = min_max_impression_time_behaviors(df_behaviors).collect()
print(f"Behaviors: {behaviors_time_span}")

History: shape: (1, 2)
┌─────────────────────┬─────────────────────┐
│ min                 ┆ max                 │
│ ---                 ┆ ---                 │
│ datetime[μs]        ┆ datetime[μs]        │
╞═════════════════════╪═════════════════════╡
│ 2023-04-27 07:00:00 ┆ 2023-05-18 06:59:59 │
└─────────────────────┴─────────────────────┘
Behaviors: shape: (1, 2)
┌─────────────────────┬─────────────────────┐
│ min                 ┆ max                 │
│ ---                 ┆ ---                 │
│ datetime[μs]        ┆ datetime[μs]        │
╞═════════════════════╪═════════════════════╡
│ 2023-05-18 07:00:00 ┆ 2023-05-25 06:59:59 │
└─────────────────────┴─────────────────────┘


## Add History to Behaviors

In [5]:
# Keep only the user id and the list of previously read article ids, then hand
# the frame to `truncate_history`.
# NOTE(review): presumably this caps every history at `history_size` entries
# and fills shorter ones with `padding_value` -- confirm against the helper.
df_history = truncate_history(
    df_history.select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL),
    column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_size=30,
    padding_value=0,
    enable_warning=False,
)

# Preview the first rows; the frame is still lazy, hence the `.collect()`.
df_history.head(5).collect()

user_id,article_id_fixed
u32,list[i32]
10029,"[9768708, 9768790, … 9770541]"
10033,"[9768802, 9767765, … 9769404]"
10034,"[9756899, 9764579, … 9767363]"
10041,"[9758866, 9758858, … 9757869]"
10103,"[9759929, 9760528, … 9769433]"


In [6]:
# Materialise both lazy frames and attach the history column to the
# behaviors on the user id.
# NOTE(review): `how="left"` presumably keeps every behaviors row even when a
# user has no history -- confirm against `slice_join_dataframes`.
df = slice_join_dataframes(
    df1=df_behaviors.collect(),
    df2=df_history.collect(),
    how="left",
    on=DEFAULT_USER_COL,
)

# Preview of the joined frame.
df.head(5)

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,article_id_fixed
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32,list[i32]
47727,,2023-05-21 21:35:07,20.0,,1,"[9482380, 9775183, … 9538375]",[9775183],18293,False,,,,False,265,34.0,100.0,"[9766452, 9766419, … 9768850]"
47731,,2023-05-21 21:32:33,13.0,,1,"[9774557, 9774516, … 9759966]",[9759966],18293,False,,,,False,265,45.0,100.0,"[9766452, 9766419, … 9768850]"
47736,,2023-05-21 21:33:32,17.0,,1,"[9759966, 9774557, … 9775323]",[9774652],18293,False,,,,False,265,78.0,100.0,"[9766452, 9766419, … 9768850]"
47737,,2023-05-21 21:38:17,27.0,,1,"[9774580, 9775131, … 9774899]",[9775184],18293,False,,,,False,265,6.0,52.0,"[9766452, 9766419, … 9768850]"
47740,,2023-05-21 21:36:02,48.0,,1,"[9774826, 9775171, … 9774648]",[9774648],18293,False,,,,False,265,32.0,100.0,"[9766452, 9766419, … 9768850]"


## Generate labels

Here's an example of how to generate binary labels based on `article_ids_clicked` and `article_ids_inview`:

In [7]:
# Build the labels from the clicked / in-view columns only, then append
# `labels_len`, the number of entries in each `labels` list, and show a preview.
(
    df.select(DEFAULT_CLICKED_ARTICLES_COL, DEFAULT_INVIEW_ARTICLES_COL)
    .pipe(create_binary_labels_column, shuffle=True, seed=123)
    .with_columns(pl.col("labels").list.len().name.suffix("_len"))
    .head(5)
)

article_ids_clicked,article_ids_inview,labels,labels_len
list[i32],list[i32],list[i8],u32
[9775183],"[9482380, 9774020, … 9775183]","[0, 0, … 1]",6
[9759966],"[9775331, 9774516, … 9774557]","[0, 0, … 0]",5
[9774652],"[9746360, 9738729, … 9774079]","[0, 0, … 0]",13
[9775184],"[9775131, 9775283, … 9774789]","[0, 0, … 0]",11
[9774648],"[9774555, 9774648, … 9775056]","[0, 1, … 0]",9


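An example using the negative down-sampling strategy employed by Wu et al. (2019). With `NPRATIO = 2`, each impression below keeps its clicked article plus two non-clicked in-view articles: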

In [8]:
# Value passed as `npratio` to the sampling helper below.
NPRATIO = 2

# Down-sample the in-view articles first, then create the labels on the
# sampled lists and append `labels_len` (the length of each `labels` list).
(
    df.select(DEFAULT_CLICKED_ARTICLES_COL, DEFAULT_INVIEW_ARTICLES_COL)
    .pipe(
        sampling_strategy_wu2019,
        npratio=NPRATIO,
        shuffle=False,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column, shuffle=True, seed=123)
    .with_columns(pl.col("labels").list.len().name.suffix("_len"))
    .head(5)
)

article_ids_clicked,article_ids_inview,labels,labels_len
list[i64],list[i64],list[i8],u32
[9775183],"[9775183, 9775297, 9538375]","[1, 0, 0]",3
[9759966],"[9759966, 9774516, 9775277]","[1, 0, 0]",3
[9774652],"[9772300, 9774652, 9775323]","[0, 1, 0]",3
[9775184],"[9774972, 9774899, 9775184]","[0, 0, 1]",3
[9774648],"[9774648, 9769624, 9772275]","[1, 0, 0]",3
