# Item relation matrices

### Get the data

In [1]:
import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt64, "type": str})
df_schema = {"session": pl.UInt32, "events": pl.List(event_schema)}

df = pl.read_ndjson('../data/train.jsonl', schema=df_schema, low_memory=True)

print(df)

shape: (12_899_779, 2)
┌──────────┬─────────────────────────────────┐
│ session  ┆ events                          │
│ ---      ┆ ---                             │
│ u32      ┆ list[struct[3]]                 │
╞══════════╪═════════════════════════════════╡
│ 0        ┆ [{1517085,1659304800025,"click… │
│ 1        ┆ [{424964,1659304800025,"carts"… │
│ 2        ┆ [{763743,1659304800038,"clicks… │
│ 3        ┆ [{1425967,1659304800095,"carts… │
│ 4        ┆ [{613619,1659304800119,"clicks… │
│ …        ┆ …                               │
│ 12899774 ┆ [{33035,1661723968869,"clicks"… │
│ 12899775 ┆ [{1743151,1661723970935,"click… │
│ 12899776 ┆ [{548599,1661723972537,"clicks… │
│ 12899777 ┆ [{384045,1661723976974,"clicks… │
│ 12899778 ┆ [{561560,1661723983611,"clicks… │
└──────────┴─────────────────────────────────┘


In [2]:
# Sessions
exploded_df = (
    df
    .explode("events")
    .unnest("events")
)

print(exploded_df)

shape: (216_716_096, 4)
┌──────────┬─────────┬───────────────┬────────┐
│ session  ┆ aid     ┆ ts            ┆ type   │
│ ---      ┆ ---     ┆ ---           ┆ ---    │
│ u32      ┆ u32     ┆ u64           ┆ str    │
╞══════════╪═════════╪═══════════════╪════════╡
│ 0        ┆ 1517085 ┆ 1659304800025 ┆ clicks │
│ 0        ┆ 1563459 ┆ 1659304904511 ┆ clicks │
│ 0        ┆ 1309446 ┆ 1659367439426 ┆ clicks │
│ 0        ┆ 16246   ┆ 1659367719997 ┆ clicks │
│ 0        ┆ 1781822 ┆ 1659367871344 ┆ clicks │
│ …        ┆ …       ┆ …             ┆ …      │
│ 12899776 ┆ 1737908 ┆ 1661723987073 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723976974 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723986800 ┆ clicks │
│ 12899778 ┆ 561560  ┆ 1661723983611 ┆ clicks │
│ 12899778 ┆ 32070   ┆ 1661723994936 ┆ clicks │
└──────────┴─────────┴───────────────┴────────┘


In [3]:
is_over_30_minutes = (pl.col("time_between_sec") > 30*60) # 30 minutes in seconds
is_same_session = (pl.col("session") == pl.col("next_session"))

sub_sessions = (
    exploded_df
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .sort(["session", "ts"])
    .with_columns(
        next_session = pl.col("session").shift(-1),
        next_aid = pl.col("aid").shift(-1),
        next_ts = pl.col("ts").shift(-1),
        next_type = pl.col("type").shift(-1),
    )
    # Calculate time between events for events in the same session
    .with_columns(
        time_between_sec = pl.when(is_same_session).then((pl.col("next_ts") - pl.col("ts"))).otherwise(None)
    )
    # Split session into sub sessions by checking if the time between events is more than 30 minutes or if the session changes
    .with_columns(
        sub_session=(
            (
                pl.when(is_over_30_minutes)
                .then(pl.col("time_between_sec")) # Distinct consecutive new sessions using the time between events
                .otherwise(True) # If the time between events is less than 30 minutes, it is the same session
            )
            |
                is_same_session.not_()
        )
        .rle_id().cast(pl.UInt32)
    )
    # Remove info about next event where the next event is a new session
    .with_columns(
        next_session = pl.when(is_over_30_minutes | is_same_session.not_()).then(None).otherwise(pl.col("next_session")),
        next_aid = pl.when(is_over_30_minutes | is_same_session.not_()).then(None).otherwise(pl.col("next_aid")),
        next_ts = pl.when(is_over_30_minutes | is_same_session.not_()).then(None).otherwise(pl.col("next_ts")),
        next_type = pl.when(is_over_30_minutes | is_same_session.not_()).then(None).otherwise(pl.col("next_type")),
        time_between_sec = pl.when(is_over_30_minutes | is_same_session.not_()).then(None).otherwise(pl.col("time_between_sec"))
    )
    .sort(["sub_session", "ts"])
)

print(sub_sessions)

print("Amount of sub sessions:", sub_sessions.select("sub_session").n_unique())

shape: (216_716_096, 10)
┌──────────┬─────────┬────────────┬────────┬───┬────────────┬───────────┬─────────────┬────────────┐
│ session  ┆ aid     ┆ ts         ┆ type   ┆ … ┆ next_ts    ┆ next_type ┆ time_betwee ┆ sub_sessio │
│ ---      ┆ ---     ┆ ---        ┆ ---    ┆   ┆ ---        ┆ ---       ┆ n_sec       ┆ n          │
│ u32      ┆ u32     ┆ u32        ┆ str    ┆   ┆ u32        ┆ str       ┆ ---         ┆ ---        │
│          ┆         ┆            ┆        ┆   ┆            ┆           ┆ u32         ┆ u32        │
╞══════════╪═════════╪════════════╪════════╪═══╪════════════╪═══════════╪═════════════╪════════════╡
│ 0        ┆ 1517085 ┆ 1659304800 ┆ clicks ┆ … ┆ 1659304904 ┆ clicks    ┆ 104         ┆ 0          │
│ 0        ┆ 1563459 ┆ 1659304904 ┆ clicks ┆ … ┆ null       ┆ null      ┆ null        ┆ 1          │
│ 0        ┆ 1309446 ┆ 1659367439 ┆ clicks ┆ … ┆ 1659367719 ┆ clicks    ┆ 280         ┆ 2          │
│ 0        ┆ 16246   ┆ 1659367719 ┆ clicks ┆ … ┆ 1659367871 ┆ clic

### Click to click matrix
Click to click matrix is defined as the probability of an aid being clicked immediately after the previous aid.
Click to click matrix is formed from the sub sessions since there is no point in counting subsequent clicks that are from a user coming back to site after a long time

In [60]:
# Count how many same click to click events there are
subsequent_clicks_count = (
    sub_sessions
    .filter((pl.col("type") == "clicks") & (pl.col("next_type") == "clicks"))
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_sum = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_sum, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["aid", "next_aid"])
)

print(click_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

shape: (63_195_404, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ u32     ┆ u32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 0       ┆ 0        ┆ 0.02381     │
│ 0       ┆ 13759    ┆ 0.02381     │
│ 0       ┆ 54474    ┆ 0.02381     │
│ 0       ┆ 90491    ┆ 0.02381     │
│ 0       ┆ 218900   ┆ 0.02381     │
│ …       ┆ …        ┆ …           │
│ 1855602 ┆ 1598688  ┆ 0.058824    │
│ 1855602 ┆ 1621374  ┆ 0.058824    │
│ 1855602 ┆ 1693232  ┆ 0.058824    │
│ 1855602 ┆ 1783511  ┆ 0.058824    │
│ 1855602 ┆ 1855602  ┆ 0.058824    │
└─────────┴──────────┴─────────────┘
Unique aids: 1852443
Total probability: 1852443.125


In [15]:
# save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix.csv")

### Click to cart matrix
Click to cart matrix is defined as the probability of clicked items being added to cart later in the same sub session.
Click to cart matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [96]:
# Get clicks and carts of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "click_ts", "aid": "click_aid"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "cart_ts", "aid": "cart_aid"})
)

carts_after_clicks_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine clicks and carts of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after clicks
    .filter(pl.col("click_ts") < pl.col("cart_ts"))
    .select(["sub_session", "click_aid", "cart_aid"])
)

# Count how many same click to cart events there are
click_to_cart_count = (
    carts_after_clicks_in_sub_sessions
    .group_by(["click_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_sum = (
    click_to_cart_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been clicked
click_to_cart_matrix = (
    click_to_cart_count
    .join(aid_clicks_sum, on="click_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["click_aid", "cart_aid"])
)

print(click_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_cart_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_cart_matrix.select("probability").sum().select(pl.first()).item())

shape: (104_794_185, 3)
┌───────────┬──────────┬─────────────┐
│ click_aid ┆ cart_aid ┆ probability │
│ ---       ┆ ---      ┆ ---         │
│ u32       ┆ u32      ┆ f32         │
╞═══════════╪══════════╪═════════════╡
│ 0         ┆ 29217    ┆ 0.018182    │
│ 0         ┆ 31465    ┆ 0.018182    │
│ 0         ┆ 45036    ┆ 0.018182    │
│ 0         ┆ 78027    ┆ 0.018182    │
│ 0         ┆ 150507   ┆ 0.036364    │
│ …         ┆ …        ┆ …           │
│ 1855602   ┆ 1376245  ┆ 0.083333    │
│ 1855602   ┆ 1504078  ┆ 0.083333    │
│ 1855602   ┆ 1513725  ┆ 0.041667    │
│ 1855602   ┆ 1768521  ┆ 0.041667    │
│ 1855602   ┆ 1783511  ┆ 0.083333    │
└───────────┴──────────┴─────────────┘
Unique aids: 1712957
Total probability: 1712956.875


In [97]:
# save to csv
click_to_cart_matrix.write_csv("./click_to_cart_matrix.csv")