# Item relation matrices

### Get the data

In [1]:
import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt64, "type": str})
df_schema = {"session": pl.UInt32, "events": pl.List(event_schema)}

df = pl.read_ndjson('../data/train.jsonl', schema=df_schema, low_memory=True)

print(df)

shape: (12_899_779, 2)
┌──────────┬─────────────────────────────────┐
│ session  ┆ events                          │
│ ---      ┆ ---                             │
│ u32      ┆ list[struct[3]]                 │
╞══════════╪═════════════════════════════════╡
│ 0        ┆ [{1517085,1659304800025,"click… │
│ 1        ┆ [{424964,1659304800025,"carts"… │
│ 2        ┆ [{763743,1659304800038,"clicks… │
│ 3        ┆ [{1425967,1659304800095,"carts… │
│ 4        ┆ [{613619,1659304800119,"clicks… │
│ …        ┆ …                               │
│ 12899774 ┆ [{33035,1661723968869,"clicks"… │
│ 12899775 ┆ [{1743151,1661723970935,"click… │
│ 12899776 ┆ [{548599,1661723972537,"clicks… │
│ 12899777 ┆ [{384045,1661723976974,"clicks… │
│ 12899778 ┆ [{561560,1661723983611,"clicks… │
└──────────┴─────────────────────────────────┘


In [2]:
# Sessions
exploded_df = (
    df
    .explode("events")
    .unnest("events")
)

print(exploded_df)

shape: (216_716_096, 4)
┌──────────┬─────────┬───────────────┬────────┐
│ session  ┆ aid     ┆ ts            ┆ type   │
│ ---      ┆ ---     ┆ ---           ┆ ---    │
│ u32      ┆ u32     ┆ u64           ┆ str    │
╞══════════╪═════════╪═══════════════╪════════╡
│ 0        ┆ 1517085 ┆ 1659304800025 ┆ clicks │
│ 0        ┆ 1563459 ┆ 1659304904511 ┆ clicks │
│ 0        ┆ 1309446 ┆ 1659367439426 ┆ clicks │
│ 0        ┆ 16246   ┆ 1659367719997 ┆ clicks │
│ 0        ┆ 1781822 ┆ 1659367871344 ┆ clicks │
│ …        ┆ …       ┆ …             ┆ …      │
│ 12899776 ┆ 1737908 ┆ 1661723987073 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723976974 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723986800 ┆ clicks │
│ 12899778 ┆ 561560  ┆ 1661723983611 ┆ clicks │
│ 12899778 ┆ 32070   ┆ 1661723994936 ┆ clicks │
└──────────┴─────────┴───────────────┴────────┘


In [None]:
def get_sub_sessions(with_next_event=True, only_clicks=False, with_row_index=False, limit=None):
    """
    Splits sessions into sub sessions based on time between events or session boundaries.

    Args:
        with_next_event: whether rows should include consecutive events or not.
        only_clicks: only clicks are considered in the sub sessions
        with_row_index: add row index to the sub sessions in order to calculate delta time between events
        limit: limit the amount of rows to be processed.

    Returns: DataFrame of sub sessions
    """

    sub_sessions = exploded_df

    if limit is not None:
        sub_sessions = exploded_df.limit(limit)
    if only_clicks:
        sub_sessions = sub_sessions.filter(pl.col("type") == "clicks")

    sub_sessions = (
        sub_sessions
        # Convert ts to seconds and cast to UInt32 to save memory
        .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
        .sort(["session", "ts"])
        .with_columns(
            next_session = pl.col("session").shift(-1),
            next_aid = pl.col("aid").shift(-1),
            next_ts = pl.col("ts").shift(-1),
            next_type = pl.col("type").shift(-1),
        )
        # Row is a sub session boundary if there is existing session boundary or if time between events is more than 30 minutes
        .with_columns(
            is_session_boundary = ((pl.col("session") != pl.col("next_session")) | (pl.col("next_ts") - pl.col("ts") > 1800)),
        )
        .with_columns(
            sub_session = pl.col("is_session_boundary").cum_sum().cast(pl.UInt32),
        )
    )

    # Use this when immediate next event is required
    # Last event of sub session is always found in the same row as the second last
    if with_next_event:
        sub_sessions = (
            sub_sessions
            # Filter out session boundaries. This also removes sub sessions with only 1 event which are not interesting
            .filter(pl.col("is_session_boundary").not_())
            .drop("is_session_boundary")
        )
    # Use this when all events are wanted to be found in their own rows and single column
    else:
        sub_sessions = (
            sub_sessions
            # Keep each event in their own row
            .with_columns(sub_session = pl.when(pl.col("is_session_boundary")).then(pl.col("sub_session") - 1).otherwise(pl.col("sub_session")))
            .drop(["session", "next_session", "next_aid", "next_ts", "next_type", "is_session_boundary"])
            .filter(pl.col("sub_session").is_null().not_())
        )

        # Filter out sub session with only one event
        multi_event_sub_sessions = (
            sub_sessions
            .group_by("sub_session")
            .agg(pl.len())
            .filter(pl.col("len") > 1)
            .select("sub_session")
        )

        sub_sessions = (
            sub_sessions
            .join(multi_event_sub_sessions, on="sub_session", how="inner")
        )

    if with_row_index:
        sub_sessions = sub_sessions.with_row_index()

    return sub_sessions

# sub_sessions = get_sub_sessions()
# print(sub_sessions)
# print("Amount of sub sessions:", sub_sessions.select("sub_session").n_unique())

In [None]:
# Helper functions
# Note there should only be used when sub_session are generated with param with_next_event=False
# Otherwise some events are lost

def get_clicks_of_sub_session(sub_sessions):
    return (
        sub_sessions
        .filter(pl.col("type") == "clicks")
        .select(["sub_session", "ts", "aid"])
        .rename({"ts": "click_ts", "aid": "click_aid"})
    )

def get_carts_of_sub_session(sub_sessions):
    return (
        sub_sessions
        .filter(pl.col("type") == "carts")
        .select(["sub_session", "ts", "aid"])
        .rename({"ts": "cart_ts", "aid": "cart_aid"})
    )

def get_orders_of_sub_session(sub_sessions):
    return (
        sub_sessions
        .filter(pl.col("type") == "orders")
        .select(["sub_session", "ts", "aid"])
        .rename({"ts": "order_ts", "aid": "order_aid"})
    )

### Click to click matrix
Click to click matrix is defined as the probabilities of other aids being clicked after the previous aid is clicked.
Click to click matrix is formed from the sub sessions since there is no point in counting subsequent clicks that are from a user coming back to site after a long time

We do multiple variations of the click to click matrix:

Next click only

In [None]:
sub_sessions = get_sub_sessions()

# Count how many same click to click events there are
subsequent_clicks_count = (
    sub_sessions
    .filter((pl.col("type") == "clicks") & (pl.col("next_type") == "clicks"))
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("total_count"))
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_total_count, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
)

print(click_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix.csv")

Next click only. Remove carts and orders from the sub sessions.

In [None]:
sub_sessions = get_sub_sessions(only_clicks=True)

# Count how many same click to click events there are
subsequent_clicks_count = (
    sub_sessions
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("total_count"))
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_total_count, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
)

print(click_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix_only_clicks.csv")

Next two clicks with time decay. Remove carts and orders from the sub sessions.

In [6]:
# Read and manipulate data and then save it to csv in order to rest with streaming.
# Running out of memory otherwise

import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt64, "type": str})
df_schema = {"session": pl.UInt32, "events": pl.List(event_schema)}

sub_sessions = (
    pl.read_ndjson('../data/train.jsonl', schema=df_schema, low_memory=True)
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .sort(["session", "ts"])
    .with_columns(
        next_session = pl.col("session").shift(-1),
        next_aid = pl.col("aid").shift(-1),
        next_ts = pl.col("ts").shift(-1),
        next_type = pl.col("type").shift(-1),
    )
    # Row is a sub session boundary if there is existing session boundary or if time between events is more than 1 hour
    .with_columns(
        is_session_boundary = ((pl.col("session") != pl.col("next_session")) | (pl.col("next_ts") - pl.col("ts") > 3600)),
    )
    .with_columns(
        sub_session = pl.col("is_session_boundary").cum_sum().cast(pl.UInt32),
    )
    # Keep each event in their own row
    .with_columns(sub_session = pl.when(pl.col("is_session_boundary")).then(pl.col("sub_session") - 1).otherwise(pl.col("sub_session")))
    .drop(["session", "ts", "type", "next_session", "next_aid", "next_ts", "next_type", "is_session_boundary"])
    .filter(pl.col("sub_session").is_null().not_())
    # Filter sub session with only one event
    .group_by(pl.col("sub_session"))
    .agg(pl.col("aid"))
    .filter(pl.col("aid").list.len() > 1)
    .explode("aid")
    .with_row_index()
)

print(sub_sessions)

# Save to csv
sub_sessions.write_csv("./clicks/clicks_of_sub_sessions_1h.csv")

shape: (200_096_794, 3)
┌───────────┬─────────────┬─────────┐
│ index     ┆ sub_session ┆ aid     │
│ ---       ┆ ---         ┆ ---     │
│ u32       ┆ u32         ┆ u32     │
╞═══════════╪═════════════╪═════════╡
│ 0         ┆ 12115466    ┆ 552662  │
│ 1         ┆ 12115466    ┆ 871283  │
│ 2         ┆ 12115466    ┆ 1436133 │
│ 3         ┆ 12115466    ┆ 871283  │
│ 4         ┆ 12115466    ┆ 1006139 │
│ …         ┆ …           ┆ …       │
│ 200096789 ┆ 2372394     ┆ 1283290 │
│ 200096790 ┆ 2372394     ┆ 723240  │
│ 200096791 ┆ 10679973    ┆ 1828755 │
│ 200096792 ┆ 10679973    ┆ 1581840 │
│ 200096793 ┆ 10679973    ┆ 63237   │
└───────────┴─────────────┴─────────┘


In [5]:
# Run computations in streaming mode in order to not run out of memory

import polars as pl

n_subsequent_clicks = 7

df_schema = {
    "index": pl.UInt32,
    "sub_session": pl.UInt32,
    "aid": pl.UInt32
}

subsequent_clicks = (
    pl.scan_csv("./clicks/clicks_of_sub_sessions_1h.csv", schema=df_schema)
    .join(
        pl.scan_csv("./clicks/clicks_of_sub_sessions_1h.csv", schema=df_schema).rename({"index": "next_index", "aid": "next_aid"}),
        on="sub_session",
        how="inner"
    )
    .with_columns(delta_index=(pl.col("next_index") - pl.col("index")))
    .filter((pl.col("delta_index") >= 1) & (pl.col("delta_index") <= n_subsequent_clicks))
    .with_columns(weight=(1/pl.col("delta_index")).cast(pl.Float32))
    .drop(["index", "next_index", "sub_session", "delta_index"])
)

# Sum all the weights for each click-click pair
click_to_click_count = (
    subsequent_clicks
    .group_by(["aid", "next_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    click_to_click_count
    .group_by("aid")
    .agg(pl.col("weighted_count").sum().alias("weighted_total_count"))
)


# Calculate the probabilities of items being clicked after another item has been clicked
click_to_click_matrix = (
    click_to_click_count
    .join(aid_clicks_total_count, on="aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
    .collect(streaming=True)
)

print(click_to_click_matrix)

FileNotFoundError: The system cannot find the file specified. (os error 2): ./clicks/clicks_of_sub_sessions_1h.csv

This error occurred with the following context stack:
	[1] 'csv scan'
	[2] 'join left'
	[3] 'join'
	[4] 'with_columns'
	[5] 'filter'
	[6] 'with_columns'
	[7] 'format!("{}", function).to_lowercase()'
	[8] 'group_by'
	[9] 'join left'
	[10] 'join'
	[11] 'with_columns'
	[12] 'format!("{}", function).to_lowercase()'
	[13] 'sort'
	[14] 'format!("{}", function).to_lowercase()'


In [None]:
# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [4]:
# Save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix_only_clicks_time_decay_7_1h.csv")

### Click to cart matrix
Click to cart matrix is defined as the probabilities of other aids being added to cart in the same sub session after an aid is clicked.
Click to cart matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get clicks and carts of sub sessions
clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

carts_after_clicks_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine clicks and carts of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after clicks
    .filter(pl.col("click_ts") < pl.col("cart_ts"))
    .select(["sub_session", "click_aid", "cart_aid"])
)

# Count how many same click to cart events there are
click_to_cart_count = (
    carts_after_clicks_in_sub_sessions
    .group_by(["click_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    click_to_cart_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been clicked
click_to_cart_matrix = (
    click_to_cart_count
    .join(aid_clicks_total_count, on="click_aid", how="inner")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["click_aid", "cart_aid"])
)

print(click_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_cart_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_cart_matrix.write_csv("./click_to_cart_matrix.csv")

Click to cart with time decay. Carts happening later in the session count less into the probability than carts that are closer to the click.
The first cart relative to the clicked aid has weight of 1, second has weight of 1/2, third 1/3, etc...

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out orders
    .filter(pl.col("type") != "orders")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "carts").cum_sum()))
)

# Get clicks and carts of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .rename({"ts": "click_ts", "aid": "click_aid", "index": "click_index"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

carts_after_clicks_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine clicks and carts of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after clicks
    .filter(pl.col("click_ts") < pl.col("cart_ts"))
    .select(["sub_session", "click_aid", "cart_aid", "click_index", "cart_index"])
    # Weight the click-to-cart relation based in index. Next cart has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("cart_index")-pl.col("click_index"))).cast(pl.Float32))
    .drop(["click_index", "cart_index"])
)

# Sum all the weights for each click-cart pair
click_to_cart_count = (
    carts_after_clicks_in_sub_sessions
    .group_by(["click_aid", "cart_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    carts_after_clicks_in_sub_sessions
    .group_by("click_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to cart after another item has been clicked
click_to_cart_matrix = (
    click_to_cart_count
    .join(aid_clicks_total_count, on="click_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["click_aid", "cart_aid"])
)

print(click_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_cart_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# Save to csv
click_to_cart_matrix.write_csv("./click_to_cart_matrix_time_decay.csv")

### Click to order matrix
Click to order matrix is defined as the probabilities of other aids being ordered in the same sub session after an aid is clicked.
Click to order matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get clicks and orders of sub sessions
clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

orders_after_clicks_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine clicks and orders of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after clicks
    .filter(pl.col("click_ts") < pl.col("order_ts"))
    .select(["sub_session", "click_aid", "order_aid"])
)

# Count how many same click to order events there are
click_to_order_count = (
    orders_after_clicks_in_sub_sessions
    .group_by(["click_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    click_to_order_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been clicked
click_to_order_matrix = (
    click_to_order_count
    .join(aid_clicks_total_count, on="click_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["click_aid", "order_aid"])
)

print(click_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_order_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_order_matrix.write_csv("./click_to_order_matrix.csv")

Click to order with time decay

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out carts
    .filter(pl.col("type") != "carts")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "orders").cum_sum()))
)

# Get clicks and orders of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .rename({"ts": "click_ts", "aid": "click_aid", "index": "click_index"})
)

orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

orders_after_clicks_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine clicks and orders of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after clicks
    .filter(pl.col("click_ts") < pl.col("order_ts"))
    .select(["sub_session", "click_aid", "order_aid", "click_index", "order_index"])
    # Weight the click-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("order_index")-pl.col("click_index"))).cast(pl.Float32))
    .drop(["click_index", "order_index"])
)

# Sum all the weights for each click-order pair
click_to_order_count = (
    orders_after_clicks_in_sub_sessions
    .group_by(["click_aid", "order_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    orders_after_clicks_in_sub_sessions
    .group_by("click_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to order after another item has been clicked
click_to_order_matrix = (
    click_to_order_count
    .join(aid_clicks_total_count, on="click_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["click_aid", "order_aid"])
)

print(click_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_order_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# Save to csv
click_to_order_matrix.write_csv("./click_to_order_matrix_time_decay.csv")

### Cart to click matrix
Cart to click matrix is defined as the probabilities of other aids being clicked immediately after an aid is added to cart.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get carts and clicks of sub sessions
carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

clicks_after_carts_in_sub_sessions = (
    clicks_of_sub_sessions
    # Combine carts and clicks of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only clicks that happened after carts
    .filter(pl.col("cart_ts") < pl.col("click_ts"))
    .select(["sub_session", "cart_aid", "click_aid"])
)

# Count how many same cart to click events there are
cart_to_click_count = (
    clicks_after_carts_in_sub_sessions
    .group_by(["cart_aid", "click_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_total_count = (
    cart_to_click_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been added to cart
cart_to_click_matrix = (
    cart_to_click_count
    .join(aid_carts_total_count, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["cart_aid", "click_aid"])
)

print(cart_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_click_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_click_matrix.write_csv("./cart_to_click_matrix.csv")

### Cart to cart matrix
Cart to cart matrix is defined as the probabilities of other aids being added to cart later in the same sub session where an aid is added to cart.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get carts of sub sessions
carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

next_carts_of_sub_sessions = (
    carts_of_sub_sessions
    .rename({"cart_ts": "next_cart_ts", "cart_aid": "next_cart_aid"})
)

# Find subsequent carts in the same sub session
subsequent_carts = (
    carts_of_sub_sessions
    .join(next_carts_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("cart_ts") < pl.col("next_cart_ts"))
    .select(["sub_session", "cart_aid", "next_cart_aid"])
)

# Count how many same cart to cart events there are
subsequent_carts_count = (
    subsequent_carts
    .group_by(["cart_aid", "next_cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_total_count = (
    subsequent_carts_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been added to cart
cart_to_cart_matrix = (
    subsequent_carts_count
    .join(aid_carts_total_count, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["cart_aid", "next_cart_aid"])
)

print(cart_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_cart_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_cart_matrix.write_csv("./cart_to_cart_matrix.csv")

Cart to cart with time decay

In [None]:
carts_of_sub_sessions = (
    get_sub_sessions(with_next_event=False)
    .filter(pl.col("type") == "carts")
    .with_row_index()
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

next_carts_of_sub_sessions = (
    carts_of_sub_sessions
    .rename({"cart_ts": "next_cart_ts", "cart_aid": "next_cart_aid", "cart_index": "next_cart_index"})
)

# Find subsequent carts in the same sub session
subsequent_carts = (
    carts_of_sub_sessions
    .join(next_carts_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("cart_ts") < pl.col("next_cart_ts"))
    .select(["sub_session", "cart_aid", "next_cart_aid", "cart_index", "next_cart_index"])
    # Weight the cart-to-cart relation based in index. Next cart has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("next_cart_index")-pl.col("cart_index"))).cast(pl.Float32))
    .drop(["cart_index", "next_cart_index"])
)

# Sum all the weights for each cart-cart pair
cart_to_cart_count = (
    subsequent_carts
    .group_by(["cart_aid", "next_cart_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each cart
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_carts_total_count = (
    subsequent_carts
    .group_by("cart_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to cart after another item has been clicked
cart_to_cart_matrix = (
    cart_to_cart_count
    .join(aid_carts_total_count, on="cart_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["cart_aid", "next_cart_aid"])
)

print(cart_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_cart_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_cart_matrix.write_csv("./cart_to_cart_matrix_time_decay.csv")

### Cart to order matrix
Cart to order matrix is defined as the probabilities of other aids being ordered later in the same sub session where an aid is added to cart.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get carts and orders of sub sessions
carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

orders_after_carts_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("cart_ts") < pl.col("order_ts"))
    .select(["sub_session", "cart_aid", "order_aid"])
)

# Count how many same cart to order events there are
cart_to_order_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["cart_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_total_count = (
    cart_to_order_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been added to cart
cart_to_order_matrix = (
    cart_to_order_count
    .join(aid_carts_total_count, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["cart_aid", "order_aid"])
)

print(cart_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_order_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_order_matrix.write_csv("./cart_to_order_matrix.csv")

Cart to order with time decay

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out clicks
    .filter(pl.col("type") != "clicks")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "orders").cum_sum()))
)

# Get clicks and orders of sub sessions
carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

orders_after_carts_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("cart_ts") < pl.col("order_ts"))
    .select(["sub_session", "cart_aid", "order_aid", "cart_index", "order_index"])
    # Weight the cart-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("order_index")-pl.col("cart_index"))).cast(pl.Float32))
    .drop(["cart_index", "order_index"])
)

# Sum all the weights for each cart-order pair
cart_to_order_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["cart_aid", "order_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each cart
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_carts_total_count = (
    orders_after_carts_in_sub_sessions
    .group_by("cart_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to order after another item has been clicked
cart_to_order_matrix = (
    cart_to_order_count
    .join(aid_carts_total_count, on="cart_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["cart_aid", "order_aid"])
)

print(cart_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_order_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_order_matrix.write_csv("./cart_to_order_matrix_time_decay.csv")

### Order to click matrix
Order to click matrix is defined as the probabilities of other aids being clicked immediately after an aid is ordered.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get orders and clicks of sub sessions
orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

clicks_after_orders_in_sub_sessions = (
    clicks_of_sub_sessions
    # Combine orders and clicks of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only clicks that happened after orders
    .filter(pl.col("order_ts") < pl.col("click_ts"))
    .select(["sub_session", "order_aid", "click_aid"])
)

# Count how many same order to click events there are
order_to_click_count = (
    clicks_after_orders_in_sub_sessions
    .group_by(["order_aid", "click_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_total_count = (
    order_to_click_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been ordered
order_to_click_matrix = (
    order_to_click_count
    .join(aid_orders_total_count, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["order_aid", "click_aid"])
)

print(order_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_click_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_click_matrix.write_csv("./order_to_click_matrix.csv")

### Order to cart matrix
Order to cart matrix is defined as the probabilities of other aids being added to cart later in the same sub session where an aid is ordered.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get orders and carts of sub sessions
orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

carts_after_orders_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine orders and carts of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after orders
    .filter(pl.col("order_ts") < pl.col("cart_ts"))
    .select(["sub_session", "order_aid", "cart_aid"])
)

# Count how many same order to cart events there are
order_to_cart_count = (
    carts_after_orders_in_sub_sessions
    .group_by(["order_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_total_count = (
    order_to_cart_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been ordered
order_to_cart_matrix = (
    order_to_cart_count
    .join(aid_orders_total_count, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["order_aid", "cart_aid"])
)

print(order_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_cart_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_cart_matrix.write_csv("./order_to_cart_matrix.csv")

Order to cart with time decay

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out clicks
    .filter(pl.col("type") != "clicks")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "cart").cum_sum()))
)

# Get clicks and orders of sub sessions
orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

carts_after_orders_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("order_ts") < pl.col("cart_ts"))
    .select(["sub_session", "cart_aid", "order_aid", "cart_index", "order_index"])
    # Weight the cart-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("cart_index")-pl.col("order_index"))).cast(pl.Float32))
    .drop(["cart_index", "order_index"])
)

# Sum all the weights for each order-cart pair
order_to_cart_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["order_aid", "cart_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each cart
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_orders_total_count = (
    orders_after_carts_in_sub_sessions
    .group_by("order_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to cart after another item has been ordered
order_to_cart_matrix = (
    order_to_cart_count
    .join(aid_orders_total_count, on="order_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["order_aid", "cart_aid"])
)

print(order_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_cart_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_cart_matrix.write_csv("./order_to_cart_matrix_time_decay.csv")

### Order to order matrix
Order to order matrix is defined as the probabilities of other aids being ordered later in the same sub session where an aid is ordered.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get orders of sub sessions
orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

next_orders_of_sub_sessions = (
    orders_of_sub_sessions
    .rename({"order_ts": "next_order_ts", "order_aid": "next_order_aid"})
)

# Find subsequent orders in the same sub session
subsequent_orders = (
    orders_of_sub_sessions
    .join(next_orders_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("order_ts") < pl.col("next_order_ts"))
    .select(["sub_session", "order_aid", "next_order_aid"])
)

# Count how many same order to order events there are
subsequent_orders_count = (
    subsequent_orders
    .group_by(["order_aid", "next_order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_total_count = (
    subsequent_orders_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been ordered
order_to_order_matrix = (
    subsequent_orders_count
    .join(aid_orders_total_count, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["order_aid", "next_order_aid"])
)

print(order_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_order_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_order_matrix.write_csv("./order_to_order_matrix.csv")

Order to order with time decay

In [None]:
orders_of_sub_sessions = (
    get_sub_sessions(with_next_event=False)
    .filter(pl.col("type") == "orders")
    .with_row_index()
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

next_orders_of_sub_sessions = (
    orders_of_sub_sessions
    .rename({"order_ts": "next_order_ts", "order_aid": "next_order_aid", "order_index": "next_order_index"})
)

# Find subsequent orders in the same sub session
subsequent_orders = (
    orders_of_sub_sessions
    .join(next_orders_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("order_ts") < pl.col("next_order_ts"))
    .select(["sub_session", "order_aid", "next_order_aid", "order_index", "next_order_index"])
    # Weight the order-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("next_order_index")-pl.col("order_index"))).cast(pl.Float32))
    .drop(["order_index", "next_order_index"])
)

# Sum all the weights for each order-order pair
order_to_order_count = (
    subsequent_orders
    .group_by(["order_aid", "next_order_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each order
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_orders_total_count = (
    subsequent_orders
    .group_by("order_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being ordered after another item has been ordered
order_to_order_matrix = (
    order_to_order_count
    .join(aid_orders_total_count, on="order_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["order_aid", "next_order_aid"])
)

print(order_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_order_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_order_matrix.write_csv("./order_to_order_matrix_time_decay.csv")

### Order incompatibility matrix
Order incompatibility matrix is defined as links between items which are often ordered with same items but are never ordered together.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

orders_of_sub_sessions_df = get_orders_of_sub_session(sub_sessions)

# how many times item has to be ordered with another item so that their relation can be considered strong
ordered_together_threshold = 4

# Filter aids with too few total orders
allowed_aids_df = (
    orders_of_sub_sessions_df
    .group_by("order_aid")
    .agg(pl.len().alias("count"))
    .filter(pl.col("count") >= ordered_together_threshold)
    .sort("order_aid")
)

multi_order_sub_sessions_df = (
    orders_of_sub_sessions_df
    # filter aids with too few orders
    .join(allowed_aids_df.select("order_aid"), on="order_aid", how="inner")
    # group orders and get unique aids for each order
    .group_by("sub_session")
    # Remove items ordered multiple times in single session
    .agg(pl.col("order_aid").unique().sort())
    # filter out orders with only one item left
    .filter(pl.col("order_aid").list.len() > 1)
    .explode("order_aid")
)

# Find all order aid pairs an count how many times these pairs have been ordered together
ordered_together_count = (
    multi_order_sub_sessions_df
    .join(multi_order_sub_sessions_df, on="sub_session", how="inner")
    .drop("sub_session")
    .group_by(["order_aid", "order_aid_right"])
    .agg(pl.len().alias("count"))
    # Filter out pairs with too few orders
    .filter((pl.col("count") >= ordered_together_threshold) & (pl.col("order_aid") != pl.col("order_aid_right")))
)

# Combine aid pairs into groups of aids that have been ordered together
aids_ordered_together_df = (
    ordered_together_count
    .group_by("order_aid")
    .agg(pl.col("order_aid_right").alias("ordered_together_with"))
)

# Find all incompatible pairs by finding aids that have never been ordered together but have common aids that they are often ordered with
incompatible_pairs = []

print("Start")
for index, order_aid in enumerate(aids_ordered_together_df.select("order_aid").to_numpy().reshape(-1)):
    if (index+1) % 1000 == 0:
        print("row", index+1)

    aids_ordered_together = aids_ordered_together_df.row(by_predicate=(pl.col("order_aid") == order_aid))[1]
    for aid in aids_ordered_together:
        incompatible_aids = aids_ordered_together_df.row(by_predicate=(pl.col("order_aid") == aid))[1]
        new_incompatible_pairs = [(order_aid, aid) for aid in incompatible_aids]
        incompatible_pairs.extend(new_incompatible_pairs)

print("Done")
print()

incompatible_df = (
    pl.DataFrame(
        data=incompatible_pairs,
        orient='row',
        schema={"aid": pl.UInt32, "incompatible_aid": pl.UInt32}
    )
    .filter(pl.col("aid") != pl.col("incompatible_aid"))
    .unique()
    .sort(["aid", "incompatible_aid"])
)
print(incompatible_df)

In [None]:
# Write to csv
incompatible_df.write_csv("./incompatible_matrix.csv")

In [None]:
unique_incompatible_count = (
    incompatible_df
    .select("aid")
    .n_unique()
)

print(f"Incompatible products found for {unique_incompatible_count} unique products")

### Click count matrix
Click counts of aids normalized by the maximum click count.

In [17]:
click_count_matrix = (
    exploded_df
    .filter(pl.col("type") == "clicks")
    .group_by("aid")
    .agg(pl.len().alias("count"))
    .with_columns(weight = (pl.col("count") / pl.col("count").max()).cast(pl.Float32))
    .drop("count")
    .sort("aid")
)

print(click_count_matrix)

click_count_matrix.write_csv("./click_count_matrix.csv")

shape: (1_855_603, 2)
┌─────────┬──────────┐
│ aid     ┆ weight   │
│ ---     ┆ ---      │
│ u32     ┆ f32      │
╞═════════╪══════════╡
│ 0       ┆ 0.000363 │
│ 1       ┆ 0.000272 │
│ 2       ┆ 0.00014  │
│ 3       ┆ 0.019235 │
│ 4       ┆ 0.001682 │
│ …       ┆ …        │
│ 1855598 ┆ 0.000058 │
│ 1855599 ┆ 0.000107 │
│ 1855600 ┆ 0.000676 │
│ 1855601 ┆ 0.000701 │
│ 1855602 ┆ 0.000157 │
└─────────┴──────────┘
