# Item relation matrices

### Get the data

In [1]:
import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt64, "type": str})
df_schema = {"session": pl.UInt32, "events": pl.List(event_schema)}

df = pl.read_ndjson('../data/train.jsonl', schema=df_schema, low_memory=True)

print(df)

shape: (12_899_779, 2)
┌──────────┬─────────────────────────────────┐
│ session  ┆ events                          │
│ ---      ┆ ---                             │
│ u32      ┆ list[struct[3]]                 │
╞══════════╪═════════════════════════════════╡
│ 0        ┆ [{1517085,1659304800025,"click… │
│ 1        ┆ [{424964,1659304800025,"carts"… │
│ 2        ┆ [{763743,1659304800038,"clicks… │
│ 3        ┆ [{1425967,1659304800095,"carts… │
│ 4        ┆ [{613619,1659304800119,"clicks… │
│ …        ┆ …                               │
│ 12899774 ┆ [{33035,1661723968869,"clicks"… │
│ 12899775 ┆ [{1743151,1661723970935,"click… │
│ 12899776 ┆ [{548599,1661723972537,"clicks… │
│ 12899777 ┆ [{384045,1661723976974,"clicks… │
│ 12899778 ┆ [{561560,1661723983611,"clicks… │
└──────────┴─────────────────────────────────┘


In [2]:
# Sessions
exploded_df = (
    df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
)

print(exploded_df)

shape: (216_716_096, 4)
┌──────────┬─────────┬────────────┬────────┐
│ session  ┆ aid     ┆ ts         ┆ type   │
│ ---      ┆ ---     ┆ ---        ┆ ---    │
│ u32      ┆ u32     ┆ u32        ┆ str    │
╞══════════╪═════════╪════════════╪════════╡
│ 0        ┆ 1517085 ┆ 1659304800 ┆ clicks │
│ 0        ┆ 1563459 ┆ 1659304904 ┆ clicks │
│ 0        ┆ 1309446 ┆ 1659367439 ┆ clicks │
│ 0        ┆ 16246   ┆ 1659367719 ┆ clicks │
│ 0        ┆ 1781822 ┆ 1659367871 ┆ clicks │
│ …        ┆ …       ┆ …          ┆ …      │
│ 12899776 ┆ 1737908 ┆ 1661723987 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723976 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723986 ┆ clicks │
│ 12899778 ┆ 561560  ┆ 1661723983 ┆ clicks │
│ 12899778 ┆ 32070   ┆ 1661723994 ┆ clicks │
└──────────┴─────────┴────────────┴────────┘


In [3]:
def get_sub_sessions(with_next_event=True, only_clicks=False, with_row_index=False, limit=None):
    """
    Splits sessions into sub sessions based on time between events or session boundaries.

    Args:
        with_next_event: whether rows should include consecutive events or not.
        only_clicks: only clicks are considered in the sub sessions
        with_row_index: add row index to the sub sessions in order to calculate delta time between events
        limit: limit the amount of rows to be processed.

    Returns: DataFrame of sub sessions
    """

    sub_sessions = exploded_df

    if limit is not None:
        sub_sessions = exploded_df.limit(limit)
    if only_clicks:
        sub_sessions = sub_sessions.filter(pl.col("type") == "clicks")

    sub_sessions = (
        sub_sessions
        # Convert ts to seconds and cast to UInt32 to save memory
        .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
        .sort(["session", "ts"])
        .with_columns(
            next_session = pl.col("session").shift(-1),
            next_aid = pl.col("aid").shift(-1),
            next_ts = pl.col("ts").shift(-1),
            next_type = pl.col("type").shift(-1),
        )
        # Row is a sub session boundary if there is existing session boundary or if time between events is more than 30 minutes
        .with_columns(
            is_session_boundary = ((pl.col("session") != pl.col("next_session")) | (pl.col("next_ts") - pl.col("ts") > 1800)),
        )
        .with_columns(
            sub_session = pl.col("is_session_boundary").cum_sum().cast(pl.UInt32),
        )
    )

    # Use this when immediate next event is required
    # Last event of sub session is always found in the same row as the second last
    if with_next_event:
        sub_sessions = (
            sub_sessions
            # Filter out session boundaries. This also removes sub sessions with only 1 event which are not interesting
            .filter(pl.col("is_session_boundary").not_())
            .drop("is_session_boundary")
        )
    # Use this when all events are wanted to be found in their own rows and single column
    else:
        sub_sessions = (
            sub_sessions
            # Keep each event in their own row
            .with_columns(sub_session = pl.when(pl.col("is_session_boundary")).then(pl.col("sub_session") - 1).otherwise(pl.col("sub_session")))
            .drop(["session", "next_session", "next_aid", "next_ts", "next_type", "is_session_boundary"])
            .filter(pl.col("sub_session").is_null().not_())
        )

        # Filter out sub session with only one event
        multi_event_sub_sessions = (
            sub_sessions
            .group_by("sub_session")
            .agg(pl.len())
            .filter(pl.col("len") > 1)
            .select("sub_session")
        )

        sub_sessions = (
            sub_sessions
            .join(multi_event_sub_sessions, on="sub_session", how="inner")
        )

    if with_row_index:
        sub_sessions = sub_sessions.with_row_index()

    return sub_sessions

# sub_sessions = get_sub_sessions()
# print(sub_sessions)
# print("Amount of sub sessions:", sub_sessions.select("sub_session").n_unique())

In [4]:
# Helper functions
# Note there should only be used when sub_session are generated with param with_next_event=False
# Otherwise some events are lost

def get_clicks_of_sub_session(sub_sessions):
    return (
        sub_sessions
        .filter(pl.col("type") == "clicks")
        .select(["sub_session", "ts", "aid"])
        .rename({"ts": "click_ts", "aid": "click_aid"})
    )

def get_carts_of_sub_session(sub_sessions):
    return (
        sub_sessions
        .filter(pl.col("type") == "carts")
        .select(["sub_session", "ts", "aid"])
        .rename({"ts": "cart_ts", "aid": "cart_aid"})
    )

def get_orders_of_sub_session(sub_sessions):
    return (
        sub_sessions
        .filter(pl.col("type") == "orders")
        .select(["sub_session", "ts", "aid"])
        .rename({"ts": "order_ts", "aid": "order_aid"})
    )

### Click to click matrix
Click to click matrix is defined as the probabilities of other aids being clicked after the previous aid is clicked.
Click to click matrix is formed from the sub sessions since there is no point in counting subsequent clicks that are from a user coming back to site after a long time

We do multiple variations of the click to click matrix:

Next click only

In [5]:
subsequent_clicks = (
    exploded_df
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .filter(pl.col("type") == "clicks")
    .sort(["session", "ts"], descending=[False, False])
    .with_columns(
        next_session = pl.col("session").shift(-3),
        next_aid = pl.col("aid").shift(-3),
        next_ts = pl.col("ts").shift(-3),
    )
    .with_columns(
        time_between_min = pl.when(pl.col("session") == pl.col("next_session")).then(((pl.col("next_ts") - pl.col("ts")) / 60)).otherwise(None).cast(pl.Float32)
    )
    # Click has taken less than one hour
    .filter(pl.col("time_between_min").is_not_null() & (pl.col("time_between_min") <= 62))
    .select(["aid", "next_aid"])
)

# Count how many same click to click events there are
subsequent_clicks_count = (
    subsequent_clicks
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("total_count"))
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_total_count, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
)

print(click_to_click_matrix)

click_to_click_matrix.write_csv("./click_to_click_matrix_60_3.csv")

shape: (65_140_992, 3)
┌───────────┬────────────────┬─────────────┐
│ click_aid ┆ next_click_aid ┆ probability │
│ ---       ┆ ---            ┆ ---         │
│ u32       ┆ u32            ┆ f32         │
╞═══════════╪════════════════╪═════════════╡
│ 0         ┆ 0              ┆ 0.096774    │
│ 0         ┆ 36236          ┆ 0.032258    │
│ 0         ┆ 89130          ┆ 0.032258    │
│ 0         ┆ 137543         ┆ 0.032258    │
│ 0         ┆ 215649         ┆ 0.032258    │
│ …         ┆ …              ┆ …           │
│ 1855602   ┆ 1046590        ┆ 0.071429    │
│ 1855602   ┆ 1297712        ┆ 0.071429    │
│ 1855602   ┆ 1376245        ┆ 0.071429    │
│ 1855602   ┆ 1394911        ┆ 0.071429    │
│ 1855602   ┆ 1498281        ┆ 0.071429    │
└───────────┴────────────────┴─────────────┘


In [None]:
sub_sessions = get_sub_sessions()

# Count how many same click to click events there are
subsequent_clicks_count = (
    sub_sessions
    .filter((pl.col("type") == "clicks") & (pl.col("next_type") == "clicks"))
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("total_count"))
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_total_count, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
)

print(click_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix.csv")

Next click only. Remove carts and orders from the sub sessions.

In [None]:
sub_sessions = get_sub_sessions(only_clicks=True)

# Count how many same click to click events there are
subsequent_clicks_count = (
    sub_sessions
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("total_count"))
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_total_count, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
)

print(click_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix_only_clicks.csv")

Next two clicks with time decay. Remove carts and orders from the sub sessions.

In [6]:
# Read and manipulate data and then save it to csv in order to rest with streaming.
# Running out of memory otherwise

import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt64, "type": str})
df_schema = {"session": pl.UInt32, "events": pl.List(event_schema)}

sub_sessions = (
    pl.read_ndjson('../data/train.jsonl', schema=df_schema, low_memory=True)
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .sort(["session", "ts"])
    .with_columns(
        next_session = pl.col("session").shift(-1),
        next_aid = pl.col("aid").shift(-1),
        next_ts = pl.col("ts").shift(-1),
        next_type = pl.col("type").shift(-1),
    )
    # Row is a sub session boundary if there is existing session boundary or if time between events is more than 1 hour
    .with_columns(
        is_session_boundary = ((pl.col("session") != pl.col("next_session")) | (pl.col("next_ts") - pl.col("ts") > 3600)),
    )
    .with_columns(
        sub_session = pl.col("is_session_boundary").cum_sum().cast(pl.UInt32),
    )
    # Keep each event in their own row
    .with_columns(sub_session = pl.when(pl.col("is_session_boundary")).then(pl.col("sub_session") - 1).otherwise(pl.col("sub_session")))
    .drop(["session", "ts", "type", "next_session", "next_aid", "next_ts", "next_type", "is_session_boundary"])
    .filter(pl.col("sub_session").is_null().not_())
    # Filter sub session with only one event
    .group_by(pl.col("sub_session"))
    .agg(pl.col("aid"))
    .filter(pl.col("aid").list.len() > 1)
    .explode("aid")
    .with_row_index()
)

print(sub_sessions)

# Save to csv
sub_sessions.write_csv("./clicks/clicks_of_sub_sessions_1h.csv")

shape: (200_096_794, 3)
┌───────────┬─────────────┬─────────┐
│ index     ┆ sub_session ┆ aid     │
│ ---       ┆ ---         ┆ ---     │
│ u32       ┆ u32         ┆ u32     │
╞═══════════╪═════════════╪═════════╡
│ 0         ┆ 12115466    ┆ 552662  │
│ 1         ┆ 12115466    ┆ 871283  │
│ 2         ┆ 12115466    ┆ 1436133 │
│ 3         ┆ 12115466    ┆ 871283  │
│ 4         ┆ 12115466    ┆ 1006139 │
│ …         ┆ …           ┆ …       │
│ 200096789 ┆ 2372394     ┆ 1283290 │
│ 200096790 ┆ 2372394     ┆ 723240  │
│ 200096791 ┆ 10679973    ┆ 1828755 │
│ 200096792 ┆ 10679973    ┆ 1581840 │
│ 200096793 ┆ 10679973    ┆ 63237   │
└───────────┴─────────────┴─────────┘


In [5]:
# Run computations in streaming mode in order to not run out of memory

import polars as pl

n_subsequent_clicks = 7

df_schema = {
    "index": pl.UInt32,
    "sub_session": pl.UInt32,
    "aid": pl.UInt32
}

subsequent_clicks = (
    pl.scan_csv("./clicks/clicks_of_sub_sessions_1h.csv", schema=df_schema)
    .join(
        pl.scan_csv("./clicks/clicks_of_sub_sessions_1h.csv", schema=df_schema).rename({"index": "next_index", "aid": "next_aid"}),
        on="sub_session",
        how="inner"
    )
    .with_columns(delta_index=(pl.col("next_index") - pl.col("index")))
    .filter((pl.col("delta_index") >= 1) & (pl.col("delta_index") <= n_subsequent_clicks))
    .with_columns(weight=(1/pl.col("delta_index")).cast(pl.Float32))
    .drop(["index", "next_index", "sub_session", "delta_index"])
)

# Sum all the weights for each click-click pair
click_to_click_count = (
    subsequent_clicks
    .group_by(["aid", "next_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    click_to_click_count
    .group_by("aid")
    .agg(pl.col("weighted_count").sum().alias("weighted_total_count"))
)


# Calculate the probabilities of items being clicked after another item has been clicked
click_to_click_matrix = (
    click_to_click_count
    .join(aid_clicks_total_count, on="aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
    .collect(streaming=True)
)

print(click_to_click_matrix)

FileNotFoundError: The system cannot find the file specified. (os error 2): ./clicks/clicks_of_sub_sessions_1h.csv

This error occurred with the following context stack:
	[1] 'csv scan'
	[2] 'join left'
	[3] 'join'
	[4] 'with_columns'
	[5] 'filter'
	[6] 'with_columns'
	[7] 'format!("{}", function).to_lowercase()'
	[8] 'group_by'
	[9] 'join left'
	[10] 'join'
	[11] 'with_columns'
	[12] 'format!("{}", function).to_lowercase()'
	[13] 'sort'
	[14] 'format!("{}", function).to_lowercase()'


In [None]:
# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [4]:
# Save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix_only_clicks_time_decay_7_1h.csv")

In [3]:
# Run computations in streaming mode in order to not run out of memory

# median_ts = (
#     exploded_df
#     .select("ts")
#     .median()
#     .item()
# )
# print(median_ts)

subsequent_clicks_of_sessions = (
    exploded_df
    # Take last half of events this is around 2 weeks
    # .filter((pl.col("type") == "clicks") & (pl.col("ts") > median_ts))
    .filter(pl.col("type") == "clicks")
    .drop("type")
    .sort(["session", "ts"], descending=[False, False])
    .with_columns(
        next_session=pl.col("session").shift(-1),
        next_aid=pl.col("aid").shift(-1),
        next_ts=pl.col("ts").shift(-1),
        weight=pl.lit(1).cast(pl.Float32)
    )
    .lazy()
)

click_pairs = subsequent_clicks_of_sessions.collect(streaming=True)

# more than 1 click ahead
n_clicks_ahead = 5
for i in range(2, n_clicks_ahead+1):
    new_click_pairs = (
        # take the original clicks of sessions
        click_pairs
        .with_columns(
            next_session=pl.col("session").shift(-i),
            next_aid=pl.col("aid").shift(-i),
            next_ts=pl.col("ts").shift(-i),
            weight=pl.lit(1/i).cast(pl.Float32)
        )
        # same session and less than 45 minutes apart
        .filter(pl.col("next_session").is_not_null() & (pl.col("session") == pl.col("next_session")) & (pl.col("next_ts") - pl.col("ts") < 45*60))
        .lazy()
    )
    subsequent_clicks_of_sessions = pl.concat([
        subsequent_clicks_of_sessions,
        new_click_pairs
    ])

# print(subsequent_clicks_of_sessions.collect(streaming=True))


# Sum all the weights for each click-click pair
click_to_click_count = (
    subsequent_clicks_of_sessions
    .group_by(["aid", "next_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    click_to_click_count
    .group_by("aid")
    .agg(pl.col("weighted_count").sum().alias("weighted_total_count"))
)


# Calculate the probabilities of items being clicked after another item has been clicked
click_to_click_matrix = (
    click_to_click_count
    .join(aid_clicks_total_count, on="aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "click_aid", "next_aid": "next_click_aid"})
    .collect(streaming=True)
)

print(click_to_click_matrix)

shape: (234_067_458, 3)
┌───────────┬────────────────┬─────────────┐
│ click_aid ┆ next_click_aid ┆ probability │
│ ---       ┆ ---            ┆ ---         │
│ u32       ┆ u32            ┆ f32         │
╞═══════════╪════════════════╪═════════════╡
│ 0         ┆ 0              ┆ 0.047685    │
│ 0         ┆ 8136           ┆ 0.005961    │
│ 0         ┆ 13759          ┆ 0.011921    │
│ 0         ┆ 29217          ┆ 0.00298     │
│ 0         ┆ 31465          ┆ 0.00298     │
│ …         ┆ …              ┆ …           │
│ 1855602   ┆ 1739200        ┆ 0.007009    │
│ 1855602   ┆ 1762441        ┆ 0.005607    │
│ 1855602   ┆ 1768521        ┆ 0.005607    │
│ 1855602   ┆ 1783511        ┆ 0.042056    │
│ 1855602   ┆ 1855602        ┆ 0.035047    │
└───────────┴────────────────┴─────────────┘


In [5]:
# save to csv
click_to_click_matrix.write_csv("./click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay.csv")

### Click to cart matrix
Click to cart matrix is defined as the probabilities of other aids being added to cart in the same sub session after an aid is clicked.
Click to cart matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get clicks and carts of sub sessions
clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

carts_after_clicks_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine clicks and carts of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after clicks
    .filter(pl.col("click_ts") < pl.col("cart_ts"))
    .select(["sub_session", "click_aid", "cart_aid"])
)

# Count how many same click to cart events there are
click_to_cart_count = (
    carts_after_clicks_in_sub_sessions
    .group_by(["click_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    click_to_cart_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been clicked
click_to_cart_matrix = (
    click_to_cart_count
    .join(aid_clicks_total_count, on="click_aid", how="inner")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["click_aid", "cart_aid"])
)

print(click_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_cart_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_cart_matrix.write_csv("./click_to_cart_matrix.csv")

Click to cart with time decay. Carts happening later in the session count less into the probability than carts that are closer to the click.
The first cart relative to the clicked aid has weight of 1, second has weight of 1/2, third 1/3, etc...

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out orders
    .filter(pl.col("type") != "orders")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "carts").cum_sum()))
)

# Get clicks and carts of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .rename({"ts": "click_ts", "aid": "click_aid", "index": "click_index"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

carts_after_clicks_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine clicks and carts of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after clicks
    .filter(pl.col("click_ts") < pl.col("cart_ts"))
    .select(["sub_session", "click_aid", "cart_aid", "click_index", "cart_index"])
    # Weight the click-to-cart relation based in index. Next cart has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("cart_index")-pl.col("click_index"))).cast(pl.Float32))
    .drop(["click_index", "cart_index"])
)

# Sum all the weights for each click-cart pair
click_to_cart_count = (
    carts_after_clicks_in_sub_sessions
    .group_by(["click_aid", "cart_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    carts_after_clicks_in_sub_sessions
    .group_by("click_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to cart after another item has been clicked
click_to_cart_matrix = (
    click_to_cart_count
    .join(aid_clicks_total_count, on="click_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["click_aid", "cart_aid"])
)

print(click_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_cart_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# Save to csv
click_to_cart_matrix.write_csv("./click_to_cart_matrix_time_decay.csv")

### Click to order matrix
Click to order matrix is defined as the probabilities of other aids being ordered in the same sub session after an aid is clicked.
Click to order matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get clicks and orders of sub sessions
clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

orders_after_clicks_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine clicks and orders of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after clicks
    .filter(pl.col("click_ts") < pl.col("order_ts"))
    .select(["sub_session", "click_aid", "order_aid"])
)

# Count how many same click to order events there are
click_to_order_count = (
    orders_after_clicks_in_sub_sessions
    .group_by(["click_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_total_count = (
    click_to_order_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been clicked
click_to_order_matrix = (
    click_to_order_count
    .join(aid_clicks_total_count, on="click_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["click_aid", "order_aid"])
)

print(click_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_order_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
click_to_order_matrix.write_csv("./click_to_order_matrix.csv")

Click to order with time decay

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out carts
    .filter(pl.col("type") != "carts")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "orders").cum_sum()))
)

# Get clicks and orders of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .rename({"ts": "click_ts", "aid": "click_aid", "index": "click_index"})
)

orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

orders_after_clicks_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine clicks and orders of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after clicks
    .filter(pl.col("click_ts") < pl.col("order_ts"))
    .select(["sub_session", "click_aid", "order_aid", "click_index", "order_index"])
    # Weight the click-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("order_index")-pl.col("click_index"))).cast(pl.Float32))
    .drop(["click_index", "order_index"])
)

# Sum all the weights for each click-order pair
click_to_order_count = (
    orders_after_clicks_in_sub_sessions
    .group_by(["click_aid", "order_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_clicks_total_count = (
    orders_after_clicks_in_sub_sessions
    .group_by("click_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to order after another item has been clicked
click_to_order_matrix = (
    click_to_order_count
    .join(aid_clicks_total_count, on="click_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["click_aid", "order_aid"])
)

print(click_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_order_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# Save to csv
click_to_order_matrix.write_csv("./click_to_order_matrix_time_decay.csv")

click to order whole sessions

In [6]:
# click to order only next order

clicks_and_orders_df = (
    df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .filter(pl.col("type") != "carts")
    .sort(["session", "ts"], descending=[False, False])
    .with_row_index()
    # Make groups based on types and then make sure that session boundaries are not crossed
    .with_columns(group=(pl.col("type") == "orders").rle_id().cast(pl.UInt32))
    .with_columns(group=pl.struct("session", "group").rle_id())
    .group_by(["session", "group"])
    .agg(pl.col("ts").max().alias("max_ts"), pl.col("index").max().alias("max_index"), pl.col("aid"), pl.col("ts"), pl.col("type"), pl.col("index"))
    .explode(["aid", "ts", "type", "index"])
    # Filter clicks of last 1 hour
    .filter(pl.when(pl.col("type") == "clicks").then(pl.col("max_ts") - pl.col("ts") < 1*60*60).otherwise(True))
    # Get last 10 clicks
    .filter(pl.when(pl.col("type") == "clicks").then(pl.col("max_index") - pl.col("index") < 5).otherwise(True))
    .drop(["ts", "max_ts", "index", "max_index"])
    .lazy()
)

next_event = (
     clicks_and_orders_df
     .rename({"group": "next_group", "aid": "next_aid", "type": "next_type" })
)

click_to_order = (
    clicks_and_orders_df
    .join(next_event, on="session", how="inner")
    .filter((pl.col("group") == pl.col("next_group") - 1) & (pl.col("type") == "clicks") & (pl.col("next_type") == "orders"))
    .drop(["type", "next_type", "group", "next_group"])
)

click_to_order_count = (
    click_to_order
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
)

click_aid_total_count = (
    click_to_order_count
    .group_by("aid")
    .agg(pl.sum("count").alias("total_count"))
)

click_to_order_matrix_df = (
    click_to_order_count
    .join(click_aid_total_count, on="aid")
    .with_columns(
        probability = (pl.col("count") / pl.col("total_count")).cast(pl.Float32)
    )
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .collect(streaming=True)
)

print(click_to_order_matrix_df)

# save to csv
click_to_order_matrix_df.write_csv("./click_to_order_matrix_whole_sessions_only_next_order_last_1h_max_5.csv")

shape: (12_260_282, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ u32     ┆ u32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 0       ┆ 53946    ┆ 0.043478    │
│ 0       ┆ 82031    ┆ 0.043478    │
│ 0       ┆ 312330   ┆ 0.086957    │
│ 0       ┆ 524473   ┆ 0.043478    │
│ 0       ┆ 532042   ┆ 0.086957    │
│ …       ┆ …        ┆ …           │
│ 1855601 ┆ 1580259  ┆ 0.041667    │
│ 1855601 ┆ 1708564  ┆ 0.041667    │
│ 1855601 ┆ 1712873  ┆ 0.041667    │
│ 1855601 ┆ 1716876  ┆ 0.041667    │
│ 1855601 ┆ 1749320  ┆ 0.041667    │
└─────────┴──────────┴─────────────┘


In [3]:
# click to order only next order

clicks_and_orders_df = (
    df
    .explode("events")
    .unnest("events")
    .filter(pl.col("type") != "carts")
    .sort(["session", "ts"], descending=[False, False])
    .drop("ts")
    # Make groups based on types and then make sure that session boundaries are not crossed
    .with_columns(group=(pl.col("type") == "orders").rle_id().cast(pl.UInt32))
    .with_columns(group=pl.struct("session", "group").rle_id())
    .lazy()
)

click_to_order = (
    clicks_and_orders_df
    .join(clicks_and_orders_df.rename({"group": "next_group", "aid": "next_aid", "type": "next_type" }), on="session", how="inner")
    .filter((pl.col("group") == pl.col("next_group") - 1) & (pl.col("type") == "clicks") & (pl.col("next_type") == "orders"))
    .drop(["type", "next_type", "group", "next_group"])
)

click_to_order_count = (
    click_to_order
    .group_by(["aid", "next_aid"])
    # time decay
    # .agg(pl.col("weight").sum().alias("weighted_count"))
    # no time decay
    .agg(pl.len().alias("count"))
    # .sort("count", descending=True)
)

click_aid_total_count = (
    click_to_order_count
    .group_by("aid")
    # time decay
    # .agg(pl.col("weighted_count").sum().alias("weighted_total_count"))
    # no time decay
    .agg(pl.sum("count").alias("total_count"))
    # .sort("total_count", descending=True)
)

click_to_order_matrix_df = (
    click_to_order_count
    .join(click_aid_total_count, on="aid")
    .with_columns(
        probability = (pl.col("count") / pl.col("total_count")).cast(pl.Float32)
    )
    .drop(["count", "total_count"])
    .sort(["aid", "next_aid"])
    .collect(streaming=True)
)

print(click_to_order_matrix_df)

# save to csv
click_to_order_matrix_df.write_csv("./click_to_order_matrix_whole_sessions_only_next_order.csv")

shape: (143_112_255, 5)
┌──────────┬─────────┬───────────────┬──────────┬───────────────┐
│ session  ┆ aid     ┆ ts            ┆ next_aid ┆ next_ts       │
│ ---      ┆ ---     ┆ ---           ┆ ---      ┆ ---           │
│ u32      ┆ u32     ┆ u64           ┆ u32      ┆ u64           │
╞══════════╪═════════╪═══════════════╪══════════╪═══════════════╡
│ 0        ┆ 1517085 ┆ 1659304800025 ┆ 305831   ┆ 1659370027105 │
│ 0        ┆ 1563459 ┆ 1659304904511 ┆ 305831   ┆ 1659370027105 │
│ 0        ┆ 1309446 ┆ 1659367439426 ┆ 305831   ┆ 1659370027105 │
│ 0        ┆ 16246   ┆ 1659367719997 ┆ 305831   ┆ 1659370027105 │
│ 0        ┆ 1781822 ┆ 1659367871344 ┆ 305831   ┆ 1659370027105 │
│ …        ┆ …       ┆ …             ┆ …        ┆ …             │
│ 12899525 ┆ 1599360 ┆ 1661723859676 ┆ 996393   ┆ 1661723996896 │
│ 12899525 ┆ 127479  ┆ 1661723746653 ┆ 956231   ┆ 1661723996896 │
│ 12899525 ┆ 996393  ┆ 1661723809311 ┆ 956231   ┆ 1661723996896 │
│ 12899525 ┆ 127479  ┆ 1661723833093 ┆ 956231   ┆ 16

### Cart to click matrix
Cart to click matrix is defined as the probabilities of other aids being clicked immediately after an aid is added to cart.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get carts and clicks of sub sessions
carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

clicks_after_carts_in_sub_sessions = (
    clicks_of_sub_sessions
    # Combine carts and clicks of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only clicks that happened after carts
    .filter(pl.col("cart_ts") < pl.col("click_ts"))
    .select(["sub_session", "cart_aid", "click_aid"])
)

# Count how many same cart to click events there are
cart_to_click_count = (
    clicks_after_carts_in_sub_sessions
    .group_by(["cart_aid", "click_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_total_count = (
    cart_to_click_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been added to cart
cart_to_click_matrix = (
    cart_to_click_count
    .join(aid_carts_total_count, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["cart_aid", "click_aid"])
)

print(cart_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_click_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_click_matrix.write_csv("./cart_to_click_matrix.csv")

### Cart or order to click matrix

In [12]:
# Run computations in streaming mode in order to not run out of memory

subsequent_events_of_sessions = (
    exploded_df
    # Take last half of events this is around 2 weeks
    .sort(["session", "ts"], descending=[False, False])
    .drop(["ts"])
    .with_columns(
        next_session=pl.col("session").shift(-1),
        next_aid=pl.col("aid").shift(-1),
        # next_ts=pl.col("ts").shift(-1),
        next_type=pl.col("type").shift(-1),
        weight=pl.lit(1).cast(pl.Float32)
    )
    .lazy()
)

# event_pairs = subsequent_events_of_sessions.collect(streaming=True)
# print(event_pairs)

cart_or_order_to_click = (
    subsequent_events_of_sessions
    .filter(
        pl.col("next_session").is_not_null()
        & (pl.col("session") == pl.col("next_session"))
        & (pl.col("type") != "clicks")
        & (pl.col("next_type") == "clicks")
    )
    .drop(["session", "next_session", "next_type"])
)

# print(cart_or_order_to_click.collect(streaming=True))

# Sum all the weights for each click-click pair
cart_or_order_to_click_count = (
    cart_or_order_to_click
    .group_by(["aid", "next_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# print(cart_or_order_to_click_count.collect(streaming=True))

# Count total weight for each click
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_cart_or_order_to_click_total_count = (
    cart_or_order_to_click_count
    .group_by("aid")
    .agg(pl.col("weighted_count").sum().alias("weighted_total_count"))
)

# print(aid_cart_or_order_to_click_total_count.collect(streaming=True))

# Calculate the probabilities of items being clicked after another item has been clicked
cart_or_order_to_click_matrix = (
    cart_or_order_to_click_count
    .join(aid_cart_or_order_to_click_total_count, on="aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "cart_or_order_aid", "next_aid": "next_click_aid"})
    .collect(streaming=True)
)

print(cart_or_order_to_click_matrix)

shape: (15_102_363, 4)
┌─────────┬────────┬──────────┬────────┐
│ aid     ┆ type   ┆ next_aid ┆ weight │
│ ---     ┆ ---    ┆ ---      ┆ ---    │
│ u32     ┆ str    ┆ u32      ┆ f32    │
╞═════════╪════════╪══════════╪════════╡
│ 461689  ┆ orders ┆ 362233   ┆ 1.0    │
│ 789245  ┆ carts  ┆ 366890   ┆ 1.0    │
│ 974651  ┆ carts  ┆ 974651   ┆ 1.0    │
│ 1521766 ┆ carts  ┆ 661144   ┆ 1.0    │
│ 1760145 ┆ carts  ┆ 1639229  ┆ 1.0    │
│ …       ┆ …      ┆ …        ┆ …      │
│ 468148  ┆ carts  ┆ 468148   ┆ 1.0    │
│ 1126169 ┆ carts  ┆ 1126169  ┆ 1.0    │
│ 1379999 ┆ carts  ┆ 992635   ┆ 1.0    │
│ 1677695 ┆ carts  ┆ 1677695  ┆ 1.0    │
│ 573530  ┆ carts  ┆ 203778   ┆ 1.0    │
└─────────┴────────┴──────────┴────────┘
shape: (10_153_295, 3)
┌─────────┬──────────┬────────────────┐
│ aid     ┆ next_aid ┆ weighted_count │
│ ---     ┆ ---      ┆ ---            │
│ u32     ┆ u32      ┆ f32            │
╞═════════╪══════════╪════════════════╡
│ 1148621 ┆ 1588038  ┆ 1.0            │
│ 380509  ┆ 74019

In [13]:
# save to csv
cart_or_order_to_click_matrix.write_csv("./cart_or_order_to_click_matrix.csv")

### Cart to cart matrix
Cart to cart matrix is defined as the probabilities of other aids being added to cart later in the same sub session where an aid is added to cart.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get carts of sub sessions
carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

next_carts_of_sub_sessions = (
    carts_of_sub_sessions
    .rename({"cart_ts": "next_cart_ts", "cart_aid": "next_cart_aid"})
)

# Find subsequent carts in the same sub session
subsequent_carts = (
    carts_of_sub_sessions
    .join(next_carts_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("cart_ts") < pl.col("next_cart_ts"))
    .select(["sub_session", "cart_aid", "next_cart_aid"])
)

# Count how many same cart to cart events there are
subsequent_carts_count = (
    subsequent_carts
    .group_by(["cart_aid", "next_cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_total_count = (
    subsequent_carts_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been added to cart
cart_to_cart_matrix = (
    subsequent_carts_count
    .join(aid_carts_total_count, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["cart_aid", "next_cart_aid"])
)

print(cart_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_cart_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_cart_matrix.write_csv("./cart_to_cart_matrix.csv")

Cart to cart with time decay

In [None]:
carts_of_sub_sessions = (
    get_sub_sessions(with_next_event=False)
    .filter(pl.col("type") == "carts")
    .with_row_index()
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

next_carts_of_sub_sessions = (
    carts_of_sub_sessions
    .rename({"cart_ts": "next_cart_ts", "cart_aid": "next_cart_aid", "cart_index": "next_cart_index"})
)

# Find subsequent carts in the same sub session
subsequent_carts = (
    carts_of_sub_sessions
    .join(next_carts_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("cart_ts") < pl.col("next_cart_ts"))
    .select(["sub_session", "cart_aid", "next_cart_aid", "cart_index", "next_cart_index"])
    # Weight the cart-to-cart relation based in index. Next cart has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("next_cart_index")-pl.col("cart_index"))).cast(pl.Float32))
    .drop(["cart_index", "next_cart_index"])
)

# Sum all the weights for each cart-cart pair
cart_to_cart_count = (
    subsequent_carts
    .group_by(["cart_aid", "next_cart_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each cart
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_carts_total_count = (
    subsequent_carts
    .group_by("cart_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to cart after another item has been clicked
cart_to_cart_matrix = (
    cart_to_cart_count
    .join(aid_carts_total_count, on="cart_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["cart_aid", "next_cart_aid"])
)

print(cart_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_cart_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_cart_matrix.write_csv("./cart_to_cart_matrix_time_decay.csv")

### Cart to order matrix
Cart to order matrix is defined as the probabilities of other aids being ordered later in the same sub session where an aid is added to cart.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get carts and orders of sub sessions
carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

orders_after_carts_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("cart_ts") < pl.col("order_ts"))
    .select(["sub_session", "cart_aid", "order_aid"])
)

# Count how many same cart to order events there are
cart_to_order_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["cart_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_total_count = (
    cart_to_order_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been added to cart
cart_to_order_matrix = (
    cart_to_order_count
    .join(aid_carts_total_count, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["cart_aid", "order_aid"])
)

print(cart_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_order_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_order_matrix.write_csv("./cart_to_order_matrix.csv")

Cart to order with time decay

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out clicks
    .filter(pl.col("type") != "clicks")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "orders").cum_sum()))
)

# Get clicks and orders of sub sessions
carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

orders_after_carts_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("cart_ts") < pl.col("order_ts"))
    .select(["sub_session", "cart_aid", "order_aid", "cart_index", "order_index"])
    # Weight the cart-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("order_index")-pl.col("cart_index"))).cast(pl.Float32))
    .drop(["cart_index", "order_index"])
)

# Sum all the weights for each cart-order pair
cart_to_order_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["cart_aid", "order_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each cart
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_carts_total_count = (
    orders_after_carts_in_sub_sessions
    .group_by("cart_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to order after another item has been clicked
cart_to_order_matrix = (
    cart_to_order_count
    .join(aid_carts_total_count, on="cart_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["cart_aid", "order_aid"])
)

print(cart_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_order_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
cart_to_order_matrix.write_csv("./cart_to_order_matrix_time_decay.csv")

Cart to order from whole sessions

In [42]:
carts_and_orders_df = (
    df
    .explode("events")
    .unnest("events")
    .filter(pl.col("type") != "clicks")
    .sort(["session", "ts"], descending=[False, False])
    .drop("ts")
    # Make groups based on types and then make sure that session boundaries are not crossed
    .with_columns(group=(pl.col("type") == "orders").rle_id().cast(pl.UInt32))
    .with_columns(group=pl.struct("session", "group").rle_id())
    # Remove multiple carts and orders in the same group
    .unique()
    .lazy()
)

cart_to_order = (
     carts_and_orders_df
    .join(carts_and_orders_df.rename({"group": "next_group", "aid": "next_aid", "type": "next_type" }), on="session", how="inner")
    .filter(pl.col("group") < pl.col("next_group"))
    .filter((pl.col("type") == "carts") & (pl.col("next_type") == "orders"))
    .drop(["type", "next_type"])
    # time decay
    .with_columns(weight = (1 / (pl.col("next_group") - pl.col("group"))).cast(pl.Float32))
    .drop(["group", "next_group"])
)

cart_to_order_count = (
    cart_to_order
    .group_by(["aid", "next_aid"])
    # time decay
    .agg(pl.col("weight").sum().alias("weighted_count"))
    # no time decay
    # .agg(pl.len().alias("count"))
    # .sort("count", descending=True)
)

cart_aid_total_count = (
    cart_to_order_count
    .group_by("aid")
    # time decay
    .agg(pl.col("weighted_count").sum().alias("weighted_total_count"))
    # no time decay
    # .agg(pl.sum("count").alias("total_count"))
    # .sort("total_count", descending=True)
)

cart_to_order_matrix_df = (
    cart_to_order_count
    .join(cart_aid_total_count, on="aid")
    .with_columns(
        probability = pl.col("weighted_count") / pl.col("weighted_total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["aid", "next_aid"])
    .collect(streaming=True)
)

print(cart_to_order_matrix_df)

# save to csv
cart_to_order_matrix_df.write_csv("./cart_to_order_matrix_whole_sessions_time_decay.csv")

shape: (28_135_990, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ u32     ┆ u32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 3       ┆ 3        ┆ 0.366832    │
│ 3       ┆ 22107    ┆ 0.00957     │
│ 3       ┆ 61428    ┆ 0.00957     │
│ 3       ┆ 67776    ┆ 0.00957     │
│ 3       ┆ 68426    ┆ 0.00319     │
│ …       ┆ …        ┆ …           │
│ 1855601 ┆ 1486834  ┆ 0.062241    │
│ 1855601 ┆ 1566830  ┆ 0.062241    │
│ 1855601 ┆ 1700846  ┆ 0.082988    │
│ 1855601 ┆ 1712873  ┆ 0.062241    │
│ 1855601 ┆ 1786336  ┆ 0.062241    │
└─────────┴──────────┴─────────────┘


### Order to click matrix
Order to click matrix is defined as the probabilities of other aids being clicked immediately after an aid is ordered.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get orders and clicks of sub sessions
orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

clicks_of_sub_sessions = get_clicks_of_sub_session(sub_sessions)

clicks_after_orders_in_sub_sessions = (
    clicks_of_sub_sessions
    # Combine orders and clicks of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only clicks that happened after orders
    .filter(pl.col("order_ts") < pl.col("click_ts"))
    .select(["sub_session", "order_aid", "click_aid"])
)

# Count how many same order to click events there are
order_to_click_count = (
    clicks_after_orders_in_sub_sessions
    .group_by(["order_aid", "click_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_total_count = (
    order_to_click_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been ordered
order_to_click_matrix = (
    order_to_click_count
    .join(aid_orders_total_count, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["order_aid", "click_aid"])
)

print(order_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_click_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_click_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_click_matrix.write_csv("./order_to_click_matrix.csv")

### Order to cart matrix
Order to cart matrix is defined as the probabilities of other aids being added to cart later in the same sub session where an aid is ordered.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get orders and carts of sub sessions
orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

carts_of_sub_sessions = get_carts_of_sub_session(sub_sessions)

carts_after_orders_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine orders and carts of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after orders
    .filter(pl.col("order_ts") < pl.col("cart_ts"))
    .select(["sub_session", "order_aid", "cart_aid"])
)

# Count how many same order to cart events there are
order_to_cart_count = (
    carts_after_orders_in_sub_sessions
    .group_by(["order_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_total_count = (
    order_to_cart_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been ordered
order_to_cart_matrix = (
    order_to_cart_count
    .join(aid_orders_total_count, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["order_aid", "cart_aid"])
)

print(order_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_cart_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_cart_matrix.write_csv("./order_to_cart_matrix.csv")

Order to cart with time decay

In [None]:
sub_sessions = (
    get_sub_sessions(with_next_event=False)
    # Filter out clicks
    .filter(pl.col("type") != "clicks")
    .with_columns(index=(pl.col("sub_session") + (pl.col("type") == "cart").cum_sum()))
)

# Get clicks and orders of sub sessions
orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .rename({"ts": "cart_ts", "aid": "cart_aid", "index": "cart_index"})
)

carts_after_orders_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("order_ts") < pl.col("cart_ts"))
    .select(["sub_session", "cart_aid", "order_aid", "cart_index", "order_index"])
    # Weight the cart-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("cart_index")-pl.col("order_index"))).cast(pl.Float32))
    .drop(["cart_index", "order_index"])
)

# Sum all the weights for each order-cart pair
order_to_cart_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["order_aid", "cart_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each cart
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_orders_total_count = (
    orders_after_carts_in_sub_sessions
    .group_by("order_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being added to cart after another item has been ordered
order_to_cart_matrix = (
    order_to_cart_count
    .join(aid_orders_total_count, on="order_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["order_aid", "cart_aid"])
)

print(order_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_cart_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_cart_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_cart_matrix.write_csv("./order_to_cart_matrix_time_decay.csv")

### Order to order matrix
Order to order matrix is defined as the probabilities of other aids being ordered later in the same sub session where an aid is ordered.

In [None]:
sub_sessions = get_sub_sessions(with_next_event=False)

# Get orders of sub sessions
orders_of_sub_sessions = get_orders_of_sub_session(sub_sessions)

next_orders_of_sub_sessions = (
    orders_of_sub_sessions
    .rename({"order_ts": "next_order_ts", "order_aid": "next_order_aid"})
)

# Find subsequent orders in the same sub session
subsequent_orders = (
    orders_of_sub_sessions
    .join(next_orders_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("order_ts") < pl.col("next_order_ts"))
    .select(["sub_session", "order_aid", "next_order_aid"])
)

# Count how many same order to order events there are
subsequent_orders_count = (
    subsequent_orders
    .group_by(["order_aid", "next_order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_total_count = (
    subsequent_orders_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("total_count"))
    .sort("total_count", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been ordered
order_to_order_matrix = (
    subsequent_orders_count
    .join(aid_orders_total_count, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("total_count")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "total_count"])
    .sort(["order_aid", "next_order_aid"])
)

print(order_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_order_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_order_matrix.write_csv("./order_to_order_matrix.csv")

Order to order with time decay

In [None]:
orders_of_sub_sessions = (
    get_sub_sessions(with_next_event=False)
    .filter(pl.col("type") == "orders")
    .with_row_index()
    .rename({"ts": "order_ts", "aid": "order_aid", "index": "order_index"})
)

next_orders_of_sub_sessions = (
    orders_of_sub_sessions
    .rename({"order_ts": "next_order_ts", "order_aid": "next_order_aid", "order_index": "next_order_index"})
)

# Find subsequent orders in the same sub session
subsequent_orders = (
    orders_of_sub_sessions
    .join(next_orders_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("order_ts") < pl.col("next_order_ts"))
    .select(["sub_session", "order_aid", "next_order_aid", "order_index", "next_order_index"])
    # Weight the order-to-order relation based in index. Next order has weight 1/1, second 1/2, third 1/3, etc...
    .with_columns(weight=(1/(pl.col("next_order_index")-pl.col("order_index"))).cast(pl.Float32))
    .drop(["order_index", "next_order_index"])
)

# Sum all the weights for each order-order pair
order_to_order_count = (
    subsequent_orders
    .group_by(["order_aid", "next_order_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each order
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_orders_total_count = (
    subsequent_orders
    .group_by("order_aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being ordered after another item has been ordered
order_to_order_matrix = (
    order_to_order_count
    .join(aid_orders_total_count, on="order_aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["order_aid", "next_order_aid"])
)

print(order_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_order_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_order_matrix.select("probability").sum().select(pl.first()).item())

In [None]:
# save to csv
order_to_order_matrix.write_csv("./order_to_order_matrix_time_decay.csv")

Order to order whole sessions

In [46]:
orders_df = (
    df
    .explode("events")
    .unnest("events")
    .sort(["session", "ts"], descending=[False, False])
    .drop("ts")
    # Make groups based on types and then make sure that session boundaries are not crossed
    .with_columns(group=(pl.col("type") == "orders").rle_id().cast(pl.UInt32))
    .with_columns(group=pl.struct("session", "group").rle_id())
    .filter(pl.col("type") == "orders")
    .drop("type")
    # Remove multiple orders in the same group
    .unique()
    .lazy()
)

order_to_order = (
    orders_df
    .join(orders_df.rename({"group": "next_group", "aid": "next_aid"}), on="session", how="inner")
    .filter(pl.col("group") <= pl.col("next_group"))
    # Filter out same items ordered multiple times at the same time
    .filter((pl.col("group") != pl.col("next_group")) | (pl.col("aid") != pl.col("next_aid")))
    # time decay
    .with_columns(weight = (1 / (pl.col("next_group") - pl.col("group") + 1)).cast(pl.Float32))
    .drop(["group", "next_group"])
)

# Sum all the weights for each order-order pair
order_to_order_count = (
    order_to_order
    .group_by(["aid", "next_aid"])
    .agg(pl.col("weight").sum().alias("weighted_count"))
)

# Count total weight for each order
# Since we have weights in play we need to count weighed sum instead of count of rows
aid_orders_total_count = (
    order_to_order
    .group_by("aid")
    .agg(pl.col("weight").sum().alias("weighted_total_count"))
)


# Calculate the weighted probabilities of items being ordered after another item has been ordered
order_to_order_matrix = (
    order_to_order_count
    .join(aid_orders_total_count, on="aid", how="inner")
    .with_columns(
        probability = (pl.col("weighted_count") / pl.col("weighted_total_count")).cast(pl.Float32)
    )
    .drop(["weighted_count", "weighted_total_count"])
    .sort(["aid", "next_aid"])
    .rename({"aid": "order_aid", "next_aid": "next_order_aid"})
    .sort(["probability", "order_aid", "next_order_aid"], descending=True)
    .collect(streaming=True)
)

print(order_to_order_matrix)
order_to_order_matrix.write_csv("./order_to_order_matrix_whole_sessions_time_decay.csv")


# # Check that probabilities sum to the amount of unique aids
# print("Unique aids:", order_to_order_matrix.select("order_aid").n_unique())
# print("Total probability:", order_to_order_matrix.select("probability").sum().select(pl.first()).item())

shape: (18_196_971, 3)
┌───────────┬────────────────┬─────────────┐
│ order_aid ┆ next_order_aid ┆ probability │
│ ---       ┆ ---            ┆ ---         │
│ u32       ┆ u32            ┆ f32         │
╞═══════════╪════════════════╪═════════════╡
│ 1836735   ┆ 1836735        ┆ 1.0         │
│ 1835249   ┆ 1740171        ┆ 1.0         │
│ 1798478   ┆ 1798478        ┆ 1.0         │
│ 1718110   ┆ 1718110        ┆ 1.0         │
│ 1673053   ┆ 37525          ┆ 1.0         │
│ …         ┆ …              ┆ …           │
│ 231487    ┆ 644882         ┆ 0.000001    │
│ 846545    ┆ 1094510        ┆ 0.000001    │
│ 756588    ┆ 1711735        ┆ 0.000001    │
│ 846545    ┆ 444504         ┆ 0.000001    │
│ 846545    ┆ 1252537        ┆ 0.000001    │
└───────────┴────────────────┴─────────────┘


### Order incompatibility matrix
Order incompatibility matrix is defined as links between items which are often ordered with same items but are never ordered together.

In [6]:
grouped_clicks_and_orders = (
    df
    .explode("events")
    .unnest("events")
    .sort(["session", "ts"], descending=[False, False])
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    # .drop("ts")
    # Make groups based on types and then make sure that session boundaries are not crossed
    .filter(pl.col("type") != "carts")
    .with_columns(group=(pl.col("type") == "orders").rle_id().cast(pl.UInt32))
    .with_columns(group=pl.struct("session", "group").rle_id())
    .with_columns(group=pl.when(pl.col("type") == "clicks").then(pl.col("group")+1).otherwise(pl.col("group")))
    # Filter out events that happened over 1 hour before the last order of the group
    # This is done to form a context for ordered item
    .with_columns(max_ts=pl.max("ts").over("group"))
    .filter(pl.col("ts") > pl.col("max_ts") - 3600)
    .drop(["session", "ts", "max_ts"])
    # Keep only unique events
    .unique()
)

print(grouped_clicks_and_orders)

# how many times click order pair has to be seen to be considered reliable
# Setting the threshold to 5 leaves ~1.4 % of the data removing the long tail
click_order_count_threshold = 10
# how many times items have to be ordered together to be considered often ordered together
ordered_together_count_threshold = 0
# how similar the contexts of two items have to be to be considered incompatible
context_similarity_threshold = 0.8

order_context = (
    grouped_clicks_and_orders
    .rename({"aid": "click_aid"})
    # Get pairs of items ordered at the same time
    .join(grouped_clicks_and_orders.rename({"aid": "order_aid"}), on="group", how="inner")
    .filter((pl.col("type") == "clicks") & (pl.col("type_right") == "orders"))
    .drop(["type", "type_right", "group"])
    .group_by(["click_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    # Filter out unreliable relations with too few counts
    .filter(pl.col("count") >= click_order_count_threshold)
    # Create context for order by joining clicked aids into list
    .group_by("order_aid")
    .agg(pl.col("click_aid").alias("context"), pl.len())
    # Filter out orders with only the item itself in context
    .filter(pl.col("len") > 1)
    .drop("len")
)

print(order_context)

aids_often_ordered_together = (
    grouped_clicks_and_orders
    .filter(pl.col("type") == "orders")
    .drop("type")
    .join(grouped_clicks_and_orders.filter(pl.col("type") == "orders").drop("type").rename({"aid": "ordered_with"}), on="group", how="inner")
    .filter(pl.col("aid") != pl.col("ordered_with"))
    .drop("group")
    .group_by(["aid", "ordered_with"])
    .agg(pl.len().alias("count"))
    # Filter out unreliable relations with too few counts
    .filter(pl.col("count") >= ordered_together_count_threshold)
    .drop("count")
)

print(aids_often_ordered_together)


# Find all incompatible pairs by finding aids that have never been ordered together but have common aids that are often clicked before ordering
candidate_incompatible_pairs = []

order_context_rows = order_context.rows()
len_order_context = len(order_context_rows)
print("Start")
for index, row in enumerate(order_context.iter_rows()):
    if (index+1) % 500 == 0:
        print(f"row {index+1} / {len_order_context}")

    order_aid = row[0]
    order_aid_context = row[1]

    for click_aid in order_aid_context:
        # click_aid_context = order_context.row(by_predicate=(pl.col("order_aid") == click_aid))[1]
        click_aid_context = next((row[1] for row in order_context_rows if row[0] == click_aid), None)
        if click_aid_context is not None:
            # click_aid_context = click_aid_context[0]
            similarity = len(set(order_aid_context)) / len(set(order_aid_context).union(click_aid_context))
            if similarity > context_similarity_threshold:
                new_candidate_incompatible_pairs = [(order_aid, aid) for aid in click_aid_context]
                candidate_incompatible_pairs.extend(new_candidate_incompatible_pairs)

print("Done")
print()

incompatible_df = (
    pl.DataFrame(
        data=candidate_incompatible_pairs,
        orient='row',
        schema={"aid": pl.UInt32, "incompatible_aid": pl.UInt32}
    )
    .filter(pl.col("aid") != pl.col("incompatible_aid"))
    .unique()
    # Filter out pairs that are too often bought together
    .join(aids_often_ordered_together, left_on=["aid", "incompatible_aid"], right_on=["aid", "ordered_with"], how="anti")
    .sort(["aid", "incompatible_aid"])
)
print(incompatible_df)

# Write to csv
incompatible_df.write_csv("./incompatible_matrix_10_0_08.csv")

shape: (54_885_019, 3)
┌─────────┬────────┬──────────┐
│ aid     ┆ type   ┆ group    │
│ ---     ┆ ---    ┆ ---      │
│ u32     ┆ str    ┆ u32      │
╞═════════╪════════╪══════════╡
│ 650278  ┆ clicks ┆ 1928806  │
│ 1170472 ┆ clicks ┆ 5669514  │
│ 120879  ┆ clicks ┆ 6288250  │
│ 782195  ┆ clicks ┆ 14680300 │
│ 614880  ┆ clicks ┆ 8579638  │
│ …       ┆ …      ┆ …        │
│ 462368  ┆ clicks ┆ 14932259 │
│ 698925  ┆ clicks ┆ 6344663  │
│ 1475721 ┆ clicks ┆ 3193570  │
│ 1808088 ┆ orders ┆ 6065309  │
│ 212895  ┆ clicks ┆ 5200209  │
└─────────┴────────┴──────────┘
shape: (17_404, 2)
┌───────────┬──────────────────────────────┐
│ order_aid ┆ context                      │
│ ---       ┆ ---                          │
│ u32       ┆ list[u32]                    │
╞═══════════╪══════════════════════════════╡
│ 721998    ┆ [1528260, 721998]            │
│ 1344385   ┆ [608780, 979311, … 293028]   │
│ 441353    ┆ [139685, 441353]             │
│ 1058104   ┆ [1058104, 1380567]           │
│ 1328763

In [5]:
sub_sessions = get_sub_sessions(with_next_event=False)

orders_of_sub_sessions_df = get_orders_of_sub_session(sub_sessions)

# Lower threshold for how many times item has to be ordered so that some conclusions can be made
lower_threshold = 3
# how many times item has to be ordered with another item so that their relation can be considered strong
ordered_together_threshold = 15

# Filter aids with too few total orders
allowed_aids_df = (
    orders_of_sub_sessions_df
    .group_by("order_aid")
    .agg(pl.len().alias("count"))
    .filter(pl.col("count") >= lower_threshold)
)

# Find all order aid pairs
multi_order_sub_sessions_df = (
    orders_of_sub_sessions_df
    # filter aids with too few orders
    .join(allowed_aids_df.select("order_aid"), on="order_aid", how="semi")
    # group orders and get unique aids for each order
    .group_by("sub_session")
    # Remove items ordered multiple times in single session
    .agg(pl.col("order_aid").unique())
    # filter out orders with only one item left
    .filter(pl.col("order_aid").list.len() > 1)
    .explode("order_aid")
)

# Find all order aid pairs and count how many times these pairs have been ordered together
# high correlation is used to draw conclusion of incompatibility
# low correlation is used to check if the proposed incompatible pairs are not ordered together
low_correlation, high_correlation = (
    multi_order_sub_sessions_df
    .join(multi_order_sub_sessions_df, on="sub_session", how="inner")
    .drop("sub_session")
    .group_by(["order_aid", "order_aid_right"])
    .agg(pl.len().alias("count"))
    .filter(pl.col("order_aid") != pl.col("order_aid_right"))
    .sort("count")
    # Partition pairs by how many times they have been ordered together
    .with_columns(over_threshold = pl.col("count") >= ordered_together_threshold)
    .partition_by("over_threshold")
)

# Combine aid pairs into groups of aids that have been ordered together
low_correlation = (
    low_correlation
    .drop(["over_threshold", "count"])
    .group_by("order_aid")
    .agg(pl.col("order_aid_right").alias("ordered_together_with"))
    .explode("ordered_together_with")
)

high_correlation = (
    high_correlation
    .drop(["over_threshold", "count"])
    .group_by("order_aid")
    .agg(pl.col("order_aid_right").alias("ordered_together_with"))
)

# Find all incompatible pairs by finding aids that have never been ordered together but have common aids that they are often ordered with
candidate_incompatible_pairs = []

print("Start")
for index, order_aid in enumerate(high_correlation.select("order_aid").to_numpy().reshape(-1)):
    if (index+1) % 1000 == 0:
        print("row", index+1)

    aids_ordered_together = high_correlation.row(by_predicate=(pl.col("order_aid") == order_aid))[1]
    for aid in aids_ordered_together:
        candidate_incompatible_aids = high_correlation.row(by_predicate=(pl.col("order_aid") == aid))[1]
        new_candidate_incompatible_pairs = [(order_aid, aid) for aid in candidate_incompatible_aids]
        candidate_incompatible_pairs.extend(new_candidate_incompatible_pairs)

print("Done")
print()


incompatible_df = (
    pl.DataFrame(
        data=candidate_incompatible_pairs,
        orient='row',
        schema={"aid": pl.UInt32, "incompatible_aid": pl.UInt32}
    )
    # Filter out pairs that have low correlation
    .join(low_correlation, left_on=["aid", "incompatible_aid"], right_on=["order_aid", "ordered_together_with"], how="anti")
    .filter(pl.col("aid") != pl.col("incompatible_aid"))
    .unique()
    .sort(["aid", "incompatible_aid"])
)
print(incompatible_df)

Start
row 1000
row 2000
row 3000
row 4000
Done

shape: (23_192, 2)
┌─────────┬──────────────────┐
│ aid     ┆ incompatible_aid │
│ ---     ┆ ---              │
│ u32     ┆ u32              │
╞═════════╪══════════════════╡
│ 1029    ┆ 732479           │
│ 1029    ┆ 1205296          │
│ 1029    ┆ 1337658          │
│ 1414    ┆ 282805           │
│ 1414    ┆ 1581568          │
│ …       ┆ …                │
│ 1854493 ┆ 1316639          │
│ 1854493 ┆ 1458929          │
│ 1854493 ┆ 1467999          │
│ 1854493 ┆ 1554991          │
│ 1854493 ┆ 1617987          │
└─────────┴──────────────────┘


In [6]:
# Write to csv
incompatible_df.write_csv("./incompatible_matrix_3_15.csv")

In [6]:
unique_incompatible_count = (
    incompatible_df
    .select("aid")
    .n_unique()
)

print(f"Incompatible products found for {unique_incompatible_count} unique products")

Incompatible products found for 4931 unique products


### Click count matrix
Click counts of aids normalized by the maximum click count.

In [17]:
click_count_matrix = (
    exploded_df
    .filter(pl.col("type") == "clicks")
    .group_by("aid")
    .agg(pl.len().alias("count"))
    .with_columns(weight = (pl.col("count") / pl.col("count").max()).cast(pl.Float32))
    .drop("count")
    .sort("aid")
)

print(click_count_matrix)

click_count_matrix.write_csv("./click_count_matrix.csv")

shape: (1_855_603, 2)
┌─────────┬──────────┐
│ aid     ┆ weight   │
│ ---     ┆ ---      │
│ u32     ┆ f32      │
╞═════════╪══════════╡
│ 0       ┆ 0.000363 │
│ 1       ┆ 0.000272 │
│ 2       ┆ 0.00014  │
│ 3       ┆ 0.019235 │
│ 4       ┆ 0.001682 │
│ …       ┆ …        │
│ 1855598 ┆ 0.000058 │
│ 1855599 ┆ 0.000107 │
│ 1855600 ┆ 0.000676 │
│ 1855601 ┆ 0.000701 │
│ 1855602 ┆ 0.000157 │
└─────────┴──────────┘
