# Item relation matrices

### Get the data

In [1]:
import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt64, "type": str})
df_schema = {"session": pl.UInt32, "events": pl.List(event_schema)}

df = pl.read_ndjson('../data/train.jsonl', schema=df_schema, low_memory=True)

print(df)

shape: (12_899_779, 2)
┌──────────┬─────────────────────────────────┐
│ session  ┆ events                          │
│ ---      ┆ ---                             │
│ u32      ┆ list[struct[3]]                 │
╞══════════╪═════════════════════════════════╡
│ 0        ┆ [{1517085,1659304800025,"click… │
│ 1        ┆ [{424964,1659304800025,"carts"… │
│ 2        ┆ [{763743,1659304800038,"clicks… │
│ 3        ┆ [{1425967,1659304800095,"carts… │
│ 4        ┆ [{613619,1659304800119,"clicks… │
│ …        ┆ …                               │
│ 12899774 ┆ [{33035,1661723968869,"clicks"… │
│ 12899775 ┆ [{1743151,1661723970935,"click… │
│ 12899776 ┆ [{548599,1661723972537,"clicks… │
│ 12899777 ┆ [{384045,1661723976974,"clicks… │
│ 12899778 ┆ [{561560,1661723983611,"clicks… │
└──────────┴─────────────────────────────────┘


In [2]:
# Sessions
exploded_df = (
    df
    .explode("events")
    .unnest("events")
)

print(exploded_df)

shape: (216_716_096, 4)
┌──────────┬─────────┬───────────────┬────────┐
│ session  ┆ aid     ┆ ts            ┆ type   │
│ ---      ┆ ---     ┆ ---           ┆ ---    │
│ u32      ┆ u32     ┆ u64           ┆ str    │
╞══════════╪═════════╪═══════════════╪════════╡
│ 0        ┆ 1517085 ┆ 1659304800025 ┆ clicks │
│ 0        ┆ 1563459 ┆ 1659304904511 ┆ clicks │
│ 0        ┆ 1309446 ┆ 1659367439426 ┆ clicks │
│ 0        ┆ 16246   ┆ 1659367719997 ┆ clicks │
│ 0        ┆ 1781822 ┆ 1659367871344 ┆ clicks │
│ …        ┆ …       ┆ …             ┆ …      │
│ 12899776 ┆ 1737908 ┆ 1661723987073 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723976974 ┆ clicks │
│ 12899777 ┆ 384045  ┆ 1661723986800 ┆ clicks │
│ 12899778 ┆ 561560  ┆ 1661723983611 ┆ clicks │
│ 12899778 ┆ 32070   ┆ 1661723994936 ┆ clicks │
└──────────┴─────────┴───────────────┴────────┘


In [3]:
sub_sessions = (
    exploded_df
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .sort(["session", "ts"])
    .with_columns(
        next_session = pl.col("session").shift(-1),
        next_aid = pl.col("aid").shift(-1),
        next_ts = pl.col("ts").shift(-1),
        next_type = pl.col("type").shift(-1),
    )
    # Row is a sub session boundary if there is existing session boundary or if time between events is more than 30 minutes
    .with_columns(
        is_session_boundary = ((pl.col("session") != pl.col("next_session")) | (pl.col("next_ts") - pl.col("ts") > 1800)),
    )
    .with_columns(
        sub_session = pl.col("is_session_boundary").cum_sum().cast(pl.UInt32),
    )
    # Filter out session boundaries. This also removes sub sessions with only 1 event which are not interesting
    .filter(pl.col("is_session_boundary").not_())
    .drop("is_session_boundary")
)

print(sub_sessions)

# Check that all the sub sessions are part of only one original session
sub_session_parent_count = (
    sub_sessions
    .group_by("sub_session")
    .agg(pl.col("session").n_unique().alias("n_sessions"))
    .sort("n_sessions", descending=True)
)
print(sub_session_parent_count)

print("Amount of sub sessions:", sub_sessions.select("sub_session").n_unique())

shape: (170_654_920, 9)
┌──────────┬─────────┬────────────┬────────┬───┬──────────┬────────────┬───────────┬─────────────┐
│ session  ┆ aid     ┆ ts         ┆ type   ┆ … ┆ next_aid ┆ next_ts    ┆ next_type ┆ sub_session │
│ ---      ┆ ---     ┆ ---        ┆ ---    ┆   ┆ ---      ┆ ---        ┆ ---       ┆ ---         │
│ u32      ┆ u32     ┆ u32        ┆ str    ┆   ┆ u32      ┆ u32        ┆ str       ┆ u32         │
╞══════════╪═════════╪════════════╪════════╪═══╪══════════╪════════════╪═══════════╪═════════════╡
│ 0        ┆ 1517085 ┆ 1659304800 ┆ clicks ┆ … ┆ 1563459  ┆ 1659304904 ┆ clicks    ┆ 0           │
│ 0        ┆ 1309446 ┆ 1659367439 ┆ clicks ┆ … ┆ 16246    ┆ 1659367719 ┆ clicks    ┆ 1           │
│ 0        ┆ 16246   ┆ 1659367719 ┆ clicks ┆ … ┆ 1781822  ┆ 1659367871 ┆ clicks    ┆ 1           │
│ 0        ┆ 1781822 ┆ 1659367871 ┆ clicks ┆ … ┆ 1152674  ┆ 1659367885 ┆ clicks    ┆ 1           │
│ 0        ┆ 1649869 ┆ 1659369893 ┆ carts  ┆ … ┆ 461689   ┆ 1659369898 ┆ carts     ┆ 

### Click to click matrix
Click to click matrix is defined as the probabilities of other aids being clicked immediately after the previous aid is clicked.
Click to click matrix is formed from the sub sessions since there is no point in counting subsequent clicks that are from a user coming back to site after a long time

In [4]:
# Count how many same click to click events there are
subsequent_clicks_count = (
    sub_sessions
    .filter((pl.col("type") == "clicks") & (pl.col("next_type") == "clicks"))
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_sum = (
    subsequent_clicks_count
    .group_by("aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been clicked
click_to_click_matrix = (
    subsequent_clicks_count
    .join(aid_clicks_sum, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["aid", "next_aid"])
)

print(click_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_click_matrix.select("aid").n_unique())
print("Total probability:", click_to_click_matrix.select("probability").sum().select(pl.first()).item())

shape: (63_195_404, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ u32     ┆ u32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 0       ┆ 0        ┆ 0.02381     │
│ 0       ┆ 13759    ┆ 0.02381     │
│ 0       ┆ 54474    ┆ 0.02381     │
│ 0       ┆ 90491    ┆ 0.02381     │
│ 0       ┆ 218900   ┆ 0.02381     │
│ …       ┆ …        ┆ …           │
│ 1855602 ┆ 1598688  ┆ 0.058824    │
│ 1855602 ┆ 1621374  ┆ 0.058824    │
│ 1855602 ┆ 1693232  ┆ 0.058824    │
│ 1855602 ┆ 1783511  ┆ 0.058824    │
│ 1855602 ┆ 1855602  ┆ 0.058824    │
└─────────┴──────────┴─────────────┘
Unique aids: 1852443
Total probability: 1852443.125


In [5]:
# save to csv
click_to_click_matrix.write_csv("./click_to_click_matrix.csv")

### Click to cart matrix
Click to cart matrix is defined as the probabilities of other aids being added to cart in the same sub session after an aid is clicked.
Click to cart matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [6]:
# Get clicks and carts of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "click_ts", "aid": "click_aid"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "cart_ts", "aid": "cart_aid"})
)

carts_after_clicks_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine clicks and carts of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after clicks
    .filter(pl.col("click_ts") < pl.col("cart_ts"))
    .select(["sub_session", "click_aid", "cart_aid"])
)

# Count how many same click to cart events there are
click_to_cart_count = (
    carts_after_clicks_in_sub_sessions
    .group_by(["click_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_sum = (
    click_to_cart_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been clicked
click_to_cart_matrix = (
    click_to_cart_count
    .join(aid_clicks_sum, on="click_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["click_aid", "cart_aid"])
)

print(click_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_cart_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_cart_matrix.select("probability").sum().select(pl.first()).item())

shape: (68_511_302, 3)
┌───────────┬──────────┬─────────────┐
│ click_aid ┆ cart_aid ┆ probability │
│ ---       ┆ ---      ┆ ---         │
│ u32       ┆ u32      ┆ f32         │
╞═══════════╪══════════╪═════════════╡
│ 0         ┆ 29217    ┆ 0.020408    │
│ 0         ┆ 31465    ┆ 0.020408    │
│ 0         ┆ 45036    ┆ 0.020408    │
│ 0         ┆ 78027    ┆ 0.020408    │
│ 0         ┆ 150507   ┆ 0.040816    │
│ …         ┆ …        ┆ …           │
│ 1855602   ┆ 1192996  ┆ 0.055556    │
│ 1855602   ┆ 1376245  ┆ 0.111111    │
│ 1855602   ┆ 1513725  ┆ 0.055556    │
│ 1855602   ┆ 1768521  ┆ 0.055556    │
│ 1855602   ┆ 1783511  ┆ 0.111111    │
└───────────┴──────────┴─────────────┘
Unique aids: 1630426
Total probability: 1630426.0


In [7]:
# save to csv
click_to_cart_matrix.write_csv("./click_to_cart_matrix.csv")

### Click to order matrix
Click to order matrix is defined as the probabilities of other aids being ordered in the same sub session after an aid is clicked.
Click to order matrix is formed from the sub sessions since the sub session should show clear intent of the user to buy items they click.

In [8]:
# Get clicks and orders of sub sessions
clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "click_ts", "aid": "click_aid"})
)

orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "order_ts", "aid": "order_aid"})
)

orders_after_clicks_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine clicks and orders of sub sessions
    .join(clicks_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after clicks
    .filter(pl.col("click_ts") < pl.col("order_ts"))
    .select(["sub_session", "click_aid", "order_aid"])
)

# Count how many same click to order events there are
click_to_order_count = (
    orders_after_clicks_in_sub_sessions
    .group_by(["click_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the clicks for each aid
aid_clicks_sum = (
    click_to_order_count
    .group_by("click_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been clicked
click_to_order_matrix = (
    click_to_order_count
    .join(aid_clicks_sum, on="click_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["click_aid", "order_aid"])
)

print(click_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", click_to_order_matrix.select("click_aid").n_unique())
print("Total probability:", click_to_order_matrix.select("probability").sum().select(pl.first()).item())

shape: (28_372_718, 3)
┌───────────┬───────────┬─────────────┐
│ click_aid ┆ order_aid ┆ probability │
│ ---       ┆ ---       ┆ ---         │
│ u32       ┆ u32       ┆ f32         │
╞═══════════╪═══════════╪═════════════╡
│ 0         ┆ 53946     ┆ 0.026316    │
│ 0         ┆ 78027     ┆ 0.026316    │
│ 0         ┆ 197519    ┆ 0.026316    │
│ 0         ┆ 312330    ┆ 0.052632    │
│ 0         ┆ 359350    ┆ 0.026316    │
│ …         ┆ …         ┆ …           │
│ 1855602   ┆ 512179    ┆ 0.285714    │
│ 1855602   ┆ 796654    ┆ 0.142857    │
│ 1855602   ┆ 1513725   ┆ 0.142857    │
│ 1855602   ┆ 1768521   ┆ 0.142857    │
│ 1855602   ┆ 1783511   ┆ 0.285714    │
└───────────┴───────────┴─────────────┘
Unique aids: 1113411
Total probability: 1113411.0


In [9]:
# save to csv
click_to_order_matrix.write_csv("./click_to_order_matrix.csv")

### Cart to click matrix
Cart to click matrix is defined as the probabilities of other aids being clicked immediately after an aid is added to cart.

In [10]:
# Get carts and clicks of sub sessions
carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "cart_ts", "aid": "cart_aid"})
)

clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "click_ts", "aid": "click_aid"})
)

clicks_after_carts_in_sub_sessions = (
    clicks_of_sub_sessions
    # Combine carts and clicks of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only clicks that happened after carts
    .filter(pl.col("cart_ts") < pl.col("click_ts"))
    .select(["sub_session", "cart_aid", "click_aid"])
)

# Count how many same cart to click events there are
cart_to_click_count = (
    clicks_after_carts_in_sub_sessions
    .group_by(["cart_aid", "click_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_sum = (
    cart_to_click_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been added to cart
cart_to_click_matrix = (
    cart_to_click_count
    .join(aid_carts_sum, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["cart_aid", "click_aid"])
)

print(cart_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_click_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_click_matrix.select("probability").sum().select(pl.first()).item())

shape: (67_045_900, 3)
┌──────────┬───────────┬─────────────┐
│ cart_aid ┆ click_aid ┆ probability │
│ ---      ┆ ---       ┆ ---         │
│ u32      ┆ u32       ┆ f32         │
╞══════════╪═══════════╪═════════════╡
│ 1        ┆ 835715    ┆ 1.0         │
│ 3        ┆ 3         ┆ 0.318777    │
│ 3        ┆ 16778     ┆ 0.004367    │
│ 3        ┆ 24318     ┆ 0.002183    │
│ 3        ┆ 46596     ┆ 0.002183    │
│ …        ┆ …         ┆ …           │
│ 1855601  ┆ 1151932   ┆ 0.076923    │
│ 1855601  ┆ 1428175   ┆ 0.076923    │
│ 1855601  ┆ 1636201   ┆ 0.076923    │
│ 1855601  ┆ 1712873   ┆ 0.076923    │
│ 1855601  ┆ 1855601   ┆ 0.076923    │
└──────────┴───────────┴─────────────┘
Unique aids: 1101844
Total probability: 1101844.0


In [11]:
# save to csv
cart_to_click_matrix.write_csv("./cart_to_click_matrix.csv")

### Cart to cart matrix
Cart to cart matrix is defined as the probabilities of other aids being added to cart later in the same sub session where an aid is added to cart.

In [12]:
# Get carts of sub sessions
carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .select(["sub_session", "ts", "aid"])
)

next_carts_of_sub_sessions = (
    carts_of_sub_sessions
    .rename({"ts": "next_ts", "aid": "next_aid"})
)

# Find subsequent carts in the same sub session
subsequent_carts = (
    carts_of_sub_sessions
    .join(next_carts_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("ts") < pl.col("next_ts"))
    .select(["sub_session", "aid", "next_aid"])
)

# Count how many same cart to cart events there are
subsequent_carts_count = (
    subsequent_carts
    .group_by(["aid", "next_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_sum = (
    subsequent_carts_count
    .group_by("aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been added to cart
cart_to_cart_matrix = (
    subsequent_carts_count
    .join(aid_carts_sum, on="aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["aid", "next_aid"])
)

print(cart_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_cart_matrix.select("aid").n_unique())
print("Total probability:", cart_to_cart_matrix.select("probability").sum().select(pl.first()).item())

shape: (21_537_440, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ u32     ┆ u32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 1       ┆ 1832177  ┆ 1.0         │
│ 3       ┆ 3        ┆ 0.357143    │
│ 3       ┆ 16778    ┆ 0.007937    │
│ 3       ┆ 67776    ┆ 0.007937    │
│ 3       ┆ 109499   ┆ 0.007937    │
│ …       ┆ …        ┆ …           │
│ 1855601 ┆ 1326991  ┆ 0.076923    │
│ 1855601 ┆ 1566830  ┆ 0.076923    │
│ 1855601 ┆ 1700846  ┆ 0.076923    │
│ 1855601 ┆ 1712873  ┆ 0.076923    │
│ 1855601 ┆ 1786336  ┆ 0.076923    │
└─────────┴──────────┴─────────────┘
Unique aids: 947173
Total probability: 947173.0625


In [13]:
# save to csv
cart_to_cart_matrix.write_csv("./cart_to_cart_matrix.csv")

### Cart to order matrix
Cart to order matrix is defined as the probabilities of other aids being ordered later in the same sub session where an aid is added to cart.

In [14]:
# Get carts and orders of sub sessions
carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "cart_ts", "aid": "cart_aid"})
)

orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "order_ts", "aid": "order_aid"})
)

orders_after_carts_in_sub_sessions = (
    orders_of_sub_sessions
    # Combine carts and orders of sub sessions
    .join(carts_of_sub_sessions, on="sub_session", how="inner")
    # Keep only orders that happened after carts
    .filter(pl.col("cart_ts") < pl.col("order_ts"))
    .select(["sub_session", "cart_aid", "order_aid"])
)

# Count how many same cart to order events there are
cart_to_order_count = (
    orders_after_carts_in_sub_sessions
    .group_by(["cart_aid", "order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the carts for each aid
aid_carts_sum = (
    cart_to_order_count
    .group_by("cart_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been added to cart
cart_to_order_matrix = (
    cart_to_order_count
    .join(aid_carts_sum, on="cart_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["cart_aid", "order_aid"])
)

print(cart_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", cart_to_order_matrix.select("cart_aid").n_unique())
print("Total probability:", cart_to_order_matrix.select("probability").sum().select(pl.first()).item())

shape: (10_252_477, 3)
┌──────────┬───────────┬─────────────┐
│ cart_aid ┆ order_aid ┆ probability │
│ ---      ┆ ---       ┆ ---         │
│ u32      ┆ u32       ┆ f32         │
╞══════════╪═══════════╪═════════════╡
│ 3        ┆ 3         ┆ 0.333333    │
│ 3        ┆ 22107     ┆ 0.027778    │
│ 3        ┆ 67776     ┆ 0.027778    │
│ 3        ┆ 138431    ┆ 0.027778    │
│ 3        ┆ 164205    ┆ 0.055556    │
│ …        ┆ …         ┆ …           │
│ 1855601  ┆ 1428175   ┆ 0.090909    │
│ 1855601  ┆ 1486834   ┆ 0.090909    │
│ 1855601  ┆ 1566830   ┆ 0.090909    │
│ 1855601  ┆ 1712873   ┆ 0.090909    │
│ 1855601  ┆ 1786336   ┆ 0.090909    │
└──────────┴───────────┴─────────────┘
Unique aids: 624940
Total probability: 624940.0


In [15]:
# save to csv
cart_to_order_matrix.write_csv("./cart_to_order_matrix.csv")

### Order to click matrix
Order to click matrix is defined as the probabilities of other aids being clicked immediately after an aid is ordered.

In [16]:
# Get orders and clicks of sub sessions
orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "order_ts", "aid": "order_aid"})
)

clicks_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "clicks")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "click_ts", "aid": "click_aid"})
)

clicks_after_orders_in_sub_sessions = (
    clicks_of_sub_sessions
    # Combine orders and clicks of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only clicks that happened after orders
    .filter(pl.col("order_ts") < pl.col("click_ts"))
    .select(["sub_session", "order_aid", "click_aid"])
)

# Count how many same order to click events there are
order_to_click_count = (
    clicks_after_orders_in_sub_sessions
    .group_by(["order_aid", "click_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_sum = (
    order_to_click_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being clicked immediately after another item has been ordered
order_to_click_matrix = (
    order_to_click_count
    .join(aid_orders_sum, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["order_aid", "click_aid"])
)

print(order_to_click_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_click_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_click_matrix.select("probability").sum().select(pl.first()).item())

shape: (3_001_626, 3)
┌───────────┬───────────┬─────────────┐
│ order_aid ┆ click_aid ┆ probability │
│ ---       ┆ ---       ┆ ---         │
│ u32       ┆ u32       ┆ f32         │
╞═══════════╪═══════════╪═════════════╡
│ 3         ┆ 3         ┆ 0.152174    │
│ 3         ┆ 67776     ┆ 0.021739    │
│ 3         ┆ 139351    ┆ 0.043478    │
│ 3         ┆ 332381    ┆ 0.021739    │
│ 3         ┆ 383680    ┆ 0.021739    │
│ …         ┆ …         ┆ …           │
│ 1855594   ┆ 1678106   ┆ 0.014286    │
│ 1855594   ┆ 1688187   ┆ 0.014286    │
│ 1855594   ┆ 1775911   ┆ 0.014286    │
│ 1855594   ┆ 1828181   ┆ 0.014286    │
│ 1855594   ┆ 1855594   ┆ 0.185714    │
└───────────┴───────────┴─────────────┘
Unique aids: 243925
Total probability: 243925.0


In [17]:
# save to csv
order_to_click_matrix.write_csv("./order_to_click_matrix.csv")

### Order to cart matrix
Order to cart matrix is defined as the probabilities of other aids being added to cart later in the same sub session where an aid is ordered.

In [18]:
# Get orders and carts of sub sessions
orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "order_ts", "aid": "order_aid"})
)

carts_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "carts")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "cart_ts", "aid": "cart_aid"})
)

carts_after_orders_in_sub_sessions = (
    carts_of_sub_sessions
    # Combine orders and carts of sub sessions
    .join(orders_of_sub_sessions, on="sub_session", how="inner")
    # Keep only carts that happened after orders
    .filter(pl.col("order_ts") < pl.col("cart_ts"))
    .select(["sub_session", "order_aid", "cart_aid"])
)

# Count how many same order to cart events there are
order_to_cart_count = (
    carts_after_orders_in_sub_sessions
    .group_by(["order_aid", "cart_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_sum = (
    order_to_cart_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being added to cart after another item has been ordered
order_to_cart_matrix = (
    order_to_cart_count
    .join(aid_orders_sum, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["order_aid", "cart_aid"])
)

print(order_to_cart_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_cart_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_cart_matrix.select("probability").sum().select(pl.first()).item())

shape: (814_744, 3)
┌───────────┬──────────┬─────────────┐
│ order_aid ┆ cart_aid ┆ probability │
│ ---       ┆ ---      ┆ ---         │
│ u32       ┆ u32      ┆ f32         │
╞═══════════╪══════════╪═════════════╡
│ 3         ┆ 3        ┆ 0.5         │
│ 3         ┆ 139351   ┆ 0.125       │
│ 3         ┆ 905951   ┆ 0.125       │
│ 3         ┆ 1347221  ┆ 0.125       │
│ 3         ┆ 1821347  ┆ 0.125       │
│ …         ┆ …        ┆ …           │
│ 1855594   ┆ 1320670  ┆ 0.125       │
│ 1855594   ┆ 1334761  ┆ 0.125       │
│ 1855594   ┆ 1469175  ┆ 0.125       │
│ 1855594   ┆ 1556873  ┆ 0.125       │
│ 1855594   ┆ 1855594  ┆ 0.375       │
└───────────┴──────────┴─────────────┘
Unique aids: 147717
Total probability: 147717.0


In [19]:
# save to csv
order_to_cart_matrix.write_csv("./order_to_cart_matrix.csv")

### Order to order matrix
Order to order matrix is defined as the probabilities of other aids being ordered later in the same sub session where an aid is ordered.

In [20]:
# Get orders of sub sessions
orders_of_sub_sessions = (
    sub_sessions
    .filter(pl.col("type") == "orders")
    .select(["sub_session", "ts", "aid"])
    .rename({"ts": "order_ts", "aid": "order_aid"})
)

next_orders_of_sub_sessions = (
    orders_of_sub_sessions
    .rename({"order_ts": "next_order_ts", "order_aid": "next_order_aid"})
)

# Find subsequent orders in the same sub session
subsequent_orders = (
    orders_of_sub_sessions
    .join(next_orders_of_sub_sessions, on="sub_session", how="inner")
    .filter(pl.col("order_ts") < pl.col("next_order_ts"))
    .select(["sub_session", "order_aid", "next_order_aid"])
)

# Count how many same order to order events there are
subsequent_orders_count = (
    subsequent_orders
    .group_by(["order_aid", "next_order_aid"])
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Sum all the orders for each aid
aid_orders_sum = (
    subsequent_orders_count
    .group_by("order_aid")
    .agg(pl.sum("count").alias("sum"))
    .sort("sum", descending=True)
)

# Calculate the probabilities of items being ordered after another item has been ordered
order_to_order_matrix = (
    subsequent_orders_count
    .join(aid_orders_sum, on="order_aid")
    .with_columns(
        probability = pl.col("count") / pl.col("sum")
    )
    .with_columns(pl.col("probability").cast(pl.Float32))
    .drop(["count", "sum"])
    .sort(["order_aid", "next_order_aid"])
)

print(order_to_order_matrix)

# Check that probabilities sum to the amount of unique aids
print("Unique aids:", order_to_order_matrix.select("order_aid").n_unique())
print("Total probability:", order_to_order_matrix.select("probability").sum().select(pl.first()).item())

shape: (361_060, 3)
┌───────────┬────────────────┬─────────────┐
│ order_aid ┆ next_order_aid ┆ probability │
│ ---       ┆ ---            ┆ ---         │
│ u32       ┆ u32            ┆ f32         │
╞═══════════╪════════════════╪═════════════╡
│ 103       ┆ 774304         ┆ 1.0         │
│ 123       ┆ 421154         ┆ 0.5         │
│ 123       ┆ 523206         ┆ 0.5         │
│ 133       ┆ 133            ┆ 0.333333    │
│ 133       ┆ 732604         ┆ 0.333333    │
│ …         ┆ …              ┆ …           │
│ 1855547   ┆ 1853703        ┆ 0.083333    │
│ 1855547   ┆ 1855547        ┆ 0.083333    │
│ 1855571   ┆ 1126993        ┆ 1.0         │
│ 1855594   ┆ 156547         ┆ 0.5         │
│ 1855594   ┆ 1410784        ┆ 0.5         │
└───────────┴────────────────┴─────────────┘
Unique aids: 83514
Total probability: 83514.0078125


In [21]:
# save to csv
order_to_order_matrix.write_csv("./order_to_order_matrix.csv")