# Carts prediction model

In [1]:
import polars as pl

df_schema = {"session": pl.UInt32, "events": pl.List(pl.Struct({"aid": pl.UInt32, "ts": pl.UInt32, "type": pl.Utf8}))}
test_sessions_df = (
    pl.read_ndjson("../data/test/test_sessions.jsonl", schema=df_schema, low_memory=True)
    .explode("events")
    .unnest("events")
    .sort(["session", "ts"], descending=[False, True])
)

print(test_sessions_df.select("session").n_unique())

514739


In [6]:
df_schema = {"click_aid": pl.UInt32, "cart_aid": pl.UInt32, "probability": pl.Float32}
click_to_cart_matrix_df = (
    pl.read_csv("../matrices/click_to_cart_matrix_time-decay_1.csv", schema=df_schema)
    .sort(["click_aid", "probability"], descending=[False, True])
    .group_by("click_aid")
    # Take top 300 probabilities
    .agg(pl.all().top_k_by("probability", 300))
    .explode(["cart_aid", "probability"])
    .lazy()
)

df_schema = {"cart_aid": pl.UInt32, "next_cart_aid": pl.UInt32, "probability": pl.Float32}
cart_to_cart_matrix_df = (
    pl.scan_csv("../matrices/cart_to_cart_matrix_time-decay.csv", schema=df_schema)
    .sort(["cart_aid", "probability"], descending=[False, True])
    .group_by("cart_aid")
    # Take top 300 probabilities
    .agg(pl.all().top_k_by("probability", 300))
    .explode(["next_cart_aid", "probability"])
    .lazy()
)

In [20]:
# Count incompatible aids for session based on carted aids in the session
df_schema = {"aid": pl.UInt32, "incompatible_aid": pl.UInt32}
incompatible_matrix_df = (
    pl.read_csv("../matrices/incompatible_matrix.csv", schema=df_schema)
)

print(incompatible_matrix_df)

incompatible_aids_for_sessions = (
    test_sessions_df
    .filter(pl.col("type") != "clicks")
    .drop(["type", "ts"])
    .join(incompatible_matrix_df, on="aid", how="inner")
    .select(["session", "incompatible_aid"])
    .unique()
    .lazy()
)

print(incompatible_aids_for_sessions.collect(streaming=True))

shape: (53_510, 2)
┌─────────┬──────────────────┐
│ aid     ┆ incompatible_aid │
│ ---     ┆ ---              │
│ u32     ┆ u32              │
╞═════════╪══════════════════╡
│ 3       ┆ 776187           │
│ 3       ┆ 1180285          │
│ 3       ┆ 1771163          │
│ 3       ┆ 1852756          │
│ 137     ┆ 392403           │
│ …       ┆ …                │
│ 1855485 ┆ 962190           │
│ 1855485 ┆ 1057622          │
│ 1855506 ┆ 138484           │
│ 1855594 ┆ 658891           │
│ 1855594 ┆ 727265           │
└─────────┴──────────────────┘
shape: (117_737, 2)
┌─────────┬──────────────────┐
│ session ┆ incompatible_aid │
│ ---     ┆ ---              │
│ u32     ┆ u32              │
╞═════════╪══════════════════╡
│ 100236  ┆ 1037331          │
│ 130747  ┆ 849217           │
│ 278079  ┆ 792934           │
│ 294552  ┆ 654738           │
│ 413005  ┆ 930620           │
│ …       ┆ …                │
│ 412140  ┆ 112862           │
│ 389144  ┆ 1180883          │
│ 286734  ┆ 316716           │


In [3]:
last_unique_clicks_of_session = (
    test_sessions_df
    .filter(pl.col("type") == "clicks")
    .drop("type")
    # Only last unique clicks
    .with_columns(ts=pl.col("ts").max().over(["session", "aid"]))
    .unique()
    .group_by("session")
    .agg(pl.all().top_k_by("ts", 5))
    .explode(["aid", "ts"])
    .drop("ts")
    .lazy()
)

print(last_unique_clicks_of_session.collect(streaming=True))
print(last_unique_clicks_of_session.select("session").collect(streaming=True).n_unique())

shape: (954_599, 2)
┌─────────┬─────────┐
│ session ┆ aid     │
│ ---     ┆ ---     │
│ u32     ┆ u32     │
╞═════════╪═════════╡
│ 180370  ┆ 768718  │
│ 180370  ┆ 114966  │
│ 180370  ┆ 226432  │
│ 180370  ┆ 1383179 │
│ 180370  ┆ 82906   │
│ …       ┆ …       │
│ 161729  ┆ 1396869 │
│ 262671  ┆ 841757  │
│ 262671  ┆ 484944  │
│ 118877  ┆ 1543556 │
│ 146518  ┆ 1704099 │
└─────────┴─────────┘
512800


In [4]:
last_unique_carts_or_orders = (
    test_sessions_df
    .filter(pl.col("type") != "clicks")
    .drop("type")
    # Only last unique carts and orders
    .with_columns(ts=pl.col("ts").max().over("session", "aid"))
    .unique()
    .group_by("session")
    .agg(pl.all().top_k_by("ts", 5))
    .explode(["aid", "ts"])
    .drop("ts")
    .lazy()
)

print(last_unique_carts_or_orders.collect(streaming=True))
print(last_unique_carts_or_orders.select("session").collect(streaming=True).n_unique())

shape: (106_690, 2)
┌─────────┬─────────┐
│ session ┆ aid     │
│ ---     ┆ ---     │
│ u32     ┆ u32     │
╞═════════╪═════════╡
│ 311731  ┆ 453736  │
│ 225464  ┆ 605053  │
│ 448719  ┆ 920295  │
│ 289818  ┆ 1234414 │
│ 289818  ┆ 672140  │
│ …       ┆ …       │
│ 22907   ┆ 199662  │
│ 95541   ┆ 410310  │
│ 193979  ┆ 1215540 │
│ 469500  ┆ 1517147 │
│ 355881  ┆ 1216403 │
└─────────┴─────────┘
67602


In [7]:
w_click_to_cart = 0.7
w_cart_to_cart = 1 - w_click_to_cart

click_to_cart_predictions_df = (
    last_unique_clicks_of_session
    .join(click_to_cart_matrix_df, left_on="aid", right_on="click_aid", how="left")
    # Fill missing values
    .with_columns(
        cart_aid=pl.when(pl.col("cart_aid").is_null()).then(pl.col("aid")).otherwise(pl.col("cart_aid")),
        probability=pl.when(pl.col("probability").is_null()).then(pl.lit(0.0)).otherwise(pl.col("probability"))
    )
    # bias to the recently clicked aids
    .with_columns(bias=pl.when(pl.col("aid") == pl.col("cart_aid")).then(1).otherwise(0).cast(pl.Float32))
    .with_columns(probability=(pl.col("probability") + pl.col("bias")))
    .group_by(["session", "cart_aid"])
    .agg(pl.col("probability").sum())
    .with_columns(probability=(w_click_to_cart * pl.col("probability")))
)

cart_or_order_to_cart_predictions_df = (
    last_unique_carts_or_orders
    .join(cart_to_cart_matrix_df, left_on="aid", right_on="cart_aid", how="left")
    # Fill missing values
    .with_columns(
        next_cart_aid=pl.when(pl.col("next_cart_aid").is_null()).then(pl.col("aid")).otherwise(pl.col("next_cart_aid")),
        probability=pl.when(pl.col("probability").is_null()).then(pl.lit(0.0)).otherwise(pl.col("probability"))
    )
    # bias to the recently carted aids
    .with_columns(bias=pl.when(pl.col("cart_aid") == pl.col("next_cart_aid")).then(1).otherwise(0).cast(pl.Float32))
    .group_by(["session", "next_cart_aid"])
    .agg(pl.col("probability").sum())
    .with_columns(probability=(w_cart_to_cart * pl.col("probability")))
    .rename({"next_cart_aid": "cart_aid"})
)

carts_prediction_df = (
    pl.concat([
        click_to_cart_predictions_df,
        cart_or_order_to_cart_predictions_df
    ])
    # equal weight for all predictions
    .group_by(["session", "cart_aid"])
    .agg(pl.col("probability").sum())
    # Filter out incompatible aids
    # .join(incompatible_aids_for_sessions, left_on=["session", "cart_aid"], right_on=["session", "incompatible_aid"], how="anti")
    # Form the prediction csv
    .group_by("session")
    .agg(pl.col("cart_aid").top_k_by("probability", 20).cast(str))
    .with_columns(
        (pl.col("session").cast(str) + "_carts"),
        pl.col("cart_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "cart_aid": "labels"})
    .collect(streaming=True)
)

print(carts_prediction_df)

carts_prediction_df.write_csv("out/carts_predictions.csv")

# Last 5 unique aid clicks, all unique carts and orders of session.
# click_to_cart_matrix_time-decay.csv, cart_to_cart_matrix_time-decay at 1/1 weights. Top 300 probabilities
# Recall@20 score: 0.5138459054334078
# Recall@10 score: 0.48429432888407076
# Recall@5 score: 0.4630885135559729
# MRR@20 score: 0.5141451405842936
# MRR@10 score: 0.5119015720964412
# MRR@5 score: 0.5069403685782773

# Using incompatible matrix with orders did nothing. Using with carts Recall@20 score: 0.5097553755483194, slightly worse.

# Last 5 unique aid clicks, all unique carts and orders of session.
# click_to_cart_matrix_time-decay.csv, cart_to_cart_matrix_time-decay at 0.7/0.3 weights. Top 300 probabilities
# Recall@20 score:  0.5153394870690815
# Recall@10 score: 0.48495833051019716
# Recall@5 score: 0.4632700511796454
# MRR@20 score: 0.5154712250630091
# MRR@10 score: 0.5131787729044937
# MRR@5 score: 0.5081498784601399

# Last 5 unique aid clicks, all unique carts and orders of session.
# click_to_cart_matrix_time-decay.csv, cart_to_cart_matrix_time-decay at 0.9/0.1 weights. Top 300 probabilities
# Recall@20 score: 0.5134489625662693
# MRR@20 score: 0.5159078249834574

# Last 5 unique aid clicks, all unique carts and orders of session.
# click_to_cart_matrix_time-decay.csv, cart_to_cart_matrix_time-decay at 0.3/0.7 weights. Top 300 probabilities
# Recall@20 score: 0.5115180709922226
# MRR@20 score: 0.5103072615668478

# Last 5 unique aid clicks, last 5 unique carts or orders of session.
# click_to_cart_matrix_time-decay.csv, cart_to_cart_matrix_time-decay at 0.7/0.3 weights. Top 300 probabilities
# Recall@20 score: 0.5160391829704782
# Recall@10 score: 0.48556135239514875
# Recall@5 score: 0.46365407307587575
# MRR@20 score: 0.5156306427297778
# MRR@10 score: 0.5133202642590753
# MRR@5 score: 0.5082480682513211

# New better click-to-cart matrix
# Last 5 unique aid clicks, last 5 unique carts or orders of session.
# click_to_cart_matrix_time-decay.csv, cart_to_cart_matrix_time-decay at 0.7/0.3 weights. Top 300 probabilities
# Recall@20 score: 0.5228477623186846, Best
# Recall@10 score: 0.4931838200420083
# Recall@5 score: 0.47083877364352994
# MRR@20 score: 0.5192958010328336
# MRR@10 score: 0.5170727259024377
# MRR@5 score: 0.5119944998708487


shape: (514_739, 2)
┌──────────────┬─────────────────────────────────┐
│ session_type ┆ labels                          │
│ ---          ┆ ---                             │
│ str          ┆ str                             │
╞══════════════╪═════════════════════════════════╡
│ 203203_carts ┆ 1033795 448682 457537 1751964 … │
│ 82581_carts  ┆ 95710 414103 932046 211453 108… │
│ 289702_carts ┆ 1626577 1297763 1223830 122806… │
│ 336877_carts ┆ 925621 699087 1651437 524269 3… │
│ 373200_carts ┆ 891286 78881 1251548 933379 53… │
│ …            ┆ …                               │
│ 87803_carts  ┆ 704704 64600 1775057 689660 12… │
│ 317849_carts ┆ 1174319 386532 1416142 369161 … │
│ 332661_carts ┆ 782947 1198914 1635743 189083 … │
│ 450636_carts ┆ 1311747 1178814 1287397 817418… │
│ 448671_carts ┆ 1006991 749503 898986 1184000 … │
└──────────────┴─────────────────────────────────┘
