# Orders prediction model

In [1]:
import polars as pl

df_schema = {"session": pl.UInt32, "events": pl.List(pl.Struct({"aid": pl.UInt32, "ts": pl.UInt32, "type": pl.Utf8}))}
test_sessions_df = (
    pl.read_ndjson("../data/test/test_sessions.jsonl", schema=df_schema, low_memory=True)
    .explode("events")
    .unnest("events")
    .sort(["session", "ts"], descending=[False, True])
)

print(test_sessions_df.select("session").n_unique())

514739


In [3]:
# Try out carts predictions as orders predictions
# Recall@20: 0.6179312304631998
# MRR@20: 0.614267014584525

asdf = (
    pl.read_csv("out/carts_predictions.csv")
    .with_columns(session_type=pl.col("session_type").str.replace("carts", "orders"))
)
print(asdf)

asdf.write_csv("out/orders_predictions.csv")

shape: (514_739, 2)
┌───────────────┬─────────────────────────────────┐
│ session_type  ┆ labels                          │
│ ---           ┆ ---                             │
│ str           ┆ str                             │
╞═══════════════╪═════════════════════════════════╡
│ 434547_orders ┆ 368671 1840418 742592 298320 1… │
│ 335466_orders ┆ 1601965 1028090 1313390 721205… │
│ 458550_orders ┆ 1822672 1643318 1665617 104099… │
│ 323250_orders ┆ 19907 421691 669254 1429191 92… │
│ 310415_orders ┆ 1036076 1022319 1205469 960792… │
│ …             ┆ …                               │
│ 117838_orders ┆ 506852 1439896 1196308 1085166… │
│ 328064_orders ┆ 975052 506144 370131 1548418 8… │
│ 337386_orders ┆ 942367 1323899 9268 1570571 82… │
│ 435690_orders ┆ 1637932 312854 334903 181952 7… │
│ 20701_orders  ┆ 1617987 1600051 1634903 669402… │
└───────────────┴─────────────────────────────────┘


In [56]:
df_schema = {"click_aid": pl.UInt32, "cart_aid": pl.UInt32, "probability": pl.Float32}
click_to_cart_matrix_df = (
    pl.read_csv("../matrices/click_to_cart_matrix_time-decay.csv", schema=df_schema)
    .sort(["click_aid", "probability"], descending=[False, True])
    .group_by("click_aid")
    # Take top 300 probabilities
    .agg(pl.all().top_k_by("probability", 300))
    .explode(["cart_aid", "probability"])
    .lazy()
)

df_schema = {"cart_aid": pl.UInt32, "order_aid": pl.UInt32, "probability": pl.Float32}
cart_to_order_matrix_df = (
    pl.read_csv("../matrices/cart_to_order_matrix.csv", schema=df_schema)
    .sort(["cart_aid", "probability"], descending=[False, True])
    .group_by("cart_aid")
    # Take top 300 probabilities
    .agg(pl.all().top_k_by("probability", 300))
    .explode(["order_aid", "probability"])
    .lazy()
)

df_schema = {"aid": pl.UInt32, "incompatible_aid": pl.UInt32}
incompatible_matrix_df = pl.read_csv("../matrices/incompatible_matrix_7.csv", schema=df_schema)

In [39]:
last_unique_aids_of_session = (
    test_sessions_df
    .drop("type")
    # Only last unique clicks
    .with_columns(ts=pl.col("ts").max().over(["session", "aid"]))
    .unique()
    .group_by("session")
    .agg(pl.all().top_k_by("ts", 5))
    .explode(["aid", "ts"])
    .drop("ts")
    .lazy()
)

print(last_unique_aids_of_session.collect(streaming=True))
print(last_unique_aids_of_session.select("session").collect(streaming=True).n_unique())

shape: (967_224, 2)
┌─────────┬─────────┐
│ session ┆ aid     │
│ ---     ┆ ---     │
│ u32     ┆ u32     │
╞═════════╪═════════╡
│ 320624  ┆ 139296  │
│ 320624  ┆ 410862  │
│ 320624  ┆ 69857   │
│ 320624  ┆ 1770988 │
│ 320624  ┆ 600517  │
│ …       ┆ …       │
│ 194125  ┆ 1551213 │
│ 111410  ┆ 1624448 │
│ 111410  ┆ 1517548 │
│ 111410  ┆ 1395928 │
│ 48738   ┆ 1098846 │
└─────────┴─────────┘
514739


In [34]:
unique_carts_or_orders = (
    test_sessions_df
    .filter(pl.col("type") != "clicks")
    .drop("type")
    # Only last unique carts and orders
    .with_columns(ts=pl.col("ts").max().over("session", "aid"))
    .unique()
    .drop("ts")
    .lazy()
)

print(unique_carts_or_orders.collect(streaming=True))
print(unique_carts_or_orders.select("session").collect(streaming=True).n_unique())

shape: (113_401, 2)
┌─────────┬─────────┐
│ session ┆ aid     │
│ ---     ┆ ---     │
│ u32     ┆ u32     │
╞═════════╪═════════╡
│ 48271   ┆ 1781264 │
│ 87180   ┆ 1753460 │
│ 427550  ┆ 109943  │
│ 259526  ┆ 99354   │
│ 108774  ┆ 167934  │
│ …       ┆ …       │
│ 252844  ┆ 1274096 │
│ 362872  ┆ 547109  │
│ 428894  ┆ 132564  │
│ 481801  ┆ 1097691 │
│ 501899  ┆ 477274  │
└─────────┴─────────┘
67602


In [57]:
w_click_to_order = 1
w_cart_to_order = 1

click_to_order_predictions_df = (
    last_unique_aids_of_session
    .join(click_to_cart_matrix_df, left_on="aid", right_on="click_aid", how="left")
    # Fill missing values
    .with_columns(
        order_aid=pl.when(pl.col("cart_aid").is_null()).then(pl.col("aid")).otherwise(pl.col("cart_aid")),
        probability=pl.when(pl.col("probability").is_null()).then(pl.lit(0.0)).otherwise(pl.col("probability"))
    )
    .drop("cart_aid")
    # bias to the recently clicked aids
    .with_columns(bias=pl.when(pl.col("aid") == pl.col("order_aid")).then(1).otherwise(0).cast(pl.Float32))
    .with_columns(probability=(pl.col("probability") + pl.col("bias")))
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum())
    .with_columns(probability=pl.col("probability") * w_click_to_order)
)

cart_to_order_predictions_df = (
    unique_carts_or_orders
    .join(cart_to_order_matrix_df, left_on="aid", right_on="cart_aid", how="left")
    # Fill missing values
    .with_columns(
        order_aid=pl.when(pl.col("order_aid").is_null()).then(pl.col("aid")).otherwise(pl.col("order_aid")),
        probability=pl.when(pl.col("probability").is_null()).then(pl.lit(0.0)).otherwise(pl.col("probability"))
    )
    # bias to the recently carted or ordered aids
    .with_columns(bias=pl.when(pl.col("aid") == pl.col("order_aid")).then(1).otherwise(0).cast(pl.Float32))
    .with_columns(probability=(pl.col("probability") + pl.col("bias")))
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum())
    .with_columns(probability=pl.col("probability") * w_cart_to_order)
)

incompatible_aids_for_sessions = (
    test_sessions_df
    .filter(pl.col("type") != "clicks")
    .drop(["type", "ts"])
    .join(incompatible_matrix_df, on="aid", how="inner")
    .select(["session", "incompatible_aid"])
    .with_columns(bias=pl.lit(-1).cast(pl.Float32))
    .unique()
    .lazy()
)

orders_prediction_df = (
    pl.concat([
        click_to_order_predictions_df,
        cart_to_order_predictions_df
    ])
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum())
    # Filter out incompatible aids
    .join(incompatible_aids_for_sessions, left_on=["session", "order_aid"], right_on=["session", "incompatible_aid"], how="left")
    .with_columns(probability=pl.when(pl.col("bias").is_not_null()).then(pl.col("probability") - pl.col("bias")).otherwise(pl.col("probability")))
    .drop("bias")
    .group_by("session")
    .agg(pl.col("order_aid").top_k_by("probability", 20).cast(str))
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("order_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "order_aid": "labels"})
    .collect(streaming=True)
)

print(orders_prediction_df)

# write to csv
orders_prediction_df.write_csv("out/orders_predictions_1.csv")

# Click to order matrix
# Recall@20: 0.5534242682580278
# MRR@20: 0.599049896606218

# Click to cart matrix
# Recall@20: 0.5997087240693378
# MRR@20: 0.6260726606889974

# Last 5 unique aid clicks, last 5 unique carts or orders of session.
# click_to_cart_matrix_time-decay, cart_to_order_matrix at 1/1 weights. Top 300 probabilities
# Recall@20 score: 0.6542696788860471
# Recall@10 score: 0.6437286305560554
# Recall@5 score: 0.624019312009656
# MRR@20 score: 0.6905953832353601
# MRR@10 score: 0.6895777934357278
# MRR@5 score: 0.6858750496059938

# Last 5 unique aid clicks, last 5 unique carts or orders of session.
# click_to_cart_matrix_time-decay, cart_to_order_matrix at 0.7/0.3 weights. Top 300 probabilities
# Recall@20 score: 0.6550156294401819
# MRR@20 score: 0.6835478620037696

# Last 5 unique aid events, all unique carts or orders of session.
# click_to_cart_matrix_time-decay, cart_to_order_matrix at 1/1 weights. Top 300 probabilities
# Recall@20 score: 0.677536231884058
# Recall@10 score: 0.659744466438726
# Recall@5 score: 0.6256412190706095
# MRR@20 score: 0.6910628091208602
# MRR@10 score: 0.6900476892384084
# MRR@5 score: 0.6860281195079151

# Incompatible matrix makes the Recall slightly better. MRR is a bit more better.

# Incompatible matrix top 2 % of most ordered aids and Jaccard similarity of over 0.5. Incompatible aids removed
# Recall@20 score: 0.67718101733447
# MRR@20 score: 0.6911231521015865

# Incompatible matrix top 2 % of most ordered aids and Jaccard similarity of over 0.5. Negative bias -0.5
# Recall@20 score: 0.6773231031543052
# MRR@20 score: 0.6913478922666909

# Incompatible matrix top 2 % of most ordered aids and Jaccard similarity of over 0.5. Negative bias -1.0
# Recall@20 score: 0.6773231031543052
# MRR@20 score: 0.6913698595275813

# Incompatible matrix top 2 % of most ordered aids and Jaccard similarity of over 0.8. Negative bias -1.0
# Recall@20 score: 0.677536231884058
# MRR@20 score: 0.6913294719369799

# Incompatible matrix top 2 % of most ordered aids and Jaccard similarity of over 0.8. Bought together <= 2 times.  Negative bias -1.0
# Recall@20 score: 0.6775717533390168, Best
# MRR@20 score: 0.6914304417337004, Best

shape: (514_739, 2)
┌───────────────┬─────────────────────────────────┐
│ session_type  ┆ labels                          │
│ ---           ┆ ---                             │
│ str           ┆ str                             │
╞═══════════════╪═════════════════════════════════╡
│ 61615_orders  ┆ 1804837 1390616 1372233 68100 … │
│ 214466_orders ┆ 1256008 1576358 1738086 102397… │
│ 440329_orders ┆ 1048396 1508578 1621621 114924… │
│ 492875_orders ┆ 1006218 718253 457356 1519479 … │
│ 460485_orders ┆ 827442 1607793 1741703 134080 … │
│ …             ┆ …                               │
│ 76540_orders  ┆ 463024 1261705 427375 193639 6… │
│ 223463_orders ┆ 581724 196279 712132 580453 10… │
│ 8777_orders   ┆ 459429 1850239 1064857 879909 … │
│ 255990_orders ┆ 330565 1852471 69854 131957 84… │
│ 263317_orders ┆ 202312 148455 289721 1000942    │
└───────────────┴─────────────────────────────────┘


### Predicting recent clicks as orders

In [24]:
asdf = (
    test_sessions_df
    # .filter(pl.col("type") == "clicks")
    .drop("type")
    # Only last unique clicks
    .with_columns(ts=pl.col("ts").max().over(["session", "aid"]))
    .unique()
    .group_by("session")
    .agg(pl.all().top_k_by("ts", 5).cast(str))
    .drop("ts")
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "aid": "labels"})
)

print(asdf)

asdf.write_csv("out/orders_predictions_2.csv")

# Last 5 clicks: 0.4877095765842569
# Last 10 clicks: 0.5257530548451265
# Last 20 clicks: 0.5434427394146064

shape: (512_800, 2)
┌───────────────┬─────────────────────────────────┐
│ session_type  ┆ labels                          │
│ ---           ┆ ---                             │
│ str           ┆ str                             │
╞═══════════════╪═════════════════════════════════╡
│ 52058_orders  ┆ 125197                          │
│ 121149_orders ┆ 1798580 1278671                 │
│ 276712_orders ┆ 1371576                         │
│ 494721_orders ┆ 589670                          │
│ 336898_orders ┆ 698518 1754984                  │
│ …             ┆ …                               │
│ 374337_orders ┆ 1657938                         │
│ 205769_orders ┆ 1401487                         │
│ 166838_orders ┆ 1152354 198344                  │
│ 193851_orders ┆ 389999                          │
│ 11790_orders  ┆ 408250 169841 811084 950718 13… │
└───────────────┴─────────────────────────────────┘


### Predicting recent carts as orders

In [21]:
asdf = (
    test_sessions_df
    .filter(pl.col("type") == "carts")
    .drop("type")
    # Only last unique clicks
    .with_columns(ts=pl.col("ts").max().over(["session", "aid"]))
    .unique()
    .group_by("session")
    .agg(pl.all().top_k_by("ts", 20).cast(str))
    .drop("ts")
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "aid": "labels"})
)

print(asdf)

asdf.write_csv("out/orders_predictions_2.csv")

# Last 5 carts: 0.39567348678601877
# Last 10 carts: 0.4154234157431088
# Last 20 carts: 0.42043194089229896

shape: (66_549, 2)
┌───────────────┬─────────────────┐
│ session_type  ┆ labels          │
│ ---           ┆ ---             │
│ str           ┆ str             │
╞═══════════════╪═════════════════╡
│ 299128_orders ┆ 512176          │
│ 450288_orders ┆ 1183627         │
│ 472951_orders ┆ 537304          │
│ 227167_orders ┆ 1744908 1043508 │
│ 274613_orders ┆ 853430          │
│ …             ┆ …               │
│ 35111_orders  ┆ 98647           │
│ 185187_orders ┆ 1755112         │
│ 503_orders    ┆ 630151          │
│ 113512_orders ┆ 1769518 15906   │
│ 435964_orders ┆ 1278149         │
└───────────────┴─────────────────┘


### Predicting recent orders as orders

In [29]:
asdf = (
    test_sessions_df
    .filter(pl.col("type") == "orders")
    .drop("type")
    # Only last unique clicks
    .with_columns(ts=pl.col("ts").max().over(["session", "aid"]))
    .unique()
    .group_by("session")
    .agg(pl.all().top_k_by("ts", 20).cast(str))
    .drop("ts")
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "aid": "labels"})
)

print(asdf)

asdf.write_csv("out/orders_predictions_2.csv")

# Last 5 orders: 0.01424410343847684
# Last 10 clicks: 0.014812446717817562
# Last 20 clicks: 0.014812446717817562

shape: (3_948, 2)
┌───────────────┬────────────────────────────────┐
│ session_type  ┆ labels                         │
│ ---           ┆ ---                            │
│ str           ┆ str                            │
╞═══════════════╪════════════════════════════════╡
│ 280657_orders ┆ 426015                         │
│ 491946_orders ┆ 1021110 648355 1438538 1169176 │
│ 91102_orders  ┆ 19474                          │
│ 349224_orders ┆ 524871 1001247                 │
│ 77877_orders  ┆ 273918                         │
│ …             ┆ …                              │
│ 83093_orders  ┆ 1158875                        │
│ 348911_orders ┆ 404498                         │
│ 53856_orders  ┆ 333343                         │
│ 23708_orders  ┆ 267450 655799                  │
│ 428363_orders ┆ 1495817                        │
└───────────────┴────────────────────────────────┘
