# Orders prediction model

In [1]:
import polars as pl

# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.Int32, "ts": pl.Int64, "type": str})
df_schema = {"session": pl.Int32, "events": pl.List(event_schema)}

test_sessions_df = (
    pl.read_ndjson("../data/test/test_sessions.jsonl", schema=df_schema, low_memory=True)
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
)

total_sessions = (
    test_sessions_df
    .select("session")
    .n_unique()
)
print(total_sessions)

921704


In [13]:
df_schema = {"click_aid": pl.Int32, "order_aid": pl.Int32, "probability": pl.Float32}
click_to_order_matrix_df = (
    pl.scan_csv("../matrices/click_to_order_matrix_whole_sessions_only_next_order_last_1h_max_5.csv", schema=df_schema)
)

df_schema = {"cart_aid": pl.Int32, "order_aid": pl.Int32, "probability": pl.Float32}
cart_to_order_matrix_df = (
    pl.scan_csv("../matrices/cart_to_order_matrix_whole_sessions_time_decay.csv", schema=df_schema)
)

df_schema = {"order_aid": pl.Int32, "next_order_aid": pl.Int32, "probability": pl.Float32}
order_to_order_matrix_df = (
    pl.scan_csv("../matrices/order_to_order_matrix_whole_sessions_time_decay.csv", schema=df_schema)
)

df_schema = {"aid": pl.Int32, "incompatible_aid": pl.Int32}
incompatible_aids = (
    pl.scan_csv("../matrices/incompatible_matrix_6_1_08.csv", schema=df_schema)
)

In [11]:
asdf = (
    test_sessions_df
    .with_columns(max_ts=(pl.col("ts").max().over("session")))
    .filter((pl.col("type") == "carts") & (pl.col("ts") == pl.col("max_ts")))
)

print(asdf)

shape: (51_285, 5)
┌──────────┬─────────┬────────────┬───────┬────────────┐
│ session  ┆ aid     ┆ ts         ┆ type  ┆ max_ts     │
│ ---      ┆ ---     ┆ ---        ┆ ---   ┆ ---        │
│ i32      ┆ i32     ┆ u32        ┆ str   ┆ u32        │
╞══════════╪═════════╪════════════╪═══════╪════════════╡
│ 12899787 ┆ 1682750 ┆ 1661724042 ┆ carts ┆ 1661724042 │
│ 12899790 ┆ 1219653 ┆ 1661724002 ┆ carts ┆ 1661724002 │
│ 12899803 ┆ 1651971 ┆ 1661724817 ┆ carts ┆ 1661724817 │
│ 12899841 ┆ 235568  ┆ 1661724022 ┆ carts ┆ 1661724022 │
│ 12899846 ┆ 1697138 ┆ 1661724848 ┆ carts ┆ 1661724848 │
│ …        ┆ …       ┆ …          ┆ …     ┆ …          │
│ 14571257 ┆ 990048  ┆ 1662328546 ┆ carts ┆ 1662328546 │
│ 14571331 ┆ 843823  ┆ 1662328549 ┆ carts ┆ 1662328549 │
│ 14571349 ┆ 683653  ┆ 1662328564 ┆ carts ┆ 1662328564 │
│ 14571403 ┆ 614044  ┆ 1662328599 ┆ carts ┆ 1662328599 │
│ 14571534 ┆ 272221  ┆ 1662328720 ┆ carts ┆ 1662328720 │
└──────────┴─────────┴────────────┴───────┴────────────┘


In [14]:
clicks_after_last_cart_or_order = (
    test_sessions_df
    .join(
        (
            test_sessions_df
            .filter(pl.col("type") != "clicks")
            .group_by("session")
            .agg(pl.col("ts").max().alias("max_ts"))
        ),
        on="session",
        how="left"
    )
    .filter(pl.when(pl.col("max_ts").is_not_null()).then(pl.col("ts") > pl.col("max_ts")).otherwise(True))
    .drop(["type", "ts", "max_ts"])
    .lazy()
)

# last_n_clicks_of_session = (
#     test_sessions_df
#     .filter(pl.col("type") == "clicks")
#     .group_by("session")
#     .agg(pl.col("aid").top_k_by("ts", 15))
#     .explode("aid")
#     .lazy()
# )

# Weight for click-to-cart and cart-to-cart predictions
w_click_to_order = 1
w_cart_to_order = 1
w_order_to_order = 1

click_to_order_predictions_df = (
    clicks_after_last_cart_or_order
    .join(click_to_order_matrix_df, left_on="aid", right_on="click_aid", how="inner")
    .drop("aid")
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum().alias("probability"))
    .rename({"order_aid": "next_order_aid"})
    # Normalize probabilities
    .with_columns(pl.col("probability") / pl.col("probability").max().over("session"))
    .with_columns(pl.col("probability") * w_click_to_order)
)

# print(click_to_order_predictions_df.collect(streaming=True))

cart_to_order_predictions_df = (
    test_sessions_df
    .lazy()
    .filter(pl.col("type") == "carts")
    .drop(["type", "ts"])
    .join(cart_to_order_matrix_df, left_on="aid", right_on="cart_aid", how="inner")
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum().alias("probability"))
    .rename({"order_aid": "next_order_aid"})
    # Normalize probabilities
    .with_columns(pl.col("probability") / pl.col("probability").max().over("session"))
    .with_columns(pl.col("probability") * w_cart_to_order)
)

# print(cart_to_order_predictions_df.collect(streaming=True))

order_to_order_predictions_df = (
    test_sessions_df
    .lazy()
    .filter(pl.col("type") == "orders")
    .drop(["type", "ts"])
    .join(order_to_order_matrix_df, left_on="aid", right_on="order_aid", how="inner")
    .group_by(["session", "next_order_aid"])
    .agg(pl.col("probability").sum().alias("probability"))
    .with_columns(pl.col("probability") * w_order_to_order)
)

incompatible_aids_for_sessions = (
    test_sessions_df
    .filter(pl.col("type") == "orders")
    .drop(["type", "ts"])
    .lazy()
    .join(incompatible_aids, on="aid", how="semi")
    # .group_by("session")
    # .agg(pl.col("aid"))
    .rename({"aid": "incompatible_aid"})
)

# print(incompatible_aids_for_sessions.collect(streaming=True))

orders_prediction_df = (
    pl.concat([
        click_to_order_predictions_df,
        cart_to_order_predictions_df,
        order_to_order_predictions_df
    ])
    .group_by(["session", "next_order_aid"])
    # 1/1/1 weight for click-to-order and cart-to-order and order-to-order predictions
    .agg(pl.col("probability").sum().alias("probability"))
    # Filter incompatible aids
    .join(incompatible_aids_for_sessions, left_on=["session", "next_order_aid"], right_on=["session", "incompatible_aid"], how="anti")
    # Form the prediction csv
    .group_by("session")
    .agg(pl.col("next_order_aid").top_k_by("probability", 20).cast(str))
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("next_order_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "next_order_aid": "labels"})
    .collect(streaming=True)
)

print(orders_prediction_df)
orders_prediction_df.write_csv("out/orders_predictions_2.csv")


# Old best
# W 1/1/1 all clicks Score: 0.5892841262783459 Best

# Normalized probabilities
# Current best performing with less computation than the overall best
# W 1/1/1 last clicks after last cart or order. Score: 0.5853045798132503

# Incompatibility is evaluated from carted items
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 5 3 03. Score: 0.37232103156958646
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 5 2 05. Score: 0.38412627834593155
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 5 2 08. Score: 0.39075144508670523
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 6 1 08. Score: 0.4303245887060916
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 6 0 08. Score: 0.43050244553134726
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 10 0 08. Score: 0.48523788350377944

# Incompatibility is evaluated from ordered items
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 10 0 08. Score: 0.5805469097376612
# W 1/1/1 last clicks after last cart or order with incompatibility matrix 6 1 08. Score:


# Sessions of length 1 (~450 000 sessions)
# W 1/2 Score: 0.07632281013783904
# Score / session: 0.00000017

# Sessions of length 2 (~165 000 sessions)
# W 1/2 Score: 0.08114717652289907
# Score / session: 0.00000049

# Sessions of length 3 (~85 000 sessions)
# W 1/2 Score: 0.05282347710093375
# Score / session: 0.00000062

# Sessions of length 4 (~51 000 sessions)
# W 1/2 Score: 0.041373943975100046
# Score / session: 0.00000081

# Sessions of length 5 (~34 000 sessions)
# W 1/2 Score: 0.03301467318808359
# Score / session: 0.00000097

# Sessions of length > 5 (~126 000 sessions)
# W 1/2 Score: 0.29584259670964874
# Score / session: 0.00000235

shape: (876_073, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 14462464_orders ┆ 1202869 1223009 935274 942372 … │
│ 13957022_orders ┆ 1670668 1773877 196121 286775 … │
│ 13860135_orders ┆ 327596 1386268 1639897 25824 3… │
│ 13492991_orders ┆ 1459553 1798852 413507 788341 … │
│ 13252968_orders ┆ 545073 804935 893799 957699 18… │
│ …               ┆ …                               │
│ 13896169_orders ┆ 1358765 691508 301951 21029 16… │
│ 14491522_orders ┆ 1788265 499919 982192 1221201 … │
│ 13934168_orders ┆ 1020072 1007647 1484213 461777… │
│ 14415488_orders ┆ 619351 330291 312043 206257 82… │
│ 13290434_orders ┆ 1460571 811371 944778 959548 4… │
└─────────────────┴─────────────────────────────────┘


In [6]:
df_schema = {"cart_aid": pl.Int32, "order_aid": pl.Int32, "probability": pl.Float32}
cart_to_order_matrix_df = (
    pl.scan_csv("../matrices/cart_to_order_matrix_whole_sessions_time_decay.csv", schema=df_schema)
    # .group_by("cart_aid")
    # .agg(pl.col("order_aid").top_k_by("probability", 200), pl.col("probability").top_k(200))
    # .explode(["order_aid", "probability"])
)


asdf = (
     test_sessions_df
    .lazy()
    .filter(pl.col("type") == "carts")
    .join(cart_to_order_matrix_df, left_on="aid", right_on="cart_aid", how="inner")
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum().alias("probability"))
    .group_by("session")
    .agg(pl.col("order_aid").top_k_by("probability", 20).cast(str))
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("order_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "order_aid": "labels"})
    .collect(streaming=True)
)

print(asdf)

asdf.write_csv("out/orders_predictions_1.csv")

# Suggesting only carts score: 0.4196754112939084
# Suggesting only orders score: 0.014673188083592708
# Suggesting carts and orders: 0.4210760337927968
# Suggesting carts that are not ordered: 0.40764784348599375

# Cart to order whole sessions: 0.4094486438417074
# Cart to order whole sessions time decay: 0.4131391729657626

shape: (119_254, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 14065428_orders ┆ 1225920 958130 457427 1725080 … │
│ 14527610_orders ┆ 537933 407807 906597 1263754 2… │
│ 13901698_orders ┆ 1552199 783124 459324 1020683 … │
│ 13071842_orders ┆ 1725892 887666 200303 699166 1… │
│ 13869305_orders ┆ 1600635 555409 1845885 171073 … │
│ …               ┆ …                               │
│ 14331603_orders ┆ 1233404 1493824 1025795 983350… │
│ 14276690_orders ┆ 478989 1036259 1160725 1252158… │
│ 12970081_orders ┆ 275108 370526 752426 566228 10… │
│ 14440527_orders ┆ 211707 731547 1521334 733732 6… │
│ 13001923_orders ┆ 862435 802958 675366 1746164 1… │
└─────────────────┴─────────────────────────────────┘


In [4]:
# last_clicks_after_last_cart_or_order = (
#     test_sessions_df
#     .join(
#         (
#             test_sessions_df
#             .filter(pl.col("type") != "clicks")
#             .with_columns(pl.col("ts").max().over("session").alias("max_ts"))
#             .drop(["aid", "ts", "type"])
#         ),
#         on="session",
#         how="left"
#     )
#     .filter(pl.when(pl.col("max_ts").is_not_null()).then(pl.col("ts") > pl.col("max_ts")).otherwise(True))
#     .drop(["type", "max_ts"])
#     # .with_columns(pl.col("ts").max().over("session").alias("max_ts"))
#     # .with_columns(time_between_min = ((pl.col("max_ts") - pl.col("ts")) / 60).cast(pl.Float32))
#     # .filter(pl.col("time_between_min") < 2*60) # last 2 hours
#     # .group_by("session")
#     # .agg(pl.col("aid").top_k_by("ts", 20)) # last 20 clicks
#     # .explode("aid")
#     .lazy()
# )

last_n_clicks_of_session = (
    test_sessions_df
    .filter(pl.col("type") == "clicks")
    .group_by("session")
    .agg(pl.col("aid").top_k_by("ts", 15))
    .explode("aid")
    .lazy()
)

# last_n_minutes_of_session = (
#     test_sessions_df
#     .filter(pl.col("type") == "clicks")
#     .with_columns(max_ts = (pl.col("ts").max().over("session")))
#     .filter(pl.col("ts") > (pl.col("max_ts") - 2*60*60)) # last 2 h
#     .lazy()
# )

df_schema = {"click_aid": pl.Int32, "order_aid": pl.Int32, "probability": pl.Float32}
click_to_order_matrix_df = (
    pl.scan_csv("../matrices/click_to_order_matrix_whole_sessions_only_next_order_last_1h_max_20.csv", schema=df_schema)
)

asdf = (
    last_n_clicks_of_session
    .join(click_to_order_matrix_df, left_on="aid", right_on="click_aid", how="inner")
    .group_by(["session", "order_aid"])
    .agg(pl.col("probability").sum().alias("probability"))
    .group_by("session")
    .agg(pl.col("order_aid").top_k_by("probability", 20).cast(str))
    .with_columns(
        (pl.col("session").cast(str) + "_orders"),
        pl.col("order_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "order_aid": "labels"})
    .collect(streaming=True)
)

print(asdf)

asdf.write_csv("out/orders_predictions_1.csv")

# Clicks after last cart or order
# Last 10 clicks or last 2 hours: 0.08285904846598488
# Last 5 clicks or last 2 hours: 0.08236994219653179
# Last 20 clicks or last 2 hours: 0.08299244108492664
# All last clicks: 0.08325922632281013
# All last clicks no time decay matrix: 0.08668297020898177
# All last clicks and click to order matrix of the whole sessions: 0.05538016896398399

# All clicks
# Last 20 clicks of session and click to order matrix of the whole sessions: 0.09959982214317474
# Last 15 clicks of session and click to order matrix of the whole sessions: 0.12152067585593597
# Last 10 clicks of session and click to order matrix of the whole sessions: 0.12100933748332593
# Last 5 clicks of session and click to order matrix of the whole sessions: 0.11867496665184527
# Last 10 clicks of session and click to order matrix of the whole sessions only next order: 0.1318363717207648
# Last 15 clicks of session and click to order matrix of the whole sessions only next order: 0.13357047576700756
# Last 20 clicks of session and click to order matrix of the whole sessions only next order: 0.1331702979101823

# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 1h: 0.17772343263672744

# No limit on matrix top k probabilities
# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 1h: 0.5184971098265896
# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 1h 30m: 0.5174077367718987
# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 2h: 0.5173188083592708
# Last 5 clicks of session and click to order matrix of the whole sessions only next order last 1h: 0.47712316585148956

# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 1h or max 5 clicks: 0.517741218319253
# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 1h or max 10 clicks: 0.5207647843485994
# Last 15 clicks of session and click to order matrix of the whole sessions only next order last 1h or max 20 clicks: 0.5210315695864829 Best

shape: (891_703, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 13540303_orders ┆ 1618626 1085367 1511908 172675… │
│ 14035801_orders ┆ 898905 1065138 310482 1022052 … │
│ 13701547_orders ┆ 1286367 1413049 576116 1498443… │
│ 13548303_orders ┆ 759163 496603 351986 113750 30… │
│ 13161095_orders ┆ 1286213 500609 903949 989193 1… │
│ …               ┆ …                               │
│ 14047683_orders ┆ 297149 1609228 14147 1603088 4… │
│ 13573050_orders ┆ 571090 1176975 1283268 1532039… │
│ 12941246_orders ┆ 298730 533518 1365247 196291    │
│ 13151347_orders ┆ 91289 1202273 368076 690631 55… │
│ 14517082_orders ┆ 1706657 1674266 1228852 162441… │
└─────────────────┴─────────────────────────────────┘
