# Carts prediction model

In [18]:
from pathlib import Path

import polars as pl


# Input layout: one JSON object per line with a session id and a list of
# event structs (aid, ts, type).
event_schema = pl.Struct({"aid": pl.Int32, "ts": pl.Int64, "type": str})
df_schema = {"session": pl.Int32, "events": pl.List(event_schema)}

# Flatten to one row per event: explode the event list, then unnest the struct
# fields into top-level columns.
test_sessions_df = (
    pl.read_ndjson(
        "../data/test/test_sessions.jsonl",
        schema=df_schema,
        low_memory=True,
    )
    .explode("events")
    .unnest("events")
    # Integer-divide ts by 1000 (conversion to seconds) and store it as UInt32
    # to save memory.
    .with_columns(ts=(pl.col("ts") // 1000).cast(pl.UInt32))
)

print(test_sessions_df)

# Number of distinct sessions present in the test set.
total_sessions = test_sessions_df.get_column("session").n_unique()
print(total_sessions)

shape: (3_086_540, 4)
┌──────────┬─────────┬────────────┬────────┐
│ session  ┆ aid     ┆ ts         ┆ type   │
│ ---      ┆ ---     ┆ ---        ┆ ---    │
│ i32      ┆ i32     ┆ u32        ┆ str    │
╞══════════╪═════════╪════════════╪════════╡
│ 12899780 ┆ 1142000 ┆ 1661724000 ┆ clicks │
│ 12899781 ┆ 141736  ┆ 1661724000 ┆ clicks │
│ 12899781 ┆ 199008  ┆ 1661724022 ┆ clicks │
│ 12899781 ┆ 57315   ┆ 1661724170 ┆ clicks │
│ 12899781 ┆ 194067  ┆ 1661724246 ┆ clicks │
│ …        ┆ …       ┆ …          ┆ …      │
│ 14571534 ┆ 272221  ┆ 1662328716 ┆ carts  │
│ 14571534 ┆ 272221  ┆ 1662328720 ┆ carts  │
│ 14571539 ┆ 317311  ┆ 1662328717 ┆ clicks │
│ 14571547 ┆ 1546409 ┆ 1662328727 ┆ clicks │
│ 14571548 ┆ 1453906 ┆ 1662328728 ┆ clicks │
└──────────┴─────────┴────────────┴────────┘
921704


In [31]:
# Click -> cart transition matrix.
# Rows are sorted by descending probability inside each click_aid, then the
# first 700 rows of every group are kept, i.e. the 700 most probable cart
# candidates per clicked item (relies on group_by keeping the sorted row order
# within a group).
df_schema = {"click_aid": pl.Int32, "cart_aid": pl.Int32, "probability": pl.Float32}
click_to_cart_matrix_df = (
    pl.read_csv("../matrices/click_to_cart_matrix_time_decay.csv", schema=df_schema)
    .sort("click_aid", "probability", descending=[False, True])
    .group_by("click_aid")
    .agg(pl.col("cart_aid", "probability").head(700))
    .explode("cart_aid", "probability")
    .lazy()
)

# Cart -> next-cart transition matrix, scanned lazily.
# Unlike the click matrix, no per-item truncation is applied here.
df_schema = {"cart_aid": pl.Int32, "next_cart_aid": pl.Int32, "probability": pl.Float32}
cart_to_cart_matrix_df = pl.scan_csv(
    "../matrices/cart_to_cart_matrix_time_decay.csv",
    schema=df_schema,
)

# Order -> cart transition matrix, scanned lazily and likewise used untruncated.
df_schema = {"order_aid": pl.Int32, "cart_aid": pl.Int32, "probability": pl.Float32}
order_to_cart_matrix_df = pl.scan_csv(
    "../matrices/order_to_cart_matrix_time_decay.csv",
    schema=df_schema,
)

In [3]:
# Collect, for every session, the aids listed as incompatible with the items
# that session has carted.
df_schema = {"aid": pl.Int32, "incompatible_aid": pl.Int32}

# One row per aid, carrying the list of aids it is incompatible with.
incompatible_matrix_df = (
    pl.scan_csv("../matrices/incompatible_matrix_3_15.csv", schema=df_schema)
    .group_by("aid")
    .agg(pl.col("incompatible_aid"))
)

incompatible_aids_for_sessions = (
    test_sessions_df
    .lazy()
    # Distinct (session, aid) pairs of carted items.
    .filter(pl.col("type") == "carts")
    .select("session", "aid")
    .unique()
    # Attach each carted item's incompatibility list; items without an entry
    # in the matrix come back null and are discarded.
    .join(incompatible_matrix_df, on="aid", how="left")
    .filter(pl.col("incompatible_aid").is_not_null())
    .select("session", "incompatible_aid")
    # One row per (session, incompatible aid).
    .explode("incompatible_aid")
)

In [69]:
# Split every test session into sub-sessions at long pauses and keep only the
# most recent sub-session for prediction.

# A pause longer than this many minutes starts a new sub-session.
SUB_SESSION_GAP_MIN = 120

# Events ordered newest-first inside each session, so the row that follows is
# the previous (older) event.  `time_between_min` is the pause between an event
# and that older event, or 0 when the following row belongs to another session
# (or does not exist).
events_with_gaps_df = (
    test_sessions_df
    .sort(["session", "ts"], descending=[False, True])
    .with_columns(
        next_session=pl.col("session").shift(-1),
        next_ts=pl.col("ts").shift(-1),
    )
    .with_columns(
        time_between_min=pl.when(pl.col("next_session") == pl.col("session"))
        # ts is UInt32 and both branches are evaluated for every row, so the
        # difference is taken in Int64: rows straddling two sessions (masked
        # to 0 below) can then never wrap around.
        .then((pl.col("ts").cast(pl.Int64) - pl.col("next_ts").cast(pl.Int64)) / 60)
        .otherwise(0)
        .cast(pl.Float32)
    )
    .with_row_index()
)

# For each session that contains a long pause: the row index of the first
# (most recent) event followed by such a pause.  That event is the oldest
# member of the last sub-session.
sub_session_boundaries = (
    events_with_gaps_df
    .filter(pl.col("time_between_min") > SUB_SESSION_GAP_MIN)
    .group_by("session")
    .agg(pl.col("index").min().alias("cut_index"))
)

# Keep whole sessions that have no long pause, otherwise only the rows up to
# and including the cut row; then restore the original column set.
sub_sessions_df = (
    events_with_gaps_df
    .join(sub_session_boundaries, on="session", how="left")
    .filter(pl.col("cut_index").is_null() | (pl.col("index") <= pl.col("cut_index")))
    .select(test_sessions_df.columns)
)

print(sub_sessions_df)

shape: (2_701_298, 4)
┌──────────┬─────────┬────────────┬────────┐
│ session  ┆ aid     ┆ ts         ┆ type   │
│ ---      ┆ ---     ┆ ---        ┆ ---    │
│ i32      ┆ i32     ┆ u32        ┆ str    │
╞══════════╪═════════╪════════════╪════════╡
│ 12899780 ┆ 1142000 ┆ 1661724000 ┆ clicks │
│ 12899781 ┆ 194067  ┆ 1661724246 ┆ clicks │
│ 12899781 ┆ 57315   ┆ 1661724170 ┆ clicks │
│ 12899781 ┆ 199008  ┆ 1661724022 ┆ clicks │
│ 12899781 ┆ 141736  ┆ 1661724000 ┆ clicks │
│ …        ┆ …       ┆ …          ┆ …      │
│ 14571534 ┆ 272221  ┆ 1662328716 ┆ carts  │
│ 14571534 ┆ 272221  ┆ 1662328709 ┆ clicks │
│ 14571539 ┆ 317311  ┆ 1662328717 ┆ clicks │
│ 14571547 ┆ 1546409 ┆ 1662328727 ┆ clicks │
│ 14571548 ┆ 1453906 ┆ 1662328728 ┆ clicks │
└──────────┴─────────┴────────────┴────────┘


In [27]:
# Seed clicks for the click -> cart model: for every session, the (up to) 5
# most recent events that occur strictly after the session's last cart/order
# event.  Sessions without any cart/order contribute their 5 most recent events.
#
# The result is cached as CSV.  It is rebuilt from `test_sessions_df` only when
# the cache file is missing, so a fresh "Restart & Run All" works without a
# pre-existing temp file.  Delete the file to force a rebuild.
LAST_CLICKS_CACHE = Path("temp/last_5_clicks_after_last_cart_or_order.csv")

if not LAST_CLICKS_CACHE.exists():
    # Timestamp of the last non-click event per session (absent for sessions
    # that contain clicks only).
    last_cart_or_order_ts = (
        test_sessions_df
        .filter(pl.col("type") != "clicks")
        .group_by("session")
        .agg(pl.col("ts").max().alias("max_ts"))
    )

    LAST_CLICKS_CACHE.parent.mkdir(parents=True, exist_ok=True)
    (
        test_sessions_df
        .join(last_cart_or_order_ts, on="session", how="left")
        # Keep every event of click-only sessions, otherwise only the events
        # newer than the last cart/order.
        .filter(pl.col("max_ts").is_null() | (pl.col("ts") > pl.col("max_ts")))
        .group_by("session")
        # 5 most recent aids per session
        .agg(pl.col("aid").top_k_by("ts", 5))
        .explode("aid")
        .sort(["session", "aid"])
        .write_csv(LAST_CLICKS_CACHE)
    )

# Always read back through the cache so both paths yield the same lazy frame.
df_schema = {"session": pl.Int32, "aid": pl.Int32}
clicks_after_last_cart_or_order = pl.scan_csv(LAST_CLICKS_CACHE, schema=df_schema)

In [32]:
# Blend weights: the summed probability of each candidate source is scaled by
# its weight before the three sources are added together.
w_click_to_cart = 0.7
w_cart_to_cart = 0.3
w_order_to_cart = 0.3


def _weighted_cart_candidates(seed_events, transition_matrix, source_col, target_col, weight):
    """Score cart candidates per session from one kind of seed event.

    Args:
        seed_events: LazyFrame with at least `session` and `aid` columns.
        transition_matrix: LazyFrame with `source_col`, `target_col` and
            `probability` columns.
        source_col: matrix column matched against the seed `aid`.
        target_col: matrix column holding the candidate aid.
        weight: factor applied to the summed probability.

    Returns:
        LazyFrame with columns `session`, `next_cart_aid`, `probability`,
        one row per (session, candidate) pair.
    """
    return (
        seed_events
        .join(transition_matrix, left_on="aid", right_on=source_col, how="inner")
        .group_by(["session", target_col])
        .agg(pl.col("probability").sum())
        .select(
            pl.col("session"),
            pl.col(target_col).alias("next_cart_aid"),
            pl.col("probability") * weight,
        )
    )


# Seeds: the cached clicks that follow each session's last cart/order.
# Alternative seed set tried earlier (clicks of the last 6 hours):
#     test_sessions_df
#     .with_columns(pl.col("ts").max().over("session").alias("max_ts"))
#     .filter(pl.col("max_ts") - pl.col("ts") <= 6*60*60)
#     .lazy()
#     .filter(pl.col("type") == "clicks")
click_to_cart_predictions_df = _weighted_cart_candidates(
    clicks_after_last_cart_or_order,
    click_to_cart_matrix_df,
    source_col="click_aid",
    target_col="cart_aid",
    weight=w_click_to_cart,
)

# Seeds: every cart event of the session.
cart_to_cart_predictions_df = _weighted_cart_candidates(
    test_sessions_df.lazy().filter(pl.col("type") == "carts"),
    cart_to_cart_matrix_df,
    source_col="cart_aid",
    target_col="next_cart_aid",
    weight=w_cart_to_cart,
)

# Seeds: every order event of the session.
order_to_cart_predictions_df = _weighted_cart_candidates(
    test_sessions_df.lazy().filter(pl.col("type") == "orders"),
    order_to_cart_matrix_df,
    source_col="order_aid",
    target_col="cart_aid",
    weight=w_order_to_cart,
)

weighted_candidates = pl.concat(
    [
        click_to_cart_predictions_df,
        cart_to_cart_predictions_df,
        order_to_cart_predictions_df,
    ]
)

carts_prediction_df = (
    weighted_candidates
    # Optional incompatibility filter, currently disabled (the logged
    # "incompatibility" runs below score lower than the unfiltered run):
    # .join(incompatible_aids_for_sessions, left_on=["session", "next_cart_aid"], right_on=["session", "incompatible_aid"], how="anti")
    # Add up the already-weighted scores a candidate received from each source.
    .group_by(["session", "next_cart_aid"])
    .agg(pl.col("probability").sum())
    # Keep the 20 highest-scoring candidates per session, as strings.
    .group_by("session")
    .agg(pl.col("next_cart_aid").top_k_by("probability", 20).cast(str))
    # Shape into the prediction CSV layout: "<session>_carts", "aid aid aid ...".
    .select(
        session_type=pl.col("session").cast(str) + "_carts",
        labels=pl.col("next_cart_aid").list.join(separator=" "),
    )
    .collect(streaming=True)
)


print(carts_prediction_df)
carts_prediction_df.write_csv("out/carts_predictions_1.csv")

# W 50/50 Score: 0.4591558202034041
# W 60/40 Score: 0.45999894609263847
# W 70/30 Score: 0.4603502485464861
# W 80/20 Score: 0.46015000614779294
# Beats baseline

# W 70/30 Score, incompatibility 3 4: 0.445107235074037
# W 70/30 Score, incompatibility 3 5: 0.448233826913281
# W 70/30 Score, incompatibility 3 10: 0.4545326799107692
# W 70/30 Score, incompatibility 3 15: 0.4567458853700093

# W 70/30 all clicks, Score: 0.4685777520155978
# W 70/30 clicks of last 24 hours, Score: 0.46865152553090583
# W 70/30 clicks of last 12 hours, Score: 0.468630447383675
# W 70/30 clicks of last 6 hours, Score: 0.4687463771934447
# W 70/30 clicks of last 3 hours, Score: 0.46860585621190565
# W 70/30 clicks of last 1.5 hours, Score: 0.4686866557762906

# W 70/30 last sub session (30 min) clicks, Score: 0.4676889568073633
# W 70/30 last sub session (60 min) clicks, Score: 0.46803674623667246
# W 70/30 last sub session (90 min) clicks, Score: 0.46830373610159665
# W 70/30 last sub session (120 min) clicks, Score: 0.4684126398622894

# W 70/30 last sub session (120 min) clicks and carts, Score: 0.4677592172981328 Not good

# W 0.7/0.3/0.3 clicks of last 6 hours, Score: 0.4690976796472923 Best
# W 1/1/1 clicks of last 6 hours, Score: 0.4679313555005182
# W 0.7/0.3/0.7 clicks of last 6 hours, Score: 0.46888689817498375
# W 0.7/0.3/0.15 clicks of last 6 hours, Score: 0.4690133670583689

shape: (913_587, 2)
┌────────────────┬─────────────────────────────────┐
│ session_type   ┆ labels                          │
│ ---            ┆ ---                             │
│ str            ┆ str                             │
╞════════════════╪═════════════════════════════════╡
│ 14157304_carts ┆ 237743 269392 1089535 1496876 … │
│ 14096743_carts ┆ 1681550 995962 516917 1766353 … │
│ 13020826_carts ┆ 1611581 331708 987059 1061776 … │
│ 13024104_carts ┆ 1682446 1314879 555599 463771 … │
│ 13157159_carts ┆ 835647 730130 1118842 136530 5… │
│ …              ┆ …                               │
│ 12972969_carts ┆ 562313 1853183 1489709 1294307… │
│ 13912020_carts ┆ 1485427 1171759 425414 1176250… │
│ 13054761_carts ┆ 1366313 681173 786396 1166298 … │
│ 14295045_carts ┆ 1631509 771362 755646 169718 3… │
│ 13471316_carts ┆ 339355 781772 1243979 951559 1… │
└────────────────┴─────────────────────────────────┘
