# Clicks prediction model

In [1]:
import polars as pl

# Every input row is one session holding a list of event structs
# (item id, timestamp, event type).
# NOTE(review): "ts" is read as UInt32 — fine for second-resolution epochs,
# but millisecond timestamps would not fit; confirm against the data file.
event_dtype = pl.Struct({"aid": pl.UInt32, "ts": pl.UInt32, "type": pl.Utf8})
df_schema = {"session": pl.Int32, "events": pl.List(event_dtype)}

test_sessions_df = pl.read_ndjson(
    "../data/test/test_sessions.jsonl",
    schema=df_schema,
    low_memory=True,
)

### Current best model

In [21]:
# Transition matrix stored as one (aid, next_aid, probability) row per pair.
df_schema = {
    "aid": pl.UInt32,
    "next_aid": pl.UInt32,
    "probability": pl.Float32,
}

# Per aid, keep only the 300 transitions with the highest probability.
strongest_transitions = pl.col("next_aid", "probability").top_k_by("probability", 300)

click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv",
        schema=df_schema,
    )
    .group_by("aid")
    .agg(strongest_transitions)
    # Back to one row per (aid, next_aid) pair
    .explode("next_aid", "probability")
    # Handed to the next cell as a LazyFrame so the join there can be streamed
    .lazy()
)

print(click_to_click_matrix_df.collect(streaming=True))

shape: (83_185_024, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ u32     ┆ u32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 1686624 ┆ 1686624  ┆ 0.21457     │
│ 1686624 ┆ 37698    ┆ 0.141719    │
│ 1686624 ┆ 610511   ┆ 0.086511    │
│ 1686624 ┆ 329243   ┆ 0.075128    │
│ 1686624 ┆ 751653   ┆ 0.051224    │
│ …       ┆ …        ┆ …           │
│ 1601765 ┆ 307175   ┆ 0.015015    │
│ 1601765 ┆ 1026595  ┆ 0.015015    │
│ 1601765 ┆ 1247582  ┆ 0.015015    │
│ 1601765 ┆ 710810   ┆ 0.012012    │
│ 1601765 ┆ 1192970  ┆ 0.012012    │
└─────────┴──────────┴─────────────┘


In [22]:
# Current best model: seed every session with its most recent distinct aids,
# look up their successors in the click-to-click matrix and rank the candidates
# by the recency-weighted sum of transition probabilities.
N_SEED_AIDS = 5   # most recent distinct aids per session used as seeds
N_LABELS = 20     # number of predicted aids written per session

last_unique_aids = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Newest events first inside every session
    .sort(["session", "ts"], descending=[False, True])
    # keep="first" pins the surviving row to the newest occurrence of each aid;
    # the default keep="any" gives no guarantee about which duplicate is kept.
    .unique(subset=["session", "aid"], keep="first", maintain_order=True)
    .group_by("session", maintain_order=True)
    # Rows are already newest-first and group_by preserves row order inside a
    # group, so head() yields the most recent aids in recency order.
    # top_k_by() does not guarantee any output order, which the rank-based
    # weights below depend on.
    .agg(pl.col("aid").head(N_SEED_AIDS))
    .explode("aid")
    # Rank decay: 1 for the newest aid of a session, then 1/2, 1/3, ...
    .with_columns(
        weight=(1 / (pl.int_range(pl.len()).over("session") + 1)).cast(pl.Float32)
    )
    .lazy()
)

clicks_predictions_df = (
    last_unique_aids
    .join(click_to_click_matrix_df, on="aid", how="left")
    # Seeds missing from the matrix fall back to predicting themselves
    .with_columns(
        next_aid=pl.col("next_aid").fill_null(pl.col("aid")),
        probability=pl.col("probability").fill_null(0.0),
    )
    # Weighted transition probability, plus a bias of 1 whenever the candidate
    # is the seed aid itself (aids already seen in the session rank first)
    .with_columns(
        probability=(
            pl.col("probability") * pl.col("weight")
            + (pl.col("aid") == pl.col("next_aid")).cast(pl.Float32)
        )
    )
    .collect(streaming=True)
    # A candidate can be reached from several seeds: add the scores up
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    # csv format
    .group_by(pl.col("session"))
    # Explicit descending sort so the label order is deterministic
    # (rank-sensitive metrics such as MRR depend on it)
    .agg(
        pl.col("next_aid")
        .sort_by("probability", descending=True)
        .head(N_LABELS)
        .cast(str)
    )
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=' ').alias("labels"),
    )
)

print(clicks_predictions_df)

# Note: other cells in this notebook write to this same path, so the file
# reflects whichever cell ran last.
clicks_predictions_df.write_csv("out/clicks_predictions.csv")

# Experiment log. The scores below were recorded with the earlier
# top_k_by-based ordering; re-run the evaluation to refresh them.

# Last 3 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5063681710362438

# Last 5 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5080663499792422, Best value
# Recall@10 score: 0.4349925000573709
# MRR@20 score: 0.2589197593876678
# MRR@10 score: 0.2538283202746814

# Last 5 unique aids of any event. Exponential time decay. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5082582817516695

# Last 5 unique aids of any event for the past 45 minutes of session. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5080225394659708

# Last 10 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5085002826821213 Best but not too much when considering the extra computation

# Last 5 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 500 probabilities from matrix
# Recall@20 score: 0.5080308843256415, Not better

# Last 5 unique aids of clicks. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5044363360224644

#####

# Last 5 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay_over-2-event-sessions.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5056880649730774, Filtering out short sessions is not helping

# Last 5 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay_over-3-event-sessions.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5022583276483977

#####

# Last 5 unique aids of any event. click-to-click-matrix_only-clicks_5-subsequent-clicks-time-decay_under-50-event-sessions.csv Top 300 probabilities from matrix
# Recall@20 score: 0.5068772074761598 Not better

shape: (514_739, 2)
┌───────────────┬─────────────────────────────────┐
│ session_type  ┆ labels                          │
│ ---           ┆ ---                             │
│ str           ┆ str                             │
╞═══════════════╪═════════════════════════════════╡
│ 212230_clicks ┆ 391802 1429965 665403 265913 2… │
│ 173183_clicks ┆ 1526607 1163166 11977 1743151 … │
│ 39610_clicks  ┆ 1722264 1008582 395095 1531348… │
│ 131367_clicks ┆ 83856 585186 493740 252993 165… │
│ 52963_clicks  ┆ 1749592 1115704 8602 1351387 7… │
│ …             ┆ …                               │
│ 149549_clicks ┆ 105393 1452081 1073464 1488751… │
│ 478676_clicks ┆ 662924 751283 1776861 1563129 … │
│ 167374_clicks ┆ 1835122 1674064 802541 751174 … │
│ 459169_clicks ┆ 286897 71364 967898 1310373 37… │
│ 63294_clicks  ┆ 1060697 32249 824487 803544 48… │
└───────────────┴─────────────────────────────────┘


### Predict the next click from the last n unique aids of the session

In [28]:
# Baseline: submit the first 10 distinct aids of each session, newest event
# first, as the click prediction.
recent_distinct_aids = pl.col("aid").unique(maintain_order=True).head(10).cast(str)

clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Newest events first inside every session
    .sort("session", "ts", descending=[False, True])
    .group_by("session")
    .agg(recent_distinct_aids)
    # csv format: "<session>_clicks" key plus space-separated aids
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("aid").list.join(separator=' ').alias("labels"),
    )
)

print(clicks_predictions_df)

# Note: other cells in this notebook write to this same path, so the file
# reflects whichever cell ran last.
clicks_predictions_df.write_csv("out/clicks_predictions.csv")

# Recall@20 scores
# last 5 aids: 0.23950409389957777
# last 10 aids: 0.24226818123888777

shape: (1_023_837, 2)
┌────────────────┬─────────────────┐
│ session_type   ┆ labels          │
│ ---            ┆ ---             │
│ str            ┆ str             │
╞════════════════╪═════════════════╡
│ 1_clicks       ┆ 582732 1142000  │
│ 2_clicks       ┆ 199008 141736   │
│ 3_clicks       ┆ 199008          │
│ 4_clicks       ┆ 199008          │
│ 5_clicks       ┆ 1494780 1669402 │
│ …              ┆ …               │
│ 1023833_clicks ┆ 229094          │
│ 1023834_clicks ┆ 272221          │
│ 1023835_clicks ┆ 317311          │
│ 1023836_clicks ┆ 1546409         │
│ 1023837_clicks ┆ 1453906         │
└────────────────┴─────────────────┘


### Predict the next click from the last n clicked aids

In [6]:
# Baseline: submit the aids of each session's 3 latest click events.
# Repeated clicks on one aid are not de-duplicated, so labels may repeat.
is_click_event = pl.col("type").eq("clicks")
latest_clicked_aids = pl.col("aid").top_k_by("ts", 3).cast(str)

clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    .filter(is_click_event)
    .group_by("session")
    .agg(latest_clicked_aids)
    # csv format: "<session>_clicks" key plus space-separated aids
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("aid").list.join(separator=' ').alias("labels"),
    )
)
print(clicks_predictions_df)

# Note: other cells in this notebook write to this same path, so the file
# reflects whichever cell ran last.
clicks_predictions_df.write_csv("out/clicks_predictions.csv")

# Recall@20 scores
# last 3 clicks: 0.22587691349848216
# last 5 clicks: 0.2327766855605164
# last 10 clicks: 0.2367765685621105

shape: (1_020_317, 2)
┌───────────────┬───────────────────────┐
│ session_type  ┆ labels                │
│ ---           ┆ ---                   │
│ str           ┆ str                   │
╞═══════════════╪═══════════════════════╡
│ 152860_clicks ┆ 621423 545611 621423  │
│ 181174_clicks ┆ 1714583               │
│ 203060_clicks ┆ 1017032               │
│ 26090_clicks  ┆ 877497 1803001 605599 │
│ 585412_clicks ┆ 102345 199409         │
│ …             ┆ …                     │
│ 52531_clicks  ┆ 1502780 1571138       │
│ 492062_clicks ┆ 495732                │
│ 541488_clicks ┆ 1358020 1570243       │
│ 58581_clicks  ┆ 1816625               │
│ 737715_clicks ┆ 1343846 1684991       │
└───────────────┴───────────────────────┘


### Predict the next click from the last n carted aids

In [7]:
# Baseline: submit the aids of each session's 5 latest cart events as the
# click prediction. Sessions without any cart event produce no row.
is_cart_event = pl.col("type").eq("carts")
latest_carted_aids = pl.col("aid").top_k_by("ts", 5).cast(str)

clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    .filter(is_cart_event)
    .group_by("session")
    .agg(latest_carted_aids)
    # csv format: "<session>_clicks" key plus space-separated aids
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("aid").list.join(separator=' ').alias("labels"),
    )
)
print(clicks_predictions_df)

# Note: other cells in this notebook write to this same path, so the file
# reflects whichever cell ran last.
clicks_predictions_df.write_csv("out/clicks_predictions.csv")

# Recall@20 scores
# last 5 carts: 0.029992332425899267

shape: (131_365, 2)
┌───────────────┬─────────────────────────────────┐
│ session_type  ┆ labels                          │
│ ---           ┆ ---                             │
│ str           ┆ str                             │
╞═══════════════╪═════════════════════════════════╡
│ 287746_clicks ┆ 404609                          │
│ 941156_clicks ┆ 879978                          │
│ 508854_clicks ┆ 1389106 1389106                 │
│ 400856_clicks ┆ 643964 1660750                  │
│ 575447_clicks ┆ 543682 202774                   │
│ …             ┆ …                               │
│ 532824_clicks ┆ 110700 1281615                  │
│ 950302_clicks ┆ 664792                          │
│ 328728_clicks ┆ 262225                          │
│ 579880_clicks ┆ 682082 682082 1496286           │
│ 306437_clicks ┆ 817116 436574 52502 320264 862… │
└───────────────┴─────────────────────────────────┘


### Predict the next click from the last n ordered aids

In [9]:
# Baseline: submit the aids of each session's 5 latest order events as the
# click prediction. Sessions without any order event produce no row.
is_order_event = pl.col("type").eq("orders")
latest_ordered_aids = pl.col("aid").top_k_by("ts", 5).cast(str)

clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    .filter(is_order_event)
    .group_by("session")
    .agg(latest_ordered_aids)
    # csv format: "<session>_clicks" key plus space-separated aids
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("aid").list.join(separator=' ').alias("labels"),
    )
)
print(clicks_predictions_df)

# Note: other cells in this notebook write to this same path, so the file
# reflects whichever cell ran last.
clicks_predictions_df.write_csv("out/clicks_predictions.csv")

# Recall@20 scores
# last 5 orders: 0.0021320870217429

shape: (6_966, 2)
┌───────────────┬────────────────────────┐
│ session_type  ┆ labels                 │
│ ---           ┆ ---                    │
│ str           ┆ str                    │
╞═══════════════╪════════════════════════╡
│ 255487_clicks ┆ 278062                 │
│ 372292_clicks ┆ 692964                 │
│ 223493_clicks ┆ 1773065 318230         │
│ 95574_clicks  ┆ 1175896 801774 1183286 │
│ 976955_clicks ┆ 1658592                │
│ …             ┆ …                      │
│ 875203_clicks ┆ 1502087 273740         │
│ 704024_clicks ┆ 1748119                │
│ 884108_clicks ┆ 267029                 │
│ 35236_clicks  ┆ 1476166                │
│ 812927_clicks ┆ 605890                 │
└───────────────┴────────────────────────┘
