# Clicks prediction model

In [2]:
import polars as pl


# Schema of the session file: every row is one session that carries a list of
# event structs (aid, ts, type).
event_schema = pl.Struct(dict(aid=pl.Int32, ts=pl.Int64, type=str))
df_schema = dict(session=pl.Int32, events=pl.List(event_schema))

test_sessions_df = pl.read_ndjson(
    "../data/test/test_sessions.jsonl",
    schema=df_schema,
    low_memory=True,
)

# Flatten to one row per event to inspect the data.
print(test_sessions_df.explode("events").unnest("events"))

shape: (3_086_540, 4)
┌──────────┬─────────┬───────────────┬────────┐
│ session  ┆ aid     ┆ ts            ┆ type   │
│ ---      ┆ ---     ┆ ---           ┆ ---    │
│ i32      ┆ i32     ┆ i64           ┆ str    │
╞══════════╪═════════╪═══════════════╪════════╡
│ 12899780 ┆ 1142000 ┆ 1661724000378 ┆ clicks │
│ 12899781 ┆ 141736  ┆ 1661724000559 ┆ clicks │
│ 12899781 ┆ 199008  ┆ 1661724022851 ┆ clicks │
│ 12899781 ┆ 57315   ┆ 1661724170835 ┆ clicks │
│ 12899781 ┆ 194067  ┆ 1661724246188 ┆ clicks │
│ …        ┆ …       ┆ …             ┆ …      │
│ 14571534 ┆ 272221  ┆ 1662328716323 ┆ carts  │
│ 14571534 ┆ 272221  ┆ 1662328720752 ┆ carts  │
│ 14571539 ┆ 317311  ┆ 1662328717363 ┆ clicks │
│ 14571547 ┆ 1546409 ┆ 1662328727386 ┆ clicks │
│ 14571548 ┆ 1453906 ┆ 1662328728006 ┆ clicks │
└──────────┴─────────┴───────────────┴────────┘


In [2]:
# Column dtypes of the click-to-click matrix CSV.
df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = pl.read_csv(
    "../matrices/click_to_click_matrix.csv",
    schema=df_schema,
)

print(click_to_click_matrix_df)

shape: (63_195_404, 3)
┌─────────┬──────────┬─────────────┐
│ aid     ┆ next_aid ┆ probability │
│ ---     ┆ ---      ┆ ---         │
│ i32     ┆ i32      ┆ f32         │
╞═════════╪══════════╪═════════════╡
│ 0       ┆ 0        ┆ 0.02381     │
│ 0       ┆ 13759    ┆ 0.02381     │
│ 0       ┆ 54474    ┆ 0.02381     │
│ 0       ┆ 90491    ┆ 0.02381     │
│ 0       ┆ 218900   ┆ 0.02381     │
│ …       ┆ …        ┆ …           │
│ 1855602 ┆ 1598688  ┆ 0.058824    │
│ 1855602 ┆ 1621374  ┆ 0.058824    │
│ 1855602 ┆ 1693232  ┆ 0.058824    │
│ 1855602 ┆ 1783511  ┆ 0.058824    │
│ 1855602 ┆ 1855602  ┆ 0.058824    │
└─────────┴──────────┴─────────────┘


In [3]:
# One row per click event of the test sessions.
clicks_of_test_sessions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # ts // 1000 converts ts to seconds; UInt32 is used to save memory.
    .with_columns(pl.col("ts").floordiv(1000).cast(pl.UInt32))
    # Drop every event that is not a click.
    .filter(pl.col("type").eq("clicks"))
)

print(clicks_of_test_sessions_df)

shape: (2_815_823, 4)
┌──────────┬─────────┬────────────┬────────┐
│ session  ┆ aid     ┆ ts         ┆ type   │
│ ---      ┆ ---     ┆ ---        ┆ ---    │
│ i32      ┆ i32     ┆ u32        ┆ str    │
╞══════════╪═════════╪════════════╪════════╡
│ 12899780 ┆ 1142000 ┆ 1661724000 ┆ clicks │
│ 12899781 ┆ 141736  ┆ 1661724000 ┆ clicks │
│ 12899781 ┆ 199008  ┆ 1661724022 ┆ clicks │
│ 12899781 ┆ 57315   ┆ 1661724170 ┆ clicks │
│ 12899781 ┆ 194067  ┆ 1661724246 ┆ clicks │
│ …        ┆ …       ┆ …          ┆ …      │
│ 14571533 ┆ 229094  ┆ 1662328708 ┆ clicks │
│ 14571534 ┆ 272221  ┆ 1662328709 ┆ clicks │
│ 14571539 ┆ 317311  ┆ 1662328717 ┆ clicks │
│ 14571547 ┆ 1546409 ┆ 1662328727 ┆ clicks │
│ 14571548 ┆ 1453906 ┆ 1662328728 ┆ clicks │
└──────────┴─────────┴────────────┴────────┘


In [6]:
# Clicks of last 30 minutes
#
# For every session, keep only the clicks that happened at most 30 minutes
# before the session's most recent click.

clicks_of_last_30_minutes_df = (
    clicks_of_test_sessions_df
    .group_by("session")
    # Per-session event lists plus the timestamp of the session's latest click.
    .agg(pl.col("aid"), pl.col("ts"), pl.col("type"), pl.col("ts").max().alias("max_ts"))
    # Back to one row per click; max_ts is repeated on every row of its session.
    .explode(["aid", "type", "ts"])
    # ts was converted to seconds when clicks_of_test_sessions_df was built,
    # so the 30 minute window is 30 * 60 = 1800.
    .filter((pl.col("max_ts") - pl.col("ts")) <= 30 * 60)
)

print(clicks_of_last_30_minutes_df)

shape: (2_281_151, 5)
┌──────────┬─────────┬────────────┬────────┬────────────┐
│ session  ┆ aid     ┆ ts         ┆ type   ┆ max_ts     │
│ ---      ┆ ---     ┆ ---        ┆ ---    ┆ ---        │
│ i32      ┆ i32     ┆ u32        ┆ str    ┆ u32        │
╞══════════╪═════════╪════════════╪════════╪════════════╡
│ 12931829 ┆ 279078  ┆ 1661755897 ┆ clicks ┆ 1661755901 │
│ 12931829 ┆ 1264522 ┆ 1661755901 ┆ clicks ┆ 1661755901 │
│ 13854907 ┆ 326904  ┆ 1662237036 ┆ clicks ┆ 1662238250 │
│ 13854907 ┆ 326904  ┆ 1662237136 ┆ clicks ┆ 1662238250 │
│ 13854907 ┆ 1123180 ┆ 1662237210 ┆ clicks ┆ 1662238250 │
│ …        ┆ …       ┆ …          ┆ …      ┆ …          │
│ 12938373 ┆ 645357  ┆ 1661758206 ┆ clicks ┆ 1661758315 │
│ 12938373 ┆ 1520649 ┆ 1661758315 ┆ clicks ┆ 1661758315 │
│ 14183477 ┆ 1150798 ┆ 1662200425 ┆ clicks ┆ 1662200711 │
│ 14183477 ┆ 672432  ┆ 1662200637 ┆ clicks ┆ 1662200711 │
│ 14183477 ┆ 463670  ┆ 1662200711 ┆ clicks ┆ 1662200711 │
└──────────┴─────────┴────────────┴────────┴────────────┘

### Only last click of session considered

In [37]:
# Only last click and click_to_click_matrix
# Score 0.44085084753912007

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = pl.read_csv(
    "../matrices/click_to_click_matrix.csv",
    schema=df_schema,
)


clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Attach each session's latest click timestamp to all of its clicks.
    .group_by("session")
    .agg(
        pl.col("aid", "ts", "type"),
        max_ts=pl.col("ts").max(),
    )
    .explode(["aid", "type", "ts"])
    # Keep only the click(s) that carry that latest timestamp.
    .filter(pl.col("ts") == pl.col("max_ts"))
    .select("session", "aid")
    # Candidate next aids of the last clicked aid.
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .drop("aid")
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest probability first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_1.csv")

shape: (918_765, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 12899780_clicks ┆ 1142000 1344758 1502122 768323… │
│ 12899781_clicks ┆ 194067 1066725 1724971 893268 … │
│ 12899782_clicks ┆ 779477 553147 1076930 348184 1… │
│ 12899783_clicks ┆ 1754419 294573 354698 1317341 … │
│ 12899784_clicks ┆ 1579935 707586 1492404 229362 … │
│ …               ┆ …                               │
│ 14571533_clicks ┆ 1170554 1513950 1204260 288668… │
│ 14571534_clicks ┆ 272221 1152891 279269 1099895 … │
│ 14571539_clicks ┆ 317311 1564473 275687 1450886 … │
│ 14571547_clicks ┆ 1546409 719278 925950 1837846 … │
│ 14571548_clicks ┆ 1453906 1798580 1278671 133903… │
└─────────────────┴─────────────────────────────────┘


In [38]:
# Only last click and click_to_click_matrix_only_clicks
# Score 0.44381926644679226

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = pl.read_csv(
    "../matrices/click_to_click_matrix_only_clicks.csv",
    schema=df_schema,
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Latest click timestamp of every session, repeated on each of its clicks.
    .group_by("session")
    .agg(
        pl.col("aid", "ts", "type"),
        max_ts=pl.col("ts").max(),
    )
    .explode(["aid", "type", "ts"])
    # Only the click(s) at the latest timestamp are used as the query aid.
    .filter(pl.col("ts") == pl.col("max_ts"))
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .drop("aid")
    # Best candidates first inside every session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_2.csv")

shape: (918_792, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 12899780_clicks ┆ 1142000 1344758 1502122 768323… │
│ 12899781_clicks ┆ 194067 1066725 1724971 893268 … │
│ 12899782_clicks ┆ 779477 1076930 348184 553147 1… │
│ 12899783_clicks ┆ 1754419 294573 354698 1317341 … │
│ 12899784_clicks ┆ 1579935 707586 1492404 229362 … │
│ …               ┆ …                               │
│ 14571533_clicks ┆ 1170554 1255319 1656492 288668… │
│ 14571534_clicks ┆ 272221 1152891 279269 1099895 … │
│ 14571539_clicks ┆ 317311 1564473 1450886 275687 … │
│ 14571547_clicks ┆ 1546409 719278 925950 1837846 … │
│ 14571548_clicks ┆ 1453906 1798580 1278671 318629… │
└─────────────────┴─────────────────────────────────┘


In [39]:
# Only last click and click_to_click_matrix_only_clicks_time_decay_2
# Score 0.4535270151305795

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = pl.read_csv(
    "../matrices/click_to_click_matrix_only_clicks_time_decay_2.csv",
    schema=df_schema,
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Latest click timestamp of every session, repeated on each of its clicks.
    .group_by("session")
    .agg(
        pl.col("aid", "ts", "type"),
        max_ts=pl.col("ts").max(),
    )
    .explode(["aid", "type", "ts"])
    # Only the click(s) at the latest timestamp are used as the query aid.
    .filter(pl.col("ts") == pl.col("max_ts"))
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .drop("aid")
    # Best candidates first inside every session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_3.csv")

shape: (918_808, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 12899780_clicks ┆ 1142000 1344758 1502122 487136… │
│ 12899781_clicks ┆ 194067 1066725 1724971 893268 … │
│ 12899782_clicks ┆ 779477 567686 424203 796614 34… │
│ 12899783_clicks ┆ 1754419 294573 1317341 354698 … │
│ 12899784_clicks ┆ 1579935 707586 1492404 229362 … │
│ …               ┆ …                               │
│ 14571533_clicks ┆ 229094 1170554 1656492 1204260… │
│ 14571534_clicks ┆ 272221 1152891 279269 1099895 … │
│ 14571539_clicks ┆ 317311 1564473 275687 171073 1… │
│ 14571547_clicks ┆ 1546409 719278 925950 1837846 … │
│ 14571548_clicks ┆ 1453906 1798580 1278671 318629… │
└─────────────────┴─────────────────────────────────┘


In [40]:
# Only last click and click_to_click_matrix_only_clicks_time_decay_3
# Score 0.45523045027697995

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = pl.read_csv(
    "../matrices/click_to_click_matrix_only_clicks_time_decay_3.csv",
    schema=df_schema,
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Latest click timestamp of every session, repeated on each of its clicks.
    .group_by("session")
    .agg(
        pl.col("aid", "ts", "type"),
        max_ts=pl.col("ts").max(),
    )
    .explode(["aid", "type", "ts"])
    # Only the click(s) at the latest timestamp are used as the query aid.
    .filter(pl.col("ts") == pl.col("max_ts"))
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .drop("aid")
    # Best candidates first inside every session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_4.csv")

In [5]:
# Only last click and click_to_click_matrix_only_clicks_time_decay_4
# Score 0.455882975837093

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = pl.read_csv(
    "../matrices/click_to_click_matrix_only_clicks_time_decay_4.csv",
    schema=df_schema,
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Latest click timestamp of every session, repeated on each of its clicks.
    .group_by("session")
    .agg(
        pl.col("aid", "ts", "type"),
        max_ts=pl.col("ts").max(),
    )
    .explode(["aid", "type", "ts"])
    # Only the click(s) at the latest timestamp are used as the query aid.
    .filter(pl.col("ts") == pl.col("max_ts"))
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .drop("aid")
    # Best candidates first inside every session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_5.csv")

The higher scores are most likely the result of having more aids in the prediction CSV.

### Last two clicks of session considered

In [14]:
# Last two clicks and click_to_click_matrix_only_clicks
# Score 0.4710410301203509

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    .group_by("session")
    # The two most recent clicks of every session.
    .agg(pl.col("aid", "ts").head(2))
    .explode(["aid", "ts"])
    .sort("session", "ts", descending=False)
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # A candidate reached from several clicks gets the sum of its probabilities.
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest summed probability first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_6.csv")

In [15]:
# Last two clicks and click_to_click_matrix_only_clicks_time_decay_2
# Score 0.48117234802736947

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_2.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    .group_by("session")
    # The two most recent clicks of every session.
    .agg(pl.col("aid", "ts").head(2))
    .explode(["aid", "ts"])
    .sort("session", "ts", descending=False)
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # A candidate reached from several clicks gets the sum of its probabilities.
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest summed probability first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_7.csv")

In [16]:
# Last two clicks and click_to_click_matrix_only_clicks_time_decay_3
# Score 0.4822255471770256

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_3.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    .group_by("session")
    # The two most recent clicks of every session.
    .agg(pl.col("aid", "ts").head(2))
    .explode(["aid", "ts"])
    .sort("session", "ts", descending=False)
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # A candidate reached from several clicks gets the sum of its probabilities.
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest summed probability first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_8.csv")

In [17]:
# Last two clicks and click_to_click_matrix_only_clicks_time_decay_4
# Score 0.48284029494155317

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_4.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    .group_by("session")
    # The two most recent clicks of every session.
    .agg(pl.col("aid", "ts").head(2))
    .explode(["aid", "ts"])
    .sort("session", "ts", descending=False)
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # A candidate reached from several clicks gets the sum of its probabilities.
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest summed probability first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_9.csv")

### Last three clicks of session considered

In [18]:
# Last three clicks and click_to_click_matrix_only_clicks_time_decay_4
# Score 0.4845769287568014

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_4.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    .group_by("session")
    # The three most recent clicks of every session.
    .agg(pl.col("aid", "ts").head(3))
    .explode(["aid", "ts"])
    .sort("session", "ts", descending=False)
    .select("session", "aid")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # A candidate reached from several clicks gets the sum of its probabilities.
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest summed probability first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_10.csv")

In [19]:
# Last three clicks with time decay and click_to_click_matrix_only_clicks_time_decay_4
# Score 0.4885699272720196

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_4.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    # In the sorted frame the row index grows from a session's newest click
    # towards its oldest one.
    .with_row_index()
    .group_by("session")
    # The three most recent clicks plus the row index of the newest of them.
    .agg(
        pl.col("aid", "ts", "index").head(3),
        pl.col("index").head(3).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    # weight = 1 / rank, where the newest click has rank 1.
    .with_columns(
        (1 / (pl.col("index") - pl.col("last_event_index") + 1))
        .cast(pl.Float32)
        .alias("weight")
    )
    .drop("index", "last_event_index")
    .sort("session", "ts", descending=[False, True])
    .select("session", "aid", "weight")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale every candidate's probability by the weight of the click it came from.
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest weighted sum first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_11.csv")

In [17]:
# Last three clicks with time decay and click_to_click_matrix_only_clicks_time_decay_4 and incompatible_matrix
# Score 0.48545612109042746
# NOTE(review): the score above was measured with the previous filter, which
# joined one row per carts event and tested membership per row. Re-run the
# evaluation to refresh it.
# NOTE: this cell does not load a matrix itself; it uses click_to_click_matrix_df
# as left behind by a previously executed cell.

# For every aid, the list of aids listed as incompatible with it.
df_schema = {"aid": pl.Int32, "incompatible_aid": pl.Int32}
incompatible_matrix_df = (
    pl.read_csv("../matrices/incompatible_matrix.csv", schema=df_schema)
    .group_by("aid")
    .agg(pl.col("incompatible_aid"))
)

# One row per carts event whose aid has an incompatibility list:
# (session, list of incompatible aids).
incompatibile_aids_for_sessions = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    .filter(pl.col("type") == "carts")
    .join(incompatible_matrix_df, on="aid", how="left")
    .select(["session", "incompatible_aid"])
    .filter(pl.col("incompatible_aid").is_not_null())
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort(["session", "ts"], descending=[False, True])
    # In the sorted frame the row index grows from a session's newest click
    # towards its oldest one.
    .with_row_index()
    .group_by("session")
    # Last three clicks
    .agg(pl.col("aid").limit(3), pl.col("ts").limit(3), pl.col("index").limit(3), pl.col("index").limit(3).min().alias("last_event_index"))
    .lazy()
    .explode(["aid", "ts", "index"])
    # Weight = 1 / rank, where the newest click has rank 1
    .with_columns(weight = (1/(pl.col("index") - pl.col("last_event_index") + 1)).cast(pl.Float32))
    .drop(["index", "last_event_index"])
    .sort(["session", "ts"], descending=[False, True])
    .select(["session", "aid", "weight"])
    .join(click_to_click_matrix_df.lazy(), on="aid", how="inner")
    # Filter incompatible aids: drop every candidate that is incompatible with
    # ANY carted aid of the session. An anti-join on (session, next_aid) is used
    # instead of a left join on session + is_in, because the left join repeated
    # each candidate once per carts event (so a candidate survived unless every
    # cart excluded it, and the repeated rows inflated the summed probabilities)
    # and compared against a null list for sessions without carts.
    .join(
        incompatibile_aids_for_sessions.lazy().explode("incompatible_aid").unique(),
        left_on=["session", "next_aid"],
        right_on=["session", "incompatible_aid"],
        how="anti",
    )
    # Calculate weighted probability
    .with_columns(probability=(pl.col("probability") * pl.col("weight")))
    # Sum probabilities
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    .sort(["session", "probability"], descending=[False, True])
    # Collect top 20 probabilities for each session
    .group_by(pl.col("session"))
    .agg(pl.col("next_aid").cast(str).limit(20))
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("next_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "next_aid": "labels"})
)

clicks_predictions_df.collect(streaming=True).write_csv("out/clicks_predictions_12.csv")

In [43]:
# Last three clicks within 1 hour time frame with time decay and click_to_click_matrix_only_clicks_time_decay_4
# Score 0.48788076869800545

df_schema = {"aid": pl.Int32, "next_aid": pl.Int32, "probability": pl.Float32}
click_to_click_matrix_df = (
    pl.read_csv("../matrices/click_to_click_matrix_only_clicks_time_decay_4.csv", schema=df_schema)
    .sort(["aid", "probability"], descending=[False, True])
    .group_by("aid")
    # Take top 100 probabilities
    .agg(pl.col("next_aid").limit(100), pl.col("probability").limit(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Get the last 3 clicks that happened at most 1 hour before the session's latest click
    .sort(["session", "ts"], descending=[False, True])
    .group_by("session")
    .agg(pl.col("aid").limit(3), pl.col("ts").limit(3), pl.col("type").limit(3), pl.col("ts").limit(3).max().alias("last_event_ts"))
    .explode(["aid", "type", "ts"])
    # ts was converted to seconds when clicks_of_test_sessions_df was built,
    # so the 1 hour window is 60 * 60 = 3600.
    .filter((pl.col("last_event_ts") - pl.col("ts")) <= 60 * 60)
    # Row index over the remaining clicks; used below to rank the clicks of a session.
    .with_row_index()
    .group_by("session")
    # Last three clicks (at most three are left per session after the step above)
    .agg(pl.col("aid").limit(3), pl.col("ts").limit(3), pl.col("index").limit(3), pl.col("index").limit(3).min().alias("last_event_index"))
    .explode(["aid", "ts", "index"])
    # Weight = 1 / (index - smallest index of the session + 1)
    .with_columns(weight = (1/(pl.col("index") - pl.col("last_event_index") + 1)).cast(pl.Float32))
    .drop(["index", "last_event_index"])
    .sort(["session", "ts"], descending=[False, True])
    .select(["session", "aid", "weight"])
    # Calculate weighted probability
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .with_columns(probability=(pl.col("probability") * pl.col("weight")))
    # Find top 20 aids
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    .sort(["session", "probability"], descending=[False, True])
    .group_by(pl.col("session"))
    .agg(pl.col("next_aid").cast(str).limit(20))
    # Output layout: "<session>_clicks" and the space-separated labels
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("next_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "next_aid": "labels"})
)

print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_13.csv")

shape: (918_824, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 12899780_clicks ┆ 1142000 1344758 1502122 487136… │
│ 12899781_clicks ┆ 194067 57315 199008 1066725 17… │
│ 12899782_clicks ┆ 413962 1315520 779477 594187 1… │
│ 12899783_clicks ┆ 1754419 1216820 294573 351665 … │
│ 12899784_clicks ┆ 1579935 476216 707586 910034 1… │
│ …               ┆ …                               │
│ 14571533_clicks ┆ 229094 1170554 1204260 1656492… │
│ 14571534_clicks ┆ 272221 1152891 279269 1099895 … │
│ 14571539_clicks ┆ 317311 1564473 275687 171073 1… │
│ 14571547_clicks ┆ 1546409 719278 925950 1837846 … │
│ 14571548_clicks ┆ 1453906 1798580 1278671 318629… │
└─────────────────┴─────────────────────────────────┘


### Last four clicks of session considered

In [5]:
# Last four clicks with time decay and click_to_click_matrix_only_clicks_time_decay_4
# Score 0.4903947092769673

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_4.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    # In the sorted frame the row index grows from a session's newest click
    # towards its oldest one.
    .with_row_index()
    .group_by("session")
    # The four most recent clicks plus the row index of the newest of them.
    .agg(
        pl.col("aid", "ts", "index").head(4),
        pl.col("index").head(4).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    # weight = 1 / rank, where the newest click has rank 1.
    .with_columns(
        (1 / (pl.col("index") - pl.col("last_event_index") + 1))
        .cast(pl.Float32)
        .alias("weight")
    )
    .drop("index", "last_event_index")
    .sort("session", "ts", descending=[False, True])
    .select("session", "aid", "weight")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale every candidate's probability by the weight of the click it came from.
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest weighted sum first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_14.csv")

### Last five clicks of session considered

In [4]:
# Last five clicks with time decay and click_to_click_matrix_only_clicks_time_decay_5
# Score 0.49129679724428155

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_5.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    # In the sorted frame the row index grows from a session's newest click
    # towards its oldest one.
    .with_row_index()
    .group_by("session")
    # The five most recent clicks plus the row index of the newest of them.
    .agg(
        pl.col("aid", "ts", "index").head(5),
        pl.col("index").head(5).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    # weight = 1 / rank, where the newest click has rank 1.
    .with_columns(
        (1 / (pl.col("index") - pl.col("last_event_index") + 1))
        .cast(pl.Float32)
        .alias("weight")
    )
    .drop("index", "last_event_index")
    .sort("session", "ts", descending=[False, True])
    .select("session", "aid", "weight")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale every candidate's probability by the weight of the click it came from.
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest weighted sum first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_15.csv")

### Last six clicks of session considered

In [4]:
# Last six clicks with time decay and click_to_click_matrix_only_clicks_time_decay_6
# Score 0.4917673025165736

df_schema = dict(aid=pl.Int32, next_aid=pl.Int32, probability=pl.Float32)
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_6.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    # Keep the 100 most probable next aids of every aid.
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort("session", "ts", descending=[False, True])
    # In the sorted frame the row index grows from a session's newest click
    # towards its oldest one.
    .with_row_index()
    .group_by("session")
    # The six most recent clicks plus the row index of the newest of them.
    .agg(
        pl.col("aid", "ts", "index").head(6),
        pl.col("index").head(6).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    # weight = 1 / rank, where the newest click has rank 1.
    .with_columns(
        (1 / (pl.col("index") - pl.col("last_event_index") + 1))
        .cast(pl.Float32)
        .alias("weight")
    )
    .drop("index", "last_event_index")
    .sort("session", "ts", descending=[False, True])
    .select("session", "aid", "weight")
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale every candidate's probability by the weight of the click it came from.
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by("session", "next_aid")
    .agg(pl.col("probability").sum())
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    # At most 20 candidates per session, highest weighted sum first.
    .agg(pl.col("next_aid").cast(str).head(20))
    # Output layout: "<session>_clicks" and the space-separated labels.
    .select(
        (pl.col("session").cast(str) + "_clicks").alias("session_type"),
        pl.col("next_aid").list.join(separator=" ").alias("labels"),
    )
)

clicks_predictions_df.write_csv("out/clicks_predictions_16.csv")

### Last seven clicks of session considered

In [8]:
# Experiment: the seven most recent rows of every session, recency-weighted,
# scored against click_to_click_matrix_only_clicks_time_decay_7
# (100 best candidates kept per aid).
# Score 0.49198137669155806

df_schema = {"aid": pl.Int32, "next_aid": pl.Int32, "probability": pl.Float32}

# For every aid keep only its 100 most probable next_aid candidates.
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_7.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    .agg(pl.col("next_aid", "probability").head(100))
    .explode(["next_aid", "probability"])
)

# Position of an event counted from the newest one of its session
# (1 = newest kept event); the weight is the reciprocal of that position.
recency_rank = pl.col("index") - pl.col("last_event_index") + 1
recency_weight = (1 / recency_rank).cast(pl.Float32)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Newest event first within each session, so the global row index
    # grows as events get older.
    .sort("session", "ts", descending=[False, True])
    .with_row_index()
    .group_by("session")
    # Keep the first seven rows per session (the seven newest) and remember the
    # smallest row index among them, i.e. the index of the newest event.
    .agg(
        pl.col("aid", "ts", "index").head(7),
        pl.col("index").head(7).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    .with_columns(recency_weight.alias("weight"))
    .drop(["index", "last_event_index"])
    .sort("session", "ts", descending=[False, True])
    .select(["session", "aid", "weight"])
    # Attach the candidate next_aids of every kept event.
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale each candidate's probability by the weight of the event it came from,
    # then add up the contributions per (session, candidate).
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    # Best candidates first, then keep the 20 highest-scoring ones per session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Build the two output columns: "<session>_clicks" and space-separated labels.
    .with_columns(
        pl.col("session").cast(str) + "_clicks",
        pl.col("next_aid").list.join(separator=" "),
    )
    .rename({"session": "session_type", "next_aid": "labels"})
)

clicks_predictions_df.write_csv("out/clicks_predictions_17.csv")

In [None]:
# Experiment: the seven most recent rows of every session, recency-weighted,
# scored against click_to_click_matrix_only_clicks_time_decay_7,
# this time keeping the 150 best candidates per aid.
# Score 0.49205006359262254

df_schema = {"aid": pl.Int32, "next_aid": pl.Int32, "probability": pl.Float32}

# For every aid keep only its 150 most probable next_aid candidates.
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_7.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    .agg(pl.col("next_aid", "probability").head(150))
    .explode(["next_aid", "probability"])
)

# Position of an event counted from the newest one of its session
# (1 = newest kept event); the weight is the reciprocal of that position.
recency_rank = pl.col("index") - pl.col("last_event_index") + 1
recency_weight = (1 / recency_rank).cast(pl.Float32)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Newest event first within each session, so the global row index
    # grows as events get older.
    .sort("session", "ts", descending=[False, True])
    .with_row_index()
    .group_by("session")
    # Keep the first seven rows per session (the seven newest) and remember the
    # smallest row index among them, i.e. the index of the newest event.
    .agg(
        pl.col("aid", "ts", "index").head(7),
        pl.col("index").head(7).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    .with_columns(recency_weight.alias("weight"))
    .drop(["index", "last_event_index"])
    .sort("session", "ts", descending=[False, True])
    .select(["session", "aid", "weight"])
    # Attach the candidate next_aids of every kept event.
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale each candidate's probability by the weight of the event it came from,
    # then add up the contributions per (session, candidate).
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    # Best candidates first, then keep the 20 highest-scoring ones per session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Build the two output columns: "<session>_clicks" and space-separated labels.
    .with_columns(
        pl.col("session").cast(str) + "_clicks",
        pl.col("next_aid").list.join(separator=" "),
    )
    .rename({"session": "session_type", "next_aid": "labels"})
)

clicks_predictions_df.write_csv("out/clicks_predictions_18.csv")

In [5]:
# Experiment: the seven most recent rows of every session, recency-weighted,
# scored against click_to_click_matrix_only_clicks_time_decay_7_1h
# (150 best candidates kept per aid).
# NOTE(review): the notes for this cell said "boost most popular aids", but the
# pipeline below applies no popularity boost -- confirm whether that step is missing.
# Score 0.49205006359262254
# NOTE(review): this score is identical to the one recorded for the
# time_decay_7 / top-150 run -- confirm it was re-measured for the _1h matrix.

df_schema = {"aid": pl.Int32, "next_aid": pl.Int32, "probability": pl.Float32}

# For every aid keep only its 150 most probable next_aid candidates.
click_to_click_matrix_df = (
    pl.read_csv(
        "../matrices/click_to_click_matrix_only_clicks_time_decay_7_1h.csv",
        schema=df_schema,
    )
    .sort("aid", "probability", descending=[False, True])
    .group_by("aid")
    .agg(pl.col("next_aid", "probability").head(150))
    .explode(["next_aid", "probability"])
)

# Position of an event counted from the newest one of its session
# (1 = newest kept event); the weight is the reciprocal of that position.
recency_rank = pl.col("index") - pl.col("last_event_index") + 1
recency_weight = (1 / recency_rank).cast(pl.Float32)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    # Newest event first within each session, so the global row index
    # grows as events get older.
    .sort("session", "ts", descending=[False, True])
    .with_row_index()
    .group_by("session")
    # Keep the first seven rows per session (the seven newest) and remember the
    # smallest row index among them, i.e. the index of the newest event.
    .agg(
        pl.col("aid", "ts", "index").head(7),
        pl.col("index").head(7).min().alias("last_event_index"),
    )
    .explode(["aid", "ts", "index"])
    .with_columns(recency_weight.alias("weight"))
    .drop(["index", "last_event_index"])
    .sort("session", "ts", descending=[False, True])
    .select(["session", "aid", "weight"])
    # Attach the candidate next_aids of every kept event.
    .join(click_to_click_matrix_df, on="aid", how="inner")
    # Scale each candidate's probability by the weight of the event it came from,
    # then add up the contributions per (session, candidate).
    .with_columns((pl.col("probability") * pl.col("weight")).alias("probability"))
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    # Best candidates first, then keep the 20 highest-scoring ones per session.
    .sort("session", "probability", descending=[False, True])
    .group_by("session")
    .agg(pl.col("next_aid").cast(str).head(20))
    # Build the two output columns: "<session>_clicks" and space-separated labels.
    .with_columns(
        pl.col("session").cast(str) + "_clicks",
        pl.col("next_aid").list.join(separator=" "),
    )
    .rename({"session": "session_type", "next_aid": "labels"})
)

clicks_predictions_df.write_csv("out/clicks_predictions_19.csv")