# Clicks prediction model

In [1]:
import polars as pl


# define the schema of the dataframe
event_schema = pl.Struct({"aid": pl.Int32, "ts": pl.Int64, "type": str})
df_schema = {"session": pl.Int32, "events": pl.List(event_schema)}

test_sessions_df = pl.read_ndjson("../data/test/test_sessions.jsonl", schema=df_schema, low_memory=True)

In [6]:
clicks_prediction_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    # .filter(pl.col("type") == "clicks")
    .sort(["session", "ts"], descending=[False, True])
    .drop(["ts"])
    .with_columns(
        event_position=pl.int_range(1, pl.len()+1).over("session").cast(pl.UInt32)
    )
    .lazy()
    .filter(pl.col("event_position") <= 3)
    .join(concat_matrices, on=["aid", "type"], how="inner")
    # .join(click_to_click_matrix_df, left_on=["event_position", "aid"], right_on=["event_position", "click_aid"], how="inner")
    .group_by(["session", "next_click_aid"])
    .agg(pl.col("probability").sum())
    .group_by("session")
    .agg(pl.col("next_click_aid").top_k_by("probability", 20).cast(str))
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("next_click_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "next_click_aid": "labels"})
    .collect(streaming=True)
)

print(clicks_prediction_df)

clicks_prediction_df.write_csv("out/clicks_predictions.csv")

# Score: 0.4735286407205714
# Time decay. Score: 0.47667106644427376,
# three matrices. Only first consecutive click. 3 last clicks of a session: 0.4754541635137471,

shape: (921_221, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 13253102_clicks ┆ 353669 1243922 998036 1571048 … │
│ 13291009_clicks ┆ 1069940 228262 1559894 49290 1… │
│ 13403985_clicks ┆ 1097064 861409 531825 9848 479… │
│ 13702595_clicks ┆ 91926 17909 1821537 1356349 41… │
│ 13114051_clicks ┆ 1762564 552910 351055 1492369 … │
│ …               ┆ …                               │
│ 13926061_clicks ┆ 1488814 700995 1449873 1622471… │
│ 13625888_clicks ┆ 686109 645003 408008 1323311 1… │
│ 14358482_clicks ┆ 496180 861401 1162085 1755739 … │
│ 13373274_clicks ┆ 1733943 1611179 453446 1368396… │
│ 13359519_clicks ┆ 108125 659399 612920 39615 435… │
└─────────────────┴─────────────────────────────────┘


### Last seven clicks of session considered

In [None]:
# Last seven clicks with time decay and click_to_click_matrix_only_clicks_time_decay_7
# top 150 probabilities
# Score 0.49205006359262254

df_schema = {"aid": pl.Int32, "next_aid": pl.Int32, "probability": pl.Float32}
click_to_click_matrix_df = (
    pl.read_csv("../matrices/click_to_click_matrix_only_clicks_time_decay_7.csv", schema=df_schema)
    .sort(["aid", "probability"], descending=[False, True])
    .group_by("aid")
    # Take top 150 probabilities
    .agg(pl.col("next_aid").limit(150), pl.col("probability").limit(150))
    .explode(["next_aid", "probability"])
)

clicks_predictions_df = (
    clicks_of_test_sessions_df
    .sort(["session", "ts"], descending=[False, True])
    .with_row_index()
    .group_by("session")
    # Last seven clicks
    .agg(pl.col("aid").limit(7), pl.col("ts").limit(7), pl.col("index").limit(7), pl.col("index").limit(7).min().alias("last_event_index"))
    .explode(["aid", "ts", "index"])
    .with_columns(weight = (1/(pl.col("index") - pl.col("last_event_index") + 1)).cast(pl.Float32))
    .drop(["index", "last_event_index"])
    .sort(["session", "ts"], descending=[False, True])
    .select(["session", "aid", "weight"])
    .join(click_to_click_matrix_df, on="aid", how="inner")
    .with_columns(probability=(pl.col("probability") * pl.col("weight")))
    .group_by(["session", "next_aid"])
    .agg(pl.col("probability").sum())
    .sort(["session", "probability"], descending=[False, True])
    .group_by(pl.col("session"))
    .agg(pl.col("next_aid").cast(str).limit(20))
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("next_aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "next_aid": "labels"})
)

clicks_predictions_df.write_csv("out/clicks_predictions_18.csv")

### Current best model

In [6]:
df_schema = {"aid": pl.Int32, "next_aid": pl.Int32, "probability": pl.Float32}
click_to_click_matrix_df = (
    pl.read_csv("../matrices/click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay.csv", schema=df_schema)
    .sort(["aid", "probability"], descending=[False, True])
    .group_by("aid")
    # Take top 1000 probabilities
    .agg(pl.col("next_aid").limit(1000), pl.col("probability").limit(1000))
    .explode(["next_aid", "probability"])
)

# events of last 45 minutes in sessions
events_of_last_45_min = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .with_columns(max_ts=pl.col("ts").max().over("session"))
    .filter(pl.col("ts") >= pl.col("max_ts") - 45*60) # last 45 minutes
    .drop("max_ts")
)

last_5_carts_of_session = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .filter(pl.col("type") == "carts")
    .sort(["session", "ts"], descending=[False, True])
    .group_by("session")
    .agg(pl.col("aid").unique(maintain_order=True).limit(5))
    .explode("aid")
    .with_columns(weight=pl.lit(1).cast(pl.Float32))
)
# print(last_5_carts_of_session)

# Last clicks of session
# Consider carts and orders also as clicks
last_5_clicks_of_session_time_decay = (
    events_of_last_45_min
    .group_by(["session", "aid"])
    .agg(pl.col("ts").max())
    .sort(["session", "ts"], descending=[False, True])
    .group_by("session")
    .agg(pl.col("aid").limit(5), pl.col("ts").limit(5))
    .explode(["aid", "ts"])
    .sort(["session", "ts"], descending=[False, True])
    .with_row_index()
    .drop("ts")
    .with_columns(first_index=pl.col("index").min().over("session"))
    .with_columns(weight = (1 / (pl.col("index") - pl.col("first_index") + 1)).cast(pl.Float32))
    .group_by("session")
    .agg(pl.col("aid"), pl.col("weight"))
)
# print(last_5_clicks_of_session_time_decay)

last_5_clicks_of_sessions_rows = last_5_clicks_of_session_time_decay.rows()
# print(last_5_clicks_of_sessions)
# print(last_5_clicks_of_session_time_decay)

last_5_clicks_and_5_carts_of_sessions = (
    pl.concat([
        last_5_carts_of_session,
        last_5_clicks_of_session_time_decay.explode(["aid", "weight"])
    ])
    # get unique aids with lowest weight
    .group_by("session")
    .agg(pl.col("aid").unique(maintain_order=True).limit(5), pl.col("weight").min())
    .explode("aid")
)
# print(last_5_clicks_and_5_carts_of_sessions)

clicks_predictions_df = pl.DataFrame({
    "session_type": [],
    "labels": []
}, schema={"session_type": str, "labels": str})

start = 0
end = len(last_5_clicks_of_sessions_rows)
step = 100000
batch = 1
for i in range(start, end, step):
    print(f"Batch {batch}")
    batch += 1

    probabilities = (
        pl.DataFrame(last_5_clicks_of_sessions_rows[i:i+step], schema={"session": pl.Int32, "aid": pl.List(pl.Int32), "weight": pl.List(pl.Float32)}, orient="row")
        .explode(["aid", "weight"])
        # Probabilities
        .join(click_to_click_matrix_df, on="aid", how="inner")
        # apply time decay to probabilities
        .with_columns(probability=(pl.col("probability") * pl.col("weight")))
        .group_by(["session", "next_aid"])
        .agg(pl.col("probability").sum())
    )

    # Add bias to last clicks and carts
    bias = (
        last_5_clicks_and_5_carts_of_sessions
        .drop("weight")
        # Filter only sessions of this batch
        .join(probabilities.select("session").unique(), on=["session"], how="semi")
        # bias as probability so we can concat the column
        .with_columns(probability=pl.lit(1).cast(pl.Float32))
        .rename({"aid": "next_aid"})
    )

    new_predictions_df = (
        pl.concat([
            probabilities,
            bias
        ])
        # Sum probabilities and bias
        .group_by(["session", "next_aid"])
        .agg(pl.col("probability").sum())
        .group_by(pl.col("session"))
        # take top 20
        .agg(pl.col("next_aid").top_k_by("probability", 20).cast(str))
        # csv format
        .with_columns(
            (pl.col("session").cast(str) + "_clicks"),
            pl.col("next_aid").list.join(separator=' ')
        )
        .rename({"session": "session_type", "next_aid": "labels"})
    )

    clicks_predictions_df = pl.concat([
        clicks_predictions_df,
        new_predictions_df
    ])

print(clicks_predictions_df)

# Notes

# click-to-click-matrix_only-clicks_low-event-sessions-100_1-subsequent-clicks-30-min.csv last 3 clicks. Score: 0.4643921380973042
# click-to-click-matrix_only-clicks_low-event-sessions-100_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.4651362461921699
# click-to-click-matrix_only-clicks_low-event-sessions-50_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.4560524035263855
# click-to-click-matrix_only-clicks_low-event-sessions-200_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.4706827134531309
# click-to-click-matrix_only-clicks_low-event-sessions-300_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.4725361150001889
# click-to-click-matrix_only-clicks_low-event-sessions-400_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.473351199559488
# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.4736110650018488

# Removing top sessions by length did not improve the score. Infact it made it worse

# click-to-click-matrix_only-clicks_high-event-sessions-3_1-subsequent-clicks-30-min.csv last 5 clicks. Score: 0.47276736090043947

# Removing bottom sessions by length did not improve the score. Infact it made it worse

# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-45-min.csv last 5 clicks. Score: 0.4737129505717612
# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-45-min.csv last 5 clicks, bias on last 5 clicks of last 30 minutes of session. Score: 0.4786114713993468
# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-45-min.csv last 5 clicks, bias on last 5 clicks of session. Score: 0.4786114713993468

# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-45-min.csv last 5 clicks, bias on last 5 clicks and last 5 carts or orders of last 30 minutes of session. Score: 0.4789995523903614

# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-45-min.csv last 5 clicks, bias on last 5 clicks and carts of last 30 minutes of session. Score: 0.47835961942877686

# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-45-min.csv last 1 click. Score: 0.4439554954672369
# click-to-click-matrix_only-clicks_low-event-sessions-500_1-subsequent-clicks-30-min.csv last 1 click. Score: 0.4436944852431917

# click-to-click-matrix_last-2-weeks_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 clicks, no bias. Score: 0.47314399407461
# click-to-click-matrix_last-2-weeks_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 carts or orders of last 30 minutes of session, bias on last 5 clicks and carts or orders. Score: 0.47824514126033596
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 carts or orders of last 30 minutes of session, bias on last 5 clicks and carts or orders. Score: 0.4848425181075843

# top 1000 probabilities
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 carts or orders, bias on last 5 clicks and carts or orders. Score: 0.48874736843310296
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 carts or orders of last 30 minutes of session, bias on last 5 clicks and carts or orders. Score: 0.48850124037095505
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 carts or orders of session, bias on last 5 clicks and carts or orders. Score: 0.4884291191248373

# top 1200 probabilities
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay last 5 carts or orders of last 30 minutes of session, bias on last 5 clicks and carts or orders. Score: 0.48842339521641526

# cart_or_order_to_click seems to not have an effect on the score

# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay, last 5 clicks for the past 45 min, bias to last 5 events of 45 min and last 5 carts of session. Score: 0.4904336318542372
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay, last 5 events for the past 45 min, bias to last 5 events of 45 min and last 5 carts of session. Score: 0.49214164612737527
# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay, last 5 events for the past 45 min, bias to last 5 events of 45 min and last 3 carts of session. Score: 0.491975652783136

# click-to-click-matrix_only-clicks_5-subsequent-clicks-45-min-time-decay, last 5 events for the past 45 min with time decay, bias to last 5 events of 45 min and last 5 carts of session. Score: 0.4968123553997635 Best

# 45 minutes seems optimal

shape: (207_045, 3)
┌──────────┬─────────┬────────┐
│ session  ┆ aid     ┆ weight │
│ ---      ┆ ---     ┆ ---    │
│ i32      ┆ i32     ┆ f32    │
╞══════════╪═════════╪════════╡
│ 12899782 ┆ 413962  ┆ 1.0    │
│ 12899782 ┆ 1494780 ┆ 1.0    │
│ 12899787 ┆ 1682750 ┆ 1.0    │
│ 12899790 ┆ 1219653 ┆ 1.0    │
│ 12899799 ┆ 1325402 ┆ 1.0    │
│ …        ┆ …       ┆ …      │
│ 14571335 ┆ 1636724 ┆ 1.0    │
│ 14571349 ┆ 683653  ┆ 1.0    │
│ 14571393 ┆ 835431  ┆ 1.0    │
│ 14571403 ┆ 614044  ┆ 1.0    │
│ 14571534 ┆ 272221  ┆ 1.0    │
└──────────┴─────────┴────────┘
Batch 1
Batch 2
Batch 3
Batch 4
Batch 5
Batch 6
Batch 7
Batch 8
Batch 9
Batch 10
shape: (921_704, 2)
┌─────────────────┬─────────────────────────────────┐
│ session_type    ┆ labels                          │
│ ---             ┆ ---                             │
│ str             ┆ str                             │
╞═════════════════╪═════════════════════════════════╡
│ 12980856_clicks ┆ 600052 1737122 531551 1382666 … │
│ 13020841_

In [7]:
clicks_predictions_df.write_csv("out/clicks_predictions_2.csv")

### Predict next click to be some of the last n clicks

In [12]:
# Sessions
clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .filter(pl.col("type") == "clicks")
    .group_by("session")
    .agg(pl.col("aid").top_k_by("ts", 2).cast(str))
    # csv format
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "aid": "labels"})
)
print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_3.csv")

# Last 1 click. Score: 0.1644639159089166
# Last 2 clicks. Score: 0.22302865731990582,
# last 3 clicks. Score: 0.23140731446809437
# last 5 clicks. Score: 0.23902812614120425
# last 20 clicks. Score: 0.2455888699745515

shape: (919_046, 2)
┌─────────────────┬─────────────────┐
│ session_type    ┆ labels          │
│ ---             ┆ ---             │
│ str             ┆ str             │
╞═════════════════╪═════════════════╡
│ 13670735_clicks ┆ 216067 1095230  │
│ 14448557_clicks ┆ 379278 735057   │
│ 13628916_clicks ┆ 1533194         │
│ 13817030_clicks ┆ 553430 1126011  │
│ 13123349_clicks ┆ 803262          │
│ …               ┆ …               │
│ 12903482_clicks ┆ 1581819         │
│ 13115301_clicks ┆ 1703092 774159  │
│ 13130527_clicks ┆ 1026107         │
│ 14381410_clicks ┆ 1713635 1595116 │
│ 14480369_clicks ┆ 1476447 390794  │
└─────────────────┴─────────────────┘


### Predict next click to be some of the last n carts

In [31]:
# Sessions
clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .filter(pl.col("type") == "carts")
    .group_by("session")
    .agg(pl.col("aid").top_k_by("ts", 4).cast(str))
    # csv format
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "aid": "labels"})
)
print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_3.csv")

# last 1 cart. Score: 0.02698250430151718
# last 2 carts. Score: 0.03000243838498779
# last 3 carts. Score: 0.031032741900955778
# last 4 carts. Score: 0.031404795948388665
# last 5 carts. Score: 0.0316417657570613
# last 10 carts. Score: 0.031927961178163516

shape: (124_519, 2)
┌─────────────────┬────────────────┐
│ session_type    ┆ labels         │
│ ---             ┆ ---            │
│ str             ┆ str            │
╞═════════════════╪════════════════╡
│ 14318530_clicks ┆ 1447165        │
│ 13078511_clicks ┆ 1135056        │
│ 13221912_clicks ┆ 152547         │
│ 13085320_clicks ┆ 1064724        │
│ 13268974_clicks ┆ 1756871        │
│ …               ┆ …              │
│ 14332907_clicks ┆ 1485952        │
│ 13566765_clicks ┆ 1285845        │
│ 14194642_clicks ┆ 585652 17047   │
│ 13800485_clicks ┆ 617783 617783  │
│ 14561119_clicks ┆ 306708 1209390 │
└─────────────────┴────────────────┘


### Predict next click to be some of the last n orders

In [41]:
# Sessions
clicks_predictions_df = (
    test_sessions_df
    .explode("events")
    .unnest("events")
    # Convert ts to seconds and cast to UInt32 to save memory
    .with_columns((pl.col("ts")//1000).cast(pl.UInt32))
    .filter(pl.col("type") == "orders")
    .group_by("session")
    .agg(pl.col("aid").top_k_by("ts", 3).cast(str))
    # csv format
    .with_columns(
        (pl.col("session").cast(str) + "_clicks"),
        pl.col("aid").list.join(separator=' ')
    )
    .rename({"session": "session_type", "aid": "labels"})
)
print(clicks_predictions_df)

clicks_predictions_df.write_csv("out/clicks_predictions_3.csv")


# last 1 order. Score: 0.0027497656059501173
# last 3 orders. Score: 0.003035961027052336
# last 5 orders. Score: 0.003066870132531376
# last 10 carts. Score: 0.003080607512744282

shape: (11_073, 2)
┌─────────────────┬───────────────────────┐
│ session_type    ┆ labels                │
│ ---             ┆ ---                   │
│ str             ┆ str                   │
╞═════════════════╪═══════════════════════╡
│ 13942582_clicks ┆ 811735                │
│ 13756329_clicks ┆ 1098508 538167        │
│ 12906549_clicks ┆ 367530 1150072        │
│ 13390602_clicks ┆ 963551                │
│ 13662220_clicks ┆ 684886                │
│ …               ┆ …                     │
│ 12916743_clicks ┆ 589558                │
│ 13585099_clicks ┆ 440769                │
│ 13009533_clicks ┆ 1071837               │
│ 13133787_clicks ┆ 1384693               │
│ 13932090_clicks ┆ 303762 1038926 410592 │
└─────────────────┴───────────────────────┘
