In [None]:
import warnings
import pandas as pd
import numpy as np
import time
import pickle
import torch
import torch.nn as nn
import shutil
import os
from xgboost import XGBClassifier

!pip install polars==0.18.2
import polars as pl

!pip install catboost
from catboost import CatBoostClassifier, Pool

warnings.filterwarnings("ignore")

In [None]:
# Relative path: assumes the kernel's current directory contains `drive/`.
# Every path used below ("models/...", "data/...") is relative to this project
# directory. Re-running this cell after it has succeeded will normally fail,
# because the working directory has already been changed.
%cd drive/My\ Drive/performance_prediction

In [None]:
# Cast expressions, one per raw column, built straight from a
# column -> polars dtype mapping.
dtypes = [
    pl.col(column).cast(dtype)
    for column, dtype in {
        "session_id": pl.Int64,
        "elapsed_time": pl.Int64,
        "level": pl.Int8,
        "page": pl.Float32,
        "room_coor_x": pl.Float32,
        "room_coor_y": pl.Float32,
        "screen_coor_x": pl.Float32,
        "screen_coor_y": pl.Float32,
        "hover_duration": pl.Float32,
        "fullscreen": pl.Int8,
        "hq": pl.Int8,
        "music": pl.Int8,
    }.items()
]

# Null replacements: -1 for a missing page and a sentinel string for each of
# the two fqid columns.
fills = [
    pl.col(column).fill_null(placeholder)
    for column, placeholder in (
        ("page", -1),
        ("fqid", "fqid_None"),
        ("text_fqid", "text_fqid_None"),
    )
]

In [None]:
# Categorical event columns; the GBDT features count distinct values of each.
CATS = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']

# Numerical event columns; the GBDT features take mean/max/min/std/sum of each.
NUMS = ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'et_diff', 'page']

In [None]:
# To predict using a simplified model, set ONLY_TR_FEATURES=True.
ONLY_TR_FEATURES = False

# Series of the "point" ids that point_filter_tr keeps.
most_common = pl.read_parquet("models/torch_models/most_common.parquet").to_series()

# Per-point normalisation statistics (joined on "point" before the transformer).
norms = pl.read_parquet("models/torch_models/norms.parquet")

# NOTE: pickle.load can execute arbitrary code contained in the file. Only
# load artifacts that were produced by a trusted training run.
# Each file is opened in a `with` block so the handle is closed even when
# unpickling raises.

# Mapping used to turn each "point" into the integer fed to the embedding.
with open('models/torch_models/point_ixs.pkl', 'rb') as f_read:
    point_ixs = pickle.load(f_read)

# (level group, categorical column) -> values that get their own GBDT features.
with open('models/xgb_models/grp_cat_vals.pkl', 'rb') as f_read:
    grp_cat_vals = pickle.load(f_read)

# Ordering of the points per level group, used by pad_tr.
point_pos = pl.read_parquet("models/xgb_models/point_pos.parquet")

# level group -> feature columns selected before calling the GBDT models.
f_name = "feature_dict_tr.pkl" if ONLY_TR_FEATURES else "feature_dict.pkl"
with open('models/xgb_models/' + f_name, 'rb') as f_read:
    feature_dict = pickle.load(f_read)

In [None]:
def feature_engineer_base(df):
    """
    Add the two base columns that feature_engineer_xgb and
    feature_engineer_tr build on:

    distance -- running sum, within each session, of the Euclidean step
                between consecutive screen coordinates (a null per-axis
                difference counts as 0)
    et_diff  -- elapsed_time minus the previous row's elapsed_time within
                the session, nulls replaced by 0 and clipped to [0, 1e9]
    """
    step_y = (pl.col("screen_coor_y") - pl.col("screen_coor_y").shift(1)).fill_null(0)
    step_x = (pl.col("screen_coor_x") - pl.col("screen_coor_x").shift(1)).fill_null(0)
    step_length = (step_y ** 2 + step_x ** 2) ** (1/2)
    distance = (
        step_length
        .cumsum()
        .fill_null(0)
        .over(["session_id"])
        .alias("distance")
    )

    time_step = pl.col("elapsed_time") - pl.col("elapsed_time").shift(1)
    et_diff = (
        time_step
        .fill_null(0)
        .clip(0, 1e9)
        .over(["session_id"])
        .alias("et_diff")
    )

    return df.with_columns([distance, et_diff])


def feature_engineer_xgb(df, grp):
    """
    Build the per-session aggregate features that only the XGBoost and
    CatBoost models consume.

    df must already contain et_diff; grp selects, through grp_cat_vals,
    which categorical values get dedicated feature columns.
    Returns one row per session_id, sorted by session_id.
    """

    def value_features(column):
        # For each tracked value of `column`: how often it occurs, then the
        # mean and the max of et_diff over the rows holding that value.
        tracked = grp_cat_vals[(grp, column)]
        occurrences = [((pl.col(column) == v).sum()).alias(f"{v}_num") for v in tracked]
        et_means = [pl.col("et_diff").filter(pl.col(column) == v).mean().alias(f"{v}_et_mean") for v in tracked]
        et_maxes = [pl.col("et_diff").filter(pl.col(column) == v).max().alias(f"{v}_et_max") for v in tracked]
        return occurrences + et_means + et_maxes

    aggs = []
    for column in ("fqid", "room_fqid", "text_fqid", "event_name", "name"):
        aggs += value_features(column)

    # number of distinct non-null values of each categorical
    aggs += [pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique") for c in CATS]

    # summary statistics of the numericals
    aggs += [pl.col(n).mean().alias(f"{n}_mean") for n in NUMS]
    aggs += [pl.col(n).max().alias(f"{n}_max") for n in NUMS]
    aggs += [pl.col(n).min().alias(f"{n}_min") for n in NUMS]
    aggs += [pl.col(n).std().alias(f"{n}_std") for n in NUMS]
    aggs += [pl.col(n).sum().alias(f"{n}_sum") for n in NUMS]

    # hq, music, fullscreen: value taken from the first row of the session
    for setting in ("hq", "music", "fullscreen"):
        aggs.append(pl.col(setting).apply(lambda x: x[0]))

    per_session = (
        df.groupby(["session_id"], maintain_order=True)
        .agg(aggs)
        .sort("session_id")
    )

    # weekday and hour are read from fixed digit positions of session_id
    return per_session.with_columns([
        pl.col("session_id").apply(lambda sid: int(str(sid)[4:6])).alias("weekday"),
        pl.col("session_id").apply(lambda sid: int(str(sid)[6:8])).alias("hour"),
    ])


def point_filter_tr(df):
    """
    Tag every row with its "point" id and keep only the rows whose point is
    listed in most_common (described by the original note as the points
    present in more than 0.999 of sessions).

    A point is the string concatenation of the event's categorical columns
    followed by dup_count, the running count of identical events seen so
    far in the session.
    """
    parts = ["event_name", "level", "name", "page", "fqid", "room_fqid", "text_fqid"]

    dup_count = (
        pl.col("index")
        .cumcount()
        .over(["session_id"] + parts)
        .cast(pl.Utf8)
        .alias("dup_count")
    )
    df = df.with_columns(dup_count)

    # left-to-right concatenation of the parts, with dup_count last
    point = pl.col(parts[0])
    for part in parts[1:] + ["dup_count"]:
        point = point + pl.col(part)
    df = df.with_columns(point.alias("point"))

    return df.filter(pl.col("point").is_in(most_common))


def pad_tr(df, grp):
    """
    Give every session the same rows in the same order, so the transformer
    features have constant length when handed to the GBDT models.

    Each session is left-joined onto a fixed template of points; points the
    session never produced end up as rows filled with 0.
    """
    # level groups whose points are available when predicting grp
    visible_groups = {
        "0-4": ["0-4"],
        "5-12": ["0-4", "5-12"],
        "13-22": ["0-4", "5-12", "13-22"],
    }[grp]

    # template: at most the last 256 of those points, in point_pos order
    candidates = point_pos.filter(pl.col("level_group").is_in(visible_groups))
    n_keep = min(256, candidates.shape[0])
    template = candidates[-n_keep:]

    def align_session(session_rows):
        """
        Reorder the rows of a single session to follow the template.
        """
        session_id = session_rows["session_id"][0]
        skeleton = template.with_columns(pl.lit(session_id).alias("session_id"))
        return skeleton.join(session_rows, on="point", how="left")

    aligned = df.groupby("session_id", maintain_order=True).apply(align_session)
    return aligned.fill_null(0)


def feature_engineer_tr(df):
    """
    Produce the numerical inputs of the transformer:

    et_diff     -- step in elapsed_time (clipped to [0, 1e9] before differencing)
    ix_diff     -- step in index
    dist_diff   -- step in the cumulative distance column
    room_coor_x, room_coor_y -- unchanged apart from nulls becoming 0

    A step is the difference to the previous row of the same session; the
    first row of a session gets 0.
    """

    def step(expr, name):
        return (expr - expr.shift(1)).fill_null(0).over("session_id").alias(name)

    clipped_time = pl.col("elapsed_time").clip(0, 1e9)
    new_columns = [
        step(clipped_time, "et_diff"),
        step(pl.col("index"), "ix_diff"),
        step(pl.col("distance"), "dist_diff"),
        pl.col("room_coor_x").fill_null(0),
        pl.col("room_coor_y").fill_null(0),
    ]
    return df.with_columns(new_columns)


def flatten_tr(df):
    """
    Turn the per-row transformer features into one wide row per session so
    the GBDT models can use them.

    For every session the values are laid out column after column
    (all et_diff, then all ix_diff, ...), and session_id is restored as an
    Int64 column.
    """
    feature_cols = ["et_diff", "ix_diff", "dist_diff", "room_coor_x", "room_coor_y"]

    # one row per session, each feature collected into a list
    per_session = (
        df.select(["session_id"] + feature_cols)
        .groupby("session_id")
        .agg([pl.col(name) for name in feature_cols])
    )

    # session id (as text) -> all feature lists concatenated
    flat = {
        str(row[0]): [value for column_values in row[1:] for value in column_values]
        for row in per_session.iter_rows()
    }

    # sessions become rows again; the former keys land in the "column" header
    wide = pl.DataFrame(flat).transpose(include_header=True)
    return wide.with_columns(pl.col("column").cast(pl.Int64)).rename({"column": "session_id"})


def xgb_data_pipe(df, grp):
    """
    Full feature pipeline for the GBDT models (training and inference).

    Returns the flattened transformer features alone when ONLY_TR_FEATURES
    is set, otherwise the aggregate features inner-joined with them on
    session_id.
    """
    base = feature_engineer_base(df)

    # page and level must be strings to be concatenated into point ids
    as_text = base.with_columns([
        pl.col("page").cast(pl.Utf8),
        pl.col("level").cast(pl.Utf8),
    ])
    tr_features = flatten_tr(
        pad_tr(feature_engineer_tr(point_filter_tr(as_text)), grp)
    )

    if ONLY_TR_FEATURES:
        return tr_features

    gbdt_features = feature_engineer_xgb(base, grp)
    return gbdt_features.join(tr_features, on="session_id", how="inner")


def tr_data_pipe(df):
    """
    Full feature pipeline for the transformer (training and inference).

    Returns the filtered events with the three difference features
    compressed by a signed log1p, restricted to the columns the model needs.
    """
    # page and level must be strings to be concatenated into point ids
    events = feature_engineer_base(df).with_columns([
        pl.col("page").cast(pl.Utf8),
        pl.col("level").cast(pl.Utf8),
    ])
    events = feature_engineer_tr(point_filter_tr(events))

    # sign(x) * log1p(|x|), written back under the same column names
    squashed = [
        np.sign(pl.col(name)) * np.log1p(np.absolute(pl.col(name)))
        for name in ("et_diff", "ix_diff", "dist_diff")
    ]
    events = events.with_columns(squashed)

    keep = ["session_id", "level_group", "et_diff", "ix_diff", "dist_diff", "room_coor_x", "room_coor_y", "point"]
    return events.select(keep)

In [None]:

# Use the GPU when torch can see one, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Presumably the sequence length per level group ("0-4", "5-12", "13-22");
# the last entry sizes the point embedding when the models are built.
seq_lens = [76, 249, 452]
max_seq_len = 256


class NN(nn.Module):
    """
    Small transformer encoder with one linear classification head per
    level group.

    The last feature of every input row is an integer id that is looked up
    in an embedding table; the remaining features are continuous and go
    through a linear layer. Both halves are concatenated into one token.
    """

    def __init__(self, num_cont_cols, embed_dim, num_layers, num_heads, max_seq_len):
        super().__init__()
        half_dim = embed_dim // 2

        # continuous features -> first half of the token
        self.emb_cont = nn.Sequential(
            nn.Linear(num_cont_cols, half_dim),
            nn.LayerNorm(half_dim),
        )
        # integer id (0 .. max_seq_len) -> second half of the token
        self.emb_cats = nn.Sequential(
            nn.Embedding(max_seq_len + 1, half_dim),
            nn.LayerNorm(half_dim),
        )

        layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim,
            dropout=0.1,
            activation="relu",
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=num_layers)

        # output sizes of the heads for "0-4", "5-12" and "13-22"
        head_sizes = (3, 10, 5)
        self.clf_heads = nn.ModuleList(
            [nn.Linear(embed_dim, n_out) for n_out in head_sizes]
        )

    def forward(self, x, grp):
        """
        x is indexed as (batch, sequence, features); grp is one of
        "0-4", "5-12", "13-22" and selects the head.
        Returns the head output with a trailing axis of size 1.
        """
        continuous, ids = x[:, :, :-1], x[:, :, -1]
        tokens = torch.cat(
            [self.emb_cont(continuous), self.emb_cats(ids.type(torch.int32))],
            dim=2,
        )
        # average over the sequence axis after encoding
        pooled = self.encoder(tokens).mean(dim=1)
        head = self.clf_heads[["0-4", "5-12", "13-22"].index(grp)]
        return head(pooled).unsqueeze(2)


In [None]:
# load the models
#
# ONLY_TR_FEATURES=True : one XGBoost model per (level group, fold).
# ONLY_TR_FEATURES=False: transformers per (fold, seed), XGBoost and CatBoost
#                         models per (level group, fold, seed), and one
#                         linear meta model per (fold, question).

num_folds = 5
if ONLY_TR_FEATURES:
    models_xgb = {}
    for grp in ["0-4", "5-12", "13-22"]:
        for f in range(num_folds):
            clf = XGBClassifier()
            clf.load_model(f"models/xgb_models/model_tr_{f}_grp_{grp}_s0.xgb")
            models_xgb[(grp, f)] = clf
else:
    num_seeds = 3
    models_tr = {}
    for f in range(num_folds):
        for s in range(num_seeds):

            model = NN(
                num_cont_cols=5,
                embed_dim=64,
                num_layers=1,
                num_heads=8,
                max_seq_len=seq_lens[-1],
            )

            # If the checkpoint is present as a directory, pack it into
            # "<path>.zip"; the .zip file is what torch.load reads below.
            model_path = f"models/torch_models/model_{f}_s{s}"
            if os.path.exists(model_path):
                shutil.make_archive(model_path, 'zip', model_path)
            model_path += ".zip"

            # The models are never moved off the CPU, so map the stored
            # tensors to CPU; this also lets GPU-saved checkpoints load on a
            # machine without CUDA.
            model.load_state_dict(torch.load(model_path, map_location="cpu"))
            model.eval()
            models_tr[(f, s)] = model

    models_xgb = {}
    models_cat = {}
    for grp in ["0-4", "5-12", "13-22"]:
        for f in range(num_folds):
            for s in range(num_seeds):
                clf = XGBClassifier()
                clf.load_model(f"models/xgb_models/model_{f}_grp_{grp}_s{s}.xgb")
                models_xgb[(grp, f, s)] = clf

                clf = CatBoostClassifier()
                clf.load_model(f"models/cat_models/model_{f}_grp_{grp}_s{s}.cbm")
                models_cat[(grp, f, s)] = clf

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    models_lin_reg = {}
    for f in range(num_folds):
        for q in range(1, 19):
            # `with` closes the file instead of leaking one handle per model
            with open(f"models/meta_models/lin_reg_{f}_q{q}.sav", 'rb') as sav_file:
                lin_reg = pickle.load(sav_file)
            models_lin_reg[(f, q)] = lin_reg

In [None]:
# Raw test events and the sample submission, read relative to the current
# working directory. `sample_submission` is not referenced by the cells
# visible in this notebook.
test = pl.read_csv("data/test.csv")
sample_submission = pl.read_csv("data/sample_submission.csv")

In [None]:
# Sort by session and, within a session, by level group in ascending level
# order. The labels do not sort that way as strings, so they are mapped to
# ranks, sorted, and mapped back.
grp_map = {"0-4": 1, "5-12": 2, "13-22": 3}
test = (
    test
    .with_columns(pl.col("level_group").apply(lambda label: grp_map[label]))
    .sort(by=["session_id", "level_group"])
)
grp_map = {1: "0-4", 2: "5-12", 3: "13-22"}
test = test.with_columns(pl.col("level_group").apply(lambda rank: grp_map[rank]))

# one frame per (session, level group), in the sorted order
iter_test = [
    group_frame
    for _, session_frame in test.groupby(["session_id"], maintain_order=True)
    for _, group_frame in session_frame.groupby(["level_group"], maintain_order=True)
]

In [None]:
# Half-open question-number range [a, b) that is predicted after each level group.
limits = {'0-4':(1, 4), '5-12':(4, 14), '13-22':(14, 19)}
# Number of rows the transformer input is padded / truncated to, per level group.
pad_lens = {'0-4': 76, '5-12': 249, '13-22': 256}
# Probability cut-off for predicting "correct"; it does not change during the
# run, so it is set once here rather than on every iteration.
threshold = 0.625 if ONLY_TR_FEATURES else 0.62

predictions = []
sids_qs = []

# State carried from one level group of a session to the next.
dfs = {}          # session_id -> typed events of the level groups seen so far
prev_probas = {}  # (session_id, fold, model kind) -> base-model probabilities so far

# The loop variable is deliberately not called `test`: reusing that name would
# replace the full test frame with a single group and break a re-run of the
# sorting cell.
for test_grp in iter_test:
    test_grp = test_grp.sort(by='index')
    grp = test_grp["level_group"][0]
    sess_id = test_grp["session_id"][0]

    df = test_grp.with_columns(dtypes).with_columns(fills)

    # use data from previous level groups
    if grp == "0-4":
        dfs[sess_id] = df
    elif grp == "5-12":
        df = pl.concat([dfs[sess_id], df])
        dfs[sess_id] = df
    else:
        df = pl.concat([dfs[sess_id], df])

    # drop hovers and create new index for each session, from 0 to len(session)
    df = df.filter(pl.col("hover_duration").is_null())
    df = df.with_columns(pl.arange(0, pl.col("index").count()).over(["session_id"]).alias("new_index"))
    df = df.drop("index").rename({"new_index": "index"})

    # gbdt feature engineering
    df_xgb = xgb_data_pipe(df, grp)

    df_xgb = df_xgb.to_pandas().set_index('session_id')
    df_xgb = df_xgb[feature_dict[grp]]

    # use question number as a input feature since one model is trained per level group
    df_qs = []
    a, b = limits[grp]
    for q in range(a, b):
        df_xgb["q"] = q
        df_qs.append(df_xgb.copy())
    df_xgb = pd.concat(df_qs)

    # transformer feature engineering
    df_tr = tr_data_pipe(df)

    # normalize the input
    df_tr = df_tr.join(norms, on="point", how="left")
    df_tr = df_tr.with_columns([
        (pl.col("et_diff") - pl.col("et_mean")) / pl.col("et_std"),
        (pl.col("ix_diff") - pl.col("ix_mean")) / pl.col("ix_std"),
        (pl.col("dist_diff") - pl.col("dist_mean")) / pl.col("dist_std"),
        (pl.col("room_coor_x") - pl.col("room_coor_x_mean")) / pl.col("room_coor_x_std"),
        (pl.col("room_coor_y") - pl.col("room_coor_y_mean")) / pl.col("room_coor_y_std"),
    ]).fill_nan(0)
    # replace some outlier values with 0
    df_tr = df_tr.with_columns([pl.when(pl.col("ix_diff").abs() > 7.5).then(0).otherwise(pl.col("ix_diff")).keep_name()])
    # map point categoricals to integer index, so transformers can handle it
    df_tr = df_tr.with_columns([pl.col("point").map_dict(point_ixs)])

    # named x_tr rather than `input` so the builtin input() stays usable in the kernel
    x_tr = df_tr.select(["et_diff", "ix_diff", "dist_diff","room_coor_x", "room_coor_y", "point"]).to_numpy()

    # left-pad with zero rows, or keep only the last pad_len rows
    pad_len = pad_lens[grp]
    seq_len = x_tr.shape[0]
    if seq_len < pad_len:
        x_tr = np.pad(x_tr, ((pad_len - seq_len, 0), (0, 0)), 'constant')
    if seq_len > pad_len:
        x_tr = x_tr[-pad_len:]

    # add a batch axis of size 1
    x_tr = torch.tensor(x_tr, dtype=torch.float).unsqueeze(0)

    # predict

    # folds average
    probas = np.zeros(b - a)
    for f in range(num_folds):

        if ONLY_TR_FEATURES:
            probas += models_xgb[(grp, f)].predict_proba(df_xgb)[:, 1] / num_folds
            continue


        # seeds average
        probas_seeds_avg_xgb = 0
        probas_seeds_avg_cat = 0
        probas_seeds_avg_tr = 0
        for s in range(num_seeds):
            probas_xgb = models_xgb[(grp, f, s)].predict_proba(df_xgb)[:, 1]
            probas_cat = models_cat[(grp, f, s)].predict_proba(df_xgb)[:, 1]
            # inference only: do not record an autograd graph
            with torch.no_grad():
                out_tr = models_tr[(f, s)](x_tr, grp)[0, :, 0]
                probas_tr = torch.sigmoid(out_tr).numpy()
            probas_seeds_avg_xgb += probas_xgb / num_seeds
            probas_seeds_avg_cat += probas_cat / num_seeds
            probas_seeds_avg_tr += probas_tr / num_seeds

        # get previous predictions: the meta models see the probabilities of
        # all questions up to and including the current level group
        if grp == "0-4":
            prev_probas[(sess_id, f, "xgb")] = probas_seeds_avg_xgb
            prev_probas[(sess_id, f, "cat")] = probas_seeds_avg_cat
            prev_probas[(sess_id, f, "tr")] = probas_seeds_avg_tr
        elif grp == "5-12":
            probas_seeds_avg_xgb = np.concatenate([prev_probas[(sess_id, f, "xgb")], probas_seeds_avg_xgb])
            probas_seeds_avg_cat = np.concatenate([prev_probas[(sess_id, f, "cat")], probas_seeds_avg_cat])
            probas_seeds_avg_tr = np.concatenate([prev_probas[(sess_id, f, "tr")], probas_seeds_avg_tr])
            prev_probas[(sess_id, f, "xgb")] = probas_seeds_avg_xgb
            prev_probas[(sess_id, f, "cat")] = probas_seeds_avg_cat
            prev_probas[(sess_id, f, "tr")] = probas_seeds_avg_tr
        else:
            probas_seeds_avg_xgb = np.concatenate([prev_probas[(sess_id, f, "xgb")], probas_seeds_avg_xgb])
            probas_seeds_avg_cat = np.concatenate([prev_probas[(sess_id, f, "cat")], probas_seeds_avg_cat])
            probas_seeds_avg_tr = np.concatenate([prev_probas[(sess_id, f, "tr")], probas_seeds_avg_tr])

        # single feature row for the meta models: [xgb | cat | transformer]
        probas_seeds_avg = np.concatenate([probas_seeds_avg_xgb, probas_seeds_avg_cat, probas_seeds_avg_tr])
        probas_seeds_avg = probas_seeds_avg.reshape(1, -1)

        # linear regression prediction, averaged over base-model folds (f)
        # and meta-model folds (f1)
        for q in range(a, b):
            for f1 in range(num_folds):
                probas[q-a] += models_lin_reg[(f1, q)].predict(probas_seeds_avg) / num_folds / num_folds

    preds = (probas > threshold).astype(np.int32)
    for q in range(a, b):
        sids_qs.append(str(sess_id) + f'_q{q}')
        predictions.append(preds[q-a])

In [None]:
# One row per "<session_id>_q<question>" key with its 0/1 prediction.
# Note that the "session_id" column holds these combined keys, not the bare session id.
pred_df = pl.DataFrame({"session_id": sids_qs, "correct": predictions})

In [None]:
# Sanity check: (rows, columns) of the prediction table, then its first 40 rows.
print(pred_df.shape)
pred_df.head(40)

In [None]:
# Share of question predictions that are 1 ("correct").
print(pred_df["correct"].mean())