In [None]:
import warnings
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import pickle
import gc
import os.path
import copy
import shutil
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from scipy.optimize import minimize

!pip install catboost
from catboost import CatBoostClassifier, Pool

!pip install polars==0.18.2
import polars as pl

!pip install transformers
from transformers import get_linear_schedule_with_warmup

warnings.filterwarnings("ignore")

In [None]:
%cd drive/My\ Drive/performance_prediction

In [None]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(0)

def load_data(n_rows=None):
    """
    Read the Kaggle training events and the parsed raw events, normalise
    their dtypes, fill nulls and stack them into a single frame.

    Parameters:
    n_rows: int or None
    Passed to both ``pl.read_parquet`` calls; None reads every row.

    Returns:
    df: DataFrame
    The Kaggle rows followed by the raw rows whose session_id does not
    occur in the Kaggle file.

    orig_ids: Series
    The session ids of the Kaggle dataset, in order of first appearance.
    """

    # target dtype of every numeric / flag column
    column_types = {
        "session_id": pl.Int64,
        "elapsed_time": pl.Int64,
        "level": pl.Int8,
        "page": pl.Float32,
        "room_coor_x": pl.Float32,
        "room_coor_y": pl.Float32,
        "screen_coor_x": pl.Float32,
        "screen_coor_y": pl.Float32,
        "hover_duration": pl.Float32,
        "fullscreen": pl.Int8,
        "hq": pl.Int8,
        "music": pl.Int8,
    }
    cast_exprs = [pl.col(name).cast(dtype) for name, dtype in column_types.items()]

    # sentinel values for the columns that are allowed to be missing
    null_fill_exprs = [
        pl.col("page").fill_null(-1),
        pl.col("fqid").fill_null("fqid_None"),
        pl.col("text_fqid").fill_null("text_fqid_None")
    ]

    categorical_cols = ["fqid", "text_fqid", "room_fqid", "text", "event_name", "name"]
    categorical_exprs = [pl.col(name).cast(pl.Categorical) for name in categorical_cols]

    df = pl.read_parquet("data/train.parquet", n_rows=n_rows)
    df = df.with_columns(cast_exprs).with_columns(null_fill_exprs)

    # keep only raw sessions that are not already part of the Kaggle file
    kaggle_sessions = df["session_id"].unique()
    df_raw = pl.read_parquet("data/raw_train.parquet", n_rows=n_rows)
    df_raw = df_raw.filter(~pl.col("session_id").is_in(kaggle_sessions))
    df_raw = df_raw.with_columns(cast_exprs).with_columns(null_fill_exprs)

    orig_ids = df["session_id"].unique(maintain_order=True)

    # the string columns are cast to Categorical after stacking to reduce memory usage
    df = pl.concat([df, df_raw]).with_columns(categorical_exprs)

    return df, orig_ids



def preprocess(df):
    """
    Order the events, drop hover rows, rebuild the per-session index and
    discard old sessions.

    Returns the filtered DataFrame. The original "index" column is replaced
    by a 0-based row counter per session, and a "year_month" column (the
    first four digits of session_id) is added.
    """

    # level_group is temporarily replaced by its rank so that sorting follows
    # game order instead of string order, then mapped back to the labels
    group_rank = {"0-4": 1, "5-12": 2, "13-22": 3}
    rank_group = {1: "0-4", 2: "5-12", 3: "13-22"}
    df = df.with_columns(pl.col("level_group").apply(lambda label: group_rank[label]))
    df = df.sort(by=["session_id", "level_group", "index"])
    df = df.with_columns(pl.col("level_group").apply(lambda rank: rank_group[rank]))

    # keep only rows without a hover duration, then renumber each session from 0
    df = df.filter(pl.col("hover_duration").is_null())
    row_number = pl.arange(0, pl.col("index").count()).over(["session_id"]).alias("new_index")
    df = df.with_columns(row_number)
    df = df.drop("index").rename({"new_index": "index"})

    # remove old sessions: keep ids whose four leading digits exceed 1911
    # (presumably YYMM, judging by the column name - confirm against the data)
    df = df.with_columns([
        pl.col("session_id").apply(lambda sid: int(str(sid)[:4])).alias("year_month"),
    ])
    return df.filter(pl.col("year_month") > 1911)


def split_data(df):
    """
    Build one training frame per level group.

    Returns:
    df1: DataFrame
    rows of level group 0-4, for sessions that contain all 5 of its levels

    df2: DataFrame
    rows of level groups 0-4 and 5-12, for sessions that contain all 13 levels

    df3: DataFrame
    all rows, for sessions that contain all 23 levels

    half_group_ids: Series
    session ids that contain all levels 0-12 but not all of 13-22
    """

    def sessions_with_levels(frame, n_levels):
        # keep the sessions that show exactly n_levels distinct levels in frame
        return frame.filter((pl.col("level").n_unique() == n_levels).over("session_id"))

    first_group = df.filter(pl.col("level_group") == '0-4')
    first_two_groups = df.filter(pl.col("level_group").is_in(["0-4", '5-12']))

    df1 = sessions_with_levels(first_group, 5)
    df2 = sessions_with_levels(first_two_groups, 13)
    df3 = sessions_with_levels(df, 23)

    complete_ids = df3["session_id"].unique()
    half_group_ids = df2.filter(~pl.col("session_id").is_in(complete_ids))["session_id"].unique()

    return df1, df2, df3, half_group_ids


In [None]:
# Load every row (n_rows=None), clean and re-index the events, then derive the
# three per-level-group frames plus the ids of sessions lacking levels 13-22.
df, orig_ids = load_data(n_rows=None)
df = preprocess(df)
df1, df2, df3, half_group_ids = split_data(df)

In [None]:
# The combined frame is not needed once the splits exist: unbind it and run
# the garbage collector to reclaim its memory.
del df
_ = gc.collect()

In [None]:
# Number of sessions in each split, and the shape of the half-session id series.
df1["session_id"].n_unique(), df2["session_id"].n_unique(), df3["session_id"].n_unique(), half_group_ids.shape

In [None]:
# Categorical event columns used by the feature engineering.
CATS = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']

# Numerical columns summarised per session ('et_diff' is created by feature_engineer_base).
NUMS = ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'et_diff', 'page']

# When True, lookup tables, feature lists and models are written under models/.
SAVE_FOR_SUB = True
# To train a simplified model, set ONLY_TR_FEATURES=True,
# and run only the XGBoost training cell out of the models.
ONLY_TR_FEATURES = False

# For each (level group, categorical column) pair, the distinct values that
# occur in that level group's frame, in order of first appearance. Feature
# engineering only creates per-value features for values listed here.
#
# A comprehension is used on purpose: a plain `for grp, df in ...` loop would
# leave the module-level name `df` bound to the last frame after the cell has
# run, which keeps that frame alive even after `del df1, df2, df3`.
grp_cat_vals = {
    (grp, cat): grp_df[cat].unique(maintain_order=True)
    for cat in CATS
    for grp, grp_df in [
        ("0-4", df1),
        ("5-12", df2),
        ("13-22", df3),
    ]
}

# Work on a copy of the full-game frame in which page and level are strings,
# so that they can be concatenated with the categorical columns below.
df_temp = df3.with_columns([pl.col(c).cast(pl.Utf8) for c in ["page", "level"]])

# identify points in the game with "point_columns"
point_columns = ["session_id", "event_name", "level", "name", "page", "fqid", "room_fqid", "text_fqid"]

# enumerate duplicate points: the n-th repetition of the same combination
# inside a session gets dup_count n, which makes every point unique per session
df_temp = df_temp.with_columns(
    pl.col("index").cumcount().over(point_columns).cast(pl.Utf8).alias("dup_count")
)
df_temp = df_temp.with_columns(
    (
        pl.col("event_name")
        + pl.col("level")
        + pl.col("name")
        + pl.col("page")
        + pl.col("fqid")
        + pl.col("room_fqid")
        + pl.col("text_fqid")
        + pl.col("dup_count")
    ).alias("point")
)

# identify points that are present in at least 0.999 of the sessions
min_sessions = 0.999 * df_temp["session_id"].n_unique()
most_common = (
    df_temp["point"]
    .value_counts(sort=True)
    .filter(pl.col("counts") >= min_sessions)["point"]
)
df_temp = df_temp.filter(pl.col("point").is_in(most_common))

# the order of occurrence of points in the game (mean index per point)
point_pos = (
    df_temp
    .groupby(["point", "level_group"])
    .agg(pl.col("index").mean())
    .sort(by="index")
)
# a map from points to integers, starting at 1
point_ixs = {point: ix for ix, point in enumerate(most_common, 1)}

del df_temp, min_sessions
_ = gc.collect()

# save for feature engineering during inference and/or training
if SAVE_FOR_SUB:
    # context managers guarantee the files are flushed and closed even if
    # pickling raises part-way through
    with open("models/xgb_models/grp_cat_vals.pkl", 'wb') as f_save:
        pickle.dump(grp_cat_vals, f_save)

    with open('models/torch_models/point_ixs.pkl', 'wb') as f_save:
        pickle.dump(point_ixs, f_save)

    # the list of retained points is stored next to both model families
    pl.DataFrame(most_common).write_parquet("models/xgb_models/most_common.parquet")

    pl.DataFrame(most_common).write_parquet("models/torch_models/most_common.parquet")

    point_pos.write_parquet("models/xgb_models/point_pos.parquet")


In [None]:
def feature_engineer_base(df):
    """
    Add the two base features that feature_engineer_xgb and
    feature_engineer_tr build upon:

    distance: cumulative Euclidean distance between consecutive screen
    coordinates within a session (a missing coordinate difference counts as 0)

    et_diff: elapsed time since the previous event of the session, clipped to
    the range [0, 1e9]; 0 for the first event
    """

    def step(column):
        # change of `column` relative to the previous row, 0 where undefined
        return (pl.col(column) - pl.col(column).shift(1)).fill_null(0)

    step_length = (step("screen_coor_y") ** 2 + step("screen_coor_x") ** 2) ** (1/2)
    distance = (
        step_length
        .cumsum()
        .fill_null(0)
        .over(["session_id"])
        .alias("distance")
    )

    elapsed = pl.col("elapsed_time")
    et_diff = (
        (elapsed - elapsed.shift(1))
        .fill_null(0)
        .clip(0, 1e9)
        .over(["session_id"])
        .alias("et_diff")
    )

    return df.with_columns([distance, et_diff])


def feature_engineer_xgb(df, grp):
    """
    Create the per-session aggregate features that only xgboost and catboost use.

    df must already contain "et_diff" (see feature_engineer_base). grp selects
    which categorical values from grp_cat_vals get their own feature columns.
    Returns one row per session, sorted by session_id.
    """

    def value_aggs(column):
        # for every known value of `column`: its count, then the mean and the
        # max of et_diff over the rows carrying that value
        values = grp_cat_vals[(grp, column)]
        counts = [((pl.col(column) == v).sum()).alias(f"{v}_num") for v in values]
        means = [pl.col("et_diff").filter(pl.col(column) == v).mean().alias(f"{v}_et_mean") for v in values]
        maxima = [pl.col("et_diff").filter(pl.col(column) == v).max().alias(f"{v}_et_max") for v in values]
        return counts + means + maxima

    aggs = []
    for column in ["fqid", "room_fqid", "text_fqid", "event_name", "name"]:
        aggs.extend(value_aggs(column))

    # number of distinct non-null values of each categorical
    aggs.extend(pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique") for c in CATS)

    # stats of the numericals
    for stat in ["mean", "max", "min", "std", "sum"]:
        aggs.extend(getattr(pl.col(c), stat)().alias(f"{c}_{stat}") for c in NUMS)

    # whether or not hq, music, fullscreen are used (value of the first row)
    aggs.extend(pl.col(flag).apply(lambda values: values[0]) for flag in ["hq", "music", "fullscreen"])

    df = df.groupby(["session_id"], maintain_order=True).agg(aggs).sort("session_id")

    # weekday and hour are read from fixed digit positions of session_id
    weekday = pl.col("session_id").apply(lambda sid: int(str(sid)[4:6])).alias("weekday")
    hour = pl.col("session_id").apply(lambda sid: int(str(sid)[6:8])).alias("hour")

    return df.with_columns([weekday, hour])


def point_filter_tr(df):
    """
    Add the "dup_count" and "point" columns and keep only the rows whose
    point is listed in most_common (points seen in at least 0.999 of the
    sessions).
    """
    key_cols = ["event_name", "level", "name", "page", "fqid", "room_fqid", "text_fqid"]

    # number the repetitions of the same key combination inside a session
    dup_count = (
        pl.col("index")
        .cumcount()
        .over(["session_id"] + key_cols)
        .cast(pl.Utf8)
        .alias("dup_count")
    )
    df = df.with_columns(dup_count)

    # the point id is the concatenation of the key columns and dup_count
    point = pl.col(key_cols[0])
    for name in key_cols[1:] + ["dup_count"]:
        point = point + pl.col(name)
    df = df.with_columns(point.alias("point"))

    return df.filter(pl.col("point").is_in(most_common))


def pad_tr(df, grp):
    """
    Give every session the same rows in the same order, so that the
    transformer features have constant length when used by the gbdt models.

    Each session is left-joined onto the last (at most 256) points of
    point_pos that belong to the level groups seen when predicting grp;
    values of points a session does not have are filled with 0.
    """
    # the level group predicted mapped to the level groups seen/used
    seen_groups = {"0-4": ["0-4"], "5-12": ["0-4", "5-12"], "13-22": ["0-4", "5-12", "13-22"]}
    visible_points = point_pos.filter(pl.col("level_group").is_in(seen_groups[grp]))

    # order of occurrence of the points used, capped to the last 256
    n_points = min(256, visible_points.shape[0])
    template = visible_points[-n_points:]

    def align_session(session_df):
        """
        lays the rows of a single-session frame out in the order of template
        """
        session_id = session_df["session_id"][0]
        skeleton = template.with_columns(pl.lit(session_id).alias("session_id"))
        return skeleton.join(session_df, on="point", how="left")

    aligned = df.groupby("session_id", maintain_order=True).apply(align_session)
    return aligned.fill_null(0)


def feature_engineer_tr(df):
    """
    Generate the numerical transformer features, all computed within a session:
    et_diff   - difference of the clipped elapsed time to the previous row,
    ix_diff   - difference of the index to the previous row,
    dist_diff - difference of the cumulative distance to the previous row,
    room_coor_x / room_coor_y - with nulls replaced by 0
    """

    def session_diff(expr, name):
        # row-to-row change of expr inside each session; 0 for the first row
        return (expr - expr.shift(1)).fill_null(0).over("session_id").alias(name)

    clipped_time = pl.col("elapsed_time").clip(0, 1e9)

    return df.with_columns([
        session_diff(clipped_time, "et_diff"),
        session_diff(pl.col("index"), "ix_diff"),
        session_diff(pl.col("distance"), "dist_diff"),
        pl.col("room_coor_x").fill_null(0),
        pl.col("room_coor_y").fill_null(0),
    ])


def flatten_tr(df):
    """
    Flatten the transformer features so they can be used by the GBDT models:
    one row per session holding all et_diff values, then all ix_diff values,
    then dist_diff, room_coor_x and room_coor_y.
    """
    feature_cols = ["et_diff", "ix_diff", "dist_diff", "room_coor_x", "room_coor_y"]
    df = df.select(["session_id"] + feature_cols)

    # one list per feature and session
    per_session = df.groupby("session_id").agg([pl.col(c) for c in feature_cols])

    # session id (as string) -> concatenation of its feature lists
    flat = {
        str(row[0]): [value for values in row[1:] for value in values]
        for row in per_session.iter_rows()
    }

    # sessions are columns at this point; transpose them into rows
    wide = pl.DataFrame(flat).transpose(include_header=True)
    return wide.with_columns(pl.col("column").cast(pl.Int64)).rename({"column": "session_id"})


def xgb_data_pipe(df, grp):
    """
    Generate all features for training and inferring with the GBDT models.

    Returns only the flattened transformer features when ONLY_TR_FEATURES is
    set, otherwise the aggregate features inner-joined with them on session_id.
    """
    base = feature_engineer_base(df)

    # page and level must be strings to be combined into point ids
    as_text = base.with_columns([pl.col(c).cast(pl.Utf8) for c in ["page", "level"]])
    tr_features = flatten_tr(pad_tr(feature_engineer_tr(point_filter_tr(as_text)), grp))

    if ONLY_TR_FEATURES:
        return tr_features

    agg_features = feature_engineer_xgb(base, grp)
    return agg_features.join(tr_features, on="session_id", how="inner")


def xgb_fs(df):
    """
    Some feature selection for the GBDT models.

    Returns the data as a pandas DataFrame indexed by session_id, and the
    array of column names left after excluding features that
    - correlate (absolute value) above 0.99 with an earlier column,
    - are null in more than 0.75 of the rows, or
    - have a single unique value.
    Prints the number of drop entries and the shape of the feature array.
    """
    corr = df.corr().with_columns(pl.col("*").abs())
    # a column is dropped when it correlates with any column to its left
    drop = [name for pos, name in enumerate(corr.columns) if any(corr[name][:pos] > 0.99)]

    df = df.to_pandas().set_index('session_id')

    # share of nulls per column
    null_share = df.isnull().sum().sort_values(ascending=False) / len(df)
    drop += list(null_share[null_share > 0.75].index)

    # constant columns
    drop.extend(col for col in df.columns if df[col].nunique() == 1)

    excluded = set(drop) | {"level_group"}
    features = np.array([f for f in df.columns if f not in excluded])
    print(len(drop), features.shape)
    return df, features


def tr_data_pipe(df):
    """
    Generate all features for training and inferring with the transformer.

    Returns one row per retained point with the columns session_id,
    level_group, et_diff, ix_diff, dist_diff, room_coor_x, room_coor_y and
    point; the three diff features are transformed with sign(x) * log1p(|x|).
    """
    df = feature_engineer_base(df)

    # page and level must be strings to be combined into point ids
    df = df.with_columns([pl.col(c).cast(pl.Utf8) for c in ["page", "level"]])

    df = feature_engineer_tr(point_filter_tr(df))

    signed_log = [
        np.sign(pl.col(f)) * np.log1p(np.absolute(pl.col(f)))
        for f in ["et_diff", "ix_diff", "dist_diff"]
    ]
    df = df.with_columns(signed_log)

    output_cols = ["session_id", "level_group", "et_diff", "ix_diff",  "dist_diff", "room_coor_x", "room_coor_y", "point"]
    return df.select(output_cols)

In [None]:
%%time
# Build the GBDT feature table of each level group ...
df1_xgb = xgb_data_pipe(df1, "0-4")
df2_xgb = xgb_data_pipe(df2, "5-12")
df3_xgb = xgb_data_pipe(df3, "13-22")

# ... then convert each to pandas (indexed by session_id) and select its features.
df1_xgb, features1 = xgb_fs(df1_xgb)
df2_xgb, features2 = xgb_fs(df2_xgb)
df3_xgb, features3 = xgb_fs(df3_xgb)

# Persist the selected feature names per level group so the same columns can
# be picked again later; the file name depends on ONLY_TR_FEATURES.
if SAVE_FOR_SUB:
    feature_dict = {
        "0-4": features1,
        "5-12": features2,
        "13-22": features3
    }
    f_name = "feature_dict_tr.pkl" if ONLY_TR_FEATURES else "feature_dict.pkl"
    # the context manager closes the file even if pickling fails
    with open('models/xgb_models/' + f_name, 'wb') as f_save:
        pickle.dump(feature_dict, f_save)

df3_xgb.head()

In [None]:
%%time
# create the transformer input from the sessions that contain all levels
df_tr = tr_data_pipe(df3)
# use the incomplete sessions (levels 0-12 only) for the transformer as well
df_tr_h = tr_data_pipe(df2.filter(pl.col("session_id").is_in(half_group_ids)))

In [None]:
# The per-group event frames are not needed after feature generation: unbind
# them and collect. Any other name still bound to one of them keeps it alive.
del df1, df2, df3
_ = gc.collect()

In [None]:
# Cumulative number of retained points after each level group: seq_lens[k] is
# the number of points available once level groups 0..k have been played.
seq_lens = []
seq_len = 0
for g in ["0-4", "5-12", "13-22"]:
    seq_len += point_pos.filter(pl.col("level_group") == g).shape[0]
    print(f"seq_len {g}:", seq_len)
    seq_lens.append(seq_len)
# the same lengths capped at max_seq_len
max_seq_len = 256
trimmed_seq_lens = [min(p, max_seq_len) for p in seq_lens]

In [None]:
# Kaggle labels: session_id has the form "<session>_q<question number>";
# split it into an integer session column and an integer question column.
targets = pd.read_csv("data/train_labels.csv").assign(
    session=lambda t: t.session_id.apply(lambda sid: int(sid.split('_')[0])),
    q=lambda t: t.session_id.apply(lambda sid: int(sid.split('_')[-1][1:])),
)

In [None]:
# load targets from the kaggle data and raw data, and put them in the same format

targets_raw = pl.read_parquet("data/raw_labels.parquet")
# drop raw sessions that already have Kaggle labels
targets_raw = targets_raw.filter(~pl.col("session_id").is_in(targets["session"].unique()))

# long format, question-major: every session for question 1, then question 2, ...
sids = [str(sid) + f"_q{qn}" for qn in range(1, 19) for sid in targets_raw["session_id"]]
session = [sid for _ in range(18) for sid in targets_raw["session_id"]]
q = [qn for qn in range(1, 19) for _ in targets_raw["session_id"]]
# the raw label columns are zero-based: column q0 holds question 1
correct = [label for i in range(18) for label in targets_raw[f"q{i}"].to_list()]

targets_raw = pd.DataFrame({
    "session_id": sids,
    "correct": correct,
    "session": session,
    "q": q
})

targets_xgb = pd.concat([targets, targets_raw])
targets_xgb.head()

In [None]:
# Wide label table for the transformer: one row per session with the columns
# q0..q17 (column q{i} holds question i+1).
# NOTE(review): this relies on every question listing the sessions in the same
# order as session.unique() - confirm for the label files in use.
sids = {"session_id": list(targets_xgb.session.unique())}
qs = {
    f"q{i}": targets_xgb.loc[targets_xgb.q == i + 1, "correct"].values
    for i in range(18)
}
targets_tr = pl.DataFrame({**sids, **qs})
targets_tr.head()

In [None]:
# use only the kaggle data for validation, specified in val_sids
val_sids = (
    df_tr
    .filter(pl.col("session_id").is_in(orig_ids))["session_id"]
    .unique(maintain_order=True)
)
# zero-initialised out-of-fold prediction tables with the columns q1..q18,
# one for each model family, indexed by session_id
qs = {f"q{qn}": [0] * val_sids.shape[0] for qn in range(1, 19)}
xgb_oof = pd.DataFrame({"session_id": val_sids, **qs}).set_index("session_id")
tr_oof = pd.DataFrame({"session_id": val_sids, **qs}).set_index("session_id")

In [None]:
def threshold_score(preds, targs):
    """
    Grid-search the decision threshold that maximises the macro F1 score.

    preds are thresholded with `preds > threshold` for thresholds from 0.4
    in steps of 0.005 (np.arange(0.4, 0.81, 0.005)) and compared to targs.

    Returns (best_score, best_threshold); on ties the lowest threshold wins.
    """
    thresholds = np.arange(0.4, 0.81, 0.005)
    scores = [
        f1_score(targs, (preds > cutoff).astype('int'), average="macro")
        for cutoff in thresholds
    ]
    best_ix = scores.index(max(scores))
    return scores[best_ix], thresholds[best_ix]

In [None]:
%%time

# XGB training
#
# For every seed and every fold, one XGBClassifier is trained per level group.
# The folds are formed over val_sids only; every session outside the current
# test fold is used for training.

n_splits = 5

# no shuffling: folds are contiguous slices of val_sids
kf = KFold(n_splits=n_splits)

seeds = [0] if ONLY_TR_FEATURES else [0, 1, 2]
for s in seeds:

    params = {
        'objective' : 'binary:logistic',
        'eval_metric':'logloss',
        'learning_rate': 0.015,
        'max_depth': 8,
        'n_estimators': 1000,
        'early_stopping_rounds': 50,
        'tree_method':'gpu_hist',
        'subsample': 0.8,
        'colsample_bytree': 0.4,
        'use_label_encoder' : False,
        "alpha": 8,
        "lambda": 0,
        "seed": s
    }

    # question numbers answered after each level group
    limits = {'0-4': range(1, 4), '5-12': range(4, 14), '13-22': range(14, 19)}

    # predictions / targets of all folds, for the overall cv score of this seed
    fold_preds = []
    fold_targs = []

    for i, (train_index, test_index) in enumerate(kf.split(val_sids)):

        # predictions / targets of the three level groups within this fold
        grp_preds = []
        grp_targs = []

        for df, features, grp in [
            (df1_xgb, features1, "0-4"),
            (df2_xgb, features2, "5-12"),
            (df3_xgb, features3, "13-22")
        ]:

            test_users = val_sids[test_index]
            test_x = df[features].loc[test_users]

            # Doing this to validate on the kaggle data but also use the raw data for training
            train_x = df[features].loc[~df.index.isin(test_users)]
            train_users = train_x.index.values

            # labels of the questions of this level group, ordered by (session, q)
            train_y = targets_xgb.loc[targets_xgb["q"].isin(limits[grp])].set_index('session').loc[train_users]
            train_y = train_y.sort_values(by=["session", "q"])
            # create copies of the data for each question
            train_x = pd.concat([train_x]*len(limits[grp])).sort_values(by=["session_id"])
            # use question number as a input feature since one model is trained per level group
            train_x["q"] = train_y["q"].values
            train_y = train_y['correct'].values

            # same layout for the test fold: one row per (session, question)
            test_y = targets_xgb.loc[targets_xgb["q"].isin(limits[grp])].set_index('session').loc[test_users]
            test_y = test_y.sort_values(by=["session", "q"])
            test_x = pd.concat([test_x]*len(limits[grp])).sort_values(by=["session_id"])
            test_x["q"] = test_y["q"].values
            test_y = test_y['correct'].values

            clf = XGBClassifier(**params)


            if ONLY_TR_FEATURES:
                model_path = f"models/xgb_models/model_tr_{i}_grp_{grp}_s{params['seed']}.xgb"
            else:
                model_path = f"models/xgb_models/model_{i}_grp_{grp}_s{params['seed']}.xgb"

            # reuse a previously saved model instead of retraining
            if os.path.exists(model_path):
                clf.load_model(model_path)
            else:
                # NOTE(review): the test fold is also the eval_set that drives
                # early stopping, so the fold scores may be optimistic
                clf.fit(train_x.astype(np.float32), train_y, eval_set=[(test_x.astype(np.float32), test_y)], verbose=0)

            val_probas = clf.predict_proba(test_x.astype(np.float32))[:, 1]

            grp_preds.append(val_probas)
            grp_targs.append(test_y)

            if SAVE_FOR_SUB:
                clf.save_model(model_path)

            # rows are ordered by session with the questions of a session next
            # to each other, so a strided slice picks question q for every session
            for j, q in enumerate(limits[grp]):
                xgb_oof.loc[test_x.index.unique(), f"q{q}"] = val_probas[j::len(limits[grp])]

        preds = np.concatenate(grp_preds)
        targs = np.concatenate(grp_targs)

        best_score, best_threshold = threshold_score(preds, targs)

        fold_preds.append(preds)
        fold_targs.append(targs)

        print(f"fold_score_{i}:", best_score)

    preds = np.concatenate(fold_preds)
    targs = np.concatenate(fold_targs)

    best_score, best_threshold = threshold_score(preds, targs)

    # out-of-fold predictions of this seed
    if ONLY_TR_FEATURES:
        xgb_oof.to_csv(f"models/xgb_models/xgb_oof_tr_s{params['seed']}.csv")
    else:
        xgb_oof.to_csv(f"models/xgb_models/xgb_oof_s{params['seed']}.csv")

    print("-"*10)
    print("cv_score:", best_score)
    print("best_threshold:", best_threshold)

In [None]:
# evaluate xgb seeds average

# labels of the validation sessions, flattened in the same session-by-session
# layout as the rows of the out-of-fold tables
targs = targets_xgb.set_index('session').loc[val_sids]["correct"].values

# The training cell writes "xgb_oof_tr_s{seed}.csv" when ONLY_TR_FEATURES is
# set and "xgb_oof_s{seed}.csv" otherwise; read back the matching files.
oof_prefix = "xgb_oof_tr" if ONLY_TR_FEATURES else "xgb_oof"

seeds_preds = 0
for s in seeds:
    preds = pd.read_csv(f"models/xgb_models/{oof_prefix}_s{s}.csv").set_index("session_id")
    seeds_preds += preds.values.flatten() / len(seeds)

threshold_score(seeds_preds, targs)

In [None]:
# Used for normalizing the transformer input during training and inference.
# The input is normalized over each point; the statistics come from the
# Kaggle sessions only (orig_ids).
# They are computed unconditionally so that `norms` exists whatever the value
# of SAVE_FOR_SUB; only writing them to disk depends on the flag.
norms = df_tr.filter(pl.col("session_id").is_in(orig_ids)).groupby("point").agg([
    pl.col("et_diff").mean().alias("et_mean"),
    pl.col("ix_diff").mean().alias("ix_mean"),
    pl.col("dist_diff").mean().alias("dist_mean"),
    pl.col("et_diff").std().alias("et_std"),
    pl.col("ix_diff").std().alias("ix_std"),
    pl.col("dist_diff").std().alias("dist_std"),
    pl.col("room_coor_x").mean().alias("room_coor_x_mean"),
    pl.col("room_coor_y").mean().alias("room_coor_y_mean"),
    pl.col("room_coor_x").std().alias("room_coor_x_std"),
    pl.col("room_coor_y").std().alias("room_coor_y_std"),
])
if SAVE_FOR_SUB:
    norms.write_parquet("models/torch_models/norms.parquet")

# Append the sessions that stop after level group 5-12.
# NOTE: this rebinds df_tr, so running the cell twice appends them twice.
df_tr = pl.concat([df_tr, df_tr_h])


In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs

In [None]:
class NN(nn.Module):
    """
    Lightweight transformer model.

    Each time step is embedded as the concatenation of a linear projection of
    its continuous features and an embedding of its integer point id. The
    encoded sequence is mean-pooled over time and passed to the
    classification head of the requested level group.
    """

    def __init__(self, num_cont_cols, embed_dim, num_layers, num_heads, max_seq_len):
        super().__init__()
        half_dim = embed_dim // 2

        # continuous features -> first half of the token embedding
        self.emb_cont = nn.Sequential(
            nn.Linear(num_cont_cols, half_dim),
            nn.LayerNorm(half_dim),
        )
        # point id -> second half; max_seq_len + 1 rows so ids 0..max_seq_len are valid
        self.emb_cats = nn.Sequential(
            nn.Embedding(max_seq_len + 1, half_dim),
            nn.LayerNorm(half_dim),
        )

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=num_heads,
                dim_feedforward=embed_dim,
                dropout=0.1,
                batch_first=True,
                activation="relu",
            ),
            num_layers=num_layers,
        )

        # one head for each level group: 3, 10 and 5 outputs
        self.clf_heads = nn.ModuleList(
            [nn.Linear(embed_dim, n_outputs) for n_outputs in (3, 10, 5)]
        )

    def forward(self, x, grp):
        """
        x: tensor indexed as (batch, step, feature) whose last feature is the
        point id and whose other features are continuous.
        grp: "0-4", "5-12" or "13-22"; selects the head.

        Returns logits of shape (batch, number of outputs of the head, 1).
        """
        continuous = x[:, :, :-1]
        point_ids = x[:, :, -1].type(torch.int32)

        tokens = torch.cat([self.emb_cont(continuous), self.emb_cats(point_ids)], dim=2)
        pooled = self.encoder(tokens).mean(dim=1)

        head = self.clf_heads[("0-4", "5-12", "13-22").index(grp)]
        return head(pooled).unsqueeze(2)


class DataGenerator(Dataset):
    """
    Dataset feeding the transformer.

    xs[idx] holds the per-level-group arrays of one session (two or three of
    them), ys[idx] its targets. seq_lens gives the fixed length of the
    sequence used for each level group.
    """
    def __init__(self, xs, ys, seq_lens):
        self.xs, self.ys, self.pad_lens = xs, ys, seq_lens

    def __len__(self):
        return len(self.xs)

    def transform(self, x):
        """
        Build the three fixed-length inputs of one session: the events of
        level group 0-4, of 0-12 and of 0-22. Short sequences are zero-padded
        at the front, long ones keep their last rows. A session without the
        third level group gets an all-zero third input.
        """
        cumulative = [x[0], np.concatenate([x[0], x[1]])]
        # since some of the raw data used does not have levelgroup 13-22
        if len(x) == 3:
            cumulative.append(np.concatenate([x[0], x[1], x[2]]))

        fixed = []
        for seq, target_len in zip(cumulative, self.pad_lens):
            missing = target_len - seq.shape[0]
            if missing > 0:
                # pad at the start so the most recent events stay at the end
                seq = np.pad(seq, ((missing, 0), (0, 0)), 'constant')
            elif missing < 0:
                # crop from the start
                seq = seq[-target_len:]
            fixed.append(seq)

        if len(fixed) == 2:
            fixed.append(np.zeros((self.pad_lens[-1], x[0].shape[1])))

        return fixed

    def __getitem__(self, idx):
        x1, x2, x3 = self.transform(self.xs[idx])
        return x1, x2, x3, self.ys[idx]


def train(hyperparams, train_x, train_y, test_x, test_y, seq_lens):
    """
    Pytorch training loop for the transformer.

    Parameters:
    hyperparams: dict with the keys "seed", "batch_size", "learning_rate",
        "num_cont_cols", "embed_dim", "num_layers", "num_heads", "max_seq_len"
    train_x, train_y: training sessions and targets (see DataGenerator)
    test_x, test_y: validation sessions and targets
    seq_lens: fixed sequence length for each of the three level groups

    Returns:
    best_model: deep copy of the model at the epoch with the lowest validation loss
    best_val_probas: validation probabilities of that epoch, indexed as
        (session, question, 1); None if no epoch ever improved the validation
        loss (for example when it is NaN throughout)
    """

    torch.manual_seed(hyperparams["seed"])
    writer = SummaryWriter()

    train_gen = DataGenerator(train_x, train_y, seq_lens)
    train_loader = DataLoader(
        train_gen,
        batch_size=hyperparams["batch_size"],
        shuffle=True,
        num_workers=4,
        drop_last=True
    )

    val_gen = DataGenerator(test_x, test_y, seq_lens)
    val_loader = DataLoader(
        val_gen,
        batch_size=hyperparams["batch_size"],
        shuffle=False,
        num_workers=4,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = NN(
        num_cont_cols=hyperparams["num_cont_cols"],
        embed_dim=hyperparams["embed_dim"],
        num_layers=hyperparams["num_layers"],
        num_heads=hyperparams["num_heads"],
        max_seq_len=hyperparams["max_seq_len"],
    ).to(device)

    loss_function = torch.nn.BCEWithLogitsLoss()

    optimizer = torch.optim.AdamW(model.parameters(), hyperparams["learning_rate"])

    num_epochs = 80

    # linear warm-up over the first tenth of all optimisation steps, then linear decay
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_epochs*len(train_loader)//10,
        num_training_steps= num_epochs*len(train_loader)
    )

    val_interval = 1
    best_val_loss = 1e6
    best_val_score = -1
    best_model = model
    # defined up front so the return statement cannot hit an unbound name
    best_val_probas = None

    for epoch in range(num_epochs):
        # NOTE(review): re-seeding at the start of every epoch restarts the
        # global RNG from the same state each time, which presumably repeats
        # the same shuffle order and dropout masks - confirm this is intended.
        torch.manual_seed(hyperparams["seed"])
        model.train()
        epoch_loss = 0

        for x1s, x2s, x3s, ys in train_loader:
            x1s, x2s, x3s, ys = [t.to(device).type(torch.float) for t in [x1s, x2s, x3s, ys]]

            # takes one level group at a time, since the sequence length varies;
            # each loss is weighted by its share of the 18 questions and the
            # gradients of the three backward passes accumulate before the step

            out = model(x1s, "0-4")
            loss = loss_function(out, ys[:, :3].unsqueeze(2)) *3/18
            epoch_loss += loss.item()
            loss.backward()

            out = model(x2s, "5-12")
            loss = loss_function(out, ys[:, 3:13].unsqueeze(2)) *10/18
            epoch_loss += loss.item()
            loss.backward()

            # sessions without level group 13-22 have an all-zero third input
            # (see DataGenerator.transform) and are left out of this loss
            g3_mask = x3s.sum(dim=(1, 2)) != 0
            out = model(x3s[g3_mask], "13-22")
            loss = loss_function(out, ys[g3_mask, 13:].unsqueeze(2)) *5/18
            epoch_loss += loss.item()
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        epoch_loss /= len(train_loader)
        writer.add_scalar('Loss/train', epoch_loss, epoch)

        if epoch % val_interval == 0:
            model.eval()
            with torch.no_grad():
                val_epoch_loss = 0
                targs = []
                val_probas = []
                for x1s, x2s, x3s, ys in val_loader:
                    x1s, x2s, x3s, ys = [t.to(device).type(torch.float) for t in [x1s, x2s, x3s, ys]]

                    out = model(x1s, "0-4")
                    loss = loss_function(out, ys[:, :3].unsqueeze(2)) *3/18
                    val_epoch_loss += loss.item()
                    probas1 = torch.sigmoid(out)

                    out = model(x2s, "5-12")
                    loss = loss_function(out, ys[:, 3:13].unsqueeze(2)) *10/18
                    val_epoch_loss += loss.item()
                    probas2 = torch.sigmoid(out)

                    out = model(x3s, "13-22")
                    loss = loss_function(out, ys[:, 13:].unsqueeze(2)) *5/18
                    val_epoch_loss += loss.item()
                    probas3 = torch.sigmoid(out)

                    # question axis: 3 + 10 + 5 = 18 probabilities per session
                    probas = torch.cat([probas1, probas2, probas3], dim=1)
                    val_probas.append(probas.cpu().numpy())
                    targs.append(ys.cpu().numpy())

                targs = np.concatenate(targs)
                val_probas = np.concatenate(val_probas)

                best_score, best_threshold = threshold_score(val_probas.flatten(), targs.flatten())

                val_epoch_loss /= len(val_loader)
                writer.add_scalar('Loss/val', val_epoch_loss, epoch)
                writer.add_scalar('Score/val', best_score, epoch)

                # model selection is driven by the validation loss, not the F1 score
                if val_epoch_loss < best_val_loss:
                    best_val_loss = val_epoch_loss
                    best_val_score = best_score
                    best_model = copy.deepcopy(model)
                    best_val_probas = val_probas

        print("epoch", epoch, "best_score", best_val_score)

    # flush pending events and release the event file held by the writer
    writer.flush()
    writer.close()
    return best_model, best_val_probas


def predict(hyperparams, test_x, test_y, seq_lens, model_path):
    """
    Load trained transformer weights from model_path,
    and perform prediction on the data test_x

    Parameters:
    hyperparams: dict
    must provide "batch_size", "num_cont_cols", "embed_dim", "num_layers",
    "num_heads" and "max_seq_len"

    test_x, test_y, seq_lens:
    forwarded unchanged to DataGenerator; the labels it yields are not used here

    model_path: str
    location of a state dict written with torch.save

    Returns:
    model: NN
    the network with the loaded weights, in eval mode

    val_probas: ndarray
    sigmoid outputs of the three level groups concatenated along axis 1,
    batches stacked along axis 0 in loader order (shuffle is off)
    """

    val_gen = DataGenerator(test_x, test_y, seq_lens)
    val_loader = DataLoader(
        val_gen,
        batch_size=hyperparams["batch_size"],
        shuffle=False,
        num_workers=4,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = NN(
        num_cont_cols=hyperparams["num_cont_cols"],
        embed_dim=hyperparams["embed_dim"],
        num_layers=hyperparams["num_layers"],
        num_heads=hyperparams["num_heads"],
        max_seq_len=hyperparams["max_seq_len"],
    ).to(device)

    # map_location lets a checkpoint that was saved on GPU be restored on a CPU-only runtime
    model.load_state_dict(torch.load(model_path, map_location=device))

    model.eval()
    val_probas = []
    with torch.no_grad():
        for x1s, x2s, x3s, _ in val_loader:
            # takes one level group at a time, since the sequence length varies
            group_probas = []
            for xs, grp in ((x1s, "0-4"), (x2s, "5-12"), (x3s, "13-22")):
                xs = xs.to(device).type(torch.float)
                group_probas.append(torch.sigmoid(model(xs, grp)))

            val_probas.append(torch.cat(group_probas, dim=1).cpu().numpy())

    val_probas = np.concatenate(val_probas)

    return model, val_probas


In [None]:
%%time

# Transformer training

n_splits = 5

kf = KFold(n_splits=n_splits)

limits = {'0-4': range(1, 4), '5-12': range(4, 14), '13-22': range(14, 19)}

# columns of the per-session arrays handed to train/predict, in this order
_TR_INPUT_COLS = ["et_diff", "ix_diff",  "dist_diff", "room_coor_x", "room_coor_y", "point"]
_TR_LEVEL_GROUPS = ["0-4", "5-12", "13-22"]


def _prepare_tr_events(events):
    """
    Normalize the continuous inputs with the statistics joined from norms on "point",
    zero out ix_diff outliers and map point to its integer index
    """
    events = events.join(norms, on="point", how="left")
    events = events.with_columns([
        (pl.col("et_diff") - pl.col("et_mean")) / pl.col("et_std"),
        (pl.col("ix_diff") - pl.col("ix_mean")) / pl.col("ix_std"),
        (pl.col("dist_diff") - pl.col("dist_mean")) / pl.col("dist_std"),
        (pl.col("room_coor_x") - pl.col("room_coor_x_mean")) / pl.col("room_coor_x_std"),
        (pl.col("room_coor_y") - pl.col("room_coor_y_mean")) / pl.col("room_coor_y_std"),
    ]).fill_nan(0)
    # replace some outlier values with 0
    events = events.with_columns([pl.when(pl.col("ix_diff").abs() > 7.5).then(0).otherwise(pl.col("ix_diff")).keep_name()])
    # map point categoricals to integer index, so transformers can handle it
    return events.with_columns([pl.col("point").map_dict(point_ixs)])


def _tr_labels_for(users):
    """
    Return the targets_tr columns (without session_id) as a numpy array,
    with rows following the order of users
    """
    # Joining FROM the ordered user list keeps its order, so labels and sequences line up.
    # assumes targets_tr holds exactly one row per session_id -- TODO confirm
    return pl.DataFrame(users).join(targets_tr, on="session_id", how="left").drop("session_id").to_numpy()


def _tr_sequences(events):
    """
    Build, per session and in order of first appearance,
    a list with one input array per level group
    """
    return [
        [sess.filter(pl.col("level_group")==grp).select(_TR_INPUT_COLS).to_numpy() for grp in _TR_LEVEL_GROUPS]
        for _, sess in events.groupby(["session_id"], maintain_order=True)
    ]


seeds = [0, 1, 2]
for s in seeds:

    params = {
        'learning_rate': 0.001,
        "batch_size": 512,
        "num_cont_cols": 5,
        "embed_dim": 64,
        "max_seq_len": seq_lens[-1],
        "num_layers": 1,
        "num_heads": 8,
        'early_stopping_rounds': 15,
        "seed": s
    }

    preds = []
    targs = []

    fold_scores = []

    for i, (train_index, test_index) in enumerate(kf.split(val_sids)):

        test_users = val_sids[test_index]

        test_x = _prepare_tr_events(df_tr.filter(pl.col("session_id").is_in(test_users)))
        # from here on test_users follows the row order of test_x
        test_users = test_x["session_id"].unique(maintain_order=True)
        test_y = _tr_labels_for(test_users)
        test_x = _tr_sequences(test_x)

        # Locate an existing checkpoint for this fold/seed, if any
        model_path = f"models/torch_models/model_{i}_s{s}"
        checkpoint = None
        if os.path.isdir(model_path):
            # presumably the checkpoint archive was unpacked into a directory; pack it again
            shutil.make_archive(model_path, 'zip', model_path)
            checkpoint = model_path + ".zip"
        elif os.path.isfile(model_path):
            # regular file as written by torch.save below: load it directly
            checkpoint = model_path
        elif os.path.exists(model_path + ".zip"):
            checkpoint = model_path + ".zip"

        if checkpoint is not None:
            model, val_probas = predict(params, test_x, test_y, trimmed_seq_lens, checkpoint)
        else:
            # Doing this to validate on the kaggle data while also using the raw data for training
            train_x = _prepare_tr_events(df_tr.filter(~pl.col("session_id").is_in(val_sids[test_index])))
            train_users = train_x["session_id"].unique(maintain_order=True)
            train_y = _tr_labels_for(train_users)
            train_x = _tr_sequences(train_x)

            model, val_probas = train(params, train_x, train_y, test_x, test_y, trimmed_seq_lens)

            if SAVE_FOR_SUB:
                torch.save(model.state_dict(), model_path)

        # store the out-of-fold probabilities, one column per question
        for j in range(18):
            tr_oof.loc[test_users, f"q{j+1}"] = val_probas[:, j, :]

        pred = val_probas.flatten()
        targ = test_y.flatten()
        best_score, best_threshold = threshold_score(pred, targ)

        preds.append(pred)
        targs.append(targ)
        fold_scores.append(best_score)

        print(f"fold_score_{i}:", best_score)


    preds = np.concatenate(preds)
    targs = np.concatenate(targs)

    best_score, best_threshold = threshold_score(preds, targs)

    tr_oof.to_csv(f"models/torch_models/tr_oof_s{s}.csv")

    print("-"*10)
    print("cv_score:", best_score)
    print("best_threshold:", best_threshold)

    for i, score in enumerate(fold_scores):
        print(f"fold_score_{i}:", score)



In [None]:
# evaluate transformer seeds average

# assumes the rows of targets_xgb for each session are ordered q1..q18 -- TODO confirm
targs = targets_xgb.set_index('session').loc[val_sids]["correct"].values

seeds_preds = 0
for s in seeds:
    preds = pd.read_csv(f"models/torch_models/tr_oof_s{s}.csv").set_index("session_id")
    # pick rows and columns by label so the flattened predictions follow
    # (val_sids, q1..q18) regardless of how the csv happens to be ordered
    preds = preds.loc[val_sids, [f"q{k}" for k in range(1, 19)]]
    seeds_preds += preds.values.flatten() / len(seeds)

threshold_score(seeds_preds, targs)

In [None]:
%%time

# Catboost training

seeds = [0, 1, 2]

# the split and the question ranges do not depend on the seed
n_splits = 5
kf = KFold(n_splits=n_splits)

# question numbers answered in each level group
limits = {'0-4': range(1, 4), '5-12': range(4, 14), '13-22': range(14, 19)}

for s in seeds:

    params = {
        'loss_function':'Logloss',
        'learning_rate': 0.05,
        'depth': 8,
        'iterations': 1000,
        'early_stopping_rounds': 50,
        'subsample': 0.8,
        'colsample_bylevel': 0.4,
        'metric_period': 1,
        "l2_leaf_reg": 8,
        "random_seed": s,
        'verbose': 0,
        'use_best_model': True,
    }

    fold_preds = []
    fold_targs = []

    for i, (train_index, test_index) in enumerate(kf.split(val_sids)):

        test_users = val_sids[test_index]

        grp_preds = []
        grp_targs = []

        for df, features, grp in [
            (df1_xgb, features1, "0-4"),
            (df2_xgb, features2, "5-12"),
            (df3_xgb, features3, "13-22")
        ]:

            n_qs = len(limits[grp])
            # targets of this level group's questions, indexed by session
            grp_targets = targets_xgb.loc[targets_xgb["q"].isin(limits[grp])].set_index('session')

            # one row per (session, question): repeat the session features once per
            # question and attach the question number as an extra feature
            test_x = df[features].loc[test_users]
            test_y = grp_targets.loc[test_users].sort_values(by=["session", "q"])
            test_x = pd.concat([test_x]*n_qs).sort_values(by=["session_id"])
            test_x["q"] = test_y["q"].values
            test_y = test_y['correct'].values

            valid_pool = Pool(test_x, test_y)

            clf = CatBoostClassifier(**params)

            model_path = f"models/cat_models/model_{i}_grp_{grp}_s{s}.cbm"
            if os.path.exists(model_path):
                clf.load_model(model_path)
            else:
                # the training data is only needed when there is no stored model

                # Doing this to validate on the kaggle data while using the raw data for training
                train_x = df[features].loc[~df.index.isin(test_users)]
                train_users = train_x.index.values

                train_y = grp_targets.loc[train_users].sort_values(by=["session", "q"])
                train_x = pd.concat([train_x]*n_qs).sort_values(by=["session_id"])
                train_x["q"] = train_y["q"].values
                train_y = train_y['correct'].values

                train_pool = Pool(train_x, train_y)
                clf = clf.fit(train_pool, eval_set=valid_pool)

                # only a freshly trained model needs to be written
                if SAVE_FOR_SUB:
                    clf.save_model(model_path)

            val_probas = clf.predict_proba(valid_pool)[:, 1]

            grp_preds.append(val_probas)
            grp_targs.append(test_y)

            # rows are ordered (session, q), so every n_qs-th value belongs to the same question
            # NOTE(review): the CatBoost out-of-fold values are written into the xgb_oof frame -- confirm this reuse is intended
            for j, q in enumerate(limits[grp]):
                xgb_oof.loc[test_x.index.unique(), f"q{q}"] = val_probas[j::n_qs]


        preds = np.concatenate(grp_preds)
        targs = np.concatenate(grp_targs)

        best_score, best_threshold = threshold_score(preds, targs)

        fold_preds.append(preds)
        fold_targs.append(targs)

        print(f"fold_score_{i}:", best_score)

    preds = np.concatenate(fold_preds)
    targs = np.concatenate(fold_targs)

    best_score, best_threshold = threshold_score(preds, targs)

    xgb_oof.to_csv(f"models/cat_models/cat_oof_s{s}.csv")

    print("-"*10)
    print("cv_score:", best_score)
    print("best_threshold:", best_threshold)



In [None]:
# evaluate catboost seeds average

# assumes the rows of targets_xgb for each session are ordered q1..q18 -- TODO confirm
targs = targets_xgb.set_index('session').loc[val_sids]["correct"].values

seeds_preds = 0
for s in seeds:
    preds = pd.read_csv(f"models/cat_models/cat_oof_s{s}.csv").set_index("session_id")
    # pick rows and columns by label so the flattened predictions follow
    # (val_sids, q1..q18) regardless of how the csv happens to be ordered
    preds = preds.loc[val_sids, [f"q{k}" for k in range(1, 19)]]
    seeds_preds += preds.values.flatten() / len(seeds)

threshold_score(seeds_preds, targs)

In [None]:
%%time

# train and evaluate ensemble with linear regression

n_splits = 5
kf = KFold(n_splits=n_splits)

limits = {'0-4': range(1, 4), '5-12': range(4, 14), '13-22': range(14, 19)}
fold_preds = []
fold_targs = []
seeds = [0, 1, 2]

# Read every out-of-fold file once and average over seeds up front.
# Each entry is an array with one row per val_sids entry (same order) and
# one column per question q1..q18, in the feature order xgb, catboost, transformer.
oof_templates = [
    "models/xgb_models/xgb_oof_s{}.csv",
    "models/cat_models/cat_oof_s{}.csv",
    "models/torch_models/tr_oof_s{}.csv",
]
all_q_cols = [f"q{k}" for k in range(1, 19)]

avg_oofs = []
for template in oof_templates:
    avg_oof = 0
    for s in seeds:
        oof = pd.read_csv(template.format(s)).set_index("session_id")
        avg_oof += oof.loc[val_sids, all_q_cols].values / len(seeds)
    avg_oofs.append(avg_oof)

for q in range(1, 19):

    if 1 <= q <= 3:
        # number of questions used as feature
        qs = 3
    elif 4 <= q <= 13:
        qs = 13
    else:
        qs = 18

    # targets of question q, indexed by session
    q_targets = targets_xgb.loc[targets_xgb["q"] == q].set_index('session')

    preds = []
    targs = []

    for i, (train_index, test_index) in enumerate(kf.split(val_sids)):

        train_users = val_sids[train_index]
        test_users = val_sids[test_index]

        train_y = q_targets.loc[train_users]
        test_y = q_targets.loc[test_users]

        # rows of avg_oofs follow val_sids, so the fold positions select the fold's users;
        # the first qs columns are the predictions for questions 1..qs
        train_x = np.hstack([avg_oof[train_index, :qs] for avg_oof in avg_oofs])
        test_x = np.hstack([avg_oof[test_index, :qs] for avg_oof in avg_oofs])

        clf = LinearRegression()
        clf.fit(train_x, train_y['correct'].values)

        # context manager closes the file handle once the model is written
        with open(f'models/meta_models/lin_reg_{i}_q{q}.sav', 'wb') as model_file:
            pickle.dump(clf, model_file)

        val_probas = clf.predict(test_x)

        preds.append(val_probas)
        targs.append(test_y['correct'].values)

    preds = np.concatenate(preds)
    targs = np.concatenate(targs)

    fold_preds.append(preds)
    fold_targs.append(targs)

preds = np.concatenate(fold_preds)
targs = np.concatenate(fold_targs)

best_score, best_threshold = threshold_score(preds, targs)

print("-"*10)
print("cv_score:", best_score)
print("best_threshold:", best_threshold)