In [1]:
import pandas as pd
import json
import lightgbm as lgb
from tqdm import tqdm
tqdm.pandas()


# Test-set CSV that process_test_installations() turns into features.
test = pd.read_csv('../../data/raw/test.csv')
# Previously saved LightGBM booster, loaded from its model file.
model = lgb.Booster(model_file='../../models/regression_baseline_rmse.lgb')
# Three increasing cut points; make_submission() assigns them to
# ThresholdClassifier.coef_ to bin the booster's predictions into labels 0-3.
# NOTE(review): presumably produced earlier by ThresholdClassifier.fit — confirm.
coef=[1.1652705553319513, 1.7983061293534481, 2.2127169630913537]

In [2]:
# Titles for which process_log() computes move statistics. The ORDER matters:
# process_log() encodes the assessment title as its position in this list
# (games.index(...)), so reordering changes the "title" feature value.
games = ['Scrub-A-Dub', 'All Star Sorting', 'Mushroom Sorter (Assessment)',
       'Air Show', 'Crystals Rule', 'Bird Measurer (Assessment)',
       'Dino Drink', 'Bubble Bath', 'Dino Dive', 'Chow Time',
       'Cauldron Filler (Assessment)', 'Pan Balance', 'Happy Camel',
       'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)',
       'Leaf Leader']


def unwrap_event_data(df):
    """Expand the JSON strings in ``df.event_data`` into columns.

    Each ``event_data`` value is parsed with ``json.loads``; the parsed
    records become a new frame that is placed to the left of the original
    columns. Both frames go through ``reset_index()``, so the result carries
    two ``index`` columns (positional index of the parsed frame and the
    original index of ``df``).
    """
    parsed_events = [json.loads(raw_event) for raw_event in df.event_data]
    events_frame = pd.DataFrame(data=parsed_events)
    left = events_frame.reset_index()
    right = df.reset_index()
    return pd.concat([left, right], axis=1)


def process_log(df):
    """Build the feature dictionary for one installation's event log.

    The last row of ``df`` is treated as the assessment being predicted; all
    earlier rows whose ``type`` is "Game" or "Assessment" form the history.
    Rows are expected in chronological order (the caller in this file sorts by
    ``timestamp`` before grouping).

    Returns a dict with:
      * ``"title"``: position of the last row's title in the global ``games`` list
        (raises ``ValueError`` if the title is not in that list);
      * for every title in ``games``: ``n_correct <title>``, ``n_incorrect <title>``,
        ``global_ratio <title>`` over the whole history, plus
        ``n_correct <title> <i>``, ``n_incorrect <title> <i>``, ``ratio <title> <i>``
        for each of the last ``n_lags`` game sessions of that title
        (``None`` when fewer sessions exist).
    """
    assessment_title = df.title.iloc[-1]

    history = df.iloc[:-1]
    history = history[history.type.isin(["Game", "Assessment"])].copy()

    def calculate_ratios(df):
        """Return (n_correct, n_incorrect, n_correct / (n_correct + n_incorrect))."""
        n_correct = df.correct_move.sum()
        n_incorrect = df.wrong_move.sum()
        n_total = n_correct + n_incorrect
        # Explicit guard: with no recorded moves the ratio is undefined. Return
        # NaN directly instead of relying on numpy's 0/0 behaviour (which emits
        # a RuntimeWarning, and would be a ZeroDivisionError for plain ints).
        ratio = n_correct / n_total if n_total else float("nan")
        return n_correct, n_incorrect, ratio

    def make_move_stats(df, title, n_lags=2):
        """Compute overall and per-recent-session move counts for one title."""
        df = df.copy()
        if len(df):
            df = unwrap_event_data(df)
        if "correct" in df.columns:
            # Compare against the literals so rows without a "correct" value
            # (NaN after unwrapping) count as neither correct nor wrong.
            df["correct_move"] = df.correct == True
            df["wrong_move"] = df.correct == False
        else:
            df["correct_move"] = False
            df["wrong_move"] = False
        result = []
        result.extend(zip([f"n_correct {title}", f"n_incorrect {title}", f"global_ratio {title}"],
                          calculate_ratios(df)))
        if n_lags:
            # unique() keeps order of first appearance, so index 0 is the
            # earliest of the retained sessions.
            last_sessions = df.game_session.unique()[-n_lags:]
            for i in range(n_lags):
                lag_keys = [f"n_correct {title} {i}", f"n_incorrect {title} {i}", f"ratio {title} {i}"]
                if i < len(last_sessions):
                    session_rows = df[df.game_session == last_sessions[i]]
                    result.extend(zip(lag_keys, calculate_ratios(session_rows)))
                else:
                    result.extend(zip(lag_keys, [None, None, None]))
        return {k: v for k, v in result}

    result = {"title": games.index(assessment_title)}
    for game in games:
        stats = history[history.title == game]
        stats = make_move_stats(stats, game)
        result.update(stats)
    return result


def process_test_installations(test):
    """Turn the raw test event log into one feature row per installation.

    Events are sorted by ``timestamp``, grouped by ``installation_id`` and
    passed to ``process_log``. Each resulting feature dict gets an
    ``installation_id`` entry, and missing values in the final frame are
    replaced with -1.
    """
    ordered_events = test.sort_values("timestamp")
    per_installation = (
        ordered_events
        .groupby("installation_id")
        .progress_apply(process_log)
        .reset_index()
    )
    per_installation.columns = ["installation_id", "features"]

    rows = []
    for installation_id, features in zip(per_installation["installation_id"],
                                         per_installation["features"]):
        # The dict is tagged in place, exactly like the per-group result it came from.
        features["installation_id"] = installation_id
        rows.append(features)
    return pd.DataFrame(rows).fillna(-1)

# Build the per-installation feature table (one process_log() call per installation_id).
test_features=process_test_installations(test)

100%|██████████| 1000/1000 [01:31<00:00, 10.92it/s]


__validation.py__

In [3]:
import numpy as np
from sklearn.model_selection import GroupKFold, KFold
from sklearn.utils import shuffle
from typing import NamedTuple
from functools import partial
from sklearn.metrics import cohen_kappa_score


class Predict(NamedTuple):
    """Ground-truth / prediction pair produced for one validation fold."""

    # np.ndarray is the array *type*; np.array is only a factory function and
    # is not a valid annotation.
    true: np.ndarray  # targets returned by make_predictions for the fold
    pred: np.ndarray  # predictions returned by make_predictions for the fold


class InstallationFold(GroupKFold):
    """GroupKFold over installation ids, applied to a shuffled row order.

    Row positions and their installation ids are shuffled together with a
    fixed seed, the parent ``GroupKFold`` splits the shuffled arrays, and the
    yielded indices are mapped back to positions in the original ``X``.

    Parameters
    ----------
    n_splits : int, default 5
        Number of folds, forwarded to ``GroupKFold``.
    installation_ids : array-like or None
        Default group labels, used when ``split`` is called without them.
    seed : int, default 2019
        Seed for the row shuffle (previously hard-coded to 2019).
    """

    def __init__(self, n_splits=5, installation_ids=None, seed=2019):
        super().__init__(n_splits=n_splits)
        self.installation_ids = installation_ids
        self.seed = seed

    def split(self, X, y, installation_ids=None):
        """Yield ``(train_indices, test_indices)`` into the original ``X``.

        ``y`` is accepted for API compatibility but not used.
        Raises ``ValueError`` when no installation ids are available.
        """
        if installation_ids is None:
            installation_ids = self.installation_ids
        if installation_ids is None:
            # Fail with a clear message instead of an error from inside sklearn.
            raise ValueError(
                "installation_ids must be passed to split() or to the constructor"
            )
        orig_indices = np.arange(len(X))
        shuffled_indices, installation_ids = shuffle(
            orig_indices, installation_ids, random_state=self.seed
        )
        for train, test in super().split(shuffled_indices, shuffled_indices, installation_ids):
            # train/test index the shuffled arrays; translate back to original positions.
            yield shuffled_indices[train], shuffled_indices[test]


def fit_fold(df, train_ix, test_ix, make_features, train_model, make_predictions):
    """Train and evaluate on a single fold.

    ``df`` is sliced positionally into train and test parts (each with a reset
    index), ``make_features`` builds the feature tuples, ``train_model`` fits
    on the train tuple and ``make_predictions`` returns
    ``(predictions, targets)`` for the test tuple.

    Returns a ``Predict(true, pred)`` for the fold.
    """
    fold_train, fold_test = (
        df.iloc[positions].reset_index().copy() for positions in (train_ix, test_ix)
    )
    train_features, test_features = make_features(fold_train, fold_test)
    fitted_model = train_model(*train_features)
    predicted, actual = make_predictions(fitted_model, *test_features)
    return Predict(true=actual, pred=predicted)


def cross_validate(train, labels, make_features, train_model, make_predictions, cv=None):
    """Run ``fit_fold`` on every split and collect the per-fold results.

    Seeds numpy's global RNG with 2019, then splits ``train`` grouped by
    ``train.installation_id`` using ``cv`` (an ``InstallationFold`` with
    default settings when ``cv`` is None).

    Returns a list with one ``fit_fold`` result per fold, in split order.
    """
    np.random.seed(2019)
    if cv is None:
        cv = InstallationFold()
    folds = cv.split(train, labels, train.installation_id.values)
    return [
        fit_fold(train, fold_train_ix, fold_test_ix, make_features, train_model, make_predictions)
        for fold_train_ix, fold_test_ix in folds
    ]


# Cohen's kappa with quadratic weights; ThresholdClassifier._run_trial negates it as the loss.
quad_kappa = partial(cohen_kappa_score, weights="quadratic")

__coeff.py__

In [4]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#from dsb2019.data.validation import quad_kappa


class ThresholdClassifier(ClassifierMixin, BaseEstimator):
    """Bin a 1-D continuous score into labels 0-3 using three learned cut points.

    ``fit`` searches for the three thresholds with hyperopt's TPE so that the
    quadratic-weighted kappa (``quad_kappa``) between ``y`` and the binned
    scores is maximal. The thresholds are stored in ``coef_``.

    Parameters
    ----------
    n_iter : int, default 1000
        Number of hyperopt evaluations (``max_evals``).
    random_state : int, default 2019
        Seed for the hyperopt search.
    """
    # Mixins are listed before BaseEstimator, as scikit-learn's estimator
    # conventions require for a correct MRO.

    def __init__(self, n_iter=1000, random_state=2019):
        self.n_iter = n_iter
        self.random_state = random_state

    def _run_trial(self, X, y, params):
        """Evaluate one hyperopt sample; returns loss, status and the thresholds."""
        # The second and third thresholds are parameterised as non-negative
        # offsets so the three cut points are always ordered.
        threshold1 = params["threshold1"]
        threshold2 = threshold1 + abs(params["threshold2_delta"])
        threshold3 = threshold2 + abs(params["threshold3_delta"])
        pred = pd.cut(X, [-np.inf, threshold1, threshold2, threshold3, np.inf], labels=[0, 1, 2, 3])
        return {
            "loss": -quad_kappa(y, pred),
            "status": STATUS_OK,
            "coef": [threshold1, threshold2, threshold3]
        }

    def fit(self, X, y):
        """Search for the thresholds and store the best ones in ``coef_``.

        Priors are centred on the percentiles of ``X`` that reproduce the
        cumulative class shares of ``y``; the prior spread is a third of the
        1st-99th percentile range of ``X``. Returns ``self``.
        """
        # Percentage of samples with label < k, for k = 1, 2, 3.
        class1_percentile, class2_percentile, class3_percentile = (
            np.sum(y < label) / len(y) * 100 for label in (1, 2, 3)
        )
        threshold1_prior = np.percentile(X, class1_percentile)
        threshold2_prior = np.percentile(X, class2_percentile)
        threshold3_prior = np.percentile(X, class3_percentile)
        threshold2_delta_prior = threshold2_prior - threshold1_prior
        threshold3_delta_prior = threshold3_prior - threshold2_prior
        prior_std = (np.percentile(X, 99) - np.percentile(X, 1)) / 3
        space = {
            "threshold1": hp.normal("threshold1", threshold1_prior, prior_std),
            "threshold2_delta": hp.normal("threshold2_delta", threshold2_delta_prior, prior_std),
            "threshold3_delta": hp.normal("threshold3_delta", threshold3_delta_prior, prior_std)
        }

        partial_run = partial(self._run_trial, X, y)

        trials = Trials()
        # NOTE(review): newer hyperopt releases may expect a numpy Generator
        # for rstate rather than RandomState — confirm against the installed version.
        fmin(partial_run, space=space,
             algo=tpe.suggest,
             max_evals=self.n_iter, rstate=np.random.RandomState(self.random_state), trials=trials)

        self.coef_ = trials.best_trial["result"]["coef"]
        return self

    def predict(self, X):
        """Bin ``X`` with the thresholds in ``coef_`` into labels 0, 1, 2, 3."""
        # Unpack instead of list concatenation so coef_ may be any sequence:
        # list + ndarray would broadcast-add, and list + tuple would raise.
        bins = [-np.inf, *self.coef_, np.inf]
        return pd.cut(X, bins, labels=[0, 1, 2, 3])

In [5]:
def make_submission(test_features, model):
    """Predict an accuracy group for every row of ``test_features``.

    The ``installation_id`` column is dropped before calling
    ``model.predict``; the raw predictions are then binned by a
    ``ThresholdClassifier`` whose thresholds are the module-level ``coef``.

    Returns a DataFrame with columns ``installation_id`` and ``accuracy_group``.
    """
    feature_matrix = test_features.drop(columns="installation_id")
    raw_scores = model.predict(feature_matrix)

    # No fitting here: the thresholds are injected directly.
    thresholder = ThresholdClassifier()
    thresholder.coef_ = coef

    return pd.DataFrame(
        data={
            "installation_id": test_features.installation_id.values,
            "accuracy_group": thresholder.predict(raw_scores),
        }
    )

# Score the test features with the loaded booster and bin them with the fixed thresholds in `coef`.
submission = make_submission(test_features, model)

In [6]:
# Write the submission without the DataFrame index column.
submission.to_csv("../../data/submissions/regression_baseline_rmse.csv", index=False)

In [7]:
# Sanity check: how many installations fall into each predicted accuracy group.
submission.accuracy_group.value_counts()

3    391
2    320
0    159
1    130
Name: accuracy_group, dtype: int64