## Libraries

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path

from tqdm import tqdm_notebook as tqdm

## Data Loading

In [2]:
# Directory holding the precomputed feature tables (Feather format).
DATA_DIR = Path("../features/")

# One table per split; all three share the "PastSummary_" file prefix.
train = pd.read_feather(DATA_DIR.joinpath("PastSummary_train.ftr"))
valid = pd.read_feather(DATA_DIR.joinpath("PastSummary_valid.ftr"))
test = pd.read_feather(DATA_DIR.joinpath("PastSummary_test.ftr"))

In [3]:
# Preview the first five rows of the training features.
train.head(n=5)

Unnamed: 0,Activity,Assessment,Clip,Game,3010,4035,2020,4100,3110,4070,...,Fireworks (Activity)_Dot_SoHigh_count,Flower Waterer (Activity)_duration,Sandcastle Builder (Activity)_duration,sand_filled_ratio,Sandcastle Builder (Activity)_Dot_DragShovel_count,Sandcastle Builder (Activity)_Dot_SoCool_count,Sandcastle Builder (Activity)_Dot_FillItUp_count,Watering Hole (Activity)_duration,water_filled_ratio,installation_id
0,3,0,6,2,79,1,20,0,77,94,...,12,164673,89767,0.5,5,14,6,0,0.0,0006a69f
1,4,1,8,4,226,6,26,5,223,156,...,12,164673,89767,0.5,5,14,6,0,0.0,0006a69f
2,4,2,8,4,228,6,27,5,225,160,...,12,164673,89767,0.5,5,14,6,0,0.0,0006a69f
3,9,3,15,7,341,14,52,6,336,348,...,30,254739,127411,0.52381,5,22,8,80243,0.666667,0006a69f
4,10,3,17,9,463,15,64,12,457,387,...,30,254739,127411,0.52381,5,22,8,80243,0.666667,0006a69f


In [4]:
# Load the saved prediction array and attach it to `train` as a new "preds"
# column.
# NOTE(review): presumably out-of-fold predictions stored in the same row
# order as `train` -- confirm against the script that wrote the .npy file.
oof_preds = np.load(file="../output/lgb_18_past_summary_all/oof_preds.npy")
train["preds"] = oof_preds

## First Assessment

In [5]:
# Split `train` per session_title code (0-4) into
#   * all_first_*    : the first row (in `train` order) of every installation
#   * after_second_* : every later row of that installation for the same title
all_ids = train["installation_id"].unique()
all_first_assess = {i: [] for i in range(5)}
after_second_assess = {i: [] for i in range(5)}

# Walk the rows of each installation once instead of re-scanning the whole
# frame with `train.query(...)` five times per installation. `sort=False`
# yields the groups in order of first appearance (the same order as
# `all_ids`) and keeps the original row order inside every group.
for idx, installation_rows in tqdm(
        train.groupby("installation_id", sort=False), total=len(all_ids)):
    for i in range(5):
        assessments = installation_rows[installation_rows["session_title"] == i]
        if len(assessments) > 0:
            all_first_assess[i].append(assessments.iloc[0:1])
        if len(assessments) > 1:
            after_second_assess[i].append(assessments.iloc[1:])


def _stack_rows(frames):
    """Concatenate row slices of `train` into one frame with a fresh index.

    Returns an empty frame with the columns of `train` when `frames` is
    empty, because `pd.concat` raises ValueError on an empty list.
    """
    if not frames:
        return train.iloc[0:0].reset_index(drop=True)
    return pd.concat(frames, axis=0, sort=False).reset_index(drop=True)


(all_first_df0,
 all_first_df1,
 all_first_df2,
 all_first_df3,
 all_first_df4) = (_stack_rows(all_first_assess[i]) for i in range(5))

(after_second_df0,
 after_second_df1,
 after_second_df2,
 after_second_df3,
 after_second_df4) = (_stack_rows(after_second_assess[i]) for i in range(5))

HBox(children=(IntProgress(value=0, max=3614), HTML(value='')))




In [6]:
# Preview the first five rows of the first-row subset for session_title 0.
all_first_df0.head(n=5)

Unnamed: 0,Activity,Assessment,Clip,Game,3010,4035,2020,4100,3110,4070,...,Flower Waterer (Activity)_duration,Sandcastle Builder (Activity)_duration,sand_filled_ratio,Sandcastle Builder (Activity)_Dot_DragShovel_count,Sandcastle Builder (Activity)_Dot_SoCool_count,Sandcastle Builder (Activity)_Dot_FillItUp_count,Watering Hole (Activity)_duration,water_filled_ratio,installation_id,preds
0,4,1,8,4,226,6,26,5,223,156,...,164673,89767,0.5,5,14,6,0,0.0,0006a69f,1
1,7,2,11,5,246,45,50,5,240,577,...,267890,339403,0.492308,4,64,24,21389,0.0,0006c192,0
2,0,1,0,0,1,0,0,0,0,0,...,0,0,0.0,0,0,0,0,0.0,00129856,2
3,0,1,2,0,8,1,2,3,8,30,...,0,0,0.0,0,0,0,0,0.0,001d0ed0,0
4,3,1,6,3,81,6,10,0,81,204,...,132080,0,0.0,0,0,0,0,0.0,00225f67,0


In [7]:
import numpy as np

from typing import Union


def qwk(y_true: Union[np.ndarray, list],
        y_pred: Union[np.ndarray, list],
        max_rat: int = 3) -> float:
    """Quadratic weighted kappa between two rating vectors.

    Computed as ``1 - observed / expected`` where ``observed`` is the sum of
    squared rating differences and ``expected`` is the squared-distance
    weighted outer product of the two rating histograms divided by the
    number of samples.

    Args:
        y_true: Ground-truth ratings, expected to be integers in
            ``0..max_rat``.
        y_pred: Predicted ratings of the same length, expected to be
            integers in ``0..max_rat``.
        max_rat: Largest rating value; histograms have ``max_rat + 1`` bins.

    Returns:
        The kappa score, or NaN when the expected disagreement is zero
        (empty input, or both vectors constant on the same rating).

    Note:
        Values outside ``0..max_rat`` are not counted in the histograms but
        still contribute to the observed squared difference.
    """
    # Float cast keeps the subtraction below safe for unsigned integer input.
    y_true_ = np.asarray(y_true, dtype=np.float64)
    y_pred_ = np.asarray(y_pred, dtype=np.float64)

    ratings = np.arange(max_rat + 1)
    # Count every rating in BOTH vectors independently: a class that is
    # predicted but never occurs in y_true must still enter the histogram.
    hist1 = np.array([np.count_nonzero(y_true_ == r) for r in ratings],
                     dtype=np.float64)
    hist2 = np.array([np.count_nonzero(y_pred_ == r) for r in ratings],
                     dtype=np.float64)

    numerator = np.square(y_true_ - y_pred_).sum()

    # weights[i, j] = (i - j) ** 2
    weights = np.square(ratings[:, None] - ratings[None, :])
    denominator = hist1 @ weights @ hist2
    if denominator == 0:
        # 0 / 0: kappa is undefined here.
        return float("nan")

    denominator /= y_true_.shape[0]
    return float(1 - numerator / denominator)

In [8]:
# QWK on the first-row subset for session_title 0.
qwk_score = qwk(
    y_true=all_first_df0["accuracy_group"],
    y_pred=all_first_df0["preds"],
)
print(f"class 0: all first qwk: {qwk_score:.4f}")

class 0: all first qwk: 0.4341


In [9]:
# QWK on the first-row subset for session_title 1.
qwk_score = qwk(
    y_true=all_first_df1["accuracy_group"],
    y_pred=all_first_df1["preds"],
)
print(f"class 1: all first qwk: {qwk_score:.4f}")

class 1: all first qwk: 0.3674


In [10]:
# QWK on the first-row subset for session_title 2.
qwk_score = qwk(
    y_true=all_first_df2["accuracy_group"],
    y_pred=all_first_df2["preds"],
)
print(f"class 2: all first qwk: {qwk_score:.4f}")

class 2: all first qwk: 0.3194


In [11]:
# QWK on the first-row subset for session_title 3.
qwk_score = qwk(
    y_true=all_first_df3["accuracy_group"],
    y_pred=all_first_df3["preds"],
)
print(f"class 3: all first qwk: {qwk_score:.4f}")

class 3: all first qwk: 0.2719


In [12]:
# QWK on the first-row subset for session_title 4.
qwk_score = qwk(
    y_true=all_first_df4["accuracy_group"],
    y_pred=all_first_df4["preds"],
)
print(f"class 4: all first qwk: {qwk_score:.4f}")

class 4: all first qwk: 0.3300


In [13]:
# QWK on the later-rows subset (second row onward) for session_title 0.
qwk_score = qwk(
    y_true=after_second_df0["accuracy_group"],
    y_pred=after_second_df0["preds"],
)
print(f"class 0: after second qwk: {qwk_score:.4f}")

class 0: after second qwk: 0.5695


In [14]:
# QWK on the later-rows subset (second row onward) for session_title 1.
qwk_score = qwk(
    y_true=after_second_df1["accuracy_group"],
    y_pred=after_second_df1["preds"],
)
print(f"class 1: after second qwk: {qwk_score:.4f}")

class 1: after second qwk: 0.4456


In [15]:
# QWK on the later-rows subset (second row onward) for session_title 2.
qwk_score = qwk(
    y_true=after_second_df2["accuracy_group"],
    y_pred=after_second_df2["preds"],
)
print(f"class 2: after second qwk: {qwk_score:.4f}")

class 2: after second qwk: 0.4335


In [16]:
# QWK on the later-rows subset (second row onward) for session_title 3.
qwk_score = qwk(
    y_true=after_second_df3["accuracy_group"],
    y_pred=after_second_df3["preds"],
)
print(f"class 3: after second qwk: {qwk_score:.4f}")

class 3: after second qwk: 0.5430


In [17]:
# QWK on the later-rows subset (second row onward) for session_title 4.
qwk_score = qwk(
    y_true=after_second_df4["accuracy_group"],
    y_pred=after_second_df4["preds"],
)
print(f"class 4: after second qwk: {qwk_score:.4f}")

class 4: after second qwk: 0.5081


In [20]:
# Pooled QWK over the first-row subsets of all five session titles
# (targets and predictions are concatenated in the same frame order).
qwk_score = qwk(*(
    np.concatenate([
        frame[column]
        for frame in (all_first_df0, all_first_df1, all_first_df2,
                      all_first_df3, all_first_df4)
    ])
    for column in ("accuracy_group", "preds")
))
print(f"overall qwk: {qwk_score:.4f}")

overall qwk: 0.5688


In [21]:
# Pooled QWK over the later-rows subsets (second row onward) of all five
# session titles (targets and predictions are concatenated in the same
# frame order).
qwk_score = qwk(*(
    np.concatenate([
        frame[column]
        for frame in (after_second_df0, after_second_df1, after_second_df2,
                      after_second_df3, after_second_df4)
    ])
    for column in ("accuracy_group", "preds")
))
print(f"overall qwk: {qwk_score:.4f}")

overall qwk: 0.6309
