In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [76]:
import os
from target_encoding import TargetEncoderClassifier, TargetEncoder
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score
import json
from functools import partial

from dsb2019.data.validation import InstallationFold, cross_validate
from dsb2019.visualization import session_browser
from dsb2019.data import DATA_DIR
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
# Register `progress_apply` on pandas objects (add_features relies on it).
tqdm.pandas()
# Raise the row display limit so long frames/series are not truncated.
pd.options.display.max_rows=999

In [3]:
# Load the input tables from DATA_DIR. Note that `train` is read from the
# `interim/` folder while the other three tables come from `raw/`.
train = pd.read_csv(DATA_DIR / 'interim/train.csv')
test = pd.read_csv(DATA_DIR / 'raw/test.csv')
train_labels = pd.read_csv(DATA_DIR / 'raw/train_labels.csv')
submission = pd.read_csv(DATA_DIR / 'raw/sample_submission.csv')

In [4]:
def preprocess_sessions(df):
    """Truncate every installation's event history at its last assessment.

    For each ``installation_id`` group the rows are sorted by ``timestamp``.
    When the group contains at least one row with ``type == "Assessment"``,
    all rows belonging to the game session of the last such row are removed,
    and only the first row of that session is appended after the remaining
    rows. Groups without any assessment row contribute no rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``installation_id``, ``timestamp``, ``type`` and
        ``game_session``.

    Returns
    -------
    pandas.DataFrame
        The per-installation frames concatenated by ``groupby(...).apply``.
        Kept rows carry their original row label in an ``index`` column.
    """
    def cut_last_assessment(sub_df):
        # Depending on the pandas version, groupby.apply may or may not pass the
        # grouping column to this function; errors="ignore" makes the drop work
        # in both cases instead of raising KeyError.
        sub_df = sub_df.sort_values("timestamp").drop(columns="installation_id", errors="ignore")
        assessments = sub_df[sub_df.type == "Assessment"]
        if assessments.empty:
            # No assessment for this installation: nothing to keep.
            return sub_df.head(0)
        last_game_session = assessments.game_session.iloc[-1]
        earlier_rows = sub_df[sub_df.game_session != last_game_session].copy().reset_index()
        # Only the first event of the final assessment session is retained.
        last_row = sub_df[sub_df.game_session == last_game_session].head(1).reset_index()
        return pd.concat([earlier_rows, last_row])

    return df.groupby("installation_id").apply(cut_last_assessment)

#preprocessed_train_sessions = preprocess_sessions(train)

In [88]:
# Titles for which process_installation computes per-title move statistics.
games = ['Scrub-A-Dub', 'All Star Sorting', 'Mushroom Sorter (Assessment)',
       'Air Show', 'Crystals Rule', 'Bird Measurer (Assessment)',
       'Dino Drink', 'Bubble Bath', 'Dino Dive', 'Chow Time',
       'Cauldron Filler (Assessment)', 'Pan Balance', 'Happy Camel',
       'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)',
       'Leaf Leader']

def unwrap_event_data(df):
    """Expand the JSON strings in ``df.event_data`` into columns.

    Each ``event_data`` value is parsed with ``json.loads``; the parsed records
    form a new frame that is placed to the left of the original columns. Both
    halves are passed through ``reset_index()``, so the result carries two
    ``index`` columns: the positional one of the parsed records and the
    original row labels of ``df``.
    """
    parsed_records = [json.loads(raw_event) for raw_event in df.event_data]
    events_frame = pd.DataFrame(data=parsed_records).reset_index()
    source_frame = df.reset_index()
    return pd.concat([events_frame, source_frame], axis=1)


def process_installation(df):
    """Build a one-row frame of per-title move statistics for one installation.

    The events are sorted by ``timestamp``. The ``title`` of the last row is
    reported as ``title``; that last row is excluded from the history, and only
    history rows of ``type`` "Game" or "Assessment" are used.
    (Presumably the last row is the assessment to be predicted; confirm
    against how the input frame is prepared.)

    For every title in the module-level ``games`` list the row contains:

    * ``n_correct <title>`` / ``n_incorrect <title>``: number of events whose
      parsed ``event_data`` has ``correct`` equal to true / false,
    * ``global_ratio <title>``: ``n_correct / (n_correct + n_incorrect)``,
    * the same three numbers per lag slot ``0 .. n_lags-1`` for the most recent
      game sessions of that title.

    Lag slots are right-aligned: slot ``n_lags-1`` is always the most recent
    session and slot ``n_lags-2`` the one before it. When fewer than ``n_lags``
    sessions exist, the leading slots are the missing ones.

    Missing or undefined values (no session for a slot, or a ratio without any
    moves) are reported as ``-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Events of a single installation with the columns ``timestamp``,
        ``title``, ``type``, ``game_session`` and ``event_data``.

    Returns
    -------
    pandas.DataFrame
        A frame with exactly one row.
    """
    df = df.sort_values("timestamp")
    assessment_title = df.title.iloc[-1]
    history = df.iloc[:-1]
    history = history[history.type.isin(["Game", "Assessment"])].copy()

    def calculate_ratios(moves):
        """Return (n_correct, n_incorrect, share of correct moves)."""
        n_correct = moves.correct_move.sum()
        n_incorrect = moves.wrong_move.sum()
        n_total = n_correct + n_incorrect
        # Guard the 0/0 case explicitly instead of relying on numpy turning it
        # into NaN (with a RuntimeWarning); NaN becomes -1 in the final fillna.
        ratio = n_correct / n_total if n_total else float("nan")
        return n_correct, n_incorrect, ratio

    def make_move_stats(events, title, n_lags=2):
        """Compute the global and per-lag statistics for one title."""
        events = events.copy()
        if len(events):
            events = unwrap_event_data(events)
        if "correct" in events.columns:
            # Element-wise comparisons on purpose: events without a `correct`
            # field are NaN here and must count as neither correct nor wrong.
            events["correct_move"] = events.correct == True
            events["wrong_move"] = events.correct == False
        else:
            events["correct_move"] = False
            events["wrong_move"] = False

        stats = {}
        global_names = [f"n_correct {title}", f"n_incorrect {title}", f"global_ratio {title}"]
        for name, value in zip(global_names, calculate_ratios(events)):
            stats[name] = [value]

        if n_lags:
            # Sessions appear in chronological order because the history is
            # sorted by timestamp and unique() keeps first-appearance order.
            last_sessions = events.game_session.unique()[-n_lags:]
            n_missing = n_lags - len(last_sessions)
            for i in range(n_lags):
                lag_names = [f"n_correct {title} {i}", f"n_incorrect {title} {i}", f"ratio {title} {i}"]
                if i >= n_missing:
                    session = last_sessions[i - n_missing]
                    lag_values = calculate_ratios(events[events.game_session == session])
                else:
                    lag_values = (None, None, None)
                for name, value in zip(lag_names, lag_values):
                    stats[name] = [value]
        return stats

    result = {"title": assessment_title}
    for game in games:
        result.update(make_move_stats(history[history.title == game], game))
    return pd.DataFrame(result).fillna(-1)

def add_features(df):
    """Compute the process_installation features for every installation.

    Groups ``df`` by ``installation_id``, applies ``process_installation`` to
    each group with a progress bar, then flattens the resulting index and
    discards the inner ``level_1`` index column.
    """
    grouped = df.groupby("installation_id")
    per_installation = grouped.progress_apply(process_installation)
    flattened = per_installation.reset_index()
    return flattened.drop("level_1", axis=1)

# Try the feature pipeline on a single installation before running it on everything.
game_features=add_features(train[train.installation_id=='545ee585'])










100%|██████████| 1/1 [00:00<00:00,  5.05it/s]


In [91]:
# Look up the label rows recorded for the installation used above.
train_labels[train_labels["installation_id"]=="545ee585"]

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group


In [89]:
#train.groupby("installation_id")
# Attach labels to the features. pd.merge defaults to an inner join, so rows
# without a matching (installation_id, title) label are dropped.
pd.merge(game_features,train_labels,on=["installation_id","title"])

Unnamed: 0,installation_id,title,n_correct Scrub-A-Dub,n_incorrect Scrub-A-Dub,global_ratio Scrub-A-Dub,n_correct Scrub-A-Dub 0,n_incorrect Scrub-A-Dub 0,ratio Scrub-A-Dub 0,n_correct Scrub-A-Dub 1,n_incorrect Scrub-A-Dub 1,...,n_incorrect Leaf Leader 0,ratio Leaf Leader 0,n_correct Leaf Leader 1,n_incorrect Leaf Leader 1,ratio Leaf Leader 1,game_session,num_correct,num_incorrect,accuracy,accuracy_group


In [78]:
# Inspect the labels table.
train_labels

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.000000,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.000000,0
2,901acc108f55a5a1,0006a69f,Mushroom Sorter (Assessment),1,0,1.000000,3
3,9501794defd84e4d,0006a69f,Mushroom Sorter (Assessment),1,1,0.500000,2
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.000000,3
...,...,...,...,...,...,...,...
17685,c996482b11d149dd,ffc90c32,Bird Measurer (Assessment),1,0,1.000000,3
17686,b05a02b52d5c1f4c,ffd2871d,Cauldron Filler (Assessment),1,0,1.000000,3
17687,5448d652309a6324,ffeb0b1b,Cauldron Filler (Assessment),1,2,0.333333,1
17688,a6885ab824fbc32c,ffeb0b1b,Mushroom Sorter (Assessment),0,1,0.000000,0


In [77]:
# NOTE(review): the columns shown for `game_features` in the recorded outputs do
# not include `train`; this attribute access looks stale — confirm what was
# meant to be displayed here.
game_features.train

installation_id,545ee585
Unnamed: 0_level_1,0
assessment_title,3.0
n_correct Scrub-A-Dub,0.0
n_incorrect Scrub-A-Dub,0.0
global_ratio Scrub-A-Dub,-1.0
n_correct Scrub-A-Dub 0,-1.0
n_incorrect Scrub-A-Dub 0,-1.0
ratio Scrub-A-Dub 0,-1.0
n_correct Scrub-A-Dub 1,-1.0
n_incorrect Scrub-A-Dub 1,-1.0
ratio Scrub-A-Dub 1,-1.0


In [None]:
def train_baseline(x_train,y_train):
    """Fit a LightGBM multiclass baseline with a random hold-out set.

    15% of the samples are split off (fixed seed 2019) and used as validation
    data for early stopping; the remaining 85% are used for fitting.

    Parameters
    ----------
    x_train : array-like or DataFrame
        Feature matrix.
    y_train : array-like
        Class labels aligned with ``x_train``; the objective is configured for
        4 classes.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train, y_train,
        test_size=0.15,
        random_state=2019,
    )
    # Each Dataset must be labelled with the labels of its own split.
    train_set = lgb.Dataset(x_fit, y_fit)
    val_set = lgb.Dataset(x_val, y_val)

    params = {
        'learning_rate': 0.01,
        'bagging_fraction': 0.9,
        'feature_fraction': 0.9,
        'num_leaves': 14,
        'lambda_l1': 0.1,
        'lambda_l2': 1,
        'metric': 'multiclass',
        'objective': 'multiclass',
        'num_classes': 4,
        'random_state': 2019
    }

    # NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect the
    # early-stopping / logging callbacks instead — confirm the installed version.
    return lgb.train(
        params,
        train_set,
        num_boost_round=10000,
        early_stopping_rounds=300,
        valid_sets=[train_set, val_set],
        verbose_eval=100,
    )


def make_features_wrapper(train, test):
    """Regroup the output of ``make_features`` into two tuples.

    Returns ``(x_train, x_val, y_train, y_val)`` as the first tuple and
    ``(x_test, test.accuracy_group.values)`` as the second.
    """
    features = make_features(train, test)
    x_train_all, x_val_all, x_test_all, y_train, y_val = features
    fit_inputs = (x_train_all, x_val_all, y_train, y_val)
    eval_inputs = (x_test_all, test.accuracy_group.values)
    return fit_inputs, eval_inputs

def make_predictions(model,x_test_all,y_test):
    """Predict class labels and return them together with the true labels.

    ``model.predict`` is expected to return per-class scores with one row per
    sample; the predicted label is the arg-max along axis 1. ``y_test`` is
    passed through unchanged.
    """
    class_scores = model.predict(x_test_all)
    predicted_labels = class_scores.argmax(axis=1)
    return predicted_labels, y_test