In [1]:
cd ..

/code/dsb2019/notebooks


## init

In [2]:
import os
import pandas as pd
from pathlib import Path
from pandas.io.json import json_normalize
import json
import lightgbm as lgb
import numpy as np
from tqdm import tqdm
tqdm.pandas()


def read_data_model():
    if "LOCAL_LAB_ENV" in os.environ:
        data_dir = Path("../data/raw")
        model_dir = Path("/code/dsb2019/models")
    else:
        data_dir = Path("/kaggle/input/data-science-bowl-2019")
        model_dir = Path("/kaggle/input/time-baseline3")

    test = pd.read_csv(data_dir / "test.csv")
    test["accuracy_group"] = None
    model = lgb.Booster(model_file=str(model_dir / "time_baseline.lgb"))
    return test, model

test, model = read_data_model()

## lgb_classifier.py

In [70]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

from functools import reduce
from functools import partial

from dsb2019.models.tracking import track_experiment, track_submission_info
from dsb2019.data.validation import InstallationFold, cross_validate, quad_kappa


def lgb_quad_kappa(preds, true):
    true = true.get_label()
    preds = preds.reshape((4, -1)).argmax(axis=0)
    return "quad_kappa", quad_kappa(true, preds), True
    
    
def train_baseline(x_train,y_train, params=None):
    x_train_all, x_val_all,y_train_all,y_val_all = train_test_split(
        x_train,y_train,
        test_size=0.15,
        random_state=2019,
    )
    train_set = lgb.Dataset(x_train_all, y_train_all)
    val_set = lgb.Dataset(x_val_all, y_val_all)

    return lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=2000, valid_sets=[train_set, val_set], verbose_eval=100,
                    feval=lgb_quad_kappa)


def make_features_wrapper(*dataframes):
    def make_features(df):
        return df.drop(["installation_id", "accuracy_group", "target_game_session"], axis=1), df.accuracy_group.values
    
    result = tuple([make_features(df) for df in dataframes]) 
    if len(result) == 1:
        return result[0]
    return result


def make_predictions(model,x_test_all,y_test):
    pred=model.predict(x_test_all).argmax(axis=1)
    return pred,y_test


def make_submission(test_features, model):
    installations = test_features.installation_id.values
    test, _ = make_features_wrapper(test_features)
    predictions, _ = make_predictions(model, test, None)
    return pd.DataFrame(data={"installation_id": installations, "accuracy_group": predictions})


## game.py

In [71]:
games = ['Scrub-A-Dub', 'All Star Sorting', 'Mushroom Sorter (Assessment)',
       'Air Show', 'Crystals Rule', 'Bird Measurer (Assessment)',
       'Dino Drink', 'Bubble Bath', 'Dino Dive', 'Chow Time',
       'Cauldron Filler (Assessment)', 'Pan Balance', 'Happy Camel',
       'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)',
       'Leaf Leader']


def calculate_ratios(df):
    n_correct=df.correct_move.sum()
    n_incorrect=df.wrong_move.sum()
    ratio=n_correct/(n_correct+n_incorrect)
    return n_correct, n_incorrect, ratio


def assessment_title(df, assessment):
    assessment_title=assessment.title    
    return {"title": games.index(assessment_title)}


def make_move_stats(df, assessment, title="", n_lags=2):
    if "correct" in df.columns:
        df["correct_move"] = df.correct == True
        df["wrong_move"] = df.correct == False
    else:
        df["correct_move"]=False
        df["wrong_move"]=False
    result = []
    result.extend(zip([f"n_correct {title}", f"n_incorrect {title}", f"global_ratio {title}"], calculate_ratios(df)))
    if n_lags:
        last_sessions = df.game_session.unique()[-n_lags:]
        for i in range(n_lags):
            if i < len(last_sessions): 
                result.extend(zip([f"n_correct {title} {i}", f"n_incorrect {title} {i}",f"ratio {title} {i}"], 
                                    calculate_ratios(df[df.game_session==last_sessions[i]])))
            else:
                result.extend(zip([f"n_correct {title} {i}", f"n_incorrect {title} {i}",f"ratio {title} {i}"], [None, None, None]))
    return {k: v for k, v in result}


## framework.py

In [72]:
import contextlib
import os
import joblib
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd

from functools import reduce, partial
import json

from tqdm import tqdm
from dsb2019.features.game import games


def unwrap_event_data(df):
    unwrapped=pd.DataFrame(data=list(df.event_data.apply(json.loads).values))
    return pd.concat([unwrapped.reset_index(),df.reset_index()],axis=1)


def process_test_installations(test, process_log):
    test_labels = prepare_test_labels(test)
    return process_installations(test_labels, test, process_log)


def process_test_installations_single(test, process_log):
    test_labels = prepare_test_labels(test)
    return process_installations_single(test_labels, test, process_log)


def prepare_test_labels(test):
    if "accuracy_group" not in test:
        test["accuracy_group"] = None
    index = {}
    df = test[["installation_id", "game_session", "timestamp"]].sort_values(["installation_id", "timestamp"])
    for i, installation_id, game_session, timestamp in df.itertuples():
        index[installation_id] = game_session
    installations, sessions = zip(*index.items())
    index = pd.DataFrame(data={"installation_id": installations, "game_session": sessions})
    return pd.merge(index, test[["game_session", "title", "installation_id", "accuracy_group"]].drop_duplicates(), 
                    on=["installation_id", "game_session"])


def process_installations(train_labels, train, process_log, n_installations_in_chunk=100, n_jobs=None):
    installation_ids = train.installation_id.unique()
    chunk_size = n_installations_in_chunk
    n_jobs = n_jobs if n_jobs is not None else cpu_count()
    tasks = []
    for p, i_low in enumerate(tqdm(range(0, len(installation_ids), chunk_size), desc="Generating tasks", position=0)):
        i_high = min(len(installation_ids), i_low + chunk_size)
        installation_ids_chunk = installation_ids[i_low:i_high]
        train_labels_chunk = train_labels[train_labels.installation_id.isin(installation_ids_chunk)].copy()
        train_chunk = train[train.installation_id.isin(installation_ids_chunk)].copy()
        task = joblib.delayed(process_installations_single)(train_labels_chunk, train_chunk, process_log, position=p+1)
        tasks.append(task)
    
    result = []
    with tqdm_joblib(tqdm(desc="Completing tasks", total=len(tasks), position=0)) as progress_bar:
        with joblib.Parallel(n_jobs=n_jobs) as workers:
            for result_df in workers(tasks):
                result.append(result_df)
    return pd.concat(result, ignore_index=True)


@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
    class TqdmBatchCompletionCallback:
        def __init__(self, time, index, parallel):
            self.index = index
            self.parallel = parallel

        def __call__(self, index):
            tqdm_object.update()
            if self.parallel._original_iterator is not None:
                self.parallel.dispatch_next()
    
    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()


def process_installations_single(train_labels, train, process_log, position=1):
    result = []
    train = train.sort_values("timestamp")
    train["timestamp"] = pd.to_datetime(train.timestamp)
    installations = train.groupby("installation_id")
    for i, game_session, title, installation_id, accuracy_group in tqdm(train_labels[["game_session", "title", "installation_id", "accuracy_group"]].itertuples(), 
                                                              total=len(train_labels), position=position, desc=f"Processing chunk {position}"):
        player_log = installations.get_group(installation_id).reset_index()
        log_length = player_log[(player_log.game_session==game_session) & (player_log.title==title)].index[0]
        player_log = player_log.iloc[:(log_length + 1)]
        player_log["accuracy_group"] = accuracy_group
        player_log["target_game_session"] = game_session
        features = process_log(player_log)
        features["installation_id"] = installation_id
        features["target_game_session"] = game_session
        features["accuracy_group"] = accuracy_group
        result.append(features)
    df = pd.DataFrame(data=result)
    return df[sorted(df.columns)].fillna(-1)


class LogProcessor:
    def __init__(self, global_features, game_features):
        self.global_features = global_features
        self.game_features = game_features

    def __call__(self, df):
        assessment = df.iloc[-1]
        history = df.iloc[:-1]
        history = history[history.type.isin(["Game", "Assessment"])].copy()
        
        result = {}
        for func in self.global_features:
            result.update(func(df, assessment))
        for game in games:
            game_feature_funcs = self.game_features.get(game, [])
            if game_feature_funcs:
                game_info=history[history.title==game].copy()
                if len(game_info):
                    game_info = unwrap_event_data(game_info)
                for func in game_feature_funcs:
                    result.update(func(game_info, assessment))
        return result


def with_prefix(func, prefix):
    def f(df, assessment):
        return {f"{prefix}_{k}": v for k, v in func(df, assessment).items()}
    return f


## time.py

In [73]:
import numpy as np


def make_calendar_features(df, assessment):
    ts = assessment.timestamp
    year = ts.year
    month = ts.month
    dayofweek = ts.dayofweek
    time = ts.time()
    return {
        "month": month,
        "dayofweek": dayofweek,
        "hour": time.hour,
    }


def make_base_time_features(df, assessment):
    start_end_times = df.groupby("game_session").agg({"timestamp": ["min", "max", "count"]}).reset_index()
    start_end_times.columns = ["game_session", "start_time", "end_time", "n_turns"]
    start_end_times["duration"] = start_end_times.end_time - start_end_times.start_time
    duration_minutes = start_end_times.duration / np.timedelta64(1, "m")
    result = {
        "mean_session_time_minutes": round(duration_minutes.mean(), 2), 
        "mean_session_turns":  round(start_end_times.n_turns.mean(), 2)
    }
    last_event_time = assessment.timestamp
    first_event_time = start_end_times.start_time.min()
    
    days_active = round((last_event_time - first_event_time) / np.timedelta64(1, "D"), 0) + 1
    result["games_per_day"] = round(df.game_session.nunique() / days_active, 2)
    result["games_played"] = df.game_session.nunique()
    minutes_between_games = ((start_end_times.start_time - start_end_times.start_time.shift(1)).dropna() / np.timedelta64(1, "m")).round(1)
    result["mean_minutes_between_games"] = round(minutes_between_games.mean(), 2)
    return result


## Solution

In [74]:
log_processor = LogProcessor([assessment_title, make_calendar_features, 
                                                make_base_time_features], 
                                               {name: [partial(make_move_stats, title=name),
                                                       with_prefix(make_base_time_features, name)] 
                                                for name in games})
test_features = process_test_installations(test, log_processor)

Generating tasks: 100%|██████████| 10/10 [00:00<00:00, 25.88it/s]
Completing tasks: 100%|██████████| 10/10 [00:49<00:00,  4.96s/it]


In [79]:
submission = make_submission(test_features, model)

In [81]:
submission.accuracy_group.value_counts()

3    776
0    195
1     29
Name: accuracy_group, dtype: int64

In [80]:
submission.to_csv("../data/submissions/time_baseline.csv", index=False)