In [4]:
import pandas as pd
import json
import lightgbm as lgb
from tqdm import tqdm
tqdm.pandas()


# Read the raw test-set event log and load the saved LightGBM booster from disk.
# Both paths are relative to the notebook's working directory.
test = pd.read_csv('../data/raw/test.csv')
model = lgb.Booster(model_file='../models/game_baseline.lgb')

In [5]:
# Titles for which move statistics are computed.
# The order is significant: the position of a title in this list
# (``games.index(title)``) is used as a feature value in this notebook.
games = [
    "Scrub-A-Dub",
    "All Star Sorting",
    "Mushroom Sorter (Assessment)",
    "Air Show",
    "Crystals Rule",
    "Bird Measurer (Assessment)",
    "Dino Drink",
    "Bubble Bath",
    "Dino Dive",
    "Chow Time",
    "Cauldron Filler (Assessment)",
    "Pan Balance",
    "Happy Camel",
    "Cart Balancer (Assessment)",
    "Chest Sorter (Assessment)",
    "Leaf Leader",
]


def unwrap_event_data(df):
    """Expand the JSON strings in ``df.event_data`` into columns.

    Each ``event_data`` value is parsed with ``json.loads`` and the parsed
    records are turned into a DataFrame. That frame and ``df`` are then both
    given a fresh positional index (``reset_index()``, which keeps the old
    index as an extra column on each side) and concatenated side by side.

    Args:
        df: DataFrame with an ``event_data`` column of JSON strings.

    Returns:
        DataFrame holding the parsed event fields followed by the original
        columns of ``df``. Column names may repeat (for example the two index
        columns produced by ``reset_index()``).
    """
    parsed_events = [json.loads(raw_event) for raw_event in df.event_data]
    event_frame = pd.DataFrame(data=parsed_events)
    # Both sides are re-indexed 0..n-1 so the column-wise concat lines up row by row.
    pieces = [event_frame.reset_index(), df.reset_index()]
    return pd.concat(pieces, axis=1)


def process_log(df):
    """Build the feature dict for the event log of one installation.

    The title of the last row of ``df`` is treated as the assessment being
    predicted. All earlier rows whose ``type`` is "Game" or "Assessment" form
    the history; for every title in the module-level ``games`` list, counts of
    correct / incorrect moves and their ratio are computed over that history,
    both globally and for the last ``n_lags`` game sessions of that title.

    Args:
        df: Event log of a single installation. The code reads the columns
            ``title``, ``type``, ``event_data`` and ``game_session``. Row order
            matters: the last row is the target and session lags follow the
            order in which sessions appear.

    Returns:
        dict of feature name -> value. Keys are, in order: ``"title"`` (index
        of the assessment title in ``games``), then for each title in
        ``games``: ``n_correct``, ``n_incorrect``, ``global_ratio`` and, per
        lag ``i``, ``n_correct ... i``, ``n_incorrect ... i``, ``ratio ... i``.
        Ratios are NaN when there are no graded moves; lag features are None
        when fewer sessions exist than lags.

    Raises:
        ValueError: if the title of the last row is not in ``games``.
    """
    assessment_title = df.title.iloc[-1]

    # Everything before the final row is history; only Game/Assessment rows are kept.
    history = df.iloc[:-1]
    history = history[history.type.isin(["Game", "Assessment"])].copy()

    def calculate_ratios(moves):
        """Return ``(n_correct, n_incorrect, n_correct / (n_correct + n_incorrect))``."""
        n_correct = moves.correct_move.sum()
        n_incorrect = moves.wrong_move.sum()
        n_total = n_correct + n_incorrect
        # With no graded moves the ratio is undefined. Return NaN explicitly
        # instead of relying on 0/0, which emits a NumPy RuntimeWarning (or
        # raises, depending on the integer type / NumPy error settings).
        ratio = n_correct / n_total if n_total else float("nan")
        return n_correct, n_incorrect, ratio

    def make_move_stats(events, title, n_lags=2):
        """Compute global and per-session move statistics for one title."""
        events = events.copy()
        if len(events):
            events = unwrap_event_data(events)
        if "correct" in events.columns:
            # Element-wise comparisons on purpose: rows without a "correct"
            # value compare False on both sides and count as neither.
            events["correct_move"] = events.correct == True  # noqa: E712
            events["wrong_move"] = events.correct == False  # noqa: E712
        else:
            events["correct_move"] = False
            events["wrong_move"] = False

        n_correct, n_incorrect, ratio = calculate_ratios(events)
        stats = {
            f"n_correct {title}": n_correct,
            f"n_incorrect {title}": n_incorrect,
            f"global_ratio {title}": ratio,
        }
        if n_lags:
            # NOTE: lag ``i`` indexes into the *last* ``n_lags`` sessions in
            # order of appearance, so with at least ``n_lags`` sessions lag 0
            # is the oldest of them and the highest lag is the most recent.
            # This ordering is kept as-is; presumably the saved model was
            # trained on features built the same way (confirm before changing).
            last_sessions = events.game_session.unique()[-n_lags:]
            for i in range(n_lags):
                if i < len(last_sessions):
                    in_session = events[events.game_session == last_sessions[i]]
                    n_correct, n_incorrect, ratio = calculate_ratios(in_session)
                else:
                    n_correct = n_incorrect = ratio = None
                stats[f"n_correct {title} {i}"] = n_correct
                stats[f"n_incorrect {title} {i}"] = n_incorrect
                stats[f"ratio {title} {i}"] = ratio
        return stats

    result = {"title": games.index(assessment_title)}
    for game in games:
        result.update(make_move_stats(history[history.title == game], game))
    return result


def process_test_installations(test):
    """Turn the raw test event log into one feature row per installation.

    The log is sorted by ``timestamp``, grouped by ``installation_id`` and each
    group is passed to ``process_log`` (through ``progress_apply``, which is
    available once ``tqdm.pandas()`` has been called). The resulting feature
    dicts are tagged with their installation id and assembled into a frame.

    Args:
        test: Event log with at least ``timestamp`` and ``installation_id``
            columns, plus whatever ``process_log`` reads.

    Returns:
        DataFrame with one row per installation: the feature columns followed
        by ``installation_id``. Missing values are replaced with -1.
    """
    ordered_log = test.sort_values("timestamp")
    per_installation = (
        ordered_log.groupby("installation_id")
        .progress_apply(process_log)
        .reset_index()
    )
    per_installation.columns = ["installation_id", "features"]

    feature_rows = []
    for installation_id, feature in zip(
        per_installation["installation_id"], per_installation["features"]
    ):
        # The id is added last so it ends up as the final column of the frame.
        feature["installation_id"] = installation_id
        feature_rows.append(feature)
    return pd.DataFrame(feature_rows).fillna(-1)

# Build the per-installation feature table for the test log.
# This is the slow step: the recorded run below took about a minute and a half.
test_features=process_test_installations(test)

100%|██████████| 1000/1000 [01:28<00:00, 11.33it/s]


In [9]:
def make_submission(test_features, model):
    """Predict an accuracy group for every installation.

    Args:
        test_features: Feature frame containing an ``installation_id`` column;
            all remaining columns are passed to the model unchanged.
        model: Object with a ``predict`` method returning a 2-D array of
            per-class scores (one row per sample); the arg-max over axis 1 is
            used as the predicted class.

    Returns:
        DataFrame with columns ``installation_id`` and ``accuracy_group``.
    """
    installation_ids = test_features.installation_id.values
    feature_matrix = test_features.drop(columns="installation_id")
    class_scores = model.predict(feature_matrix)
    predicted_groups = class_scores.argmax(axis=1).astype(int)
    return pd.DataFrame(
        data={
            "installation_id": installation_ids,
            "accuracy_group": predicted_groups,
        }
    )

# Score the test features with the loaded model to get one predicted group per installation.
submission = make_submission(test_features, model)

In [13]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv("../data/submissions/game_baseline.csv", index=False)