In [1]:
cd ..

/code/dsb2019/notebooks


In [2]:
import pandas as pd
from pandas.io.json import json_normalize
import json
import lightgbm as lgb
import numpy as np
from tqdm import tqdm
# Register tqdm's pandas integration; `progress_apply` used further down
# depends on this call having run.
tqdm.pandas()


# Inputs for inference: the raw event log to score and a LightGBM booster
# loaded from a saved model file. Both paths are relative to the working
# directory selected by the `cd ..` in the first cell.
test = pd.read_csv('../data/raw/test.csv')
model = lgb.Booster(model_file='../models/time_baseline.lgb')

In [3]:
# Titles for which per-game features are generated. `make_base_features`
# encodes an assessment title as its position in this list, so the order of
# the entries is significant.
games = [
    'Scrub-A-Dub',
    'All Star Sorting',
    'Mushroom Sorter (Assessment)',
    'Air Show',
    'Crystals Rule',
    'Bird Measurer (Assessment)',
    'Dino Drink',
    'Bubble Bath',
    'Dino Dive',
    'Chow Time',
    'Cauldron Filler (Assessment)',
    'Pan Balance',
    'Happy Camel',
    'Cart Balancer (Assessment)',
    'Chest Sorter (Assessment)',
    'Leaf Leader',
]


def unwrap_event_data(df):
    """Expand the JSON in ``df.event_data`` into columns next to the original ones.

    Each ``event_data`` string is parsed with ``json.loads`` and flattened with
    ``json_normalize``. Both the flattened frame and ``df`` go through
    ``reset_index()`` before being concatenated column-wise, so rows are paired
    by position; as a side effect the result carries two columns named
    ``index``.

    Returns:
        A new DataFrame: flattened event columns first, then the columns of ``df``.
    """
    parsed_events = df.event_data.apply(json.loads)
    event_columns = json_normalize(parsed_events).reset_index()
    original_columns = df.reset_index()
    return pd.concat([event_columns, original_columns], axis=1)


def process_installations_parallel(process_installations, process_log, df, *dataframes):
    """Run ``process_installations`` over chunks of installations in parallel.

    The unique ``installation_id`` values of ``df`` are split into consecutive
    chunks. For every chunk, ``df`` and each frame in ``dataframes`` are
    filtered to those installations and handed to ``process_installations`` as
    a joblib job, together with ``process_log`` and a ``position`` keyword equal
    to the chunk number.

    Args:
        process_installations: Callable invoked as
            ``process_installations(process_log, df_chunk, *dataframes_chunk, position=pos)``;
            it must return a DataFrame.
        process_log: Passed through unchanged to ``process_installations``.
        df: DataFrame with an ``installation_id`` column; defines the chunks.
        *dataframes: Further DataFrames with an ``installation_id`` column,
            filtered the same way as ``df``.

    Returns:
        The concatenation of all per-chunk results, or an empty DataFrame when
        ``df`` contains no installations.
    """
    installations = df.installation_id.unique()
    if len(installations) == 0:
        # No jobs to dispatch; pd.concat would raise on an empty list.
        return pd.DataFrame()

    # Neither name is imported by the notebook's import cell, so bring them
    # into scope here to keep the function self-contained.
    from multiprocessing import cpu_count
    import joblib

    n_jobs = cpu_count()
    # Floor division gives 0 when there are fewer installations than workers
    # and range() rejects a zero step, so use at least one installation per chunk.
    chunk_size = max(1, len(installations) // n_jobs)

    jobs = []
    for pos, start in enumerate(range(0, len(installations), chunk_size)):
        inst_chunk = installations[start:start + chunk_size]
        df_chunk = df[df.installation_id.isin(inst_chunk)].copy()
        dataframes_chunk = [d[d.installation_id.isin(inst_chunk)].copy() for d in dataframes]
        jobs.append(
            joblib.delayed(process_installations)(process_log, df_chunk, *dataframes_chunk, position=pos)
        )

    with joblib.Parallel(n_jobs=n_jobs) as workers:
        result = [result_df.reset_index() for result_df in workers(jobs)]
    # reset_index() materialised each chunk's index as an "index" column; drop it.
    return pd.concat(result).drop("index", axis=1)


def process_installations(process_log, train_labels, train, position=0):
    """Build one feature row per labelled assessment.

    For every row of ``train_labels`` the events of that installation are
    truncated right after the first event of the labelled session, annotated
    with the label, and passed to ``process_log``.

    Args:
        process_log: Callable taking the truncated event log (a DataFrame) and
            returning a dict of features.
        train_labels: DataFrame with ``game_session``, ``title``,
            ``installation_id`` and ``accuracy_group`` columns.
        train: Event log with at least ``timestamp``, ``event_count``,
            ``installation_id``, ``game_session`` and ``title`` columns. It is
            not modified.
        position: Forwarded to ``tqdm`` as the progress-bar position.

    Returns:
        DataFrame with one row per label; missing values are filled with -1.

    Raises:
        IndexError: If a labelled session has no matching event in ``train``.
    """
    # Derive a new frame instead of writing the parsed timestamps back into
    # the caller's DataFrame.
    events = (
        train.assign(timestamp=pd.to_datetime(train.timestamp))
        .drop(["event_count"], axis=1)
        .sort_values("timestamp")
    )
    installations = events.groupby("installation_id")
    label_rows = train_labels[["game_session", "title", "installation_id", "accuracy_group"]]

    result = []
    for _, game_session, title, installation_id, accuracy_group in tqdm(
            label_rows.itertuples(), total=len(train_labels), position=position):
        player_log = installations.get_group(installation_id).reset_index()
        is_target = (player_log.game_session == game_session) & (player_log.title == title)
        # After reset_index() the label of the first match is also its position.
        log_length = player_log[is_target].index[0]
        # Copy the slice so the label columns are added to an independent frame
        # rather than to a view of the grouped data.
        player_log = player_log.iloc[:(log_length + 1)].copy()
        player_log["accuracy_group"] = accuracy_group
        player_log["target_game_session"] = game_session
        features = process_log(player_log)
        features["installation_id"] = installation_id
        features["accuracy_group"] = accuracy_group
        result.append(features)
    return pd.DataFrame(data=result).fillna(-1)


def calculate_ratios(df):
    """Sum correct and wrong moves and compute the share of correct ones.

    Args:
        df: DataFrame with ``correct_move`` and ``wrong_move`` columns.

    Returns:
        ``(n_correct, n_incorrect, ratio)``. For an empty frame this is
        ``(0, 0, None)``. When the frame has rows but no moves at all, the
        ratio is NaN.
    """
    if len(df) == 0:
        return 0, 0, None

    n_correct = df.correct_move.sum()
    n_incorrect = df.wrong_move.sum()
    total = n_correct + n_incorrect
    # 0/0 is undefined: NumPy integers would yield NaN with a RuntimeWarning and
    # plain Python ints would raise ZeroDivisionError, so return NaN explicitly.
    ratio = n_correct / total if total else float("nan")
    return n_correct, n_incorrect, ratio


def make_move_stats(assessment, df, title):
    """Label the output of ``calculate_ratios(df)`` with title-specific keys.

    ``assessment`` is accepted for signature compatibility with the other
    feature functions and is not used.

    Returns:
        Dict with the keys ``"n_correct <title>"``, ``"n_incorrect <title>"``
        and ``"global_ratio <title>"``, in that order.
    """
    labels = ("n_correct " + title, "n_incorrect " + title, "global_ratio " + title)
    return dict(zip(labels, calculate_ratios(df)))


def shrink_session(group):
    """Collapse the events of one game session into a single summary record.

    Args:
        group: DataFrame holding the events of one session, with ``timestamp``,
            ``title``, ``installation_id`` and ``game_session`` columns.
            ``populate_correct_moves`` is applied to it first, which adds the
            ``correct_move`` / ``wrong_move`` columns in place.

    Returns:
        Dict with the session's start and end time, duration, number of
        correct and wrong moves, and the title, installation id and session
        id taken from the first row.
    """
    group = populate_correct_moves(group)
    start_time = group.timestamp.min()
    end_time = group.timestamp.max()
    # Key order is kept as-is: it becomes the column order of the session table.
    return {
        "start_time": start_time,
        "end_time": end_time,
        "duration": end_time - start_time,
        "correct_move": group.correct_move.sum(),
        "wrong_move": group.wrong_move.sum(),
        "title": group.title.iloc[0],
        "installation_id": group.installation_id.iloc[0],
        "game_session": group.game_session.iloc[0],
    }


def populate_correct_moves(history: pd.DataFrame) -> pd.DataFrame:
    """Add boolean ``correct_move`` and ``wrong_move`` columns to ``history``.

    When a ``correct`` column exists, a row counts as a correct move if the
    value equals True and as a wrong move if it equals False; any other value
    (for example a missing one) counts as neither. Without a ``correct``
    column both new columns are False everywhere.

    The frame is modified in place and also returned.
    """
    has_outcome = "correct" in history.columns
    # The explicit comparisons are deliberate: values that are neither True nor
    # False must end up False in both columns.
    history["correct_move"] = (history.correct == True) if has_outcome else False
    history["wrong_move"] = (history.correct == False) if has_outcome else False
    return history

    
def make_base_time_features(assessment, history):
    """Compute overall play-time features from the per-session table.

    Args:
        assessment: Object with a ``timestamp`` attribute (the target event).
        history: DataFrame with ``duration``, ``start_time`` and
            ``game_session`` columns, one row per session.

    Returns:
        Dict with ``mean_session_time_minutes``, ``games_per_day`` and
        ``mean_minutes_between_games``.
    """
    one_minute = np.timedelta64(1, "m")

    # Per-move timing features (time / turns between and before correct moves)
    # are intentionally not produced here.
    session_minutes = history.duration / one_minute
    features = {"mean_session_time_minutes": round(session_minutes.mean(), 0)}

    # Days between the earliest session start and the assessment, counting the
    # first day as day one.
    active_span = assessment.timestamp - history.start_time.min()
    days_active = round(active_span / np.timedelta64(1, "D"), 0) + 1
    features["games_per_day"] = round(history.game_session.nunique() / days_active, 2)

    # Gap between consecutive session starts, in whole minutes.
    starts = history.start_time
    gap_minutes = ((starts - starts.shift(1)).dropna() / one_minute).round(0)
    features["mean_minutes_between_games"] = round(gap_minutes.mean(), 2)
    return features


def make_game_time_features(assessment, history, title):
    """Compute play-time and recent-accuracy features for one game's sessions.

    Args:
        assessment: Object with a ``timestamp`` attribute.
        history: Per-session DataFrame with ``start_time``, ``duration``,
            ``game_session``, ``correct_move`` and ``wrong_move`` columns.
        title: Unused; kept so all per-game feature functions share a signature.

    Returns:
        Dict with ``hours_played``, ``games_played`` and, for the windows
        ``1d`` and ``7d`` before the assessment, ``n_correct_<w>``,
        ``n_incorrect_<w>`` and ``game_ratio_<w>``.
    """
    features = {
        "hours_played": round(history.duration.sum() / np.timedelta64(1, "h"), 0),
        "games_played": history.game_session.nunique(),
    }
    for suffix, n_days in (("1d", 1), ("7d", 7)):
        # Sessions that started within the last `n_days` days before the assessment.
        window_start = assessment.timestamp - pd.Timedelta(n_days, 'D')
        recent = history[history.start_time >= window_start]
        n_correct, n_incorrect, ratio = calculate_ratios(recent)
        features["n_correct_" + suffix] = n_correct
        features["n_incorrect_" + suffix] = n_incorrect
        features["game_ratio_" + suffix] = ratio
    return features

    
def apply_on_sessions(assessment, history, n_lags, func_list, title):
    """Apply feature functions to each of the last ``n_lags`` sessions.

    The last ``n_lags`` distinct ``game_session`` values of ``history`` are
    taken in their order of appearance; slot ``i`` uses the ``i``-th of them.
    Slots without a session receive an empty frame with the same columns.
    Every key returned by a feature function is suffixed with ``" <i>"``.

    Args:
        assessment: Passed through to each feature function.
        history: DataFrame with a ``game_session`` column.
        n_lags: Number of slots; a falsy value yields an empty dict.
        func_list: Callables invoked as ``f(assessment, session_rows, title)``
            and returning a dict.
        title: Passed through to each feature function.

    Returns:
        Dict mapping ``"<key> <i>"`` to the corresponding feature value.
    """
    collected = {}
    no_rows = history.head(0)
    if not n_lags:
        return collected

    recent_sessions = history.game_session.unique()[-n_lags:]
    for lag in range(n_lags):
        for extractor in func_list:
            if lag < len(recent_sessions):
                session_rows = history[history.game_session == recent_sessions[lag]]
            else:
                session_rows = no_rows
            for key, value in extractor(assessment, session_rows, title).items():
                collected["%s %d" % (key, lag)] = value
    return collected


def make_calendar_features(assessment, history):
    """Return month, day of week and hour of the assessment's timestamp.

    ``history`` is not used; the parameter exists so that all base feature
    functions can be called the same way.
    """
    moment = assessment.timestamp
    return {
        "month": moment.month,
        "dayofweek": moment.dayofweek,
        "hour": moment.time().hour,
    }


def make_base_features(assessment, history):
    """Encode the assessment's title as its position in the ``games`` list.

    ``history`` is not used. A title that is not in ``games`` raises
    ``ValueError`` (from ``list.index``).
    """
    title_code = games.index(assessment.title)
    return {"title": title_code}

# Applied in order to the unwrapped event frame before sessions are summarised.
base_stats = [
    populate_correct_moves,
]
# Called as f(assessment, history); results are merged in list order.
base_features = [
    make_base_features,
    make_calendar_features,
    make_base_time_features,
]
# Called as f(assessment, stats, game) on the sessions of a single game.
game_features = [
    make_move_stats,
    make_game_time_features,
]
# Called per lagged session by `apply_on_sessions`.
lag_features = [
    make_move_stats,
]



def get_base_features(assessment, history):
    """Merge the dicts returned by every function in ``base_features``.

    Functions run in list order; when two of them return the same key, the
    later value wins.
    """
    return {
        key: value
        for extractor in base_features
        for key, value in extractor(assessment, history).items()
    }


def get_game_features(assessment, stats, game):
    """Run every function in ``game_features`` and prefix the keys with the game.

    Returns:
        Dict whose keys have the form ``"<game> <feature key>"``.
    """
    prefixed = {}
    for extractor in game_features:
        for key, value in extractor(assessment, stats, game).items():
            prefixed[game + " " + key] = value
    return prefixed


def get_lag_features(assessment, stats, game, n_lags):
    """Compute lagged-session features for one game, with game-prefixed keys.

    Delegates to ``apply_on_sessions`` with the ``lag_features`` functions and
    prepends ``"<game> "`` to every returned key.
    """
    lagged = apply_on_sessions(assessment, stats, n_lags, lag_features, game)
    return {game + " " + key: value for key, value in lagged.items()}


def process_log(df):
    """Turn one event log into a flat feature dict.

    The last row of ``df`` is treated as the assessment to describe; all
    earlier rows of type ``"Game"`` or ``"Assessment"`` form the history. The
    history is unwrapped, passed through ``base_stats``, summarised per
    ``game_session`` with ``shrink_session`` and then fed to the base,
    per-game and lag feature builders (two lags per game).

    Args:
        df: Event log with at least ``type``, ``event_data``, ``timestamp``,
            ``game_session`` and ``title`` columns.

    Returns:
        Dict of features, or an empty dict when there is no usable history.
    """
    assessment = df.iloc[-1]
    earlier_events = df.iloc[:-1]
    earlier_events = earlier_events[earlier_events.type.isin(["Game", "Assessment"])]
    if len(earlier_events) == 0:
        return {}

    events = unwrap_event_data(earlier_events).sort_values("timestamp")
    for enrich in base_stats:
        events = enrich(events)

    # One summary dict per session, expanded into a table ordered by start time.
    session_records = events.groupby("game_session").apply(shrink_session)
    sessions = json_normalize(session_records).sort_values("start_time")

    features = dict(get_base_features(assessment, sessions))
    for game in games:
        game_sessions = sessions[sessions.title == game]
        features.update(get_game_features(assessment, game_sessions, game))
        features.update(get_lag_features(assessment, game_sessions, game, 2))
    return features

In [4]:
def process_test_installations(test):
    """Build one feature row per installation of the test event log.

    Events are sorted by timestamp, grouped by ``installation_id`` and each
    group is passed to ``process_log`` (with a tqdm progress bar, which
    requires ``tqdm.pandas()`` to have been called).

    Args:
        test: Event log with ``timestamp`` and ``installation_id`` columns plus
            whatever ``process_log`` needs. It is not modified.

    Returns:
        DataFrame with the features of every installation and an
        ``installation_id`` column; missing values are filled with -1.
    """
    # Derive a sorted frame instead of writing the parsed timestamps back into
    # the caller's DataFrame, so re-running the cell starts from the same input.
    events = test.assign(timestamp=pd.to_datetime(test.timestamp)).sort_values("timestamp")

    per_installation = events.groupby("installation_id").progress_apply(process_log).reset_index()
    per_installation.columns = ["installation_id", "features"]

    records = []
    for _, installation_id, features in per_installation.itertuples():
        # The id is added after the computed features, preserving their key order.
        features["installation_id"] = installation_id
        records.append(features)
    return pd.DataFrame(records).fillna(-1)

In [5]:
# Build the feature table for the loaded test log (one process_log call per installation).
test_features = process_test_installations(test)

 23%|██▎       | 232/1000 [00:44<02:16,  5.61it/s]

KeyboardInterrupt: 

In [None]:
def make_submission(test_features, model):
    """Predict an accuracy group for every row of ``test_features``.

    Args:
        test_features: DataFrame with an ``installation_id`` column; all other
            columns are passed to the model.
        model: Object whose ``predict`` returns a 2-D array of per-class
            scores, one row per input row.

    Returns:
        DataFrame with ``installation_id`` and ``accuracy_group`` columns,
        where the group is the index of the highest score in each row.
    """
    feature_matrix = test_features.drop(columns="installation_id")
    class_scores = model.predict(feature_matrix)
    return pd.DataFrame(data={
        "installation_id": test_features.installation_id.values,
        "accuracy_group": class_scores.argmax(axis=1).astype(int),
    })

# Turn the feature table into per-installation accuracy-group predictions.
submission = make_submission(test_features, model)

In [None]:
# Write the predictions to disk; index=False keeps the row index out of the CSV.
submission.to_csv("../data/submissions/time_baseline.csv", index=False)