In [1]:
import pandas as pd
import json
import lightgbm as lgb
from tqdm import tqdm
tqdm.pandas()


# Raw event log to run inference on, read from a path relative to the notebook.
test = pd.read_csv('../../data/raw/test.csv')
# Previously saved LightGBM booster, loaded from its model file.
model = lgb.Booster(model_file='../../models/regression_baseline__eventid_bag.lgb')
# NOTE(review): three hard-coded floats; presumably the cut points that turn the
# regression output into ordinal classes. They are not used in the visible part
# of the notebook, so confirm against the training code.
coef=[1.0539541002386488, 1.6792994887656274, 2.247341408420393]

In [2]:
# Titles for which process_log builds per-game features. A title's position in
# this list is also the integer stored in the "title" feature (games.index),
# so the order must not be changed.
games = ['Scrub-A-Dub', 'All Star Sorting', 'Mushroom Sorter (Assessment)',
       'Air Show', 'Crystals Rule', 'Bird Measurer (Assessment)',
       'Dino Drink', 'Bubble Bath', 'Dino Dive', 'Chow Time',
       'Cauldron Filler (Assessment)', 'Pan Balance', 'Happy Camel',
       'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)',
       'Leaf Leader']
# World names; the position in this list is the integer stored in the "world"
# feature (worlds.index), so the order must not be changed either.
worlds = ['NONE', 'MAGMAPEAK', 'TREETOPCITY', 'CRYSTALCAVES']


def unwrap_event_data(df):
    """Expand the JSON strings in ``df.event_data`` into columns next to ``df``.

    Each ``event_data`` value is parsed with ``json.loads`` and the parsed
    records become one DataFrame (one column per JSON key). That frame and
    ``df`` are both given a fresh positional index via ``reset_index()`` and
    are then joined side by side, parsed columns first.

    Note that both halves keep their former index as a column named "index",
    so the result has two "index" columns; any JSON key that matches a column
    of ``df`` is duplicated in the same way.
    """
    parsed_records = [json.loads(raw_event) for raw_event in df.event_data]
    expanded = pd.DataFrame(data=parsed_records)
    left_part = expanded.reset_index()
    right_part = df.reset_index()
    return pd.concat([left_part, right_part], axis=1)


def make_counters(df, column):
    """Count how many rows of ``df`` carry each distinct value of ``column``.

    Returns a dict ``{value: number_of_rows}``; an empty frame gives ``{}``.
    """
    rows_by_value = df.groupby(column)
    occurrences = rows_by_value[column].count()
    return occurrences.to_dict()

    
def process_log(df):
    """Build the feature dict for one installation's event log.

    The last row of ``df`` is treated as the assessment being predicted: its
    title and world are encoded as their positions in the module-level
    ``games`` and ``worlds`` lists. Every earlier row whose type is "Game" or
    "Assessment" forms the history used for the per-game statistics, while the
    top-level counters (``n_activities``, ``n_actions``, ...) are computed on
    all of ``df``, last row included.

    Args:
        df: Events of a single installation, in the order given by the caller
            (``process_test_installations`` sorts by timestamp). The columns
            ``title``, ``world``, ``type``, ``game_session``, ``event_code``,
            ``event_id`` and ``event_data`` are read.

    Returns:
        dict mapping feature name to value. The keys added last are the raw
        ``event_id`` / ``event_code`` values counted over the whole history;
        all other keys are strings.

    Raises:
        ValueError: if the last row's title or world is not present in
            ``games`` / ``worlds``.
    """
    assessment_title = df.title.iloc[-1]
    world = df.world.iloc[-1]

    # History = everything before the final row, restricted to games/assessments.
    history = df.iloc[:-1]
    history = history[history.type.isin(["Game", "Assessment"])].copy()

    def calculate_ratios(moves):
        """Return (n_correct, n_incorrect, n_correct / (n_correct + n_incorrect))."""
        n_correct = moves.correct_move.sum()
        n_incorrect = moves.wrong_move.sum()
        total = n_correct + n_incorrect
        # Without graded moves the ratio is undefined. Return NaN explicitly
        # instead of evaluating 0/0, which only yields NaN (with a
        # RuntimeWarning) for NumPy integers and raises ZeroDivisionError for
        # plain Python ints.
        ratio = n_correct / total if total else float("nan")
        return n_correct, n_incorrect, ratio

    def make_move_stats(df, title, n_lags=2):
        """Correct/incorrect move statistics for one title, overall and per recent session."""
        df = df.copy()
        if len(df):
            df = unwrap_event_data(df)
        if "correct" in df.columns:
            # Equality (not identity) on purpose: rows whose event_data has no
            # "correct" value compare unequal to both True and False.
            df["correct_move"] = df.correct == True
            df["wrong_move"] = df.correct == False
        else:
            df["correct_move"] = False
            df["wrong_move"] = False

        stats = dict(zip([f"n_correct_{title}", f"n_incorrect_{title}", f"global_ratio_{title}"],
                         calculate_ratios(df)))

        if n_lags:
            # Up to n_lags most recent sessions in order of first appearance;
            # slot i reads last_sessions[i], so slot 0 is the older session
            # whenever two are available.
            last_sessions = df.game_session.unique()[-n_lags:]
            for i in range(n_lags):
                if i < len(last_sessions):
                    # NOTE: the space in "n_incorrect_{title} {i}" is intentional
                    # to keep. The useful_features list of this notebook contains
                    # exactly these names, so renaming the key would change the
                    # columns produced for the saved model.
                    keys = [f"n_correct_{title}_{i}", f"n_incorrect_{title} {i}", f"ratio_{title}_{i}"]
                    values = calculate_ratios(df[df.game_session == last_sessions[i]])
                else:
                    # Not enough sessions: placeholders (underscore-separated key).
                    keys = [f"n_correct_{title}_{i}", f"n_incorrect_{title}_{i}", f"ratio_{title}_{i}"]
                    values = [None, None, None]
                stats.update(zip(keys, values))
        return stats

    result = {"title": games.index(assessment_title),
              "world": worlds.index(world),
              "n_activities": df[df.type == "Activity"].game_session.nunique(),
              "n_games": df[df.type == "Game"].game_session.nunique(),
              "event_code_count": df.event_code.nunique(),
              "event_id_count": df.event_id.nunique(),
              "title_count": df.title.nunique(),
              "session_id_count": df.game_session.nunique(),
              "n_actions": len(df),
              "world_title_count": df[df.world == world].title.nunique(),
              }

    def add_game_features(events, prefix):
        """Add the per-game features of ``events`` to ``result``, keys starting with ``prefix``."""
        for game in games:
            stats = events[events.title == game]
            stats_features = {f"{prefix}{k}": v for k, v in make_move_stats(stats, game).items()}
            stats_features[f"{prefix}{game}_event_code_count"] = stats.event_code.nunique()
            stats_features[f"{prefix}{game}_event_id_count"] = stats.event_id.nunique()
            stats_features[f"{prefix}{game}_session_id_count"] = stats.game_session.nunique()
            stats_features[f"{prefix}{game}_n_actions"] = len(stats)
            result.update(stats_features)
            # One counter per distinct event_id / event_code seen for this game.
            result.update({f"{prefix}{game}_{k}": v for k, v in make_counters(stats, "event_id").items()})
            result.update({f"{prefix}{game}_{k}": v for k, v in make_counters(stats, "event_code").items()})

    # First over the whole history, then again restricted to the assessment's
    # world (keys prefixed with "world_"). The order is kept because it fixes
    # the insertion order of the keys in ``result``.
    add_game_features(history, "")
    add_game_features(history[history.world == world], "world_")

    # History-wide counters keyed by the raw event_id / event_code values.
    result.update(make_counters(history, "event_id"))
    result.update(make_counters(history, "event_code"))
    return result


def process_test_installations(test):
    """Turn the raw event log into one feature row per installation.

    The events are sorted by ``timestamp`` and grouped by ``installation_id``;
    every group is passed to ``process_log`` through tqdm's ``progress_apply``
    (available because the notebook calls ``tqdm.pandas()``). Each resulting
    feature dict gets its ``installation_id`` added, the dicts are stacked
    into a DataFrame and every missing value is replaced with -1.
    """
    ordered_events = test.sort_values("timestamp")
    per_installation = (
        ordered_events
        .groupby("installation_id")
        .progress_apply(process_log)
        .reset_index()
    )
    per_installation.columns = ["installation_id", "features"]

    rows = []
    id_feature_pairs = zip(per_installation["installation_id"], per_installation["features"])
    for installation_id, features in id_feature_pairs:
        features["installation_id"] = installation_id
        rows.append(features)
    return pd.DataFrame(rows).fillna(-1)

# Build the feature table for the test log, one row per installation_id.
# The progress bar recorded below shows 1000 groups processed in about 3m38s.
test_features=process_test_installations(test)

100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s]


In [3]:
useful_features=['title',
 'world',
 'n_activities',
 'n_games',
 'event_code_count',
 'event_id_count',
 'title_count',
 'session_id_count',
 'n_actions',
 'world_title_count',
 'n_correct_Scrub-A-Dub',
 'n_incorrect_Scrub-A-Dub',
 'global_ratio_Scrub-A-Dub',
 'n_correct_Scrub-A-Dub_0',
 'n_incorrect_Scrub-A-Dub 0',
 'ratio_Scrub-A-Dub_0',
 'n_correct_Scrub-A-Dub_1',
 'n_incorrect_Scrub-A-Dub_1',
 'ratio_Scrub-A-Dub_1',
 'Scrub-A-Dub_event_code_count',
 'Scrub-A-Dub_session_id_count',
 'Scrub-A-Dub_n_actions',
 'Scrub-A-Dub_4a09ace1',
 'Scrub-A-Dub_5a848010',
 'Scrub-A-Dub_5c3d2b2f',
 'Scrub-A-Dub_6d90d394',
 'Scrub-A-Dub_7040c096',
 'Scrub-A-Dub_c1cac9a2',
 'Scrub-A-Dub_cf82af56',
 'Scrub-A-Dub_f71c4741',
 'n_correct_All Star Sorting',
 'n_incorrect_All Star Sorting',
 'global_ratio_All Star Sorting',
 'n_correct_All Star Sorting_0',
 'n_incorrect_All Star Sorting 0',
 'ratio_All Star Sorting_0',
 'n_correct_All Star Sorting_1',
 'n_incorrect_All Star Sorting 1',
 'ratio_All Star Sorting_1',
 'All Star Sorting_event_code_count',
 'All Star Sorting_session_id_count',
 'All Star Sorting_n_actions',
 'All Star Sorting_1cc7cfca',
 'All Star Sorting_2c4e6db0',
 'All Star Sorting_2dc29e21',
 'All Star Sorting_363d3849',
 'All Star Sorting_4b5efe37',
 'All Star Sorting_587b5989',
 'All Star Sorting_6043a2b4',
 'All Star Sorting_b120f2ac',
 'All Star Sorting_d02b7a8e',
 'n_correct_Mushroom Sorter (Assessment)',
 'n_incorrect_Mushroom Sorter (Assessment)',
 'global_ratio_Mushroom Sorter (Assessment)',
 'n_correct_Mushroom Sorter (Assessment)_0',
 'n_incorrect_Mushroom Sorter (Assessment) 0',
 'ratio_Mushroom Sorter (Assessment)_0',
 'n_correct_Mushroom Sorter (Assessment)_1',
 'n_incorrect_Mushroom Sorter (Assessment)_1',
 'ratio_Mushroom Sorter (Assessment)_1',
 'Mushroom Sorter (Assessment)_event_code_count',
 'Mushroom Sorter (Assessment)_session_id_count',
 'Mushroom Sorter (Assessment)_n_actions',
 'Mushroom Sorter (Assessment)_25fa8af4',
 'Mushroom Sorter (Assessment)_28ed704e',
 'Mushroom Sorter (Assessment)_5f0eb72c',
 'Mushroom Sorter (Assessment)_6c930e6e',
 'Mushroom Sorter (Assessment)_7da34a02',
 'Mushroom Sorter (Assessment)_a1e4395d',
 'Mushroom Sorter (Assessment)_a5be6304',
 'Mushroom Sorter (Assessment)_c7128948',
 'Mushroom Sorter (Assessment)_fbaf3456',
 'n_correct_Air Show',
 'n_incorrect_Air Show',
 'global_ratio_Air Show',
 'n_correct_Air Show_0',
 'n_incorrect_Air Show 0',
 'ratio_Air Show_0',
 'n_correct_Air Show_1',
 'n_incorrect_Air Show_1',
 'ratio_Air Show_1',
 'Air Show_event_code_count',
 'Air Show_session_id_count',
 'Air Show_n_actions',
 'Air Show_06372577',
 'Air Show_14de4c5d',
 'Air Show_1575e76c',
 'Air Show_15ba1109',
 'Air Show_28f975ea',
 'Air Show_58a0de5c',
 'Air Show_65abac75',
 'Air Show_7423acbc',
 'Air Show_a1bbe385',
 'Air Show_bcceccc6',
 'Air Show_d88ca108',
 'Air Show_dcb55a27',
 'n_correct_Crystals Rule',
 'n_incorrect_Crystals Rule',
 'global_ratio_Crystals Rule',
 'n_correct_Crystals Rule_0',
 'n_incorrect_Crystals Rule 0',
 'ratio_Crystals Rule_0',
 'n_correct_Crystals Rule_1',
 'n_incorrect_Crystals Rule_1',
 'ratio_Crystals Rule_1',
 'Crystals Rule_event_code_count',
 'Crystals Rule_session_id_count',
 'Crystals Rule_n_actions',
 'Crystals Rule_44cb4907',
 'Crystals Rule_48349b14',
 'Crystals Rule_5e3ea25a',
 'Crystals Rule_86c924c4',
 'Crystals Rule_cc5087a3',
 'n_correct_Bird Measurer (Assessment)',
 'n_incorrect_Bird Measurer (Assessment)',
 'global_ratio_Bird Measurer (Assessment)',
 'n_correct_Bird Measurer (Assessment)_0',
 'n_incorrect_Bird Measurer (Assessment) 0',
 'ratio_Bird Measurer (Assessment)_0',
 'n_correct_Bird Measurer (Assessment)_1',
 'n_incorrect_Bird Measurer (Assessment)_1',
 'ratio_Bird Measurer (Assessment)_1',
 'Bird Measurer (Assessment)_event_code_count',
 'Bird Measurer (Assessment)_session_id_count',
 'Bird Measurer (Assessment)_n_actions',
 'Bird Measurer (Assessment)_1375ccb7',
 'Bird Measurer (Assessment)_17113b36',
 'Bird Measurer (Assessment)_4a4c3d21',
 'Bird Measurer (Assessment)_51102b85',
 'Bird Measurer (Assessment)_a16a373e',
 'Bird Measurer (Assessment)_ec138c1c',
 'Bird Measurer (Assessment)_f56e0afc',
 'n_correct_Dino Drink',
 'n_incorrect_Dino Drink',
 'global_ratio_Dino Drink',
 'n_correct_Dino Drink_0',
 'n_incorrect_Dino Drink_0',
 'ratio_Dino Drink_0',
 'n_correct_Dino Drink_1',
 'n_incorrect_Dino Drink_1',
 'ratio_Dino Drink_1',
 'Dino Drink_event_code_count',
 'Dino Drink_session_id_count',
 'Dino Drink_n_actions',
 'n_correct_Bubble Bath',
 'n_incorrect_Bubble Bath',
 'global_ratio_Bubble Bath',
 'n_correct_Bubble Bath_0',
 'n_incorrect_Bubble Bath_0',
 'ratio_Bubble Bath_0',
 'n_correct_Bubble Bath_1',
 'n_incorrect_Bubble Bath_1',
 'ratio_Bubble Bath_1',
 'Bubble Bath_event_code_count',
 'Bubble Bath_session_id_count',
 'Bubble Bath_n_actions',
 'n_correct_Dino Dive',
 'n_incorrect_Dino Dive',
 'global_ratio_Dino Dive',
 'n_correct_Dino Dive_0',
 'n_incorrect_Dino Dive_0',
 'ratio_Dino Dive_0',
 'n_correct_Dino Dive_1',
 'n_incorrect_Dino Dive_1',
 'ratio_Dino Dive_1',
 'Dino Dive_event_code_count',
 'Dino Dive_session_id_count',
 'Dino Dive_n_actions',
 'n_correct_Chow Time',
 'n_incorrect_Chow Time',
 'global_ratio_Chow Time',
 'n_correct_Chow Time_0',
 'n_incorrect_Chow Time_0',
 'ratio_Chow Time_0',
 'n_correct_Chow Time_1',
 'n_incorrect_Chow Time_1',
 'ratio_Chow Time_1',
 'Chow Time_event_code_count',
 'Chow Time_session_id_count',
 'Chow Time_n_actions',
 'n_correct_Cauldron Filler (Assessment)',
 'n_incorrect_Cauldron Filler (Assessment)',
 'global_ratio_Cauldron Filler (Assessment)',
 'n_correct_Cauldron Filler (Assessment)_0',
 'n_incorrect_Cauldron Filler (Assessment)_0',
 'ratio_Cauldron Filler (Assessment)_0',
 'n_correct_Cauldron Filler (Assessment)_1',
 'n_incorrect_Cauldron Filler (Assessment)_1',
 'ratio_Cauldron Filler (Assessment)_1',
 'Cauldron Filler (Assessment)_event_code_count',
 'Cauldron Filler (Assessment)_session_id_count',
 'Cauldron Filler (Assessment)_n_actions',
 'n_correct_Pan Balance',
 'n_incorrect_Pan Balance',
 'global_ratio_Pan Balance',
 'n_correct_Pan Balance_0',
 'n_incorrect_Pan Balance_0',
 'ratio_Pan Balance_0',
 'n_correct_Pan Balance_1',
 'n_incorrect_Pan Balance_1',
 'ratio_Pan Balance_1',
 'Pan Balance_event_code_count',
 'Pan Balance_session_id_count',
 'Pan Balance_n_actions',
 'n_correct_Happy Camel',
 'n_incorrect_Happy Camel',
 'global_ratio_Happy Camel',
 'n_correct_Happy Camel_0',
 'n_incorrect_Happy Camel_0',
 'ratio_Happy Camel_0',
 'n_correct_Happy Camel_1',
 'n_incorrect_Happy Camel_1',
 'ratio_Happy Camel_1',
 'Happy Camel_event_code_count',
 'Happy Camel_session_id_count',
 'Happy Camel_n_actions',
 'n_correct_Cart Balancer (Assessment)',
 'n_incorrect_Cart Balancer (Assessment)',
 'global_ratio_Cart Balancer (Assessment)',
 'n_correct_Cart Balancer (Assessment)_0',
 'n_incorrect_Cart Balancer (Assessment)_0',
 'ratio_Cart Balancer (Assessment)_0',
 'n_correct_Cart Balancer (Assessment)_1',
 'n_incorrect_Cart Balancer (Assessment)_1',
 'ratio_Cart Balancer (Assessment)_1',
 'Cart Balancer (Assessment)_event_code_count',
 'Cart Balancer (Assessment)_session_id_count',
 'Cart Balancer (Assessment)_n_actions',
 'n_correct_Chest Sorter (Assessment)',
 'n_incorrect_Chest Sorter (Assessment)',
 'global_ratio_Chest Sorter (Assessment)',
 'n_correct_Chest Sorter (Assessment)_0',
 'n_incorrect_Chest Sorter (Assessment)_0',
 'ratio_Chest Sorter (Assessment)_0',
 'n_correct_Chest Sorter (Assessment)_1',
 'n_incorrect_Chest Sorter (Assessment)_1',
 'ratio_Chest Sorter (Assessment)_1',
 'Chest Sorter (Assessment)_event_code_count',
 'Chest Sorter (Assessment)_session_id_count',
 'Chest Sorter (Assessment)_n_actions',
 'n_correct_Leaf Leader',
 'n_incorrect_Leaf Leader',
 'global_ratio_Leaf Leader',
 'n_correct_Leaf Leader_0',
 'n_incorrect_Leaf Leader_0',
 'ratio_Leaf Leader_0',
 'n_correct_Leaf Leader_1',
 'n_incorrect_Leaf Leader_1',
 'ratio_Leaf Leader_1',
 'Leaf Leader_event_code_count',
 'Leaf Leader_session_id_count',
 'Leaf Leader_n_actions',
 'world_n_correct_Scrub-A-Dub',
 'world_n_incorrect_Scrub-A-Dub',
 'world_global_ratio_Scrub-A-Dub',
 'world_n_correct_Scrub-A-Dub_0',
 'world_n_incorrect_Scrub-A-Dub_0',
 'world_ratio_Scrub-A-Dub_0',
 'world_n_correct_Scrub-A-Dub_1',
 'world_n_incorrect_Scrub-A-Dub_1',
 'world_ratio_Scrub-A-Dub_1',
 'world_Scrub-A-Dub_event_code_count',
 'world_Scrub-A-Dub_session_id_count',
 'world_Scrub-A-Dub_n_actions',
 'world_n_correct_All Star Sorting',
 'world_n_incorrect_All Star Sorting',
 'world_global_ratio_All Star Sorting',
 'world_n_correct_All Star Sorting_0',
 'world_n_incorrect_All Star Sorting 0',
 'world_ratio_All Star Sorting_0',
 'world_n_correct_All Star Sorting_1',
 'world_n_incorrect_All Star Sorting 1',
 'world_ratio_All Star Sorting_1',
 'world_All Star Sorting_event_code_count',
 'world_All Star Sorting_session_id_count',
 'world_All Star Sorting_n_actions',
 'world_All Star Sorting_1cc7cfca',
 'world_All Star Sorting_2c4e6db0',
 'world_All Star Sorting_2dc29e21',
 'world_All Star Sorting_363d3849',
 'world_All Star Sorting_4b5efe37',
 'world_All Star Sorting_587b5989',
 'world_All Star Sorting_6043a2b4',
 'world_All Star Sorting_b120f2ac',
 'world_All Star Sorting_d02b7a8e',
 'world_n_correct_Mushroom Sorter (Assessment)',
 'world_n_incorrect_Mushroom Sorter (Assessment)',
 'world_global_ratio_Mushroom Sorter (Assessment)',
 'world_n_correct_Mushroom Sorter (Assessment)_0',
 'world_n_incorrect_Mushroom Sorter (Assessment) 0',
 'world_ratio_Mushroom Sorter (Assessment)_0',
 'world_n_correct_Mushroom Sorter (Assessment)_1',
 'world_n_incorrect_Mushroom Sorter (Assessment)_1',
 'world_ratio_Mushroom Sorter (Assessment)_1',
 'world_Mushroom Sorter (Assessment)_event_code_count',
 'world_Mushroom Sorter (Assessment)_session_id_count',
 'world_Mushroom Sorter (Assessment)_n_actions',
 'world_Mushroom Sorter (Assessment)_25fa8af4',
 'world_Mushroom Sorter (Assessment)_28ed704e',
 'world_Mushroom Sorter (Assessment)_5f0eb72c',
 'world_Mushroom Sorter (Assessment)_6c930e6e',
 'world_Mushroom Sorter (Assessment)_7da34a02',
 'world_Mushroom Sorter (Assessment)_a1e4395d',
 'world_Mushroom Sorter (Assessment)_c7128948',
 'world_Mushroom Sorter (Assessment)_fbaf3456',
 'world_n_correct_Air Show',
 'world_n_incorrect_Air Show',
 'world_global_ratio_Air Show',
 'world_n_correct_Air Show_0',
 'world_n_incorrect_Air Show 0',
 'world_ratio_Air Show_0',
 'world_n_correct_Air Show_1',
 'world_n_incorrect_Air Show_1',
 'world_ratio_Air Show_1',
 'world_Air Show_event_code_count',
 'world_Air Show_session_id_count',
 'world_Air Show_n_actions',
 'world_Air Show_06372577',
 'world_Air Show_14de4c5d',
 'world_Air Show_1575e76c',
 'world_Air Show_15ba1109',
 'world_Air Show_28f975ea',
 'world_Air Show_58a0de5c',
 'world_Air Show_65abac75',
 'world_Air Show_7423acbc',
 'world_Air Show_a1bbe385',
 'world_Air Show_bcceccc6',
 'world_Air Show_d88ca108',
 'world_Air Show_dcb55a27',
 'world_n_correct_Crystals Rule',
 'world_n_incorrect_Crystals Rule',
 'world_global_ratio_Crystals Rule',
 'world_n_correct_Crystals Rule_0',
 'world_n_incorrect_Crystals Rule 0',
 'world_ratio_Crystals Rule_0',
 'world_n_correct_Crystals Rule_1',
 'world_n_incorrect_Crystals Rule_1',
 'world_ratio_Crystals Rule_1',
 'world_Crystals Rule_event_code_count',
 'world_Crystals Rule_session_id_count',
 'world_Crystals Rule_n_actions',
 'world_Crystals Rule_44cb4907',
 'world_Crystals Rule_48349b14',
 'world_Crystals Rule_5e3ea25a',
 'world_Crystals Rule_86c924c4',
 'world_Crystals Rule_cc5087a3',
 'world_n_correct_Bird Measurer (Assessment)',
 'world_n_incorrect_Bird Measurer (Assessment)',
 'world_global_ratio_Bird Measurer (Assessment)',
 'world_n_correct_Bird Measurer (Assessment)_0',
 'world_n_incorrect_Bird Measurer (Assessment) 0',
 'world_ratio_Bird Measurer (Assessment)_0',
 'world_n_correct_Bird Measurer (Assessment)_1',
 'world_n_incorrect_Bird Measurer (Assessment)_1',
 'world_ratio_Bird Measurer (Assessment)_1',
 'world_Bird Measurer (Assessment)_event_code_count',
 'world_Bird Measurer (Assessment)_session_id_count',
 'world_Bird Measurer (Assessment)_n_actions',
 'world_Bird Measurer (Assessment)_1375ccb7',
 'world_Bird Measurer (Assessment)_17113b36',
 'world_Bird Measurer (Assessment)_4a4c3d21',
 'world_Bird Measurer (Assessment)_51102b85',
 'world_Bird Measurer (Assessment)_a16a373e',
 'world_Bird Measurer (Assessment)_ec138c1c',
 'world_Bird Measurer (Assessment)_f56e0afc',
 'world_n_correct_Dino Drink',
 'world_n_incorrect_Dino Drink',
 'world_global_ratio_Dino Drink',
 'world_n_correct_Dino Drink_0',
 'world_n_incorrect_Dino Drink_0',
 'world_ratio_Dino Drink_0',
 'world_n_correct_Dino Drink_1',
 'world_n_incorrect_Dino Drink_1',
 'world_ratio_Dino Drink_1',
 'world_Dino Drink_event_code_count',
 'world_Dino Drink_session_id_count',
 'world_Dino Drink_n_actions',
 'world_n_correct_Bubble Bath',
 'world_n_incorrect_Bubble Bath',
 'world_global_ratio_Bubble Bath',
 'world_n_correct_Bubble Bath_0',
 'world_n_incorrect_Bubble Bath_0',
 'world_ratio_Bubble Bath_0',
 'world_n_correct_Bubble Bath_1',
 'world_n_incorrect_Bubble Bath_1',
 'world_ratio_Bubble Bath_1',
 'world_Bubble Bath_event_code_count',
 'world_Bubble Bath_session_id_count',
 'world_Bubble Bath_n_actions',
 'world_n_correct_Dino Dive',
 'world_n_incorrect_Dino Dive',
 'world_global_ratio_Dino Dive',
 'world_n_correct_Dino Dive_0',
 'world_n_incorrect_Dino Dive_0',
 'world_ratio_Dino Dive_0',
 'world_n_correct_Dino Dive_1',
 'world_n_incorrect_Dino Dive_1',
 'world_ratio_Dino Dive_1',
 'world_Dino Dive_event_code_count',
 'world_Dino Dive_session_id_count',
 'world_Dino Dive_n_actions',
 'world_n_correct_Chow Time',
 'world_n_incorrect_Chow Time',
 'world_global_ratio_Chow Time',
 'world_n_correct_Chow Time_0',
 'world_n_incorrect_Chow Time_0',
 'world_ratio_Chow Time_0',
 'world_n_correct_Chow Time_1',
 'world_n_incorrect_Chow Time_1',
 'world_ratio_Chow Time_1',
 'world_Chow Time_event_code_count',
 'world_Chow Time_session_id_count',
 'world_Chow Time_n_actions',
 'world_n_correct_Cauldron Filler (Assessment)',
 'world_n_incorrect_Cauldron Filler (Assessment)',
 'world_global_ratio_Cauldron Filler (Assessment)',
 'world_n_correct_Cauldron Filler (Assessment)_0',
 'world_n_incorrect_Cauldron Filler (Assessment)_0',
 'world_ratio_Cauldron Filler (Assessment)_0',
 'world_n_correct_Cauldron Filler (Assessment)_1',
 'world_n_incorrect_Cauldron Filler (Assessment)_1',
 'world_ratio_Cauldron Filler (Assessment)_1',
 'world_Cauldron Filler (Assessment)_event_code_count',
 'world_Cauldron Filler (Assessment)_session_id_count',
 'world_Cauldron Filler (Assessment)_n_actions',
 'world_n_correct_Pan Balance',
 'world_n_incorrect_Pan Balance',
 'world_global_ratio_Pan Balance',
 'world_n_correct_Pan Balance_0',
 'world_n_incorrect_Pan Balance_0',
 'world_ratio_Pan Balance_0',
 'world_n_correct_Pan Balance_1',
 'world_n_incorrect_Pan Balance_1',
 'world_ratio_Pan Balance_1',
 'world_Pan Balance_event_code_count',
 'world_Pan Balance_session_id_count',
 'world_Pan Balance_n_actions',
 'world_n_correct_Happy Camel',
 'world_n_incorrect_Happy Camel',
 'world_global_ratio_Happy Camel',
 'world_n_correct_Happy Camel_0',
 'world_n_incorrect_Happy Camel_0',
 'world_ratio_Happy Camel_0',
 'world_n_correct_Happy Camel_1',
 'world_n_incorrect_Happy Camel_1',
 'world_ratio_Happy Camel_1',
 'world_Happy Camel_event_code_count',
 'world_Happy Camel_session_id_count',
 'world_Happy Camel_n_actions',
 'world_n_correct_Cart Balancer (Assessment)',
 'world_n_incorrect_Cart Balancer (Assessment)',
 'world_global_ratio_Cart Balancer (Assessment)',
 'world_n_correct_Cart Balancer (Assessment)_0',
 'world_n_incorrect_Cart Balancer (Assessment)_0',
 'world_ratio_Cart Balancer (Assessment)_0',
 'world_n_correct_Cart Balancer (Assessment)_1',
 'world_n_incorrect_Cart Balancer (Assessment)_1',
 'world_ratio_Cart Balancer (Assessment)_1',
 'world_Cart Balancer (Assessment)_event_code_count',
 'world_Cart Balancer (Assessment)_session_id_count',
 'world_Cart Balancer (Assessment)_n_actions',
 'world_n_correct_Chest Sorter (Assessment)',
 'world_n_incorrect_Chest Sorter (Assessment)',
 'world_global_ratio_Chest Sorter (Assessment)',
 'world_n_correct_Chest Sorter (Assessment)_0',
 'world_n_incorrect_Chest Sorter (Assessment)_0',
 'world_ratio_Chest Sorter (Assessment)_0',
 'world_n_correct_Chest Sorter (Assessment)_1',
 'world_n_incorrect_Chest Sorter (Assessment)_1',
 'world_ratio_Chest Sorter (Assessment)_1',
 'world_Chest Sorter (Assessment)_event_code_count',
 'world_Chest Sorter (Assessment)_session_id_count',
 'world_Chest Sorter (Assessment)_n_actions',
 'world_n_correct_Leaf Leader',
 'world_n_incorrect_Leaf Leader',
 'world_global_ratio_Leaf Leader',
 'world_n_correct_Leaf Leader_0',
 'world_n_incorrect_Leaf Leader_0',
 'world_ratio_Leaf Leader_0',
 'world_n_correct_Leaf Leader_1',
 'world_n_incorrect_Leaf Leader_1',
 'world_ratio_Leaf Leader_1',
 'world_Leaf Leader_event_code_count',
 'world_Leaf Leader_session_id_count',
 'world_Leaf Leader_n_actions',
 2000,
 2010,
 2020,
 2025,
 2035,
 2060,
 2070,
 2080,
 2081,
 2083,
 3010,
 3020,
 3021,
 4010,
 4020,
 4025,
 4030,
 4035,
 4040,
 4070,
 4090,
 4100,
 4110,
 'installation_id',
 'accuracy_group',
 'n_incorrect_Bird Measurer (Assessment)_0',
 'world_n_incorrect_Bird Measurer (Assessment)_0',
 'n_incorrect_Mushroom Sorter (Assessment)_0',
 'n_incorrect_Air Show_0',
 'n_incorrect_Crystals Rule_0',
 'world_n_incorrect_Mushroom Sorter (Assessment)_0',
 'world_n_incorrect_Air Show_0',
 'world_n_incorrect_Crystals Rule_0',
 'n_incorrect_Scrub-A-Dub 1',
 'n_incorrect_Mushroom Sorter (Assessment) 1',
 'n_incorrect_Dino Drink 0',
 'Dino Drink_1996c610',
 'Dino Drink_4d6737eb',
 'Dino Drink_51311d7a',
 'Dino Drink_5be391b5',
 'Dino Drink_6c517a88',
 'Dino Drink_74e5f8a7',
 'Dino Drink_792530f8',
 'Dino Drink_7f0836bf',
 'Dino Drink_c6971acf',
 'Dino Drink_f806dc10',
 'n_incorrect_Bubble Bath 0',
 'Bubble Bath_0413e89d',
 'Bubble Bath_1340b8d7',
 'Bubble Bath_1beb320a',
 'Bubble Bath_1cf54632',
 'Bubble Bath_3bb91dda',
 'Bubble Bath_55115cbd',
 'Bubble Bath_5859dfb6',
 'Bubble Bath_857f21c0',
 'Bubble Bath_8d84fa81',
 'Bubble Bath_99abe2bb',
 'Bubble Bath_99ea62f3',
 'Bubble Bath_a0faea5d',
 'Bubble Bath_ecc36b7f',
 'world_n_incorrect_Mushroom Sorter (Assessment) 1',
 4045,
 4095,
 'Mushroom Sorter (Assessment)_160654fd',
 'n_incorrect_Air Show 1',
 'Air Show_d2659ab4',
 'n_incorrect_Crystals Rule 1',
 'world_Mushroom Sorter (Assessment)_160654fd',
 'world_n_incorrect_Air Show 1',
 'world_Air Show_d2659ab4',
 'world_n_incorrect_Crystals Rule 1',
 2075,
 'n_incorrect_All Star Sorting_0',
 'n_incorrect_All Star Sorting_1',
 'Bubble Bath_85de926c',
 'n_incorrect_Dino Dive 0',
 'Dino Dive_00c73085',
 'Dino Dive_28a4eb9a',
 'Dino Dive_29bdd9ba',
 'Dino Dive_6088b756',
 'Dino Dive_709b1251',
 'Dino Dive_76babcde',
 'Dino Dive_7d5c30a2',
 'Dino Dive_832735e1',
 'Dino Dive_87d743c1',
 'Dino Dive_c0415e5c',
 'n_incorrect_Chow Time 0',
 'Chow Time_0330ab6a',
 'Chow Time_0d1da71f',
 'Chow Time_47026d5f',
 'Chow Time_4ef8cdd3',
 'Chow Time_63f13dd7',
 'Chow Time_7372e1a5',
 'Chow Time_7ec0c298',
 'Chow Time_cfbd47c8',
 'Chow Time_d185d3ea',
 'Chow Time_f93fc684',
 'world_n_incorrect_Scrub-A-Dub 0',
 'world_Scrub-A-Dub_2b9272f4',
 'world_Scrub-A-Dub_4a09ace1',
 'world_Scrub-A-Dub_5a848010',
 'world_Scrub-A-Dub_5c3d2b2f',
 'world_Scrub-A-Dub_6d90d394',
 'world_Scrub-A-Dub_7040c096',
 'world_Scrub-A-Dub_ac92046e',
 'world_Scrub-A-Dub_c1cac9a2',
 'world_Scrub-A-Dub_cf82af56',
 'world_Scrub-A-Dub_f71c4741',
 'world_n_incorrect_All Star Sorting_0',
 'world_n_incorrect_All Star Sorting_1',
 'world_n_incorrect_Bubble Bath 0',
 'world_Bubble Bath_0413e89d',
 'world_Bubble Bath_1340b8d7',
 'world_Bubble Bath_1beb320a',
 'world_Bubble Bath_1cf54632',
 'world_Bubble Bath_3bb91dda',
 'world_Bubble Bath_55115cbd',
 'world_Bubble Bath_857f21c0',
 'world_Bubble Bath_85de926c',
 'world_Bubble Bath_8d84fa81',
 'world_Bubble Bath_99abe2bb',
 'world_Bubble Bath_99ea62f3',
 'world_Bubble Bath_a0faea5d',
 'world_Bubble Bath_ecc36b7f',
 'world_n_incorrect_Dino Dive 0',
 'world_Dino Dive_00c73085',
 'world_Dino Dive_28a4eb9a',
 'world_Dino Dive_29bdd9ba',
 'world_Dino Dive_6088b756',
 'world_Dino Dive_709b1251',
 'world_Dino Dive_76babcde',
 'world_Dino Dive_7d5c30a2',
 'world_Dino Dive_832735e1',
 'world_Dino Dive_87d743c1',
 'world_Dino Dive_c0415e5c',
 'n_incorrect_Cauldron Filler (Assessment) 0',
 'n_incorrect_Cauldron Filler (Assessment) 1',
 'Cauldron Filler (Assessment)_28520915',
 'Cauldron Filler (Assessment)_2dcad279',
 'Cauldron Filler (Assessment)_30614231',
 'Cauldron Filler (Assessment)_392e14df',
 'Cauldron Filler (Assessment)_3ee399c3',
 'Cauldron Filler (Assessment)_532a2afb',
 'Cauldron Filler (Assessment)_90d848e0',
 'n_incorrect_Pan Balance 0',
 'Pan Balance_0086365d',
 'Pan Balance_6cf7d25c',
 'Pan Balance_9c5ef70c',
 'Pan Balance_a592d54e',
 'n_incorrect_Scrub-A-Dub_0',
 'Mushroom Sorter (Assessment)_0d18d96c',
 'world_Mushroom Sorter (Assessment)_0d18d96c',
 'Pan Balance_2a444e03',
 'Pan Balance_804ee27f',
 'Pan Balance_907a054b',
 'Pan Balance_a5e9da97',
 'Pan Balance_bc8f2793',
 'Pan Balance_e7561dd2',
 'Pan Balance_f3cd5473',
 'n_incorrect_Happy Camel 0',
 'Happy Camel_1af8be29',
 'Happy Camel_3bb91ced',
 'Happy Camel_3d8c61b0',
 'Happy Camel_69fdac0a',
 'Happy Camel_6bf9e3e1',
 'Happy Camel_8af75982',
 'Happy Camel_a7640a16',
 'Happy Camel_a8a78786',
 'Happy Camel_abc5811c',
 'Happy Camel_c2baf0bd',
 'Happy Camel_d51b1749',
 'Happy Camel_d9c005dd',
 'n_incorrect_Cart Balancer (Assessment) 0',
 'n_incorrect_Cart Balancer (Assessment) 1',
 'Cart Balancer (Assessment)_5c2f29ca',
 'Cart Balancer (Assessment)_5e109ec3',
 'Cart Balancer (Assessment)_65a38bf7',
 'Cart Balancer (Assessment)_795e4a37',
 'Cart Balancer (Assessment)_828e68f9',
 'Cart Balancer (Assessment)_a8876db3',
 'Cart Balancer (Assessment)_b2e5b0f1',
 'Cart Balancer (Assessment)_d122731b',
 'world_n_incorrect_Chow Time 0',
 'world_Chow Time_0330ab6a',
 'world_Chow Time_0d1da71f',
 'world_Chow Time_4ef8cdd3',
 'world_Chow Time_63f13dd7',
 'world_Chow Time_7372e1a5',
 'world_Chow Time_7d093bf9',
 'world_Chow Time_7ec0c298',
 'world_Chow Time_cfbd47c8',
 'world_Chow Time_d185d3ea',
 'world_Chow Time_f93fc684',
 'world_n_incorrect_Pan Balance 0',
 'world_Pan Balance_0086365d',
 'world_Pan Balance_15f99afc',
 'world_Pan Balance_2a444e03',
 'world_Pan Balance_907a054b',
 'world_Pan Balance_9c5ef70c',
 'world_Pan Balance_a592d54e',
 'world_Pan Balance_a5e9da97',
 'world_Pan Balance_bc8f2793',
 'world_Pan Balance_e7561dd2',
 'world_Pan Balance_f3cd5473',
 'world_n_incorrect_Happy Camel 0',
 'world_Happy Camel_1af8be29',
 'world_Happy Camel_3bb91ced',
 'world_Happy Camel_3d8c61b0',
 'world_Happy Camel_69fdac0a',
 'world_Happy Camel_6bf9e3e1',
 'world_Happy Camel_8af75982',
 'world_Happy Camel_a7640a16',
 'world_Happy Camel_a8a78786',
 'world_Happy Camel_abc5811c',
 'world_Happy Camel_c2baf0bd',
 'world_Happy Camel_d51b1749',
 'world_Happy Camel_d9c005dd',
 'world_n_incorrect_Cart Balancer (Assessment) 0',
 'world_n_incorrect_Cart Balancer (Assessment) 1',
 'world_Cart Balancer (Assessment)_5c2f29ca',
 'world_Cart Balancer (Assessment)_5e109ec3',
 'world_Cart Balancer (Assessment)_65a38bf7',
 'world_Cart Balancer (Assessment)_795e4a37',
 'world_Cart Balancer (Assessment)_a8876db3',
 'world_Cart Balancer (Assessment)_b2e5b0f1',
 'world_Cart Balancer (Assessment)_d122731b',
 'All Star Sorting_b1d5101d',
 'Crystals Rule_a1192f43',
 'world_All Star Sorting_b1d5101d',
 'world_Crystals Rule_a1192f43',
 'Air Show_6f4bd64e',
 'n_incorrect_Chow Time 1',
 'Chow Time_9e6b7fb5',
 'Happy Camel_37db1c2f',
 'Happy Camel_c189aaf2',
 'n_incorrect_Leaf Leader 0',
 'Leaf Leader_262136f4',
 'Leaf Leader_29f54413',
 'Leaf Leader_2a512369',
 'Leaf Leader_3afde5dd',
 'Leaf Leader_67aa2ada',
 'Leaf Leader_763fc34e',
 'Leaf Leader_7dfe6d8a',
 'Leaf Leader_86ba578b',
 'Leaf Leader_8ac7cce4',
 'Leaf Leader_f32856e4',
 'Leaf Leader_fd20ea40',
 'world_n_incorrect_Chow Time 1',
 'world_Chow Time_47026d5f',
 'world_Chow Time_9e6b7fb5',
 'world_Happy Camel_37db1c2f',
 'world_Happy Camel_c189aaf2',
 'world_n_incorrect_Leaf Leader 0',
 'world_Leaf Leader_262136f4',
 'world_Leaf Leader_29f54413',
 'world_Leaf Leader_2a512369',
 'world_Leaf Leader_3afde5dd',
 'world_Leaf Leader_67aa2ada',
 'world_Leaf Leader_763fc34e',
 'world_Leaf Leader_7dfe6d8a',
 'world_Leaf Leader_86ba578b',
 'world_Leaf Leader_8ac7cce4',
 'world_Leaf Leader_f32856e4',
 'world_Leaf Leader_fd20ea40',
 'Pan Balance_e080a381',
 'n_incorrect_Chest Sorter (Assessment) 0',
 'Chest Sorter (Assessment)_0db6d71d',
 'Chest Sorter (Assessment)_155f62a4',
 'Chest Sorter (Assessment)_3ccd3f02',
 'Chest Sorter (Assessment)_3d0b9317',
 'Chest Sorter (Assessment)_562cec5f',
 'Chest Sorter (Assessment)_93b353f2',
 'Chest Sorter (Assessment)_a8efe47b',
 'Chest Sorter (Assessment)_bd612267',
 'Chest Sorter (Assessment)_df4fe8b6',
 'Bird Measurer (Assessment)_070a5291',
 'Bird Measurer (Assessment)_3393b68b',
 'Bird Measurer (Assessment)_45d01abe',
 'Bird Measurer (Assessment)_8fee50e2',
 'Bird Measurer (Assessment)_f6947f54',
 'world_Pan Balance_e080a381',
 'world_n_incorrect_Chest Sorter (Assessment) 0',
 'world_Chest Sorter (Assessment)_0db6d71d',
 'world_Chest Sorter (Assessment)_155f62a4',
 'world_Chest Sorter (Assessment)_3ccd3f02',
 'world_Chest Sorter (Assessment)_3d0b9317',
 'world_Chest Sorter (Assessment)_562cec5f',
 'world_Chest Sorter (Assessment)_93b353f2',
 'world_Chest Sorter (Assessment)_a8efe47b',
 'world_Chest Sorter (Assessment)_bd612267',
 'world_Chest Sorter (Assessment)_df4fe8b6',
 'Cauldron Filler (Assessment)_2b058fe3',
 'Cauldron Filler (Assessment)_91561152',
 'Cart Balancer (Assessment)_9d4e7b25',
 'Cart Balancer (Assessment)_acf5c23f',
 'n_incorrect_Chest Sorter (Assessment) 1',
 'Chest Sorter (Assessment)_9ce586dd',
 'Chest Sorter (Assessment)_cb1178ad',
 'world_Bird Measurer (Assessment)_070a5291',
 'world_Bird Measurer (Assessment)_3393b68b',
 'world_Bird Measurer (Assessment)_45d01abe',
 'world_Bird Measurer (Assessment)_8fee50e2',
 'world_Bird Measurer (Assessment)_f6947f54',
 'world_n_incorrect_Dino Drink 0',
 'world_Dino Drink_1996c610',
 'world_Dino Drink_4d6737eb',
 'world_Dino Drink_51311d7a',
 'world_Dino Drink_5be391b5',
 'world_Dino Drink_6c517a88',
 'world_Dino Drink_74e5f8a7',
 'world_Dino Drink_792530f8',
 'world_Dino Drink_7f0836bf',
 'world_Dino Drink_c6971acf',
 'world_Dino Drink_f806dc10',
 'world_Bubble Bath_5859dfb6',
 'n_incorrect_Bird Measurer (Assessment) 1',
 'n_incorrect_Dino Dive 1',
 'world_n_incorrect_Bird Measurer (Assessment) 1',
 'world_Cart Balancer (Assessment)_9d4e7b25',
 'world_Cart Balancer (Assessment)_acf5c23f',
 'Cart Balancer (Assessment)_3d63345e',
 'world_Cart Balancer (Assessment)_3d63345e',
 'Leaf Leader_53c6e11a',
 'world_n_incorrect_Cauldron Filler (Assessment) 0',
 'world_Cauldron Filler (Assessment)_2dcad279',
 'world_Cauldron Filler (Assessment)_30614231',
 'world_Cauldron Filler (Assessment)_3ee399c3',
 'world_Cauldron Filler (Assessment)_532a2afb',
 'world_Cauldron Filler (Assessment)_90d848e0',
 'world_n_incorrect_Cauldron Filler (Assessment) 1',
 'world_Cauldron Filler (Assessment)_28520915',
 'world_Cauldron Filler (Assessment)_392e14df',
 'Scrub-A-Dub_92687c59',
 'Bird Measurer (Assessment)_a76029ee',
 'n_incorrect_Dino Drink 1',
 'Dino Drink_6f8106d9',
 'Dino Drink_9ed8f6da',
 'Cauldron Filler (Assessment)_04df9b66',
 'Cauldron Filler (Assessment)_3edf6747',
 'n_incorrect_Happy Camel 1',
 'Happy Camel_a2df0760',
 'world_Bird Measurer (Assessment)_a76029ee',
 'world_n_incorrect_Dino Drink 1',
 'world_Dino Drink_6f8106d9',
 'world_Dino Drink_9ed8f6da',
 'world_n_incorrect_Happy Camel 1',
 'world_Happy Camel_a2df0760',
 'world_n_incorrect_Scrub-A-Dub 1',
 'world_Cauldron Filler (Assessment)_04df9b66',
 'world_Cauldron Filler (Assessment)_3edf6747',
 'Cart Balancer (Assessment)_31973d56',
 'Cart Balancer (Assessment)_4e5fc6f5',
 'world_Cart Balancer (Assessment)_31973d56',
 'world_Cart Balancer (Assessment)_4e5fc6f5',
 'Bird Measurer (Assessment)_d38c2fd7',
 'Cauldron Filler (Assessment)_5348fd84',
 'Chest Sorter (Assessment)_222660ff',
 'Chest Sorter (Assessment)_3afb49e6',
 'n_incorrect_Leaf Leader 1',
 'Leaf Leader_3b2048ee',
 'world_Chest Sorter (Assessment)_222660ff',
 'world_Chest Sorter (Assessment)_3afb49e6',
 'world_Chest Sorter (Assessment)_9ce586dd',
 'world_n_incorrect_Leaf Leader 1',
 'world_Leaf Leader_3b2048ee',
 'world_Leaf Leader_53c6e11a',
 'world_Cauldron Filler (Assessment)_2b058fe3',
 'world_Cauldron Filler (Assessment)_5348fd84',
 'world_Cauldron Filler (Assessment)_91561152',
 'Bird Measurer (Assessment)_731c0cbe',
 'world_n_incorrect_Chest Sorter (Assessment) 1',
 'n_incorrect_Pan Balance 1',
 'world_n_incorrect_Pan Balance 1',
 'n_incorrect_Bubble Bath 1',
 'world_n_incorrect_Bubble Bath 1',
 'world_n_incorrect_Dino Dive 1',
 'Happy Camel_46b50ba8',
 'Chow Time_19967db1',
 'world_Chow Time_19967db1',
 'Happy Camel_05ad839b',
 'world_Happy Camel_05ad839b',
 'world_Happy Camel_46b50ba8',
 'world_Bird Measurer (Assessment)_d38c2fd7',
 'world_Air Show_6f4bd64e',
 'Mushroom Sorter (Assessment)_eb2c19cd',
 'world_Mushroom Sorter (Assessment)_eb2c19cd',
 'Crystals Rule_93edfe2e',
 'world_Crystals Rule_93edfe2e',
 'world_Bird Measurer (Assessment)_731c0cbe',
 'Dino Dive_d3640339',
 'world_Dino Dive_d3640339',
 'Cauldron Filler (Assessment)_77c76bc5',
 'world_Cauldron Filler (Assessment)_77c76bc5',
 'Bubble Bath_29a42aea',
 4080,
 'Chow Time_6f445b57',
 'world_Chow Time_6f445b57',
 'All Star Sorting_26a5a3dd',
 'world_Chest Sorter (Assessment)_cb1178ad',
 'world_All Star Sorting_26a5a3dd',
 'Bubble Bath_6aeafed4',
 'world_Bubble Bath_6aeafed4',
 'Mushroom Sorter (Assessment)_13f56524',
 'world_Mushroom Sorter (Assessment)_13f56524',
 'world_Scrub-A-Dub_92687c59',
 'Chest Sorter (Assessment)_bfc77bd6',
 'world_Chest Sorter (Assessment)_bfc77bd6',
 'world_Bubble Bath_29a42aea',
 'Dino Drink_ab4ec3a4',
 'world_Dino Drink_ab4ec3a4',
 'Happy Camel_0ce40006',
 'Cauldron Filler (Assessment)_9554a50b',
 'world_Cauldron Filler (Assessment)_9554a50b',
 'Dino Dive_119b5b02',
 'world_Dino Dive_119b5b02',
 'Cart Balancer (Assessment)_ecc6157f',
 'world_Cart Balancer (Assessment)_ecc6157f',
 'Bird Measurer (Assessment)_6077cc36',
 'world_Bird Measurer (Assessment)_6077cc36',
 'Pan Balance_e4d32835',
 'world_Pan Balance_e4d32835',
 'Leaf Leader_01ca3a3c',
 'world_Leaf Leader_01ca3a3c',
 'world_Happy Camel_0ce40006']

In [4]:
# Back-fill every expected column that the test logs did not produce with a
# -1 placeholder, reporting each one as it is added.
for feature_name in useful_features:
    if feature_name in test_features.columns:
        continue
    test_features[feature_name] = -1
    print("Missing feature", feature_name)

# Keep the expected columns in their listed order, minus the target column.
model_columns = [name for name in useful_features if name != "accuracy_group"]
test_features = test_features[model_columns].copy()

Missing feature accuracy_group
Missing feature world_Mushroom Sorter (Assessment)_eb2c19cd
Missing feature Bubble Bath_29a42aea
Missing feature Mushroom Sorter (Assessment)_13f56524
Missing feature world_Mushroom Sorter (Assessment)_13f56524
Missing feature Chest Sorter (Assessment)_bfc77bd6
Missing feature world_Chest Sorter (Assessment)_bfc77bd6
Missing feature world_Bubble Bath_29a42aea
Missing feature Dino Drink_ab4ec3a4
Missing feature world_Dino Drink_ab4ec3a4
Missing feature Happy Camel_0ce40006
Missing feature world_Cauldron Filler (Assessment)_9554a50b
Missing feature Dino Dive_119b5b02
Missing feature world_Dino Dive_119b5b02
Missing feature Cart Balancer (Assessment)_ecc6157f
Missing feature world_Cart Balancer (Assessment)_ecc6157f
Missing feature world_Bird Measurer (Assessment)_6077cc36
Missing feature Pan Balance_e4d32835
Missing feature world_Pan Balance_e4d32835
Missing feature Leaf Leader_01ca3a3c
Missing feature world_Leaf Leader_01ca3a3c
Missing feature world_Happy 

__validation.py__

In [5]:
import numpy as np
from sklearn.model_selection import GroupKFold, KFold
from sklearn.utils import shuffle
from typing import NamedTuple
from functools import partial
from sklearn.metrics import cohen_kappa_score


class Predict(NamedTuple):
    """Labels and predictions produced for one validation fold.

    Fields are positional, in the order ``Predict(true, pred)``.
    """

    # Target values returned by ``make_predictions`` for the held-out rows.
    true: np.ndarray
    # Model output for the held-out rows (presumably aligned with ``true``;
    # alignment is the responsibility of ``make_predictions``).
    pred: np.ndarray


class InstallationFold(GroupKFold):
    """``GroupKFold`` that uses installation ids as the grouping key.

    Row positions and their ids are shuffled together with a fixed seed
    (2019) before being passed to ``GroupKFold.split``; the yielded indices
    are mapped back so they always refer to positions in the original ``X``.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds, forwarded to ``GroupKFold``.
    installation_ids : array-like, optional
        Group labels used when ``split`` is called without its own.
    """

    def __init__(self, n_splits=5, installation_ids=None):
        super().__init__(n_splits=n_splits)
        self.installation_ids = installation_ids

    def split(self, X, y=None, installation_ids=None):
        """Yield ``(train_indices, test_indices)`` pairs for each fold.

        Parameters
        ----------
        X : sized
            Only ``len(X)`` is used.
        y : ignored
            Accepted for compatibility with the sklearn splitter signature.
        installation_ids : array-like, optional
            One group label per row; falls back to the value given to the
            constructor.

        Raises
        ------
        ValueError
            If no installation ids were given here or to the constructor.
            Because this is a generator, the error surfaces on first iteration.
        """
        if installation_ids is None:
            installation_ids = self.installation_ids
        if installation_ids is None:
            raise ValueError(
                "installation_ids must be passed to InstallationFold() or to split()"
            )
        orig_indices = np.arange(len(X))
        # Shuffle positions and ids with the same permutation so they stay paired.
        shuffled_indices, shuffled_ids = shuffle(orig_indices, installation_ids, random_state=2019)
        for train, test in super().split(shuffled_indices, shuffled_indices, shuffled_ids):
            # GroupKFold returns positions within the shuffled order; translate
            # them back to positions in the caller's X.
            yield shuffled_indices[train], shuffled_indices[test]


def fit_fold(df, train_ix, test_ix, make_features, train_model, make_predictions):
    """Train on one fold and return its held-out labels and predictions.

    Parameters
    ----------
    df : DataFrame
        Full dataset; rows are selected positionally with ``iloc``.
    train_ix, test_ix : array-like of int
        Positional row indices for the training and held-out parts.
    make_features : callable
        ``make_features(train, test)`` returning a pair of argument
        sequences ``(train_features, test_features)``.
    train_model : callable
        Called as ``train_model(*train_features)``.
    make_predictions : callable
        Called as ``make_predictions(model, *test_features)``; must return
        ``(predictions, true_values)`` in that order.

    Returns
    -------
    Predict
        ``Predict(true, pred)`` for the held-out part.
    """
    # reset_index() is called without drop=True, so the previous index is
    # kept as a column in both parts.
    train_part, test_part = (
        df.iloc[rows].reset_index().copy() for rows in (train_ix, test_ix)
    )
    train_features, test_features = make_features(train_part, test_part)
    fitted = train_model(*train_features)
    predicted, actual = make_predictions(fitted, *test_features)
    return Predict(true=actual, pred=predicted)


def cross_validate(train, labels, make_features, train_model, make_predictions, cv=None):
    """Run ``fit_fold`` over every split and collect the per-fold results.

    NumPy's global RNG is seeded with 2019 before splitting. Groups are taken
    from ``train.installation_id``; when ``cv`` is omitted a default
    ``InstallationFold()`` is used.

    Returns
    -------
    list
        One ``fit_fold`` result per fold, in split order.
    """
    np.random.seed(2019)
    if cv is None:
        cv = InstallationFold()
    groups = train.installation_id.values
    return [
        fit_fold(train, ix_train, ix_test, make_features, train_model, make_predictions)
        for ix_train, ix_test in cv.split(train, labels, groups)
    ]


# Cohen's kappa with quadratic weights pre-bound; call as quad_kappa(y1, y2).
quad_kappa = partial(cohen_kappa_score, weights="quadratic")

__coeff.py__

In [6]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from functools import partial
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#from dsb2019.data.validation import quad_kappa


class ThresholdClassifier(BaseEstimator, ClassifierMixin):
    """Map continuous scores onto the ordinal classes 0-3 with three cut points.

    ``fit`` searches for the cut points with hyperopt's TPE algorithm,
    minimising the negative quadratic-weighted kappa between the labels and
    the binned scores. ``predict`` applies the stored cut points via ``pd.cut``.

    Parameters
    ----------
    n_iter : int, default=1000
        Number of hyperopt evaluations (passed as ``max_evals``).
    random_state : int, default=2019
        Seed of the ``np.random.RandomState`` handed to hyperopt.

    Attributes
    ----------
    coef_ : sequence of three floats
        Ascending thresholds. Set by ``fit``; may also be assigned directly.
    """

    def __init__(self, n_iter=1000, random_state=2019):
        self.n_iter = n_iter
        self.random_state = random_state

    @staticmethod
    def _to_classes(scores, thresholds):
        """Bin ``scores`` into labels 0-3 using ``thresholds`` as inner edges."""
        # Unpacking (rather than list concatenation) keeps this correct when
        # the thresholds arrive as a tuple or a NumPy array.
        edges = [-np.inf, *thresholds, np.inf]
        return pd.cut(scores, edges, labels=[0, 1, 2, 3])

    def _run_trial(self, X, y, params):
        """Hyperopt objective: score one candidate set of thresholds."""
        threshold1 = params["threshold1"]
        # The search space is over deltas; abs() keeps the thresholds ordered.
        threshold2 = threshold1 + abs(params["threshold2_delta"])
        threshold3 = threshold2 + abs(params["threshold3_delta"])
        thresholds = [threshold1, threshold2, threshold3]
        pred = self._to_classes(X, thresholds)
        return {
            "loss": -quad_kappa(y, pred),
            "status": STATUS_OK,
            "coef": thresholds,
        }

    def fit(self, X, y):
        """Search for thresholds on scores ``X`` against labels ``y``.

        Priors for the search are centred on the score percentiles that
        reproduce the cumulative class shares observed in ``y``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("Cannot fit thresholds on empty data")
        labels = np.asarray(y)
        # Share (in percent) of samples whose label is below 1, 2 and 3.
        cumulative_pct = [
            np.count_nonzero(labels < upper) / n_samples * 100 for upper in (1, 2, 3)
        ]
        threshold1_prior, threshold2_prior, threshold3_prior = (
            np.percentile(X, pct) for pct in cumulative_pct
        )
        threshold2_delta_prior = threshold2_prior - threshold1_prior
        threshold3_delta_prior = threshold3_prior - threshold2_prior
        # Prior spread: one third of the 1st-to-99th percentile range of X.
        prior_std = (np.percentile(X, 99) - np.percentile(X, 1)) / 3
        space = {
            "threshold1": hp.normal("threshold1", threshold1_prior, prior_std),
            "threshold2_delta": hp.normal("threshold2_delta", threshold2_delta_prior, prior_std),
            "threshold3_delta": hp.normal("threshold3_delta", threshold3_delta_prior, prior_std)
        }

        partial_run = partial(self._run_trial, X, y)

        trials = Trials()
        # NOTE(review): newer hyperopt releases reportedly expect a
        # np.random.Generator for rstate - confirm against the installed version.
        fmin(partial_run, space=space,
             algo=tpe.suggest,
             max_evals=self.n_iter, rstate=np.random.RandomState(self.random_state), trials=trials)

        self.coef_ = trials.best_trial["result"]["coef"]
        return self

    def predict(self, X):
        """Bin scores ``X`` into classes 0-3 using the thresholds in ``coef_``."""
        return self._to_classes(X, self.coef_)

In [7]:
def make_submission(test_features, model, thresholds=None):
    """Build the submission frame from test features and a fitted model.

    Parameters
    ----------
    test_features : DataFrame
        Must contain an ``installation_id`` column; every other column is
        passed to ``model.predict``.
    model : object with ``predict``
        Produces one continuous score per row.
    thresholds : sequence of three floats, optional
        Cut points used to bin the scores into accuracy groups. Defaults to
        the notebook-level ``coef`` so existing calls behave as before.

    Returns
    -------
    DataFrame
        Columns ``installation_id`` and ``accuracy_group``.
    """
    if thresholds is None:
        thresholds = coef
    installations = test_features.installation_id.values
    feature_matrix = test_features.drop("installation_id", axis=1)
    raw_scores = model.predict(feature_matrix)
    clf = ThresholdClassifier()
    # Copy into a list: the classifier assembles its bin edges from coef_.
    clf.coef_ = list(thresholds)
    accuracy_groups = clf.predict(raw_scores)
    return pd.DataFrame(data={"installation_id": installations, "accuracy_group": accuracy_groups})

# Score the prepared test features with the loaded booster and bin the scores.
submission = make_submission(test_features, model)

In [8]:
# Write the submission to disk as CSV, omitting the DataFrame index.
submission.to_csv("../../data/submissions/regression_baseline_eventid_bag.csv", index=False)

In [9]:
# Sanity check: how many rows were assigned to each predicted accuracy group.
submission.accuracy_group.value_counts()

3    420
2    296
1    147
0    137
Name: accuracy_group, dtype: int64