In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from target_encoding import TargetEncoderClassifier, TargetEncoder
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from functools import reduce
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.model_selection import cross_val_score
import json
from functools import partial
from dsb2019.models.coeff import ThresholdClassifier

from dsb2019.models.tracking import track_experiment, track_submission_info
from dsb2019.data.validation import InstallationFold, cross_validate, quad_kappa
from dsb2019.visualization import session_browser
from dsb2019.data import DATA_DIR
from dsb2019.data import adv_validation
from dsb2019.models import MODELS_DIR
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import hyperopt
from hyperopt import hp, fmin, Trials, tpe, STATUS_OK
tqdm.pandas()
pd.options.display.max_rows=999

In [3]:
train = pd.read_csv(DATA_DIR / 'raw/train.csv')
test = pd.read_csv(DATA_DIR / 'raw/test.csv')
train_labels = pd.read_csv(DATA_DIR / 'raw/train_labels.csv')
submission = pd.read_csv(DATA_DIR / 'raw/sample_submission.csv')

In [4]:
train.event_id

0           27253bdc
1           27253bdc
2           77261ab5
3           b2dba42b
4           1bb5fbdb
              ...   
11341037    ab3136ba
11341038    27253bdc
11341039    27253bdc
11341040    27253bdc
11341041    27253bdc
Name: event_id, Length: 11341042, dtype: object

In [5]:
games = ['Scrub-A-Dub', 'All Star Sorting',
       'Air Show', 'Crystals Rule', 
       'Dino Drink', 'Bubble Bath', 'Dino Dive', 'Chow Time',
       'Pan Balance', 'Happy Camel',
       'Leaf Leader']
assessments = ['Mushroom Sorter (Assessment)',
       'Bird Measurer (Assessment)',
       'Cauldron Filler (Assessment)',
       'Cart Balancer (Assessment)', 'Chest Sorter (Assessment)']
worlds = ['NONE', 'MAGMAPEAK', 'TREETOPCITY', 'CRYSTALCAVES']

def unwrap_event_data(df):
    unwrapped=pd.DataFrame(data=list(df.event_data.apply(json.loads).values))
    return pd.concat([unwrapped.reset_index(),df.reset_index()],axis=1)


def process_installations(train_labels, train, process_log):
    result = []
    train=train.sort_values("timestamp")
    installations = train.groupby("installation_id")
    for i, game_session, title, installation_id, accuracy_group in tqdm(train_labels[["game_session", "title", "installation_id", "accuracy_group"]].itertuples(), 
                                                              total=len(train_labels), position=0):
        player_log = installations.get_group(installation_id).reset_index()
        log_length = player_log[(player_log.game_session==game_session) & (player_log.title==title)].index[0]
        player_log = player_log.iloc[:(log_length + 1)]
        player_log["accuracy_group"] = accuracy_group
        player_log["target_game_session"] = game_session
        features = process_log(player_log)
        features["installation_id"] = installation_id
        features["accuracy_group"] = accuracy_group
        result.append(features)
    return pd.DataFrame(data=result).fillna(-1)


def make_counters(df, column):
    return df.groupby(column)[column].count().to_dict()

    
def process_log(df):
    assessment_title=df.title.iloc[-1]   
    world=df.world.iloc[-1]

    history = df.iloc[:-1]
    history = history[history.type.isin(["Game", "Assessment"])].copy()

    def calculate_ratios(df):
        n_correct=df.correct_move.sum()
        n_incorrect=df.wrong_move.sum()
        ratio=n_correct/(n_correct+n_incorrect)
        return n_correct, n_incorrect, ratio
    
    def make_move_stats(df, title,n_lags=2):
        df=df.copy()
        if len(df):
            df = unwrap_event_data(df)
        if "correct" in df.columns:
            df["correct_move"] = df.correct == True
            df["wrong_move"] = df.correct == False
        else:
            df["correct_move"]=False
            df["wrong_move"]=False
        result = []
        result.extend(zip([f"n_correct_{title}", f"n_incorrect_{title}", f"global_ratio_{title}"], calculate_ratios(df)))

        if n_lags:
            last_sessions = df.game_session.unique()[-n_lags:]
            for i in range(n_lags):
                if i < len(last_sessions): 
                    result.extend(zip([f"n_correct_{title}_{i}", f"n_incorrect_{title} {i}",f"ratio_{title}_{i}"], 
                                      calculate_ratios(df[df.game_session==last_sessions[i]])))
                else:
                    result.extend(zip([f"n_correct_{title}_{i}", f"n_incorrect_{title}_{i}",f"ratio_{title}_{i}"], [None, None, None]))
        return {k: v for k, v in result}
    
    
    result = {"title": assessments.index(assessment_title),
              "world": worlds.index(world),
              "n_activities": df[df.type=="Activity"].game_session.nunique(),
              "n_games": df[df.type=="Game"].game_session.nunique(),
              "event_code_count": df.event_code.nunique(),
              "event_id_count": df.event_id.nunique(),
              "title_count": df.title.nunique(),
              "n_actions": len(df),
              "world_title_count": df[df.world==world].title.nunique(),
             }
    for game in games:
        stats=history[history.title==game]
        stats_features=make_move_stats(stats, game)
        stats_features[f"{game}_event_code_count"] = stats.event_code.nunique()
        #stats_features[f"{game}_event_id_count"] = stats.event_id.nunique()
        stats_features[f"{game}_session_id_count"] = stats.game_session.nunique()
        stats_features[f"{game}_n_actions"] = len(stats)
        result.update(stats_features)
        result.update({f"{game}_{k}": v for k, v in make_counters(stats, "event_id").items()})
        result.update({f"{game}_{k}": v for k, v in make_counters(stats, "event_code").items()})
    world_games = history[history.world==world]
    for game in games:
        stats=world_games[world_games.title==game]
        stats_features=make_move_stats(stats, game)
        stats_features = {f"world_{k}": v for k, v in stats_features.items()}
        stats_features[f"world_{game}_event_code_count"] = stats.event_code.nunique()
        stats_features[f"world_{game}_event_id_count"] = stats.event_id.nunique()
        stats_features[f"world_{game}_session_id_count"] = stats.game_session.nunique()
        stats_features[f"world_{game}_n_actions"] = len(stats)
        result.update(stats_features)
        result.update({f"world_{game}_{k}": v for k, v in make_counters(stats, "event_id").items()})
        result.update({f"world_{game}_{k}": v for k, v in make_counters(stats, "event_code").items()})
    result.update(make_counters(history, "event_id"))
    result.update(make_counters(history, "event_code"))
    return result

In [6]:
def create_lgb_splits_files(assessments):
    def binarize(arr):
        if len(arr)==1:
            return None
        else:
            split_at = len(arr)//2
            threshold = arr[split_at]
            left = binarize(arr[:split_at])
            right = binarize(arr[split_at:])
            result = {
                "feature": "title",
                "threshold": threshold,
            }
            if left is not None:
                result["left"]=left
            if right is not None:
                result["right"]=right
            return result
    bin_ups = [x - 0.1 for x in range(len(assessments))]
    forced_splits = binarize(bin_ups)
    
    def collect_bins(node, result):
        result.add(node["threshold"])
        if "left" in node:
            collect_bins(node["left"], result)
        if "right" in node:
            collect_bins(node["right"], result)
    
#     bin_upper_bound = set([])
#     collect_bins(forced_splits, bin_upper_bound)
#     forced_bins = [
#         {
#             "feature": "title",
#             "bin_upper_bound": sorted(bin_upper_bound)
#         }
#     ]
    return forced_splits
forced_splits = create_lgb_splits_files(assessments)
with open("forced_splits.json", "w") as f:
    json.dump(forced_splits, f)
# with open("forced_bins.json", "w") as f:
#     json.dump(forced_bins, f)

In [7]:
forced_splits

{'feature': 'title',
 'threshold': 1.9,
 'left': {'feature': 'title', 'threshold': 0.9},
 'right': {'feature': 'title',
  'threshold': 2.9,
  'right': {'feature': 'title', 'threshold': 3.9}}}

In [None]:
train_features = process_installations(train_labels, train, process_log)

 73%|███████▎  | 12993/17690 [43:33<11:14,  6.96it/s] 

In [None]:
bad_features = ["session_id_count", "event_id_count"]

In [None]:
def get_duplicate_features(features, bad_features):
    to_remove = set([])
    counter = 0
    feature_names=[f for f in features.columns if f not in ("installation_id", "game_session", "accuracy_group")]
    for feat_a in tqdm(feature_names):
        for feat_b in feature_names:
            if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
                c = np.corrcoef(features[feat_a], features[feat_b])[0][1]
                if c > 0.995:
                    counter += 1
                    to_remove.add(feat_b)
                    if feat_b in bad_features or feat_a in bad_features:
                        to_remove.add(feat_a)
                    #print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))
    for bf in bad_features:
        to_remove.add(bf)
    print(f"{len(to_remove)} features were removed ({round(len(to_remove)/len(feature_names)*100, 2)}% of all features)")
    return list(to_remove)
    
duplicate_features = get_duplicate_features(train_features, bad_features)

useful_features = [f for f in train_features.columns if f not in duplicate_features]
useful_features

In [None]:
train_features=train_features[useful_features].copy()

In [None]:
def lgb_quad_kappa(preds, true):
    true = true.get_label()
    #preds = preds.reshape((4, -1)).argmax(axis=0)
    preds = np.rint(preds)
    preds = np.maximum(0, preds)
    preds = np.minimum(3, preds)
    return "quad_kappa", quad_kappa(true, preds), True
    
    
def train_baseline(x_train,y_train, params=None):
    x_train_all, x_val_all,y_train_all,y_val_all = train_test_split(
        x_train,y_train,
        test_size=0.15,
        random_state=2019,
    )
    train_set = lgb.Dataset(x_train_all, y_train_all)
    val_set = lgb.Dataset(x_val_all, y_val_all)

#     params = {
#         'learning_rate': 0.01,
#         'bagging_fraction': 0.9,
#         'feature_fraction': 0.9,
#         'num_leaves': 14,
#         'lambda_l1': 0.1,
#         'lambda_l2': 1,
#         'metric': 'multiclass',
#         'objective': 'multiclass',
#         'num_classes': 4,
#         'random_state': 2019
#     }

    return lgb.train(params, train_set, num_boost_round=10000, early_stopping_rounds=2000, valid_sets=[val_set], verbose_eval=100)#,
                    #feval=lgb_quad_kappa)

def make_features(df):
    return df.drop(["installation_id", "accuracy_group"], axis=1), df.accuracy_group.values

def make_features_wrapper(train, test):
    def make_features(df):
        return df.drop(["installation_id", "accuracy_group"], axis=1), df.accuracy_group.values
    
    return make_features(train), make_features(test) 


def make_predictions(model,x_test_all,y_test):
    preds=model.predict(x_test_all)
    #preds = np.rint(preds)
    #preds = np.maximum(0, preds)
    #preds = np.minimum(3, preds)
    return preds,y_test

In [None]:
def process_test_installations(test):
    test = test.sort_values("timestamp")
    test=test.groupby("installation_id").progress_apply(process_log).reset_index()
    test.columns = ["installation_id", "features"]
    result = []
    for i, installation_id, feature in test.itertuples():
        result.append(feature)
        feature["installation_id"]=installation_id
    return pd.DataFrame(result).fillna(-1)
test_features=process_test_installations(test)

for useful_feature in useful_features:
    if useful_feature not in test_features.columns:
        test_features[useful_feature]=-1
        print("Missing feature", useful_feature)

test_features=test_features[[c for c in useful_features if c != "accuracy_group"]].copy()

In [None]:
!ls ../dsb2019/models

In [None]:
with open("../dsb2019/models/regression_baseline_adv_params.json", "r") as f:
    validator_params=json.load(f)
#validator_paramstor=best_params
selected_features = [f for f in train_features.columns if f not in ("installation_id", "game_session", "accuracy_group")]
validator = adv_validation.AdversarialValidator(validator_params, train_features, test_features, selected_features=selected_features,test_size=0.5)
validator.fit()
print(validator.roc_auc())
validator.shap_important_features()

In [None]:
subtrain_installations=pd.Series(train_features.installation_id.unique()).sample(frac=1., random_state=2019)
subtrain_features=train_features[train_features.installation_id.isin(subtrain_installations.values)].copy()
def check_hyperparams(params):
    print(params)
    if "max_depth" in params:
        params["max_depth"] = int(params["max_depth"])
    if "num_leaves" in params:
        params["num_leaves"] = int(params["num_leaves"])

    train_baseline_with_params = partial(train_baseline, params=params)
    cv=InstallationFold(n_splits=3)
    predictions = cross_validate(subtrain_features, subtrain_features.accuracy_group, make_features_wrapper, train_baseline_with_params, make_predictions,
                                cv=cv)
    return {
        "loss": np.mean([mean_squared_error(true, pred) for pred, true in predictions]),
        "status": STATUS_OK,
        "params": params
    }


def tune(check_params, n_tries=25, n_learning_rate_tries=15, learning_rate=None, n_estimators=10_000):        
    if learning_rate is None:
        learning_rate_space = {
            'learning_rate': hp.loguniform("learning_rate", np.log(0.005), np.log(0.3)),
            'metric': 'rmse',
            'objective': 'rmse',
            #'num_classes': 4,
            'random_state': 2019,
            "n_estimators": n_estimators,
            "forced_splits": "forced_splits.json",
            #"forcedbins_filename": "forced_bins.json",

        }
        trials = Trials()
        result = fmin(check_params,
                      learning_rate_space, tpe.suggest, n_learning_rate_tries, trials=trials)
        print(result)
        learning_rate = round(trials.best_trial["result"]["params"]["learning_rate"], 3)

    param_space = {
        'metric': 'rmse',
        'objective': 'rmse',
        #'num_classes': 4,
        'lambda_l1': hp.uniform("lamba_l1", 1e-10, 1),
        'lambda_l2': hp.uniform("lambda_l2", 1e-10, 1),
        'random_state': 2019,
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "max_depth": hp.quniform("max_depth", 3, 16, 1),
        "num_leaves": hp.choice("num_leaves", [5, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095]),
        "subsample": hp.quniform("subsample", 0.01, 1, 0.01),
        "feature_fraction": hp.quniform("feature_fraction", 0.01, 1, 0.01),
        "forced_splits": "forced_splits.json",
        #"forcedbins_filename": "forced_bins.json",
    }

    trials = Trials()
    fmin(check_params,
         param_space, tpe.suggest, n_tries, trials=trials)
    best_params = trials.best_trial["result"]["params"]
    return best_params

In [None]:
best_params=tune(check_hyperparams, n_tries=100, n_learning_rate_tries=10)

In [32]:
best_params["max_depth"]=10
best_params["num_leaves"]=63

What was selected on 100% of the data

```
{'feature_fraction': 0.53,
 'lambda_l1': 0.922950554822482,
 'lambda_l2': 0.835047934936944,
 'learning_rate': 0.006,
 'max_depth': 11,
 'metric': 'rmse',
 'n_estimators': 10000,
 'num_leaves': 31,
 'objective': 'rmse',
 'random_state': 2019,
 'subsample': 0.9500000000000001}

```

In [None]:
# best_params={'feature_fraction': 0.58,
#  'lambda_l1': 0.45619796864269707,
#  'lambda_l2': 0.033257384218246686,
#  'learning_rate': 0.007,
#  'max_depth': 14,
#  'metric': 'multiclass',
#  'n_estimators': 10000,
#  'num_classes': 4,
#  'num_leaves': 31,
#  'objective': 'multiclass',
#  'random_state': 2019,
#  'subsample': 0.9500000000000001}

In [None]:
with open("../dsb2019/models/regression_baseline_splits_params.json", "w") as f:
    json.dump(best_params, f)

In [33]:
baseline_model=train_baseline(train_features.drop(["installation_id", "accuracy_group"], axis=1), train_features.accuracy_group.values, 
               params=best_params)

Training until validation scores don't improve for 2000 rounds
[100]	valid_0's rmse: 5.50926e+38
[200]	valid_0's rmse: 5.50926e+38
[300]	valid_0's rmse: 5.50926e+38
[400]	valid_0's rmse: 5.50926e+38
[500]	valid_0's rmse: 5.50926e+38
[600]	valid_0's rmse: 5.50926e+38
[700]	valid_0's rmse: 5.50926e+38
[800]	valid_0's rmse: 5.50926e+38
[900]	valid_0's rmse: 5.50926e+38
[1000]	valid_0's rmse: 5.50926e+38
[1100]	valid_0's rmse: 5.50926e+38
[1200]	valid_0's rmse: 5.50926e+38
[1300]	valid_0's rmse: 5.50926e+38
[1400]	valid_0's rmse: 5.50926e+38
[1500]	valid_0's rmse: 5.50926e+38
[1600]	valid_0's rmse: 5.50926e+38
[1700]	valid_0's rmse: 5.50926e+38
[1800]	valid_0's rmse: 5.50926e+38
[1900]	valid_0's rmse: 5.50926e+38
[2000]	valid_0's rmse: 5.50926e+38
Early stopping, best iteration is:
[1]	valid_0's rmse: 33.8827


In [34]:
predictions = cross_validate(train_features, train_features.accuracy_group, make_features_wrapper, partial(train_baseline, params=best_params), 
                             make_predictions)
np.mean([mean_squared_error(true, pred) for pred, true in predictions]), [mean_squared_error(true, pred) for pred, true in predictions]



Training until validation scores don't improve for 2000 rounds
[100]	valid_0's rmse: 1.34766
[200]	valid_0's rmse: 2.03436
[300]	valid_0's rmse: 1.02186
[400]	valid_0's rmse: 0.991171
[500]	valid_0's rmse: 1.16942
[600]	valid_0's rmse: 1.0216
[700]	valid_0's rmse: 1.03774
[800]	valid_0's rmse: 1.21829
[900]	valid_0's rmse: 1.03588
[1000]	valid_0's rmse: 1.00013
[1100]	valid_0's rmse: 1.01633
[1200]	valid_0's rmse: 1.0313
[1300]	valid_0's rmse: 0.999637
[1400]	valid_0's rmse: 1.00326
[1500]	valid_0's rmse: 1.005
[1600]	valid_0's rmse: 1.00474
[1700]	valid_0's rmse: 1.02449
[1800]	valid_0's rmse: 1.00784
[1900]	valid_0's rmse: 1.00841
[2000]	valid_0's rmse: 1.0138
[2100]	valid_0's rmse: 1.01288
[2200]	valid_0's rmse: 1.01316
[2300]	valid_0's rmse: 1.017
Early stopping, best iteration is:
[368]	valid_0's rmse: 0.990352
Training until validation scores don't improve for 2000 rounds
[100]	valid_0's rmse: 1.47953
[200]	valid_0's rmse: 0.995861
[300]	valid_0's rmse: 1.02412
[400]	valid_0's rm

(1.0244837466871028,
 [1.0360890255297464,
  1.0147004267897548,
  0.980720702954557,
  1.018300951919899,
  1.0726076262415578])

In [None]:
baseline_model.save_model(str(MODELS_DIR / "regression_baseline_splits.lgb"))

In [None]:
features, target = make_features(train_features)
prediction=baseline_model.predict(features)
clf = ThresholdClassifier()
clf.fit(prediction, target)

In [26]:
clf.coef_