In [1]:
import os
import gc
import re
import abc
import json
import copy
import time
import eli5
import datetime
from numba import jit
from pathlib import Path

from IPython.display import HTML
import altair as alt
from category_encoders.ordinal import OrdinalEncoder
from typing import List, Any

import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

import random
random.seed(42)
np.random.seed(42)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import lightgbm as lgb

from bayes_opt import BayesianOptimization
import shap

from itertools import product
from functools import partial

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn import linear_model
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score, mean_squared_error, mean_absolute_error


# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from collections import defaultdict
from joblib import Parallel, delayed

# Notebook-wide pandas display settings.
pd.set_option('display.max_columns', 1000)
# Use the fully qualified key: the abbreviated 'max_rows' can match more than one
# registered option on newer pandas releases, which makes set_option raise.
pd.set_option('display.max_rows', 500)
pd.options.display.precision = 15
# Re-seed NumPy's global RNG (the same seed is also set right after the NumPy import).
np.random.seed(42)

import warnings
warnings.filterwarnings("ignore")



# Objective

- LightGBMだけで、メダル圏内のスコアを目指す。
- まずは、Baselineとなるnotebookを作成する。

# Notes

- Pipelineを整理する。
- 各ポイントでキャッシュを残す。
- 特徴量エンジニアリングは、可読性が高く、追記しやすいことを意識する。
    - 変数名が長くなりすぎるのを防ぐために略称をつけるが、なんの略語なのかを書き残しておく。
    - 特徴量ごとの塊を意識する。
    
### Abbreviations

- sess -> session, game_session


### Scores
- baseline
    - oof: 
    - eval: 

## Load Data

In [85]:
def read_data(files: list, data_dir=None):
    """Read CSV files from the competition data directory.

    Parameters
    ----------
    files : list
        CSV file names, relative to the data directory.
    data_dir : str or path-like, optional
        Directory to read from. When omitted, the Kaggle input directory is
        used if it exists, otherwise the local reduced-data directory.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``files``, in the same order, so the result
        can be unpacked directly (``train, test = read_data([...])``).
    """
    if data_dir is None:
        if os.path.exists('/kaggle/input/data-science-bowl-2019/'):
            data_dir = '/kaggle/input/data-science-bowl-2019/'
        else:
            data_dir = '../../data/reduced/'
    # Path joining removes the dependence on a trailing slash in the directory.
    data_dir = Path(data_dir)

    dst_data = {}
    for file in files:
        print(f'Reading {file} ....')
        dst_data[file] = pd.read_csv(data_dir / file)
        print(f'{file} file have {dst_data[file].shape[0]} rows and {dst_data[file].shape[1]} columns.')

    # A real list (not a dict view) so callers can also index the result.
    return list(dst_data.values())


# 'sample_submission.csv', 'specs.csv', 'test.csv', 'train_labels.csv', 'train.csv'
# Only two of the files listed above are loaded; the frames come back in the
# same order as the file names, which is what the tuple unpacking relies on.
raw_train, raw_test = read_data(['train.csv', 'test.csv'])

Reading train.csv ....
train.csv file have 11341042 rows and 11 columns.
Reading test.csv ....
test.csv file have 1156414 rows and 11 columns.


In [86]:
# Preview the first rows of the raw training frame.
raw_train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


## Create Feature

In [89]:
class Feature(metaclass=abc.ABCMeta):
    """Base class for one cached group of engineered features.

    Subclasses implement :meth:`create_features`, which is expected to fill
    ``self.train`` and ``self.test``. :meth:`run` then converts the column
    names to strings and decorates them with the class-level ``prefix`` /
    ``suffix``; :meth:`save` pickles both frames under ``save_dir``.
    """

    prefix = ""            # when set, columns become "<prefix>_<name>"
    suffix = ""            # when set, columns become "<name>_<suffix>"
    save_dir = "features"  # directory that receives the pickled frames
    is_feature = True

    def __init__(self):
        self.name = self.__class__.__name__
        # Make sure the cache directory exists before anything is saved.
        Path(self.save_dir).mkdir(exist_ok=True, parents=True)
        self.train = pd.DataFrame()
        self.test = pd.DataFrame()
        self.train_path = Path(self.save_dir) / f"{self.name}_train.pkl"
        self.test_path = Path(self.save_dir) / f"{self.name}_test.pkl"

    def run(self, train_df, test_df=None, log=False):
        """Build the features and normalise their column names.

        :param train_df: input frame handed to ``create_features``
        :param test_df: input frame handed to ``create_features`` (may be None)
        :param log: currently unused; kept so existing callers keep working
        :return: ``self``, so calls can be chained (``Feature().run(...).save()``)
        """
        self.create_features(train_df, test_df)
        prefix = self.prefix + "_" if self.prefix else ""
        # The separator belongs in front of the suffix; appending it after the
        # suffix produced names such as "colsuffix_" instead of "col_suffix".
        suffix = "_" + self.suffix if self.suffix else ""
        self.train.columns = self._decorate(self.train.columns, prefix, suffix)
        self.test.columns = self._decorate(self.test.columns, prefix, suffix)
        return self

    @staticmethod
    def _decorate(columns, prefix, suffix):
        """Return the columns as strings wrapped in ``prefix`` / ``suffix``."""
        return pd.Index([f"{prefix}{c}{suffix}" for c in columns])

    @abc.abstractmethod
    def create_features(self, train_df, test_df):
        """Populate ``self.train`` and ``self.test``; implemented by subclasses."""
        raise NotImplementedError

    def save(self):
        """Pickle the train and test feature frames to their cache paths."""
        self.train.to_pickle(str(self.train_path))
        self.test.to_pickle(str(self.test_path))

In [90]:
class PastSessSummary(Feature):
    """Feature block summarising past sessions (work in progress).

    At the moment it only stores the input frames on the instance;
    ``self.train`` / ``self.test`` are left as the empty frames created by
    the base class.
    """

    # Build features from the data of a single session.
    # Declared static because the stub takes no arguments: without this,
    # calling it on an instance raised TypeError (missing ``self``), while
    # calling it on the class keeps working as before.
    @staticmethod
    def parse_session():
        """Placeholder for the per-session feature extraction."""
        pass

    # Responsible for grouping by installation_id; assigns the data to
    # attributes of this object.
    def create_features(self, train_df, test_df):
        """Keep references to the input frames for later processing."""
        self.train_df = train_df
        self.test_df = test_df

In [94]:
# Planning notes kept as a bare string literal; the string itself has no runtime effect.
'''
TODO:
- gruopby(['installation_id', 'game_session'])ごとの以下のカウント値を作る
    - event_code
    - title
    - type
    - world
- accuracy_groupの算出
- session_time
'''

# Prints an empty string without a newline — presumably so the cell does not
# end on the string expression above and echo it as output.
print('', end='')

In [92]:
# Build the feature block from the raw frames and pickle its train/test tables.
PastSessSummary().run(raw_train, raw_test).save()

## Feature Selection

In [69]:
# Load Feature

# Reduce Feature

# Manual Reduced Feature

## Training Model

In [None]:
class Base_Model(object):
    """Cross-validated training skeleton shared by the concrete models.

    Constructing an instance immediately runs the whole CV loop and stores
    ``oof_pred`` (out-of-fold predictions for the training rows), ``y_pred``
    (test predictions averaged over folds), ``score`` and ``models``.

    Subclasses must implement ``train_model``, ``convert_dataset``,
    ``get_params`` and ``set_params``.
    """

    def __init__(self, train_df, test_df, features, categoricals=None, n_splits=5, cv_method='StratifiedKFold', verbose=True, ps=None):
        """
        :param train_df: training frame; must contain ``features`` and the target column
        :param test_df: frame to predict on; must contain ``features``
        :param features: list of feature column names
        :param categoricals: names of categorical features (defaults to none)
        :param n_splits: number of CV folds
        :param cv_method: ``'StratifiedKFold'`` or ``'GroupKFold'``
        :param verbose: print training progress and the final score
        :param ps: hyper-parameter overrides handed to ``set_params``
        """
        self.train_df = train_df
        self.test_df = test_df
        self.features = features
        self.n_splits = n_splits
        # None sentinels instead of mutable defaults shared between instances.
        self.categoricals = [] if categoricals is None else categoricals
        self.target = 'accuracy_group'
        self.cv = self.get_cv(cv_method)
        self.verbose = verbose
        self.params = self.set_params({} if ps is None else ps)
        self.oof_pred, self.y_pred, self.score, self.models = self.fit()

    def train_model(self, train_set, val_set):
        """Fit and return one model for a single fold."""
        raise NotImplementedError

    def get_cv(self, cv_method):
        """Return the fold index generator for the requested CV scheme.

        :raises ValueError: if ``cv_method`` is not a supported name
        """
        if cv_method == 'StratifiedKFold':
            cv = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=42)
            return cv.split(self.train_df, self.train_df[self.target])
        if cv_method == 'GroupKFold':
            cv = GroupKFold(n_splits=self.n_splits)
            return cv.split(self.train_df, self.train_df[self.target], self.train_df['installation_id'])
        # Fail here rather than returning None and crashing later inside fit().
        raise ValueError(f'Unsupported cv_method: {cv_method!r}')

    def get_params(self):
        """Return the default hyper-parameters of the model."""
        raise NotImplementedError

    def set_params(self, ps=None):
        """Return the hyper-parameters to train with, given the overrides ``ps``."""
        raise NotImplementedError

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into the ``(train_set, val_set)`` objects the model expects."""
        raise NotImplementedError

    def convert_x(self, x):
        """Convert a feature frame for ``model.predict``; identity by default."""
        return x

    def fit(self):
        """Run the CV loop.

        :return: ``(oof_pred, y_pred, loss_score, models)``
        """
        # Size the buffers from the frames given to this instance instead of
        # notebook-level globals, so the class works with any input.
        oof_pred = np.zeros((len(self.train_df), ))
        y_pred = np.zeros((len(self.test_df), ))
        models = []

        x_all = self.train_df[self.features]
        y_all = self.train_df[self.target]

        for fold, (train_idx, val_idx) in enumerate(self.cv):
            # The splitters yield positional indices, so select with .iloc;
            # label-based lookup picks wrong rows (or fails) on a non-default index.
            x_train, x_val = x_all.iloc[train_idx], x_all.iloc[val_idx]
            y_train, y_val = y_all.iloc[train_idx], y_all.iloc[val_idx]
            train_set, val_set = self.convert_dataset(x_train, y_train, x_val, y_val)
            model = self.train_model(train_set, val_set)
            models.append(model)

            conv_x_val = self.convert_x(x_val)
            oof_pred[val_idx] = model.predict(conv_x_val).reshape(oof_pred[val_idx].shape)
            x_test = self.convert_x(self.test_df[self.features])
            # Each fold contributes an equal share to the averaged test prediction.
            y_pred += model.predict(x_test).reshape(y_pred.shape) / self.n_splits
            # NOTE(review): eval_qwk_lgb_regr is not defined in the visible part of
            # this notebook; it is used here as returning a 3-tuple whose second
            # element is the score — confirm it is defined before this runs.
            print('Partial score of fold {} is: {}'.format(fold, eval_qwk_lgb_regr(y_val, oof_pred[val_idx])[1]))
        _, loss_score, _ = eval_qwk_lgb_regr(self.train_df[self.target], oof_pred)
        if self.verbose:
            print(f'\nOur oof cohen kappa score is: {loss_score}\n')
        return oof_pred, y_pred, loss_score, models

In [None]:
class Lgb_Model(Base_Model):
    """LightGBM regression model trained through the Base_Model CV loop."""

    # Parameters that must be integers; optimisers may propose them as floats.
    _INT_PARAMS = (
        'n_estimators', 'subsample_freq', 'max_depth',
        'num_leaves', 'min_data_in_leaf', 'early_stopping_rounds',
    )

    def train_model(self, train_set, val_set):
        """Train one booster on a fold, evaluating on both train and validation sets."""
        verbosity = 100 if self.verbose else 0
        # NOTE(review): `verbose_eval` is no longer accepted by lgb.train in
        # LightGBM >= 4.0 (logging moved to callbacks) — confirm the installed version.
        return lgb.train(self.params, train_set, valid_sets=[train_set, val_set], verbose_eval=verbosity)

    def convert_dataset(self, x_train, y_train, x_val, y_val):
        """Wrap the fold data into ``lgb.Dataset`` objects."""
        train_set = lgb.Dataset(x_train, y_train, categorical_feature=self.categoricals)
        val_set = lgb.Dataset(x_val, y_val, categorical_feature=self.categoricals)
        return train_set, val_set

    def get_params(self):
        """Return the default LightGBM parameters."""
        params = {
            'n_estimators': 5000,
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'subsample': 0.75,
            'subsample_freq': 1,
            'learning_rate': 0.01,
            'feature_fraction': 0.9,
            'max_depth': 15,
            'lambda_l1': 1,
            'lambda_l2': 1,
            'early_stopping_rounds': 100
        }
        return params

    def set_params(self, ps=None):
        """Return the defaults with every override in ``ps`` applied.

        Previously overrides were applied only when ``'subsample_freq'`` was
        present, and only for a fixed list of six keys: any other key (for
        example ``num_leaves`` or ``min_data_in_leaf``) was silently dropped
        and a partial dict raised KeyError. All supplied keys are now
        honoured; integer-valued parameters are cast to ``int``.

        :param ps: mapping of parameter name to value, or None for no overrides
        """
        params = self.get_params()
        for key, value in (ps or {}).items():
            params[key] = int(value) if key in self._INT_PARAMS else value
        return params

    def get_feature_importance(self, models):
        """Return per-fold importances plus their mean and std, one row per feature.

        Rows are sorted ascending by the mean importance. The column name
        'Agerage_Importance' (sic) is kept unchanged because other code may
        already refer to it.
        """
        feature_importance = pd.DataFrame(
            [model.feature_importance() for model in models],
            columns=models[0].feature_name()
        ).T

        # After the transpose the first len(models) columns hold one fold each.
        per_fold = feature_importance.iloc[:, :len(models)]
        feature_importance['Agerage_Importance'] = per_fold.mean(axis=1)
        feature_importance['importance_std'] = per_fold.std(axis=1)
        return feature_importance.sort_values(by='Agerage_Importance')

    def plot_importance(self, models, max_num_features=50, figsize=(12, 15)):
        """Plot the ``max_num_features`` most important features as horizontal bars."""
        feature_importance = self.get_feature_importance(models)
        # Draw on an explicit Axes: calling plt.figure() and then DataFrame.plot()
        # without `ax` opened a second figure and left the first one empty.
        fig, ax = plt.subplots(figsize=figsize)

        feature_importance.iloc[-max_num_features:].plot(
            kind='barh', title='Feature importance',
            y='Agerage_Importance', xerr='importance_std',
            grid=True, align="center", ax=ax
        )
        ax.legend()
        plt.show()

In [None]:
def LGB_Beyes(subsample_freq,
              learning_rate,
              feature_fraction,
              max_depth,
              lambda_l1,
              lambda_l2):
    """Objective function for the Bayesian hyper-parameter search.

    Trains an ``Lgb_Model`` on the notebook-level ``reduce_train`` /
    ``ajusted_test`` frames with the proposed values, prints the resulting
    score and returns it (the optimiser maximises the returned value).
    """
    candidate = {
        'subsample_freq': subsample_freq,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2,
        'max_depth': max_depth,
    }
    fitted = Lgb_Model(reduce_train, ajusted_test, features, categoricals=categoricals, ps=candidate)
    kappa = fitted.score
    print('kappa: ', kappa)
    return kappa

# (lower, upper) search range for each argument of LGB_Beyes.
bounds_LGB = {
    'subsample_freq': (1, 3),
    'learning_rate': (0.025, 0.4),
    'feature_fraction': (0.5, 1),
    'lambda_l1': (1, 5),
    'lambda_l2': (1, 5),
    'max_depth': (15, 17),
}

# The leading `False` short-circuits the condition, so the Bayesian search below
# never runs and the hand-written values in the else-branch are always used.
if False and os.path.exists('/kaggle/input/data-science-bowl-2019/'):
    LGB_BO = BayesianOptimization(LGB_Beyes, bounds_LGB, random_state=1029)
    import warnings
    init_points = 16
    n_iter = 16
    # Silence warnings only for the duration of the search.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        LGB_BO.maximize(init_points=init_points, n_iter=n_iter, acq='ucb', xi=0.0, alpha=1e-6)
        
    # Keep the parameter set of the best-scoring trial.
    params = LGB_BO.max['params']
    print('\n', LGB_BO.max['params'])
else:
    # Fixed hyper-parameter overrides passed to Lgb_Model as `ps`.
    params = {
        'feature_fraction': 0.8,
        'lambda_l1': 2, 
        'lambda_l2': 3, 
        'learning_rate': 0.03, 
        'num_leaves': 2**8,
        'max_depth': 7, 
        'min_data_in_leaf': 50,
        'subsample_freq': 1
    }

In [None]:
# Constructing the model runs the full cross-validated fit (Base_Model.__init__ calls fit()).
# NOTE(review): reduce_train, ajusted_test, features and categoricals are not defined in
# the visible part of this notebook — confirm they exist before this cell runs.
lgb_model = Lgb_Model(reduce_train, ajusted_test, features, categoricals=categoricals, ps=params)

In [None]:
# Bar chart of the 100 features with the highest mean importance across the fold models.
lgb_model.plot_importance(lgb_model.models, max_num_features=100, figsize=(12, 28))

## Model Evaluation

## Submission

In [None]:
# Fold-averaged test predictions and out-of-fold training predictions (both continuous).
final_pred = lgb_model.y_pred
trian_pred_reg = lgb_model.oof_pred

print(final_pred.shape)

In [None]:
# improved
import scipy as sp
from functools import partial

from sklearn.metrics import cohen_kappa_score

class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Continuous predictions are cut into ``num_class`` ordered bins by
    ``num_class - 1`` thresholds; ``fit`` searches for the thresholds that
    minimise the negative QWK with Nelder-Mead.
    """
    def __init__(self, num_class=4):
        # Optimisation result; stays None until fit() has been called.
        self.coef_ = None
        # Starting thresholds 1.0, 1.5, 2.0, ... (num_class - 1 values).
        self.initial_coef = list(np.arange(1, (num_class+1)/2, 0.5))
        self.labels = list(range(num_class))

    def _bin_edges(self, coef):
        """Return the sorted thresholds padded with -inf / +inf for ``pd.cut``."""
        return [-np.inf] + list(np.sort(coef)) + [np.inf]

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients

        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :return: the negative quadratic weighted kappa (lower is better)
        """
        preds = pd.cut(X, self._bin_edges(coef), labels=self.labels)
        return -cohen_kappa_score(y, preds, weights = 'quadratic')

    def fit(self, X, y):
        """
        Optimize rounding thresholds

        :param X: The raw predictions
        :param y: The ground truth labels
        """
        # Import the submodule explicitly instead of relying on `sp.optimize`
        # having been loaded as a side effect of another scipy import.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X = X, y = y)
        self.coef_ = minimize(loss_partial, self.initial_coef, method = 'nelder-mead')

    def predict(self, X, coef=None):
        """
        Make predictions with specified thresholds

        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding;
            when omitted, the thresholds found by ``fit`` are used
        :return: the binned predictions as produced by ``pd.cut``
        """
        if coef is None:
            coef = self.coefficients()
        preds = pd.cut(X, self._bin_edges(coef), labels=self.labels)
        return preds

    def coefficients(self):
        """
        Return the optimized coefficients

        :raises RuntimeError: if ``fit`` has not been called yet
        """
        if self.coef_ is None:
            # Previously this surfaced as "'int' object is not subscriptable".
            raise RuntimeError('OptimizedRounder is not fitted yet; call fit() first.')
        return self.coef_['x']

In [None]:
# Fit the rounding thresholds on the out-of-fold predictions against the true labels.
optR = OptimizedRounder(num_class=4)
optR.fit(trian_pred_reg, reduce_train['accuracy_group'])
coefficients = optR.coefficients()

# Discretise the out-of-fold predictions with the fitted thresholds.
opt_preds = optR.predict(trian_pred_reg.reshape(-1, ), coefficients)

# RMSE of the raw (unrounded) out-of-fold predictions.
from sklearn.metrics import mean_squared_error
rmse_score = np.sqrt(mean_squared_error(reduce_train['accuracy_group'], lgb_model.oof_pred))
print(f'\nrmse_score is {rmse_score}\n')

# Quadratic weighted kappa of the rounded out-of-fold predictions.
off_score = cohen_kappa_score(reduce_train['accuracy_group'], opt_preds, weights='quadratic')
print(f'off_score is {off_score}\n')

print(f'coefficients is \n{coefficients}\n')

# Apply the same thresholds to the test predictions and write the submission file.
# NOTE(review): sample_submission is not loaded in the visible part of this notebook — confirm.
sample_submission['accuracy_group'] = optR.predict(final_pred.reshape(-1, ), coefficients).astype(int)
sample_submission.to_csv('submission.csv', index=False)
# Class shares of the submitted labels, followed by their histogram.
display(sample_submission['accuracy_group'].value_counts(normalize=True).sort_index())
sample_submission['accuracy_group'].hist()

In [None]:
%%time
result_scores = []
# Kept only so these module-level names still exist; f() no longer appends to them.
_smpl_train = []
_smpl_pred = []

def f():
    """Score one class-balanced resample of the rounded out-of-fold predictions.

    Draws 100 rows from each of the four accuracy groups (``sample`` raises if
    a group has fewer than 100 rows) and returns the quadratic weighted kappa
    on that balanced subset.
    """
    actual = reduce_train['accuracy_group'].copy()
    preds = pd.Series(opt_preds)
    # Local buffers: the previous module-level lists kept growing across calls,
    # so later scores were computed over ever larger, overlapping samples.
    sampled_actual, sampled_pred = [], []
    for i in range(4):
        # Positional mask, so `preds` (default index) is filtered correctly
        # even when reduce_train does not have a default integer index.
        in_class = (actual == i).to_numpy()
        sampled_actual.extend(actual[in_class].sample(100))
        sampled_pred.extend(preds[in_class].sample(100))
    return cohen_kappa_score(sampled_actual, sampled_pred, weights='quadratic')

# Repeat the balanced resampling 1000 times across all available cores.
result_scores = Parallel(n_jobs=-1, verbose=0)( [delayed(f)() for i in range(1000)] )

print(f'off_score is {off_score}\n')
pd.Series(result_scores).describe().round(3).to_frame()

In [None]:
def plot_cm(y_true, y_pred, figsize=(8, 8)):
    """Draw an annotated confusion-matrix heatmap.

    Rows are the actual labels and columns the predicted labels, both
    restricted to the labels present in ``y_true``. Every non-empty cell shows
    its share of the row in percent and the raw count; diagonal cells also
    show the row total.

    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :param figsize: size of the created figure
    """
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    row_totals = cm.sum(axis=1)
    cm_perc = cm / row_totals[:, None].astype(float) * 100
    # Object dtype: a fixed-width string array would truncate long annotations.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # Scalar row total; formatting a 1-element array with %d relies
                # on NumPy's deprecated array-to-scalar conversion.
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, row_totals[i])
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm_frame = pd.DataFrame(cm, index=labels, columns=labels)
    cm_frame.index.name = 'Actual'
    cm_frame.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm_frame, cmap=plt.cm.Blues, square=True, annot=annot, fmt='', ax=ax)


# Overlay histograms of the actual labels and the rounded out-of-fold predictions.
plt.title('Result')
reduce_train['accuracy_group'].hist(align='left', rwidth=0.4, color='tab:orange', label='actual')
pd.Series(opt_preds).hist(align='mid', rwidth=0.4, color='tab:blue', label='predict')
plt.legend()
plt.show()

# Confusion matrix of actual vs. rounded out-of-fold predictions.
plot_cm(reduce_train['accuracy_group'], opt_preds)