In [4]:
import os, sys
import datetime
from time import time
from tqdm import tqdm
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import cohen_kappa_score
import category_encoders as ce

from catboost import CatBoostRegressor
import lightgbm as lgb
import xgboost as xgb

from functools import partial
import scipy as sp              # for optimize.minimize()

In [5]:
# Execution environment setting
Kaggle = False

if Kaggle:
    DIR = '../input/data-science-bowl-2019'
    task_type = 'CPU'
else:
    DIR = 'data'
    task_type = 'GPU'

## Observe the data

In [6]:
train = pd.read_csv(os.path.join(DIR,'train.csv'))
train_labels = pd.read_csv(os.path.join(DIR,'train_labels.csv'))
specs = pd.read_csv(os.path.join(DIR,'specs.csv'))
test = pd.read_csv(os.path.join(DIR,'test.csv'))

In [7]:
print('train:\t\t',train.shape)
print('train_labels:\t',train_labels.shape)
print('specs:\t\t',specs.shape)
print('test:\t\t',test.shape)

train:		 (11341042, 11)
train_labels:	 (17690, 7)
specs:		 (386, 3)
test:		 (1156414, 11)


In [8]:
# make 'title' and 'event_code' list
title_list = list(set(train['title'].value_counts().index) \
                   .union(set(test['title'].value_counts().index)))
event_code_list = list(set(train['event_code'].value_counts().index) \
                   .union(set(test['event_code'].value_counts().index)))
print("title_list:\t\t",len(title_list))
print("event_code_list:\t\t",len(event_code_list))

title_list:		 44
event_code_list:		 42


In [9]:
# makes dict 'title to number(integer)'
title2num = dict(zip(title_list, np.arange(len(title_list))))
# makes dict 'number to title'
num2title = dict(zip(np.arange(len(title_list)), title_list))
# makes dict 'title to win event_code' 
# (4100 except 'Bird Measurer' and 4110 for 'Bird Measurer'))
title2win_code = dict(zip(title2num.values() \
                    ,(np.ones(len(title2num))).astype('int') * 4100))
title2win_code[title2num['Bird Measurer (Assessment)']] = 4110

In [10]:
# Convert 'title' to the number
train['title'] = train['title'].map(title2num)
test['title'] = test['title'].map(title2num)
train_labels['title'] = train_labels['title'].map(title2num)

# Convert 'timestamp' to datetime
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [50]:
# Convert the raw data into processed features
def get_data(user_sample, test_set=False):
    '''
    user_sample : DataFrame from train/test group by 'installation_id'
    test_set    : related with the labels processing
    '''
    # Constants and parameters declaration
    user_assessments = []
    last_type = 0
    types_count = {'Clip':0, 'Activity':0, 'Assessment':0, 'Game':0}
    time_first_activity = float(user_sample['timestamp'].values[0])
    time_spent_each_title = {title:0 for title in title_list}
    event_code_count = {code:0 for code in event_code_list}
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    
    accumu_accuracy_group = 0
    accumu_accuracy=0
    accumu_win_n = 0 
    accumu_loss_n = 0 
    accumu_actions = 0
    counter = 0
    durations = []
    
    # group by 'game_session'
    for i, session in user_sample.groupby('game_session', sort=False):
        # i      : game_session_id
        # session: DataFrame from user_sample group by 'game_session'
        session_type = session['type'].iloc[0]  # Game/Assessment/Activity/Clip
        session_title = session['title'].iloc[0]
        
        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)   # [sec]
            time_spent_each_title[num2title[session_title]] += time_spent
        
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            # search for event_code 4100(4110)
            all_4100 = session.query(f'event_code == \
                                         {title2win_code[session_title]}')
            # numbers of wins and losses
            win_n = all_4100['event_data'].str.contains('true').sum()
            loss_n = all_4100['event_data'].str.contains('false').sum()

            # init features and then update
            features = types_count.copy()
            features.update(time_spent_each_title.copy())
            features.update(event_code_count.copy())
            features['session_title'] = session_title
            features['accumu_win_n'] = accumu_win_n
            features['accumu_loss_n'] = accumu_loss_n
            accumu_win_n += win_n
            accumu_loss_n += loss_n
            
            features['day_of_the_week'] = (session['timestamp'].iloc[-1]). \
                                            strftime('%A')    # Mod 2019-11-17

            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)

            # average of the all accuracy of this player
            features['accuracy_ave'] = accumu_accuracy / counter \
                                                if counter > 0 else 0
            accuracy = win_n / (win_n + loss_n) \
                                   if (win_n + loss_n) > 0 else 0
            accumu_accuracy += accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # average of accuracy_groups of this player
            features['accuracy_group_ave'] = \
                    accumu_accuracy_group / counter if counter > 0 else 0
            accumu_accuracy_group += features['accuracy_group']
            
            # how many actions the player has done in this game_session
            features['accumu_actions'] = accumu_actions
            
            # if test_set, all sessions belong to the final dataset
            # elif train, needs to be passed throught this clausule
            if test_set or (win_n + loss_n) > 0:
                user_assessments.append(features)

        

        # how many actions was made in each event_code
        event_codes = Counter(session['event_code'])
        for key in event_codes.keys():
            event_code_count[key] += event_codes[key]

        # how many actions the player has done
        accumu_actions += len(session)
        if last_type != session_type:
            types_count[session_type] += 1
            last_type = session_type
  
    # if test_set, only the last assessment must be predicted,
    # the previous are scraped
    if test_set:
        return user_assessments[-1]
    

    return user_assessments





In [43]:
# get_data function is applyed to each installation_id
compiled_data = []
installation_n = train['installation_id'].nunique()
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby( \
                                     'installation_id', sort=False)),
                                     total=installation_n):
    # user_sample : DataFrame group by 'installation_id'
    compiled_data += get_data(user_sample)

100%|██████████| 17000/17000 [03:20<00:00, 84.68it/s]


In [44]:
# the compiled_data is converted to DataFrame and deleted to save memmory
new_train = pd.DataFrame(compiled_data)
del compiled_data

In [47]:
new_train.head(1)

Unnamed: 0,Clip,Activity,Assessment,Game,Crystal Caves - Level 1,Welcome to Lost Lagoon!,Egg Dropper (Activity),Tree Top City - Level 1,Flower Waterer (Activity),Crystal Caves - Level 3,Tree Top City - Level 3,Pirate's Tale,Tree Top City - Level 2,Chow Time,Bubble Bath,Dino Dive,Rulers,Magma Peak - Level 1,Crystal Caves - Level 2,Bottle Filler (Activity),Bug Measurer (Activity),Watering Hole (Activity),Treasure Map,"Heavy, Heavier, Heaviest",Slop Problem,Happy Camel,All Star Sorting,Magma Peak - Level 2,Lifting Heavy Things,Pan Balance,Dino Drink,Leaf Leader,Honey Cake,Ordering Spheres,Mushroom Sorter (Assessment),Cauldron Filler (Assessment),12 Monkeys,Sandcastle Builder (Activity),Scrub-A-Dub,Cart Balancer (Assessment),Costume Box,Chicken Balancer (Activity),Crystals Rule,Balancing Act,Air Show,Bird Measurer (Assessment),Fireworks (Activity),Chest Sorter (Assessment),2050,4100,2060,4110,2070,2075,2080,2081,2083,3110,3120,3121,4220,4230,5000,4235,5010,4010,4020,4021,4022,4025,4030,4031,3010,4035,4040,3020,3021,4045,2000,4050,2010,2020,4070,2025,2030,4080,2035,2040,4090,4095,session_title,accumu_win_n,accumu_loss_n,day_of_the_week,duration_mean,accuracy_ave,accuracy_group,0,1,2,3,accuracy_group_ave,accumu_actions
0,6,3,0,2,0,0,0,0,164,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,163,0,0,0,0,0,0,0,0,0,0,89,115,0,0,0,0,0,0,0,91,0,6,0,0,0,0,0,4,1,2,77,7,9,0,0,0,0,0,4,92,14,31,19,121,0,79,1,0,7,9,0,18,0,0,20,94,4,18,0,0,6,4,0,30,0,0,Tuesday,0.0,0,3,0,0,0,0,0,647


In [51]:
# process test set, the same that was done with the train set
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id',sort=False),
                                total=1000):
    new_test.append(get_data(user_sample, test_set=True))
    
new_test = pd.DataFrame(new_test)

100%|██████████| 1000/1000 [00:21<00:00, 47.22it/s]


In [52]:
new_test.head(1)

Unnamed: 0,Clip,Activity,Assessment,Game,Crystal Caves - Level 1,Welcome to Lost Lagoon!,Egg Dropper (Activity),Tree Top City - Level 1,Flower Waterer (Activity),Crystal Caves - Level 3,Tree Top City - Level 3,Pirate's Tale,Tree Top City - Level 2,Chow Time,Bubble Bath,Dino Dive,Rulers,Magma Peak - Level 1,Crystal Caves - Level 2,Bottle Filler (Activity),Bug Measurer (Activity),Watering Hole (Activity),Treasure Map,"Heavy, Heavier, Heaviest",Slop Problem,Happy Camel,All Star Sorting,Magma Peak - Level 2,Lifting Heavy Things,Pan Balance,Dino Drink,Leaf Leader,Honey Cake,Ordering Spheres,Mushroom Sorter (Assessment),Cauldron Filler (Assessment),12 Monkeys,Sandcastle Builder (Activity),Scrub-A-Dub,Cart Balancer (Assessment),Costume Box,Chicken Balancer (Activity),Crystals Rule,Balancing Act,Air Show,Bird Measurer (Assessment),Fireworks (Activity),Chest Sorter (Assessment),2050,4100,2060,4110,2070,2075,2080,2081,2083,3110,3120,3121,4220,4230,5000,4235,5010,4010,4020,4021,4022,4025,4030,4031,3010,4035,4040,3020,3021,4045,2000,4050,2010,2020,4070,2025,2030,4080,2035,2040,4090,4095,session_title,accumu_win_n,accumu_loss_n,day_of_the_week,duration_mean,accuracy_ave,accuracy_group,0,1,2,3,accuracy_group_ave,accumu_actions
0,5,3,1,3,0,0,88,0,0,0,0,0,0,135,0,188,0,0,0,0,33,0,0,0,0,0,1960,0,0,0,0,0,0,0,0,0,0,90,0,0,0,0,0,0,0,0,194,0,0,1,1,0,1,0,0,0,0,108,33,9,0,0,0,0,0,3,120,12,0,10,148,0,111,33,0,33,9,0,25,0,1,11,190,1,7,0,0,0,0,0,31,1,0,Thursday,30.0,0,0,0,0,0,1,0,867


In [53]:
# all_features but 'accuracy_group', that is the label y
all_features = [x for x in new_train.columns if x not in ['accuracy_group']]
# categorical feature
categorical_features = ['session_title','day_of_the_week']

In [54]:
# Encode categorical_features to integer(for use with LightGB,XGBoost,etc)

# concatnate train and test data
temp_df = pd.concat([new_train[all_features], new_test[all_features]])
# encode
encoder = ce.ordinal.OrdinalEncoder(cols = categorical_features)
temp_df = encoder.fit_transform(temp_df)
# dataset
X, y = temp_df.iloc[:len(new_train),:], new_train['accuracy_group']
X_test = temp_df.iloc[len(new_train):,:]

In [55]:
del train,test

## Step 1 : Create Regressor Models
Used ** CatBoost **, ** XGBoost **, ** LightGBM **.

In [56]:
# Create multiple datasets to create multiple models (not for CV).
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

### - CatBoost

In [59]:
# makes the model and set the parameters
def make_CatBoost(task_type):
    model = CatBoostRegressor(
        iterations=5000,
        learning_rate=0.02,
        loss_function='RMSE',
        random_seed=42,
        depth=10,                            # add
        border_count=108,                    # add
        bagging_temperature=2.348502,        # add
        task_type=task_type,
        early_stopping_rounds=200
    )
    return model

In [60]:

def train_catBoost():
    
    # CatBoost
    start_time = time()
    cat_models = []
    scores = []

    # Train and make models
    for fold, (train_ids, test_ids) in enumerate(folds.split(X, y)):
        print('● Fold :', fold+1,'/',NFOLDS)
        model = make_CatBoost(task_type)
        model.fit(X.loc[train_ids, all_features], y.loc[train_ids], 
                eval_set=(X.loc[test_ids, all_features], y.loc[test_ids]),
                use_best_model=False,
                verbose=500,
                cat_features=categorical_features)    
        cat_models.append(model)
        
    print('Time:', time() - start_time)

### -XGBoost

In [61]:

def train_xgboost():
# XGBoost
    start_time = time()
    xgb_models = []
    scores = []

    params = {
        'max_depth': 9,                 # 6           # mod 10→9
        'learning_rate': 0.01,          # = eta 0.1: [0,1]
        'objective': 'reg:linear',                    # add
        'n_estimators' : 300,           # 100
        'subsample': 0.6,               # 1, (0,1]    # mod 0.8→0.6
        'colsample_bytree': 1.0,        # 1, (0, 1]   # mod 0.8→1.0
        'gamma': 0.0,                                 # add
        'min_child_weight': 5,                        # add
        'seed' : 42,
    }

    # Train and make models
    for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
        print('● Fold :', fold+1,'/',NFOLDS)
        dtrain = xgb.DMatrix(X.iloc[train_ids], y[train_ids])
        dval = xgb.DMatrix(X.iloc[val_ids], y[val_ids])
        model = xgb.train(params=params,
                        dtrain=dtrain,
                        num_boost_round=5000,
                        evals=[(dtrain, 'train'), (dval, 'val')],
                        early_stopping_rounds=100,
                        verbose_eval=100
                        )
        xgb_models.append(model)
        
    print('Time:', time() - start_time)

### - LightGBM

In [62]:
def train_lightGBM():
    # LightGBM
    start_time = time()
    lgb_models = []
    scores = []

    params = {
        'n_jobs': -1,
        'seed': 42,
        'boosting_type': 'gbdt',
        'objective': 'regression',
    #     'num_iteration': 100,           # add
        'metric': 'rmse',
        'eval_metric': 'cappa',
        'subsample': 0.75,
        'feature_fraction':0.998495,    # add
        'bagging_fraction': 0.872417,   # mod 0.8→
        'bagging_freq': 1,              # add
        'colsample_bytree': 0.8,        # add
        'subsample_freq': 1,
        'learning_rate': 0.02,
        'feature_fraction': 0.9,
        'max_depth': 13,                # mod 10→
        'num_leaves': 1028,             # mod      # 2^max_depth < num_leaves
        'min_gain_to_split':0.085502,   # add
        'min_child_weight':1.087712,    # add
        'lambda_l1': 1,  
        'lambda_l2': 1,
        'verbose': 100,
    }

    # Train and make models
    for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
        print('● Fold :', fold+1,'/',NFOLDS)
        train_set = lgb.Dataset(X.iloc[train_ids], y[train_ids],
                            categorical_feature=categorical_features)
        val_set = lgb.Dataset(X.iloc[val_ids], y[val_ids],
                            categorical_feature=categorical_features)
        model = lgb.train(params=params,
                        train_set=train_set,
                        valid_sets=[train_set, val_set],
                        num_boost_round=5000,
                        early_stopping_rounds=100,    # del
                        verbose_eval=200
                        )
        lgb_models.append(model)
        
    print('\nTime:', time() - start_time)

In [64]:
from knockknock import slack_sender

webhook_url = "<https://hooks.slack.com/services/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>"
@slack_sender(webhook_url=webhook_url, channel="<pbskids>")
def train():
    train_catBoost()
    train_xgboost()
    train_lightGBM()



train()


InvalidSchema: No connection adapters were found for '<https://hooks.slack.com/services/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>'

## Step 2 : Predict each Model

In [0]:
preds = []

# CatBoost models
for model in cat_models:
    pred = model.predict(X)
    preds.append(pred)
    
# XGBoost models
for model in xgb_models:
    pred = model.predict(xgb.DMatrix(X))
    pred = pred.flatten()
    preds.append(pred)
    
# LightGBM models
for model in lgb_models:
    pred = model.predict(X,num_iteration=model.best_iteration)
    pred = pred.reshape(len(X),1).flatten()
    preds.append(pred)

df = pd.DataFrame(preds).T
df.columns = ['C1','C2','C3','C4','C5',   # CatBoost
              'X1','X2','X3','X4','X5',   # XGBoost
              'L1','L2','L3','L4','L5']   # LightGBM

In [0]:
# Calculate the average value of each model pred
df['mean'] = df.mean(axis = 'columns')
df.head(10)

## Step 3 : Optimize Rounding Coefficients
The rounding coefficient is optimized using the average value of the prediction results of each model. Optimization uses `scipy.optimize.minimize()`.

In [0]:
class OptRounder(object):
    def __init__(self):
        self.res_ = []
        self.coef_ = []
        
    def get_res(self):
        return self.res_
    
    # objective function
    def func(self, coef, X, y):
        kappa = cohen_kappa_score(self.bincut(coef, X), y, weights='quadratic')
        return -kappa

    def bincut(self, coef, X):
        return pd.cut(X,
                      [-np.inf] + list(np.sort(coef)) + [np.inf],
                      labels = [0, 1, 2, 3])
        
    def fit(self, X, y):
        pfunc = partial(self.func, X=X, y=y)
        self.res_ = sp.optimize.minimize(fun = pfunc,           # objective function
                                         x0 = [0.6, 1.5, 2.4],  # initial coefficients
                                         method='nelder-mead')  # solver
        self.coef_ = self.res_.x
        
    def predict(self, X, coef):
        return self.bincut(coef, X)

In [0]:
optR = OptRounder()
optR.fit(df['mean'].values.reshape(-1,), y)
res = optR.get_res()        # Optimized result

print('●Iterations performed\t:',res.nit)
print('●Optimized coefficients\t:',res.x)
print('●Cohen Kappa score\t:',-res.fun)

coefficients = res.x        # Optimized coefficients

## Step 4 : Final Classification

In [0]:
# final classification
df['predict'] = optR.predict(df['mean'].values, coefficients).astype(int)

df['y'] = y
df[['mean','predict','y']].head(10)

In [0]:
df[['mean','predict','y']].plot(subplots=True,layout=(1, 3),figsize=(11, 3),kind='hist')

In [0]:
# binning plot of 'pred' versus 'y'
df.plot.hexbin(x='y', y='predict', gridsize=(3,3), sharex=False, title = "binning 'pred' vs 'y'")

## Make submission

In [0]:
preds = []
for model in cat_models:        # CatBoost
    pred = model.predict(X_test)
    preds.append(pred)
for model in xgb_models:        # XGBoost
    pred = model.predict(xgb.DMatrix(X_test))
    pred = pred.flatten()
    preds.append(pred)
for model in lgb_models:        # LightGBM
    pred = model.predict(X_test,num_iteration=model.best_iteration)
    pred = pred.reshape(len(X_test),1).flatten()
    preds.append(pred)
df_s = pd.DataFrame(preds).T

df_s['mean'] = df_s.mean(axis = 'columns')

# Classification
df_s['pred'] = optR.predict(df_s['mean'].values, coefficients).astype(int)

print(df_s.shape)
df_s[['mean','pred']].head(10)

In [0]:
df_s[['mean','pred']].plot(subplots=True, layout=(1, 2), figsize=(7, 3), kind='hist')

In [0]:
submission = pd.read_csv(os.path.join(DIR,'sample_submission.csv'))
submission['accuracy_group'] = df_s['pred']
submission.head(10)

In [0]:
submission.to_csv('submission.csv', index=None)