In [1]:
import sys
sys.path.append('../')
from pathlib import Path
import requests
import os
if not os.path.isdir('./.data'):
    os.mkdir('./.data')
import json
import numpy as np
import pandas as pd
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
import matplotlib.pyplot as plt
%matplotlib inline

import time
import datetime
import pytz
import warnings; warnings.simplefilter('ignore')

In [2]:
# Sklearn stuff
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
#from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier

SEED = 17

In [36]:
# Import Training and Test Data  
train_features = pd.read_csv('data2/dpc_train_features.csv', index_col='match_id')
train_targets = pd.read_csv('data2/dpc_train_target.csv', index_col='match_id')
test_features = pd.read_csv('data2/dpc_test_features.csv', index_col='match_id')
test_targets = pd.read_csv('data2/dpc_test_target.csv', index_col='match_id')

In [37]:
train_features.shape, train_targets.shape, test_features.shape, test_targets.shape

((1045, 66), (1045, 2), (117, 66), (117, 2))

In [54]:
# Label targets
y_train = train_targets.radiant_win.values
y_test = test_targets.radiant_win.values

## 

In [41]:
df_full_features = pd.concat([train_features, test_features])

# Index to split the training and test data sets
idx_split = train_features.shape[0]

heroes_df = df_full_features[[f'{i}hero_id' for i in range(1, 23)]]

In [40]:
# Quick check that hero ids are unique (i.e., no double picking)
# https://www.kaggle.com/kuzand/dota-2-winner-prediction-multilayer-nn-pytorch
np.all(df_full_features[[f'{i}hero_id' for i in range(1, 23)]].nunique(axis=1) == 22)

True

Hot encoding by getting dummies for each hero_id feature. 

In [42]:
for i in range(1,23):
        df_full_features = pd.get_dummies(df_full_features, columns = [f'{i}hero_id'])

### Scaling 

Standard Scaler, skipping hot encoding since trying out cat vs adaboost

In [47]:
df_full_features_scaled1 = df_full_features.copy()
df_full_features_scaled[df_full_features.columns.tolist()] = StandardScaler().fit_transform(df_full_features_scaled[df_full_features.columns.tolist()])  # alternatively use StandardScaler

In [45]:
df_full_features_scaled2 = df_full_features.copy()
df_full_features_scaled[df_full_features.columns.tolist()] = MinMaxScaler().fit_transform(df_full_features_scaled[df_full_features.columns.tolist()])  # alternatively use StandardScaler

In [48]:
df_full_features_scaled1.head()

Unnamed: 0_level_0,1is_pick,1team,2is_pick,2team,3is_pick,3team,4is_pick,4team,5is_pick,5team,6is_pick,6team,7is_pick,7team,8is_pick,8team,9is_pick,9team,10is_pick,10team,11is_pick,11team,12is_pick,12team,13is_pick,13team,14is_pick,14team,15is_pick,15team,16is_pick,16team,17is_pick,17team,18is_pick,18team,19is_pick,19team,20is_pick,20team,21is_pick,21team,22is_pick,22team,1hero_id_2,1hero_id_3,1hero_id_4,1hero_id_6,1hero_id_7,1hero_id_9,1hero_id_10,1hero_id_16,1hero_id_18,1hero_id_19,1hero_id_23,1hero_id_25,1hero_id_27,1hero_id_29,1hero_id_31,1hero_id_33,1hero_id_37,1hero_id_38,1hero_id_40,1hero_id_41,1hero_id_42,1hero_id_43,1hero_id_46,1hero_id_52,1hero_id_53,1hero_id_55,1hero_id_57,1hero_id_58,1hero_id_59,1hero_id_60,1hero_id_61,1hero_id_62,1hero_id_65,1hero_id_66,1hero_id_68,1hero_id_69,1hero_id_71,1hero_id_72,1hero_id_73,1hero_id_74,1hero_id_75,1hero_id_76,1hero_id_79,1hero_id_80,1hero_id_81,1hero_id_82,1hero_id_83,1hero_id_84,1hero_id_85,1hero_id_86,1hero_id_87,1hero_id_88,1hero_id_89,1hero_id_90,1hero_id_91,1hero_id_93,...,22hero_id_1,22hero_id_2,22hero_id_4,22hero_id_5,22hero_id_6,22hero_id_7,22hero_id_8,22hero_id_10,22hero_id_11,22hero_id_12,22hero_id_13,22hero_id_14,22hero_id_15,22hero_id_16,22hero_id_17,22hero_id_18,22hero_id_19,22hero_id_21,22hero_id_22,22hero_id_23,22hero_id_25,22hero_id_26,22hero_id_27,22hero_id_28,22hero_id_29,22hero_id_32,22hero_id_33,22hero_id_34,22hero_id_35,22hero_id_36,22hero_id_37,22hero_id_38,22hero_id_39,22hero_id_40,22hero_id_41,22hero_id_42,22hero_id_43,22hero_id_44,22hero_id_45,22hero_id_46,22hero_id_47,22hero_id_48,22hero_id_49,22hero_id_50,22hero_id_51,22hero_id_52,22hero_id_53,22hero_id_54,22hero_id_55,22hero_id_56,22hero_id_57,22hero_id_58,22hero_id_59,22hero_id_60,22hero_id_61,22hero_id_62,22hero_id_63,22hero_id_65,22hero_id_67,22hero_id_68,22hero_id_70,22hero_id_71,22hero_id_72,22hero_id_73,22hero_id_74,22hero_id_75,22hero_id_76,22hero_id_77,22hero_id_78,22hero_id_79,22hero_id_80,22hero_id_81,22hero_id_82,22hero_id_84,22hero_id_85,22hero_id_86,22hero_id_88,22hero_id_89,22hero_id_92,22hero_id_93,22hero_id_94,22hero_id_95,22hero_id_96,22hero_id_97,22hero_id_98,22hero_id_99,22hero_id_101,22hero_id_102,22hero_id_104,22hero_id_106,22hero_id_107,22hero_id_108,22hero_id_109,22hero_id_110,22hero_id_112,22hero_id_113,22hero_id_114,22hero_id_120,22hero_id_121,22hero_id_129
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1
5115031896,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4865407245,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4889341660,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5050335216,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4860032461,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [49]:
df_full_features_scaled.shape

(1162, 2088)

## Set targets, features, and verification set 

Should the combined train sets still have hero variables? did I do this right?

In [23]:
# Encoded train/test features

X_train = train_features
#y_train = train_targets['radiant_win']
X_test = test_features
#y_test = test_targets['radiant_win']

In [22]:
#scaler = StandardScaler()
#scaler.fit(X_train)
#scaler.fit(X_test)
#X_train = scaler.transform(X_train)
#X_test = scaler.transform(X_test)

#dpc_log_scaled = dpc_log.fit(X_scaled_train, y_train)

### Train Validation Split

Train Validation Split

For now, let's just try this out with the hero_ohe feature set

In [26]:
heroes_df_train = X_train[[f'{i}hero_id' for i in range(1, 23)]]
heroes_df_test = X_test[[f'{i}hero_id' for i in range(1, 23)]]

In [30]:
y_train = train_targets['radiant_win'].values
y_test = test_targets['radiant_win'].values
X_train = heroes_df_train
X_test = heroes_df_test

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.1, random_state = SEED) ## see top, SEED = 17

In [32]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((940, 22), (105, 22), (940,), (105,))

In [18]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
#We'll use full_df_mod without dummies and mark categorial vars
X_, X_valid_ctb, y_train_part_ctb, y_valid_ctb = train_test_split(full_df_mod[:idx_split], 
                                                                        y_train, test_size = 0.3, random_state=0) #fixing random_state
cat_vars = ['r_invar_herotype_sum', 'd_invar_herotype_sum'] + r_firstblood + d_firstblood #all the vars that we got dummies of

#Let it train for 200 iterations not to wait too long
ctb = CatBoostClassifier(iterations = 200, random_state=1, verbose=False, task_type='GPU', eval_metric='AUC', cat_features=cat_vars)

#We'll look at an online validation plot
ctb.fit(X_train_part_ctb, y_train_part_ctb.astype(float), eval_set=(X_valid_ctb, y_valid_ctb.astype(float)), plot=True)

ctb_ho_score =  roc_auc_score(y_valid_ctb.astype(float), ctb.predict_proba(X_valid_ctb)[:,1])
ctb_cv_score = cross_val_score(ctb, full_df_mod[:idx_split], y_train.astype(float), cv=cv, scoring = 'roc_auc') 