In [1]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from skopt import BayesSearchCV

import tensorflow as tf

pd.options.display.max_columns = 100
%matplotlib inline
plt.style.use('seaborn')

  from numpy.core.umath_tests import inner1d
  from ._conv import register_converters as _register_converters


In [2]:
# Hero-slot column names: slots 0-4 are the radiant columns, 5-9 the dire ones.
radiant_cols = ['hero_{}'.format(slot) for slot in range(0, 5)]
dire_cols = ['hero_{}'.format(slot) for slot in range(5, 10)]
no_heroes = 116  # presumably the number of distinct heroes -- TODO confirm

# Continuous variables listed here get no 'min_'/'max_' companion feature.
filter_cols = ['base_mr']

# Continuous per-hero attributes (kept in this exact order).
cont_variables = [
    'agi_gain', 'attack_range', 'attack_rate',
    'base_agi', 'base_armor', 'base_attack_max', 'base_attack_min',
    'base_health_regen', 'base_int', 'base_mr', 'base_str',
    'int_gain', 'legs', 'move_speed',
    'pro_ban', 'pro_pick', 'pro_win',
    'projectile_speed', 'str_gain', 'turn_rate',
]

unique_roles = [
    'Carry', 'Disabler', 'Durable', 'Escape', 'Initiator',
    'Jungler', 'Nuker', 'Pusher', 'Support',
]

unique_primary_attrs = ['agi', 'int', 'str']

# Full feature list, in order:
#   1. every continuous variable under its own name,
#   2. 'min_<var>' for every continuous variable not in filter_cols,
#   3. 'max_<var>' for every continuous variable not in filter_cols,
#   4. 'no_<attr>' per primary attribute, 'no_melees', 'no_<role>' per role.
feature_names = (
    list(cont_variables)
    + [prefix + name
       for prefix in ('min_', 'max_')
       for name in cont_variables
       if name not in filter_cols]
    + ['no_' + attr for attr in unique_primary_attrs]
    + ['no_melees']
    + ['no_' + role for role in unique_roles]
)

def load_data(data_dir='../data'):
    """Load the match table and both per-side feature tables, then join them.

    Reads ``matches_data.csv``, ``radiant_features.csv`` and
    ``dire_features.csv`` from ``data_dir``, checks that the three tables
    describe the same matches in the same row order, prefixes every feature
    column with its side and concatenates everything column-wise.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the three CSV files. Defaults to ``'../data'``,
        which is the location the notebook originally hard-coded.

    Returns
    -------
    radiants : pandas.DataFrame
        Radiant features with each column in ``feature_names`` renamed to
        ``'radiant_' + name``; ``match_id`` is dropped.
    dires : pandas.DataFrame
        Dire features with each column in ``feature_names`` renamed to
        ``'dire_' + name``; ``match_id`` is dropped.
    df : pandas.DataFrame
        ``match_id`` and ``radiant_win`` from the match table followed by the
        radiant and then the dire feature columns.

    Raises
    ------
    AssertionError
        If the tables disagree on columns, row count, feature count or
        ``match_id`` order.
    """
    df = pd.read_csv('{}/matches_data.csv'.format(data_dir))
    radiants = pd.read_csv('{}/radiant_features.csv'.format(data_dir))
    dires = pd.read_csv('{}/dire_features.csv'.format(data_dir))

    # Index.equals returns False for indexes of different length, whereas an
    # element-wise `==` raises ValueError and would hide the message below.
    assert radiants.columns.equals(dires.columns), \
        'Radiants have different features than dires'
    assert df.shape[0] == radiants.shape[0], \
        'Number of matches in radiants are different to original matches data'
    assert df.shape[0] == dires.shape[0], \
        'Number of matches in dires are different to original matches data'

    # Each feature table holds every name in feature_names plus match_id.
    expected_n_cols = len(feature_names) + 1
    assert expected_n_cols == radiants.shape[1], \
        'Radiants do not have len(feature_names) + 1 columns'
    assert expected_n_cols == dires.shape[1], \
        'Dires do not have len(feature_names) + 1 columns'

    # The final concat is positional, so the rows must already line up.
    assert (radiants['match_id'] == dires['match_id']).all(), \
        'match_id order differs between radiants and dires'
    assert (radiants['match_id'] == df['match_id']).all(), \
        'match_id order differs between radiants and original matches data'

    radiants = radiants.\
        rename({col: 'radiant_' + col for col in feature_names}, axis=1).\
        drop('match_id', axis=1)
    dires = dires.\
        rename({col: 'dire_' + col for col in feature_names}, axis=1).\
        drop('match_id', axis=1)

    df = pd.concat([df[['match_id', 'radiant_win']], radiants, dires], axis=1)

    return radiants, dires, df

In [3]:
# Materialise the three frames returned by load_data(): the radiant feature
# table, the dire feature table, and the combined match-level frame.
radiants, dires, df = load_data()

In [4]:
# Cross-validated sweep over regularisation strengths, comparing an
# L2-penalised ('model') and an L1-penalised ('model_1') logistic regression.
# After the cell runs, scores[k][j] / losses[k][j] hold the mean accuracy /
# mean log-loss over the folds for model k at l_values[j].
n_splits = 5
kf = KFold(n_splits, shuffle=True, random_state=10)

print('=================================================')

avg_score = {}  # not used in this cell; kept in case another cell reads it (TODO confirm)
scores = {'model': [], 'model_1': []}
losses = {'model': [], 'model_1': []}

l_values = [0.001, 0.01, 0.5, 1, 1.5, 2, 3, 7, 10, 15, 20, 25]

# Loop-invariant inputs: radiant and dire features side by side, plus the
# target. Built once instead of once per (l, fold) pair.
X_all = np.concatenate([radiants.values, dires.values], axis=1)
y_all = df.radiant_win.values

for l in l_values:

    print(l, ' ', end='')

    # scikit-learn's C is the inverse of the regularisation strength.
    C = 1 / l

    # Per-fold sums for this value of l; turned into means below.
    score = dict(model=0, model_1=0)
    loss = dict(model=0, model_1=0)

    for train_idx, test_idx in kf.split(df):

        model = LogisticRegression(random_state=0, solver='saga', C=C, penalty='l2', n_jobs=4, max_iter=200)
        model_1 = LogisticRegression(random_state=0, solver='saga', C=C, penalty='l1', n_jobs=4, max_iter=200)

        X_train = X_all[train_idx]
        X_test = X_all[test_idx]

        y_train = y_all[train_idx]
        y_test = y_all[test_idx]

        # Fit the scaler on the training fold only, then apply it to both folds.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model.fit(X_train, y_train)
        model_1.fit(X_train, y_train)

        # Probability of the positive class (radiant_win).
        ypreds = model.predict_proba(X_test)[:, 1]
        ypreds_1 = model_1.predict_proba(X_test)[:, 1]

        score['model'] += accuracy_score(y_test, ypreds > 0.5)
        loss['model'] += log_loss(y_test, ypreds)

        score['model_1'] += accuracy_score(y_test, ypreds_1 > 0.5)
        loss['model_1'] += log_loss(y_test, ypreds_1)

    for k in ['model', 'model_1']:

        # Store the scalar fold mean for this model. Appending the whole
        # `score` / `loss` dict would leave lists of dicts, not of numbers.
        score[k] /= n_splits
        scores[k].append(score[k])

        loss[k] /= n_splits
        losses[k].append(loss[k])

print('\nDone.')
          
############################################################################

# print('Avg threshold', avg_threshold / (n_splits * len(models_dict)), '\n\n')
# print('Number of features:', len(feature_columns))
# print('Features:', feature_columns, '\n\n')

0.001  



0.01  0.5  1  1.5  2  3  7  10  15  20  25  
Done.
