<center>
<img src="https://habrastorage.org/files/fd4/502/43d/fd450243dd604b81b9713213a247aa20.jpg" />
</center> 
     

## <center> Kaggle inclass competition from [mlcourse.ai](https://mlcourse.ai/)
    
# <center> [**Dota 2 Winner Prediction**](https://www.kaggle.com/c/mlcourse-dota2-win-prediction/)

### <center> Session: Fall 2019

#### <div style="text-align: right"> Author: [Vladimir Kulyashov](https://www.kaggle.com/vovkaperm)


<div style="text-align: right"> creation date: 29 October 2019 </div>

In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [2]:
PATH_TO_DATA = './data/'
PATH_TO_PREDS = './predictions/'
SEED = 17

skf = StratifiedKFold(n_splits=10, random_state=SEED)
logit = LogisticRegression(random_state=SEED, solver='lbfgs')

In [3]:
# Useful def's

def diff_of_quat(a1, a2):
    if a1 + a2 == 0: return 0
    else: return (a1-a2) / (a1 + a2)


def create_team_feature(feature):
    r_tmp = full_df[[f'r{i}_{feature}' for i in range(1,6)]]
    d_tmp = full_df[[f'd{i}_{feature}' for i in range(1,6)]]
    new_df[feature] = pd.concat([r_tmp.sum(axis=1), d_tmp.sum(axis=1)], axis=1).apply(lambda x: diff_of_quat(*x), axis=1)
    
def write_submission_file(predictions, filename):
    sub = pd.read_csv(path.join(PATH_TO_DATA,'sample_submission.csv'))
    sub['radiant_win_prob'] = predictions
    sub.to_csv(filename, index=False)
    print(f'Saved as {filename}')

### Loading DATA

In [4]:
%%time
cols = pd.read_csv(path.join(PATH_TO_DATA, 'train_features.csv'), sep=',', nrows=1).columns
cols = list(set(cols) - {'match_id_hash'})

train_df = pd.read_csv(path.join(PATH_TO_DATA, 'train_features.csv'), sep=',', header=0, usecols=cols)
train_targets = pd.read_csv(path.join(PATH_TO_DATA, 'train_targets.csv'), 
                            header=0, usecols=['radiant_win'], squeeze=True).astype('int')

test_df = pd.read_csv(path.join(PATH_TO_DATA, 'test_features.csv'), sep=',', header=0, usecols=cols)


full_df = pd.concat([train_df, test_df])
idx = train_df.shape[0]

Wall time: 1.16 s


### Adding features

In [25]:
%%time
team_features = ['kills',
                 'deaths',
                 'assists',
                 'gold',
                 'xp',
                 'denies',
                 'lh',
                 'level',
                 'max_health',
                 'firstblood_claimed',
                 'teamfight_participation',
                 'towers_killed',
                 'obs_placed']

new_df = pd.DataFrame(index=full_df.index)

for feature in team_features:
    create_team_feature(feature)

Wall time: 10.2 s


In [38]:
team_features_cv = cross_val_score(logit, new_df[:idx].values, train_targets.values,
                                   scoring='roc_auc', cv=skf)
print(team_features_cv)
print(f'CV mean: {team_features_cv.mean() : 4f}')

[0.79942633 0.80937258 0.79270945 0.80548994 0.81056284 0.80066065
 0.79470897 0.79552257 0.80239084 0.80850063]
CV mean:  0.801934


In [28]:
logit.fit(new_df[:idx].values, train_targets.values)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
preds = logit.predict_proba(new_df[idx:].values)

In [50]:
feature_importance = pd.DataFrame()
feature_importance['features'] = team_features
feature_importance['coefs'] = logit.coef_[0]
feature_importance.sort_values(by='coefs', ascending=False)

Unnamed: 0,features,coefs
7,level,7.617129
3,gold,5.480706
8,max_health,3.08774
2,assists,0.642799
1,deaths,0.637227
0,kills,0.600038
10,towers_killed,0.497316
4,xp,0.460872
6,lh,0.170086
11,obs_placed,0.165784


In [32]:
filename = f'sub_1.csv'
write_submission_file(preds, path.join(PATH_TO_PREDS, filename))

Saved as ./predictions/sub_1.csv


Initial submit gave **0.81678** on LB with the folowing features.

team_features = ['kills',
                 'deaths',
                 'assists',
                 'gold',
                 'xp',
                 'denies',
                 'lh',
                 'level',
                 'max_health',
                 'firstblood_claimed',
                 'teamfight_participation',
                 'towers_killed',
                 'obs_placed']

### Let's dive into JSON and find/make some awsome features :))

In [44]:
import json
matches = []
with open(path.join(PATH_TO_DATA, 'train_matches.jsonl')) as f:
    for i in range(500):
        line = f.readline()
        matches.append(json.loads(line))
    

In [None]:
matches[0]['players'][i]['hero_inventory']

In [None]:
afk_players = pd.DataFrame()
for match in matches:
    for i, game in enumerate(match):
        if len(match['players'][i]['actions']) < 3:
            afk_players = afk_players.append(pd.DataFrame(data={'match_id_hash': match['match_id_hash'], 'afk': 1}, index=[i]))
        # else:
        #     afk_players = afk_players.append(pd.DataFrame(data={'afk_true': 0}, index=[match['match_id_hash']]))
            # print(f"player_slot {match['players'][i]['player_slot']} in match {match['match_id_hash']} was AFK")
afk_players

In [None]:
afk_players = afk_players.groupby('match_id_hash').sum()
afk_players

In [None]:
%time
import os

try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print ('Please install ujson to read JSON oblects faster')
    
try:
    from tqdm import tqdm_notebook
except ModuleNotFoundError:
    tqdm_notebook = lambda x: x
    print ('Please install tqdm to track progress with Python loops')

def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)
            
afk_players = pd.DataFrame()   
# processing each game
for match in read_matches('../data/train_matches.jsonl'):
    
   # processing each player
    for i, player in enumerate(match['players']):
        if len(match['players'][i]['actions']) < 5:
            afk_players = afk_players.append(pd.DataFrame(data={'match_id_hash': match['match_id_hash'], 'afk': 1}, index=[i]))

In [None]:
afk_players = afk_players.groupby('match_id_hash').sum().reset_index()
print(afk_players.info())
afk_players.head()