<center>
<img src="https://habrastorage.org/files/fd4/502/43d/fd450243dd604b81b9713213a247aa20.jpg" />
</center> 
     

## <center> Kaggle inclass competition from [mlcourse.ai](https://mlcourse.ai/)
    
# <center> [**Dota 2 Winner Prediction**](https://www.kaggle.com/c/mlcourse-dota2-win-prediction/)

### <center> Session: Fall 2019

#### <div style="text-align: right"> Author: [Vladimir Kulyashov](https://www.kaggle.com/vovkaperm)


<div style="text-align: right"> creation date: 29 October 2019 </div>

In [1]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import path
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
PATH_TO_DATA = './data/'
PATH_TO_PREDS = './predictions/'
SEED = 17

skf = StratifiedKFold(n_splits=10, random_state=SEED)
logit = LogisticRegression(random_state=SEED, solver='lbfgs', max_iter=500)
pca_1 = PCA(n_components=1)
pca_2 = PCA(n_components=2)
scaler = StandardScaler()

In [3]:
# Useful def's

def diff_of_quat(a1, a2):
    if a1 + a2 == 0: return 0
    else: return (a1-a2) / (a1 + a2)


def create_team_feature(feature):
    r_tmp = full_df[[f'r{i}_{feature}' for i in range(1,6)]]
    d_tmp = full_df[[f'd{i}_{feature}' for i in range(1,6)]]
    new_df[feature] = pd.concat([r_tmp.sum(axis=1), d_tmp.sum(axis=1)], axis=1).apply(lambda x: diff_of_quat(*x), axis=1)
    
def write_submission_file(predictions, filename):
    sub = pd.read_csv(path.join(PATH_TO_DATA,'sample_submission.csv'))
    sub['radiant_win_prob'] = predictions
    sub.to_csv(filename, index=False)
    print(f'Saved as {filename}')

### Loading DATA

In [4]:
%%time
cols = pd.read_csv(path.join(PATH_TO_DATA, 'train_features.csv'), sep=',', nrows=1).columns
cols = list(set(cols) - {'match_id_hash'})

train_df = pd.read_csv(path.join(PATH_TO_DATA, 'train_features.csv'), sep=',', header=0, usecols=cols)
train_targets = pd.read_csv(path.join(PATH_TO_DATA, 'train_targets.csv'), 
                            header=0, usecols=['radiant_win'], squeeze=True).astype('int')

test_df = pd.read_csv(path.join(PATH_TO_DATA, 'test_features.csv'), sep=',', header=0, usecols=cols)


full_df = pd.concat([train_df, test_df])
idx = train_df.shape[0]
full_df.head(3)

Wall time: 1.82 s


Unnamed: 0,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,r1_denies,...,d5_stuns,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed
0,155,22,7,1,11,11,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0
1,658,4,0,3,10,15,7,2,0,7,...,0.0,0,0,0,0,0.0,0,0,0,0
2,21,23,0,0,0,101,0,0,0,0,...,0.0,0,0,0,0,0.0,0,0,0,0


### Adding features

In [10]:
%%time
team_features = ['kills',
                 'deaths',
                 'assists',
                 'gold',
                 'xp',
                 'denies',
                 'lh',
                 'level',
                 'max_health',
                 'max_mana',
                 'firstblood_claimed',
                 'teamfight_participation',
                 'towers_killed',
                 'obs_placed',
                 'rune_pickups',
                 'roshans_killed',
#                  'stuns',
#                  'camps_stacked',
#                  'sen_placed',
                ]

new_df = pd.DataFrame(index=full_df.index)

for feature in team_features:
    create_team_feature(feature)
    
new_df['chat_len'] = scaler.fit_transform(full_df['chat_len'].values.reshape(-1, 1))
team_features.append('chat_len')
# new_df['objectives_len'] = scaler.fit_transform(full_df['objectives_len'].values.reshape(-1, 1))
# team_features.append('objectives_len')
# new_df['game_time'] = scaler.fit_transform(full_df['game_time'].values.reshape(-1,1))
# team_features.append('game_time')

new_df.head(3)

Wall time: 21 s


Unnamed: 0,kills,deaths,assists,gold,xp,denies,lh,level,max_health,max_mana,firstblood_claimed,teamfight_participation,towers_killed,obs_placed,rune_pickups,roshans_killed,chat_len
0,-1.0,1.0,-1.0,-0.319298,-0.215369,-0.666667,-0.122807,-0.181818,0.0,-0.132659,-1.0,-1.0,0.0,-1.0,-0.555556,0.0,0.273281
1,0.684211,-0.6,0.647059,0.160557,0.152209,0.272727,0.139013,0.104478,0.070248,0.041403,-1.0,0.111111,1.0,-0.111111,-0.037037,0.0,0.198803
2,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,-0.003155,0.025788,0.0,0.0,0.0,0.0,0.5,0.0,-0.545975


In [11]:
team_features_cv = cross_val_score(logit, new_df[:idx].values, train_targets.values,
                                   scoring='roc_auc', cv=skf)
print(team_features_cv)
print(f'CV mean: {team_features_cv.mean(): 4f}')
print(f'CV max: {team_features_cv.max(): 4f}')
logit.fit(new_df[:idx].values, train_targets.values)
preds = logit.predict_proba(new_df[idx:].values)
feature_importance = pd.DataFrame()
feature_importance['features'] = team_features
feature_importance['coefs'] = logit.coef_[0]
feature_importance.sort_values(by='coefs', ascending=False)

[0.80000637 0.81388606 0.79401699 0.80535317 0.81226119 0.8013409
 0.7971309  0.7969403  0.80448966 0.81042593]
CV mean:  0.803585
CV max:  0.813886


Unnamed: 0,features,coefs
7,level,7.126874
3,gold,4.079251
8,max_health,3.090431
9,max_mana,0.967883
0,kills,0.766942
6,lh,0.524323
2,assists,0.513879
1,deaths,0.451196
12,towers_killed,0.451042
14,rune_pickups,0.397391


In [7]:
team_features

['kills',
 'deaths',
 'assists',
 'gold',
 'xp',
 'denies',
 'lh',
 'level',
 'max_health',
 'max_mana',
 'firstblood_claimed',
 'teamfight_participation',
 'towers_killed',
 'obs_placed',
 'chat_len']

In [53]:
filename = f'sub_1.csv'
write_submission_file(preds, path.join(PATH_TO_PREDS, filename))

Saved as ./predictions/sub_1.csv


Initial submit gave **0.81678** on LB with the folowing features.

team_features = ['kills',
                 'deaths',
                 'assists',
                 'gold',
                 'xp',
                 'denies',
                 'lh',
                 'level',
                 'max_health',
                 'firstblood_claimed',
                 'teamfight_participation',
                 'towers_killed',
                 'obs_placed']

### Let's dive into JSON and find/make some awsome features :))

In [44]:
import json
matches = []
with open(path.join(PATH_TO_DATA, 'train_matches.jsonl')) as f:
    for i in range(500):
        line = f.readline()
        matches.append(json.loads(line))
    

In [None]:
matches[0]['players'][i]['hero_inventory']

In [None]:
afk_players = pd.DataFrame()
for match in matches:
    for i, game in enumerate(match):
        if len(match['players'][i]['actions']) < 3:
            afk_players = afk_players.append(pd.DataFrame(data={'match_id_hash': match['match_id_hash'], 'afk': 1}, index=[i]))
        # else:
        #     afk_players = afk_players.append(pd.DataFrame(data={'afk_true': 0}, index=[match['match_id_hash']]))
            # print(f"player_slot {match['players'][i]['player_slot']} in match {match['match_id_hash']} was AFK")
afk_players

In [None]:
afk_players = afk_players.groupby('match_id_hash').sum()
afk_players

In [None]:
%time
import os

try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print ('Please install ujson to read JSON oblects faster')
    
try:
    from tqdm import tqdm_notebook
except ModuleNotFoundError:
    tqdm_notebook = lambda x: x
    print ('Please install tqdm to track progress with Python loops')

def read_matches(matches_file):
    
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675,
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)
    
    with open(matches_file) as fin:
        for line in tqdm_notebook(fin, total=total_matches):
            yield json.loads(line)
            
afk_players = pd.DataFrame()   
# processing each game
for match in read_matches('../data/train_matches.jsonl'):
    
   # processing each player
    for i, player in enumerate(match['players']):
        if len(match['players'][i]['actions']) < 5:
            afk_players = afk_players.append(pd.DataFrame(data={'match_id_hash': match['match_id_hash'], 'afk': 1}, index=[i]))

In [None]:
afk_players = afk_players.groupby('match_id_hash').sum().reset_index()
print(afk_players.info())
afk_players.head()