# DOTA SCIENCE - SBERBANK CONTEST

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

import json
from annoy import AnnoyIndex

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, PolynomialFeatures
from tqdm import tqdm_notebook, tqdm, tqdm_pandas
from matplotlib import pyplot as plt

%matplotlib inline

ModuleNotFoundError: No module named 'annoy'

In [2]:
# Register tqdm's pandas integration (adds `progress_apply` and friends).
# `tqdm_pandas(tqdm())` is the deprecated spelling of this call and also
# creates a throw-away progress bar as a side effect.
tqdm.pandas()

0it [00:00, ?it/s]


# Load data

In [3]:
# Raw match features for the labelled and the unlabelled part of the contest data.
fights_train = pd.read_csv("data_final/train_features.csv")
fights_test = pd.read_csv("data_final/test_features.csv")

# Remember where each row came from so the stacked frame can be split back later.
fights_train["is_train"] = True
fights_test["is_train"] = False

# Stack train on top of test and give every row a fresh 0..n-1 label, which is
# stored both as the index and as the explicit `fight_id` column.
fights = pd.concat([fights_train, fights_test])
n_fights = len(fights)
fights.index = np.arange(0, n_fights)
fights["fight_id"] = np.arange(0, n_fights)

target = pd.read_csv("data_final/train_targets.csv")

# The dictionaries are keyed by entity in the JSON files; transpose so that each
# hero / item becomes a row, then index the rows by their numeric id.
heroes = pd.read_json("dictionaries/heroes.json").T.set_index("id")
items = pd.read_json("dictionaries/items.json").T.set_index("id")

hero2id = dict(zip(heroes["name"], heroes.index))

# Names of columns to be treated as categorical; filled in by later cells.
CATEFORICAL_FEATURES = []

In [5]:
# Preview the first rows of the hero dictionary (one row per hero id).
heroes.head()

Unnamed: 0_level_0,agi_gain,attack_range,attack_rate,attack_type,base_agi,base_armor,base_attack_max,base_attack_min,base_health,base_health_regen,...,int_gain,legs,localized_name,move_speed,name,primary_attr,projectile_speed,roles,str_gain,turn_rate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.8,150,1.45,Melee,22,-1,31,27,200,1.5,...,1.8,2,Anti-Mage,310,npc_dota_hero_antimage,agi,0,"[Carry, Escape, Nuker]",1.3,0.5
10,3.7,350,1.5,Ranged,24,-2,18,9,200,1.5,...,1.1,0,Morphling,280,npc_dota_hero_morphling,agi,1300,"[Carry, Escape, Durable, Nuker, Disabler]",2.3,0.6
100,2.1,150,1.7,Melee,23,0,31,27,200,1.5,...,1.7,2,Tusk,300,npc_dota_hero_tusk,str,900,"[Initiator, Disabler, Nuker]",2.6,0.7
101,0.8,600,1.7,Ranged,13,-2,22,12,200,1.5,...,3.6,2,Skywrath Mage,330,npc_dota_hero_skywrath_mage,int,1000,"[Support, Nuker, Disabler]",1.8,0.5
102,1.5,150,1.7,Melee,17,-1,42,32,200,1.5,...,2.0,2,Abaddon,310,npc_dota_hero_abaddon,str,900,"[Support, Carry, Durable]",2.6,0.5


In [4]:
# Preview the raw training features (already carrying the `is_train` flag).
fights_train.head()

Unnamed: 0,match_id_hash,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,...,d5_creeps_stacked,d5_camps_stacked,d5_rune_pickups,d5_firstblood_claimed,d5_teamfight_participation,d5_towers_killed,d5_roshans_killed,d5_obs_placed,d5_sen_placed,is_train
0,a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,...,0,0,0,0,0.0,0,0,0,0,True
1,b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,...,0,0,0,0,0.0,0,0,0,0,True
2,6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,...,0,0,0,0,0.0,0,0,0,0,True
3,46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,...,3,1,3,0,0.0,0,0,2,0,True
4,b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,...,2,1,2,0,0.25,0,0,0,0,True


In [4]:
# List every column of the stacked train+test frame.
fights.columns.tolist()

['match_id_hash',
 'game_time',
 'game_mode',
 'lobby_type',
 'objectives_len',
 'chat_len',
 'r1_hero_id',
 'r1_kills',
 'r1_deaths',
 'r1_assists',
 'r1_denies',
 'r1_gold',
 'r1_lh',
 'r1_xp',
 'r1_health',
 'r1_max_health',
 'r1_max_mana',
 'r1_level',
 'r1_x',
 'r1_y',
 'r1_stuns',
 'r1_creeps_stacked',
 'r1_camps_stacked',
 'r1_rune_pickups',
 'r1_firstblood_claimed',
 'r1_teamfight_participation',
 'r1_towers_killed',
 'r1_roshans_killed',
 'r1_obs_placed',
 'r1_sen_placed',
 'r2_hero_id',
 'r2_kills',
 'r2_deaths',
 'r2_assists',
 'r2_denies',
 'r2_gold',
 'r2_lh',
 'r2_xp',
 'r2_health',
 'r2_max_health',
 'r2_max_mana',
 'r2_level',
 'r2_x',
 'r2_y',
 'r2_stuns',
 'r2_creeps_stacked',
 'r2_camps_stacked',
 'r2_rune_pickups',
 'r2_firstblood_claimed',
 'r2_teamfight_participation',
 'r2_towers_killed',
 'r2_roshans_killed',
 'r2_obs_placed',
 'r2_sen_placed',
 'r3_hero_id',
 'r3_kills',
 'r3_deaths',
 'r3_assists',
 'r3_denies',
 'r3_gold',
 'r3_lh',
 'r3_xp',
 'r3_health',
 '

In [5]:
# Show which per-hero attributes are available in the dictionary.
heroes.columns

Index(['agi_gain', 'attack_range', 'attack_rate', 'attack_type', 'base_agi',
       'base_armor', 'base_attack_max', 'base_attack_min', 'base_health',
       'base_health_regen', 'base_int', 'base_mana', 'base_mana_regen',
       'base_mr', 'base_str', 'cm_enabled', 'icon', 'img', 'int_gain', 'legs',
       'localized_name', 'move_speed', 'name', 'primary_attr',
       'projectile_speed', 'roles', 'str_gain', 'turn_rate'],
      dtype='object')

# Preprocess data

In [6]:
# Hero ids are later used as labels into `heroes`, so they must be integers
# without gaps. Missing ids are filled with 1, which is Anti-Mage's id in the
# hero table previewed above.
hero_id_columns = ["%s%s_hero_id" % (team, i) for team in ["r", "d"] for i in range(1, 6)]
for column in hero_id_columns:
    fights[column] = fights[column].fillna(1).astype("int32")

---
# Process heroes

*Add hero win rate from the OpenDota heroStats API*

In [7]:
import requests

# Download the per-hero statistics. A timeout keeps the notebook from hanging
# forever on a stalled connection, and raise_for_status() turns an HTTP error
# into an exception here instead of a confusing failure while parsing JSON.
hero_stats_response = requests.get('https://api.opendota.com/api/heroStats', timeout=30)
hero_stats_response.raise_for_status()

# For every hero, average win/pick over the counter pairs "1_win"/"1_pick" ..
# "7_win"/"7_pick" (presumably one pair per skill bracket - confirm against the
# OpenDota documentation).
hero_id_2_winrate = {}
for hero in hero_stats_response.json():
    bracket_rates = []
    for i in range(1, 8):
        picks = hero['{}_pick'.format(i)]
        # A pair with zero picks has no defined win rate; skip it rather than
        # dividing by zero.
        if picks:
            bracket_rates.append(hero['{}_win'.format(i)] / picks)
    # With all seven pairs present this equals the plain mean over 7; a hero
    # that was never picked falls back to a neutral 0.5.
    hero_id_2_winrate[hero['id']] = sum(bracket_rates) / len(bracket_rates) if bracket_rates else 0.5


# A hero present in the local dictionary but absent from the API response raises
# KeyError here, which is intentional: the two sources are then out of sync.
winrate = [hero_id_2_winrate[hero_id] for hero_id in heroes.index]
heroes["winrate"] = winrate

*Delete useless features*

In [8]:
# Text / image / list-valued hero attributes are not used as model features.
unused_hero_columns = ["roles", "localized_name", "icon", "img", "name", "cm_enabled"]
heroes = heroes.drop(columns=unused_hero_columns)

*Prepare categorical features*

In [9]:
# String-valued hero attributes that are replaced by integer codes.
hero_categorical_features = ["attack_type", "primary_attr"]

for column in hero_categorical_features:
    # A fresh encoder per column: the codes are only meaningful within that column.
    heroes[column] = LabelEncoder().fit_transform(heroes[column])

*Add features from hero to player*

In [10]:
# Copy every hero attribute onto each of the ten player slots, producing columns
# named "<team><slot>_hero_<attribute>" (e.g. "r1_hero_winrate").
for feature in heroes.columns:
    hero_feature = heroes[feature]
    for i in range(1, 6):
        for team in ["r", "d"]:
            player_column = "%s%s_hero_%s" % (team, i, feature)

            # Register encoded attributes as categorical; the membership check
            # keeps the list free of duplicates when the cell is re-run.
            if feature in hero_categorical_features and player_column not in CATEFORICAL_FEATURES:
                CATEFORICAL_FEATURES.append(player_column)

            # Look the attribute up by hero id. `.values` replaces the
            # `.get_values()` accessor that was removed in pandas 1.0, and drops
            # the hero-id index so the assignment is positional.
            players_heroes = fights["%s%s_hero_id" % (team, i)]
            fights[player_column] = hero_feature.loc[players_heroes.values].values

In [11]:
# Preview the frame after the per-player hero attributes have been attached.
fights.head()

Unnamed: 0,match_id_hash,game_time,game_mode,lobby_type,objectives_len,chat_len,r1_hero_id,r1_kills,r1_deaths,r1_assists,...,r1_hero_winrate,d1_hero_winrate,r2_hero_winrate,d2_hero_winrate,r3_hero_winrate,d3_hero_winrate,r4_hero_winrate,d4_hero_winrate,r5_hero_winrate,d5_hero_winrate
0,a400b8f29dece5f4d266f49f1ae2e98a,155,22,7,1,11,11,0,0,0,...,0.475748,0.516962,0.513125,0.482539,0.51315,0.528186,0.507785,0.514097,0.516562,0.473145
1,b9c57c450ce74a2af79c9ce96fac144d,658,4,0,3,10,15,7,2,0,...,0.470987,0.484695,0.535258,0.51315,0.521155,0.4793,0.475609,0.515557,0.470536,0.549747
2,6db558535151ea18ca70a6892197db41,21,23,0,0,0,101,0,0,0,...,0.50583,0.495689,0.51976,0.566072,0.502387,0.50622,0.516962,0.49262,0.434292,0.493916
3,46a0ddce8f7ed2a8d9bd5edcbb925682,576,22,7,1,4,14,1,0,3,...,0.51315,0.495689,0.521984,0.450398,0.50583,0.513278,0.488747,0.500539,0.510045,0.472703
4,b1b35ff97723d9b7ade1c9c3cf48f770,453,22,7,1,3,42,0,1,1,...,0.543573,0.486286,0.500539,0.563639,0.521155,0.497022,0.481764,0.469922,0.436076,0.4793


---
# Process fights

### Process categorical features

*Fill NaN hero ids and cast them to uint8*

In [12]:
# Store hero ids as unsigned bytes. Any id still missing at this point becomes
# 200. NOTE(review): an earlier cell already fills missing hero ids with 1, so
# when the notebook is run top to bottom this fillna has nothing left to fill.
for i in range(1, 6):
    for team in ["r", "d"]:
        column = "%s%s_%s" % (team, i, "hero_id")
        fights[column] = fights[column].fillna(200).astype("uint8")

---
*Dummy encode* **heroes**

In [13]:
# Build one signed indicator column per hero: +1 when a radiant player picked
# the hero, -1 when a dire player did, 0 otherwise.
heroes_unique_values = pd.concat([fights["r%s_hero_id" % i] for i in range(1, 6)]
                                 + [fights["d%s_hero_id" % i] for i in range(1, 6)]).unique()

heroes_unique_values = sorted(heroes_unique_values)
dummy_heroes_column_names = ["hero_%s" % i for i in heroes_unique_values]

dummy_heroes_shape = (fights.shape[0], len(heroes_unique_values))
# The accumulator must be a SIGNED integer type: with the previous uint8 dtype
# the subtraction for dire picks wrapped around and stored 255 instead of -1.
# Sharing the index with `fights` makes the later column-wise concat explicit
# about row alignment.
dummy_heroes = pd.DataFrame(np.zeros(dummy_heroes_shape, dtype="int8"),
                            columns=dummy_heroes_column_names,
                            index=fights.index)

for team in ["r", "d"]:
    for feature in ["%s%s_hero_id" % (team, i) for i in range(1, 6)]:
        # Request int8 indicators explicitly; the default dtype of get_dummies
        # differs between pandas versions (uint8 vs bool).
        encoded_feature = pd.get_dummies(fights[feature], prefix="hero", dtype="int8")
        if team == "r":
            dummy_heroes[encoded_feature.columns] += encoded_feature
        else:
            dummy_heroes[encoded_feature.columns] -= encoded_feature

In [14]:
# Append the per-hero indicator columns to the main frame (aligned on the row index).
fights = pd.concat([fights, dummy_heroes], axis=1)

In [15]:
# Register per-player categorical column names, slot by slot and radiant before
# dire, followed by all hero indicator columns.
# NOTE(review): "<team><slot>_hero", "_items" and "_role" are not columns that
# this notebook creates (the hero column is "<team><slot>_hero_id"), so these
# names never match anything in `fights` and `hero_id` is consequently treated
# as a numeric player characteristic further down - confirm this is intended.
CATEFORICAL_FEATURES.extend(
    "%s%s_%s" % (team, i, feature)
    for i in range(1, 6)
    for team in ["r", "d"]
    for feature in ["hero", "items", "role"]
)

CATEFORICAL_FEATURES.extend(dummy_heroes.columns.tolist())

### Process numeric features

*Get player armor*

In [16]:
# Derive an armor value per player from hero base armor and agility, then drop
# the three hero columns that were only needed for this computation.
for team in ["r", "d"]:
    for i in range(1, 6):
        player = "%s%s" % (team, i)
        agi_gain_column = "%s_hero_agi_gain" % player
        base_agi_column = "%s_hero_base_agi" % player
        base_armor_column = "%s_hero_base_armor" % player

        # Agility is modelled as level * gain + base, and every 6 points of it
        # add one point of armor.
        # NOTE(review): this counts one agility gain already at level 1 -
        # confirm whether (level - 1) was intended.
        agility = fights["%s_level" % player] * fights[agi_gain_column] + fights[base_agi_column]
        fights["%s_armor" % player] = fights[base_armor_column] + agility / 6

        fights = fights.drop([agi_gain_column, base_agi_column, base_armor_column], axis=1)

*Get player damage*

In [17]:
# Three attack features per player, each building on the previous one.
for team in ["r", "d"]:
    for i in range(1, 6):
        player = "%s%s" % (team, i)

        # Midpoint of the hero's base damage interval.
        attack = (fights["%s_hero_base_attack_min" % player]
                  + fights["%s_hero_base_attack_max" % player]) / 2
        # Midpoint damage scaled by the hero's attack_rate value.
        # NOTE(review): if attack_rate is "seconds per attack", damage per second
        # would be a division rather than a product - confirm.
        rated_attack = attack * fights["%s_hero_attack_rate" % player]
        # The above additionally scaled by attack range.
        ranged_rated_attack = rated_attack * fights["%s_hero_attack_range" % player]

        fights["%s_attack" % player] = attack
        fights["%s_rated_attack" % player] = rated_attack
        fights["%s_ranged_rated_attack" % player] = ranged_rated_attack

*Calculate map features*

In [18]:
# Replace the raw map coordinates of every player by a single derived column.
for team in ["r", "d"]:
    for i in range(1, 6):
        x_column = "%s%s_x" % (team, i)
        y_column = "%s%s_y" % (team, i)

        # The only map feature currently kept is the negated y coordinate.
        fights["%s%s_part_of_half_map" % (team, i)] = -fights[y_column]

        # A quadrant feature ("<player>_map_part" in 1..4, splitting x and y at
        # 120) was prototyped here but is disabled.

        fights = fights.drop([x_column, y_column], axis=1)

*Convert players characteristics to team characteristics*

In [21]:
# Names of the numeric per-player features, taken from the dire columns
# ("d<slot>_<name>") with the 3-character "d<slot>_" prefix stripped and the
# registered categorical columns excluded.
#
# Kept as a SORTED list rather than a set: set iteration order for strings
# changes between kernel restarts, and later cells name derived columns by
# position ("<team>_feature_<j>"), so an unordered collection makes those column
# names mean different things from one run to the next.
players_characteristics = sorted({
    column[3:]
    for column in set(fights.columns) - set(CATEFORICAL_FEATURES)
    if column[0] == "d" and column[1:2].isnumeric()
})

print(players_characteristics)

{'hero_int_gain', 'hero_base_mr', 'lh', 'deaths', 'health', 'hero_attack_rate', 'armor', 'hero_base_health_regen', 'stuns', 'obs_placed', 'hero_id', 'hero_projectile_speed', 'hero_turn_rate', 'hero_legs', 'hero_base_mana_regen', 'hero_move_speed', 'attack', 'denies', 'rune_pickups', 'hero_winrate', 'hero_str_gain', 'xp', 'hero_attack_range', 'part_of_half_map', 'hero_base_int', 'assists', 'kills', 'hero_base_health', 'firstblood_claimed', 'ranged_rated_attack', 'hero_base_attack_max', 'gold', 'creeps_stacked', 'hero_base_mana', 'towers_killed', 'hero_base_attack_min', 'camps_stacked', 'hero_base_str', 'roshans_killed', 'teamfight_participation', 'sen_placed', 'level', 'max_health', 'max_mana', 'rated_attack'}


In [22]:
# Collapse the five per-player columns of every characteristic into four
# team-level statistics (mean / median / max / min), then drop the per-player
# columns.
team_name = {"r": "radiant", "d": "dire"}

for feature in tqdm_notebook(players_characteristics):
    for team in ["r", "d"]:
        player_columns = ["%s%s_%s" % (team, i, feature) for i in range(1, 6)]
        team_values = fights[player_columns]

        # `.values` replaces `.get_values()`, which was removed in pandas 1.0.
        fights["%s_%s_mean" % (team_name[team], feature)] = team_values.mean(axis=1).values
        fights["%s_%s_median" % (team_name[team], feature)] = team_values.median(axis=1).values
        fights["%s_%s_max" % (team_name[team], feature)] = team_values.max(axis=1).values
        fights["%s_%s_min" % (team_name[team], feature)] = team_values.min(axis=1).values

        fights = fights.drop(player_columns, axis=1)




*Calculate polynomial features*

In [23]:
# Expand the team mean features with PolynomialFeatures(2) and append the result
# as anonymous columns "<team>_feature_<j>", where j is the position in the
# expansion output.
for team in ["radiant", "dire"]:
    mean_columns = ["%s_%s_mean" % (team, feature) for feature in players_characteristics]
    # The expansion cannot handle NaN, so missing means are treated as 0.
    mean_values = fights[mean_columns].fillna(0)

    expanded_values = PolynomialFeatures(2).fit_transform(mean_values)
    expanded_names = ["%s_feature_%s" % (team, j) for j in range(expanded_values.shape[1])]

    # The new frame gets a default 0..n-1 index, which lines up with `fights`
    # because `fights` was re-indexed that way when it was loaded.
    expanded_df = pd.DataFrame(expanded_values, columns=expanded_names)
    fights = pd.concat([fights, expanded_df], axis=1)


*Calculate delta*

In [24]:
# Names of all team-level features, taken from the non-categorical columns that
# start with "radiant", with the "radiant_" prefix removed. Each of them is
# expected to have a "dire_" counterpart.
#
# Kept as a SORTED list rather than a set so that the order in which the delta
# columns are created (and the order of the nearest-neighbour vector built from
# them) is the same on every run.
team_characteristics = sorted({
    column.replace("radiant_", "")
    for column in set(fights.columns) - set(CATEFORICAL_FEATURES)
    if column.startswith("radiant")
})

In [25]:
# For every team-level feature store the radiant-minus-dire difference and its square.
for feature in tqdm_notebook(team_characteristics):
    delta = fights["radiant_" + feature] - fights["dire_" + feature]

    fights[feature + "_delta"] = delta
    fights[feature + "_delta_2"] = delta ** 2




In [26]:
# List the columns again now that the team-level and delta features exist.
fights.columns.tolist()

['match_id_hash',
 'game_time',
 'game_mode',
 'lobby_type',
 'objectives_len',
 'chat_len',
 'is_train',
 'fight_id',
 'r1_hero_attack_type',
 'd1_hero_attack_type',
 'r2_hero_attack_type',
 'd2_hero_attack_type',
 'r3_hero_attack_type',
 'd3_hero_attack_type',
 'r4_hero_attack_type',
 'd4_hero_attack_type',
 'r5_hero_attack_type',
 'd5_hero_attack_type',
 'r1_hero_primary_attr',
 'd1_hero_primary_attr',
 'r2_hero_primary_attr',
 'd2_hero_primary_attr',
 'r3_hero_primary_attr',
 'd3_hero_primary_attr',
 'r4_hero_primary_attr',
 'd4_hero_primary_attr',
 'r5_hero_primary_attr',
 'd5_hero_primary_attr',
 'hero_1',
 'hero_2',
 'hero_3',
 'hero_4',
 'hero_5',
 'hero_6',
 'hero_7',
 'hero_8',
 'hero_9',
 'hero_10',
 'hero_11',
 'hero_12',
 'hero_13',
 'hero_14',
 'hero_15',
 'hero_16',
 'hero_17',
 'hero_18',
 'hero_19',
 'hero_20',
 'hero_21',
 'hero_22',
 'hero_23',
 'hero_25',
 'hero_26',
 'hero_27',
 'hero_28',
 'hero_29',
 'hero_30',
 'hero_31',
 'hero_32',
 'hero_33',
 'hero_34',
 'he

*Distance metric features*

In [27]:
# Nearest-neighbour search runs on the first-order delta features and is indexed
# with the TRAINING rows only (the only rows that have a known outcome).
annoy_features = [i + "_delta" for i in team_characteristics]

is_train_rows = fights["is_train"] == True
annoy_data = fights[is_train_rows][annoy_features]
# `.values` replaces `.get_values()`, which was removed in pandas 1.0.
annoy_index = fights[is_train_rows].index.values

In [28]:
# One vector component per delta feature. The metric is spelled out because
# recent Annoy releases warn when it is omitted; "angular" is the value that
# used to be applied implicitly, so results are unchanged.
annoy_vector_size = len(annoy_features)
annoy = AnnoyIndex(annoy_vector_size, "angular")

In [29]:
# Add every training row to the index under its `fights` row label, so that a
# neighbour id returned by the index can be used directly as a row position in
# `target`. `.values` replaces `.get_values()` (removed in pandas 1.0) and the
# id is converted to a plain Python int before it is handed to Annoy.
for vec, i in zip(annoy_data.values, annoy_index):
    annoy.add_item(int(i), vec)

In [30]:
# Finalise the index so it can be queried; the argument 10 is the value passed
# to AnnoyIndex.build (the number of trees per Annoy's API - confirm).
annoy.build(10)

True

In [34]:
k = 0  # number of rows processed so far; only used for the progress printout


def get_nn_info(row):
    """Summarise the outcomes of the nearest training fights of one row.

    The row's delta features are looked up in the module-level `annoy` index
    (21 nearest neighbours are requested). If the row itself is among them it
    is removed, otherwise the farthest neighbour is dropped, leaving at most 20
    neighbours. Their `radiant_win` labels are read from the module-level
    `target` frame by position.

    Parameters
    ----------
    row : pandas.Series
        One row of `fights`; must provide `fight_id` and every column listed in
        the module-level `annoy_features`.

    Returns
    -------
    pandas.Series
        Mean and median of the neighbours' labels over the closest 5, the
        closest 10 and all retained neighbours, under the keys
        ``nn_mean_5``, ``nn_median_5``, ``nn_mean_10``, ``nn_median_10``,
        ``nn_mean_20`` and ``nn_median_20``.

    Side effects
    ------------
    Increments the module-level counter `k` and prints it every 1000 calls.
    """
    global k

    if k % 1000 == 0:
        print(k)
    k += 1

    # `.values` replaces `.get_values()`, which was removed in pandas 1.0.
    vec = row[annoy_features].values
    indexes = annoy.get_nns_by_vector(vec, 21)

    # Never let a training row vote for itself.
    if row.fight_id in indexes:
        indexes.remove(row.fight_id)
    else:
        indexes = indexes[:-1]

    # Select the handful of neighbour labels first and cast afterwards, instead
    # of converting the whole target column on every single call.
    ys = target["radiant_win"].values[indexes].astype("int")

    info = {
        "nn_mean_5": np.mean(ys[:5]),
        "nn_median_5": np.median(ys[:5]),
        "nn_mean_10": np.mean(ys[:10]),
        "nn_median_10": np.median(ys[:10]),
        "nn_mean_20": np.mean(ys),
        "nn_median_20": np.median(ys)
    }
    return pd.Series(info)

In [35]:
# Compute the neighbour statistics row by row and append them as new columns.
nn_info = fights.apply(get_nn_info, axis=1)
fights = pd.concat([fights, nn_info], axis=1)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000


# Train/Validate/Test split

In [36]:
# Replace every remaining missing value in the feature frame by 0.
fights = fights.fillna(0)

In [37]:
# Sanity check: (rows, columns) of the full train+test feature frame.
fights.shape

(49675, 5193)

In [38]:
# Split the stacked frame back into its labelled and unlabelled parts and remove
# the two bookkeeping columns, which are not features.
service_columns = ["fight_id", "is_train"]

test_data = fights[fights["is_train"] == False].drop(service_columns, axis=1)
train_data = fights[fights["is_train"] == True].drop(service_columns, axis=1)

In [39]:
# Sanity check: (rows, columns) of the labelled part before the validation split.
train_data.shape

(39675, 5191)

In [40]:
# Fraction of the labelled rows held out for validation, and the corresponding
# row count (rounded down).
validation_size = 0.3
validation_split = int(len(train_data) * validation_size)

In [41]:
# Hold out the first `validation_split` labelled rows for validation and keep
# the rest for training. Slicing is POSITIONAL (`.iloc`, end-exclusive): the
# previous label-based `.loc[:n]` / `.loc[n:]` pair is inclusive on both ends
# and therefore put row `n` into the validation AND the training part.
# Features and targets are cut in the same cell so they cannot get out of step.
validation_data = train_data.iloc[:validation_split]
train_data = train_data.iloc[validation_split:]

target_validate = target.iloc[:validation_split]
target_train = target.iloc[validation_split:]

In [43]:
# Persist the training features and their targets (row index included).
train_data.to_csv("processed_dataset/train.csv")
target_train.to_csv("processed_dataset/target_train.csv")

In [44]:
# Persist the unlabelled features used for the final predictions.
test_data.to_csv("processed_dataset/test.csv")

In [None]:
# Additionally write the training part in `n_batches` consecutive chunks.
n_batches = 3
n_train_rows = train_data.shape[0]

for batch in range(1, n_batches + 1):
    print("BATCH #%s" % batch)
    # Positional chunk boundaries [l, r).
    l = n_train_rows * (batch - 1) // n_batches
    r = n_train_rows * batch // n_batches

    # `.iloc` is required here: after the validation split the row LABELS of
    # `train_data` no longer start at 0, so the previous label-based `.loc[l:r]`
    # selected the wrong rows (an empty or short first batch, the tail never
    # written) and, being end-inclusive, duplicated every boundary row.
    train_batch = train_data.iloc[l:r]
    target_batch = target_train.iloc[l:r]

    train_batch.to_csv("processed_dataset/train_%s.csv" % batch)
    target_batch.to_csv("processed_dataset/target_train_%s.csv" % batch)

In [45]:
# Persist the held-out validation features and their targets (row index included).
validation_data.to_csv("processed_dataset/validation.csv")
target_validate.to_csv("processed_dataset/target_validate.csv")

In [46]:
# Store the list of categorical column names next to the datasets so that the
# training code can read it back.
with open("processed_dataset/categorical_features.json", "w") as f:
    json.dump(CATEFORICAL_FEATURES, f)

In [47]:
# NOTE(review): leftover "is the kernel alive" check; it has no effect on the
# data and can be removed.
print("hello")

hello


In [43]:
# NOTE(review): stale cell executed out of order (count 43 appears after 47).
# Its saved output reports zero rows, unlike the earlier shape check, so it does
# not reflect a clean top-to-bottom run.
train_data.shape

(0, 5191)

In [41]:
# NOTE(review): duplicate of the earlier shape check, left over from an
# out-of-order run (execution count 41 at the end of the notebook).
fights.shape

(49675, 5193)