# <center> Предсказание победителя в Dota 2
<center>

### Импорты

In [1]:
import os
import json
import pandas as pd
import numpy as np
import datetime
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import random
import math
import xgboost
import lightgbm
import catboost
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score
from sklearn.ensemble import (RandomForestClassifier,
                              ExtraTreesClassifier)
from sklearn.metrics import roc_auc_score, accuracy_score
from matplotlib.colors import ListedColormap
from scipy.stats import pearsonr
from itertools import combinations
from sklearn.base import BaseEstimator
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import (DecisionTreeRegressor,
                          DecisionTreeClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

%matplotlib inline

In [2]:
SEED = 10801
sns.set_style(style="whitegrid")
plt.rcParams["figure.figsize"] = 12, 8
warnings.filterwarnings("ignore")

## <left>Базовые признаки

In [3]:
PATH_TO_DATA = "../data_folder"

df_train_features = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                             "train_data.csv"), 
                                    index_col="match_id_hash")
df_train_targets = pd.read_csv(os.path.join(PATH_TO_DATA, 
                                            "train_targets.csv"), 
                                   index_col="match_id_hash")

## <left>Дополнительные признаки

Быстрое чтение

In [4]:
try:
    import ujson as json
except ModuleNotFoundError:
    import json
    print ("Подумайте об установке ujson, чтобы работать с JSON объектами быстрее")
    
try:
    from tqdm.notebook import tqdm
except ModuleNotFoundError:
    tqdm_notebook = lambda x: x
    print ("Подумайте об установке tqdm, чтобы следить за прогрессом")

    
def read_matches(matches_file, total_matches, n_matches_to_read=None):
    """
    Аргуент
    -------
    matches_file: JSON файл с сырыми данными
    
    Результат
    ---------
    Возвращает записи о каждом матче
    """
    
    if n_matches_to_read is None:
        n_matches_to_read = total_matches
        
    c = 0
    with open(matches_file) as fin:
        for line in tqdm(fin, total=total_matches):
            if c >= n_matches_to_read:
                break
            else:
                c += 1
                yield json.loads(line)

Добавление новых признаков

In [5]:
def add_new_features(df_features, matches_file, total_matches):
    """
    Аргуенты
    -------
    df_features: таблица с данными
    matches_file: JSON файл с сырыми данными
    
    Результат
    ---------
    Добавляет новые признаки в таблицу
    """
    
    for match in read_matches(matches_file, total_matches=total_matches, n_matches_to_read=None):
        match_id_hash = match['match_id_hash']

        # Посчитаем количество разрушенных вышек обеими командами
        radiant_tower_kills = 0
        dire_tower_kills = 0
        for objective in match["objectives"]:
            if objective["type"] == "CHAT_MESSAGE_TOWER_KILL":
                if objective["team"] == 2:
                    radiant_tower_kills += 1
                if objective["team"] == 3:
                    dire_tower_kills += 1

        df_features.loc[match_id_hash, "radiant_tower_kills"] = radiant_tower_kills
        df_features.loc[match_id_hash, "dire_tower_kills"] = dire_tower_kills
        df_features.loc[match_id_hash, "diff_tower_kills"] = radiant_tower_kills - dire_tower_kills
        
        # ... (/¯◡ ‿ ◡)/¯☆*:・ﾟ добавляем новые признаки ...
        radiant_xp = 0
        dire_xp = 0
        
        radiant_gold = 0
        dire_gold = 0
        
        radiant_obs = 0
        dire_obs = 0
        
        radiant_runes = 0
        dire_runes = 0
        
        radiant_roshans = 0
        dire_roshans = 0
        
        radiant_abilities = 0
        dire_abilities = 0
        
        radiant_kda = 0
        dire_kda = 0
        
        kills = []
        
        radiant_pred = 0
        dire_pred = 0
        
        radiant_team = 0
        dire_team = 0
        
        radiant_xp_gold = 0
        dire_xp_gold = 0
        
        radiant_inv = 0
        radiant_purchase_len = 0
        radiant_purchase_sum = 0
        
        dire_inv = 0
        dire_purchase_len = 0
        dire_purchase_sum = 0
        
        
        
        for i in range(5):
            player_radiant = match["players"][i]
            radiant_xp += player_radiant["xp"]
            radiant_gold += player_radiant["gold"]
            radiant_xp_gold += player_radiant["xp"] * player_radiant["gold"]
            radiant_obs += player_radiant["observers_placed"]
            radiant_runes += player_radiant["rune_pickups"]
            radiant_roshans += player_radiant["roshans_killed"]
            radiant_abilities += len(player_radiant["ability_upgrades"])
            radiant_kda += (player_radiant["kills"] + player_radiant["assists"]) / max(1, player_radiant["deaths"])
            radiant_pred += player_radiant["pred_vict"]
            radiant_team += player_radiant["teamfight_participation"]
            radiant_inv += len(player_radiant["hero_inventory"]) + len(player_radiant["hero_stash"])
            radiant_purchase_len += len(player_radiant["purchase"])
            radiant_purchase_sum += sum(player_radiant["purchase"].values())
            for kill in player_radiant["kills_log"]:
                kills.append((kill["time"], 1))
            
            player_dire = match["players"][5 + i]
            dire_xp += player_dire["xp"]
            dire_gold += player_dire["gold"]
            dire_xp_gold += player_dire["xp"] * player_dire["gold"]
            dire_obs += player_dire["observers_placed"]
            dire_runes += player_dire["rune_pickups"]
            dire_roshans += player_dire["roshans_killed"]
            dire_abilities += len(player_dire["ability_upgrades"])
            dire_kda += (player_dire["kills"] + player_dire["assists"]) / max(1, player_dire["deaths"])
            dire_pred += player_dire["pred_vict"]
            dire_team += player_dire["teamfight_participation"]
            dire_inv += len(player_dire["hero_inventory"]) + len(player_dire["hero_stash"])
            dire_purchase_len += len(player_dire["purchase"])
            dire_purchase_sum += sum(player_dire["purchase"].values())
            for kill in player_dire["kills_log"]:
                kills.append((kill["time"], -1))
        
        game_time = max(1, match["game_time"] / 60)
        df_features.loc[match_id_hash, "radiant_xp"] = radiant_xp
        df_features.loc[match_id_hash, "radiant_xp_d"] = radiant_xp / game_time
        df_features.loc[match_id_hash, "dire_xp"] = dire_xp 
        df_features.loc[match_id_hash, "dire_xp_d"] = dire_xp / game_time
        df_features.loc[match_id_hash, "diff_xp"] = radiant_xp - dire_xp
        
        df_features.loc[match_id_hash, "radiant_gold_d"] = radiant_gold / game_time
        df_features.loc[match_id_hash, "dire_gold_d"] = dire_gold / game_time
        df_features.loc[match_id_hash, "radiant_gold"] = radiant_gold 
        df_features.loc[match_id_hash, "dire_gold"] = dire_gold 
        df_features.loc[match_id_hash, "diff_gold"] = radiant_gold - dire_gold

        df_features.loc[match_id_hash, "radiant_xp_gold"] = radiant_xp_gold 
        df_features.loc[match_id_hash, "dire_xp_gold"] = dire_xp_gold 
        df_features.loc[match_id_hash, "radiant_xp_gold_d"] = radiant_xp_gold / game_time
        df_features.loc[match_id_hash, "dire_xp_gold_d"] = dire_xp_gold / game_time
        df_features.loc[match_id_hash, "diff_xp_gold"] = radiant_obs - dire_obs
        
        df_features.loc[match_id_hash, "radiant_obs"] = radiant_obs
        df_features.loc[match_id_hash, "dire_obs"] = dire_obs
        df_features.loc[match_id_hash, "radiant_obs_d"] = radiant_obs / game_time
        df_features.loc[match_id_hash, "dire_obs_d"] = dire_obs / game_time
        df_features.loc[match_id_hash, "diff_obs"] = radiant_obs - dire_obs
        
        df_features.loc[match_id_hash, "radiant_runes"] = radiant_runes 
        df_features.loc[match_id_hash, "dire_runes"] = dire_runes 
        df_features.loc[match_id_hash, "radiant_runes_d"] = radiant_runes / game_time
        df_features.loc[match_id_hash, "dire_runes_d"] = dire_runes / game_time
        df_features.loc[match_id_hash, "diff_runes"] = radiant_runes - dire_runes

        df_features.loc[match_id_hash, "radiant_roshans"] = radiant_roshans
        df_features.loc[match_id_hash, "dire_roshans"] = dire_roshans
        df_features.loc[match_id_hash, "diff_roshans"] = radiant_roshans - dire_roshans

        df_features.loc[match_id_hash, "radiant_abilities_d"] = radiant_abilities / game_time
        df_features.loc[match_id_hash, "dire_abilities_d"] = dire_abilities / game_time
        df_features.loc[match_id_hash, "radiant_abilities"] = radiant_abilities
        df_features.loc[match_id_hash, "dire_abilities"] = dire_abilities
        df_features.loc[match_id_hash, "diff_abilities"] = radiant_abilities - dire_abilities
        
        df_features.loc[match_id_hash, "radiant_kda_d"] = radiant_kda / game_time
        df_features.loc[match_id_hash, "dire_kda_d"] = dire_kda / game_time
        df_features.loc[match_id_hash, "radiant_kda"] = radiant_kda 
        df_features.loc[match_id_hash, "dire_kda"] = dire_kda
        df_features.loc[match_id_hash, "diff_kda"] = radiant_kda - dire_kda
        
        df_features.loc[match_id_hash, "radiant_pred"] = radiant_pred
        df_features.loc[match_id_hash, "dire_pred"] = dire_pred
        df_features.loc[match_id_hash, "diff_pred"] = radiant_pred - dire_pred
        
        df_features.loc[match_id_hash, "radiant_team"] = radiant_team 
        df_features.loc[match_id_hash, "dire_team"] = dire_team
        df_features.loc[match_id_hash, "diff_team"] = radiant_team - dire_team

        df_features.loc[match_id_hash, "radiant_inv"] = radiant_inv
        df_features.loc[match_id_hash, "dire_inv"] = dire_inv
        df_features.loc[match_id_hash, "radiant_inv_d"] = radiant_inv / game_time
        df_features.loc[match_id_hash, "dire_inv_d"] = dire_inv / game_time
        df_features.loc[match_id_hash, "diff_inv"] = radiant_inv - dire_inv

        df_features.loc[match_id_hash, "radiant_purchase_len"] = radiant_purchase_len
        df_features.loc[match_id_hash, "dire_purchase_len"] = dire_purchase_len 
        df_features.loc[match_id_hash, "radiant_purchase_len_d"] = radiant_purchase_len / game_time
        df_features.loc[match_id_hash, "dire_purchase_len_d"] = dire_purchase_len / game_time
        df_features.loc[match_id_hash, "diff_purchase_len"] = radiant_purchase_len - dire_purchase_len

        df_features.loc[match_id_hash, "radiant_purchase_sum"] = radiant_purchase_sum
        df_features.loc[match_id_hash, "dire_purchase_sum"] = dire_purchase_sum 
        df_features.loc[match_id_hash, "radiant_purchase_sum_d"] = radiant_purchase_sum / game_time
        df_features.loc[match_id_hash, "dire_purchase_sum_d"] = dire_purchase_sum / game_time
        df_features.loc[match_id_hash, "diff_purchase_sum"] = radiant_purchase_sum - dire_purchase_sum
        
        
        
        kills.sort(key=lambda x:x[0], reverse=True)
        last_ten_kills = 0
        for i in range(min(10, len(kills))):
            last_ten_kills += kills[i][1]
            
        df_features.loc[match_id_hash, "last_ten_kills"] = last_ten_kills 

Самые простые, но эффективные фичи — gold/min и xp/min.
Самая интересная из фичей — последние десять убийств. За каждое убийство от radiant +1, за каждое убийство от dire -1.

In [6]:
# Скопируем таблицу с признаками
df_train_features_extended = df_train_features.copy()

# Добавим новые
add_new_features(df_train_features_extended, 
                 os.path.join(PATH_TO_DATA, 
                              "train_raw_data.jsonl"),
                total_matches=31698)

  0%|          | 0/31698 [00:00<?, ?it/s]

## <left>Сравнение моделей

In [7]:
X = df_train_features_extended.values
y = df_train_targets["radiant_win"].values

X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                      test_size=0.3, 
                                                      random_state=SEED)

In [8]:
dt = DecisionTreeClassifier(random_state=SEED, max_depth=7, min_samples_leaf=100)
rf = RandomForestClassifier(n_estimators=300, max_depth=7, random_state=SEED)
etc = ExtraTreesClassifier(random_state=SEED)
knn = KNeighborsClassifier(n_neighbors=400, weights="distance")
cat = catboost.CatBoostClassifier(verbose=0, random_seed=SEED)
lgbm = lightgbm.LGBMClassifier(random_state=SEED)
lgbm_rf = lightgbm.LGBMClassifier(boosting_type="rf", bagging_freq=1, bagging_fraction=0.7, random_state=SEED)
xgb = xgboost.XGBClassifier(random_state=SEED)
xgb_rf = xgboost.XGBRFClassifier(random_state=SEED)
lr = LogisticRegression(solver='liblinear', max_iter=10000)
nb = GaussianNB()

models = [("DT", dt), ("RF", rf), 
          ("ETC", etc), ("KNN", knn), 
          ("CAT", cat), ("LGBM", lgbm), 
          ("LGBM_RF", lgbm_rf), ("XGB", xgb), 
          ("XGB_RF", xgb_rf), ("LR", lr), ("NB", nb)]

In [9]:
for model in models: 
    model[1].fit(X_train, y_train)
    y_pred = model[1].predict_proba(X_valid)[:, 1]
    valid_score = roc_auc_score(y_valid, y_pred)
    print(f"ROC-AUC for {model[0]}:", valid_score)

ROC-AUC for DT: 0.7942719947433088
ROC-AUC for RF: 0.8121770226960907
ROC-AUC for ETC: 0.8052292298429343
ROC-AUC for KNN: 0.8023718619423147
ROC-AUC for CAT: 0.8138181421130295
ROC-AUC for LGBM: 0.8145279604101894
ROC-AUC for LGBM_RF: 0.8085944692980637
ROC-AUC for XGB: 0.7991743311963833
ROC-AUC for XGB_RF: 0.8108861557003844
ROC-AUC for LR: 0.7992001427708255
ROC-AUC for NB: 0.7125847923525361


**Лучшие результаты у RF, CAT и LGBM.**

## <left>Алгоритм посылки

In [10]:
df_test_features = pd.read_csv(os.path.join(PATH_TO_DATA, "test_data.csv"), 
                                   index_col="match_id_hash")

#Добавим новые признаки
add_new_features(df_test_features, 
                 os.path.join(PATH_TO_DATA, 
                              "test_raw_data.jsonl"), 
                 total_matches=7977)

  0%|          | 0/7977 [00:00<?, ?it/s]

In [11]:
model = catboost.CatBoostClassifier(verbose=0, random_seed=SEED) # выберем модель

In [12]:
model.fit(X, y)
X_test = df_test_features.values
y_test_pred = model.predict_proba(X_test)[:, 1]

df_submission = pd.DataFrame({"radiant_win_prob": y_test_pred}, 
                                 index=df_test_features.index)


In [13]:
submission_filename = "../data_folder/submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
df_submission.to_csv(submission_filename)
print("Файл посылки сохранен, как: {}".format(submission_filename))

Файл посылки сохранен, как: ../data_folder/submission_2023-04-14_11-11-28.csv
