# Dota 2: Winner Prediction
The task is to predict the winner (either **Dire** or **Radiant** team) based on data from the first 5 minutes of the match.

In [1]:
import numpy as np
import pandas as pd

from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [2]:
# Read the training and test feature tables from disk; in both frames the
# 'match_id' column is used as the row index.
train = pd.read_csv('features.csv', index_col='match_id')
test = pd.read_csv('features_test.csv', index_col='match_id')

In [3]:
# Shared building blocks for every model below: a seeded, shuffled 5-fold
# splitter, a standard scaler, and the target vector.
cv = KFold(n_splits=5, shuffle=True, random_state=173)
scaler = StandardScaler()

# Select the target by name instead of by position 103: if the column layout
# of the CSV ever changes, this raises a KeyError rather than silently
# training against a different column.
y = train['radiant_win']

# Report the name actually carried by the selected Series, not a hardcoded one.
print(f'target column: {y.name}')

target column: radiant_win


In [4]:
# Count NaNs per column, keep only the columns that really have gaps, report
# them with an explanation for two of them, then replace every gap in `train`
# with 0 in place (better for logistic regression).
na_per_column = train.isna().sum()
missing = na_per_column[na_per_column > 0]

print(f'''missing values in data by columns:
{missing}

explanation for 2 columns with missing values:
1. in 'radiant_bottle_time': probably, in 15691 matches the 'bottle' item was
   never purchased by the Radiant team.
2. in 'dire_first_ward_time': presumably, in 1826 matches the Dire team never
   planted the Ward.''')

train.fillna(0, inplace=True)

missing values in data by columns:
first_blood_time               19553
first_blood_team               19553
first_blood_player1            19553
first_blood_player2            43987
radiant_bottle_time            15691
radiant_courier_time             692
radiant_flying_courier_time    27479
radiant_first_ward_time         1836
dire_bottle_time               16143
dire_courier_time                676
dire_flying_courier_time       26098
dire_first_ward_time            1826
dtype: int64

explanation for 2 columns with missing values:
1. in 'radiant_bottle_time': probably, in 15691 matches the 'bottle' item was
   never purchased by the Radiant team.
2. in 'dire_first_ward_time': presumably, in 1826 matches the Dire team never
   planted the Ward.


#### Method 1: gradient boosting

In [5]:
# Grid-search a gradient-boosting classifier over learning rate and number of
# trees, scoring every combination by cross-validated ROC AUC on the shared
# `cv` splitter.
gb = GradientBoostingClassifier(random_state=173)
grid = {
    'learning_rate': [1, 0.5, 0.3, 0.2, 0.1],
    'n_estimators': np.arange(10, 60, 10),
}
gs = GridSearchCV(gb, grid, scoring='roc_auc', n_jobs=-1, cv=cv)

# Only the first 102 columns are used as features.
gs.fit(train.iloc[:, :102], y)

# Pull out the row of the search results that corresponds to 30 trees at the
# best learning rate found by the search.
gs_results = pd.DataFrame(gs.cv_results_)
optimal_rate = gs.best_params_['learning_rate']
at_best_rate = gs_results['param_learning_rate'] == optimal_rate
with_30_trees = gs_results['param_n_estimators'] == 30
_30_trees = gs_results.loc[at_best_rate & with_30_trees]

# completing the task: mean fit time and mean CV score for that configuration
fit_time_30 = _30_trees['mean_fit_time'].iloc[0]
score_30 = _30_trees['mean_test_score'].iloc[0]
print(f'''number of trees: 30
learning rate: {optimal_rate}
time: {fit_time_30}
score: {score_30}
''')

number of trees: 30
learning rate: 0.5
time: 66.22533183097839
score: 0.7034352666054644



#### Method 2: logistic regression

In [6]:
# Candidate values of C: a 0.01-step grid below 0.1 followed by a 0.05-step
# grid that runs up to about 1.
Cs = np.concatenate((np.arange(.01, .1, .01), np.arange(.1, 1.05, .05)))

# Logistic regression that picks C itself by cross-validated ROC AUC on the
# shared `cv` splitter.
logreg = LogisticRegressionCV(
    Cs=Cs, cv=cv, scoring='roc_auc',
    solver='saga', n_jobs=-1, random_state=173,
)

# The scaler is (re)fitted on the first 102 columns before training.
X_scaled = scaler.fit_transform(train.iloc[:, :102])
logreg.fit(X_scaled, y)

# completing the task: average the per-fold scores for class 1 over the folds
# and report the best value across the C grid
mean_score_per_C = logreg.scores_[1].mean(axis=0)
print(f'''l2-regulator: {logreg.C_}
time: <= 10 seconds
score: {mean_score_per_C.max()}''')

l2-regulator: [0.01]
time: <= 10 seconds
score: 0.7164074592058913


#### Method 3: logistic regression without categories

In [7]:
# Columns treated as categorical: the ten hero columns (r1..r5, d1..d5)
# followed by 'lobby_type'.
categories = [
    f'{team}{slot}_hero'
    for team in ('r', 'd')
    for slot in range(1, 6)
] + ['lobby_type']

# Drop those columns from the first 102 columns, refit the scaler on what is
# left and retrain the same cross-validated logistic regression.
X_no_cat = train.iloc[:, :102].drop(columns=categories)
X_no_cat_scaled = scaler.fit_transform(X_no_cat)
logreg.fit(X_no_cat_scaled, y)

# completing the task: best fold-averaged score for class 1 across the C grid
mean_score_per_C = logreg.scores_[1].mean(axis=0)
print(f'''l2-regulator: {logreg.C_}
time: <= 10 seconds
score: {mean_score_per_C.max()}''')

l2-regulator: [0.01]
time: <= 10 seconds
score: 0.7164602235577593


#### Method 4: logistic regression with dummy coding

In [8]:
# Hero ids are used as 1-based column positions in the bag, so the bag width
# is the largest hero id seen in training (not the count of distinct ids).
# Only the hero columns are inspected: `categories` also holds 'lobby_type',
# which is not a hero id and must not influence the width.
hero_columns = [name for name in categories if name.endswith('_hero')]
n_players = int(train[hero_columns].to_numpy().max())

# Bag of words over heroes: +1 where Radiant has the hero, -1 where Dire has
# it, 0 everywhere else. The array has to start zeroed -- np.empty would
# leave arbitrary memory in every cell that is never written below.
bag = np.zeros((train.shape[0], n_players), dtype=int)

# Vectorised fill: one fancy-indexed assignment per player slot instead of a
# Python loop over every match. The per-row write order (r1, d1, r2, d2, ...)
# is the same as in a row-by-row loop.
row_idx = np.arange(train.shape[0])
for p in range(1, 6):
    bag[row_idx, train[f'r{p}_hero'].to_numpy(dtype=int) - 1] = 1
    bag[row_idx, train[f'd{p}_hero'].to_numpy(dtype=int) - 1] = -1

# Train on the scaled non-categorical features with the hero bag appended.
logreg.fit(np.hstack((X_no_cat_scaled, bag)), y)

# completing the task: providing info on cv
print(f'''l2-regulator: {logreg.C_}
time: <= 20 seconds
score: {logreg.scores_[1].mean(axis=0).max()}''')

l2-regulator: [0.04]
time: <= 20 seconds
score: 0.7519508176437737


In [9]:
# Preprocess the test data the same way as the training data: fill gaps with
# zeroes, build the hero bag, scale the non-categorical columns with the
# already-fitted scaler and append the bag.
test.fillna(value=0,
            inplace=True)

# The bag must start zeroed: np.empty would leave arbitrary memory in every
# cell that is not explicitly set to +1 / -1 below, corrupting the features.
test_bag = np.zeros((test.shape[0], n_players))

# One fancy-indexed assignment per player slot; hero ids are 1-based, hence
# the -1 when turning them into column positions.
test_row_idx = np.arange(test.shape[0])
for p in range(1, 6):
    test_bag[test_row_idx, test[f'r{p}_hero'].to_numpy(dtype=int) - 1] = 1
    test_bag[test_row_idx, test[f'd{p}_hero'].to_numpy(dtype=int) - 1] = -1

X_test_bag = np.hstack((
    scaler.transform(test.drop(categories, axis=1)),
    test_bag
))

# final stage: prediction -- column 1 of predict_proba is the probability of
# class 1
answer = logreg.predict_proba(X_test_bag)[:, 1]
print(f'''most probable Radiant win: {answer.max()}
most probable Dire win: {answer.min()}''')

most probable Radiant win: 0.9962849889082319
most probable Dire win: 0.008395380987806138
