In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from sklearn.metrics import roc_auc_score
import seaborn as sns
import scipy as sp

%matplotlib inline

In [41]:
df_train = pd.read_csv('main_train.csv')
df_test = pd.read_csv('main_test.csv')

In [42]:
df_train.head()

Unnamed: 0,match_id,radiant,hero,gold_0,lh_0,xp_0,gold_60,lh_60,xp_60,gold_120,...,xp_600,level_180,level_240,level_300,level_360,level_420,level_480,level_540,level_600,radiant_win
0,0,1,Rubick,0.0,0.0,0.0,100.0,0.0,46.0,250.0,...,1741.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0,1,Wraith King,0.0,0.0,0.0,175.0,2.0,124.0,526.0,...,2319.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1
2,0,1,Riki,0.0,0.0,0.0,137.0,1.0,93.0,439.0,...,3859.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1
3,0,1,Tusk,0.0,0.0,0.0,100.0,0.0,62.0,200.0,...,1676.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,0,1,Templar Assassin,0.0,0.0,0.0,320.0,3.0,352.0,668.0,...,4453.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1


Для начала посмотрим, существуют ли в предполагаемо важных признаках пропуски

In [43]:
pd.isnull(df_train).sum()

match_id        0
radiant         0
hero           29
gold_0          0
lh_0            0
xp_0            0
gold_60         0
lh_60           0
xp_60           0
gold_120        0
lh_120          0
xp_120          0
gold_180        0
lh_180          0
xp_180          0
gold_240        0
lh_240          0
xp_240          0
gold_300        0
lh_300          0
xp_300          0
gold_360        0
lh_360          0
xp_360          0
gold_420        0
lh_420          0
xp_420          0
gold_480        0
lh_480          0
xp_480          0
gold_540        0
lh_540          0
xp_540          0
gold_600        0
lh_600          0
xp_600          0
level_180      50
level_240      50
level_300      50
level_360      50
level_420      50
level_480      50
level_540      50
level_600      50
radiant_win     0
dtype: int64

Как видим, существует 29 пропущенных значений в признаке 'hero'. Заменим NaN на 'Zero'

#### Первичная предобработка

In [44]:
df_train['hero'] = df_train['hero'].fillna('Zero')
df_test['hero'] = df_test['hero'].fillna('Zero')

Далее ввиду особенностей формирования объектов в датафрейме (идут игроки Radiant для 1-го матча, затем 5 игроков ) мы создадим командные векторы признаков

In [45]:
def convert_to_team(player_values):
    team_values = player_values.values.reshape(-1,5)
    team_values.sort(axis=1)
    return team_values

In [46]:
team_gold_600_train = convert_to_team(df_train['gold_600'])
team_lh_600_train = convert_to_team(df_train['lh_600'])
team_xp_600_train = convert_to_team(df_train['xp_600'])

team_gold_600_test = convert_to_team(df_test['gold_600'])
team_lh_600_test = convert_to_team(df_test['lh_600'])
team_xp_600_test = convert_to_team(df_test['xp_600'])

In [47]:
team_gold_600_train[1]

array([ 1332.,  1386.,  2503.,  2804.,  3581.])

Теперь сконкатенируем все полученные векторы в train и test выборки

In [48]:
X_train = np.array(np.vstack([team_gold_600_train.T, team_lh_600_train.T, team_xp_600_train.T])).T
X_test = np.array(np.vstack([team_gold_600_test.T, team_lh_600_test.T, team_xp_600_test.T])).T

#### Стандартизация

In [49]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
X_train.shape

(79918, 15)

Теперь X_train есть датафрейм, в котором 15 признаков и объекты построены для обеих команд в общем. Сделаем датафрейм для анализа обеих команд сепаратно

In [51]:
X_train = X_train.reshape(-1, 30)
X_test = X_test.reshape(-1, 30)

Добавим вектор, определяющий персонажей, которые были выбраны для игры в каждой команде. Сделаем дамми-переменные, поскольку данный признак является категориальным

In [52]:
heroes_train = np.matrix(pd.get_dummies(df_train['hero']), dtype='int')
heroes_test = np.matrix(pd.get_dummies(df_test['hero']), dtype='int')

In [53]:
heroes_train.shape

(399590, 111)

Чтобы обеспечить корректное соответствие героев в каждой команде в каждом матче, проведем следующие манипуляции

In [54]:
team_heroes_train = heroes_train[::5]+heroes_train[1::5]+heroes_train[2::5]+heroes_train[3::5]+heroes_train[4::5]
team_heroes_test = heroes_test[::5]+heroes_test[1::5]+heroes_test[2::5]+heroes_test[3::5]+heroes_test[4::5]

In [55]:
hero_team_train = np.array(np.subtract(np.matrix(team_heroes_train, dtype='int')[::2], np.matrix(team_heroes_train, dtype='int')[1::2]))
hero_team_test = np.array(np.subtract(np.matrix(team_heroes_test, dtype='int')[::2], np.matrix(team_heroes_test, dtype='int')[1::2]))

Наконец, соединим получившиеся векторы, чтобы сформировать конечный X_train и X_test

In [56]:
X_train = np.vstack([X_train.T, hero_team_train.T]).T
X_test = np.vstack([X_test.T, hero_team_test.T]).T

Выделим вектор целевой переменной

In [57]:
y_train = np.array(df_train.radiant_win[::10])

### Логистическая регрессия

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [59]:
from sklearn.model_selection import GridSearchCV

Подберем оптимальные параметры для используемой модели

In [73]:
param_test1 = {
 'penalty':['l1', 'l2'],
 'C':np.arange(0.1,3,0.5),}

gsearch1 = GridSearchCV(estimator = LogisticRegression(random_state = 40), 
 param_grid = param_test1, scoring='roc_auc',iid=False, cv=10)

gsearch1.fit(X_train,y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.76089, std: 0.00720, params: {'penalty': 'l1', 'C': 0.10000000000000001},
  mean: 0.76121, std: 0.00719, params: {'penalty': 'l2', 'C': 0.10000000000000001},
  mean: 0.76121, std: 0.00718, params: {'penalty': 'l1', 'C': 0.59999999999999998},
  mean: 0.76124, std: 0.00717, params: {'penalty': 'l2', 'C': 0.59999999999999998},
  mean: 0.76122, std: 0.00719, params: {'penalty': 'l1', 'C': 1.1000000000000001},
  mean: 0.76124, std: 0.00718, params: {'penalty': 'l2', 'C': 1.1000000000000001},
  mean: 0.76123, std: 0.00719, params: {'penalty': 'l1', 'C': 1.6000000000000001},
  mean: 0.76123, std: 0.00718, params: {'penalty': 'l2', 'C': 1.6000000000000001},
  mean: 0.76123, std: 0.00719, params: {'penalty': 'l1', 'C': 2.1000000000000001},
  mean: 0.76123, std: 0.00719, params: {'penalty': 'l2', 'C': 2.1000000000000001},
  mean: 0.76123, std: 0.00719, params: {'penalty': 'l1', 'C': 2.6000000000000001},
  mean: 0.76123, std: 0.00719, params: {'penalty': 'l2', 'C': 2.6000000000000001}],

In [74]:
clf = LogisticRegression(random_state = 40, C = 0.6, penalty = 'l2')
clf.fit(X_train, y_train)

predictions = clf.predict_proba(X_test)

In [75]:
indeces = np.array(df_test.match_id.values[::10], dtype=int)

submission = pd.DataFrame()
submission['index'] = indeces
submission['proba'] = predictions[:, 1]

submission.to_csv('8th step.csv', index=False)