# One more thing

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pylab as pl
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import KFold
import time
from functools import partial
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from scipy.optimize import minimize_scalar
%matplotlib inline

In [None]:
# Load the training and test feature tables, both indexed by match id.
features, features_test = (
    pd.read_csv(csv_path, index_col='match_id')
    for csv_path in ('./features.csv', './features_test.csv')
)
# Keep independent copies of both tables next to the working frames.
features_copy, features_test_copy = features.copy(), features_test.copy()
features_test.head()

## Подход 1: градиентный бустинг "в лоб"

### Удаляем столбцы, связанные с итогами матча

In [None]:
# Columns tied to the outcome of a match; they are removed from the training
# frame before any model is fitted.
MATCH_RESULT_COLUMNS = [
    "tower_status_radiant",
    "tower_status_dire",
    "barracks_status_radiant",
    "barracks_status_dire",
    "duration",
]
# Derive the working frame from the untouched copy instead of dropping in
# place: an in-place drop raises KeyError when this cell is run a second
# time, because the columns are already gone.
features = features_copy.drop(MATCH_RESULT_COLUMNS, axis=1)

### Проверяем наличие пропусков

In [None]:
def get_columns_with_na(df):
    """Return the labels of the columns of ``df`` that hold missing values.

    The labels come back as a NumPy array, in the frame's column order.
    """
    na_counts = df.isnull().sum()
    has_na = na_counts.values > 0
    return na_counts.index.values[has_na]

# Show which columns of the training frame contain missing values.
print(get_columns_with_na(features))

>  Запишите названия признаков, имеющих пропуски, и попробуйте для любых двух из них дать обоснование, почему их значения могут быть пропущены

* `first_blood_player1` - за первые пять минут матча событие «первая кровь» могло не произойти (никто никого не убил), поэтому игрок, причастный к нему, не определён
* `dire_bottle_time` - dire не покупали bottle за первые пять минут

### Заполняем пропуски

In [None]:
def fill_na_calc_jumps(df, method, columns):
    """Compute the per-column value used to replace missing entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the statistics are taken from.
    method : {"zero", "big", "small"}
        "zero"  -> 0 for every column;
        "big"   -> column maximum plus five standard deviations;
        "small" -> column minimum minus five standard deviations.
    columns : iterable of column labels
        Columns to compute a fill value for.

    Returns
    -------
    pandas.Series
        Float series indexed by the requested column labels.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    fillers = {
        "big": lambda values: values.max() + 5 * values.std(),
        "small": lambda values: values.min() - 5 * values.std(),
        # A literal zero: multiplying column statistics by a False flag would
        # turn a NaN max/std (e.g. a column with a single valid value) into a
        # NaN fill value.
        "zero": lambda values: 0.0,
    }
    if method not in fillers:
        raise ValueError(
            "unknown fill method %r; expected one of %s" % (method, sorted(fillers))
        )
    filler = fillers[method]
    # Build the series in one go; dtype is pinned so an empty result is float too.
    return pd.Series({column: filler(df[column]) for column in columns}, dtype=float)

def fill_na(df, method, jumps=None):
    """Replace missing values in ``df`` column by column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. Its columns are reassigned, so the caller's frame is
        modified; the same object is also returned.
    method : str
        One of "zero", "big" or "small"; forwarded to ``fill_na_calc_jumps``.
    jumps : pandas.Series, optional
        Precomputed fill values indexed by column label (for example the ones
        returned for another frame). Anything that is not a Series is ignored
        and the fill values are computed from ``df``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The filled frame and the fill values that were used.
    """
    columns = get_columns_with_na(df)
    if not isinstance(jumps, pd.Series):
        jumps = fill_na_calc_jumps(df, method, columns)
    else:
        # The supplied values may not cover every column that has gaps in
        # this frame; compute the missing ones here instead of failing with
        # a KeyError in the loop below.
        uncovered = [column for column in columns if column not in jumps.index]
        if uncovered:
            jumps = pd.concat([jumps, fill_na_calc_jumps(df, method, uncovered)])
    for column in columns:
        df[column] = df[column].fillna(jumps[column])
    return df, jumps

In [None]:
# Fill the gaps of the training frame with zeros, then pass the resulting fill
# values on when filling the test frame (leftover debug print removed).
features, f_jumps = fill_na(features, "zero")
features_test, ft_jumps = fill_na(features_test, "zero", f_jumps)

### Целевая переменная

In [None]:
# Name of the training-frame column that holds the label to predict.
target = "radiant_win"

def get_train_test(df_train, df_test, target_column):
    """Turn the train/test frames into NumPy arrays.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain ``target_column``.
    df_test : pandas.DataFrame
        Test frame; its columns must equal the training columns without the
        target, in the same order.
    target_column : column label
        Column of ``df_train`` holding the target.

    Returns
    -------
    (X_train, y_train, X_test) : tuple of numpy.ndarray

    Raises
    ------
    ValueError
        If ``target_column`` is not a column of ``df_train``.
    AssertionError
        If the feature columns of the two frames differ.
    """
    train_columns = list(df_train.columns.values)
    train_columns.remove(target_column)
    assert train_columns == list(df_test.columns.values), \
        "train and test frames must have identical feature columns"
    # `.values` instead of `.as_matrix()`: the latter was removed from pandas.
    X_train = df_train[train_columns].values
    X_test = df_test[train_columns].values
    y_train = df_train[target_column].values
    return X_train, y_train, X_test
    
# Build the training matrix, training labels and test matrix from the filled frames.
X_train, y_train, X_test = get_train_test(features, features_test, target)

### Обучаем градиентный бустинг

In [None]:
def test_boosting(X_train, y_train, X_test, ntrees, bins=5, learning_rate=0.1,
                  random_state=None):
    """Cross-validate a gradient boosting classifier and return the mean ROC AUC.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and labels; rows are selected by fold indices.
    X_test : numpy.ndarray
        Not used by this function; kept so existing calls keep working.
    ntrees : int
        Number of boosting stages (``n_estimators``).
    bins : int
        Number of cross-validation folds.
    learning_rate : float
        Learning rate passed to the classifier.
    random_state : int or None
        Seed for both the fold shuffling and the classifier. The default
        ``None`` keeps the previous unseeded behaviour; pass an integer to
        make the score reproducible.

    Returns
    -------
    float
        ROC AUC averaged over the folds.
    """
    split = KFold(X_train.shape[0], bins, shuffle=True, random_state=random_state)
    clf = GradientBoostingClassifier(n_estimators=ntrees, verbose=False,
                                     learning_rate=learning_rate,
                                     random_state=random_state)
    scores = []
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        # Probability of the positive class for the held-out fold.
        pred = clf.predict_proba(X_train[itest])[:, 1]
        scores.append(roc_auc_score(y_train[itest], pred))
    return sum(scores)/len(scores)

# Cross-validate boosting with 10, 15, 20, 25 and 30 trees, timing every run
# with the IPython %time magic and printing the resulting score.
for trees in range(10, 31, 5):
    %time score = test_boosting(X_train, y_train, X_test, trees)
    print("trees = %d, score = %f" %(trees, score))

### Отчет

1. [Пропуски и что они могут означать](#Проверяем-наличие-пропусков)
2. [Целевая переменная](#Целевая-переменная)
3. [Время кросс-валидации для 30-ти деревьев (внизу) и соответствующее качество](#Обучаем-градиентный-бустинг)
4. Согласно оценкам качества (AUC-ROC) из предыдущего пункта, наращивание количества деревьев мало влияет на результат

## Логистическая регрессия

### Преобразуем признаки

In [56]:
# Fit the scaler on the training matrix and rebind X_train to its scaled version.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [57]:
# Transform the test matrix with the scaler fitted on the training matrix.
X_test = scaler.transform(X_test)

In [96]:
# Fold assignment shared by every call to test_logregr, so that different
# values of C are scored on the same folds. It is seeded so the scores are
# reproducible between kernel runs.
# NOTE(review): relies on the legacy sklearn.cross_validation.KFold object
# yielding the same folds each time it is iterated - confirm.
split = KFold(X_train.shape[0], 5, shuffle=True, random_state=42)
def test_logregr(X_train, y_train, X_test, C=1.0, bins=5):
    """Cross-validate an L2 logistic regression and return the mean ROC AUC.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and labels; rows are selected by fold indices.
    X_test : numpy.ndarray
        Not used by this function; kept so existing calls keep working.
    C : float
        Inverse regularization strength passed to ``LogisticRegression``.
    bins : int
        Not used: the folds come from the module-level ``split``. Kept for
        interface compatibility.

    Returns
    -------
    float
        ROC AUC averaged over the folds of the module-level ``split``.
    """
    clf = LogisticRegression(penalty='l2', C=C, n_jobs=-1)
    scores = []
    for itrain, itest in split:
        clf.fit(X_train[itrain], y_train[itrain])
        # Probability of the positive class for the held-out fold.
        pred = clf.predict_proba(X_train[itest])[:, 1]
        scores.append(roc_auc_score(y_train[itest], pred))
    return sum(scores)/len(scores)

def _test_logregr(C):
    """Objective for the scalar minimizer: the negated cross-validated score at ``C``.

    The probed ``C`` is printed so the search can be followed in the cell output.
    """
    print(C)
    cv_score = test_logregr(X_train, y_train, X_test, C)
    return -1.0 * cv_score

# Bounded scalar search over C in [0.1, 10]; minimizing the negated score
# returned by _test_logregr is the same as maximizing the cross-validated score.
solution = minimize_scalar(_test_logregr, bounds=(0.1, 10), method='bounded')

3.88146351138
6.21853648862
2.43707297725
4.13554256921
3.88949892758
3.88634905452
3.98347923597
3.92539621111
3.90321046978
3.89473627066
3.89149941462
3.88829578313
3.89026304563
3.8898595715
3.88967924954
3.88961037268
3.88956780444
3.88954149582
3.8895252362
3.8895151872
3.88950897658
3.8895051382
