In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import time
import datetime
import warnings
warnings.filterwarnings('ignore')

## Подход 1: градиентный бустинг "в лоб"

In [2]:
features_train = pd.read_csv('C:\\Users\\kvanc\\coursera\\files\\features.csv', index_col = 'match_id')
features_test = pd.read_csv('C:\\Users\\kvanc\\coursera\\files\\features_test.csv', index_col = 'match_id')

In [3]:
target_train = features_train['radiant_win']

In [4]:
difference = features_train.columns.difference(features_test.columns.values.tolist()).tolist()
features_train.drop(difference, axis = 1, inplace = True)

In [5]:
skips = {}
for i in range(102):
    if features_train.iloc[:, i].count() != 97230:
        s = 97230 - features_train.iloc[:, i].count()
        skips[i] = s
all_skips = 0
for key, value in skips.items():
    all_skips += value
    print(features_train.columns.values[key])
print('all_skips = ', all_skips)

first_blood_time
first_blood_team
first_blood_player1
first_blood_player2
radiant_bottle_time
radiant_courier_time
radiant_flying_courier_time
radiant_first_ward_time
dire_bottle_time
dire_courier_time
dire_flying_courier_time
dire_first_ward_time
all_skips =  193087


In [7]:
features = features_train.fillna(0)

In [8]:
X = features.ix[:, :]

In [11]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 1)
scores = {}
for i in [10, 20, 30, 60, 70]:
    clf = GradientBoostingClassifier(n_estimators = i, random_state = 241)
    start_time = datetime.datetime.now()
    clf.fit(X, target_train)
    print('time elapsed:', datetime.datetime.now() - start_time)
    score = cross_val_score(clf, X, target_train, scoring = 'roc_auc', cv = kf)
    scores[i] = score.mean()
scores

time elapsed: 0:00:11.337704
time elapsed: 0:00:20.638656
time elapsed: 0:00:29.872240
time elapsed: 0:00:58.994814
time elapsed: 0:01:08.818893


{10: 0.66483292280491,
 20: 0.6821140369500348,
 30: 0.6896947542059906,
 60: 0.6998001572455899,
 70: 0.7018939073914252}

In [27]:
clf = GradientBoostingClassifier(n_estimators = 30, random_state = 241)
clf.fit(X, target_train)
start_time = datetime.datetime.now()
score = cross_val_score(clf, X, target_train, scoring = 'roc_auc', cv = kf).mean()
print('time elapsed:', datetime.datetime.now() - start_time)

time elapsed: 0:02:01.193987


## Подход 2: логистическая регрессия

In [12]:
scaler = StandardScaler().fit(features)

In [13]:
def log_scores(X, target):
    C_list = list(np.power(10.0, np.arange(-3, 3)))
    kf = KFold(n_splits = 5, shuffle = True, random_state = 1)
    scores = {}
    for C in C_list:
        clf = LogisticRegression(penalty = 'l2', C = C)
        start_time = datetime.datetime.now()
        score = cross_val_score(clf, scaler.transform(X), target, scoring = 'roc_auc', cv = kf)
        print('time elapsed:', datetime.datetime.now() - start_time)
        scores[C] = score.mean()
    best_score = 0
    for key, value in scores.items():
        if value > best_score:
            best_score = value
            best_C = key
    return best_C, best_score

In [24]:
def log_scores_2(X, target):
    C_list = list(np.power(10.0, np.arange(-3, 3)))
    kf = KFold(n_splits = 5, shuffle = True, random_state = 1)
    scores = {}
    for C in C_list:
        clf = LogisticRegression(penalty = 'l2', C = C)
        start_time = datetime.datetime.now()
        score = cross_val_score(clf, X, target, scoring = 'roc_auc', cv = kf)
        print('time elapsed:', datetime.datetime.now() - start_time)
        scores[C] = score.mean()
    best_score = 0
    for key, value in scores.items():
        if value > best_score:
            best_score = value
            best_C = key
    return best_C, best_score

In [14]:
log_scores(X, target_train)

time elapsed: 0:00:15.574311
time elapsed: 0:00:20.291152
time elapsed: 0:00:21.234645
time elapsed: 0:00:21.195530
time elapsed: 0:00:21.139779
time elapsed: 0:00:21.198743


(0.01, 0.7163757959125769)

In [15]:
new_features = pd.read_csv('C:\\Users\\kvanc\\coursera\\files\\features.csv', index_col = 'match_id')
new_features.drop(difference, axis = 1, inplace = True)
new_features = new_features.fillna(0)
new_features.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero',
                   'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis = 1, inplace = True)

In [16]:
new_X = new_features.ix[:, :]
scaler = StandardScaler().fit(new_features)

In [17]:
log_scores(new_X, target_train)

time elapsed: 0:00:13.743888
time elapsed: 0:00:18.488458
time elapsed: 0:00:19.482043
time elapsed: 0:00:19.631830
time elapsed: 0:00:19.481848
time elapsed: 0:00:19.451461


(0.01, 0.7164088702736156)

In [18]:
heroes = pd.Series()
for h in ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']:
    heroes = heroes.append(features[h])
N = heroes.max()

In [21]:
X_pick = np.zeros((features.shape[0], N))
for i, match_id in enumerate(features.index):
    for p in range(5):
        X_pick[i, features.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
        X_pick[i, features.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1
X_pick

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0., -1.]])

In [22]:
X_pick.shape

(97230, 112)

In [26]:
log_scores_2(np.hstack([scaler.transform(new_X), X_pick]), target_train)

time elapsed: 0:00:16.344432
time elapsed: 0:00:26.828072
time elapsed: 0:00:37.747380
time elapsed: 0:00:39.623838
time elapsed: 0:00:39.105853
time elapsed: 0:00:38.805176


(0.1, 0.7518731057809973)

In [29]:
features_test.fillna(0, inplace=True)
X_pick_test = np.zeros((features_test.shape[0], N))
for i, match_id in enumerate(features_test.index):
    for p in range(5):
        X_pick_test[i, features_test.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
        X_pick_test[i, features_test.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1
features_test.drop(['lobby_type', 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                    'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero'], axis=1, inplace=True)
scaler = StandardScaler().fit(new_features)
X_test = features_test.ix[:, :]

In [30]:
clf = LogisticRegression(C = 0.1)
clf.fit(np.hstack([scaler.transform(new_features), X_pick]), target_train)
scaler = StandardScaler().fit(features_test)

In [32]:
pred = clf.predict_proba(np.hstack((scaler.transform(features_test), X_pick_test)))[:, 1]

In [33]:
print(min(pred), max(pred))

0.008580592559245393 0.9964586771296121
