# Improvement: separate classifiers for lobby 1 and  lobbies 0, 7 (teaching lobbies 0, 7 on all data)
Kaggle score: 0.75784

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import scale

from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

from sklearn.metrics import roc_auc_score

# The function that cleans data

In [4]:
# Input: pandas dataframe, Output: numpy matrix
def clean_data(features) :
    
    features.drop('lobby_type', axis = 1, inplace = True)
    
    # Time
    min_time = min(features['start_time'])
    features['start_time'] = features['start_time'] - min(features['start_time'])
    
    # NaN's: categorical
    categorical_features = ['first_blood_player1', 'first_blood_player2',
                            'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                            'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
    for categorical_feature in categorical_features :
        most_popular = features[categorical_feature].dropna().value_counts().idxmax()
        features[categorical_feature].fillna(most_popular, inplace = True)
        
    # NaN's: numerical
    time_features = ['first_blood_time',
                     'radiant_bottle_time', 'radiant_courier_time', 'radiant_flying_courier_time', 'radiant_first_ward_time',
                     'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time', 'dire_first_ward_time']
    
    numerical_features = []
    for feature in features :
        if (feature not in categorical_features) :
            numerical_features.append(feature)
            
    for numerical_feature in numerical_features :
        if (numerical_feature in time_features) :
            features[numerical_feature].fillna(300, inplace = True)
        else :
            median = features[numerical_feature].dropna().median()
            features[numerical_feature].fillna(median, inplace = True)
    
    # Scaling numerical features
    X_numerical_scaled = scale(features[numerical_features].as_matrix())
    
    # How many heros are in the game?
    hero_features = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                     'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
    n_heroes = [max(features_train[x].value_counts().index) for x in hero_features]
    N = max(n_heroes)
      
    # Using bag of words for categorical features
    X_pick = np.zeros((features.shape[0], N))
    for i, match_id in enumerate(features.index):
        for p in xrange(5):
            X_pick[i, features.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
            X_pick[i, features.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1
    
    return np.hstack((X_numerical_scaled, X_pick))

# Getting and preparing data

In [5]:
features_train = pd.read_csv('./data/features.csv', index_col='match_id')
features_test  = pd.read_csv('./data/features_test.csv', index_col='match_id')

In [6]:
target_train = features_train['radiant_win']

In [7]:
features_train.drop(['duration', 'radiant_win', 'tower_status_radiant','tower_status_dire',
                     'barracks_status_radiant', 'barracks_status_dire'], axis = 1, inplace = True)

In [8]:
features_train['lobby_type'].value_counts()

1    55962
7    28550
0    12718
Name: lobby_type, dtype: int64

In [9]:
features_train_lobby1  = features_train[features_train['lobby_type'] == 1].copy()
features_train_lobby07 = features_train.copy() # Train on all data
1
features_test_lobby1  = features_test[features_test['lobby_type'] == 1].copy()
features_test_lobby07 = features_test[features_test['lobby_type'] != 1].copy()

In [10]:
y_train_lobby1  = target_train[features_train['lobby_type'] == 1].as_matrix()
y_train_lobby07 = target_train.as_matrix()

In [11]:
X_train_lobby1  = clean_data(features_train_lobby1)
X_train_lobby07 = clean_data(features_train_lobby07)

X_test_lobby1  = clean_data(features_test_lobby1)
X_test_lobby07 = clean_data(features_test_lobby07)



# Creating and testing logistic regression model

In [12]:
clf_lobby1 = LogisticRegression(penalty = 'l2', C = 0.1, random_state = 42, n_jobs = -1)
clf_lobby1.fit(X_train_lobby1, y_train_lobby1)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
clf_lobby07 = LogisticRegression(penalty = 'l2', C = 0.1, random_state = 42, n_jobs = -1)
clf_lobby07.fit(X_train_lobby07, y_train_lobby07)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
kfold = KFold(len(y_train_lobby1), n_folds = 5, shuffle = True, random_state = 42) 
score = cross_val_score(estimator = clf_lobby1,
                        X = X_train_lobby1,
                        y = y_train_lobby1,
                        cv = kfold,
                        scoring = 'roc_auc',
                        n_jobs = -1,
                        verbose = True)

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.8s finished


In [15]:
score.mean()

0.77379316749930427

In [16]:
kfold = KFold(len(y_train_lobby07), n_folds = 5, shuffle = True, random_state = 42) 
score = cross_val_score(estimator = clf_lobby07,
                        X = X_train_lobby07,
                        y = y_train_lobby07,
                        cv = kfold,
                        scoring = 'roc_auc',
                        n_jobs = -1,
                        verbose = True)

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   17.8s finished


In [17]:
score.mean()

0.75240492206330423

In [18]:
predictions_lobby1 = clf_lobby1.predict_proba(X_test_lobby1)[:, 1]
predictions_lobby1_df = pd.DataFrame({'match_id'    : features_test_lobby1.index,
                                      'radiant_win' : predictions_lobby1
                                     })

In [19]:
predictions_lobby07 = clf_lobby07.predict_proba(X_test_lobby07)[:, 1]
predictions_lobby07_df = pd.DataFrame({'match_id'   : features_test_lobby07.index,
                                      'radiant_win' : predictions_lobby07
                                      })

In [20]:
kaggle_answer = pd.concat([predictions_lobby1_df, predictions_lobby07_df])

In [21]:
kaggle_answer.sort_values(by = ['match_id'], inplace = True)

In [22]:
#kaggle_answer.to_csv('kaggle_answers/4.csv', index = False)

In [50]:
features_coefs = zip(clf_lobby1.coef_[0], features_train.columns)

In [51]:
features_coefs.sort(key = lambda x: abs(x[0]), reverse = True)

In [52]:
features_coefs

[(-0.55642323515149605, 'radiant_boots_count'),
 (-0.53867233483257715, 'dire_first_ward_time'),
 (-0.32990093485362437, 'd4_level'),
 (-0.29008170787401483, 'd3_xp'),
 (0.27804161121485454, 'dire_courier_time'),
 (-0.27545160235999105, 'd2_gold'),
 (0.27005482601856962, 'r3_hero'),
 (-0.25786378407602412, 'd1_lh'),
 (0.25750783505973068, 'r2_level'),
 (0.2438931229771602, 'r4_deaths'),
 (-0.24005024921229273, 'r5_lh'),
 (0.23312243909613062, 'r3_items'),
 (0.22132828149415951, 'r1_level'),
 (-0.21436871631458013, 'r5_kills'),
 (0.21397793537673052, 'r1_xp'),
 (0.20860004031564841, 'dire_boots_count'),
 (0.20407811786996036, 'r3_deaths'),
 (-0.19818184897124899, 'radiant_ward_observer_count'),
 (0.19745633826126413, 'r2_hero'),
 (0.19365543390918322, 'r4_kills'),
 (0.18231110276091611, 'dire_flying_courier_time'),
 (-0.17561249287202182, 'd3_level'),
 (-0.17204719479389502, 'radiant_ward_sentry_count'),
 (-0.16972302697322167, 'd1_gold'),
 (0.16895169652604761, 'dire_bottle_time'),
 (0