# Improvement: own classifier for each lobby

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import scale

from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score

from sklearn.metrics import roc_auc_score

# The function that cleans data

In [4]:
# Input: pandas dataframe, Output: numpy matrix
def clean_data(features) :
    
    features.drop('lobby_type', axis = 1, inplace = True)
    
    # Time
    min_time = min(features['start_time'])
    features['start_time'] = features['start_time'] - min(features['start_time'])
    
    # NaN's: categorical
    categorical_features = ['first_blood_player1', 'first_blood_player2',
                            'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                            'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
    for categorical_feature in categorical_features :
        most_popular = features[categorical_feature].dropna().value_counts().idxmax()
        features[categorical_feature].fillna(most_popular, inplace = True)
        
    # NaN's: numerical
    time_features = ['first_blood_time',
                     'radiant_bottle_time', 'radiant_courier_time', 'radiant_flying_courier_time', 'radiant_first_ward_time',
                     'dire_bottle_time', 'dire_courier_time', 'dire_flying_courier_time', 'dire_first_ward_time']
    
    numerical_features = []
    for feature in features :
        if (feature not in categorical_features) :
            numerical_features.append(feature)
            
    for numerical_feature in numerical_features :
        if (numerical_feature in time_features) :
            features[numerical_feature].fillna(300, inplace = True)
        else :
            median = features[numerical_feature].dropna().median()
            features[numerical_feature].fillna(median, inplace = True)
    
    # Scaling numerical features
    X_numerical_scaled = scale(features[numerical_features].as_matrix())
    
    # How many heros are in the game?
    hero_features = ['r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero',
                     'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero']
    n_heroes = [max(features_train[x].value_counts().index) for x in hero_features]
    N = max(n_heroes)
      
    # Using bag of words for categorical features
    X_pick = np.zeros((features.shape[0], N))
    for i, match_id in enumerate(features.index):
        for p in xrange(5):
            X_pick[i, features.ix[match_id, 'r%d_hero' % (p + 1)] - 1] = 1
            X_pick[i, features.ix[match_id, 'd%d_hero' % (p + 1)] - 1] = -1
    
    return np.hstack((X_numerical_scaled, X_pick))

# Getting and preparing data

In [5]:
features_train = pd.read_csv('./data/features.csv', index_col='match_id')
features_test  = pd.read_csv('./data/features_test.csv', index_col='match_id')

In [6]:
target_train = features_train['radiant_win']

In [7]:
features_train.drop(['duration', 'radiant_win', 'tower_status_radiant','tower_status_dire',
                     'barracks_status_radiant', 'barracks_status_dire'], axis = 1, inplace = True)

In [8]:
features_train['lobby_type'].value_counts()

1    55962
7    28550
0    12718
Name: lobby_type, dtype: int64

In [9]:
features_train_lobby0 = features_train[features_train['lobby_type'] == 0]
features_train_lobby1 = features_train[features_train['lobby_type'] == 1]
features_train_lobby7 = features_train[features_train['lobby_type'] == 7]

features_test_lobby0 = features_test[features_test['lobby_type'] == 0]
features_test_lobby1 = features_test[features_test['lobby_type'] == 1]
features_test_lobby7 = features_test[features_test['lobby_type'] == 7]

In [10]:
X_train_lobby0 = clean_data(features_train_lobby0)
X_train_lobby1 = clean_data(features_train_lobby1)
X_train_lobby7 = clean_data(features_train_lobby7)

X_test_lobby0 = clean_data(features_test_lobby0)
X_test_lobby1 = clean_data(features_test_lobby1)
X_test_lobby7 = clean_data(features_test_lobby7)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [11]:
y_train_lobby0 = target_train[features_train['lobby_type'] == 0].as_matrix()
y_train_lobby1 = target_train[features_train['lobby_type'] == 1].as_matrix()
y_train_lobby7 = target_train[features_train['lobby_type'] == 7].as_matrix()

# Creating and testing logistic regression model

In [12]:
clf_lobby0 = LogisticRegression(penalty = 'l2', C = 0.1, random_state = 42, n_jobs = -1)
clf_lobby0.fit(X_train_lobby0, y_train_lobby0)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
clf_lobby1 = LogisticRegression(penalty = 'l2', C = 0.1, random_state = 42, n_jobs = -1)
clf_lobby1.fit(X_train_lobby1, y_train_lobby1)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
clf_lobby7 = LogisticRegression(penalty = 'l2', C = 0.1, random_state = 42, n_jobs = -1)
clf_lobby7.fit(X_train_lobby7, y_train_lobby7)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
predictions_lobby0 = clf_lobby0.predict_proba(X_test_lobby0)[:, 1]
predictions_lobby0_df = pd.DataFrame({'match_id'    : features_test_lobby0.index,
                                      'radiant_win' : predictions_lobby0
                                     })

In [16]:
predictions_lobby1 = clf_lobby1.predict_proba(X_test_lobby1)[:, 1]
predictions_lobby1_df = pd.DataFrame({'match_id'    : features_test_lobby1.index,
                                      'radiant_win' : predictions_lobby1
                                     })

In [17]:
predictions_lobby7 = clf_lobby7.predict_proba(X_test_lobby7)[:, 1]
predictions_lobby7_df = pd.DataFrame({'match_id'    : features_test_lobby7.index,
                                      'radiant_win' : predictions_lobby7
                                     })

In [18]:
kaggle_answer = pd.concat([predictions_lobby0_df, predictions_lobby1_df, predictions_lobby7_df])

In [19]:
kaggle_answer.sort_values(by = ['match_id'], inplace = True)

In [20]:
kaggle_answer.to_csv('kaggle_answers/2.csv', index = False)