In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import GradientBoostingClassifier as GB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [2]:
# Read the train features, test features and train targets; every table is
# indexed by the match hash column.
dota_train, dota_test, dota_target = (
    pd.read_csv(csv_path, index_col='match_id_hash')
    for csv_path in (
        '../input/mlcourse-dota2-win-prediction/train_features.csv',
        '../input/mlcourse-dota2-win-prediction/test_features.csv',
        '../input/mlcourse-dota2-win-prediction/train_targets.csv',
    )
)

In [3]:
# Preview the first rows of the training feature table.
dota_train.head()

In [4]:
# Check how the hero id column is stored (it is used as an array index later).
dota_train.dtypes['r1_hero_id']

In [5]:
# Collect the feature names as a plain Python list and show them.
dota_train_columns = dota_train.columns.tolist()
print(dota_train_columns)

### Finding columns with missing values

In [6]:
# Share of missing values per column. `isnull().mean()` is the fraction of
# null rows; the `.2%` format turns that fraction into a real percentage
# (previously the raw 0-1 fraction was printed with a '%' sign).
missing_share = dota_train.isnull().mean()
for column, share in missing_share.items():
    print(f'{column}: {share:.2%} data is lost')
    if share > 0:
        # Also emit the quoted name so affected columns can be copy-pasted as a list.
        print(f"'{column}'", end=',')

As we see, we don't have any missing values for the training data.

### Target column

We predict which team wins, so the target label is the `radiant_win` column.

# Prediction with Gradient Boosting

In [7]:
# Target vector: the radiant_win column of the targets table.
y = dota_target['radiant_win']
# Shuffled 5-fold split. A fixed random_state makes the folds identical on
# every run, so the scores printed by the different experiments below are
# reproducible and comparable with each other.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [8]:
# Cross-validated ROC AUC of gradient boosting for a few ensemble sizes,
# printing the wall-clock time and the mean fold score for each size.
for n_trees in (10, 20, 30):
    booster = GB(n_estimators=n_trees)
    started_at = datetime.datetime.now()
    fold_scores = cross_val_score(booster, dota_train, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    print('Time elapsed:', datetime.datetime.now() - started_at)
    print(mean_score)

Now we need to use ROC AUC, and probably decrease the processing time.
Results of a plain cross-validation run:<br>
10 estimators: time elapsed 0:00:51.295467, Acc: 0.7337608087323305<br>
20 estimators: time elapsed 0:01:42.239909, Acc: 0.7482447112816857<br>
30 estimators: time elapsed 0:02:32.794667, Acc: 0.759866337841651

In [12]:
# Shapes of the training features, the target and the test features.
for table in (dota_train, y, dota_test):
    print(table.shape)

# Logistic Regression prediction

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [11]:
#X = dota_train.copy()
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(dota_train))

In [13]:
# Apply the already-fitted training scaler to the test features.
test_scaled_values = scaler.transform(dota_test)
X_test_scaled = pd.DataFrame(test_scaled_values)

# Evaluating model quality with ROC AUC

In [14]:
from sklearn.linear_model import RidgeClassifier

In [15]:
# Cross-validated ROC AUC of a ridge classifier on the scaled features for
# several regularisation strengths; prints alpha, elapsed time and mean score.
for alpha in (1e-5, 1e-4, 5e-5, 6e-5):
    started_at = datetime.datetime.now()
    ridge = RidgeClassifier(alpha=alpha)
    fold_scores = cross_val_score(ridge, X_scaled, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    print(alpha)
    print('Time elapsed:', datetime.datetime.now() - started_at)
    print(mean_score)

### Dropping nominal features

In [25]:
# Columns to drop: the lobby type plus the hero id of each of the five
# radiant (r1..r5) and five dire (d1..d5) player slots, in that order.
drop_columns = ['lobby_type'] + [
    f'{side}{slot}_hero_id' for side in ('r', 'd') for slot in range(1, 6)
]

In [26]:
# Remove the nominal columns from both tables; `drop` already returns a new
# DataFrame, so the source tables are left untouched.
X_drop = dota_train.drop(columns=drop_columns)
X_test_drop = dota_test.drop(columns=drop_columns)

In [27]:
# Shapes after dropping the nominal columns (train first, then test).
for table in (X_drop, X_test_drop):
    print(table.shape)

In [28]:
# Fit a second scaler on the reduced training table and apply it to both
# tables, keeping the column names. Row labels become default integers
# because the scaler output is a plain array.
scaler2 = StandardScaler()
scaler2.fit(X_drop)
X_scaled_drop = pd.DataFrame(scaler2.transform(X_drop), columns=X_drop.columns)
X_test_scaled_drop = pd.DataFrame(
    scaler2.transform(X_test_drop),
    columns=X_test_drop.columns,
)

In [29]:
# Same ridge sweep as before, now on the features without nominal columns;
# prints alpha, elapsed time and mean ROC AUC.
for alpha in (1e-5, 1e-4, 5e-5, 6e-5):
    started_at = datetime.datetime.now()
    ridge = RidgeClassifier(alpha=alpha)
    fold_scores = cross_val_score(ridge, X_scaled_drop, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    print(alpha)
    print('Time elapsed:', datetime.datetime.now() - started_at)
    print(mean_score)

### How many different heroes do we have?

In [30]:
# Hero id columns for the five radiant and five dire slots.
heroes = [f'{side}{slot}_hero_id' for side in ('r', 'd') for slot in range(1, 6)]
# Distinct hero ids that occur in the test table.
# NOTE(review): only the test set is inspected here; confirm the training set
# contains no ids outside this list before relying on the count.
np.unique(dota_test[heroes].to_numpy())

We have 120 different heroes in our dataset

### Create bag of words

In [31]:
# Number of test matches (rows in the test table).
len(dota_test)

In [32]:
# Look at the training table again before building the hero-pick features.
dota_train.head()

In [33]:
# "Bag of heroes" for the training matches: one column per hero id
# (column index = hero id - 1), +1 where a radiant player picked the hero
# and -1 where a dire player did.
#
# Each slot is written for all matches at once with integer-array indexing
# instead of one `.loc` scalar lookup per match and slot. The write order for
# any single match (r1, d1, r2, d2, ...) is the same as in the row loop, so
# the resulting matrix is identical.
N_HEROES = 120
train_rows = np.arange(dota_train.shape[0])
X_pick = np.zeros((dota_train.shape[0], N_HEROES))

for p in range(1, 6):
    X_pick[train_rows, dota_train['r%d_hero_id' % p].to_numpy() - 1] = 1
    X_pick[train_rows, dota_train['d%d_hero_id' % p].to_numpy() - 1] = -1

In [35]:
# "Bag of heroes" for the test matches, built exactly like the training one:
# column index = hero id - 1, +1 for a radiant pick, -1 for a dire pick.
#
# Each slot is written for all matches at once with integer-array indexing
# instead of one `.loc` scalar lookup per match and slot. The write order for
# any single match (r1, d1, r2, d2, ...) is the same as in the row loop, so
# the resulting matrix is identical.
N_HEROES = 120
test_rows = np.arange(dota_test.shape[0])
X_test_pick = np.zeros((dota_test.shape[0], N_HEROES))

for p in range(1, 6):
    X_test_pick[test_rows, dota_test['r%d_hero_id' % p].to_numpy() - 1] = 1
    X_test_pick[test_rows, dota_test['d%d_hero_id' % p].to_numpy() - 1] = -1

In [36]:
# Wrap both hero-pick matrices in DataFrames with one named column per hero.
#
# The labels are strings ('hero_1' ... 'hero_N', N = matrix width) rather than
# the integers 1..N: these columns are later concatenated with string-named
# features, and recent scikit-learn versions refuse inputs whose column
# names mix strings and non-strings.
#
# The labels are assigned directly instead of shifting the current labels by
# one, so re-running this cell gives the same result every time.
hero_columns = [f'hero_{hero_id}' for hero_id in range(1, np.shape(X_pick)[1] + 1)]

X_pick = pd.DataFrame(np.asarray(X_pick), columns=hero_columns)
X_test_pick = pd.DataFrame(np.asarray(X_test_pick), columns=hero_columns)
#X_pick.head()
#X_pick[109].value_counts()

In [38]:
# Shapes of the hero-pick tables (train first, then test).
for table in (X_pick, X_test_pick):
    print(table.shape)

In [39]:
# Start the combined training table from a fresh copy of the scaled numeric
# features, and give both tables a default 0..n-1 row index so they can be
# joined side by side by position.
X_for_bag = X_scaled_drop.reset_index(drop=True)
X_pick = X_pick.reset_index(drop=True)

In [40]:
# Same preparation for the test side: fresh copy of the scaled numeric
# features and a default 0..n-1 row index on both tables.
X_test_for_bag = X_test_scaled_drop.reset_index(drop=True)
X_test_pick = X_test_pick.reset_index(drop=True)

In [55]:
# Final training matrix: scaled numeric features followed by the hero-pick
# columns, joined side by side on a default row index.
# It is built from the two source tables rather than from X_for_bag itself,
# so re-running this cell does not append the hero columns a second time.
X_for_bag = pd.concat(
    [X_scaled_drop.reset_index(drop=True), X_pick.reset_index(drop=True)],
    axis=1,
)

In [41]:
# Final test matrix: scaled numeric features followed by the hero-pick
# columns, joined side by side on a default row index.
# It is built from the two source tables rather than from X_test_for_bag
# itself, so re-running this cell does not append the hero columns again.
X_test_for_bag = pd.concat(
    [X_test_scaled_drop.reset_index(drop=True), X_test_pick.reset_index(drop=True)],
    axis=1,
)

In [42]:
# Shape of the combined test matrix (rows, columns).
X_test_for_bag.shape

In [43]:
# Lift the column display limit (this stays in effect for later cells too),
# then preview the combined training matrix.
pd.options.display.max_columns = None
X_for_bag.head()

### Cross-validation

In [58]:
# Sweep ten evenly spaced ridge alphas and record the mean cross-validated
# ROC AUC for each, then print every (score, alpha) pair that reaches the
# best score.
z = []  # mean ROC AUC per alpha
k = []  # the alphas, in the same order as z
for alpha in np.linspace(1e-4, 1, num=10):
    ridge = RidgeClassifier(alpha=alpha)
    fold_scores = cross_val_score(ridge, X_for_bag, y, cv=kfold, scoring="roc_auc")
    mean_score = fold_scores.mean()
    z.append(mean_score)
    k.append(alpha)

best_score = max(z)
for alpha, score in zip(k, z):
    if score == best_score:
        print(score, alpha)

### Test prediction of the winner


In [47]:
# Replace missing values in the raw test table with 0.
# NOTE(review): in file order the scaled/combined test matrices were already
# derived from dota_test above, so this fill does not reach them — confirm
# whether it should run before the scaling cells instead.
dota_test = dota_test.fillna(0)

In [48]:
# Shape of the raw test table (rows, columns).
dota_test.shape

In [56]:
# Shape of the combined training matrix (rows, columns).
X_for_bag.shape

In [57]:
# Shape of the combined test matrix; its column count should match the training matrix.
X_test_for_bag.shape

In [59]:
# Fit the final classifier on the full training matrix and score the test set.
# NOTE(review): 0.5556 looks like an alpha from the ridge sweep above, turned
# into a logistic-regression C as 1 / (2 * alpha) — confirm this mapping is
# what was intended, since the sweep tuned a different estimator.
ridge_alpha = 0.5556
model = LogisticRegression(solver='liblinear', C=1/(2*ridge_alpha))
model.fit(X_for_bag, y)
# Keep the second probability column returned by predict_proba.
pred = model.predict_proba(X_test_for_bag)[:, 1]

In [61]:
# Smallest predicted probability, as a sanity check of the output range.
np.min(pred)

Predicted probability of a Radiant win: <br>
5.3231e-05 - minimum value<br>
0.99994 - maximum value

## Uploading to Kaggle

In [64]:
# Build the submission table in one step: one row per test match, holding the
# predicted probability and the match hash as a string, on a default
# 0..n-1 row index. Column order is ['radiant_win_prob', 'match_id_hash'].
#
# The earlier add-column / drop-column sequence produced exactly this table.
# It also called set_index('match_id_hash') without keeping the result, which
# had no effect; that call is removed so match_id_hash stays an ordinary
# column.
pred_pandas = pd.DataFrame({
    'radiant_win_prob': pred,
    'match_id_hash': [str(match_hash) for match_hash in dota_test.index],
})
pred_pandas.index

In [70]:
# Put the id column first, followed by the predicted probability.
columns_titles = ["match_id_hash", "radiant_win_prob"]
pred_pandas = pred_pandas.reindex(columns_titles, axis="columns")

In [65]:
import csv

In [71]:
# Preview the first rows of the submission table before writing it out.
pred_pandas.head()

In [73]:
# Write the submission file without the row index; QUOTE_NONNUMERIC puts
# quotes around every non-numeric field (the header and the match hashes).
pred_pandas.to_csv(
    'submission.csv',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)

In [74]:
# Load a submission file from a separate input dataset and preview it.
# NOTE(review): presumably an earlier submission kept as a format reference —
# confirm what this file is and whether the cell is still needed.
sample_sub = pd.read_csv('../input/submission-1/submission (1).csv')
sample_sub.head()