In [100]:
import pandas as pd
import numpy as np

In [101]:
#import the data

df = pd.read_csv('data.csv')

In [102]:
#Select features to train the model with
df2 = df[['blueWins','blueWardsPlaced','blueWardsDestroyed','blueFirstBlood','blueKills', 'blueAssists', 'blueEliteMonsters', 'blueDragons', 'blueHeralds', 'blueTowersDestroyed', 'blueTotalGold', 'blueTotalExperience', 'blueTotalMinionsKilled', 'blueTotalJungleMinionsKilled', 'redWardsPlaced','redWardsDestroyed','redFirstBlood','redKills', 'redAssists', 'redEliteMonsters', 'redDragons', 'redHeralds', 'redTowersDestroyed', 'redTotalGold', 'redTotalExperience', 'redTotalMinionsKilled', 'redTotalJungleMinionsKilled']]

In [103]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [104]:
# y will be the labels
y = df2['blueWins'].values

In [105]:
#Convert it to a numpy array
y = np.array(y)

In [106]:
#drop the labels from the original dataframe
df2 = df2.drop(columns=['blueWins'])

In [107]:
#let's take a look at the dataframe
df2.values

array([[   28,     2,     1, ..., 17047,   197,    55],
       [   12,     1,     0, ..., 17438,   240,    52],
       [   15,     0,     0, ..., 17254,   203,    28],
       ...,
       [   23,     1,     0, ..., 19909,   261,    60],
       [   14,     4,     1, ..., 18314,   247,    40],
       [   18,     0,     1, ..., 17379,   201,    46]], dtype=int64)

In [108]:
# Let's convert our features to numpy array
X = np.array(df2.values)

In [109]:
# Take a look at their length. They should be equal
print(len(X))
print(len(y))

9879
9879


In [110]:
# Let's split them into two groups so that we can test our model later.
X_train, X_test, y_train, y_test = train_test_split(X,y ,test_size=.15, shuffle=True)
print(len(X_train), len(X_test), len(y_train), len(y_test))

8397 1482 8397 1482


In [111]:
from sklearn.preprocessing import scale

In [112]:
# Normalize the data to have more intuitive weights
X_train = scale(X_train)
X_test = scale(X_test)


In [113]:
# Create an instance of LogisticRegression
# Logistic Regression allows us to classify based on categories such as 'Win' and 'Loss'.
# We don't want to use Linear Regression because it does not make a certain classification. 
# It gives a float number such as 0.685

regr = linear_model.LogisticRegression()

In [114]:
#Let's fit the data in our model
regr.fit(X_train, y_train)

LogisticRegression()

In [115]:
# Take a look at the coefficients.
# Numbers indicate how much each feature affects the actual results.
regr.coef_

array([[-0.01773514,  0.01669825,  0.01604078, -0.0562645 , -0.05276828,
         0.07513423,  0.13238986, -0.04235357, -0.03420086,  0.75255253,
         0.26936191, -0.07175862,  0.03019499, -0.04342891,  0.01236179,
        -0.01604078,  0.08101692,  0.0737001 , -0.06295813, -0.08070203,
         0.00118814,  0.0755252 , -0.76691127, -0.33410919,  0.131353  ,
         0.07694679]])

In [116]:
# Intercept is the bias. It gets added to the coefficients multiplied by their weights
regr.intercept_

array([0.00347128])

In [117]:
#Let's make a prediction using the testing data
pred = regr.predict(X_test)

In [118]:
# Check the accuracy of our model
from sklearn.metrics import accuracy_score
acc = accuracy_score(pred,y_test)

In [124]:
# Our model is 73 percent accurate! Which is not bad but not great either.
# Data for accuracy might be a little different when you run it due to randomized data.
acc

0.733468286099865

In [120]:
# Let's take a look at how much each feature affects the winning team.
ratios = list(zip(regr.coef_[0], df2.columns))

In [121]:
# The numbers indicate the weights for the corresponding features.
ratios

[(-0.01773513767145281, 'blueWardsPlaced'),
 (0.01669825486237613, 'blueWardsDestroyed'),
 (0.016040782829288915, 'blueFirstBlood'),
 (-0.05626449783631365, 'blueKills'),
 (-0.05276827997556363, 'blueAssists'),
 (0.0751342344682104, 'blueEliteMonsters'),
 (0.13238985653135135, 'blueDragons'),
 (-0.042353574515031404, 'blueHeralds'),
 (-0.034200857895783594, 'blueTowersDestroyed'),
 (0.7525525330262591, 'blueTotalGold'),
 (0.269361913262368, 'blueTotalExperience'),
 (-0.07175861916199625, 'blueTotalMinionsKilled'),
 (0.03019499218982294, 'blueTotalJungleMinionsKilled'),
 (-0.04342891432059616, 'redWardsPlaced'),
 (0.012361791825472564, 'redWardsDestroyed'),
 (-0.016040782829288433, 'redFirstBlood'),
 (0.08101692036922928, 'redKills'),
 (0.07370010261328272, 'redAssists'),
 (-0.06295813157359534, 'redEliteMonsters'),
 (-0.08070203429859568, 'redDragons'),
 (0.0011881361901777544, 'redHeralds'),
 (0.07552519790459661, 'redTowersDestroyed'),
 (-0.766911267187291, 'redTotalGold'),
 (-0.3341

In [122]:
# Let's sort the data and grab the top 5
sorted(ratios)[-5:]

[(0.08101692036922928, 'redKills'),
 (0.13135300197952282, 'redTotalMinionsKilled'),
 (0.13238985653135135, 'blueDragons'),
 (0.269361913262368, 'blueTotalExperience'),
 (0.7525525330262591, 'blueTotalGold')]

In [123]:
# The highest number is next to 'blueTotalGold' which indicates that earning gold is the most effective factor.
# Second is 'blueTotalExperience' which indicates that earning experience is the second most effective factor.

# In conclusion, we can assume that the team with the higher amount of golds and experience in the first 
# 10 minutes has considerably higher chance of winning.