# MACHINE LEARNING

In [98]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
%matplotlib inline
pd.set_option("display.max_columns", None)
from IPython.core.display import HTML
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))

## Preprocessing the data

In [36]:
df = pd.read_csv('lol_ml_data.csv')

In [37]:
df.head()

Unnamed: 0,p1_champ,p2_champ,p3_champ,p4_champ,p5_champ,p6_champ,p7_champ,p8_champ,p9_champ,p10_champ,game_length_mins,blue_team_win,red_team_win,p1_spell1,p2_spell1,p3_spell1,p4_spell1,p5_spell1,p6_spell1,p7_spell1,p8_spell1,p9_spell1,p10_spell1,p1_spell2,p2_spell2,p3_spell2,p4_spell2,p5_spell2,p6_spell2,p7_spell2,p8_spell2,p9_spell2,p10_spell2,p1perkPrimaryStyle,p2perkPrimaryStyle,p3perkPrimaryStyle,p4perkPrimaryStyle,p5perkPrimaryStyle,p6perkPrimaryStyle,p7perkPrimaryStyle,p8perkPrimaryStyle,p9perkPrimaryStyle,p10perkPrimaryStyle,p1perkSubStyle,p2perkSubStyle,p3perkSubStyle,p4perkSubStyle,p5perkSubStyle,p6perkSubStyle,p7perkSubStyle,p8perkSubStyle,p9perkSubStyle,p10perkSubStyle,p1_champ_prim_role,p2_champ_prim_role,p3_champ_prim_role,p4_champ_prim_role,p5_champ_prim_role,p6_champ_prim_role,p7_champ_prim_role,p8_champ_prim_role,p9_champ_prim_role,p10_champ_prim_role,p1_champ_sec_role,p2_champ_sec_role,p3_champ_sec_role,p4_champ_sec_role,p5_champ_sec_role,p6_champ_sec_role,p7_champ_sec_role,p8_champ_sec_role,p9_champ_sec_role,p10_champ_sec_role,tank_count,mage_count,marksman_count,support_count,fighter_count,assassin_count
0,Karma,Teemo,JarvanIV,Ezreal,Neeko,Ryze,Kaisa,Cassiopeia,Braum,Rengar,38.466667,Win,Fail,SummonerFlash,SummonerIgnite,SummonerFlash,SummonerFlash,SummonerFlash,SummonerTeleport,SummonerFlash,SummonerBoost,SummonerIgnite,SummonerFlash,SummonerIgnite,SummonerFlash,SummonerSmite,SummonerHeal,SummonerIgnite,SummonerFlash,SummonerHeal,SummonerFlash,SummonerFlash,SummonerSmite,Sorcery,Sorcery,Domination,Inspiration,Sorcery,Sorcery,Inspiration,Sorcery,Resolve,Domination,Inspiration,Domination,Inspiration,Sorcery,Domination,Domination,Sorcery,Domination,Inspiration,Sorcery,Mage,Marksman,Tank,Marksman,Mage,Mage,Marksman,Mage,Support,Assassin,Support,Assassin,Fighter,Mage,Support,Fighter,,,Tank,Fighter,2,5,3,3,3,2
1,Teemo,Tristana,LeeSin,Bard,Leblanc,Lucian,Irelia,Blitzcrank,Orianna,Kindred,23.4,Win,Fail,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerIgnite,SummonerHeal,SummonerTeleport,SummonerIgnite,SummonerBarrier,SummonerFlash,SummonerIgnite,SummonerHeal,SummonerSmite,SummonerIgnite,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerSmite,Sorcery,Inspiration,Domination,Domination,Domination,Inspiration,Inspiration,Resolve,Sorcery,Inspiration,Resolve,Sorcery,Inspiration,Resolve,Sorcery,Sorcery,Sorcery,Inspiration,Inspiration,Domination,Marksman,Marksman,Fighter,Support,Assassin,Marksman,Fighter,Tank,Mage,Marksman,Assassin,Assassin,Assassin,Mage,Mage,,Assassin,Fighter,Support,,1,3,4,2,3,5
2,JarvanIV,Lux,Zyra,Jax,Kaisa,Tryndamere,Ezreal,Thresh,Xerath,Karthus,31.733333,Fail,Win,SummonerFlash,SummonerFlash,SummonerTeleport,SummonerTeleport,SummonerFlash,SummonerIgnite,SummonerHeal,SummonerIgnite,SummonerHeal,SummonerSmite,SummonerSmite,SummonerIgnite,SummonerFlash,SummonerFlash,SummonerHeal,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,Domination,Sorcery,Sorcery,Inspiration,Inspiration,Inspiration,Inspiration,Resolve,Sorcery,Domination,Inspiration,Inspiration,Domination,Inspiration,Sorcery,Resolve,Sorcery,Inspiration,Inspiration,Inspiration,Tank,Mage,Mage,Fighter,Marksman,Fighter,Marksman,Support,Mage,Mage,Fighter,Support,Support,Assassin,,Assassin,Mage,Fighter,Assassin,,1,5,2,3,4,3
3,Blitzcrank,Graves,Caitlyn,Kled,Diana,LeeSin,Lucian,Thresh,Lissandra,Urgot,35.366667,Win,Fail,SummonerIgnite,SummonerFlash,SummonerHeal,SummonerTeleport,SummonerFlash,SummonerFlash,SummonerHeal,SummonerFlash,SummonerTeleport,SummonerFlash,SummonerFlash,SummonerSmite,SummonerFlash,SummonerFlash,SummonerTeleport,SummonerSmite,SummonerFlash,SummonerIgnite,SummonerFlash,SummonerTeleport,Resolve,Inspiration,Inspiration,Inspiration,Sorcery,Domination,Inspiration,Resolve,Sorcery,Inspiration,Sorcery,Sorcery,Sorcery,Resolve,Domination,Inspiration,Sorcery,Inspiration,Inspiration,Resolve,Tank,Marksman,Marksman,Fighter,Fighter,Fighter,Marksman,Support,Mage,Fighter,Fighter,,,Tank,Mage,Assassin,,Fighter,,Marksman,2,2,4,1,6,1
4,Lux,Jhin,Gragas,Fiora,Kindred,Tristana,Fiddlesticks,Graves,Xerath,Kennen,23.933333,Fail,Win,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerFlash,SummonerIgnite,SummonerFlash,SummonerFlash,SummonerTeleport,SummonerIgnite,SummonerHeal,SummonerIgnite,SummonerIgnite,SummonerSmite,SummonerHeal,SummonerFlash,SummonerSmite,SummonerBarrier,SummonerFlash,Domination,Inspiration,Resolve,Domination,Inspiration,Inspiration,Inspiration,Inspiration,Domination,Sorcery,Sorcery,Sorcery,Inspiration,Sorcery,Domination,Sorcery,Sorcery,Sorcery,Sorcery,Domination,Mage,Marksman,Fighter,Fighter,Marksman,Marksman,Mage,Marksman,Mage,Mage,Support,Assassin,Mage,Assassin,,Assassin,Support,,Assassin,Marksman,0,5,5,2,2,4


In [40]:
#the data we want to predict
red_win = df['red_team_win'].values
blue_win = df['blue_team_win'].values
game_length = df['game_length_mins'].values

In [71]:
red_win[:5]

array(['Fail', 'Fail', 'Win', 'Fail', 'Win'], dtype=object)

In [52]:
#Create features
X = df.drop(['red_team_win', 'blue_team_win', 'game_length_mins'], axis=1)

In [53]:
#Create a list of the categorical features
col_list = X.columns.tolist()
cat_feat = col_list[:-6]

In [54]:
#Create dummy variables for categorical features
X = pd.get_dummies(X, columns=cat_feat, drop_first=True)

## Building a Random Forest model

### Let's test a Random Forest Classifier out of the box with out hyper parameter tuning.

In [61]:
rf = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(X, red_win, test_size=0.2, random_state=42)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Get the accuracy score for our Random Forest model.

In [63]:
rf.score(X_test, y_test)

0.8074592890912274

### Cross Validation for the Random Forest model.

In [84]:
#Does cross_val_score fit the model and save the fitted model to rf2?
rf2 = RandomForestClassifier()
cv_rf_results = cross_val_score(rf2, X, red_win, cv=5)
print(cv_rf_results)

[0.92365878 0.95885134 0.86366183 0.65612524 0.67160468]


In [85]:
# Get the mean score of the cross validation
np.mean(cv_rf_results)

0.8147803728451987

### Try out of the box Random Forest Model again and scaling the features

In [86]:
# Set up pipeline
steps = [('scaler', StandardScaler()),
        ('rforest', RandomForestClassifier())]

pl = Pipeline(steps)

In [87]:
#Cross validation
scaled_cv_rf_results = cross_val_score(pl, X, red_win, cv=5)

In [88]:
print(scaled_cv_rf_results)

[0.92505953 0.95766065 0.86124536 0.6562303  0.67412622]


In [110]:
cv_score = np.mean(scaled_cv_rf_results)
print(cv_score)

0.8148644128337945


By scaling the features, we get slightly better performance from the model.

### Hyperparamter Tuning the Random Forest model

In [91]:
rf3 = RandomForestClassifier()

In [99]:
#default hyperparameters
rf3.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)>

In [97]:
n_estimators = [int(x) for x in np.linspace(start=10, stop=100, num=10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 3, 5, 10]
min_samples_leaf = [1, 2, 3, 4]
bootstrap = [True, False]

param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap}

In [104]:
#Use RandomizedSearchCV to see if we can get estimate of better hyperparameters
#n_jobs set to 2?
#cv set to 3?
# Take a smaller subset of the data to split into training and testing
# for finding hyper parameters, maybe 20k games

#Takes about an hour to run on the full training set
#rf_rand_cv = RandomizedSearchCV(estimator=rf3, param_distributions=param_grid,
#                                cv=5, n_iter=10, random_state=42)

In [None]:
rf_rand_cv = RandomizedSearchCV(estimator=rf3, param_distributions=param_grid,
                                cv=3, n_iter=10, n_jobs=2, random_state=42)

In [103]:
rf_rand_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 3, 5, 10], 'min_samples_leaf': [1, 2, 3, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [105]:
#Find best hyperparameters
rf_rand_cv.best_params_

{'bootstrap': False,
 'max_depth': 80,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 60}

In [130]:
#Use best hyperparamters and cross validate score
rf4 = RandomForestClassifier(bootstrap=False,
                                max_depth=80, max_features='sqrt',
                                min_samples_leaf=1, min_samples_split=5,
                                n_estimators=60)

cv_rf_results_params = cross_val_score(rf4, X, red_win, cv=5)

print(cv_rf_results_params)

[0.92975207 0.96154789 0.86972053 0.66785739 0.68358199]


In [115]:
cv_score2 = np.mean(cv_rf_results_params)
print(cv_score2)

0.8224919727392151


In [120]:
#Find improvement with tuned hyperparameters
diff = cv_score2 - cv_score
print(f'Using the current hyperparameters above increased the accuracy of the model by {diff:.5f}%')

Using the current hyperparameters above increased the accuracy of the model by 0.00763%


#### Further Tuning of hyperparameters using a smaller training subset due to time.

In [123]:
#First 20,000 rows of X
X_subset = X[:20000] 
red_win_subset = red_win[:20000]

In [134]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_subset, red_win_subset,
                                                       test_size=0.2, random_state=42)

In [131]:
#tighter parameter grid
param_grid2 = {'n_estimators': [58, 59, 60, 61, 62],
              'max_features': ['sqrt'],
              'max_depth': [78, 79, 80, 81, 82],
              'min_samples_split': [3, 4, 5, 6, 9, 10],
              'min_samples_leaf': [1, 2, 3],
              'bootstrap': [True, False]}

In [135]:
rf5 = RandomForestClassifier()
rf_grid_cv = GridSearchCV(estimator=rf5, param_grid=param_grid2,
                                cv=3, n_jobs=2)

## Logistic Regression Model

## K Nearest Neighbors Model