# 01_DECISION_TREE_RANDOM_FOREST

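Goal: Fit a decision tree and a random forest classifier, with feature selection done inside the pipeline by `SelectKBest` using the chi-squared score. (RFE/RFECV are imported, but the pipelines in this notebook use `SelectKBest`.)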

# 0) Import packages

In [1]:
# Generic packages
import warnings
import re

# Import ML Packages
import numpy as np
import scipy as sp
import sklearn as sk
import pandas as pd

# Import visualization packages
import matplotlib.pyplot as plt
import seaborn as sb

sklearn functions for feature selection

In [2]:
from sklearn.feature_selection import SelectKBest, SelectPercentile, chi2, RFE, RFECV

sklearn functions for decision tree, random forest

In [3]:
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, cross_val_score, StratifiedKFold, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

Our own code

In [4]:
import importlib

In [5]:
import info5604_utils as util

This lets us reload the util functions if we change it, without restarting kernel

In [6]:
importlib.reload(util)

<module 'info5604_utils' from '/Users/abyb5152/Documents/GitHub/Applied-Machine-Learning/code/info5604_utils.py'>

# 1) Read the data

Increase the number of columns that can be displayed, and remove the limit on the number of displayed rows

In [7]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', None)

Read main dataset

In [8]:
# Load the cleaned data. The CSV has a two-level column header and its first
# column is used as the row index.
raw_matches = pd.read_csv("../clean_data/data.csv", header=[0, 1], index_col=[0])

# Keep only rows whose ('golddiff', 'min_20') value is present.
has_min_20 = raw_matches[('golddiff', 'min_20')].notna()

# Replace every remaining missing value with 'No Ban'.
# NOTE(review): fillna runs on the whole frame, not only the ban_* columns --
# confirm no other column still contains NaN at this point.
lol_df = raw_matches[has_min_20].fillna('No Ban')

print(lol_df.shape)
lol_df.head()

(7578, 99)


Unnamed: 0_level_0,Info,Info,Info,Info,Info,Info,Info,Blue,Blue,Blue,Blue,Blue,Blue,Blue,Blue,Blue,Blue,Blue,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,Red,Blue,Blue,Blue,Blue,Blue,Red,Red,Red,Red,Red,golddiff,golddiff,golddiff,golddiff,goldblue,goldblue,goldblue,goldblue,goldred,goldred,goldred,goldred,goldblueTop,goldblueTop,goldblueTop,goldblueTop,goldblueJungle,goldblueJungle,goldblueJungle,goldblueJungle,goldblueMiddle,goldblueMiddle,goldblueMiddle,goldblueMiddle,goldblueADC,goldblueADC,goldblueADC,goldblueADC,goldblueSupport,goldblueSupport,goldblueSupport,goldblueSupport,goldredTop,goldredTop,goldredTop,goldredTop,goldredJungle,goldredJungle,goldredJungle,goldredJungle,goldredMiddle,goldredMiddle,goldredMiddle,goldredMiddle,goldredADC,goldredADC,goldredADC,goldredADC,goldredSupport,goldredSupport,goldredSupport,goldredSupport,Blue,Red,Blue,Red,Blue,Red,Info,Info
Unnamed: 0_level_1,Address,League,Year,Season,GameType,gamelength,Winner,TeamTag,Top,TopChamp,Jungle,JungleChamp,Middle,MiddleChamp,ADC,ADCChamp,Support,SupportChamp,TeamTag,Top,TopChamp,Jungle,JungleChamp,Middle,MiddleChamp,ADC,ADCChamp,Support,SupportChamp,ban_1,ban_2,ban_3,ban_4,ban_5,ban_1,ban_2,ban_3,ban_4,ban_5,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,min_5,min_10,min_15,min_20,Towers,Towers,Inhib,Inhib,Kills,Kills,FirstBlood,FirstTower
0,http://matchhistory.na.leagueoflegends.com/en/...,NALCS,2015,Spring,Season,40,Blue,TSM,Dyrus,Irelia,Santorin,RekSai,Bjergsen,Ahri,WildTurtle,Jinx,Lustboy,Janna,C9,Balls,Gnar,Meteos,Elise,Hai,Fizz,Sneaky,Sivir,LemonNation,Thresh,Rumble,Kassadin,Lissandra,No Ban,No Ban,Tristana,Leblanc,Nidalee,No Ban,No Ban,-268,-625,-790,1422.0,5068,11361,18324,29519.0,5336,11986,19114,28097.0,893,2051,3630,6008.0,1049,2486,3632,5799.0,1102,2663,4608,7308.0,1127,2495,4051,6359.0,897,1666,2403,4045.0,958,2587,4377,6539.0,1192,2555,4099,6194.0,1178,2561,4043,5550.0,1097,2621,4149,6374.0,911,1662,2446,3440.0,4.0,2.0,0.0,0.0,4.0,3.0,Blue,Blue
1,http://matchhistory.na.leagueoflegends.com/en/...,NALCS,2015,Spring,Season,38,Red,CST,Cris,Gnar,Impaler,Rengar,Jesiz,Ahri,Mash,Caitlyn,Sheep,Leona,DIG,Gamsu,Irelia,Crumbzz,JarvanIV,Shiphtur,Azir,CoreJJ,Corki,KiWiKiD,Annie,Kassadin,Sivir,Lissandra,No Ban,No Ban,RekSai,Janna,Leblanc,No Ban,No Ban,147,-242,1394,1105.0,5511,11419,19698,27282.0,5364,11661,18304,26177.0,1127,2406,4594,6217.0,1176,2350,3929,5205.0,1102,2547,4549,6341.0,1182,2415,3869,5695.0,924,1701,2757,3824.0,1042,2346,3770,5424.0,1105,2285,3415,4429.0,1097,2601,4418,6217.0,1220,2670,4061,6364.0,900,1759,2640,3743.0,1.0,2.0,0.0,0.0,5.0,3.0,Blue,Red
2,http://matchhistory.na.leagueoflegends.com/en/...,NALCS,2015,Spring,Season,40,Blue,WFX,Flaresz,Renekton,ShorterACE,Rengar,Pobelter,Fizz,Altec,Sivir,Gleeb,Annie,GV,Hauntzer,Sion,Saintvicious,LeeSin,Keane,Azir,Cop,Corki,BunnyFuFuu,Janna,JarvanIV,Lissandra,Kassadin,No Ban,No Ban,Leblanc,Zed,RekSai,No Ban,No Ban,34,913,2922,5293.0,4933,12374,21237,31317.0,4899,11461,18315,26024.0,828,2419,4187,6478.0,1041,2552,4026,5321.0,1065,2865,4861,6777.0,1130,2723,4899,7580.0,869,1815,3264,5161.0,861,2113,3532,5356.0,1089,2454,3971,5338.0,1006,2254,3578,5491.0,1085,2854,4582,6268.0,858,1786,2652,3571.0,3.0,0.0,0.0,0.0,11.0,6.0,Blue,Blue
3,http://matchhistory.na.leagueoflegends.com/en/...,NALCS,2015,Spring,Season,41,Red,TIP,Rhux,Irelia,Rush,JarvanIV,XiaoWeiXiao,Leblanc,Apollo,Sivir,Adrian,Thresh,TL,Quas,Gnar,IWDominate,Nunu,Fenix,Lulu,KEITH,KogMaw,Xpecial,Janna,Annie,Lissandra,Kassadin,No Ban,No Ban,RekSai,Rumble,LeeSin,No Ban,No Ban,228,16,-335,768.0,5398,12033,19332,25772.0,5170,12017,19667,25004.0,992,2662,4296,5544.0,1272,2390,3627,4867.0,1066,2533,4378,5722.0,1202,2756,4555,6305.0,866,1692,2476,3334.0,922,2390,3785,4800.0,1171,2937,4646,5620.0,1046,2468,4420,5649.0,1161,2568,4243,5431.0,870,1654,2573,3504.0,2.0,1.0,0.0,0.0,5.0,5.0,Red,Blue
4,http://matchhistory.na.leagueoflegends.com/en/...,NALCS,2015,Spring,Season,35,Blue,CLG,Benny,Gnar,Xmithie,JarvanIV,Link,Lissandra,Doublelift,Tristana,aphromoo,Janna,T8,CaliTrlolz8,Sion,Porpoise8,RekSai,Slooshi8,Lulu,Maplestreet8,Corki,Dodo8,Annie,Irelia,Pantheon,Kassadin,No Ban,No Ban,Rumble,Sivir,Rengar,No Ban,No Ban,113,205,-574,478.0,5404,11943,19426,27472.0,5291,11738,20000,26994.0,1038,2446,4020,5920.0,1174,2272,3696,5154.0,1112,2643,4157,6003.0,1203,2907,4933,6675.0,877,1675,2620,3720.0,1025,2523,4284,6153.0,1220,2576,4353,5570.0,1042,2443,4321,6014.0,1187,2622,4459,5859.0,817,1574,2583,3398.0,2.0,3.0,0.0,0.0,1.0,1.0,Blue,Red


# 3) Decision tree classification

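Set up the model. Note that min-max scaling is not needed by the decision tree/random forest themselves; however, the chi-squared feature filter used in the pipeline below requires non-negative inputs, so the numeric features (e.g. the gold differences, which can be negative) are scaled to [0, 1] first.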

In [9]:
model = DecisionTreeClassifier(random_state=20220905)

Split into training and test data with `('Info', 'Winner')` as the target variable.

In [10]:
# Target is the ('Info', 'Winner') column; every other column is a candidate predictor.
TARGET_COL = ('Info', 'Winner')
y_lol = lol_df[TARGET_COL]
X_lol = lol_df.drop(columns=[TARGET_COL])

# 80/20 split, stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_lol,
    y_lol,
    test_size=0.2,
    stratify=y_lol,
    random_state=20220905,
)

Set up column transformer: one-hot encoding for categorical vars, min-max scaling for numeric vars. Here we have to set handle unknowns to 'ignore' because there may be different unique champions in the test and training datasets.

In [11]:
# Building blocks of the two-level column labels used by lol_df.
_SIDES = ('Blue', 'Red')
_ROLES = ('Top', 'Jungle', 'Middle', 'ADC', 'Support')
_MINUTES = ('min_5', 'min_10', 'min_15', 'min_20')

# Categorical predictors, in this order: the match-info fields; then, per side,
# each role's player and champion followed by the team tag; then the five ban
# slots of each side.
CATEGORICAL_VARS = [('Info', field) for field in ('League', 'Year', 'Season', 'GameType')]
for _side in _SIDES:
    for _role in _ROLES:
        CATEGORICAL_VARS += [(_side, _role), (_side, _role + 'Champ')]
    CATEGORICAL_VARS.append((_side, 'TeamTag'))
CATEGORICAL_VARS += [(_side, f'ban_{_slot}') for _side in _SIDES for _slot in range(1, 6)]

# Numeric predictors, in this order: game length; every gold series at each
# minute mark (difference, per-team totals, then per-role gold for blue and
# red); then towers / inhibitors / kills for both sides.
_GOLD_GROUPS = ['golddiff', 'goldblue', 'goldred'] + [
    'gold' + _side.lower() + _role for _side in _SIDES for _role in _ROLES
]
NUMERIC_VARS = (
    [('Info', 'gamelength')]
    + [(_group, _minute) for _group in _GOLD_GROUPS for _minute in _MINUTES]
    + [(_side, _stat) for _stat in ('Towers', 'Inhib', 'Kills') for _side in _SIDES]
)

# NOTE(review): with drop='first', a category unseen during fit is encoded as
# all zeros, the same encoding as the dropped first category -- confirm this
# ambiguity is acceptable.
ohe = OneHotEncoder(handle_unknown='ignore', drop='first')

# One-hot encode the categorical columns, min-max scale the numeric ones and
# discard every column that appears in neither list.
columns = ColumnTransformer(
    transformers=[
        ('category', ohe, CATEGORICAL_VARS),
        ('numeric', MinMaxScaler(), NUMERIC_VARS),
    ],
    remainder='drop',
)

Set up the pipeline

In [12]:
selector = SelectKBest(score_func=chi2, k=20)

In [13]:
data_pipe = Pipeline(steps=[('preprocess', columns), ('filter', selector)])

In [14]:
pipe = Pipeline(steps=[('data',data_pipe), ('model', model)])

In [15]:
# Search space for the 'model' step of the pipeline: 2 x 6 x 3 = 36 candidates.
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': list(range(3, 9)),
    'model__min_samples_leaf': list(range(2, 5)),
}

# Shuffled, stratified 8-fold cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits=8, random_state=20220914, shuffle=True)

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=skf, verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

Fitting 8 folds for each of 36 candidates, totalling 288 fits




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=3; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=3, model__min_samples_leaf=4; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=4, model__min_samples_leaf=2; total time=   0.2s




[CV] END model__criterion=gini, model__max_depth=4, model__min_samples_leaf=2; total time=   0.2s
[CV] END model__criterion=gini, model__max_depth=4, model__min_samples_leaf=2; total time=   0.2s


In [None]:
params = grid_search.cv_results_['params']
scores = grid_search.cv_results_['mean_test_score']
rows = [dict(param_dict, score=score) for param_dict, score in zip(params, scores)]

In [None]:
rows

In [None]:
results_df = pd.DataFrame(rows)
results_df.columns = ['Criterion', 'Max Depth', 'Min Sample Leaf', 'Accuracy']
results_df.head()

In [None]:
# Mean CV accuracy versus tree depth: one colour per min-samples-leaf value,
# one line style per split criterion.
sb.set(rc={"figure.figsize": (6, 4)})
DT_Grid_plot = sb.lineplot(
    data=results_df,
    x='Max Depth',
    y='Accuracy',
    hue='Min Sample Leaf',
    style='Criterion',
    palette='bright',
)
DT_Grid_plot.set_title('Decision Tree Model Parameters')
DT_Grid_plot.figure.savefig('../plots/DecisionTree_GridSearch_Combined.png')

In [None]:
results_df[results_df['Accuracy'] == max(results_df['Accuracy'])]

In [None]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf = 4, random_state=20220905)

In [None]:
pipe = Pipeline(steps=[('data',data_pipe), ('model', model)])

Compute cross-validation results for the decision tree. There will be warnings here and throughout because there are champions present in the test dataset that were not in the training dataset.

In [None]:
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=20220905)

scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring='accuracy', verbose=2)

Create a data frame from the scores

In [None]:
score_tree = pd.DataFrame({'score_tree': scores})
score_tree

Plot accuracy for each cross-validation run.

In [None]:
sb.boxplot(data=score_tree['score_tree'], width = 0.3)

Fit on all the training data

In [None]:
pipe.fit(X_train,y_train)

Print accuracy on the training and test data

In [None]:
DT_acc_cmb = util.print_accuracies(pipe, X_train, y_train, X_test, y_test)

_Interpretation: for this very simple model, the accuracy is not great._

Show the relative importance of different features

In [None]:
# Pair every feature that survived the selection step with the importance the
# fitted tree assigned to it, most important first.
feature_names = pipe.named_steps['data'].get_feature_names_out()
feature_coefs = pipe.named_steps['model'].feature_importances_
feature_df = pd.DataFrame(
    {
        'Feature Name': list(feature_names),
        'Importance': list(feature_coefs.flatten()),
    }
).sort_values(by='Importance', ascending=False, ignore_index=True)
feature_df.head(20)

In [None]:
# Lookup from a column's position in X_lol (as a string, e.g. '0') to the
# str() of its label, used to decode generic 'x<N>' feature names.
X_cols = {str(position): str(label) for position, label in enumerate(X_lol.columns)}
X_cols

In [None]:
# Characters removed from the stringified column tuple: '(', ')' and "'".
# Raw strings are used for every regex so that '\(' and '\d' are not parsed as
# (invalid) string escape sequences, which recent Python versions warn about.
list_of_chars = [r'\(', r'\)', "'"]
pattern = '[' + ''.join(list_of_chars) + ']'


def _readable_feature_name(raw_name):
    """Turn a generic feature name into a readable one.

    The '<prefix>__x<N>' part of ``raw_name`` is replaced by the label stored
    in ``X_cols`` for position N, parentheses and quotes are removed and the
    ', ' between the two label levels becomes '_'. Any suffix after 'x<N>' is
    kept as is.

    Raises AttributeError if ``raw_name`` contains no 'x<N>' token and KeyError
    if N is not a key of ``X_cols``.
    """
    column_label = X_cols[re.search(r'x(\d+)', raw_name).group(1)]
    expanded = re.sub(r'(.*)__x(\d+)', column_label, raw_name)
    return re.sub(', ', '_', re.sub(pattern, '', expanded))


feature_df['Feature Name 2'] = [_readable_feature_name(s) for s in feature_df['Feature Name']]
feature_df

In [None]:
# Bar chart of the 20 largest importances (feature_df is sorted descending).
# The bar height column is 'Importance' -- feature_df has no 'Coef' column.
top_features = feature_df.iloc[0:20]
ax = sb.barplot(data=top_features, x='Feature Name 2', y='Importance', orient='v')
ax.tick_params(axis='x', rotation=270)
ax.set_ylabel('Importance', rotation=90, labelpad=20)
ax.set_xlabel('Feature', rotation=0, labelpad=20)
ax.set_title('Most important features: Decision Tree (Combined)', pad=20)
plt.savefig('../plots/DecisionTree_importances_combined.pdf',dpi=300, bbox_inches = "tight")

Plot the decision tree. Go left if the condition is true, else go right.

In [None]:
# Decode the generic 'x<N>' feature names into readable labels before plotting.
# Relies on `X_cols` (position -> column label) and `pattern` (characters to
# strip) defined in the cells above. Regexes are raw strings so '\d' is not
# parsed as an invalid string escape sequence.
tree_feature_names = []
for raw_name in pipe.named_steps['data'].get_feature_names_out():
    column_label = X_cols[re.search(r'x(\d+)', raw_name).group(1)]
    expanded = re.sub(r'(.*)__x(\d+)', column_label, raw_name)
    tree_feature_names.append(re.sub(', ', '_', re.sub(pattern, '', expanded)))

plot_tree(model, filled=True, feature_names=tree_feature_names)
plt.savefig('../plots/tree_combined.pdf')

_Interpretation: the first few levels of the tree overwhelmingly split on gold difference at 10 minutes and game length._

# 4) Random forest classification

Set up the model

In [None]:
RFC = RandomForestClassifier(random_state=20220929)

Set up the pipeline

In [None]:
pipe_RFC = Pipeline(steps=[('data',data_pipe), ('model', RFC)])

In [None]:
# Search space for the random forest: 2 x 6 x 3 = 36 candidates.
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [3,4,5,6,7,8],
    'model__min_samples_leaf': [2,3,4],
}

# Tune `pipe_RFC`, not `pipe`: `pipe` wraps the decision tree, so searching it
# here would only repeat the decision-tree search and report its scores as the
# random forest's. `skf` is the fold splitter defined in an earlier cell.
grid_search = GridSearchCV(pipe_RFC, param_grid=param_grid, cv=skf, verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
params = grid_search.cv_results_['params']
scores = grid_search.cv_results_['mean_test_score']
rows = [dict(param_dict, score=score) for param_dict, score in zip(params, scores)]

In [None]:
rows

In [None]:
results_df = pd.DataFrame(rows)
results_df.columns = ['Criterion', 'Max Depth', 'Min Sample Leaf','Accuracy']
results_df.head()

In [None]:
# Mean CV accuracy versus maximum depth: one colour per min-samples-leaf value,
# one line style per split criterion.
RFC_Grid_plot = sb.lineplot(
    data=results_df,
    x='Max Depth',
    y='Accuracy',
    hue='Min Sample Leaf',
    style='Criterion',
    palette='bright',
)
RFC_Grid_plot.set_title('Random Forest Model Parameters')
RFC_Grid_plot.figure.savefig('../plots/RandomForest_GridSearch_Combined.png')

In [None]:
results_df[results_df['Accuracy'] == max(results_df['Accuracy'])]

In [None]:
RFC = RandomForestClassifier(max_depth=3, criterion='gini', min_samples_leaf = 4,random_state=20220929)

In [None]:
pipe_RFC = Pipeline(steps=[('data',data_pipe), ('model', RFC)])

Compute cross-validation results for the random forest

In [None]:
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=20220905)

scores = cross_val_score(pipe_RFC, X_train, y_train, cv=skf, scoring='accuracy', verbose=2)

Create a data frame from the scores

In [None]:
score_RFC = pd.DataFrame({'score_RFC': scores})
score_RFC

Plot accuracy for each cross-validation run.

In [None]:
sb.boxplot(data=score_RFC['score_RFC'], width = 0.3)

Fit on all the training data

In [None]:
pipe_RFC.fit(X_train,y_train)

Print accuracy on the training and test data

In [None]:
RFC_acc_cmb = util.print_accuracies(pipe_RFC, X_train, y_train, X_test, y_test)

In [None]:
# Pair every feature that survived the selection step with the importance the
# fitted forest assigned to it, most important first.
feature_names = pipe_RFC.named_steps['data'].get_feature_names_out()
feature_coefs = pipe_RFC.named_steps['model'].feature_importances_
feature_df = pd.DataFrame(
    {
        'Feature Name': list(feature_names),
        'Importance': list(feature_coefs.flatten()),
    }
).sort_values(by='Importance', ascending=False, ignore_index=True)
feature_df.head(20)

In [None]:
# Lookup from a column's position in X_lol (as a string, e.g. '0') to the
# str() of its label, used to decode generic 'x<N>' feature names.
X_cols = {str(position): str(label) for position, label in enumerate(X_lol.columns)}
X_cols

In [None]:
# Characters removed from the stringified column tuple: '(', ')' and "'".
# Raw strings are used for every regex so that '\(' and '\d' are not parsed as
# (invalid) string escape sequences, which recent Python versions warn about.
list_of_chars = [r'\(', r'\)', "'"]
pattern = '[' + ''.join(list_of_chars) + ']'


def _readable_rfc_feature_name(raw_name):
    """Turn a generic feature name into a readable one.

    The '<prefix>__x<N>' part of ``raw_name`` is replaced by the label stored
    in ``X_cols`` for position N, parentheses and quotes are removed and the
    ', ' between the two label levels becomes '_'. Any suffix after 'x<N>' is
    kept as is.

    Raises AttributeError if ``raw_name`` contains no 'x<N>' token and KeyError
    if N is not a key of ``X_cols``.
    """
    column_label = X_cols[re.search(r'x(\d+)', raw_name).group(1)]
    expanded = re.sub(r'(.*)__x(\d+)', column_label, raw_name)
    return re.sub(', ', '_', re.sub(pattern, '', expanded))


feature_df['Feature Name 2'] = [_readable_rfc_feature_name(s) for s in feature_df['Feature Name']]
feature_df

In [None]:
# Bar chart of the 20 largest importances (feature_df is sorted descending).
# The bar height column is 'Importance' -- feature_df has no 'Coef' column.
top_features = feature_df.iloc[0:20]
ax = sb.barplot(data=top_features, x='Feature Name 2', y='Importance', orient='v')
ax.tick_params(axis='x', rotation=270)
ax.set_ylabel('Importance', rotation=90, labelpad=20)
ax.set_xlabel('Feature', rotation=0, labelpad=20)
ax.set_title('Most important features: Random forest (combined)', pad=20)
plt.savefig('../plots/RandomForest_importances_combined.pdf',dpi=300, bbox_inches = "tight")

# Generalizing the model to predict the outcome from a single team's stats, instead of using team-vs-team data

The general idea here is to see how accurate the model is when it looks at the stats of one team at a time, instead of team vs. team as above. The trade-off is that there are more rows for the models to learn from, but some context is lost. Each team is now on a separate row, with a `Team` column indicating which side (Blue or Red) the row describes.

In [None]:
blue = lol_df['Blue'].copy()
blue['Team'] = 'Blue'

red = lol_df['Red'].copy()
red['Team'] = 'Red'

info = lol_df['Info'].copy()

In [None]:
def _relabelled_gold(group, prefix):
    """Copy the four per-minute columns of ``lol_df[group]`` and rename them
    positionally to '<prefix> 5', '<prefix> 10', '<prefix> 15', '<prefix> 20'."""
    block = lol_df[group].copy()
    block.columns = [f'{prefix} {minute}' for minute in (5, 10, 15, 20)]
    return block


# The gold difference goes with the shared match info; each side gets its own
# team gold under the same 'Team <minute>' labels so the two frames line up.
info = pd.concat([info, _relabelled_gold('golddiff', 'Diff')], axis=1)
red = pd.concat([red, _relabelled_gold('goldred', 'Team')], axis=1)
blue = pd.concat([blue, _relabelled_gold('goldblue', 'Team')], axis=1)

In [None]:
goldcol = ['goldblueTop','goldblueJungle','goldblueADC','goldblueSupport','goldblueMiddle',
          'goldredTop','goldredJungle','goldredADC','goldredSupport','goldredMiddle']

# Collect the per-role gold blocks of each side, renamed to '<Role> <minute>',
# then append them to that side's frame in one go.
role_gold = {'blue': [], 'red': []}
for group in goldcol:
    if group.startswith('goldred'):
        side = 'red'
    elif group.startswith('goldblue'):
        side = 'blue'
    else:
        continue
    role = group[len('gold' + side):]
    block = lol_df[group].copy()
    block.columns = [f'{role} {minute}' for minute in (5, 10, 15, 20)]
    role_gold[side].append(block)

blue = pd.concat([blue] + role_gold['blue'], axis=1)
red = pd.concat([red] + role_gold['red'], axis=1)

In [None]:
blue = pd.concat([info,blue], axis=1)
red = pd.concat([info,red], axis=1)

lol_df2 = pd.concat([blue,red],axis=0)

In [None]:
y = lol_df2['Winner']
X = lol_df2.drop(['Winner','Address'],axis=1)

In [None]:
# Split by game, not by row. lol_df2 stacks the blue and the red view of every
# game, so each index label occurs twice and both rows carry the same match
# info and the same Winner. A row-wise split would put one half of a game in
# the training set and the other half in the test set, leaking the answer.
# NOTE(review): assumes the index of lol_df identifies a game uniquely -- confirm.
one_row_per_game = ~y.index.duplicated(keep='first')
game_winner = y[one_row_per_game]

# 80/20 split of the games, stratified on the winner.
train_games, test_games = train_test_split(
    game_winner.index, test_size=0.2, random_state=20220905, stratify=game_winner)

# Selecting by label pulls in both rows (blue and red) of every chosen game.
X_train, X_test = X.loc[train_games], X.loc[test_games]
y_train, y_test = y.loc[train_games], y.loc[test_games]

In [None]:
X.columns

# 3) Decision tree classification, Separate red/blue

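Set up the model. Note that min-max scaling is not needed by the decision tree/random forest themselves; however, the chi-squared feature filter used in the pipeline below requires non-negative inputs, so the numeric features are scaled to [0, 1] first.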

In [None]:
model = DecisionTreeClassifier(random_state=20220905)

Set up column transformer: one-hot encoding for categorical vars, min-max scaling for numeric vars. Here we have to set handle unknowns to 'ignore' because there may be different unique champions in the test and training datasets.

In [None]:
_ROLE_NAMES = ('Top', 'Jungle', 'Middle', 'ADC', 'Support')
_MINUTE_MARKS = (5, 10, 15, 20)

# Categorical predictors: match info, team tag, each role's player and
# champion, the five ban slots, and the side indicator.
CATEGORICAL_VARS = ['League', 'Season', 'GameType', 'FirstBlood', 'FirstTower', 'TeamTag']
for _role in _ROLE_NAMES:
    CATEGORICAL_VARS += [_role, _role + 'Champ']
CATEGORICAL_VARS += [f'ban_{_slot}' for _slot in range(1, 6)]
CATEGORICAL_VARS.append('Team')

# Numeric predictors: game length, year, gold difference per minute mark,
# objective/kill counts, then team gold and per-role gold per minute mark.
NUMERIC_VARS = (
    ['gamelength', 'Year']
    + [f'Diff {_minute}' for _minute in _MINUTE_MARKS]
    + ['Towers', 'Inhib', 'Kills']
    + [f'{_series} {_minute}'
       for _series in ('Team', 'Top', 'Jungle', 'ADC', 'Support', 'Middle')
       for _minute in _MINUTE_MARKS]
)

# NOTE(review): with drop='first', a category unseen during fit is encoded as
# all zeros, the same encoding as the dropped first category -- confirm this
# ambiguity is acceptable.
ohe = OneHotEncoder(handle_unknown='ignore', drop='first')

# One-hot encode the categorical columns, min-max scale the numeric ones and
# discard every column that appears in neither list.
columns = ColumnTransformer(
    transformers=[
        ('category', ohe, CATEGORICAL_VARS),
        ('numeric', MinMaxScaler(), NUMERIC_VARS),
    ],
    remainder='drop',
)

Set up the pipeline

In [None]:
selector = SelectKBest(score_func=chi2, k=20)

In [None]:
data_pipe = Pipeline(steps=[('preprocess', columns), ('filter', selector)])

In [None]:
pipe = Pipeline(steps=[('data',data_pipe), ('model', model)])

In [None]:
# Search space for the 'model' step of the pipeline: 2 x 6 x 3 = 36 candidates.
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': list(range(3, 9)),
    'model__min_samples_leaf': list(range(2, 5)),
}

# Shuffled, stratified 8-fold cross-validation with a fixed seed.
skf = StratifiedKFold(n_splits=8, random_state=20220914, shuffle=True)

grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=skf, verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
params = grid_search.cv_results_['params']
scores = grid_search.cv_results_['mean_test_score']
rows = [dict(param_dict, score=score) for param_dict, score in zip(params, scores)]

In [None]:
rows

In [None]:
results_df = pd.DataFrame(rows)
results_df.columns = ['Criterion', 'Max Depth', 'Min Sample Leaf', 'Accuracy']
results_df.head()

In [None]:
# Mean CV accuracy versus tree depth: one colour per min-samples-leaf value,
# one line style per split criterion.
sb.set(rc={"figure.figsize": (6, 4)})
DT_Grid_plot = sb.lineplot(
    data=results_df,
    x='Max Depth',
    y='Accuracy',
    hue='Min Sample Leaf',
    style='Criterion',
    palette='bright',
)
DT_Grid_plot.set_title('Decision Tree Model Parameters')
DT_Grid_plot.figure.savefig('../plots/DecisionTree_GridSearch_Separated.png')

In [None]:
results_df[results_df['Accuracy'] == max(results_df['Accuracy'])]

In [None]:
model = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf = 4, random_state=20220905)

In [None]:
pipe = Pipeline(steps=[('data',data_pipe), ('model', model)])

Compute cross-validation results for the decision tree. There will be warnings here and throughout because there are champions present in the test dataset that were not in the training dataset.

In [None]:
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=20220905)

scores = cross_val_score(pipe, X_train, y_train, cv=skf, scoring='accuracy', verbose=2)

Create a data frame from the scores

In [None]:
score_tree = pd.DataFrame({'score_tree': scores})
score_tree

Plot accuracy for each cross-validation run.

In [None]:
sb.boxplot(data=score_tree['score_tree'], width = 0.3)

Fit on all the training data

In [None]:
pipe.fit(X_train,y_train)

Print accuracy on the training and test data

In [None]:
DT_acc_sep = util.print_accuracies(pipe, X_train, y_train, X_test, y_test)

_Interpretation: for this very simple model, the accuracy is not great._

Show the relative importance of different features

In [None]:
# Pair every feature that survived the selection step with the importance the
# fitted tree assigned to it, most important first.
feature_names = pipe.named_steps['data'].get_feature_names_out()
feature_coefs = pipe.named_steps['model'].feature_importances_
feature_df = pd.DataFrame(
    {
        'Feature Name': list(feature_names),
        'Importance': list(feature_coefs.flatten()),
    }
).sort_values(by='Importance', ascending=False, ignore_index=True)
feature_df.head(20)

In [None]:
# Bar chart of the 20 largest importances (feature_df is sorted descending).
# In this section feature_df only has 'Feature Name' and 'Importance', so the
# readable label is derived here by stripping the transformer prefix
# ('category__' / 'numeric__') instead of relying on a 'Feature Name 2' column
# from an earlier cell; the bar height column is 'Importance', not 'Coef'.
top_features = feature_df.iloc[0:20].copy()
top_features['Feature Name 2'] = [
    re.sub(r'^(?:category|numeric)__', '', name) for name in top_features['Feature Name']
]
ax = sb.barplot(data=top_features, x='Feature Name 2', y='Importance', orient='v')
ax.tick_params(axis='x', rotation=270)
ax.set_ylabel('Importance', rotation=90, labelpad=20)
ax.set_xlabel('Feature', rotation=0, labelpad=20)
ax.set_title('Most important features: Decision Tree (Separated)', pad=20)
plt.savefig('../plots/DecisionTree_Importances_Separated.pdf',dpi=300, bbox_inches = "tight")

Plot the decision tree. Go left if the condition is true, else go right.

In [None]:
# The separated data set has plain string column names, so the transformed
# feature names already carry the column name and contain no generic 'x<N>'
# token to decode; looking one up via the `X_cols` map built for the combined
# data set would fail. Only the transformer prefix ('category__' / 'numeric__')
# is stripped.
tree_feature_names = [
    re.sub(r'^(?:category|numeric)__', '', name)
    for name in pipe.named_steps['data'].get_feature_names_out()
]
plot_tree(pipe.named_steps['model'], filled=True, feature_names=tree_feature_names)
plt.savefig('../plots/tree_separated.pdf')

_Interpretation: the first few levels of the tree overwhelmingly split on gold difference at 10 minutes and game length._

# 4) Random forest classification, separate red/blue

Set up the model

In [None]:
RFC = RandomForestClassifier(max_depth=4, criterion='gini', random_state=20220929)

Set up the pipeline

In [None]:
pipe_RFC = Pipeline(steps=[('data',data_pipe), ('model', RFC)])

In [None]:
# Search space for the random forest: 2 x 6 x 3 = 36 candidates.
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [3,4,5,6,7,8],
    'model__min_samples_leaf': [2,3,4],
}

# Tune `pipe_RFC`, not `pipe`: `pipe` wraps the decision tree, so searching it
# here would only repeat the decision-tree search and report its scores as the
# random forest's. `skf` is the fold splitter defined in an earlier cell.
grid_search = GridSearchCV(pipe_RFC, param_grid=param_grid, cv=skf, verbose=2)

In [None]:
grid_search.fit(X_train, y_train)

In [None]:
params = grid_search.cv_results_['params']
scores = grid_search.cv_results_['mean_test_score']
rows = [dict(param_dict, score=score) for param_dict, score in zip(params, scores)]

In [None]:
rows

In [None]:
results_df = pd.DataFrame(rows)
results_df.columns = ['Criterion', 'Max Depth', 'Min Sample Leaf','Accuracy']
results_df.head()

In [None]:
# Mean CV accuracy versus maximum depth: one colour per min-samples-leaf value,
# one line style per split criterion.
RFC_Grid_plot = sb.lineplot(data=results_df, x='Max Depth', y='Accuracy',
                           hue = 'Min Sample Leaf', style = 'Criterion', palette = 'bright')
# Title corrected from 'Random Tree' so it matches the model being tuned and
# the title of the combined-data plot.
RFC_Grid_plot.set(title = 'Random Forest Model Parameters')
RFC_Grid_plot.get_figure().savefig('../plots/RandomForest_GridSearch_Separated.png')

In [None]:
results_df[results_df['Accuracy'] == max(results_df['Accuracy'])]

In [None]:
RFC = RandomForestClassifier(criterion='gini', max_depth=3, min_samples_leaf = 4,random_state=20220929)

In [None]:
pipe_RFC = Pipeline(steps=[('data',data_pipe), ('model', RFC)])

Compute cross-validation results for the random forest

In [None]:
skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=20220905)

scores = cross_val_score(pipe_RFC, X_train, y_train, cv=skf, scoring='accuracy', verbose=2)

Create a data frame from the scores

In [None]:
score_RFC = pd.DataFrame({'score_RFC': scores})
score_RFC

Plot accuracy for each cross-validation run.

In [None]:
sb.boxplot(data=score_RFC['score_RFC'], width = 0.3)

Fit on all the training data

In [None]:
pipe_RFC.fit(X_train,y_train)

Print accuracy on the training and test data

In [None]:
RFC_acc_sep = util.print_accuracies(pipe_RFC, X_train, y_train, X_test, y_test)

In [None]:
# Pair every feature that survived the selection step with the importance the
# fitted forest assigned to it, most important first.
feature_names = pipe_RFC.named_steps['data'].get_feature_names_out()
feature_coefs = pipe_RFC.named_steps['model'].feature_importances_
feature_df = pd.DataFrame(
    {
        'Feature Name': list(feature_names),
        'Importance': list(feature_coefs.flatten()),
    }
).sort_values(by='Importance', ascending=False, ignore_index=True)
feature_df.head(20)

In [None]:
# Bar chart of the 20 largest importances (feature_df is sorted descending).
# In this section feature_df only has 'Feature Name' and 'Importance', so the
# readable label is derived here by stripping the transformer prefix
# ('category__' / 'numeric__') instead of relying on a 'Feature Name 2' column
# from an earlier cell; the bar height column is 'Importance', not 'Coef'.
top_features = feature_df.iloc[0:20].copy()
top_features['Feature Name 2'] = [
    re.sub(r'^(?:category|numeric)__', '', name) for name in top_features['Feature Name']
]
ax = sb.barplot(data=top_features, x='Feature Name 2', y='Importance', orient='v')
ax.tick_params(axis='x', rotation=270)
ax.set_ylabel('Importance', rotation=90, labelpad=20)
ax.set_xlabel('Feature Name', rotation=0, labelpad=20)
ax.set_title('Most important features: Random Forest (Separated)', pad=20)
plt.savefig('../plots/RandomForest_importances_separated.pdf',dpi=300, bbox_inches = "tight")

# Accuracies

In [None]:
# Each *_acc_* value is indexed with [0] for the training accuracy and [1] for
# the test accuracy. Rows are emitted as all four 'Train' entries followed by
# all four 'Test' entries, in the model order listed here.
model_accuracies = [
    ('DT_cmb', DT_acc_cmb),
    ('RFC_cmb', RFC_acc_cmb),
    ('DT_sep', DT_acc_sep),
    ('RFC_sep', RFC_acc_sep),
]
acc_df = pd.DataFrame(
    [
        {'Accuracy': accuracies[position], 'Type': split_name, 'Model': model_name}
        for position, split_name in enumerate(['Train', 'Test'])
        for model_name, accuracies in model_accuracies
    ]
)
acc_df

In [None]:
# Horizontal bars: one group per model, one bar per split (train / test).
barplot = sb.barplot(
    data=acc_df,
    x='Accuracy',
    y='Model',
    hue='Type',
    palette='bright',
)
barplot.set_title('Model Accuracies')

barplot.figure.savefig('../plots/ACC_DT_RFC_sep_combined_train_test.png')