In [4]:
# Pandas, numpy, matplotlib and seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# RiotWatcher
from riotwatcher import LolWatcher, ApiError

# OS tools
import os
import shutil
import json
import sys
import dill
import ipython_genutils

# Custom scripts
from extract_players_performance import extract_players_performance
from remove_perks import remove_perks
from cleaner import replace_champ_names_with_tags
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


In [7]:
champ_data = {}
with open('../champion_data/champion.json','r',encoding='utf-8') as f :
    champions = json.load(f)
    for champion in champions['data'] :
        champ_data[str.lower(champion)] = champions['data'][champion]
    



def add(old,new_df) :
    appended_df = pd.concat([old,new_df],ignore_index=True)
    return appended_df

def champ_name_replacer(champ_name) :
    return champ_data[str.lower(champ_name)]['tags'][0]


In [8]:

df = pd.read_csv('../ml_data/full_ml_data.csv')
columns = list(df.columns)
champ_name_columns = [c for c in columns if c.startswith('team_comp') or c.startswith('dmg_carry') or c.startswith('obj_carry')]


for c in champ_name_columns :
    df[c] = df[c].apply(lambda x : champ_name_replacer(x))

print(f"dataframe shape:\n{df.shape}\n")
print(f"dataframe sample:\n{df.head(1)}\n")

dataframe shape:
(74239, 51)

dataframe sample:
   total_gold_earned_0  total_gold_spent_0  total_baron_kills_0  \
0               102871              112435                    3   

   total_dragon_kills_0  total_inhibitor_kils_0  total_kills_0  \
0                     3                       4             50   

   total_deaths_0  total_damage_dealt_to_champions_0  \
0              67                             251493   

   total_damage_dealt_to_objectives_0  total_damage_taken_0  ...  \
0                              114384                277343  ...   

   team_comp_1_champ_5  gameLengthMin  dmg_to_champs_winner  \
0                 Mage              0                     0   

   dmg_to_obj_winner vision_winner cs_winner champ_experience_winner  \
0                  0             0         1                       1   

  wards_placed_winner gold_spender_winner final_match_winner  
0                   1                   0                  1  

[1 rows x 51 columns]



In [9]:

### MUST RETRIEVE ALL POSSIBLE CHAMP TAGS BEFORE BEING ABLE TO ONE HOT ENCODE. 
# ONE POSSIBLE SOLUTION IS TO ADD DUMMY DATA FOR WHICH IT WOULD HELP GENERATE THE ONE HOT ENCODE VALUE FOR EACH CHAMPION TAG FOR EACH CATEGORICAL FEATURE IN THE DATASET (6 CHAMP TAGS -> 6 EXTRA DUMMY ROWS, 1 FOR EACH CHAMP TAG)
def fill_missing_champ_tags_with_dummy(df): 
    unique_champ_tags = df[champ_name_columns].stack().unique()
    dummy_dict = {}
    for k in champ_name_columns :
        dummy_dict[k] = unique_champ_tags.copy()
    dummy_df = pd.DataFrame(dummy_dict)
    dummy_df = dummy_df.replace(np.nan,0)
    extra_rows = dummy_df.shape[0]
    print('Number of extra dummy rows = ',extra_rows)
    df = add(df,dummy_df)
    return (df,extra_rows)

(74239, 51)


In [10]:

data = df.drop("final_match_winner",axis=1)
labels = df["final_match_winner"].copy()


X_train,X_test,y_train,y_test= train_test_split(data,labels,test_size=0.33,random_state=42)

print('Shape of training data and labels :',X_train.shape,y_train.shape)
print('Shape of test data and labels :',X_test.shape,y_test.shape)

Shape of training data and labels : (49740, 50) (49740,)
Shape of test data and labels : (24499, 50) (24499,)


In [11]:
missing_value_row =  data[data.isnull().any(axis=1)].head()
print(f'Rows with missing values:\n {missing_value_row}')
print(f'Missing value row shape: {missing_value_row.shape}')


Rows with missing values:
 Empty DataFrame
Columns: [total_gold_earned_0, total_gold_spent_0, total_baron_kills_0, total_dragon_kills_0, total_inhibitor_kils_0, total_kills_0, total_deaths_0, total_damage_dealt_to_champions_0, total_damage_dealt_to_objectives_0, total_damage_taken_0, average_vision_score_0, total_wards_placed_0, average_creep_score_0, average_champion_experience_0, dmg_carry_0, obj_carry_0, team_comp_0_champ_1, team_comp_0_champ_2, team_comp_0_champ_3, team_comp_0_champ_4, team_comp_0_champ_5, total_gold_earned_1, total_gold_spent_1, total_baron_kills_1, total_dragon_kills_1, total_inhibitor_kils_1, total_kills_1, total_deaths_1, total_damage_dealt_to_champions_1, total_damage_dealt_to_objectives_1, total_damage_taken_1, average_vision_score_1, total_wards_placed_1, average_creep_score_1, average_champion_experience_1, dmg_carry_1, obj_carry_1, team_comp_1_champ_1, team_comp_1_champ_2, team_comp_1_champ_3, team_comp_1_champ_4, team_comp_1_champ_5, gameLengthMin, dmg_to

In [12]:
X_train,dummy_rows_len = fill_missing_champ_tags_with_dummy(X_train)
print('Training data shape (with dummy) :',X_train.shape)
print('Columns of training data :',X_train.columns)

num_pipeline = Pipeline([
    ('std_scaler',StandardScaler())
])

full_column_set,cat_column_set = set(list(X_train.columns)),set(champ_name_columns)
num_columns = list(full_column_set - cat_column_set)

full_pipeline = ColumnTransformer([
        ('num',num_pipeline,num_columns),
        ("cat", OneHotEncoder(), champ_name_columns),
    ],remainder='passthrough')

X_train_prepared = full_pipeline.fit_transform(X_train)
size = len(X_train_prepared)
X_train_prepared = X_train_prepared[:size-dummy_rows_len]
print('One hot encoded Training data shape (without dummy) :',X_train_prepared.shape)


Number of extra dummy rows =  6
Training data shape (with dummy) : (49746, 50)
Columns of training data : Index(['total_gold_earned_0', 'total_gold_spent_0', 'total_baron_kills_0',
       'total_dragon_kills_0', 'total_inhibitor_kils_0', 'total_kills_0',
       'total_deaths_0', 'total_damage_dealt_to_champions_0',
       'total_damage_dealt_to_objectives_0', 'total_damage_taken_0',
       'average_vision_score_0', 'total_wards_placed_0',
       'average_creep_score_0', 'average_champion_experience_0', 'dmg_carry_0',
       'obj_carry_0', 'team_comp_0_champ_1', 'team_comp_0_champ_2',
       'team_comp_0_champ_3', 'team_comp_0_champ_4', 'team_comp_0_champ_5',
       'total_gold_earned_1', 'total_gold_spent_1', 'total_baron_kills_1',
       'total_dragon_kills_1', 'total_inhibitor_kils_1', 'total_kills_1',
       'total_deaths_1', 'total_damage_dealt_to_champions_1',
       'total_damage_dealt_to_objectives_1', 'total_damage_taken_1',
       'average_vision_score_1', 'total_wards_placed_

In [13]:
# create the following four different models with their default hyperparameter values to be trained using the preprocessed data
# Support Vector Machine
svm_model = SVC()

# Decision Trees
dt_model = DecisionTreeClassifier()

# Random Forests
rf_model = RandomForestClassifier()

# Naive Bayes
nb_model = GaussianNB()

In [14]:
# use sklearn GridSearchCV to train selected model with hyperparameter tuning
# parameters for SVC:
    # C -> e.g., 10, 100
    # gamma ->  e.g., 0.001, 0.0001
    # kernel -> 'rbf' or 'linear' 

svm_params = [
    {'C':[10,100],'gamma':[0.001,0.0001],'kernel':['rbf','linear']}
]

# parameters for DecisionTreeClassifier: 
    # max_depth ->  e.g., 3, 4
    # min_samples_split -> 5, 10
    # min_samples_leaf -> 10, 20
dt_params = [
    {'max_depth':[3,4],'min_samples_split':[5,10],'min_samples_leaf':[10,20]}
]

# parameters for RandomForestClassifier: 
    # n_estimators -> 100, 200
    # max_depth -> 3, 5
    # bootstrap -> True, False
rf_params = [
    {'n_estimators':[100,200],'max_depth':[3,5],'bootstrap':[True,False]}
]

nb_params = [{}]
# initialize gridsearch with the required parameters, including the following scoring methods and refit='bal_accuracy' (2)
scoring = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}
grid_search_svc = GridSearchCV(svm_model,svm_params,scoring=scoring,cv=5,refit='bal_accuracy' ,return_train_score=True)
grid_search_dt = GridSearchCV(dt_model,dt_params,scoring=scoring,cv=5,refit='bal_accuracy' ,return_train_score=True)
grid_search_rf = GridSearchCV(rf_model,rf_params,scoring=scoring,cv=5,refit='bal_accuracy' ,return_train_score=True)
grid_search_nb = GridSearchCV(nb_model,nb_params,scoring=scoring,cv=5,refit='bal_accuracy' ,return_train_score=True)

# fit the training data (0.5)
grid_search_svc.fit(X_train_prepared,y_train)
grid_search_dt.fit(X_train_prepared,y_train)
grid_search_rf.fit(X_train_prepared,y_train)
grid_search_nb.fit(X_train_prepared,y_train)

# print the best parameters (0.5)
print(f'SVC best params:\n{grid_search_svc.best_params_}')
print(f'Decision Tree best params:\n{grid_search_dt.best_params_}')
print(f'Random Forest best params:\n{grid_search_rf.best_params_}')
print(f'Naive Bayes best params:\n{grid_search_nb.best_params_}')

# print the best estimator (0.5)
print(f'SVC best estimator:\n{grid_search_svc.best_estimator_}')
print(f'Decision Tree best estimator:\n{grid_search_dt.best_estimator_}')
print(f'Random Forest best estimator:\n{grid_search_rf.best_estimator_}')
print(f'Naive Bayes best estimator:\n{grid_search_nb.best_estimator_}')

# print the best score from trained GridSearchCV model (0.5)
print(f'SVC best score:\n{grid_search_svc.best_score_}')
print(f'Decision Tree best score:\n{grid_search_dt.best_score_}')
print(f'Random Forest best score:\n{grid_search_rf.best_score_}')
print(f'Naive Bayes best score:\n{grid_search_nb.best_score_}')

# Save session to "notebook_env.db"
dill.dump_session("notebook_env.db")

SVC best params:
{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Decision Tree best params:
{'max_depth': 4, 'min_samples_leaf': 20, 'min_samples_split': 5}
Random Forest best params:
{'bootstrap': False, 'max_depth': 5, 'n_estimators': 100}
Naive Bayes best params:
{}
SVC best estimator:
SVC(C=100, gamma=0.001)
Decision Tree best estimator:
DecisionTreeClassifier(max_depth=4, min_samples_leaf=20, min_samples_split=5)
Random Forest best estimator:
RandomForestClassifier(bootstrap=False, max_depth=5)
Naive Bayes best estimator:
GaussianNB()
SVC best score:
0.9895644071744701
Decision Tree best score:
0.9828303342650827
Random Forest best score:
0.9772318794161038
Naive Bayes best score:
0.9541117808059631


### Classification Reports

In [3]:
# Load jupyter notebook state
dill.load_session("notebook_env.db")

In [7]:
# Prepare X_test dataset based on previous method for X_train
X_test,dummy_rows_len = fill_missing_champ_tags_with_dummy(X_test)
print('Training data shape (with dummy) :',X_test.shape)
print('Columns of training data :',X_test.columns)

num_pipeline = Pipeline([
    ('std_scaler',StandardScaler())
])

full_column_set,cat_column_set = set(list(X_test.columns)),set(champ_name_columns)
num_columns = list(full_column_set - cat_column_set)

full_pipeline = ColumnTransformer([
        ('num',num_pipeline,num_columns),
        ("cat", OneHotEncoder(), champ_name_columns),
    ],remainder='passthrough')

X_test_prepared = full_pipeline.fit_transform(X_test)
size = len(X_test_prepared)
X_test_prepared = X_test_prepared[:size-dummy_rows_len]
print('One hot encoded Training data shape (without dummy) :',X_test_prepared.shape)

Number of extra dummy rows =  6
Training data shape (with dummy) : (24505, 50)
Columns of training data : Index(['total_gold_earned_0', 'total_gold_spent_0', 'total_baron_kills_0',
       'total_dragon_kills_0', 'total_inhibitor_kils_0', 'total_kills_0',
       'total_deaths_0', 'total_damage_dealt_to_champions_0',
       'total_damage_dealt_to_objectives_0', 'total_damage_taken_0',
       'average_vision_score_0', 'total_wards_placed_0',
       'average_creep_score_0', 'average_champion_experience_0', 'dmg_carry_0',
       'obj_carry_0', 'team_comp_0_champ_1', 'team_comp_0_champ_2',
       'team_comp_0_champ_3', 'team_comp_0_champ_4', 'team_comp_0_champ_5',
       'total_gold_earned_1', 'total_gold_spent_1', 'total_baron_kills_1',
       'total_dragon_kills_1', 'total_inhibitor_kils_1', 'total_kills_1',
       'total_deaths_1', 'total_damage_dealt_to_champions_1',
       'total_damage_dealt_to_objectives_1', 'total_damage_taken_1',
       'average_vision_score_1', 'total_wards_placed_

In [9]:
# Testing models with test data

# Using the following existing variables:
# X_test_prepared: test data
# y_test: test labels
# Models:
# grid_search_svc
# grid_search_dt
# grid_search_rf
# grid_search_nb

# Predict using models' best estimators
prediction_svc = grid_search_svc.best_estimator_.predict(X_test_prepared)
prediction_dt = grid_search_dt.best_estimator_.predict(X_test_prepared)
prediction_rf = grid_search_rf.best_estimator_.predict(X_test_prepared)
prediction_nb = grid_search_nb.best_estimator_.predict(X_test_prepared)

In [10]:
# Create and print classification reports for all models

# SVC
conf_matrix_svc = confusion_matrix(y_test, prediction_svc)
class_report_svc = classification_report(y_test, prediction_svc, output_dict=True)
print(f'Confusion matrix: SVC:\n{conf_matrix_svc}\n')

# Decision Tree
conf_matrix_dt = confusion_matrix(y_test, prediction_dt)
class_report_dt = classification_report(y_test, prediction_dt, output_dict=True)
print(f'Confusion matrix: DT:\n{conf_matrix_dt}\n')

# Random Forest
conf_matrix_rf = confusion_matrix(y_test, prediction_rf)
class_report_rf = classification_report(y_test, prediction_rf, output_dict=True)
print(f'Confusion matrix: RF:\n{conf_matrix_rf}\n')

# Naive Bayes
conf_matrix_nb = confusion_matrix(y_test, prediction_nb)
class_report_nb = classification_report(y_test, prediction_nb, output_dict=True)
print(f'Confusion matrix: NB:\n{conf_matrix_nb}\n')

Confusion matrix: SVC:
[[6953 5619]
 [6786 5141]]

Confusion matrix: DT:
[[5559 7013]
 [6512 5415]]

Confusion matrix: RF:
[[5534 7038]
 [7285 4642]]

Confusion matrix: NB:
[[6637 5935]
 [6517 5410]]



In [15]:
# Print our classification reports

# SVC
df_svc = pd.DataFrame(class_report_svc).transpose().drop(['0', '1'])
print(f"{df_svc}\n")

# DT
df_dt = pd.DataFrame(class_report_dt).transpose().drop(['0', '1'])
print(f"{df_dt}\n")

# RF
df_rf = pd.DataFrame(class_report_rf).transpose().drop(['0', '1'])
print(f"{df_rf}\n")

# NB
df_nb = pd.DataFrame(class_report_nb).transpose().drop(['0', '1'])
print(f"{df_nb}\n")

              precision    recall  f1-score       support
accuracy       0.493653  0.493653  0.493653      0.493653
macro avg      0.491933  0.492047  0.490868  24499.000000
weighted avg   0.492305  0.493653  0.491859  24499.000000

              precision    recall  f1-score       support
accuracy       0.447937  0.447937  0.447937      0.447937
macro avg      0.448117  0.448092  0.447918  24499.000000
weighted avg   0.448444  0.447937  0.448003  24499.000000

              precision    recall  f1-score       support
accuracy       0.415364  0.415364  0.415364      0.415364
macro avg      0.414567  0.414693  0.414588  24499.000000
weighted avg   0.415018  0.415364  0.415149  24499.000000

              precision    recall  f1-score       support
accuracy       0.491734  0.491734  0.491734      0.491734
macro avg      0.490712  0.490756  0.490456  24499.000000
weighted avg   0.491076  0.491734  0.491128  24499.000000



In [None]:
# Save session to "notebook_env.db"
dill.dump_session("notebook_env.db")