In [12]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split


In [2]:
# Author: John Lee 
# This notebook is to perform EDA on LoL dataset available from Kaggle
# URL for the dataset: https://www.kaggle.com/datasets/bobbyscience/league-of-legends-diamond-ranked-games-10-min

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Load Dataset
wDir = os.getcwd()
df = pd.read_csv(os.path.join(wDir, 'cleaned_df.csv'))

# Training Various Models

In [3]:
def data_prep(df):
    y = df['blueWins']
    X = df.drop('blueWins', axis = 1)
    scaler = MinMaxScaler()
    scaler.fit(X)
    X_sc = scaler.transform(X)
    
    # Split the dataset into train and test. By default the ratio is 70/30
    # Ratio can be hard-coded into different values
    
    X_train, X_test, y_train, y_test = train_test_split(X_sc, y, test_size=0.3, random_state=42)
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = data_prep(df)

## Logistic Regression

In [17]:
# Logistic Regression Modeling. By default, this does not save the model but an user can set save = True
# If save is True, the model result will be saved in pickle format.

def logReg(X, y, save = False):
    lm = LogisticRegression()
    
    # Setup a matrix of parameters for grid search
    param_grid = [    
        {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
        'C' : np.logspace(-4, 4, 20),
        'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
        'max_iter' : [100, 1000, 2000]
        }
    ]
    
    # Start grid searching with 3-fold cross validation. CV can be altered into different number
    # Due to the limitation of local machine with computation cost, cv is set to 3 by default.
    cv_lm = GridSearchCV(lm, param_grid = param_grid, cv = 3, verbose = True, n_jobs = -1)
    best_cvlm = cv_lm.fit(X, y)
    print (f'Accuracy - : {best_cvlm.score(X, y):.3f}')
    
    if save:
        # Save the model by dumping it into pickle
        filename = os.path.join(wDir, 'models/logReg_model.sav')
        pickle.dump(best_cvlm, open(filename, 'wb'))

    return best_cvlm

## Random Forest Classifier

In [21]:
def rfc(X, y, save = False):
    rfc=RandomForestClassifier(random_state=42)
    param_grid = { 
        'n_estimators': [200, 500],
        'max_features': ['auto', 'sqrt', 'log2'],
        'max_depth' : [4,5,6,7,8],
        'criterion' :['gini', 'entropy']
    }

    CV_rfc = GridSearchCV(estimator = rfc, param_grid = param_grid, cv = 5)
    CV_rfc.fit(X, y)
    print (f'Accuracy - : {CV_rfc.score(X, y):.3f}')
    
    if save:
        # Save the model by dumping it into pickle
        filename = os.path.join(wDir, 'models/rfc_model.sav')
        pickle.dump(CV_rfc, open(filename, 'wb'))

    return CV_rfc

## XGBoost Classifier

In [26]:
def xgboost(X, y, save = False):
    xgb = XGBClassifier(use_label_encoder = False, random_state = 42)
    
    param_grid = { 
        "learning_rate": [0.0001,0.001, 0.01, 0.1, 1] ,
        "max_depth": [3,8,15],
        "gamma": [i/10.0 for i in range(0,5)],
        "colsample_bytree": [i/10.0 for i in range(3,10)],
        "reg_alpha": np.logspace(-4,2,5),
        "reg_lambda": np.logspace(-4,2,5)}
    scoring = ['recall']
    
    CV_xgb = RandomizedSearchCV(estimator = xgb, param_distributions = param_grid, n_iter = 48, 
                                scoring = scoring, refit = 'recall', n_jobs = -1, cv = 3, verbose=0)
    
    CV_xgb.fit(X, y)
    print (f'Accuracy - : {CV_xgb.score(X, y):.3f}')
    
    if save:
        # Save the model by dumping it into pickle
        filename = os.path.join(wDir, 'models/XGB_model.sav')
        pickle.dump(CV_xgb, open(filename, 'wb'))

    return CV_xgb

In [23]:
rfc(X_train, y_train, True)

Accuracy - : 0.770


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500]})

In [27]:
xgboost(X_train, y_train, True)

Accuracy - : 0.754


RandomizedSearchCV(cv=3,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                   param_distributions={'colsample_bytree': [0.3, 0.4, 0.5, 0.6,
                                                             0.7, 0.8, 0.9],
                       







