In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Import packages
import pandas as pd
import numpy as np
np.random.seed(42)
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from xgboost import XGBRFClassifier, XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
df = pd.read_csv('high_diamond_ranked_10min.csv')

In [3]:
# Display basic information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9879 entries, 0 to 9878
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   gameId                        9879 non-null   int64  
 1   blueWins                      9879 non-null   int64  
 2   blueWardsPlaced               9879 non-null   int64  
 3   blueWardsDestroyed            9879 non-null   int64  
 4   blueFirstBlood                9879 non-null   int64  
 5   blueKills                     9879 non-null   int64  
 6   blueDeaths                    9879 non-null   int64  
 7   blueAssists                   9879 non-null   int64  
 8   blueEliteMonsters             9879 non-null   int64  
 9   blueDragons                   9879 non-null   int64  
 10  blueHeralds                   9879 non-null   int64  
 11  blueTowersDestroyed           9879 non-null   int64  
 12  blueTotalGold                 9879 non-null   int64  
 13  blu

According to the chart above, we can see that we have all numerical data and no null values to address.

# Data Preparation

In [4]:
# Drop gameId column (this is an identifier for each match -- not useful for our model)
df.drop('gameId', axis=1, inplace=True)
df.head()

Unnamed: 0,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueHeralds,...,redTowersDestroyed,redTotalGold,redAvgLevel,redTotalExperience,redTotalMinionsKilled,redTotalJungleMinionsKilled,redGoldDiff,redExperienceDiff,redCSPerMin,redGoldPerMin
0,0,28,2,1,9,6,11,0,0,0,...,0,16567,6.8,17047,197,55,-643,8,19.7,1656.7
1,0,12,1,0,5,5,5,0,0,0,...,1,17620,6.8,17438,240,52,2908,1173,24.0,1762.0
2,0,15,0,0,7,11,4,1,1,0,...,0,17285,6.8,17254,203,28,1172,1033,20.3,1728.5
3,0,43,1,0,4,5,5,1,0,1,...,0,16478,7.0,17961,235,47,1321,7,23.5,1647.8
4,0,75,4,0,6,6,6,0,0,0,...,0,17404,7.0,18313,225,67,1004,-230,22.5,1740.4


In [5]:
# Create df with our selected features
df_select = df[['blueWins','blueWardsPlaced', 'blueWardsDestroyed', 'blueFirstBlood', 'blueKills', 'blueDeaths', 'blueAssists', 
                'blueDragons', 'blueHeralds', 'blueTowersDestroyed', 'blueTotalMinionsKilled', 'blueTotalJungleMinionsKilled',
                'redWardsPlaced', 'redWardsDestroyed', 'redFirstBlood', 'redKills', 'redDeaths', 'redAssists', 'redDragons', 
                'redHeralds', 'redTowersDestroyed', 'redTotalMinionsKilled', 'redTotalJungleMinionsKilled']]
df_select.head()

Unnamed: 0,blueWins,blueWardsPlaced,blueWardsDestroyed,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueDragons,blueHeralds,blueTowersDestroyed,...,redWardsDestroyed,redFirstBlood,redKills,redDeaths,redAssists,redDragons,redHeralds,redTowersDestroyed,redTotalMinionsKilled,redTotalJungleMinionsKilled
0,0,28,2,1,9,6,11,0,0,0,...,6,0,6,9,8,0,0,0,197,55
1,0,12,1,0,5,5,5,0,0,0,...,1,1,5,5,2,1,1,1,240,52
2,0,15,0,0,7,11,4,1,0,0,...,3,1,11,7,14,0,0,0,203,28
3,0,43,1,0,4,5,5,0,1,0,...,2,1,5,4,10,0,0,0,235,47
4,0,75,4,0,6,6,6,0,0,0,...,2,1,6,6,7,1,0,0,225,67


In [6]:
# Drop the features that are duplicates of each other
df_select.drop(columns=['redKills', 'redDeaths', 'redFirstBlood'], 
               inplace=True)
df_select.columns

Index(['blueWins', 'blueWardsPlaced', 'blueWardsDestroyed', 'blueFirstBlood',
       'blueKills', 'blueDeaths', 'blueAssists', 'blueDragons', 'blueHeralds',
       'blueTowersDestroyed', 'blueTotalMinionsKilled',
       'blueTotalJungleMinionsKilled', 'redWardsPlaced', 'redWardsDestroyed',
       'redAssists', 'redDragons', 'redHeralds', 'redTowersDestroyed',
       'redTotalMinionsKilled', 'redTotalJungleMinionsKilled'],
      dtype='object')

# Logistic Regression

In [7]:
# Test train split

X = df_select.drop('blueWins', axis=1)
y = df_select['blueWins']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
# Train and test a simple logistical regression model to get a baseline 
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.73      0.72      1624
           1       0.73      0.72      0.73      1637

    accuracy                           0.72      3261
   macro avg       0.72      0.72      0.72      3261
weighted avg       0.72      0.72      0.72      3261



# GridSearch CV - Logistic Regression

In [9]:
# Define the parameter grid to search over
param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'fit_intercept': [True, False],'penalty': ['l1', 'l2'], 
              'max_iter': [500], 'solver': ['liblinear']}

log_reg = LogisticRegression()

grid_search = GridSearchCV(log_reg, param_grid, cv=5)

grid_search.fit(X_train, y_train)


In [10]:
# Print the best hyperparameters and score
print("Best hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best hyperparameters:  {'C': 0.01, 'fit_intercept': True, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}
Best score:  0.7221228874228443


In [11]:
print("Test score: ", grid_search.score(X_test, y_test))

Test score:  0.7160380251456608


In [12]:
log_reg = LogisticRegression(C=0.01,fit_intercept=True,max_iter=500,penalty='l1',solver='liblinear')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.72      0.72      1624
           1       0.72      0.71      0.72      1637

    accuracy                           0.72      3261
   macro avg       0.72      0.72      0.72      3261
weighted avg       0.72      0.72      0.72      3261



# XGBoost: Random Forest

In [13]:
# Create functions to facilitate scaling, fiting and evaluating multiple 
# dataframes.

def evaluate_model(model, X_train, y_train, X_test, y_test, digits=4, 
                   figsize=(10,5), params=False): 
        
    # Get Predictions
    y_hat_test = model.predict(X_test)
    y_hat_train = model.predict(X_train)
    
    # Classification Report 
    
    print("Classification Report")
    
    print(metrics.classification_report(y_test,y_hat_test, digits=digits))


def fit_eval(model, X_train, y_train, X_test, y_test, digits=4, 
             figsize=(10,5), params=False):

    model.fit(X_train, y_train)

    evaluate_model(model, X_train, y_train, X_test, y_test, digits=digits, 
                   figsize=figsize, params=params)
    
    return model

In [14]:
# Fit and evaluate XGBoost 
xgb_select = fit_eval(XGBRFClassifier(random_state=42), \
                      X_train, y_train, \
                      X_test, y_test)

Classification Report
              precision    recall  f1-score   support

           0     0.7128    0.7014    0.7070      1624
           1     0.7084    0.7196    0.7139      1637

    accuracy                         0.7105      3261
   macro avg     0.7106    0.7105    0.7105      3261
weighted avg     0.7106    0.7105    0.7105      3261



# GridSearch CV - XGBoost: Random Forest

In [15]:
# Create parameter grid for XGBoost Random Forest gridsearch and fit to data.
xgb_rf = XGBRFClassifier(random_state=42)

params = {'learning_rate': [0.03, 0.05, 0.06],
          'max_depth': [4, 5, 6],
          'min_child_weight': [2, 3, 4],
          'subsample': [0.3, 0.4, 0.5],
          'n_estimators': [100]}
xgb_grid = GridSearchCV(xgb_rf, params, scoring='recall_macro')
xgb_grid.fit(X_train, y_train)

In [16]:
# Print best params for xgb_grid
xgb_grid.best_params_

{'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 4,
 'n_estimators': 100,
 'subsample': 0.3}

In [17]:
# Print best score for xgb_grid
print("Best score: ", xgb_grid.best_score_)

Best score:  0.715168598213967


In [18]:
# Evaluate best estimating model
evaluate_model(xgb_grid.best_estimator_, X_train, y_train, \
               X_test, y_test, params=True)

Classification Report
              precision    recall  f1-score   support

           0     0.7166    0.7131    0.7148      1624
           1     0.7167    0.7202    0.7185      1637

    accuracy                         0.7167      3261
   macro avg     0.7167    0.7166    0.7166      3261
weighted avg     0.7167    0.7167    0.7166      3261



# Naive Bayes

In [19]:
# Create functions to facilitate scaling, fiting and evaluating multiple 
def load_data(filename):
    data = pd.read_csv(filename)
    return data

def train_model(X_train, y_train):
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [20]:
# Fit and evaluate
filename = "high_diamond_ranked_10min.csv"
data = load_data(filename)

X = data.drop("blueWins", axis=1)  # Features (excluding the target variable)
y = data["blueWins"]  # Target variable (game_result)

# Split the dataset into training (67%) and testing (33%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = train_model(X_train, y_train)
evaluate_model(model, X_test, y_test)

Accuracy: 72.28%

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.73      0.72      1624
           1       0.73      0.72      0.72      1637

    accuracy                           0.72      3261
   macro avg       0.72      0.72      0.72      3261
weighted avg       0.72      0.72      0.72      3261


Confusion Matrix:
[[1181  443]
 [ 461 1176]]
