# Setting Up Environment

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from keras.models import Sequential
# from keras.layers import Dense
# from keras import regularizers
# from keras.utils.vis_utils import model_to_dot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.utils import resample

import gym

from stable_baselines3 import PPO
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import DummyVecEnv

In [2]:
def grid_search_classifier(function, param_grid, cv = 10):
    # Running Grid Search
    grid_search = GridSearchCV(function, param_grid, cv = cv)
    grid_search.fit(x_train, y_train)
    
    # Outputting results
    print(f'Best parameters are: {grid_search.best_params_}\n')
    print(f'Accuracy is: {grid_search.score(x_test, y_test)}\n')
    try:
        print(f'AUC score is: {roc_auc_score(y_test, grid_search.predict_proba(x_test)[:, 1])}\n')
    except:
        pass
    best_model = grid_search.best_estimator_
    pred_rf = best_model.predict(x_test)
    print(f'Classification report:\n {classification_report(y_test, pred_rf, target_names = ["Lower Ranked", "Higher Ranked"])}')
    print(f'MCC is: {matthews_corrcoef(y_test, pred_rf)}')
    
    return best_model

In [85]:
data = pd.read_csv('mma_data.csv', index_col=0)

# Filtering out unwanted rows
data = data[data.result >= 0]
data = data[data.slpm_2 + data.sapm_2 != 0]
data = data[data.slpm_1 + data.sapm_1 != 0]

In [86]:
# Engineering some columns
data['reach_diff'] = data.reach_1 - data.reach_2
data['age_diff'] = data.age_1 - data.age_2
data['slpm_diff'] = data.slpm_1 - data.slpm_2
data['sapm_diff'] = data.sapm_1 - data.sapm_2
data['td_acc_diff'] = data.td_acc_1 - data.td_acc_2
data['td_def_diff'] = data.td_def_1 - data.td_def_2
data['td_avg_diff'] = data.td_avg_1 - data.td_avg_2
data['sub_avg_diff'] = data.sub_avg_1 - data.sub_avg_2
data['strk_acc_diff'] = data.strk_acc_1 - data.strk_acc_2
data['strk_def_diff'] = data.strk_def_1 - data.strk_def_2
data['wins_diff'] = data.wins_1 - data.wins_2
data['losses_diff'] = data.losses_1 - data.losses_2
data['win_pct_1'] = data.wins_1/(data.losses_1 + data.wins_1)
data['win_pct_2'] = data.wins_2/(data.losses_2 + data.wins_2)
data['win_pct_diff'] = data.win_pct_1 - data.win_pct_2

# Droping unecessary columnns and scaling data
data.drop(['fighter_1', 'fighter_2'], axis = 1, inplace = True)
x_cols = ['reach_diff', 'age_diff', 'slpm_diff', 'sapm_diff', 'td_acc_diff', 'td_def_diff',
              'td_avg_diff', 'sub_avg_diff', 'strk_acc_diff', 'strk_def_diff', 'wins_diff',
              'losses_diff', 'win_pct_diff', 'weight_1', 'age_1']
y_col = ['result']
x, y = data[x_cols], data[y_col]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)

# Formatting data
x_train = x_train.values
y_train = y_train.values.ravel()
x_test = x_test.values
y_test = y_test.values.ravel()

# Predicting KO

In [60]:
y = data['KO_OVR']

# Upsampling KO
x_up, y_up = resample(x[y==1], y[y==1], replace = True, random_state = 0, n_samples = x[y==0].shape[0])

x = np.vstack((x[y==0], x_up))
y = np.hstack((y[y==0], y_up))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)

# Formatting data
# x_train = x_train.values
# y_train = y_train.values.ravel()
# x_test = x_test.values
# y_test = y_test.values.ravel()

### Setting Baseline

In [61]:
print(f'Always predicting no knockout would yield {100 - data.KO_OVR.mean()*100}% accuracy')

df = data.copy()
df['Higher_Pct'] = df.slpm_1.apply(lambda x: 1 if x > 0 else 0)
df['Result_Tracker'] = df.apply(lambda x: 1 if (x.Higher_Pct == 1) & (x.result == 1)
                               else 1 if (x.Higher_Pct == 0) & (x.result == 0)
                               else 0, axis = 1)
print(f'Predicting the fighter with the higher SLPM would yield {df.Result_Tracker.mean()*100}% accuracy')

Always predicting no knockout would yield 61.688311688311686% accuracy
Predicting the fighter with the higher SLPM would yield 57.57575757575758% accuracy


### Random Forest

In [62]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
rf = grid_search_classifier(RandomForestClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 9, 'max_features': 8, 'n_estimators': 15}

Accuracy is: 0.7982456140350878

AUC score is: 0.912280701754386

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.85      0.72      0.78        57
Higher Ranked       0.76      0.88      0.81        57

     accuracy                           0.80       114
    macro avg       0.81      0.80      0.80       114
 weighted avg       0.81      0.80      0.80       114

MCC is: 0.6040686963408962


### Gradient Boost

In [63]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
gb = grid_search_classifier(GradientBoostingClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 10, 'max_features': 3, 'n_estimators': 13}

Accuracy is: 0.8947368421052632

AUC score is: 0.956294244382887

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.92      0.86      0.89        57
Higher Ranked       0.87      0.93      0.90        57

     accuracy                           0.89       114
    macro avg       0.90      0.89      0.89       114
 weighted avg       0.90      0.89      0.89       114

MCC is: 0.7914248115215929


### Logistic Regression

In [72]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Creating parameter grid

c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c
}

# Running ML function
lr = grid_search_classifier(LogisticRegression(random_state = 0, max_iter = 500), param_grid)

### SVM

In [47]:
# Creating parameter grid

kernel = ['rbf', 'poly', 'sigmoid']
degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 5)]
c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c,
    'kernel' : kernel,
    'degree' : degree
}

# Running ML function
svm = grid_search_classifier(SVC(random_state = 0), param_grid)

Best parameters are: {'C': 10, 'degree': 5, 'kernel': 'poly'}

Accuracy is: 0.5609756097560976

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.59      0.84      0.69        49
Higher Ranked       0.38      0.15      0.22        33

     accuracy                           0.56        82
    macro avg       0.49      0.49      0.46        82
 weighted avg       0.51      0.56      0.50        82



# Predicting SUB

### Setting Up Data

In [90]:
x, y = data[x_cols], data['SUB_OVR']

# Upsampling KO
x_up, y_up = resample(x[y==1], y[y==1], replace = True, random_state = 0, n_samples = x[y==0].shape[0])

x = np.vstack((x[y==0], x_up))
y = np.hstack((y[y==0], y_up))

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0, stratify = y)

### Setting Baseline

In [91]:
print(f'Always predicting no sub would yield {100 - data.SUB_OVR.mean()*100}% accuracy')

Always predicting no sub would yield 80.73593073593074% accuracy


### Random Forest

In [92]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
rf = grid_search_classifier(RandomForestClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 9, 'max_features': 3, 'n_estimators': 14}

Accuracy is: 0.96

AUC score is: 0.9939555555555556

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.97      0.95      0.96        75
Higher Ranked       0.95      0.97      0.96        75

     accuracy                           0.96       150
    macro avg       0.96      0.96      0.96       150
 weighted avg       0.96      0.96      0.96       150

MCC is: 0.9203272856738179


### Gradient Boost

In [93]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
gb = grid_search_classifier(GradientBoostingClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 10, 'max_features': 4, 'n_estimators': 12}

Accuracy is: 0.92

AUC score is: 0.9989333333333333

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       1.00      0.84      0.91        75
Higher Ranked       0.86      1.00      0.93        75

     accuracy                           0.92       150
    macro avg       0.93      0.92      0.92       150
 weighted avg       0.93      0.92      0.92       150

MCC is: 0.8509629433967631


### Logistic Regression

In [94]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Creating parameter grid

c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c
}

# Running ML function
lr = grid_search_classifier(LogisticRegression(random_state = 0, max_iter = 500, class_weight = 'balanced'), param_grid)

Best parameters are: {'C': 0.001}

Accuracy is: 0.6133333333333333

AUC score is: 0.648888888888889

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.62      0.59      0.60        75
Higher Ranked       0.61      0.64      0.62        75

     accuracy                           0.61       150
    macro avg       0.61      0.61      0.61       150
 weighted avg       0.61      0.61      0.61       150

MCC is: 0.22698972639472081


### SVM

In [95]:
# Creating parameter grid

kernel = ['rbf', 'poly', 'sigmoid']
degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 5)]
c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c,
    'kernel' : kernel,
    'degree' : degree
}

# Running ML function
svm = grid_search_classifier(SVC(random_state = 0), param_grid)

Best parameters are: {'C': 100, 'degree': 2, 'kernel': 'rbf'}

Accuracy is: 0.9

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       1.00      0.80      0.89        75
Higher Ranked       0.83      1.00      0.91        75

     accuracy                           0.90       150
    macro avg       0.92      0.90      0.90       150
 weighted avg       0.92      0.90      0.90       150

MCC is: 0.816496580927726
