# Setting Up Environment

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from keras.models import Sequential
# from keras.layers import Dense
# from keras import regularizers
# from keras.utils.vis_utils import model_to_dot

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

import gym

from stable_baselines3 import PPO
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import DummyVecEnv



# Populating LGBM Predictions

### Training on Data from Before the Predictions DF Starts

In [8]:
# Load the fight data (first CSV column is used as the index) and keep only
# usable rows: a non-negative result, and a non-zero slpm + sapm total for
# each of the two fighters.
data = (
    pd.read_csv('mma_data.csv', index_col=0)
    .loc[lambda fights: fights['result'] >= 0]
    .loc[lambda fights: fights['slpm_2'] + fights['sapm_2'] != 0]
    .loc[lambda fights: fights['slpm_1'] + fights['sapm_1'] != 0]
)

In [26]:
# Feature engineering. Columns are created in a fixed order so the resulting
# frame layout does not depend on how the loops below are written.

# Per-fighter margins (landed minus absorbed strikes, takedown accuracy minus
# takedown defence), followed by the fighter-1-minus-fighter-2 gap of each.
for _margin, _plus, _minus in (('strike_diff', 'slpm', 'sapm'),
                               ('td_diff', 'td_acc', 'td_def')):
    data[f'{_margin}_1'] = data[f'{_plus}_1'] - data[f'{_minus}_1']
    data[f'{_margin}_2'] = data[f'{_plus}_2'] - data[f'{_minus}_2']
    data[_margin] = data[f'{_margin}_1'] - data[f'{_margin}_2']

# Straight fighter-1-minus-fighter-2 differences of the raw statistics.
for _stat in ('reach', 'age', 'slpm', 'sapm', 'td_acc', 'td_def', 'td_avg',
              'sub_avg', 'strk_acc', 'strk_def', 'wins', 'losses'):
    data[f'{_stat}_diff'] = data[f'{_stat}_1'] - data[f'{_stat}_2']

# Win percentage per fighter and its difference. A fighter with no recorded
# wins or losses produces a 0/0 division here, i.e. NaN.
data['win_pct_1'] = data['wins_1'] / (data['losses_1'] + data['wins_1'])
data['win_pct_2'] = data['wins_2'] / (data['losses_2'] + data['wins_2'])
data['win_pct_diff'] = data['win_pct_1'] - data['win_pct_2']

# Independent copy of the engineered frame.
# NOTE(review): this was labelled as a "veteran fights only" dataset, but no
# rows are filtered out at this point - confirm where that filter belongs.
data_v2 = data.copy()

# Model inputs and target. No columns are dropped and no scaling is applied;
# the selected columns are used as-is.
x_cols = [
    'reach_diff', 'age_diff', 'slpm_diff', 'sapm_diff', 'td_acc_diff', 'td_def_diff',
    'td_avg_diff', 'sub_avg_diff', 'strk_acc_diff', 'strk_def_diff', 'wins_diff',
    'losses_diff', 'win_pct_diff', 'weight_1', 'age_1', 'strike_diff', 'td_diff',
]
y_col = ['result']
x, y = data[x_cols], data[y_col]

# 80/20 split, stratified on the target, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0, stratify=y
)

# Convert to plain NumPy arrays; the targets are flattened to 1-D.
x_train, x_test = x_train.values, x_test.values
y_train, y_test = y_train.values.ravel(), y_test.values.ravel()

In [28]:
def grid_search_classifier(function, param_grid, cv = 10):
    """Tune an estimator with an exhaustive grid search and print test metrics.

    The search is fitted on the notebook-level ``x_train`` / ``y_train`` and
    evaluated on ``x_test`` / ``y_test``; those globals must be defined before
    this function is called.

    Parameters
    ----------
    function : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    param_grid : dict
        Parameter grid handed to ``GridSearchCV``.
    cv : int, default 10
        Forwarded unchanged as ``GridSearchCV``'s ``cv`` argument.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search.

    Notes
    -----
    Prints the best parameters, the search's test-set score, the ROC AUC when
    it can be computed, a classification report and the Matthews correlation
    coefficient. The classification report labels are hard-coded to
    "Lower Ranked" / "Higher Ranked", so the target is expected to have
    exactly two classes.
    """
    # Running Grid Search
    grid_search = GridSearchCV(function, param_grid, cv = cv)
    grid_search.fit(x_train, y_train)

    # Outputting results
    print(f'Best parameters are: {grid_search.best_params_}\n')
    print(f'Accuracy is: {grid_search.score(x_test, y_test)}\n')

    # AUC is best-effort: the estimator may not expose predict_proba, or the
    # score may be undefined for the test labels. Report why it was skipped
    # instead of hiding the error, and let KeyboardInterrupt / SystemExit
    # propagate (a bare ``except`` would have trapped those too).
    try:
        auc = roc_auc_score(y_test, grid_search.predict_proba(x_test)[:, 1])
    except Exception as err:
        print(f'AUC score is unavailable ({type(err).__name__}: {err})\n')
    else:
        print(f'AUC score is: {auc}\n')

    best_model = grid_search.best_estimator_
    predictions = best_model.predict(x_test)
    print(f'Classification report:\n {classification_report(y_test, predictions, target_names = ["Lower Ranked", "Higher Ranked"])}')
    print(f'MCC is: {matthews_corrcoef(y_test, predictions)}')

    return best_model

In [30]:
# Candidate values for the histogram gradient boosting search:
# 11 iteration counts x 7 leaf limits x 7 depth limits x 4 learning rates.
max_iter = list(range(5, 16))          # 5, 6, ..., 15
max_leaf_nodes = list(range(4, 11))    # 4, 5, ..., 10
max_depth = list(range(4, 11))         # 4, 5, ..., 10
learning_rate = [0.001, 0.01, 0.1, 1]

param_grid = dict(
    max_iter=max_iter,
    max_leaf_nodes=max_leaf_nodes,
    max_depth=max_depth,
    learning_rate=learning_rate,
)

# Fit the search (seeded for repeatability) and keep the best estimator.
lgbm = grid_search_classifier(HistGradientBoostingClassifier(random_state = 0), param_grid)

Best parameters are: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 5, 'max_leaf_nodes': 8}

Accuracy is: 0.6774193548387096

AUC score is: 0.6801994301994303

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.80      0.31      0.44        39
Higher Ranked       0.65      0.94      0.77        54

     accuracy                           0.68        93
    macro avg       0.73      0.63      0.61        93
 weighted avg       0.72      0.68      0.64        93

MCC is: 0.3382769504422759


### Applying to Predictions DF

In [37]:
# Load the predictions file (first CSV column is used as the index). From here
# on `data` refers to this frame rather than the training data.
data = pd.read_csv('mma_data_predictions.csv', index_col=0)

# Rebuild the same engineered columns, in the same order, that the model was
# trained on.

# Per-fighter margins (landed minus absorbed strikes, takedown accuracy minus
# takedown defence), followed by the fighter-1-minus-fighter-2 gap of each.
for _margin, _plus, _minus in (('strike_diff', 'slpm', 'sapm'),
                               ('td_diff', 'td_acc', 'td_def')):
    data[f'{_margin}_1'] = data[f'{_plus}_1'] - data[f'{_minus}_1']
    data[f'{_margin}_2'] = data[f'{_plus}_2'] - data[f'{_minus}_2']
    data[_margin] = data[f'{_margin}_1'] - data[f'{_margin}_2']

# Straight fighter-1-minus-fighter-2 differences of the raw statistics.
for _stat in ('reach', 'age', 'slpm', 'sapm', 'td_acc', 'td_def', 'td_avg',
              'sub_avg', 'strk_acc', 'strk_def', 'wins', 'losses'):
    data[f'{_stat}_diff'] = data[f'{_stat}_1'] - data[f'{_stat}_2']

# Win percentage per fighter and its difference (NaN when a fighter has no
# recorded wins or losses).
data['win_pct_1'] = data['wins_1'] / (data['losses_1'] + data['wins_1'])
data['win_pct_2'] = data['wins_2'] / (data['losses_2'] + data['wins_2'])
data['win_pct_diff'] = data['win_pct_1'] - data['win_pct_2']

# Model inputs and target column. No columns are dropped and no scaling is
# applied; the selected columns are used as-is.
x_cols = [
    'reach_diff', 'age_diff', 'slpm_diff', 'sapm_diff', 'td_acc_diff', 'td_def_diff',
    'td_avg_diff', 'sub_avg_diff', 'strk_acc_diff', 'strk_def_diff', 'wins_diff',
    'losses_diff', 'win_pct_diff', 'weight_1', 'age_1', 'strike_diff', 'td_diff',
]
y_col = ['result']
x, y = data[x_cols], data[y_col]

# Only the feature matrix is converted to a NumPy array; `y` stays a DataFrame.
x = x.values

In [41]:
# Store the tuned model's second predict_proba column for every row of the
# predictions frame.
class1_proba = lgbm.predict_proba(x)
data['Prediction_LGBM_Winner'] = class1_proba[:, 1]

In [46]:
# Write the frame, including the engineered and prediction columns, back to
# the same CSV path it was loaded from (the file is overwritten).
data.to_csv(path_or_buf='mma_data_predictions.csv')