# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers
import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# Functions

In [2]:
def grid_search_classifier(function, param_grid, cv = 4):
    # Running Grid Search
    grid_search = GridSearchCV(function, param_grid, cv = cv)
    grid_search.fit(x_train, y_train)
    
    # Outputting results
    print(f'Best parameters are: {grid_search.best_params_}\n')
    print(f'Accuracy is: {grid_search.score(x_test, y_test)}\n')
    try:
        print(f'AUC score is: {roc_auc_score(y_test, grid_search.predict_proba(x_test)[:, 1])}\n')
    except:
        pass
    best_model = grid_search.best_estimator_
    pred_rf = best_model.predict(x_test)
    print(f'Classification report:\n {classification_report(y_test, pred_rf, target_names = ["Lower Ranked", "Higher Ranked"])}')
    
    return best_model

# Data work

In [3]:
data = pd.read_csv('mma_data.csv', index_col=0)

# Filtering out unwanted rows
data = data[data.result!=-2]
data = data[data.slpm_2 + data.sapm_2 != 0]
data = data[data.slpm_1 + data.sapm_1 != 0]

In [4]:
# Engineering some columns
data['reach_diff'] = data.reach_1 - data.reach_2
data['age_diff'] = data.age_1 - data.age_2
data['slpm_diff'] = data.slpm_1 - data.slpm_2
data['sapm_diff'] = data.sapm_1 - data.sapm_2
data['td_acc_diff'] = data.td_acc_1 - data.td_acc_2
data['td_def_diff'] = data.td_def_1 - data.td_def_2
data['td_avg_diff'] = data.td_avg_1 - data.td_avg_2
data['sub_avg_diff'] = data.sub_avg_1 - data.sub_avg_2
data['strk_acc_diff'] = data.strk_acc_1 - data.strk_acc_2
data['strk_def_diff'] = data.strk_def_1 - data.strk_def_2
data['wins_diff'] = data.wins_1 - data.wins_2
data['losses_diff'] = data.losses_1 - data.losses_2
data['win_pct_1'] = data.wins_1/(data.losses_1 + data.wins_1)
data['win_pct_2'] = data.wins_2/(data.losses_2 + data.wins_2)
data['win_pct_diff'] = data.win_pct_1 - data.win_pct_2

# Droping unecessary columnns and scaling data
data.drop(['fighter_1', 'fighter_2'], axis = 1, inplace = True)
x_cols = ['reach_diff', 'age_diff', 'slpm_diff', 'sapm_diff', 'td_acc_diff', 'td_def_diff',
              'td_avg_diff', 'sub_avg_diff', 'strk_acc_diff', 'strk_def_diff', 'wins_diff',
              'losses_diff', 'win_pct_diff', 'weight_1']
y_col = ['result']
x, y = data[x_cols], data[y_col]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Formatting data
x_train = x_train.values
y_train = y_train.values.ravel()
x_test = x_test.values
y_test = y_test.values.ravel()

# Predicting Winner

### Setting Baseline

In [5]:
print(f'Predicting fighter 1 would yield {data.result.mean()*100}% accuracy')

df = data.copy()
df['Higher_Pct'] = df.win_pct_diff.apply(lambda x: 1 if x > 0 else 0)
df['Result_Tracker'] = df.apply(lambda x: 1 if (x.Higher_Pct == 1) & (x.result == 1)
                               else 1 if (x.Higher_Pct == 0) & (x.result == 0)
                               else 0, axis = 1)
print(f'Predicting the fighter with the higher win percentage would yield {df.Result_Tracker.mean()*100}% accuracy')

Predicting fighter 1 would yield 57.21271393643031% accuracy
Predicting the fighter with the higher win percentage would yield 55.74572127139364% accuracy


### Neural Network

In [6]:
# Scaling data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Instantiating NN model
model = Sequential()
model.add(Dense(16, input_dim=x_train_scaled.shape[1],
                activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [7]:
model.fit(x=x_train_scaled, y=y_train, epochs=200, batch_size=64, verbose=0)
test_results = model.evaluate(x = x_test_scaled, y = y_test, verbose=0)
print("Test Accuracy = {}".format(test_results[1]))

Test Accuracy = 0.5487805008888245


### Random Forest

In [8]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
rf = grid_search_classifier(RandomForestClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 4, 'max_features': 5, 'n_estimators': 15}

Accuracy is: 0.5853658536585366

AUC score is: 0.5603864734299516

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.55      0.31      0.39        36
Higher Ranked       0.60      0.80      0.69        46

     accuracy                           0.59        82
    macro avg       0.57      0.55      0.54        82
 weighted avg       0.58      0.59      0.56        82



### Gradient Boost

In [None]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
gb = grid_search_classifier(GradientBoostingClassifier(random_state = 0), param_grid)

### Logistic Regression

In [None]:
# Creating parameter grid

c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c
}

# Running ML function
lr = grid_search_classifier(LogisticRegression(random_state = 0, max_iter = 500), param_grid)

### SVM

In [None]:
# Creating parameter grid

kernel = ['rbf', 'poly', 'sigmoid']
degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 5)]
c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c,
    'kernel' : kernel,
    'degree' : degree
}

# Running ML function
svm = grid_search_classifier(SVC(random_state = 0), param_grid)

### Naive Bayes

In [None]:
# Creating parameter grid

var_smoothing = [int(1**x) for x in np.linspace(start = -9, stop = 0, num = 10)]
param_grid = {
    'var_smoothing' : var_smoothing
}

# Running ML function
grid_search_classifier(GaussianNB(random_state = 0), param_grid)

### Ensemble

In [31]:
x = x.values

# Iterating over models to create prediction columns
models = [rf, gb, lr, svm]
names = ['rf', 'gb', 'lr', 'svm']
for model, name in zip(models, names):
    try:
        data[f'{name}_pred'] = model.predict_proba(x)
    except:
        data[f'{name}_pred'] = model.predict(x)

# Re-formatting data
x_ensemble, y_ensemble = data[x_cols], data[y_col]
x_train_ensemble, x_test_ensemble, y_train_ensemble, y_test_ensemble = train_test_split(x_ensemble, y_ensemble, test_size = 0.2, random_state = 0)

x_train_ensemble = x_train_ensemble.values
y_train_ensemble = y_train_ensemble.values.ravel()
x_test_ensemble = x_test_ensemble.values
y_test_ensemble = y_test_ensemble.values.ravel()

# Creating parameter grid
c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c
}

# Running Logistic

# Running Grid Search
grid_search = GridSearchCV(LogisticRegression(random_state = 0, max_iter = 500), param_grid, cv = 4)
grid_search.fit(x_train_ensemble, y_train_ensemble)

# Outputting results
print(f'Best parameters are: {grid_search.best_params_}\n')
print(f'Accuracy is: {grid_search.score(x_test_ensemble, y_test_ensemble)}\n')
try:
    print(f'AUC score is: {roc_auc_score(y_test_ensemble, grid_search.predict_proba(x_test)[:, 1])}\n')
except:
    pass
ensemble = grid_search.best_estimator_
pred_rf = ensemble.predict(x_test_ensemble)
print(f'Classification report:\n {classification_report(y_test_ensemble, pred_rf, target_names = ["Lower Ranked", "Higher Ranked"])}')
    

Best parameters are: {'C': 0.01}

Accuracy is: 0.6103896103896104

AUC score is: 0.6336088154269972

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.57      0.36      0.44        33
Higher Ranked       0.62      0.80      0.70        44

     accuracy                           0.61        77
    macro avg       0.60      0.58      0.57        77
 weighted avg       0.60      0.61      0.59        77



# Predicting KO

In [43]:
y = data['KO_OVR']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Formatting data
x_train = x_train.values
y_train = y_train.values.ravel()
x_test = x_test.values
y_test = y_test.values.ravel()

### Setting Baseline

In [41]:
print(f'Always predicting no knockout would yield {100 - data.KO_OVR.mean()*100}% accuracy')

df = data.copy()
df['Higher_Pct'] = df.slpm_1.apply(lambda x: 1 if x > 0 else 0)
df['Result_Tracker'] = df.apply(lambda x: 1 if (x.Higher_Pct == 1) & (x.result == 1)
                               else 1 if (x.Higher_Pct == 0) & (x.result == 0)
                               else 0, axis = 1)
print(f'Predicting the fighter with the higher SLPM would yield {df.Result_Tracker.mean()*100}% accuracy')

Always predicting no knockout would yield 62.836185819070906% accuracy
Predicting the fighter with the higher SLPM would yield 57.21271393643031% accuracy


### Random Forest

In [44]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
rf = grid_search_classifier(RandomForestClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 6, 'max_features': 8, 'n_estimators': 15}

Accuracy is: 0.524390243902439

AUC score is: 0.4155844155844156

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.58      0.76      0.65        49
Higher Ranked       0.33      0.18      0.24        33

     accuracy                           0.52        82
    macro avg       0.46      0.47      0.45        82
 weighted avg       0.48      0.52      0.49        82



### Gradient Boost

In [45]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
gb = grid_search_classifier(GradientBoostingClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 10, 'max_features': 10, 'n_estimators': 6}

Accuracy is: 0.5121951219512195

AUC score is: 0.4001236858379716

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.57      0.76      0.65        49
Higher Ranked       0.29      0.15      0.20        33

     accuracy                           0.51        82
    macro avg       0.43      0.45      0.42        82
 weighted avg       0.46      0.51      0.47        82



### Logistic Regression

In [46]:
# Creating parameter grid

c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c
}

# Running ML function
lr = grid_search_classifier(LogisticRegression(random_state = 0, max_iter = 500), param_grid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Best parameters are: {'C': 0.1}

Accuracy is: 0.5487804878048781

AUC score is: 0.45701917130488556

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.59      0.80      0.68        49
Higher Ranked       0.38      0.18      0.24        33

     accuracy                           0.55        82
    macro avg       0.48      0.49      0.46        82
 weighted avg       0.50      0.55      0.50        82



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### SVM

In [47]:
# Creating parameter grid

kernel = ['rbf', 'poly', 'sigmoid']
degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 5)]
c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c,
    'kernel' : kernel,
    'degree' : degree
}

# Running ML function
svm = grid_search_classifier(SVC(random_state = 0), param_grid)

Best parameters are: {'C': 10, 'degree': 5, 'kernel': 'poly'}

Accuracy is: 0.5609756097560976

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.59      0.84      0.69        49
Higher Ranked       0.38      0.15      0.22        33

     accuracy                           0.56        82
    macro avg       0.49      0.49      0.46        82
 weighted avg       0.51      0.56      0.50        82



# Predicting SUB

In [48]:
y = data['SUB_OVR']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

# Formatting data
x_train = x_train.values
y_train = y_train.values.ravel()
x_test = x_test.values
y_test = y_test.values.ravel()

### Setting Baseline

In [50]:
print(f'Always predicting no sub would yield {100 - data.SUB_OVR.mean()*100}% accuracy')

Always predicting no sub would yield 80.440097799511% accuracy


### Random Forest

In [51]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
rf = grid_search_classifier(RandomForestClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 2, 'max_features': 9, 'n_estimators': 6}

Accuracy is: 0.7926829268292683

AUC score is: 0.625339366515837

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.79      1.00      0.88        65
Higher Ranked       0.00      0.00      0.00        17

     accuracy                           0.79        82
    macro avg       0.40      0.50      0.44        82
 weighted avg       0.63      0.79      0.70        82



  _warn_prf(average, modifier, msg_start, len(result))


### Gradient Boost

In [52]:
# Creating parameter grid
n_estimators = [int(x) for x in np.linspace(start = 3, stop = 15, num = 13)]
max_features = [int(x) for x in np.linspace(start = 3, stop = 10, num = 8)]
max_depth = [int(x) for x in np.linspace(start = 1, stop = 10, num = 10)]
param_grid = {
    'n_estimators' : n_estimators,
    'max_features' : max_features,
    'max_depth' : max_depth
}

# Running ML function
gb = grid_search_classifier(GradientBoostingClassifier(random_state = 0), param_grid)

Best parameters are: {'max_depth': 5, 'max_features': 5, 'n_estimators': 5}

Accuracy is: 0.7926829268292683

AUC score is: 0.5728506787330316

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.79      1.00      0.88        65
Higher Ranked       0.00      0.00      0.00        17

     accuracy                           0.79        82
    macro avg       0.40      0.50      0.44        82
 weighted avg       0.63      0.79      0.70        82



  _warn_prf(average, modifier, msg_start, len(result))


### Logistic Regression

In [53]:
# Creating parameter grid

c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c
}

# Running ML function
lr = grid_search_classifier(LogisticRegression(random_state = 0, max_iter = 500), param_grid)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Best parameters are: {'C': 0.001}

Accuracy is: 0.7926829268292683

AUC score is: 0.5285067873303168

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.79      1.00      0.88        65
Higher Ranked       0.00      0.00      0.00        17

     accuracy                           0.79        82
    macro avg       0.40      0.50      0.44        82
 weighted avg       0.63      0.79      0.70        82



  _warn_prf(average, modifier, msg_start, len(result))


### SVM

In [54]:
# Creating parameter grid

kernel = ['rbf', 'poly', 'sigmoid']
degree = [int(x) for x in np.linspace(start = 2, stop = 7, num = 5)]
c = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {
    'C' : c,
    'kernel' : kernel,
    'degree' : degree
}

# Running ML function
svm = grid_search_classifier(SVC(random_state = 0), param_grid)

Best parameters are: {'C': 0.001, 'degree': 2, 'kernel': 'rbf'}

Accuracy is: 0.7926829268292683

Classification report:
                precision    recall  f1-score   support

 Lower Ranked       0.79      1.00      0.88        65
Higher Ranked       0.00      0.00      0.00        17

     accuracy                           0.79        82
    macro avg       0.40      0.50      0.44        82
 weighted avg       0.63      0.79      0.70        82



  _warn_prf(average, modifier, msg_start, len(result))


# Let's Lose Bradley Some Money