# An Evaluation on the Effectiveness of Using Machine Learning to Predict Professional Fight Outcomes

This notebook tests various machine learning models ability to predict outcomes on fights in the UFC. The three models used are Logistic Regression, Random Forest, and MLP Neural Network. A baseline was also established in order to evaluate the performance. The data used to train the models was accessed on Kaggle and webscraped from the UFC stats website. The baseline models accuracy was 59%. The models that produced the highest accuracy were the Logistic Regression and MLP Neural Network models at 66%. The Random Forest Model achieved 64%. 

The three models were used to predict the outcomes of the UFC 300 event and all of them were able to accuratley predict 5 out of 6 fights. The 6 fights the models predictions were based off of are included below each of the models evaluation of the dataset. 

In [2]:
import pandas as pd
import numpy as np 

In [3]:
#importing the csv file 
ufc_data = pd.read_csv('ufc-master.csv')
ufc_data.fillna(0)

ufc_data.shape

(4896, 119)

In [4]:
#Columns that were not relevant for our analysis were dropped 
ufc_data.columns.get_loc('empty_arena')
ufc_data.drop(ufc_data.iloc[:, 77:120], inplace = True, axis = 1)
ufc_data.drop(['date', 'location', 'country', 'title_bout', 'R_ev', 'B_ev', 'weight_class', 'no_of_rounds'], axis = 1, inplace = True)
ufc_data.set_index(['R_fighter'] + ['B_fighter'] , inplace = True)

ufc_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,R_odds,B_odds,Winner,gender,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_SIG_STR_landed,B_avg_SIG_STR_pct,B_avg_SUB_ATT,...,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif
R_fighter,B_fighter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Thiago Santos,Johnny Walker,-150.0,130,Red,MALE,0,1,0,3.42,0.59,0.7,...,-32,-1,-7,0,10.16,15.24,-8,-0.53,0.6,-0.37
Alex Oliveira,Niko Price,170.0,-200,Blue,MALE,2,0,0,5.16,0.42,0.8,...,-20,0,0,-1,2.54,0.0,-1,2.19,0.3,-1.48
Misha Cirkunov,Krzysztof Jotko,110.0,-130,Blue,MALE,1,0,0,2.92,0.41,0.1,...,25,0,0,-5,-5.08,0.0,-2,-0.85,-1.6,-3.33
Alexander Hernandez,Mike Breeden,-675.0,475,Red,MALE,1,0,0,4.04,0.34,0.0,...,-12,0,-2,0,2.54,-5.08,3,0.25,0.0,-1.57
Joe Solecki,Jared Gordon,-135.0,115,Blue,MALE,0,2,0,5.22,0.56,0.0,...,11,0,1,-2,0.0,-5.08,5,2.58,-0.6,-0.31


In [5]:
#Changing categorical variables into integers
GenderMap = {'MALE': 1, 'FEMALE': 2}
ufc_data['gender'] = ufc_data['gender'].map(GenderMap)

StanceMap = {'Orthodox': 1, 'Southpaw': 2}
ufc_data['B_Stance'] = ufc_data['B_Stance'].map(StanceMap)
ufc_data['R_Stance'] = ufc_data['R_Stance'].map(StanceMap)
ufc_data = ufc_data.fillna(0)

#Setting the winner column into a binary value column 
WinnerMap = {'Red': 1, 'Blue': 0}

# Applying the mapping to create a new 'Red Winner' column in the DataFrame
ufc_data['Red_winner'] = ufc_data['Winner'].map(WinnerMap)

In [6]:
# To avoid overfitting, the most relevant columns were selected 
columns = ['R_odds','gender','lose_streak_dif',
       'win_streak_dif', 'longest_win_streak_dif', 'win_dif', 'loss_dif',
       'total_round_dif', 'ko_dif', 'sub_dif',
       'height_dif', 'reach_dif', 'age_dif','avg_sub_att_dif',
       'avg_td_dif']

X = ufc_data[columns]
y = ufc_data['Red_winner']

# Transform X
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 50)

# Baseline 

In [7]:
# Dummy Classifier with baseline accuracy
from sklearn.dummy import DummyClassifier
dummy_classifier = DummyClassifier(strategy='most_frequent')
dummy_classifier.fit(X_train,y_train)
baseline_acc = dummy_classifier.score(X_train,y_train)

print("Baseline Accuracy = ", round((baseline_acc) * 100, 2), '%')

Baseline Accuracy =  59.32 %


# Logistic Regression 

In [15]:
# Create and fit the logistic regression model here:
from sklearn.linear_model import LogisticRegression
lg_model = LogisticRegression()
lg_model.fit(X_train,y_train)

# Save and print the predicted outcomes
y_pred = lg_model.predict(X_test)
print('predicted classes: ', y_pred)

# Print out the true outcomes for the test data
print('true classes: ', y_test)

# Print out the confusion matrix
from sklearn.metrics import confusion_matrix
print('confusion matrix: ')
print(confusion_matrix(y_test, y_pred))

# Print accuracy here:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', round((accuracy), 2) * 100, '%')



predicted classes:  [0 0 0 ... 0 0 0]
true classes:  R_fighter            B_fighter         
Jason Gonzalez       JC Cottrell           1
Christian Morecraft  Matt Mitrione         0
Takanori Gomi        Dong Hyun Ma          0
Volkan Oezdemir      Anthony Smith         0
Maciej Jewtuszko     Curt Warburton        0
                                          ..
Hannah Cifers        Jodie Esquibel        1
Albert Morales       Andre Soukhamthath    1
Aleksei Oleinik      Curtis Blaydes        0
Tito Ortiz           Matt Hamill           0
Garreth McLellan     Alessio Di Chirico    0
Name: Red_winner, Length: 1469, dtype: int64
confusion matrix: 
[[338 305]
 [195 631]]
Accuracy: 66.0 %


In [18]:
import joblib

joblib.dump(lg_model, 'lg_model.pk1')

['lg_model.pk1']

## UFC Model predictions
1. Davidson Figueiredo vs Cody Garbrandt
2. Bo Nickal vs Cody Brundage
3. Justin Gaethje vs Max Holloway
4. Charles Oliveira vs Arman Tsarukyan
5. Zhang Weili vs Yan Xiaonan
6. Alex Pereira vs Jamahal Hill

In [17]:
data = np.array([
    [-200, 1, 0, -1, -1, 23, -3, 12, -2, 3, -7.63, 7.62, 4, 1.7, 0.62],
    [-1450, 1, 0, 4, 4, 4, -6, -10, -2, 2, 2.54, 10.16, -1, 6.6, 5.13],
    [-162, 1, 0, 0, -9, -2, -3, -65, -4, -2, 0, 2.54, 3, -0.3, -0.14],
    [120, 1, 0, -2, 6, 13, 6, 35, 0, 17, 7.62, 5.08, 7, 2.8, -1.08],
    [-250, 2, 0, 1, -1, 6, 0, -1, 1, 2, -2.53, 0, 0, 0.5, 1.51],
    [-125, 1, 0, -2, 0, -3, 2, -3, -1, 0, 0, 0, 4, 0.4, 0.18]
])

predictions = lg_model.predict(data)
print("Predictions:", predictions)

Predictions: [1 1 1 0 1 1]


# Random forest 

In [10]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators = 200, random_state = 0)

classifier.fit(X_train, y_train)

rfc_score = classifier.score(X_test, y_test)
print('The Random Forest Model accuracy is:', round(rfc_score, 2) * 100, '%')



The Random Forest Model accuracy is: 64.0 %


## UFC Model predictions
1. Davidson Figueiredo vs Cody Garbrandt
2. Bo Nickal vs Cody Brundage
3. Justin Gaethje vs Max Holloway
4. Charles Oliveira vs Arman Tsarukyan
5. Zhang Weili vs Yan Xiaonan
6. Alex Pereira vs Jamahal Hill

In [11]:
data = np.array([
    [-200, 1, 0, -1, -1, 23, -3, 12, -2, 3, -7.63, 7.62, 4, 1.7, 0.62],
    [-1450, 1, 0, 4, 4, 4, -6, -10, -2, 2, 2.54, 10.16, -1, 6.6, 5.13],
    [-162, 1, 0, 0, -9, -2, -3, -65, -4, -2, 0, 2.54, 3, -0.3, -0.14],
    [120, 1, 0, -2, 6, 13, 6, 35, 0, 17, 7.62, 5.08, 7, 2.8, -1.08],
    [-250, 2, 0, 1, -1, 6, 0, -1, 1, 2, -2.53, 0, 0, 0.5, 1.51],
    [-125, 1, 0, -2, 0, -3, 2, -3, -1, 0, 0, 0, 4, 0.4, 0.18]
])

predictions = classifier.predict(data)
print("Predictions:", predictions)

Predictions: [1 1 1 1 1 1]


# Deep Learning (Multi-Layer Perceptron)

In [16]:
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, InputLayer
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical


mlp_model = Sequential()
mlp_model.add(InputLayer(shape = (X_train.shape[1],)))
mlp_model.add(Dense(12, activation = 'relu'))
mlp_model.add(Dense(1, activation = 'sigmoid'))

mlp_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

#Train and evaluate 

mlp_model.fit(X_train, y_train, epochs = 100, batch_size = 16, verbose = 1)

loss, acc = model.evaluate(X_test, y_test, verbose = 0)

y_estimate = model.predict(X_test, verbose = 0)
y_estimate = (y_estimate > 0.5).astype(int)
y_true = y_test


print(y_true)
print(classification_report(y_true, y_estimate))

Epoch 1/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308us/step - accuracy: 0.5318 - loss: 0.7011
Epoch 2/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300us/step - accuracy: 0.6037 - loss: 0.6456
Epoch 3/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297us/step - accuracy: 0.6140 - loss: 0.6385
Epoch 4/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294us/step - accuracy: 0.6410 - loss: 0.6258
Epoch 5/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 295us/step - accuracy: 0.6441 - loss: 0.6285
Epoch 6/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 297us/step - accuracy: 0.6604 - loss: 0.6143
Epoch 7/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 311us/step - accuracy: 0.6502 - loss: 0.6185
Epoch 8/100
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 316us/step - accuracy: 0.6655 - loss: 0.6135
Epoch 9/100
[1m

## UFC Model predictions
1. Davidson Figueiredo vs Cody Garbrandt
2. Bo Nickal vs Cody Brundage
3. Justin Gaethje vs Max Holloway
4. Charles Oliveira vs Arman Tsarukyan
5. Zhang Weili vs Yan Xiaonan
6. Alex Pereira vs Jamahal Hill

In [13]:
data = np.array([
    [-200, 1, 0, -1, -1, 23, -3, 12, -2, 3, -7.63, 7.62, 4, 1.7, 0.62],
    [-1450, 1, 0, 4, 4, 4, -6, -10, -2, 2, 2.54, 10.16, -1, 6.6, 5.13],
    [-162, 1, 0, 0, -9, -2, -3, -65, -4, -2, 0, 2.54, 3, -0.3, -0.14],
    [120, 1, 0, -2, 6, 13, 6, 35, 0, 17, 7.62, 5.08, 7, 2.8, -1.08],
    [-250, 2, 0, 1, -1, 6, 0, -1, 1, 2, -2.53, 0, 0, 0.5, 1.51],
    [-125, 1, 0, -2, 0, -3, 2, -3, -1, 0, 0, 0, 4, 0.4, 0.18]
])

predictions = model.predict(data)
print("Predictions:", predictions)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
Predictions: [[1.0000000e+00]
 [1.0000000e+00]
 [1.0000000e+00]
 [1.6863964e-31]
 [1.0000000e+00]
 [1.0000000e+00]]
