In [47]:
# Import our libraries
import pandas as pd
from datetime import datetime
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import warnings
warnings.filterwarnings("ignore")

In [32]:
# Load in our data
fights_data = pd.read_csv('scraped_fights.csv')

In [33]:
""" Make 50% of our rows switch fighter1 and fighter2 to make winner point to 
fighter2 to avoid bias. """

winner1 = fights_data[:2603]
winner2 = fights_data[2603:]

winner2['fighter1'], winner2['fighter2'] = winner2['fighter2'], winner2['fighter1']
winner2['pass_stat_f1'], winner2['pass_stat_f2'] = winner2['pass_stat_f2'], winner2['pass_stat_f1']
winner2['str_stat_f1'], winner2['str_stat_f2'] = winner2['str_stat_f2'], winner2['str_stat_f1']
winner2['sub_stat_f1'], winner2['sub_stat_f2'] = winner2['sub_stat_f2'], winner2['sub_stat_f1']
winner2['td_stat_f1'], winner2['td_stat_f2'] = winner2['td_stat_f2'], winner2['td_stat_f1']
winner2['f1_dob'], winner2['f2_dob'] = winner2['f2_dob'], winner2['f1_dob']
winner2['f1_record'], winner2['f2_record'] = winner2['f2_record'], winner2['f1_record']
winner2['f1_height'], winner2['f2_height'] = winner2['f2_height'], winner2['f1_height']
winner2['f1_reach'], winner2['f2_reach'] = winner2['f2_reach'], winner2['f1_reach']
winner2['f1_sapm'], winner2['f2_sapm'] = winner2['f2_sapm'], winner2['f1_sapm']
winner2['f1_slpm'], winner2['f2_slpm'] = winner2['f2_slpm'], winner2['f1_slpm']
winner2['f1_stance'], winner2['f2_stance'] = winner2['f2_stance'], winner2['f1_stance']
winner2['f1_stk_acc'], winner2['f2_stk_acc'] = winner2['f2_stk_acc'], winner2['f1_stk_acc']
winner2['f1_stk_def'], winner2['f2_stk_def'] = winner2['f2_stk_def'], winner2['f1_stk_def']
winner2['f1_sub_avg'], winner2['f2_sub_avg'] = winner2['f2_sub_avg'], winner2['f1_sub_avg']
winner2['f1_td_acc'], winner2['f2_td_acc'] = winner2['f2_td_acc'], winner2['f1_td_acc']
winner2['f1_td_avg'], winner2['f2_td_avg'] = winner2['f2_td_avg'], winner2['f1_td_avg']
winner2['f1_td_def'], winner2['f2_td_def'] = winner2['f2_td_def'], winner2['f1_td_def']
winner2['f1_weight'], winner2['f2_weight'] = winner2['f2_weight'], winner2['f1_weight']

fights_data = pd.concat([winner1, winner2])

In [34]:
""" Remove the specific variables (pass_stat, str_stat, sub_stat, td_stat) 
that deal with post-fight measurements. """

post_fight_data = fights_data.loc[:, ['pass_stat_f1', 'pass_stat_f2', 'str_stat_f1', 'str_stat_f2', 'sub_stat_f1', 'sub_stat_f2', 'td_stat_f1', 'td_stat_f2']]
fights_data = fights_data.drop(['pass_stat_f1', 'pass_stat_f2', 'str_stat_f1', 'str_stat_f2', 'sub_stat_f1', 'sub_stat_f2', 'td_stat_f1', 'td_stat_f2'], axis=1)

In [35]:
""" Convert our f1 and f2 records into 3 columns (W, L, D) """

wins = []
losses = []
draws = []

for recordString in fights_data['f1_record']:
    recordString = recordString.split(" (")[0]
    w, l, d = [int(x) for x in recordString.split(": ")[1].split("-")]
    wins.append(w)
    losses.append(l)
    draws.append(d)

fights_data['f1_wins'] = wins
fights_data['f1_losses'] = losses
fights_data['f1_draws'] = draws

wins = []
losses = []
draws = []

for recordString in fights_data['f2_record']:
    recordString = recordString.split(" (")[0]
    w, l, d = [int(x) for x in recordString.split(": ")[1].split("-")]
    wins.append(w)
    losses.append(l)
    draws.append(d)

fights_data['f2_wins'] = wins
fights_data['f2_losses'] = losses
fights_data['f2_draws'] = draws


fights_data = fights_data.drop('f1_record', axis=1)
fights_data = fights_data.drop('f2_record', axis=1)

In [36]:
""" Convert our f1_dob into f1_age (likewise for f2) """

f1_ages = []
f2_ages = []

for index, row in fights_data.iterrows():
    if (row['f1_dob'] == '--'):
        f1_age = 0
    else:
        dob_datetime = datetime.strptime(row['f1_dob'], '%b %d %Y')
        event_datetime = datetime.strptime(row['event_date'], '%B %d %Y')
        f1_age = event_datetime.year - dob_datetime.year - ((event_datetime.month, dob_datetime.day) < (event_datetime.month, dob_datetime.day))
    f1_ages.append(f1_age)

    if (row['f2_dob'] == '--'):
        f2_age = 0
    else:
        dob_datetime = datetime.strptime(row['f2_dob'], '%b %d %Y')
        event_datetime = datetime.strptime(row['event_date'], '%B %d %Y')
        f2_age = event_datetime.year - dob_datetime.year - ((event_datetime.month, dob_datetime.day) < (event_datetime.month, dob_datetime.day))
    f2_ages.append(f2_age)


fights_data['f1_age'] = f1_ages
fights_data['f2_age'] = f2_ages

fights_data = fights_data.drop('f1_dob', axis=1)
fights_data = fights_data.drop('f2_dob', axis=1)



In [37]:
""" Convert our f1_height into a number in terms of inches (likewise for f2) """

f1_heights = []
f2_heights = []

for index, row in fights_data.iterrows():

    f1_height_string = row['f1_height']
    if f1_height_string == "--":
        f1_height = 0
    else:
        parts = f1_height_string.split("' ")
        f1_height = (int(parts[0]) * 12) + (int(parts[1].replace('"', '')))
    f1_heights.append(f1_height)


    f2_height_string = row['f2_height']
    if f2_height_string == "--":
        f2_height = 0
    else:
        parts = f2_height_string.split("' ")
        f2_height = (int(parts[0]) * 12) + (int(parts[1].replace('"', '')))
    f2_heights.append(f2_height)


fights_data = fights_data.drop('f1_height', axis=1)
fights_data = fights_data.drop('f2_height', axis=1)

fights_data['f1_height'] = f1_heights
fights_data['f2_height'] = f2_heights



In [38]:
""" Now we do some more data preprocessing on categorical columns """

# Map the 14 weight classes to a number
weight_class_dict = {'Lightweight': 6, 'Welterweight': 10, 'Middleweight': 7, "Women's Strawweight": 14, 'Flyweight': 3, 
                     "Women's Flyweight": 13, 'Heavyweight': 4, 'Bantamweight': 0, 'Featherweight': 2, 'Light Heavyweight': 5, 
                     "Women's Bantamweight": 11, "Women's Featherweight": 12, 'Open Weight': 8, 'Super Heavyweight': 9, 'Catch Weight': 1}

fights_data['weight_class'] = fights_data['weight_class'].map(weight_class_dict)

# Clear nans and map the 37 differnt win_method_finishes to numbers
fights_data['win_method_finish'] = fights_data['win_method_finish'].fillna('N/A')
fights_data['win_method_finish'] = fights_data['win_method_finish'].astype('category').cat.codes.astype(int)

# Map our 9 win_method_types to numbers
fights_data['win_method_type'] = fights_data['win_method_type'].astype('category').cat.codes.astype(int)

# Clear nans and map the 6 f1_stance and f2_stance to categories
fights_data['f1_stance'] = fights_data['f1_stance'].fillna('None')
fights_data['f2_stance'] = fights_data['f2_stance'].fillna('None')

stance_dict = {'Southpaw': 4, 'Orthodox': 2, 'Switch': 5, 'None': 0, 'Sideways': 3, 'Open Stance': 1}
fights_data['f1_stance'] = fights_data['f1_stance'].map(stance_dict)
fights_data['f2_stance'] = fights_data['f2_stance'].map(stance_dict)

# Set nans in f1_reach and f2_reach to f1_height and f2_height as average is 1:1 ratio
fights_data['f1_reach'] = fights_data['f1_reach'].fillna(fights_data['f1_height'])
fights_data['f2_reach'] = fights_data['f2_reach'].fillna(fights_data['f2_height'])

# Set nans in f1_weight and f2_weight to f2_weight and f1_weight as average is that fighters are similar weights
fights_data['f1_weight'] = fights_data['f1_weight'].fillna(fights_data['f2_weight'])
fights_data['f2_weight'] = fights_data['f2_weight'].fillna(fights_data['f1_weight'])

# 1 edge case where neither are specified so I set the weight to max limit of division they belong in (lightweight = 145 to 155 lbs)
fights_data['f1_weight'] = fights_data['f1_weight'].fillna(155.0)
fights_data['f2_weight'] = fights_data['f2_weight'].fillna(155.0)

In [None]:
# Determine which rows are incomplete in my dataframe
incomplete_rows = fights_data.isna().any(axis=1)
print(fights_data[incomplete_rows])

In [39]:
""" Let us create our labels now (0 indicates fighter1 won & 1 indicates fighter2 won) """

labels = []

for index, row in fights_data.iterrows():
    
    if row['winner'] == row['fighter1']:
        labels.append(0)
    else:
        labels.append(1)

with open('processed_data_labels.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['label'])
    for item in labels:
        writer.writerow([item])


In [40]:
# Determine all the columns specified as strings
print(fights_data.select_dtypes(include=['object']))

# Clear all non-essential columns like event_date, event_name, fighter1, fighter2, winner
processed_fights_info = fights_data[['event_date', 'event_name', 'fighter1', 'fighter2', 'winner', 'round_', 'time', 'win_method_finish',
       'win_method_type']]
processed_fights_info.to_csv('processed_fights_info.csv', index=False)
fights_data = fights_data.drop(['event_date', 'event_name', 'fighter1', 'fighter2', 'winner', 'round_', 'time', 'win_method_finish',
       'win_method_type'], axis=1)

            event_date                                event_name  \
0        April 14 2018      UFC Fight Night: Poirier vs. Gaethje   
1        April 14 2018      UFC Fight Night: Poirier vs. Gaethje   
2        April 14 2018      UFC Fight Night: Poirier vs. Gaethje   
3        April 14 2018      UFC Fight Night: Poirier vs. Gaethje   
4        April 14 2018      UFC Fight Night: Poirier vs. Gaethje   
...                ...                                       ...   
5201  December 07 2019  UFC Fight Night: Overeem vs. Rozenstruik   
5202  December 07 2019  UFC Fight Night: Overeem vs. Rozenstruik   
5203  December 07 2019  UFC Fight Night: Overeem vs. Rozenstruik   
5204  December 07 2019  UFC Fight Night: Overeem vs. Rozenstruik   
5205  December 07 2019  UFC Fight Night: Overeem vs. Rozenstruik   

                   fighter1           fighter2                 winner  
0            Dustin Poirier     Justin Gaethje         Dustin Poirier  
1             Alex Oliveira      Carlos

In [41]:
""" Determine the columns that contain missing values. """

for column_name, column_data in fights_data.iteritems():
    nans = fights_data[column_name].isna().sum()
    if nans > 0:
        print('The column', column_name, 'has', nans, 'nan values')

print('\n', fights_data.columns)


 Index(['weight_class', 'f1_reach', 'f1_sapm', 'f1_slpm', 'f1_stance',
       'f1_stk_acc', 'f1_stk_def', 'f1_sub_avg', 'f1_td_acc', 'f1_td_avg',
       'f1_td_def', 'f1_weight', 'f2_reach', 'f2_sapm', 'f2_slpm', 'f2_stance',
       'f2_stk_acc', 'f2_stk_def', 'f2_sub_avg', 'f2_td_acc', 'f2_td_avg',
       'f2_td_def', 'f2_weight', 'f1_wins', 'f1_losses', 'f1_draws', 'f2_wins',
       'f2_losses', 'f2_draws', 'f1_age', 'f2_age', 'f1_height', 'f2_height'],
      dtype='object')


In [52]:
# Can we determine the correlation between specific features to labels
fights_data_feature_correlation = fights_data.copy()
fights_data_feature_correlation['Winner'] = labels

corr_matrix = fights_data_feature_correlation.corr()
label_corr = corr_matrix['Winner'].abs().sort_values(ascending=False)
print(label_corr)
# plt.figure(figsize=(10, 8))
# sns.heatmap(corr_matrix, cmap='coolwarm')
# plt.title("Correlation Matrix Heatmap")
# plt.tight_layout()
# plt.show()

Winner          1.000000
f2_slpm         0.425120
f2_td_def       0.339230
f1_sapm         0.308661
f2_losses       0.277776
f1_wins         0.256633
f2_stk_def      0.247659
f2_stk_acc      0.227424
f1_weight       0.225936
f1_losses       0.223889
f2_weight       0.216684
f1_age          0.213349
f1_sub_avg      0.171452
f1_height       0.155545
f1_reach        0.151260
f2_age          0.149158
f1_slpm         0.145324
f2_draws        0.137613
f1_td_acc       0.128906
f1_td_avg       0.113769
f2_sub_avg      0.107815
f2_height       0.095706
f1_draws        0.095066
f2_sapm         0.084535
f2_td_acc       0.082624
f2_td_avg       0.066237
f2_reach        0.051029
f2_stance       0.046902
f1_td_def       0.029065
f1_stance       0.023041
f1_stk_acc      0.021641
f1_stk_def      0.013643
f2_wins         0.009425
weight_class    0.008720
Name: Winner, dtype: float64


In [42]:
# Now we convert this newly created dataset into a csv
print(fights_data.info())
fights_data.to_csv('processed_fights.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5206 entries, 0 to 5205
Data columns (total 33 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   weight_class  5206 non-null   int64  
 1   f1_reach      5206 non-null   float64
 2   f1_sapm       5206 non-null   float64
 3   f1_slpm       5206 non-null   float64
 4   f1_stance     5206 non-null   int64  
 5   f1_stk_acc    5206 non-null   int64  
 6   f1_stk_def    5206 non-null   int64  
 7   f1_sub_avg    5206 non-null   float64
 8   f1_td_acc     5206 non-null   int64  
 9   f1_td_avg     5206 non-null   float64
 10  f1_td_def     5206 non-null   int64  
 11  f1_weight     5206 non-null   float64
 12  f2_reach      5206 non-null   float64
 13  f2_sapm       5206 non-null   float64
 14  f2_slpm       5206 non-null   float64
 15  f2_stance     5206 non-null   int64  
 16  f2_stk_acc    5206 non-null   int64  
 17  f2_stk_def    5206 non-null   int64  
 18  f2_sub_avg    5206 non-null 