In [1]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv("ufc_data_merged.csv")

In [3]:
# Changes suffix to prefix 
def fix_suffix_to_prefix(column_name):
    column_name = column_name.lower().replace('.', '_').replace(':', '').replace(' ', '_')
    if column_name.endswith('_f1') or column_name.endswith('_f2'):
        suffix = column_name[-3:]
        prefix = 'f1_' if suffix == '_f1' else 'f2_'
        return prefix + column_name[:-3]
    elif '_f1_' in column_name:
        return column_name.replace('_f1_', '_')
    elif '_f2_' in column_name:
        return column_name.replace('_f2_', '_')
    else:
        return column_name

# Sorts columns
def custom_sort(column_name):
    if not column_name.startswith('f1_') and not column_name.startswith('f2_'):
        return (0, column_name)
    elif column_name.startswith('f1_'):
        return (1, column_name)
    else:
        return (2, column_name)

In [4]:
# Fix formatting colummn names
data.columns = [fix_suffix_to_prefix(col) for col in data.columns]
# Sort columns
sorted_columns = sorted(data.columns, key=custom_sort)
data = data[sorted_columns]

In [5]:
# Drop list for redundant, and unneccesary columns
drop_list = []
data.head(2)

Unnamed: 0,card_name,end_time,f1,f2,fight_date,fights_location,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_fighter,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_unnamed_11,f1_url,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_fighter,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_unnamed_11,f2_url,f2_weight
0,UFC Fight Night: Barboza vs. Murphy,5:00,Lerone Murphy,Edson Barboza,"May 18, 2024","Las Vegas, Nevada, USA",Decision - Unanimous,Herb Dean,5,5,Featherweight Bout,Lerone Murphy,27 of 34,13 of 17,5:19,205 of 344,"Jul 22, 1991",Lerone Murphy,2 of 3,172 of 308,"5' 9""",0,21 of 22,"73""",Record: 14-0-1,0,2.6,220 of 364,60%,5.01,Orthodox,54%,60%,0.6,0,4 of 6,66%,52%,1.59,46%,259 of 410,,http://www.ufcstats.com/fighter-details/396fe8...,145 lbs.,15 of 21,2 of 3,0:17,77 of 239,"Jan 21, 1986",Edson Barboza,0 of 0,38 of 189,"5' 11""",0,26 of 32,"75""",Record: 24-12-0,0,4.65,79 of 242,32%,4.11,Orthodox,44%,56%,0.1,0,0 of 0,---,50%,0.45,72%,79 of 242,,http://www.ufcstats.com/fighter-details/64a50d...,145 lbs.
1,UFC Fight Night: Barboza vs. Murphy,1:30,Khaos Williams,Carlston Harris,"May 18, 2024","Las Vegas, Nevada, USA",KO/TKO,Dan Miragliotta,1,3,Welterweight Bout,Khaos Williams,2 of 4,0 of 0,0:01,11 of 19,"Mar 30, 1994",Khaos Williams,1 of 1,5 of 9,"6' 0""",1,5 of 7,"77""",Record: 15-3-0,0,5.42,12 of 20,60%,5.81,Orthodox,39%,42%,0.0,0,0 of 0,---,0%,0.0,80%,12 of 20,,http://www.ufcstats.com/fighter-details/2558ae...,170 lbs.,1 of 2,0 of 0,0:00,14 of 30,"Jul 09, 1987",Carlston Harris,0 of 0,10 of 25,"6' 0""",0,3 of 3,"76""",Record: 19-6-0,0,2.66,14 of 30,46%,3.18,Orthodox,49%,52%,0.8,0,0 of 0,---,29%,1.97,55%,14 of 30,,http://www.ufcstats.com/fighter-details/a53d30...,170 lbs.


In [6]:
data['winner'] = np.where(data['winner'] != '0', 'f1', data['winner'])

In [7]:
data['winner'].value_counts()

winner
f1    7506
0      143
Name: count, dtype: int64

In [8]:
drop_list += ['f1', 'f2', 'fights_location', 'f1_fighter', 'f1_unnamed_11', 'f1_url', 'f2_fighter', 'f2_unnamed_11', 'f2_url', 'card_name']

In [9]:
data.drop(columns=drop_list, inplace=True)

#### filtering data before looking at the acutal value since there are some issues with record with old fights, since they are less relevant I will remove them

In [10]:
weightclass_pattern = re.compile(r"(light\s)?(\w+weight)")

def parse_weightclass(weightclass):
    matches = re.findall(weightclass_pattern, weightclass)
    if not matches:
        return "other"
    else:
        return  ''.join(matches[0])

def parse_gender(weightclass):    
    return 'F' if 'women' in weightclass.lower() else 'M'

In [11]:
data['weightclass'] = data['weightclass'].str.lower()
data['weightclass'] = data['weightclass'].apply(parse_weightclass)
data['gender'] = data['weightclass'].apply(parse_gender)
data['weightclass'] = data[data['weightclass'] != 'other'][['weightclass']]

In [12]:
data['weightclass'].value_counts(dropna=False)

weightclass
lightweight          1319
welterweight         1276
middleweight         1009
bantamweight          867
featherweight         768
heavyweight           701
light heavyweight     685
flyweight             557
strawweight           301
NaN                   166
Name: count, dtype: int64

In [13]:
# Convert "fight_date" column to datetime format
data['fight_date'] = pd.to_datetime(data['fight_date'])

# Extract year from the "fight_date" column
data['year'] = data['fight_date'].dt.year

weightclass_counts = data.pivot_table(index='year', columns='weightclass', aggfunc='size', fill_value=0)

# Display the DataFrame
display(weightclass_counts)
display(weightclass_counts.shape)
weightclass_counts.columns

weightclass,bantamweight,featherweight,flyweight,heavyweight,light heavyweight,lightweight,middleweight,strawweight,welterweight
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1997,0,0,0,26,1,11,1,0,0
1998,0,0,0,6,3,3,8,0,1
1999,0,0,0,20,1,6,13,0,2
2000,0,0,0,13,2,7,12,0,9
2001,0,0,0,10,8,7,5,0,10
2002,0,0,0,11,9,8,12,0,13
2003,0,0,0,13,5,9,7,0,7
2004,0,0,0,7,6,6,8,0,12
2005,0,0,0,14,15,0,26,0,25
2006,0,0,0,26,33,19,36,0,42


(28, 9)

Index(['bantamweight', 'featherweight', 'flyweight', 'heavyweight',
       'light heavyweight', 'lightweight', 'middleweight', 'strawweight',
       'welterweight'],
      dtype='object', name='weightclass')

In [14]:
# Concatenate 'f1_stance' and 'f2_stance' columns
combined_stance = pd.concat([data['f1_stance'], data['f2_stance']])
combined_stance_mode = combined_stance.mode()[0]

# Fill NaN values in both 'f1_stance' and 'f2_stance' with combined_stance_mode
data['f1_stance'].fillna(combined_stance_mode, inplace=True)
data['f2_stance'].fillna(combined_stance_mode, inplace=True)

In [15]:
# data[data['referee'].isna()][:10]
# old fight, can be dropped

In [16]:
data['method'].value_counts()

method
Decision - Unanimous       2698
KO/TKO                     2425
Submission                 1505
Decision - Split            730
Decision - Majority          92
TKO - Doctor's Stoppage      89
Overturned                   57
Could Not Continue           28
DQ                           23
Other                         2
Name: count, dtype: int64

In [17]:
# Replace values according to the specified mapping
data['method'] = data['method'].replace({
    "TKO - Doctor's Stoppage": 'KO/TKO',
    "Decision - Majority": 'Decision - Split',
    "Overturned": 'Decision - Split',
    "Could Not Continue": 'Stopped',
    "DQ": 'Stopped'
})

In [18]:
data['method'].value_counts()
# filter out stopped 

method
Decision - Unanimous    2698
KO/TKO                  2514
Submission              1505
Decision - Split         879
Stopped                   51
Other                      2
Name: count, dtype: int64

In [19]:
data = data[~data['method'].isin(['Stopped'])]
data

Unnamed: 0,end_time,fight_date,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_weight,gender,year
0,5:00,2024-05-18,Decision - Unanimous,Herb Dean,5,5,featherweight,f1,27 of 34,13 of 17,5:19,205 of 344,"Jul 22, 1991",2 of 3,172 of 308,"5' 9""",0,21 of 22,"73""",Record: 14-0-1,0,2.60,220 of 364,60%,5.01,Orthodox,54%,60%,0.6,0,4 of 6,66%,52%,1.59,46%,259 of 410,145 lbs.,15 of 21,2 of 3,0:17,77 of 239,"Jan 21, 1986",0 of 0,38 of 189,"5' 11""",0,26 of 32,"75""",Record: 24-12-0,0,4.65,79 of 242,32%,4.11,Orthodox,44%,56%,0.1,0,0 of 0,---,50%,0.45,72%,79 of 242,145 lbs.,M,2024
1,1:30,2024-05-18,KO/TKO,Dan Miragliotta,1,3,welterweight,f1,2 of 4,0 of 0,0:01,11 of 19,"Mar 30, 1994",1 of 1,5 of 9,"6' 0""",1,5 of 7,"77""",Record: 15-3-0,0,5.42,12 of 20,60%,5.81,Orthodox,39%,42%,0.0,0,0 of 0,---,0%,0.00,80%,12 of 20,170 lbs.,1 of 2,0 of 0,0:00,14 of 30,"Jul 09, 1987",0 of 0,10 of 25,"6' 0""",0,3 of 3,"76""",Record: 19-6-0,0,2.66,14 of 30,46%,3.18,Orthodox,49%,52%,0.8,0,0 of 0,---,29%,1.97,55%,14 of 30,170 lbs.,M,2024
2,5:00,2024-05-18,Decision - Unanimous,Mark Smith,3,3,welterweight,f1,31 of 35,35 of 38,10:13,6 of 10,"Jan 23, 1991",2 of 4,9 of 13,"6' 1""",0,3 of 4,"77""",Record: 13-4-0,1,1.08,43 of 52,82%,2.53,Orthodox,69%,46%,0.0,0,5 of 6,83%,58%,4.04,78%,110 of 134,170 lbs.,3 of 4,3 of 4,3:44,0 of 2,"Nov 17, 1992",0 of 0,0 of 2,"5' 10""",0,0 of 0,"72""",Record: 10-5-0,0,3.69,3 of 6,50%,1.74,Orthodox,41%,45%,1.6,0,1 of 8,12%,35%,1.61,44%,15 of 20,170 lbs.,M,2024
3,2:47,2024-05-18,KO/TKO,Chris Tognoni,1,3,bantamweight,f1,1 of 2,0 of 0,0:17,15 of 28,"Nov 29, 1993",21 of 28,27 of 45,"5' 7""",1,8 of 9,"70""",Record: 17-5-0,0,5.75,36 of 56,64%,6.51,Orthodox,41%,57%,0.0,0,0 of 0,---,0%,0.00,100%,36 of 58,135 lbs.,6 of 6,0 of 0,0:00,17 of 34,"Jul 24, 1996",0 of 0,6 of 20,"5' 7""",0,5 of 8,"70""",Record: 14-7-0,0,6.50,17 of 34,50%,5.48,Southpaw,43%,48%,0.0,0,0 of 0,---,11%,0.36,86%,17 of 34,135 lbs.,M,2024
4,4:12,2024-05-18,Submission,Herb Dean,2,3,strawweight,f1,9 of 15,3 of 3,3:06,21 of 71,"Jan 12, 1985",4 of 5,19 of 64,"5' 3""",0,0 of 0,"64""",Record: 17-13-0,1,4.86,28 of 79,35%,5.38,Orthodox,49%,61%,0.1,1,2 of 6,33%,33%,0.80,76%,39 of 91,115 lbs.,3 of 6,2 of 6,0:04,15 of 54,"Nov 18, 1992",0 of 0,11 of 51,"5' 2""",0,3 of 3,"62""",Record: 11-3-0,0,4.17,17 of 60,28%,3.57,Orthodox,40%,63%,0.2,0,1 of 7,14%,33%,2.24,66%,25 of 77,115 lbs.,M,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7644,2:50,1994-03-11,KO/TKO,John McCarthy,1,No,,f1,1 of 1,0 of 0,--,1 of 3,--,7 of 9,7 of 11,"5' 10""",0,0 of 0,--,Record: 1-5-0,0,0.00,8 of 12,66%,0.00,Southpaw,0%,0%,0.0,0,0 of 0,---,0%,0.00,0%,11 of 15,170 lbs.,0 of 1,0 of 0,--,2 of 6,--,0 of 0,1 of 2,"6' 2""",0,1 of 3,--,Record: 0-1-0,0,0.00,2 of 6,33%,0.00,Orthodox,0%,0%,0.0,1,1 of 1,100%,0%,0.00,0%,2 of 6,245 lbs.,M,1994
7645,4:52,1994-03-11,Submission,John McCarthy,1,No,,f1,0 of 0,0 of 0,--,1 of 1,--,1 of 2,2 of 3,--,0,0 of 0,--,Record: 1-0-0,1,0.00,2 of 3,66%,0.00,Orthodox,0%,0%,0.0,3,1 of 1,100%,0%,0.00,0%,14 of 15,--,0 of 0,0 of 0,--,0 of 0,--,0 of 0,0 of 0,"6' 3""",0,0 of 0,--,Record: 0-1-0,0,0.00,0 of 0,---,0.00,Orthodox,0%,0%,0.0,0,0 of 1,0%,0%,0.00,0%,0 of 0,210 lbs.,M,1994
7646,12:13,1994-03-11,KO/TKO,John McCarthy,1,No,,f1,1 of 1,1 of 1,--,1 of 1,--,9 of 15,9 of 15,"6' 0""",0,1 of 1,--,Record: 2-1-0,0,0.00,11 of 17,64%,0.00,Orthodox,0%,0%,0.0,0,1 of 1,100%,0%,0.00,0%,74 of 86,210 lbs.,0 of 0,2 of 2,--,1 of 2,--,1 of 1,4 of 5,"6' 5""",0,0 of 0,--,Record: 1-3-0,0,0.00,4 of 5,80%,0.00,Orthodox,0%,0%,0.0,0,0 of 0,---,0%,0.00,0%,95 of 102,275 lbs.,M,1994
7647,0:58,1994-03-11,Submission,John McCarthy,1,No,,f1,1 of 1,1 of 1,--,0 of 0,"Aug 28, 1963",0 of 0,0 of 0,"6' 2""",0,0 of 0,--,Record: 20-17-0,0,0.00,1 of 1,100%,0.00,Orthodox,0%,0%,0.0,1,0 of 1,0%,0%,0.00,0%,1 of 1,225 lbs.,0 of 0,0 of 0,--,1 of 1,--,0 of 0,0 of 0,--,0,1 of 1,--,Record: 0-1-0,0,0.00,1 of 1,100%,0.00,Orthodox,0%,0%,0.0,0,0 of 0,---,0%,0.00,0%,2 of 2,--,M,1994


In [20]:
data['f1_control'] = data['f1_control'].replace({
    "--": 0,
})
data['f2_control'] = data['f1_control'].replace({
    "--": 0,
})

In [21]:
# Convert to centimeters
def inches_to_cm(height_str):
    # Use regular expression to extract feet and inches
    match = re.match(r"(\d+)\'\s*(\d+)\"|(\d+)\"", height_str)
    if match:
        feet = int(match.group(1)) if match.group(1) else 0
        inches = int(match.group(2)) if match.group(2) else int(match.group(3))
        total_inches = feet * 12 + inches
        return total_inches * 2.54
    else:
        return None

In [22]:
data['f1_height'] = data['f1_height'].apply(inches_to_cm)
data['f2_height'] = data['f2_height'].apply(inches_to_cm)
data['f1_reach'] = data['f1_reach'].apply(inches_to_cm)
data['f2_reach'] = data['f2_reach'].apply(inches_to_cm)

In [23]:
data.isna().sum()[data.isna().sum() > 0]

referee         24
weightclass    166
f1_height        3
f1_reach       309
f2_height       11
f2_reach       963
dtype: int64

In [24]:
data = data.dropna(subset=['f1_height', 'f2_height', 'referee', 'weightclass'])

In [25]:
data['f1_height'].corr(data['f1_reach'])
# let's fill the -- with a modeled approach 

0.8986308199703793

In [26]:
# fill reach f1
from sklearn.linear_model import LinearRegression
reach_data = data[['f1_height','f1_reach']].copy()
pred_data = reach_data[reach_data['f1_reach'].isna()]

# Split data into training and prediction sets
train_data = reach_data.dropna()
f2_data = data[['f2_height', 'f2_reach']].copy()

# Train the linear regression model
X_train = train_data[['f1_height']]
y_train = train_data['f1_reach']

model = LinearRegression()
model.fit(X_train, y_train)

# Predict missing values
X_pred = pred_data[['f1_height']]
pred_values = model.predict(X_pred)

# Replace missing values with predicted values
data.loc[data['f1_reach'].isna(), 'f1_reach'] = pred_values

In [27]:
# fill reach f2
from sklearn.linear_model import LinearRegression
reach_data = data[['f2_height','f2_reach']].copy()
pred_data = reach_data[reach_data['f2_reach'].isna()]

# Split data into training and prediction sets
train_data = reach_data.dropna()
f2_data = data[['f2_height', 'f2_reach']].copy()

# Train the linear regression model
X_train = train_data[['f2_height']]
y_train = train_data['f2_reach']

model = LinearRegression()
model.fit(X_train, y_train)

# Predict missing values
X_pred = pred_data[['f2_height']]
pred_values = model.predict(X_pred)

# Replace missing values with predicted values
data.loc[data['f2_reach'].isna(), 'f2_reach'] = pred_values

In [28]:
data.isna().sum().sum()

0

In [29]:
data.replace(['--', '---'], None, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.replace(['--', '---'], None, inplace=True)


In [30]:
data.isna().sum()[data.isna().sum() > 0]

f1_dob            22
f1_sig_str_%      21
f1_td_%         2286
f2_dob            96
f2_sig_str_%      45
f2_td_%         2807
dtype: int64

In [31]:
# Fill NaN values with 0 for selected columns
data.loc[:, 'f1_sig_str_%'] = data['f1_sig_str_%'].fillna(0)
data.loc[:, 'f1_td_%'] = data['f1_td_%'].fillna(0)
data.loc[:, 'f2_sig_str_%'] = data['f2_sig_str_%'].fillna(0)
data.loc[:, 'f2_td_%'] = data['f2_td_%'].fillna(0)

In [32]:
def pounds_to_kg(weight_str):
    # Split the string into numeric value and unit
    weight_value, unit = weight_str.split()
    
    # Convert the numeric value to float and pounds to kilograms
    weight_kg = float(weight_value) * 0.453592
    
    return round(weight_kg, 2)

data['f1_weight'] = data['f1_weight'].apply(pounds_to_kg)
data['f2_weight'] = data['f2_weight'].apply(pounds_to_kg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['f1_weight'] = data['f1_weight'].apply(pounds_to_kg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['f2_weight'] = data['f2_weight'].apply(pounds_to_kg)


In [33]:
data.isna().sum()[data.isna().sum() > 0]


f1_dob    22
f2_dob    96
dtype: int64

In [36]:
data = data.round(2)

In [37]:
data.to_csv('cleaned_data.csv', index=False)