In [1]:
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv("ufc_data_merged.csv")

In [3]:
# Changes suffix to prefix 
def fix_suffix_to_prefix(column_name):
    column_name = column_name.lower().replace('.', '_').replace(':', '').replace(' ', '_')
    if column_name.endswith('_f1') or column_name.endswith('_f2'):
        suffix = column_name[-3:]
        prefix = 'f1_' if suffix == '_f1' else 'f2_'
        return prefix + column_name[:-3]
    elif '_f1_' in column_name:
        return column_name.replace('_f1_', '_')
    elif '_f2_' in column_name:
        return column_name.replace('_f2_', '_')
    else:
        return column_name

# Sorts columns
def custom_sort(column_name):
    if not column_name.startswith('f1_') and not column_name.startswith('f2_'):
        return (0, column_name)
    elif column_name.startswith('f1_'):
        return (1, column_name)
    else:
        return (2, column_name)

In [4]:
# Fix formatting colummn names
data.columns = [fix_suffix_to_prefix(col) for col in data.columns]
# Sort columns
sorted_columns = sorted(data.columns, key=custom_sort)
data = data[sorted_columns]

In [5]:
# Drop list for redundant, and unneccesary columns
drop_list = []
data.head(2)

Unnamed: 0,card_name,end_time,f1,f2,fight_date,fights_location,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_fighter,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_unnamed_11,f1_url,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_fighter,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_unnamed_11,f2_url,f2_weight
0,UFC Fight Night: Barboza vs. Murphy,5:00,Lerone Murphy,Edson Barboza,"May 18, 2024","Las Vegas, Nevada, USA",Decision - Unanimous,Herb Dean,5,5,Featherweight Bout,Lerone Murphy,27 of 34,13 of 17,5:19,205 of 344,"Jul 22, 1991",Lerone Murphy,2 of 3,172 of 308,"5' 9""",0,21 of 22,"73""",Record: 14-0-1,0,2.6,220 of 364,60%,5.01,Orthodox,54%,60%,0.6,0,4 of 6,66%,52%,1.59,46%,259 of 410,,http://www.ufcstats.com/fighter-details/396fe8...,145 lbs.,15 of 21,2 of 3,0:17,77 of 239,"Jan 21, 1986",Edson Barboza,0 of 0,38 of 189,"5' 11""",0,26 of 32,"75""",Record: 24-12-0,0,4.65,79 of 242,32%,4.11,Orthodox,44%,56%,0.1,0,0 of 0,---,50%,0.45,72%,79 of 242,,http://www.ufcstats.com/fighter-details/64a50d...,145 lbs.
1,UFC Fight Night: Barboza vs. Murphy,1:30,Khaos Williams,Carlston Harris,"May 18, 2024","Las Vegas, Nevada, USA",KO/TKO,Dan Miragliotta,1,3,Welterweight Bout,Khaos Williams,2 of 4,0 of 0,0:01,11 of 19,"Mar 30, 1994",Khaos Williams,1 of 1,5 of 9,"6' 0""",1,5 of 7,"77""",Record: 15-3-0,0,5.42,12 of 20,60%,5.81,Orthodox,39%,42%,0.0,0,0 of 0,---,0%,0.0,80%,12 of 20,,http://www.ufcstats.com/fighter-details/2558ae...,170 lbs.,1 of 2,0 of 0,0:00,14 of 30,"Jul 09, 1987",Carlston Harris,0 of 0,10 of 25,"6' 0""",0,3 of 3,"76""",Record: 19-6-0,0,2.66,14 of 30,46%,3.18,Orthodox,49%,52%,0.8,0,0 of 0,---,29%,1.97,55%,14 of 30,,http://www.ufcstats.com/fighter-details/a53d30...,170 lbs.


In [6]:
data['winner'] = np.where(data['winner'] != '0', 'f1', data['winner'])

In [7]:
data['winner'].value_counts()

winner
f1    7506
0      143
Name: count, dtype: int64

In [8]:
drop_list += ['f1', 'f2', 'fights_location', 'f1_fighter', 'f1_unnamed_11', 'f1_url', 'f2_fighter', 'f2_unnamed_11', 'f2_url', 'card_name']

In [9]:
data.drop(columns=drop_list, inplace=True)

#### filtering data before looking at the acutal value since there are some issues with record with old fights, since they are less relevant I will remove them

In [10]:
weightclass_pattern = re.compile(r"(light\s)?(\w+weight)")

def parse_weightclass(weightclass):
    matches = re.findall(weightclass_pattern, weightclass)
    if not matches:
        return "other"
    else:
        return  ''.join(matches[0])

def parse_gender(weightclass):    
    return 'F' if 'women' in weightclass.lower() else 'M'

In [11]:
data['weightclass'] = data['weightclass'].str.lower()
data['weightclass'] = data['weightclass'].apply(parse_weightclass)
data['gender'] = data['weightclass'].apply(parse_gender)
data['weightclass'] = data[data['weightclass'] != 'other'][['weightclass']]

In [12]:
data['weightclass'].value_counts(dropna=False)

weightclass
lightweight          1319
welterweight         1276
middleweight         1009
bantamweight          867
featherweight         768
heavyweight           701
light heavyweight     685
flyweight             557
strawweight           301
NaN                   166
Name: count, dtype: int64

In [13]:
# Convert "fight_date" column to datetime format
data['fight_date'] = pd.to_datetime(data['fight_date'])

# Extract year from the "fight_date" column
data['year'] = data['fight_date'].dt.year

weightclass_counts = data.pivot_table(index='year', columns='weightclass', aggfunc='size', fill_value=0)

# Display the DataFrame
display(weightclass_counts)
display(weightclass_counts.shape)
weightclass_counts.columns

weightclass,bantamweight,featherweight,flyweight,heavyweight,light heavyweight,lightweight,middleweight,strawweight,welterweight
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1997,0,0,0,26,1,11,1,0,0
1998,0,0,0,6,3,3,8,0,1
1999,0,0,0,20,1,6,13,0,2
2000,0,0,0,13,2,7,12,0,9
2001,0,0,0,10,8,7,5,0,10
2002,0,0,0,11,9,8,12,0,13
2003,0,0,0,13,5,9,7,0,7
2004,0,0,0,7,6,6,8,0,12
2005,0,0,0,14,15,0,26,0,25
2006,0,0,0,26,33,19,36,0,42


(28, 9)

Index(['bantamweight', 'featherweight', 'flyweight', 'heavyweight',
       'light heavyweight', 'lightweight', 'middleweight', 'strawweight',
       'welterweight'],
      dtype='object', name='weightclass')

In [14]:
data.shape

(7649, 68)

In [15]:
data.isna().sum()[data.isna().sum() > 0]

referee         24
weightclass    166
f1_stance       11
f2_stance       66
dtype: int64

In [16]:
data.isna().sum().sum()

267

In [17]:
# Concatenate 'f1_stance' and 'f2_stance' columns
combined_stance = pd.concat([data['f1_stance'], data['f2_stance']])
combined_stance_mode = combined_stance.mode()[0]

# Fill NaN values in both 'f1_stance' and 'f2_stance' with combined_stance_mode
data['f1_stance'].fillna(combined_stance_mode, inplace=True)
data['f2_stance'].fillna(combined_stance_mode, inplace=True)

In [18]:
data[data['referee'].isna()]
# old fight, can be dropped

Unnamed: 0,end_time,fight_date,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_weight,gender,year
2381,5:00,2019-09-07,Decision - Unanimous,,3,3,middleweight,f1,5 of 10,0 of 2,3:39,21 of 67,"Oct 12, 1987",5 of 8,14 of 60,"6' 0""",0,7 of 7,"73""",Record: 21-6-1,0,2.84,26 of 77,33%,2.97,Orthodox,34%,55%,0.2,0,2 of 3,66%,45%,2.58,65%,47 of 106,185 lbs.,2 of 7,0 of 0,0:00,17 of 69,"Aug 02, 1984",0 of 0,6 of 51,"6' 0""",0,9 of 11,"75""",Record: 25-7-0,0,2.91,17 of 69,24%,2.96,Southpaw,35%,55%,0.6,0,0 of 0,---,28%,0.62,66%,24 of 76,205 lbs.,M,2019
2524,1:59,2019-06-01,Submission,,2,3,bantamweight,f1,5 of 6,0 of 0,1:09,18 of 69,"Aug 23, 1991",1 of 2,12 of 63,"5' 9""",0,2 of 2,"74""",Record: 4-1-0,0,6.96,19 of 71,26%,6.84,Orthodox,46%,62%,1.1,2,0 of 0,---,0%,0.0,78%,30 of 82,135 lbs.,2 of 10,0 of 2,2:35,40 of 102,"Jan 10, 1996",2 of 2,36 of 90,"5' 7""",0,4 of 6,"73""",Record: 3-1-0,0,2.72,42 of 106,39%,6.01,Orthodox,39%,73%,0.0,0,3 of 5,60%,60%,6.44,0%,48 of 112,145 lbs.,M,2019
2622,5:00,2019-03-23,Decision - Unanimous,,3,3,flyweight,f1,12 of 12,11 of 15,1:18,49 of 113,"Oct 06, 1988",0 of 0,44 of 112,"5' 4""",0,4 of 4,"64""",Record: 21-10-1,0,4.52,60 of 128,46%,4.35,Orthodox,38%,55%,0.1,0,0 of 0,---,27%,0.42,58%,70 of 142,125 lbs.,28 of 37,21 of 27,5:15,50 of 94,"Oct 04, 1984",1 of 1,16 of 54,"5' 6""",0,28 of 31,"68""",Record: 21-11-0,0,4.45,72 of 122,59%,4.63,Orthodox,49%,54%,0.7,0,1 of 2,50%,44%,0.95,34%,107 of 174,135 lbs.,M,2019
2928,5:00,2018-07-28,Decision - Unanimous,,3,3,lightweight,f1,15 of 24,17 of 25,7:11,10 of 37,"Oct 01, 1992",12 of 19,24 of 55,"5' 9""",0,0 of 2,"72""",Record: 14-8-0,1,4.66,39 of 81,48%,4.33,Orthodox,41%,55%,0.1,0,4 of 10,40%,31%,1.06,59%,86 of 137,145 lbs.,9 of 11,9 of 10,3:55,9 of 21,"Feb 23, 1989",1 of 1,8 of 18,"5' 9""",0,2 of 3,"70""",Record: 11-5-0,0,2.97,19 of 32,59%,2.48,Southpaw,51%,55%,0.5,1,2 of 6,33%,33%,2.36,70%,37 of 50,155 lbs.,M,2018
3042,5:00,2018-05-19,Decision - Split,,3,3,welterweight,f1,4 of 7,1 of 4,2:44,29 of 72,"Jul 25, 1981",0 of 0,20 of 59,"5' 6""",0,6 of 10,"67""",Record: 26-4-0,0,2.51,30 of 76,39%,2.38,Orthodox,47%,57%,0.6,0,3 of 9,33%,36%,3.94,47%,32 of 78,170 lbs.,11 of 21,13 of 18,2:13,15 of 74,"Aug 02, 1984",0 of 0,12 of 64,"6' 0""",0,5 of 7,"75""",Record: 25-7-0,0,2.91,28 of 92,30%,2.96,Southpaw,35%,55%,0.6,0,0 of 0,---,28%,0.62,66%,62 of 128,205 lbs.,M,2018
3047,5:00,2018-05-19,Decision - Unanimous,,3,3,bantamweight,f1,18 of 22,16 of 19,4:10,40 of 84,"Aug 12, 1980",10 of 17,36 of 81,"5' 6""",1,12 of 17,"66""",Record: 13-7-0,0,3.5,66 of 120,55%,3.94,Orthodox,47%,52%,0.1,0,4 of 10,40%,31%,1.74,61%,84 of 138,135 lbs.,10 of 13,4 of 6,0:08,40 of 98,"Oct 22, 1980",0 of 0,25 of 82,"5' 8""",0,9 of 9,"69""",Record: 16-8-1,0,4.68,44 of 104,42%,3.47,Orthodox,42%,53%,0.6,1,0 of 0,---,0%,0.0,52%,58 of 123,135 lbs.,M,2018
3082,5:00,2018-04-14,Decision - Unanimous,,3,3,flyweight,f1,20 of 29,4 of 7,2:48,54 of 147,"Nov 15, 1991",0 of 2,31 of 119,"5' 5""",0,7 of 8,"67""",Record: 5-3-0,0,3.77,58 of 156,37%,4.08,Orthodox,41%,58%,0.0,0,2 of 2,100%,50%,0.7,71%,70 of 171,125 lbs.,23 of 36,6 of 8,0:44,62 of 198,"Mar 30, 1989",0 of 0,34 of 158,"5' 6""",0,11 of 12,"69""",Record: 4-5-0,0,5.2,68 of 206,33%,3.34,Orthodox,37%,50%,0.0,0,1 of 4,25%,50%,0.84,14%,74 of 213,125 lbs.,M,2018
3573,0:30,2017-03-18,KO/TKO,,1,3,lightweight,f1,2 of 2,0 of 0,0:01,6 of 7,"Mar 16, 1993",0 of 0,2 of 3,"5' 10""",1,2 of 2,"73""",Record: 17-7-0,0,2.64,6 of 7,85%,2.54,Orthodox,40%,52%,0.0,0,0 of 0,---,43%,3.29,66%,6 of 7,155 lbs.,1 of 1,0 of 0,0:00,1 of 1,"May 22, 1987",0 of 0,0 of 0,"6' 1""",0,0 of 0,"75""",Record: 8-3-0,0,3.91,1 of 1,100%,2.57,Orthodox,54%,47%,1.5,0,0 of 0,---,25%,2.31,100%,1 of 1,155 lbs.,M,2017
4044,3:44,2016-04-10,KO/TKO,,1,3,lightweight,f1,1 of 5,0 of 0,0:01,25 of 59,"Aug 08, 1988",0 of 0,18 of 48,"5' 9""",1,6 of 6,"73""",Record: 27-6-0,0,3.03,25 of 59,42%,2.98,Orthodox,39%,64%,0.0,0,0 of 0,---,63%,1.34,88%,25 of 59,155 lbs.,1 of 2,0 of 1,0:00,11 of 42,"Aug 08, 1986",0 of 0,9 of 40,"5' 9""",0,1 of 1,"70""",Record: 14-7-0,0,3.26,11 of 43,25%,3.36,Orthodox,48%,62%,0.2,0,0 of 0,---,55%,0.78,34%,11 of 43,155 lbs.,M,2016
4078,5:00,2016-02-27,Decision - Unanimous,,3,3,bantamweight,f1,14 of 18,16 of 22,8:37,33 of 58,"Dec 18, 1985",23 of 29,46 of 76,"5' 8""",0,12 of 15,"69""",Record: 15-7-0,1,3.82,72 of 109,66%,4.77,Orthodox,42%,55%,0.2,0,1 of 5,20%,41%,1.26,61%,112 of 154,135 lbs.,13 of 19,6 of 8,1:23,13 of 48,"Dec 02, 1992",4 of 10,9 of 46,"5' 8""",0,1 of 1,"70""",Record: 23-9-1,0,5.48,23 of 66,34%,4.31,Switch,49%,50%,0.9,0,1 of 6,16%,39%,0.53,70%,28 of 72,135 lbs.,M,2016


In [19]:
data.dropna(how='any', inplace=True)

In [20]:
data['method'].value_counts()

method
Decision - Unanimous       2652
KO/TKO                     2364
Submission                 1438
Decision - Split            724
Decision - Majority          91
TKO - Doctor's Stoppage      83
Overturned                   56
Could Not Continue           28
DQ                           23
Name: count, dtype: int64

In [21]:
# Replace values according to the specified mapping
data['method'] = data['method'].replace({
    "TKO - Doctor's Stoppage": 'KO/TKO',
    "Decision - Majority": 'Decision - Split',
    "Overturned": 'Decision - Split',
    "Could Not Continue": 'Stopped',
    "DQ": 'Stopped'
})

In [22]:
data['method'].value_counts()
# filter out stopped 

method
Decision - Unanimous    2652
KO/TKO                  2447
Submission              1438
Decision - Split         871
Stopped                   51
Name: count, dtype: int64

In [23]:
data = data[~data['method'].isin(['Stopped'])]
data

Unnamed: 0,end_time,fight_date,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_weight,gender,year
0,5:00,2024-05-18,Decision - Unanimous,Herb Dean,5,5,featherweight,f1,27 of 34,13 of 17,5:19,205 of 344,"Jul 22, 1991",2 of 3,172 of 308,"5' 9""",0,21 of 22,"73""",Record: 14-0-1,0,2.60,220 of 364,60%,5.01,Orthodox,54%,60%,0.6,0,4 of 6,66%,52%,1.59,46%,259 of 410,145 lbs.,15 of 21,2 of 3,0:17,77 of 239,"Jan 21, 1986",0 of 0,38 of 189,"5' 11""",0,26 of 32,"75""",Record: 24-12-0,0,4.65,79 of 242,32%,4.11,Orthodox,44%,56%,0.1,0,0 of 0,---,50%,0.45,72%,79 of 242,145 lbs.,M,2024
1,1:30,2024-05-18,KO/TKO,Dan Miragliotta,1,3,welterweight,f1,2 of 4,0 of 0,0:01,11 of 19,"Mar 30, 1994",1 of 1,5 of 9,"6' 0""",1,5 of 7,"77""",Record: 15-3-0,0,5.42,12 of 20,60%,5.81,Orthodox,39%,42%,0.0,0,0 of 0,---,0%,0.00,80%,12 of 20,170 lbs.,1 of 2,0 of 0,0:00,14 of 30,"Jul 09, 1987",0 of 0,10 of 25,"6' 0""",0,3 of 3,"76""",Record: 19-6-0,0,2.66,14 of 30,46%,3.18,Orthodox,49%,52%,0.8,0,0 of 0,---,29%,1.97,55%,14 of 30,170 lbs.,M,2024
2,5:00,2024-05-18,Decision - Unanimous,Mark Smith,3,3,welterweight,f1,31 of 35,35 of 38,10:13,6 of 10,"Jan 23, 1991",2 of 4,9 of 13,"6' 1""",0,3 of 4,"77""",Record: 13-4-0,1,1.08,43 of 52,82%,2.53,Orthodox,69%,46%,0.0,0,5 of 6,83%,58%,4.04,78%,110 of 134,170 lbs.,3 of 4,3 of 4,3:44,0 of 2,"Nov 17, 1992",0 of 0,0 of 2,"5' 10""",0,0 of 0,"72""",Record: 10-5-0,0,3.69,3 of 6,50%,1.74,Orthodox,41%,45%,1.6,0,1 of 8,12%,35%,1.61,44%,15 of 20,170 lbs.,M,2024
3,2:47,2024-05-18,KO/TKO,Chris Tognoni,1,3,bantamweight,f1,1 of 2,0 of 0,0:17,15 of 28,"Nov 29, 1993",21 of 28,27 of 45,"5' 7""",1,8 of 9,"70""",Record: 17-5-0,0,5.75,36 of 56,64%,6.51,Orthodox,41%,57%,0.0,0,0 of 0,---,0%,0.00,100%,36 of 58,135 lbs.,6 of 6,0 of 0,0:00,17 of 34,"Jul 24, 1996",0 of 0,6 of 20,"5' 7""",0,5 of 8,"70""",Record: 14-7-0,0,6.50,17 of 34,50%,5.48,Southpaw,43%,48%,0.0,0,0 of 0,---,11%,0.36,86%,17 of 34,135 lbs.,M,2024
4,4:12,2024-05-18,Submission,Herb Dean,2,3,strawweight,f1,9 of 15,3 of 3,3:06,21 of 71,"Jan 12, 1985",4 of 5,19 of 64,"5' 3""",0,0 of 0,"64""",Record: 17-13-0,1,4.86,28 of 79,35%,5.38,Orthodox,49%,61%,0.1,1,2 of 6,33%,33%,0.80,76%,39 of 91,115 lbs.,3 of 6,2 of 6,0:04,15 of 54,"Nov 18, 1992",0 of 0,11 of 51,"5' 2""",0,3 of 3,"62""",Record: 11-3-0,0,4.17,17 of 60,28%,3.57,Orthodox,40%,63%,0.2,0,1 of 7,14%,33%,2.24,66%,25 of 77,115 lbs.,M,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7545,0:39,1997-02-07,Submission,John McCarthy,1,1,lightweight,f1,0 of 0,2 of 2,--,0 of 2,"Feb 12, 1974",0 of 0,2 of 3,"5' 11""",0,0 of 1,--,Record: 11-4-0,0,0.00,2 of 4,50%,0.00,Orthodox,0%,0%,0.0,1,1 of 1,100%,0%,0.00,0%,3 of 5,199 lbs.,0 of 0,0 of 0,--,0 of 0,--,0 of 0,0 of 0,"5' 9""",0,0 of 0,--,Record: 1-1-0,0,0.00,0 of 0,---,0.00,Orthodox,0%,0%,0.0,0,0 of 1,0%,0%,0.00,0%,0 of 0,190 lbs.,M,1997
7546,1:17,1997-02-07,KO/TKO,John McCarthy,1,1,heavyweight,f1,0 of 0,5 of 15,--,2 of 4,"Apr 01, 1977",10 of 10,17 of 29,"6' 0""",1,0 of 0,"74""",Record: 26-14-0 (1 NC),0,2.83,17 of 29,58%,1.59,Southpaw,46%,50%,0.6,0,0 of 0,---,60%,1.02,53%,23 of 35,185 lbs.,1 of 1,1 of 6,--,0 of 1,"Feb 07, 1965",0 of 0,0 of 6,"6' 2""",0,0 of 0,--,Record: 7-5-1,0,3.02,1 of 7,14%,1.83,Orthodox,35%,56%,0.0,0,0 of 0,---,100%,0.87,80%,1 of 7,233 lbs.,M,1997
7547,8:02,1997-02-07,KO/TKO,John McCarthy,1,1,heavyweight,f1,6 of 6,11 of 17,--,2 of 3,"Apr 26, 1965",17 of 22,23 of 35,"5' 11""",0,1 of 1,--,Record: 4-2-0,0,0.00,30 of 42,71%,0.00,Orthodox,0%,0%,0.0,1,2 of 2,100%,0%,0.00,0%,63 of 78,323 lbs.,0 of 0,2 of 2,--,1 of 3,--,0 of 0,3 of 5,"6' 1""",0,0 of 0,--,Record: 0-2-0,0,0.00,3 of 5,60%,0.00,Southpaw,0%,0%,0.0,0,0 of 0,---,0%,0.00,0%,4 of 7,215 lbs.,M,1997
7548,3:00,1997-02-07,Decision - Unanimous,John McCarthy,2,1,lightweight,f1,0 of 0,3 of 6,--,9 of 27,"Mar 13, 1969",8 of 10,20 of 43,"5' 11""",0,0 of 0,--,Record: 30-27-3 (1 NC),0,6.47,20 of 43,46%,2.26,Southpaw,51%,44%,2.3,0,1 of 1,100%,0%,0.00,66%,72 of 99,199 lbs.,0 of 0,0 of 0,--,1 of 17,"Feb 23, 1968",0 of 0,1 of 17,"5' 7""",0,0 of 0,--,Record: 9-3-0,0,0.83,1 of 17,5%,0.92,Orthodox,31%,44%,0.5,0,1 of 10,10%,28%,1.85,0%,19 of 38,205 lbs.,M,1997


In [24]:
data['f1_control'] = data['f1_control'].replace({
    "--": 0,
})
data['f2_control'] = data['f1_control'].replace({
    "--": 0,
})

In [25]:
data['f1_reach'].value_counts(dropna=False)


f1_reach
70"    710
75"    640
73"    621
71"    620
74"    620
72"    616
76"    484
77"    379
68"    371
69"    335
67"    304
66"    246
78"    230
--     220
65"    216
79"    185
80"    163
64"    127
63"     94
81"     45
62"     43
84"     41
82"     37
61"     21
83"     19
60"     12
59"      5
58"      4
Name: count, dtype: int64

In [26]:
# Convert to centimeters
def inches_to_cm(height_str):
    # Use regular expression to extract feet and inches
    match = re.match(r"(\d+)\'\s*(\d+)\"|(\d+)\"", height_str)
    if match:
        feet = int(match.group(1)) if match.group(1) else 0
        inches = int(match.group(2)) if match.group(2) else int(match.group(3))
        total_inches = feet * 12 + inches
        return total_inches * 2.54
    else:
        return None

In [27]:
data

Unnamed: 0,end_time,fight_date,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_weight,gender,year
0,5:00,2024-05-18,Decision - Unanimous,Herb Dean,5,5,featherweight,f1,27 of 34,13 of 17,5:19,205 of 344,"Jul 22, 1991",2 of 3,172 of 308,"5' 9""",0,21 of 22,"73""",Record: 14-0-1,0,2.60,220 of 364,60%,5.01,Orthodox,54%,60%,0.6,0,4 of 6,66%,52%,1.59,46%,259 of 410,145 lbs.,15 of 21,2 of 3,5:19,77 of 239,"Jan 21, 1986",0 of 0,38 of 189,"5' 11""",0,26 of 32,"75""",Record: 24-12-0,0,4.65,79 of 242,32%,4.11,Orthodox,44%,56%,0.1,0,0 of 0,---,50%,0.45,72%,79 of 242,145 lbs.,M,2024
1,1:30,2024-05-18,KO/TKO,Dan Miragliotta,1,3,welterweight,f1,2 of 4,0 of 0,0:01,11 of 19,"Mar 30, 1994",1 of 1,5 of 9,"6' 0""",1,5 of 7,"77""",Record: 15-3-0,0,5.42,12 of 20,60%,5.81,Orthodox,39%,42%,0.0,0,0 of 0,---,0%,0.00,80%,12 of 20,170 lbs.,1 of 2,0 of 0,0:01,14 of 30,"Jul 09, 1987",0 of 0,10 of 25,"6' 0""",0,3 of 3,"76""",Record: 19-6-0,0,2.66,14 of 30,46%,3.18,Orthodox,49%,52%,0.8,0,0 of 0,---,29%,1.97,55%,14 of 30,170 lbs.,M,2024
2,5:00,2024-05-18,Decision - Unanimous,Mark Smith,3,3,welterweight,f1,31 of 35,35 of 38,10:13,6 of 10,"Jan 23, 1991",2 of 4,9 of 13,"6' 1""",0,3 of 4,"77""",Record: 13-4-0,1,1.08,43 of 52,82%,2.53,Orthodox,69%,46%,0.0,0,5 of 6,83%,58%,4.04,78%,110 of 134,170 lbs.,3 of 4,3 of 4,10:13,0 of 2,"Nov 17, 1992",0 of 0,0 of 2,"5' 10""",0,0 of 0,"72""",Record: 10-5-0,0,3.69,3 of 6,50%,1.74,Orthodox,41%,45%,1.6,0,1 of 8,12%,35%,1.61,44%,15 of 20,170 lbs.,M,2024
3,2:47,2024-05-18,KO/TKO,Chris Tognoni,1,3,bantamweight,f1,1 of 2,0 of 0,0:17,15 of 28,"Nov 29, 1993",21 of 28,27 of 45,"5' 7""",1,8 of 9,"70""",Record: 17-5-0,0,5.75,36 of 56,64%,6.51,Orthodox,41%,57%,0.0,0,0 of 0,---,0%,0.00,100%,36 of 58,135 lbs.,6 of 6,0 of 0,0:17,17 of 34,"Jul 24, 1996",0 of 0,6 of 20,"5' 7""",0,5 of 8,"70""",Record: 14-7-0,0,6.50,17 of 34,50%,5.48,Southpaw,43%,48%,0.0,0,0 of 0,---,11%,0.36,86%,17 of 34,135 lbs.,M,2024
4,4:12,2024-05-18,Submission,Herb Dean,2,3,strawweight,f1,9 of 15,3 of 3,3:06,21 of 71,"Jan 12, 1985",4 of 5,19 of 64,"5' 3""",0,0 of 0,"64""",Record: 17-13-0,1,4.86,28 of 79,35%,5.38,Orthodox,49%,61%,0.1,1,2 of 6,33%,33%,0.80,76%,39 of 91,115 lbs.,3 of 6,2 of 6,3:06,15 of 54,"Nov 18, 1992",0 of 0,11 of 51,"5' 2""",0,3 of 3,"62""",Record: 11-3-0,0,4.17,17 of 60,28%,3.57,Orthodox,40%,63%,0.2,0,1 of 7,14%,33%,2.24,66%,25 of 77,115 lbs.,M,2024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7545,0:39,1997-02-07,Submission,John McCarthy,1,1,lightweight,f1,0 of 0,2 of 2,0,0 of 2,"Feb 12, 1974",0 of 0,2 of 3,"5' 11""",0,0 of 1,--,Record: 11-4-0,0,0.00,2 of 4,50%,0.00,Orthodox,0%,0%,0.0,1,1 of 1,100%,0%,0.00,0%,3 of 5,199 lbs.,0 of 0,0 of 0,0,0 of 0,--,0 of 0,0 of 0,"5' 9""",0,0 of 0,--,Record: 1-1-0,0,0.00,0 of 0,---,0.00,Orthodox,0%,0%,0.0,0,0 of 1,0%,0%,0.00,0%,0 of 0,190 lbs.,M,1997
7546,1:17,1997-02-07,KO/TKO,John McCarthy,1,1,heavyweight,f1,0 of 0,5 of 15,0,2 of 4,"Apr 01, 1977",10 of 10,17 of 29,"6' 0""",1,0 of 0,"74""",Record: 26-14-0 (1 NC),0,2.83,17 of 29,58%,1.59,Southpaw,46%,50%,0.6,0,0 of 0,---,60%,1.02,53%,23 of 35,185 lbs.,1 of 1,1 of 6,0,0 of 1,"Feb 07, 1965",0 of 0,0 of 6,"6' 2""",0,0 of 0,--,Record: 7-5-1,0,3.02,1 of 7,14%,1.83,Orthodox,35%,56%,0.0,0,0 of 0,---,100%,0.87,80%,1 of 7,233 lbs.,M,1997
7547,8:02,1997-02-07,KO/TKO,John McCarthy,1,1,heavyweight,f1,6 of 6,11 of 17,0,2 of 3,"Apr 26, 1965",17 of 22,23 of 35,"5' 11""",0,1 of 1,--,Record: 4-2-0,0,0.00,30 of 42,71%,0.00,Orthodox,0%,0%,0.0,1,2 of 2,100%,0%,0.00,0%,63 of 78,323 lbs.,0 of 0,2 of 2,0,1 of 3,--,0 of 0,3 of 5,"6' 1""",0,0 of 0,--,Record: 0-2-0,0,0.00,3 of 5,60%,0.00,Southpaw,0%,0%,0.0,0,0 of 0,---,0%,0.00,0%,4 of 7,215 lbs.,M,1997
7548,3:00,1997-02-07,Decision - Unanimous,John McCarthy,2,1,lightweight,f1,0 of 0,3 of 6,0,9 of 27,"Mar 13, 1969",8 of 10,20 of 43,"5' 11""",0,0 of 0,--,Record: 30-27-3 (1 NC),0,6.47,20 of 43,46%,2.26,Southpaw,51%,44%,2.3,0,1 of 1,100%,0%,0.00,66%,72 of 99,199 lbs.,0 of 0,0 of 0,0,1 of 17,"Feb 23, 1968",0 of 0,1 of 17,"5' 7""",0,0 of 0,--,Record: 9-3-0,0,0.83,1 of 17,5%,0.92,Orthodox,31%,44%,0.5,0,1 of 10,10%,28%,1.85,0%,19 of 38,205 lbs.,M,1997


In [28]:
data['f1_height'] = data['f1_height'].apply(inches_to_cm)
data['f2_height'] = data['f2_height'].apply(inches_to_cm)
data['f1_reach'] = data['f1_reach'].apply(inches_to_cm)
data['f2_reach'] = data['f2_reach'].apply(inches_to_cm)

In [30]:
data['f1_height'].corr(data['f1_reach'])
# let's fill the -- with a modeled approach 

0.8986037184852236