In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Fight details: Full list of all fights (Bouts), with associated Events
df_fight_details = pd.read_csv(
    'ufc_fight_details.csv',
    na_values="--"
)

# Fight results: For Outcome and Bout-Weightclass mapping
df_fight_results = pd.read_csv(
    'ufc_fight_results.csv',
    na_values="--"
)

# Event details: For Date mapping
df_event_details = pd.read_csv(
    'ufc_event_details.csv',
    na_values="--"
)
df_event_details['DATE'] = pd.to_datetime(df_event_details['DATE'], format="%B %d, %Y")

# Fighter stats: Basic fighter stats like height, weight, stance, DOB
df_fighter_tott = pd.read_csv(
    'ufc_fighter_tott.csv',
    na_values="--"
)
df_fighter_tott['DOB'] = pd.to_datetime(df_fighter_tott['DOB'], format="%b %d, %Y")

# Clean

## `df_fight_results`
We need to split the `BOUT` column containing the string listing the two fighters into two columns. The order is important here. The first fighter is the Bout's headliner and most likely the higher ranked fighter. This is the red corner, leaving the second mentioned fighter to Blue. Our match prediction will subsequently be probability of a Red victory.

Also we need to create a "winner" column. Both of these must come from the fight results table, because the ordering of the names can be different between the the fight details and fight results tables. Since the winners are specified by order of "W/L" or "L/W", the name ordering matters.

Start by looking at the value counts of the `OUTCOME` column in the fight results df. We see that 64% of the time the red fighter wins, 34% the blue, 1% no contest, and 0.7% was a draw

In [3]:
df_fight_results['OUTCOME'].value_counts(normalize=True)

OUTCOME
W/L      0.638124
L/W      0.344292
NC/NC    0.010194
D/D      0.007390
Name: proportion, dtype: float64

Now we'll create a function that generates our dependent variable, i.e., a boolean flag indicating a red victory. The function maps 'W/L' to 1 (Red win), 'L/W' to 0 (Red loss), and anything else to nan (neither a victory or loss)

In [4]:
def did_red_win(outcome: str) -> bool:
    match outcome:
        case "W/L":
            result = 1 # Red won
        case "L/W":
            result = 0 # Red lost
        case _:
            result = np.nan # Other scenario, like no contest or draw
    return result

df_fight_results['Red_Victory'] = df_fight_results['OUTCOME'].apply(did_red_win)

# Confirm the value counts of red victory matches the outcome column (64% red victory, 34% blue, and 1.7% neither)
df_fight_results['Red_Victory'].value_counts(normalize=True, dropna=False)

Red_Victory
1.0    0.638124
0.0    0.344292
NaN    0.017584
Name: proportion, dtype: float64

Now we'll create the Red and blue fighter columns by assuming the first-listed fighter in the `BOUT` column is the red fighter, and the second is blue. For example in the event `Jairzinho Rozenstruik  vs. Ciryl Gane`, Rozenstruik is the Red fighter and Gane is Blue.

In [5]:
df_fight_results[['Red', 'Blue']] = df_fight_results['BOUT'].str.split(' vs. ', expand=True)
df_fight_results['Red'] = df_fight_results['Red'].str.strip()
df_fight_results['Blue'] = df_fight_results['Blue'].str.strip()

View a few rows to confirm tranformations

In [6]:
df_fight_results.sample(3)

Unnamed: 0,EVENT,BOUT,OUTCOME,WEIGHTCLASS,METHOD,ROUND,TIME,TIME FORMAT,REFEREE,DETAILS,URL,Red_Victory,Red,Blue
5886,UFC on FX: Johnson vs McCall,Jake Hecht vs. Sean Pierson,L/W,Welterweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Chris Adams,Rich Coreen 28 - 29.Hector Gomez 28 - 29.Barry...,http://ufcstats.com/fight-details/d08a7d4de3c7...,0.0,Jake Hecht,Sean Pierson
3829,UFC Fight Night: Rodriguez vs. Penn,Anthony Rocco Martin vs. Alex White,W/L,Lightweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Al Guinee,Chris Flores 27 - 30.Jeff Mullen 27 - 30.Junic...,http://ufcstats.com/fight-details/815647949087...,1.0,Anthony Rocco Martin,Alex White
2784,UFC Fight Night: Barboza vs. Gaethje,Sodiq Yusuff vs. Sheymon Moraes,W/L,Featherweight Bout,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Dan Miragliotta,Dave Tirelli 27 - 29.Eric Colon 28 - 29.David ...,http://ufcstats.com/fight-details/818b55b08701...,1.0,Sodiq Yusuff,Sheymon Moraes


## `df_fighter_tott`

### Convert fighter height to inches
Input data contains height in string form as `{feet}' {inches}"`, e.g., `5' 6"` for five feet 6 inches, or 66 inches. Convert to a numeric column (in inches).

In [7]:
# Confirm all columns have this format:
re_pattern = r"\d' \d+\"" # e.g., 5' 10" or 6' 2"

# Will return all `True` if every non-null value follows this pattern.
df_fighter_tott['HEIGHT'].dropna().str.contains(re_pattern, regex=True).value_counts()

HEIGHT
True    3553
Name: count, dtype: int64

Having confirmed the string format, we can create the logic to convert

In [8]:
# Convert fighter height text field to numeric field in inches
def get_inches(feet_and_inches: str) -> float:
    """Convert strings of heights (e.g., 5' 6") to number of inches"""

    if type(feet_and_inches) == str:

        ft_inches_list = feet_and_inches.strip('"').split("' ")

        inches = int(ft_inches_list[0]) * 12 + int(ft_inches_list[1])

        result = inches

    else: # For nan inputs, return nan
        result = np.nan

    return result

df_fighter_tott['HEIGHT_in'] = df_fighter_tott['HEIGHT'].apply(get_inches)

### Convert fighter weights
Convert weight string to float, noting that all weights are expressed as "{weight} lbs."

In [9]:
# Confirm the pattern
df_fighter_tott['WEIGHT'].str.contains(' lbs.').value_counts()

WEIGHT
True    4164
Name: count, dtype: int64

In [10]:
df_fighter_tott['WEIGHT_lbs'] = df_fighter_tott['WEIGHT'].str.strip(' lbs.').astype(float)

### Convert fighter reach
Simpler than height, as the format is just a two-digit number with the inches symbol (") at the end.

In [11]:
# Confirm the pattern. Should be only `True`s
df_fighter_tott['REACH'].dropna().str.contains('"').value_counts()

REACH
True    1840
Name: count, dtype: int64

In [12]:
# Convert to number
df_fighter_tott['REACH_in'] = df_fighter_tott['REACH'].str.strip('"').astype(float)

View sample to confirm transformations

In [13]:
df_fighter_tott.sample(3)

Unnamed: 0,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,URL,HEIGHT_in,WEIGHT_lbs,REACH_in
245,Eric Bedard,"6' 2""",239 lbs.,,,1984-09-11,http://ufcstats.com/fighter-details/ce05955c26...,74.0,239.0,
2303,Olivier Murad,"5' 10""",145 lbs.,"72""",Southpaw,1993-12-21,http://ufcstats.com/fighter-details/c3dd69333e...,70.0,145.0,72.0
1490,Valeri Ignatov,"5' 9""",170 lbs.,,Orthodox,NaT,http://ufcstats.com/fighter-details/7b6ce3a946...,69.0,170.0,


# JOINS

## Create `df_fight_all` by combining `df_fight_details`, `df_event_details` and `df_fight_results`

### Append Event Date to fight details

In [14]:
# Merge date field into events
df_fight_all = pd.merge(
    df_fight_details,#.drop(columns=['URL']),
    df_event_details.drop(columns=['URL']),
    on='EVENT',
    how='left',
    indicator=True
).rename(columns={'DATE': 'EVENT_DATE'})

# Confirm all merge indicator values are "both"
# meaning the key was present in both tables
print(df_fight_all['_merge'].value_counts(normalize=True)['both'] == 1.0)

df_fight_all.drop(columns=['_merge'], inplace=True)

True


### Append Outcome and Weightclass to fight details

In [15]:
from pprint import pprint
# The fight details and fight results tables have some inconsistencies in their spellings.
# However the `URL` field is unique and spelled the same for each. Will use this as a join key.

df_fight_all = pd.merge(
    df_fight_all.drop(columns=['EVENT', 'BOUT']), # Dropping these because they sometimes conflict with the same cols in df_fight_results
    df_fight_results[['URL', 'EVENT', 'BOUT', 'WEIGHTCLASS', 'Red', 'Blue', 'Red_Victory']],
    on='URL',
    how='left',
    indicator=True
)

# What % of rows matched? 99.8% (13 rows)
print(df_fight_all['_merge'].value_counts(normalize=True))

# The 0.2% correspond to a single event, which contained 13 Bouts
print('Number of unmatched Events: ', df_fight_all.loc[df_fight_all['_merge'] != 'both', 'EVENT'].unique())
print('List of unmatched of Bouts:')
pprint(df_fight_all.loc[df_fight_all['_merge'] != 'both', 'BOUT'].unique())

# Drop indicator column
df_fight_all.drop(columns=['_merge'], inplace=True)

_merge
both          0.998346
left_only     0.001654
right_only    0.000000
Name: proportion, dtype: float64
Number of unmatched Events:  [nan]
List of unmatched of Bouts:
array([nan], dtype=object)


## Create `df` - Append fighter details per red and blue to fight data

In [16]:
# Create a dataframe containing the red and blue fighter tott values for each match

# Append fighter details for Red
df = pd.merge(
    df_fight_all,
    df_fighter_tott.drop(columns=['HEIGHT', 'WEIGHT', 'REACH', 'URL']), # Drop the pre-converted cols
    how='left',
    left_on='Red',
    right_on='FIGHTER'
)

# Append fighter details for Blue
df = pd.merge(
    df,
    df_fighter_tott.drop(columns=['HEIGHT', 'WEIGHT', 'REACH', 'URL']),
    how='left',
    left_on='Blue', right_on='FIGHTER', suffixes=('_Red', '_Blue')
).drop(columns=['Red', 'Blue'])


## Missing Fighter Data

In [17]:
# Missing figher names on join
# In a few cases, there was not a match between fighters. About 98.5% of all fights had both
# fighters matched against the fighters table.

df[['FIGHTER_Red', 'FIGHTER_Blue']].isna().value_counts(normalize=True)

FIGHTER_Red  FIGHTER_Blue
False        False           0.985313
             True            0.006711
True         False           0.006204
             True            0.001773
Name: proportion, dtype: float64

Missing Data will be further explored in the EDA notebook.

# Feature Engineering
In addition to knowing the basic details about fighters, we also want to know how they compare to each other, and how they compare to others in their own weight division
* Age at time of fight
* Differentials (Red vs Blue) for attributes like height, reach, weight, age
* Number of prior fights
* Number of prior victories


## Age at time of fight
We have DOB for both corners and the date of the fight, so we can calculate both ages

In [18]:
df['Age_yrs_Red'] = (df['EVENT_DATE'] - df['DOB_Red']).dt.days / 365.24
df['Age_yrs_Blue'] = (df['EVENT_DATE'] - df['DOB_Blue']).dt.days / 365.24

## Attribute differentials
The disparity in the Red and Blue fighters with respect to the attributes we have could indicate an advantage of one fighter over the other. We'll define all of these by appending a `_diff` to the column names following the exising attribute columns

In [19]:
## Attribute Differentials
df['WEIGHT_lbs_diff'] = df['WEIGHT_lbs_Red'] - df['WEIGHT_lbs_Blue']
df['HEIGHT_in_diff'] = df['HEIGHT_in_Red'] - df['HEIGHT_in_Blue']
df['REACH_in_diff'] = df['REACH_in_Red'] - df['REACH_in_Blue']
df['Age_diff'] = (df['DOB_Red'] - df['DOB_Blue']).dt.days / 365.24

## Stance Differentials
What does it mean when a lefty takes on a rightie? Will this have an effect?

In [20]:
df_fighter_tott['STANCE'].value_counts()

STANCE
Orthodox       2274
Southpaw        510
Switch          148
Open Stance       7
Sideways          3
Name: count, dtype: int64

In [21]:
df['STANCE_diff'] = df['STANCE_Red'] + '_' + df['STANCE_Blue']

# Example
df[['STANCE_Red', 'STANCE_Blue', 'STANCE_diff']].sample(5)

Unnamed: 0,STANCE_Red,STANCE_Blue,STANCE_diff
5472,Orthodox,Southpaw,Orthodox_Southpaw
7863,Southpaw,Orthodox,Southpaw_Orthodox
3869,Orthodox,Orthodox,Orthodox_Orthodox
1999,Orthodox,Orthodox,Orthodox_Orthodox
1200,Orthodox,Switch,Orthodox_Switch


## Prior fights

Another clear set of features we can engineer from this data is each fighter's record **prior to** the fight. This feature will be dynamic, since each fighter's record will change for each fight.

To do this, we will need to do a separate analysis of the `df_fight_all` dataset created above, since this contains both fighter names, the outcome and the date. That's all we'll need.

In [22]:
# Create melted table because we need one row per bout per fighter.
# Current table is one row per bout

df_prior_fights = df_fight_all.melt(
    id_vars=['EVENT', 'BOUT', 'URL', 'Red_Victory', 'EVENT_DATE'],
    value_vars=['Red', 'Blue'],
    var_name='Corner',
    value_name='Fighter',
)


In [23]:
# Was fighter in melted row victorious?

def was_victorious(row):
    if ((row['Corner'] == 'Red') & (row['Red_Victory'] == 1)):
        result = 1
    elif ((row['Corner'] == 'Red') & (row['Red_Victory'] == 0)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 1)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 0)):
        result = 1
    else:
        result = np.nan

    return result

# Apply func to calc the `Victory` column
df_prior_fights['Victory'] = df_prior_fights.apply(lambda row: was_victorious(row), axis=1)

# Confirm
df_prior_fights[['Corner', 'Red_Victory', 'Victory']].value_counts()


Corner  Red_Victory  Victory
Blue    1.0          0.0        5008
Red     1.0          1.0        5008
Blue    0.0          1.0        2702
Red     0.0          0.0        2702
Name: count, dtype: int64

In [24]:
# Simplify prior fights df to show only necessary columns
df_prior_fights.drop(columns=['BOUT', 'URL', 'Red_Victory', 'Corner'], inplace=True)

df_prior_fights.head()

Unnamed: 0,EVENT,EVENT_DATE,Fighter,Victory
0,UFC Fight Night: Royval vs. Taira,2024-10-12,Brandon Royval,1.0
1,UFC Fight Night: Royval vs. Taira,2024-10-12,Brad Tavares,0.0
2,UFC Fight Night: Royval vs. Taira,2024-10-12,Chidi Njokuani,1.0
3,UFC Fight Night: Royval vs. Taira,2024-10-12,Grant Dawson,1.0
4,UFC Fight Night: Royval vs. Taira,2024-10-12,Daniel Rodriguez,1.0


### Add prior wins, losses and total fights into `df`

In [25]:
def get_record(fighter: str, date: 'datetime') -> tuple:
    """Accept fighter name and date, return record prior to that date"""

    df_prior_fights_sub = df_prior_fights[
        (df_prior_fights['Fighter'] == fighter) &
        (df_prior_fights['EVENT_DATE'] < date)
    ]

    fights = df_prior_fights_sub['Victory'].count()
    victories = df_prior_fights_sub['Victory'].sum()
    losses = fights - victories
    f_win = victories / fights

    return fights, victories, losses, f_win

get_record('Brandon Royval', pd.to_datetime(['2024-10-12'])[0])

(np.int64(9), np.float64(6.0), np.float64(3.0), np.float64(0.6666666666666666))

In [26]:
# df_test = df.sample(3)

for i, row in df.iterrows():
    event_date = row['EVENT_DATE']
    fighter_red = row['FIGHTER_Red']
    fighter_blue = row['FIGHTER_Blue']

    df.loc[i, ['Red_prior_fights', 'Red_prior_victories', 'Red_prior_losses', 'Red_prior_f_win']] = get_record(fighter_red, event_date)
    df.loc[i, ['Blue_prior_fights', 'Blue_prior_victories', 'Blue_prior_losses', 'Blue_prior_f_win']] = get_record(fighter_blue, event_date)

  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = victories / fights
  f_win = vict

In [27]:
df.to_clipboard()

# Missing data (migrate to EDA section)

In [18]:
df_fighter_tott['HEIGHT'].isna().value_counts(normalize=True)[True]

np.float64(0.16674484052532834)

In [19]:
# In addition to the 1.3% of fights with one fighter missing, there were further instances of
# missing data for fighters 

missing_dict = {col: df_fighter_tott[col].isna().value_counts(normalize=True) for col in df_fighter_tott.columns}
missing_df = pd.DataFrame.from_dict(missing_dict).T
missing_df

Unnamed: 0,False,True
FIGHTER,1.0,
HEIGHT,0.833255,0.166745
WEIGHT,0.976548,0.023452
REACH,0.43152,0.56848
STANCE,0.689962,0.310038
DOB,0.797842,0.202158
URL,1.0,
HEIGHT_in,0.833255,0.166745
WEIGHT_lbs,0.976548,0.023452
REACH_in,0.43152,0.56848
