In [1]:
import pandas as pd
import numpy as np
import math

# Introduction


# Dataset Description

@TODO describe datasets

The five datasets downloaded include:
* ufc_event_details.csv
* ufc_fight_details.csv
* ufc_fight_results.csv
* ufc_fight_stats.csv
* ufc_fighter_details.csv
* ufc_fighter_tott.csv

In [2]:
# Event details: For Date mapping
df_event_details = pd.read_csv(
    'processed data/ufc_event_details.csv',
    na_values="--"
)
df_event_details['EVENT_DATE'] = pd.to_datetime(df_event_details['DATE'])
df_event_details.drop(columns=['DATE'])

# Fight results: For Outcome and Bout-Weightclass mapping
df_fight_results = pd.read_csv(
    'processed data/ufc_fight_results.csv',
    na_values="--"
)

# Fight stats: stats per round and fighter for each bout
df_fight_stats = pd.read_csv(
    'processed data/ufc_fight_stats.csv',
    na_values="--"
)

# Fighter stats: Basic fighter stats like height, weight, stance, DOB
df_fighter = pd.read_csv(
    'processed data/ufc_fighter.csv',
    na_values="--"
)
df_fighter['DOB'] = pd.to_datetime(df_fighter['DOB'])


# Clean

## `df_fight_results`
We need to split the `BOUT` column containing the string listing the two fighters into two columns. The order is important here. The first fighter is the Bout's headliner and most likely the higher ranked fighter. This is the red corner, leaving the second mentioned fighter to Blue. Our match prediction will subsequently be probability of a Red victory.

Also we need to create a "winner" column. Both of these must come from the fight results table, because the ordering of the names can be different between the the fight details and fight results tables. Since the winners are specified by order of "W/L" or "L/W", the name ordering matters.

Start by looking at the value counts of the `OUTCOME` column in the fight results df. We see that 64% of the time the red fighter wins, 34% the blue, 1% no contest, and 0.7% was a draw

In [3]:
df_fight_results['OUTCOME'].value_counts(normalize=True)

OUTCOME
W/L      0.638124
L/W      0.344292
NC/NC    0.010194
D/D      0.007390
Name: proportion, dtype: float64

Now we'll create a function that generates our dependent variable, i.e., a boolean flag indicating a red victory. The function maps 'W/L' to 1 (Red win), 'L/W' to 0 (Red loss), and anything else to nan (neither a victory or loss)

In [4]:
def did_red_win(outcome: str) -> bool:
    match outcome:
        case "W/L":
            result = 1 # Red won
        case "L/W":
            result = 0 # Red lost
        case _:
            result = np.nan # Other scenario, like no contest or draw
    return result

df_fight_results['Red_Victory'] = df_fight_results['OUTCOME'].apply(did_red_win)

# Confirm the value counts of red victory matches the outcome column (64% red victory, 34% blue, and 1.7% neither)
df_fight_results['Red_Victory'].value_counts(normalize=True, dropna=False)

Red_Victory
1.0    0.638124
0.0    0.344292
NaN    0.017584
Name: proportion, dtype: float64

## `df_fighter`

### Convert height to inches
Input data contains height in string form as `{feet}' {inches}"`, e.g., `5' 6"` for five feet 6 inches, or 66 inches. Convert to a numeric column (in inches).

In [5]:
# Confirm all columns have this format:
re_pattern = r"\d' \d+\"" # e.g., 5' 10" or 6' 2"

# Will return all `True` if every non-null value follows this pattern.
df_fighter['HEIGHT'].dropna().str.contains(re_pattern, regex=True).value_counts()

HEIGHT
True    3955
Name: count, dtype: int64

Having confirmed the string format, we can create the logic to convert

In [6]:
# Convert fighter height text field to numeric field in inches
def get_inches(feet_and_inches: str) -> float:
    """Convert strings of heights (e.g., 5' 6") to number of inches"""

    if type(feet_and_inches) == str:

        ft_inches_list = feet_and_inches.strip('"').split("' ")

        inches = int(ft_inches_list[0]) * 12 + int(ft_inches_list[1])

        result = inches

    else: # For nan inputs, return nan
        result = np.nan

    return result

df_fighter['HEIGHT_in'] = df_fighter['HEIGHT'].apply(get_inches)

### Convert weights to lbs
Convert weight string to float, noting that all weights are expressed as "{weight} lbs."

In [7]:
# Confirm the pattern
df_fighter['WEIGHT'].str.contains(' lbs.').value_counts()

WEIGHT
True    4174
Name: count, dtype: int64

In [8]:
df_fighter['WEIGHT_lbs'] = df_fighter['WEIGHT'].str.strip(' lbs.').astype(float)

### Convert fighter reach
Simpler than height, as the format is just a two-digit number with the inches symbol (") at the end.

In [9]:
# Confirm the pattern. Should be only `True`s
df_fighter['REACH'].dropna().str.contains('"').value_counts()

REACH
True    2331
Name: count, dtype: int64

In [10]:
# Convert to number
df_fighter['REACH_in'] = df_fighter['REACH'].str.strip('"').astype(float)

View sample to confirm transformations

In [11]:
df_fighter.sample(3)

Unnamed: 0,FIRST,LAST,NICKNAME,URL,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,HEIGHT_in,WEIGHT_lbs,REACH_in
1785,Chris,Camozzi,,http://ufcstats.com/fighter-details/6c2030e0a1...,Chris Camozzi,"6' 2""",185 lbs.,"75""",Orthodox,1986-11-20,74.0,185.0,75.0
1534,James,Terry,Intensity,http://ufcstats.com/fighter-details/5befa79331...,James Terry,"5' 10""",170 lbs.,,Orthodox,1981-03-16,70.0,170.0,
2280,Jose,Ochoa,Kalzifer,http://ufcstats.com/fighter-details/88be62d6c1...,Jose Ochoa,,125 lbs.,,,2000-12-31,,125.0,


# Create `df_fight_all` - Append Event info to Fight Results
This is only because we will want the event date to use for feature calculation later, and this only lives in the event details table

In [12]:
from pprint import pprint
# The fight details and fight results tables have some inconsistencies in their spellings.
# However the `URL` field is unique and spelled the same for each. Will use this as a join key.

df_fight_all = pd.merge(
    df_fight_results,
    df_event_details,
    on='EVENT',
    how='left',
    suffixes=['_fight', '_event'],
    indicator=True
)

# Confirm join quality - for all rows the left-hand `EVENT` was matched by the right
print(df_fight_all['_merge'].value_counts(normalize=True))

# Drop indicator column
df_fight_all.drop(columns=['_merge'], inplace=True)

_merge
both          1.0
left_only     0.0
right_only    0.0
Name: proportion, dtype: float64


Missing Data will be further explored in the EDA notebook.

# Gender and Weightclass
These can both be derived from the `WEIGHTCLASS` column of `df_fight_all`. We want to merge it into the fighter info so it can be used along with other fighter information.

In [13]:
df_fight_all['WEIGHTCLASS'].value_counts()

WEIGHTCLASS
Lightweight Bout                                                       1298
Welterweight Bout                                                      1238
Middleweight Bout                                                       977
Featherweight Bout                                                      732
Heavyweight Bout                                                        652
                                                                       ... 
Ultimate Fighter China Welterweight Tournament Title Bout                 1
TUF Nations Canada vs. Australia Welterweight Tournament Title Bout       1
Ultimate Fighter Brazil 3 Middleweight Tournament Title Bout              1
Ultimate Fighter Brazil 3 Heavyweight Tournament Title Bout               1
UFC 2 Tournament Title Bout                                               1
Name: count, Length: 109, dtype: int64

There are 109 distinct values for `WEIGHTCLASS`, but there are two usable elements in each string:
1) The weight division is mentioned, following the general pattern '{..}weight'
2) If it's a women's fight, the word 'woman' or 'women' appears, both of which have the pattern 'wom'

Next we'll use these observations to extract a lower cardinality weightclass category along with a 2-class gender variable

In [14]:
import re

# Func to extract weight class from the `WEIGHTCLASS` string values
def get_weightclass(weightclass: str) -> str:
    search = re.search(r'\w+\s?weight', weightclass, flags=re.IGNORECASE)

    return search[0] if search else np.nan

# Extract gender from `WEIGHTCLASS`
df_fight_all['gender'] = df_fight_all['WEIGHTCLASS'].apply(
    lambda x: 'W' if re.search(r'wom', x, flags=re.IGNORECASE) else 'M'
)

# Extract weight class from `WEIGHTCLASS`
df_fight_all['weightclass'] = df_fight_all['WEIGHTCLASS'].apply(
    # lambda x: re.search('\w+weight', x, flags=re.IGNORECASE)[0]
    get_weightclass
)

# Check extractions
pd.crosstab(df_fight_all['weightclass'], df_fight_all['gender'], dropna=False)

gender,M,W
weightclass,Unnamed: 1_level_1,Unnamed: 2_level_1
Bantamweight,681,215
Catch Weight,66,0
Featherweight,760,29
Flyweight,349,233
Heavyweight,1407,0
Lightweight,1346,0
Middleweight,1029,0
Open Weight,101,0
Strawweight,0,316
Welterweight,1301,0


- Gender has no missings (the function didn't allow them), but it is still assumed to be a male fight unless 'wom' appeared
- There is an edge case where 15 men's fights had nan's for `weightclass`. As shown below, none of the original `WEIGHTCLASS` values for those instances gave a clue as to the actual weight class. Note also they are very old.

In [15]:
# Where is `weightclass` null?
df_fight_all[df_fight_all['weightclass'].isna()]['WEIGHTCLASS'].value_counts()

WEIGHTCLASS
UFC Superfight Championship Bout               5
Ultimate Ultimate '96 Tournament Title Bout    1
UFC 10 Tournament Title Bout                   1
UFC 8 Tournament Title Bout                    1
Ultimate Ultimate '95 Tournament Title Bout    1
UFC 7 Tournament Title Bout                    1
UFC 6 Tournament Title Bout                    1
UFC 5 Tournament Title Bout                    1
UFC 4 Tournament Title Bout                    1
UFC 3 Tournament Title Bout                    1
UFC 2 Tournament Title Bout                    1
Name: count, dtype: int64

# Imputing Fighter Gaps
Before saving our final ML dataset the missingness of this dataset needs to be addressed. This table will ultimately be merged into the ML dataset, so its missingness could have a big impact on that of the final ML dataset. There are cleverer ways of imputing these missings than simply doing a flat imputation on the final dataset. If we impute fighter stats prior to merging, we can take advantage of things like grouped averaged by gender and weightclass, providing a more accurate picture of proper imputed values.

But to start, what are the missing rates for `df_fighter`?

In [16]:
# df of missing rates
df_fighter.isna().mean().sort_values(ascending=False).rename('Missing Rate').to_frame()

Unnamed: 0,Missing Rate
REACH,0.452817
REACH_in,0.452817
NICKNAME,0.448592
STANCE,0.194601
DOB,0.17723
HEIGHT,0.071596
HEIGHT_in,0.071596
WEIGHT,0.020188
WEIGHT_lbs,0.020188
FIRST,0.003521


If we discount `NICKNAME` (probably useless as a feature), then four features have higher than a 5% missing rate. We want to 

## Height & Weight

There are a few edge cases where a 2-item list is the mode, indicating more than one weightclass or gender per fighter. Since these are very low occurrences, for now we'll address by assigning the first item in the list to the fighter. Can re-address at a later time. (@TODO)

In [17]:
# This block melts the fighter df such that fighter names are in one column, then gets the mode of both gender and weightclass

df_gender_weightclass = df_fight_all[
    ['FIGHTER_Red', 'FIGHTER_Blue', 'gender', 'weightclass']
].melt(
    id_vars=['gender', 'weightclass'],
    value_vars=['FIGHTER_Red', 'FIGHTER_Blue'],
    value_name='FIGHTER'
).groupby('FIGHTER', as_index=False).agg({
    'gender': pd.Series.mode,
    'weightclass': pd.Series.mode
})

# clean any lists by taking the first item
df_gender_weightclass['gender'] = df_gender_weightclass['gender'].apply(lambda x: x[0] if isinstance(x, np.ndarray) else x)
df_gender_weightclass['weightclass'] = df_gender_weightclass['weightclass'].apply(lambda x: x[0] if isinstance(x, np.ndarray) else x)

print("=====Value counts of gender=====")
print(df_gender_weightclass['gender'].value_counts())

print("\n=====Value counts of weightclass=====")
print(df_gender_weightclass['weightclass'].value_counts())

=====Value counts of gender=====
gender
M    2283
W     254
Name: count, dtype: int64

=====Value counts of weightclass=====
weightclass
Heavyweight      445
Lightweight      435
Welterweight     382
Bantamweight     324
Middleweight     319
Featherweight    249
Flyweight        193
Strawweight      102
Open Weight       83
Catch Weight       5
Name: count, dtype: int64


In [18]:
# Append weightclass and gender to `df_fighter`
df_fighter = pd.merge(
    df_fighter,
    df_gender_weightclass,
    on='FIGHTER'
)

# Summarize augmented df_fighter to get avg metrics by gender & weight
df_fighter_summary = df_fighter.groupby(['gender', 'weightclass'])[['HEIGHT_in',  'WEIGHT_lbs']].median().reset_index()

df_fighter_summary.sort_values(['gender', 'WEIGHT_lbs'])

Unnamed: 0,gender,weightclass,HEIGHT_in,WEIGHT_lbs
3,M,Flyweight,66.0,125.0
0,M,Bantamweight,67.0,135.0
2,M,Featherweight,69.0,145.0
5,M,Lightweight,70.0,155.0
1,M,Catch Weight,72.0,170.0
8,M,Welterweight,71.0,170.0
6,M,Middleweight,72.0,185.0
4,M,Heavyweight,74.0,215.0
7,M,Open Weight,72.0,225.0
12,W,Strawweight,63.0,115.0


We used the median to get the average weight, because there are sometimes high-end outliers, especially in the larger weight divisions. The `WEIGHTCLASS` column in the raw dataset does not distinguish between light heavyweight and heavyweight, so this category is quite broad.

The table above matches almost exactly with [UFC's weight classes](https://www.ufc.com/news/understanding-ufc-weight-classes-and-weigh-ins), especially for men. An alternative, domain-aware imputation method thus also presents itself: use the nominal weights per weight class. So this exercise was either a waste of time, or a useful confirmatory analysis, depending on your perspective. Either way, we'll now use the grouped medians to impute missings.

In [19]:
df_fighter['HEIGHT_in'] = df_fighter.groupby(['gender', 'weightclass'])['HEIGHT_in'].transform(lambda x: x.fillna(x.median()))
df_fighter['WEIGHT_lbs'] = df_fighter.groupby(['gender', 'weightclass'])['WEIGHT_lbs'].transform(lambda x: x.fillna(x.median()))

df_fighter.to_clipboard()

## Reach
To approach imputing reach, I'll start by hypothesizing that it can be reasonably well predicted by other body measurements. Let's see how well it's correlated with height & weight

In [20]:
import seaborn as sns

df_fighter[['HEIGHT_in', 'WEIGHT_lbs', 'REACH_in']].corr()

Unnamed: 0,HEIGHT_in,WEIGHT_lbs,REACH_in
HEIGHT_in,1.0,0.734441,0.889688
WEIGHT_lbs,0.734441,1.0,0.762845
REACH_in,0.889688,0.762845,1.0


The pearson correlation coefficient is high for all three of these, and height. Most likely height is the driver, and weight and reach are just covariates. Still, let's use both to build a linear regressor to impute height.

In [21]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
df_lr = df_fighter[['HEIGHT_in', 'WEIGHT_lbs', 'REACH_in']].dropna()
lr.fit(df_lr[['HEIGHT_in', 'WEIGHT_lbs']], df_lr['REACH_in'])

In [22]:
# Apply the model to impute REACH

df_fighter['REACH_in'] = np.where(
    df_fighter['REACH_in'].isna(),
    lr.predict(df_fighter[['HEIGHT_in', 'WEIGHT_lbs']]), # value if missing
    df_fighter['REACH_in'] # value if not missing
)

## Stance
Stance had a roughly 19% missing rate. Unfortunately, stance is not likely to be a function of gender

In [23]:
df_fighter['STANCE'] = df_fighter['STANCE'].fillna('Orthodox')

df_fighter['STANCE'].value_counts(normalize=True)


STANCE
Orthodox       0.775276
Southpaw       0.164692
Switch         0.056477
Open Stance    0.002370
Sideways       0.001185
Name: proportion, dtype: float64

## Missingness now

In [24]:
df_fighter.isna().mean().sort_values(ascending=False).rename('Missing Rate').to_frame()

Unnamed: 0,Missing Rate
NICKNAME,0.352291
REACH,0.259874
DOB,0.062401
HEIGHT,0.008689
WEIGHT,0.007504
FIRST,0.00316
LAST,0.0
URL,0.0
FIGHTER,0.0
STANCE,0.0


# Create `df` - Append fighter details per red and blue to fight data

In [25]:
# Create a dataframe containing the red and blue fighter tott values for each match
fighter_cols = [
    'FIGHTER', 'STANCE', 'DOB', 'HEIGHT_in', 'WEIGHT_lbs', 'REACH_in'
]

# Append fighter details for Red
df = pd.merge(
    df_fight_all,
    df_fighter[fighter_cols], 
    how='left',
    left_on='FIGHTER_Red',
    right_on='FIGHTER'
).drop(columns=['FIGHTER'])

# # Append fighter details for Blue
df = pd.merge(
    df,
    df_fighter[fighter_cols],
    how='left',
    left_on='FIGHTER_Blue',
    right_on='FIGHTER',
    suffixes=('_Red', '_Blue')
)

## Age at time of fight
We have DOB for both corners and the date of the fight, so we can calculate both ages

In [26]:
df['Age_yrs_Red'] = (df['EVENT_DATE'] - df['DOB_Red']).dt.days / 365.24
df['Age_yrs_Blue'] = (df['EVENT_DATE'] - df['DOB_Blue']).dt.days / 365.24

## Differentials
The disparity in the Red and Blue fighters with respect to the attributes we have could indicate an advantage of one fighter over the other. We'll define all of these by appending a `_diff` to the column names following the exising attribute columns

### Attributes

In [27]:
## Attribute Differentials
df['WEIGHT_lbs_diff'] = df['WEIGHT_lbs_Red'] - df['WEIGHT_lbs_Blue']
df['HEIGHT_in_diff'] = df['HEIGHT_in_Red'] - df['HEIGHT_in_Blue']
df['REACH_in_diff'] = df['REACH_in_Red'] - df['REACH_in_Blue']
df['Age_diff'] = (df['DOB_Red'] - df['DOB_Blue']).dt.days / 365.24

### Stance
What does it mean when a lefty takes on a rightie? Will this have an effect?

In [28]:
df['STANCE_diff'] = df['STANCE_Red'] + '_' + df['STANCE_Blue']

# Example
df[['STANCE_Red', 'STANCE_Blue', 'STANCE_diff']].sample(5)

Unnamed: 0,STANCE_Red,STANCE_Blue,STANCE_diff
1799,Orthodox,Orthodox,Orthodox_Orthodox
4714,Southpaw,Orthodox,Southpaw_Orthodox
1256,Southpaw,Orthodox,Southpaw_Orthodox
5522,Orthodox,Orthodox,Orthodox_Orthodox
3354,Orthodox,Orthodox,Orthodox_Orthodox


## Prior fights

Another clear set of features we can engineer from this data is each fighter's record **prior to** the fight. This feature will be dynamic, since each fighter's record will change for each fight.

To do this, we will need to do a separate analysis of the `df_fight_all` dataset created above, since this contains both fighter names, the outcome and the date. That's all we'll need.

In [29]:
# Create melted table because we need one row per bout per fighter.
# Current table is one row per bout

df_prior_fights = df_fight_all.melt(
    id_vars=['EVENT', 'BOUT', 'Red_Victory', 'EVENT_DATE'],
    value_vars=['FIGHTER_Red', 'FIGHTER_Blue'],
    var_name='Corner',
    value_name='Fighter',
)
df_prior_fights['Corner'] = df_prior_fights['Corner'].str.replace('FIGHTER_', '')

In [30]:
# Was fighter in melted row victorious?

def was_victorious(row):
    if ((row['Corner'] == 'Red') & (row['Red_Victory'] == 1)):
        result = 1
    elif ((row['Corner'] == 'Red') & (row['Red_Victory'] == 0)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 1)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 0)):
        result = 1
    else:
        result = np.nan

    return result

# Apply func to calc the `Victory` column
df_prior_fights['Victory'] = df_prior_fights.apply(lambda row: was_victorious(row), axis=1)

# Confirm
df_prior_fights[['Corner', 'Red_Victory', 'Victory']].value_counts()


Corner  Red_Victory  Victory
Blue    1.0          0.0        5008
Red     1.0          1.0        5008
Blue    0.0          1.0        2702
Red     0.0          0.0        2702
Name: count, dtype: int64

In [31]:
# Simplify prior fights df to show only necessary columns
df_prior_fights.drop(columns=['BOUT', 'Red_Victory', 'Corner'], inplace=True)

df_prior_fights.head()

Unnamed: 0,EVENT,EVENT_DATE,Fighter,Victory
0,UFC Fight Night: Royval vs. Taira,2024-10-12,Brandon Royval,1.0
1,UFC Fight Night: Royval vs. Taira,2024-10-12,Brad Tavares,0.0
2,UFC Fight Night: Royval vs. Taira,2024-10-12,Chidi Njokuani,1.0
3,UFC Fight Night: Royval vs. Taira,2024-10-12,Grant Dawson,1.0
4,UFC Fight Night: Royval vs. Taira,2024-10-12,Daniel Rodriguez,1.0


### Add prior wins, losses and total fights into `df`

In [32]:
import datetime as dt

def get_record(fighter: str, date: dt.datetime) -> tuple:
    """Accept fighter name and date, return record prior to that date"""

    df_prior_fights_sub = df_prior_fights[
        (df_prior_fights['Fighter'] == fighter) &
        (df_prior_fights['EVENT_DATE'] < date)
    ]

    fights = df_prior_fights_sub['Victory'].count()
    victories = df_prior_fights_sub['Victory'].sum()
    losses = fights - victories
    f_win = victories / fights if fights != 0 else 0

    return fights, victories, losses, f_win

get_record('Brandon Royval', pd.to_datetime(['2024-10-12'])[0])

(np.int64(9), np.float64(6.0), np.float64(3.0), np.float64(0.6666666666666666))

In [33]:
# df_test = df.sample(3)

for i, row in df.iterrows():
    event_date = row['EVENT_DATE']
    fighter_red = row['FIGHTER_Red']
    fighter_blue = row['FIGHTER_Blue']

    df.loc[i, ['Red_prior_fights', 'Red_prior_victories', 'Red_prior_losses', 'Red_prior_f_win']] = get_record(fighter_red, event_date)
    df.loc[i, ['Blue_prior_fights', 'Blue_prior_victories', 'Blue_prior_losses', 'Blue_prior_f_win']] = get_record(fighter_blue, event_date)

In [34]:
df.isna().mean().sort_values(ascending=False).rename('Missing Rate').to_frame()

Unnamed: 0,Missing Rate
Age_diff,0.038823
DOB_Blue,0.029434
Age_yrs_Blue,0.029434
Red_Victory,0.017635
DOB_Red,0.01624
Age_yrs_Red,0.01624
REACH_in_diff,0.010784
HEIGHT_in_diff,0.010784
WEIGHT_lbs_diff,0.010784
STANCE_diff,0.010784


# Save ML dataset

In [35]:
df.to_csv('ufc_ml_dataset.csv', index=False)