In [1]:
import pandas as pd
import numpy as np
import math

# Introduction


# Dataset Description

@TODO describe datasets

The five datasets downloaded include:
* ufc_event_details.csv
* ufc_fight_details.csv
* ufc_fight_results.csv
* ufc_fight_stats.csv
* ufc_fighter_details.csv
* ufc_fighter_tott.csv

In [2]:
# Event details: For Date mapping
df_event_details = pd.read_csv(
    'processed data/ufc_event_details.csv',
    na_values="--"
)
df_event_details['EVENT_DATE'] = pd.to_datetime(df_event_details['DATE'])

# Fight results: For Outcome and Bout-Weightclass mapping
df_fight_results = pd.read_csv(
    'processed data/ufc_fight_results.csv',
    na_values="--"
)

# Fight stats: stats per round and fighter for each bout
df_fight_stats = pd.read_csv(
    'processed data/ufc_fight_stats.csv',
    na_values="--"
)

# Fighter stats: Basic fighter stats like height, weight, stance, DOB
df_fighter = pd.read_csv(
    'processed data/ufc_fighter.csv',
    na_values="--"
)
df_fighter['DOB'] = pd.to_datetime(df_fighter['DOB'])


# Clean

## `df_fight_results`
We need to split the `BOUT` column containing the string listing the two fighters into two columns. The order is important here. The first fighter is the Bout's headliner and most likely the higher ranked fighter. This is the red corner, leaving the second mentioned fighter to Blue. Our match prediction will subsequently be probability of a Red victory.

Also we need to create a "winner" column. Both of these must come from the fight results table, because the ordering of the names can be different between the the fight details and fight results tables. Since the winners are specified by order of "W/L" or "L/W", the name ordering matters.

Start by looking at the value counts of the `OUTCOME` column in the fight results df. We see that 64% of the time the red fighter wins, 34% the blue, 1% no contest, and 0.7% was a draw

In [3]:
df_fight_results['OUTCOME'].value_counts(normalize=True)

OUTCOME
W/L      0.638124
L/W      0.344292
NC/NC    0.010194
D/D      0.007390
Name: proportion, dtype: float64

Now we'll create a function that generates our dependent variable, i.e., a boolean flag indicating a red victory. The function maps 'W/L' to 1 (Red win), 'L/W' to 0 (Red loss), and anything else to nan (neither a victory or loss)

In [4]:
def did_red_win(outcome: str) -> bool:
    match outcome:
        case "W/L":
            result = 1 # Red won
        case "L/W":
            result = 0 # Red lost
        case _:
            result = np.nan # Other scenario, like no contest or draw
    return result

df_fight_results['Red_Victory'] = df_fight_results['OUTCOME'].apply(did_red_win)

# Confirm the value counts of red victory matches the outcome column (64% red victory, 34% blue, and 1.7% neither)
df_fight_results['Red_Victory'].value_counts(normalize=True, dropna=False)

Red_Victory
1.0    0.638124
0.0    0.344292
NaN    0.017584
Name: proportion, dtype: float64

## `df_fighter_tott`

### Convert fighter height to inches
Input data contains height in string form as `{feet}' {inches}"`, e.g., `5' 6"` for five feet 6 inches, or 66 inches. Convert to a numeric column (in inches).

In [5]:
# Confirm all columns have this format:
re_pattern = r"\d' \d+\"" # e.g., 5' 10" or 6' 2"

# Will return all `True` if every non-null value follows this pattern.
df_fighter['HEIGHT'].dropna().str.contains(re_pattern, regex=True).value_counts()

HEIGHT
True    3955
Name: count, dtype: int64

Having confirmed the string format, we can create the logic to convert

In [6]:
# Convert fighter height text field to numeric field in inches
def get_inches(feet_and_inches: str) -> float:
    """Convert strings of heights (e.g., 5' 6") to number of inches"""

    if type(feet_and_inches) == str:

        ft_inches_list = feet_and_inches.strip('"').split("' ")

        inches = int(ft_inches_list[0]) * 12 + int(ft_inches_list[1])

        result = inches

    else: # For nan inputs, return nan
        result = np.nan

    return result

df_fighter['HEIGHT_in'] = df_fighter['HEIGHT'].apply(get_inches)

### Convert fighter weights
Convert weight string to float, noting that all weights are expressed as "{weight} lbs."

In [7]:
# Confirm the pattern
df_fighter['WEIGHT'].str.contains(' lbs.').value_counts()

WEIGHT
True    4174
Name: count, dtype: int64

In [8]:
df_fighter['WEIGHT_lbs'] = df_fighter['WEIGHT'].str.strip(' lbs.').astype(float)

### Convert fighter reach
Simpler than height, as the format is just a two-digit number with the inches symbol (") at the end.

In [9]:
# Confirm the pattern. Should be only `True`s
df_fighter['REACH'].dropna().str.contains('"').value_counts()

REACH
True    2331
Name: count, dtype: int64

In [10]:
# Convert to number
df_fighter['REACH_in'] = df_fighter['REACH'].str.strip('"').astype(float)

View sample to confirm transformations

In [11]:
df_fighter.sample(3)

Unnamed: 0,FIRST,LAST,NICKNAME,URL,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,HEIGHT_in,WEIGHT_lbs,REACH_in
1736,Leo,Kuntz,The Lion,http://ufcstats.com/fighter-details/6905d45bd7...,Leo Kuntz,"5' 10""",155 lbs.,,Orthodox,1983-10-03,70.0,155.0,
1626,Jason,Blackford,,http://ufcstats.com/fighter-details/619d807fa5...,Jason Blackford,,,,,NaT,,,
1007,Joe,Lauzon,,http://ufcstats.com/fighter-details/3bad7ef643...,Joe Lauzon,"5' 10""",155 lbs.,"71""",Orthodox,1984-05-22,70.0,155.0,71.0


# JOINS

## Create `df_fight_all` - Append Event info to Fight Results
This is only because we will want the event date to use for feature calculation later, and this only lives in the event details table

In [12]:
from pprint import pprint
# The fight details and fight results tables have some inconsistencies in their spellings.
# However the `URL` field is unique and spelled the same for each. Will use this as a join key.

df_fight_all = pd.merge(
    df_fight_results,
    df_event_details,
    on='EVENT',
    how='left',
    suffixes=['_fight', '_event'],
    indicator=True
)

# Confirm join quality - for all rows the left-hand `EVENT` was matched by the right
print(df_fight_all['_merge'].value_counts(normalize=True))

# Drop indicator column
df_fight_all.drop(columns=['_merge'], inplace=True)

_merge
both          1.0
left_only     0.0
right_only    0.0
Name: proportion, dtype: float64


## Create `df` - Append fighter details per red and blue to fight data

In [13]:
# Create a dataframe containing the red and blue fighter tott values for each match

# Append fighter details for Red
df = pd.merge(
    df_fight_all,
    df_fighter.drop(columns=['HEIGHT', 'WEIGHT', 'REACH', 'URL']), # Drop the pre-converted cols
    how='left',
    left_on='FIGHTER_Red',
    right_on='FIGHTER'
).drop(columns=['FIGHTER'])

# # Append fighter details for Blue
df = pd.merge(
    df,
    df_fighter.drop(columns=['HEIGHT', 'WEIGHT', 'REACH', 'URL']),
    how='left',
    left_on='FIGHTER_Blue',
    right_on='FIGHTER',
    suffixes=('_Red', '_Blue')
)


## Missing Fighter Data

In [14]:
# Missing figher names on join
# In a few cases, there was not a match between fighters. About 98.5% of all fights had both
# fighters matched against the fighters table.

df[['FIGHTER_Red', 'FIGHTER_Blue']].isna().value_counts(normalize=True)

FIGHTER_Red  FIGHTER_Blue
False        False           1.0
Name: proportion, dtype: float64

Missing Data will be further explored in the EDA notebook.

# Feature Engineering
In addition to knowing the basic details about fighters, we also want to know how they compare to each other, and how they compare to others in their own weight division
* Age at time of fight
* Differentials (Red vs Blue) for attributes like height, reach, weight, age
* Number of prior fights
* Number of prior victories


## Age at time of fight
We have DOB for both corners and the date of the fight, so we can calculate both ages

In [15]:
df['Age_yrs_Red'] = (df['EVENT_DATE'] - df['DOB_Red']).dt.days / 365.24
df['Age_yrs_Blue'] = (df['EVENT_DATE'] - df['DOB_Blue']).dt.days / 365.24

## Gender and Weightclass
These can both be derived from the `WEIGHTCLASS` column of `df_fight_all`. We want to merge it into the fighter info so it can be used along with other fighter information.

In [16]:
df_fight_all['WEIGHTCLASS'].value_counts()

WEIGHTCLASS
Lightweight Bout                                                       1298
Welterweight Bout                                                      1238
Middleweight Bout                                                       977
Featherweight Bout                                                      732
Heavyweight Bout                                                        652
                                                                       ... 
Ultimate Fighter China Welterweight Tournament Title Bout                 1
TUF Nations Canada vs. Australia Welterweight Tournament Title Bout       1
Ultimate Fighter Brazil 3 Middleweight Tournament Title Bout              1
Ultimate Fighter Brazil 3 Heavyweight Tournament Title Bout               1
UFC 2 Tournament Title Bout                                               1
Name: count, Length: 109, dtype: int64

There are 109 distinct values for `WEIGHTCLASS`, but there are two usable elements in each string:
1) The weight division is mentioned, following the general pattern '{..}weight'
2) If it's a women's fight, the word 'woman' or 'women' appears, both of which have the pattern 'wom'

Next we'll use these observations to extract a lower cardinality weightclass category along with a 2-class gender variable

In [17]:
import re

# Func to extract weight class from the `WEIGHTCLASS` string values
def get_weightclass(weightclass: str) -> str:
    search = re.search(r'\w+\s?weight', weightclass, flags=re.IGNORECASE)

    return search[0] if search else np.nan

# Extract gender from `WEIGHTCLASS`
df_fight_all['gender'] = df_fight_all['WEIGHTCLASS'].apply(
    lambda x: 'W' if re.search(r'wom', x, flags=re.IGNORECASE) else 'M'
)

# Extract weight class from `WEIGHTCLASS`
df_fight_all['weightclass'] = df_fight_all['WEIGHTCLASS'].apply(
    # lambda x: re.search('\w+weight', x, flags=re.IGNORECASE)[0]
    get_weightclass
)

# Check extractions
pd.crosstab(df_fight_all['weightclass'], df_fight_all['gender'], dropna=False)

gender,M,W
weightclass,Unnamed: 1_level_1,Unnamed: 2_level_1
Bantamweight,681,215
Catch Weight,66,0
Featherweight,760,29
Flyweight,349,233
Heavyweight,1407,0
Lightweight,1346,0
Middleweight,1029,0
Open Weight,101,0
Strawweight,0,316
Welterweight,1301,0


- Gender has no missings (the function didn't allow them), but it is still assumed to be a male fight unless 'wom' appeared
- There is an edge case where 15 men's fights had nan's for `weightclass`. As shown below, none of the original `WEIGHTCLASS` values for those instances gave a clue as to the actual weight class. Note also they are very old.

In [18]:
# Where is `weightclass` null?
df_fight_all[df_fight_all['weightclass'].isna()]['WEIGHTCLASS'].value_counts()

WEIGHTCLASS
UFC Superfight Championship Bout               5
Ultimate Ultimate '96 Tournament Title Bout    1
UFC 10 Tournament Title Bout                   1
UFC 8 Tournament Title Bout                    1
Ultimate Ultimate '95 Tournament Title Bout    1
UFC 7 Tournament Title Bout                    1
UFC 6 Tournament Title Bout                    1
UFC 5 Tournament Title Bout                    1
UFC 4 Tournament Title Bout                    1
UFC 3 Tournament Title Bout                    1
UFC 2 Tournament Title Bout                    1
Name: count, dtype: int64

## Attribute differentials
The disparity in the Red and Blue fighters with respect to the attributes we have could indicate an advantage of one fighter over the other. We'll define all of these by appending a `_diff` to the column names following the exising attribute columns

In [19]:
## Attribute Differentials
df['WEIGHT_lbs_diff'] = df['WEIGHT_lbs_Red'] - df['WEIGHT_lbs_Blue']
df['HEIGHT_in_diff'] = df['HEIGHT_in_Red'] - df['HEIGHT_in_Blue']
df['REACH_in_diff'] = df['REACH_in_Red'] - df['REACH_in_Blue']
df['Age_diff'] = (df['DOB_Red'] - df['DOB_Blue']).dt.days / 365.24

## Stance Differentials
What does it mean when a lefty takes on a rightie? Will this have an effect?

In [20]:
df_fighter['STANCE'].value_counts()

STANCE
Orthodox       2630
Southpaw        585
Switch          206
Open Stance       7
Sideways          3
Name: count, dtype: int64

In [21]:
df['STANCE_diff'] = df['STANCE_Red'] + '_' + df['STANCE_Blue']

# Example
df[['STANCE_Red', 'STANCE_Blue', 'STANCE_diff']].sample(5)

Unnamed: 0,STANCE_Red,STANCE_Blue,STANCE_diff
7469,Southpaw,Southpaw,Southpaw_Southpaw
542,Orthodox,Orthodox,Orthodox_Orthodox
4923,Orthodox,Southpaw,Orthodox_Southpaw
3083,Orthodox,Southpaw,Orthodox_Southpaw
1863,Orthodox,Orthodox,Orthodox_Orthodox


## Prior fights

Another clear set of features we can engineer from this data is each fighter's record **prior to** the fight. This feature will be dynamic, since each fighter's record will change for each fight.

To do this, we will need to do a separate analysis of the `df_fight_all` dataset created above, since this contains both fighter names, the outcome and the date. That's all we'll need.

In [22]:
# Create melted table because we need one row per bout per fighter.
# Current table is one row per bout

df_prior_fights = df_fight_all.melt(
    id_vars=['EVENT', 'BOUT', 'Red_Victory', 'EVENT_DATE'],
    value_vars=['FIGHTER_Red', 'FIGHTER_Blue'],
    var_name='Corner',
    value_name='Fighter',
)
df_prior_fights['Corner'] = df_prior_fights['Corner'].str.replace('FIGHTER_', '')

In [23]:
# Was fighter in melted row victorious?

def was_victorious(row):
    if ((row['Corner'] == 'Red') & (row['Red_Victory'] == 1)):
        result = 1
    elif ((row['Corner'] == 'Red') & (row['Red_Victory'] == 0)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 1)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 0)):
        result = 1
    else:
        result = np.nan

    return result

# Apply func to calc the `Victory` column
df_prior_fights['Victory'] = df_prior_fights.apply(lambda row: was_victorious(row), axis=1)

# Confirm
df_prior_fights[['Corner', 'Red_Victory', 'Victory']].value_counts()


Corner  Red_Victory  Victory
Blue    1.0          0.0        5008
Red     1.0          1.0        5008
Blue    0.0          1.0        2702
Red     0.0          0.0        2702
Name: count, dtype: int64

In [24]:
# Simplify prior fights df to show only necessary columns
df_prior_fights.drop(columns=['BOUT', 'Red_Victory', 'Corner'], inplace=True)

df_prior_fights.head()

Unnamed: 0,EVENT,EVENT_DATE,Fighter,Victory
0,UFC Fight Night: Royval vs. Taira,2024-10-12,Brandon Royval,1.0
1,UFC Fight Night: Royval vs. Taira,2024-10-12,Brad Tavares,0.0
2,UFC Fight Night: Royval vs. Taira,2024-10-12,Chidi Njokuani,1.0
3,UFC Fight Night: Royval vs. Taira,2024-10-12,Grant Dawson,1.0
4,UFC Fight Night: Royval vs. Taira,2024-10-12,Daniel Rodriguez,1.0


### Add prior wins, losses and total fights into `df`

In [None]:
def get_record(fighter: str, date: 'datetime') -> tuple:
    """Accept fighter name and date, return record prior to that date"""

    df_prior_fights_sub = df_prior_fights[
        (df_prior_fights['Fighter'] == fighter) &
        (df_prior_fights['EVENT_DATE'] < date)
    ]

    fights = df_prior_fights_sub['Victory'].count()
    victories = df_prior_fights_sub['Victory'].sum()
    losses = fights - victories
    f_win = victories / fights if fights != 0 else np.nan

    return fights, victories, losses, f_win

get_record('Brandon Royval', pd.to_datetime(['2024-10-12'])[0])

(np.int64(9), np.float64(6.0), np.float64(3.0), np.float64(0.6666666666666666))

In [26]:
# df_test = df.sample(3)

for i, row in df.iterrows():
    event_date = row['EVENT_DATE']
    fighter_red = row['FIGHTER_Red']
    fighter_blue = row['FIGHTER_Blue']

    df.loc[i, ['Red_prior_fights', 'Red_prior_victories', 'Red_prior_losses', 'Red_prior_f_win']] = get_record(fighter_red, event_date)
    df.loc[i, ['Blue_prior_fights', 'Blue_prior_victories', 'Blue_prior_losses', 'Blue_prior_f_win']] = get_record(fighter_blue, event_date)

# Save ML dataset

In [25]:
df.to_csv('ufc_ml_dataset.csv')