In [1]:
import pandas as pd
import numpy as np
import math
pd.set_option('display.max_rows', None)  # Display only 10 rows


# Introduction


# Dataset Description

@TODO describe datasets

The five datasets downloaded include:
* ufc_event_details.csv
* ufc_fight_details.csv
* ufc_fight_results.csv
* ufc_fight_stats.csv
* ufc_fighter_details.csv
* ufc_fighter_tott.csv

In [2]:
# Event details: For Date mapping
df_event_details = pd.read_csv(
    'processed data/ufc_event_details.csv',
    na_values="--"
)
df_event_details['EVENT_DATE'] = pd.to_datetime(df_event_details['DATE'])
df_event_details.drop(columns=['DATE'])

# Fight results: For Outcome and Bout-Weightclass mapping
df_fight_results = pd.read_csv(
    'processed data/ufc_fight_results.csv',
    na_values="--"
)

# Fight stats: stats per round and fighter for each bout
df_fight_stats = pd.read_csv(
    'processed data/ufc_fight_stats.csv',
    na_values=["--", "---"]
)

# Fighter stats: Basic fighter stats like height, weight, stance, DOB
df_fighter = pd.read_csv(
    'processed data/ufc_fighter.csv',
    na_values="--"
)
df_fighter['DOB'] = pd.to_datetime(df_fighter['DOB'])


# Clean

## `df_fight_results`
We need to split the `BOUT` column containing the string listing the two fighters into two columns. The order is important here. The first fighter is the Bout's BODYliner and most likely the higher ranked fighter. This is the red corner, leaving the second mentioned fighter to Blue. Our match prediction will subsequently be probability of a Red victory.

Also we need to create a "winner" column. Both of these must come from the fight results table, because the ordering of the names can be different between the the fight details and fight results tables. Since the winners are specified by order of "W/L" or "L/W", the name ordering matters.

Start by looking at the value counts of the `OUTCOME` column in the fight results df. We see that 64% of the time the red fighter wins, 34% the blue, 1% no contest, and 0.7% was a draw

In [3]:
df_fight_results['OUTCOME'].value_counts(normalize=True)

OUTCOME
W/L      0.638124
L/W      0.344292
NC/NC    0.010194
D/D      0.007390
Name: proportion, dtype: float64

Now we'll create a function that generates our dependent variable, i.e., a boolean flag indicating a red victory. The function maps 'W/L' to 1 (Red win), 'L/W' to 0 (Red loss), and anything else to nan (neither a victory or loss)

In [4]:
def did_red_win(outcome: str) -> bool:
    match outcome:
        case "W/L":
            result = 1 # Red won
        case "L/W":
            result = 0 # Red lost
        case _:
            result = np.nan # Other scenario, like no contest or draw
    return result

df_fight_results['Red_Victory'] = df_fight_results['OUTCOME'].apply(did_red_win)

# Confirm the value counts of red victory matches the outcome column (64% red victory, 34% blue, and 1.7% neither)
df_fight_results['Red_Victory'].value_counts(normalize=True, dropna=False)

Red_Victory
1.0    0.638124
0.0    0.344292
NaN    0.017584
Name: proportion, dtype: float64

## `df_fighter`

### Convert height to inches
Input data contains height in string form as `{feet}' {inches}"`, e.g., `5' 6"` for five feet 6 inches, or 66 inches. Convert to a numeric column (in inches).

In [5]:
# Confirm all columns have this format:
re_pattern = r"\d' \d+\"" # e.g., 5' 10" or 6' 2"

# Will return all `True` if every non-null value follows this pattern.
df_fighter['HEIGHT'].dropna().str.contains(re_pattern, regex=True).value_counts()

HEIGHT
True    3955
Name: count, dtype: int64

Having confirmed the string format, we can create the logic to convert

In [6]:
# Convert fighter height text field to numeric field in inches
def get_inches(feet_and_inches: str) -> float:
    """Convert strings of heights (e.g., 5' 6") to number of inches"""

    if type(feet_and_inches) == str:

        ft_inches_list = feet_and_inches.strip('"').split("' ")

        inches = int(ft_inches_list[0]) * 12 + int(ft_inches_list[1])

        result = inches

    else: # For nan inputs, return nan
        result = np.nan

    return result

df_fighter['HEIGHT_in'] = df_fighter['HEIGHT'].apply(get_inches)

### Convert weights to lbs
Convert weight string to float, noting that all weights are expressed as "{weight} lbs."

In [7]:
# Confirm the pattern
df_fighter['WEIGHT'].str.contains(' lbs.').value_counts()

WEIGHT
True    4174
Name: count, dtype: int64

In [8]:
df_fighter['WEIGHT_lbs'] = df_fighter['WEIGHT'].str.strip(' lbs.').astype(float)

### Convert fighter reach
Simpler than height, as the format is just a two-digit number with the inches symbol (") at the end.

In [9]:
# Confirm the pattern. Should be only `True`s
df_fighter['REACH'].dropna().str.contains('"').value_counts()

REACH
True    2331
Name: count, dtype: int64

In [10]:
# Convert to number
df_fighter['REACH_in'] = df_fighter['REACH'].str.strip('"').astype(float)

View sample to confirm transformations

In [11]:
df_fighter.sample(3)

Unnamed: 0,FIRST,LAST,NICKNAME,URL,FIGHTER,HEIGHT,WEIGHT,REACH,STANCE,DOB,HEIGHT_in,WEIGHT_lbs,REACH_in
1037,Ali,Bagautinov,Puncher,http://ufcstats.com/fighter-details/3dd92ff9fb...,Ali Bagautinov,"5' 4""",125 lbs.,"65""",Orthodox,1985-06-10,64.0,125.0,65.0
1063,Toby,Misech,2 Quick,http://ufcstats.com/fighter-details/3f87218a7a...,Toby Misech,"5' 8""",145 lbs.,,Southpaw,1988-03-07,68.0,145.0,
1617,Julio,Paulino,The Dominican Demon,http://ufcstats.com/fighter-details/60c72f7459...,Julio Paulino,"6' 0""",170 lbs.,,Orthodox,1975-12-04,72.0,170.0,


## `df_fight_stats`
This very stat-rich table will need to be heavily processed to make it usable for feature engineering. It contains 19 columns, four of which are index columns: `EVENT`, `BOUT`, `ROUND` and `FIGHTER`. For instance, the first 5 rows in the table relate to Brandon Royval's stats in the 5 rounds of his fight vs Tatsuro Taira at UFC Fight Night: Royval vs. Taira.

The remaining 15 columns are a series of stats pertaining to that fighter's performance in that round of that fight. The stats are:
* KD: Knockdowns (already numeric :))
* SIG.STR: number of significant strikes (in the form {attempted} of {landed})
* SIG.STR.%: % of significant strikes
* .. @TODO finish


### Create numeric cols from text

In [12]:
import re

def add_cols_from_stat_text(df: pd.DataFrame, stat_col: str) -> pd.DataFrame:

    """
    Adds new columns to a DataFrame based on a given statistical column.
    This function cleans the `stat_col` name, splits the values, and creates new columns to
    represent the attempted and landed values, as well as their ratio.

    Args:
    df: The input DataFrame.
    stat_col: The name of the column containing statistical data in the format "X of Y".

    Returns:
    The DataFrame with three new columns:
        - f`{stat_col_clean}_land`: The landed value from the `stat_col`.
        - f`{stat_col_clean}_att`: The number of attempted from the category given in `stat_col`.
        - f`f_{stat_col_clean}`: The ratio of succeeded to attempted values.

    """

    df = df.copy()

    # Clean incoming col so derived cols will be clean as well
    stat_col_clean = re.sub(r'[.\s+]', '_', stat_col.strip().replace('.', ''))
    
    # Define name of three derived cols
    landed_col, attempted_col, ratio_col = (
        f'{stat_col_clean}_land',
        f'{stat_col_clean}_att',
        f'f_{stat_col_clean}'
    )

    # Create attemped adn succeeded cols by
    df[[landed_col, attempted_col]] = df[stat_col].str.split(' of ', expand=True).astype('float')

    # Create ratio col
    df[ratio_col] = df[landed_col] / df[attempted_col]

    return df 

In [13]:
# Create derived cols from stat text for all fields of that form
stat_cols_text_form = ['SIG.STR.', 'TOTAL STR.', 'TD', 'BODY', 'BODY', 'LEG', 'DISTANCE', 'CLINCH', 'GROUND']

for col in stat_cols_text_form:
    df_fight_stats = add_cols_from_stat_text(df_fight_stats, col)


In [14]:
# Confirm for one type of stat text field
df_fight_stats[df_fight_stats.columns[df_fight_stats.columns.str.contains(r'SIG', regex=True)]].sample()

Unnamed: 0,SIG.STR.,SIG.STR. %,SIGSTR_land,SIGSTR_att,f_SIGSTR
29134,31 of 49,63%,31.0,49.0,0.632653


In [15]:
# Drop the original columns
df_fight_stats.drop(columns=stat_cols_text_form, inplace=True)

### Control time to seconds

In [16]:
# Extract control time `CTRL` as seconds

def get_seconds(formatted_time: str) -> float:
    """
    Converts a formatted time string to seconds.

    Args:
        formatted_time: A string representing time in the format 'minutes:seconds'.

    Returns:
        The equivalent time in seconds, or NaN if the format is invalid.
    """

    # Anything in the wrong format will return nan
    try:
        pieces = formatted_time.split(':')
        return float(pieces[0])*60 + float(pieces[1])
    except Exception as e:
        pass

df_fight_stats['CTRL_sec'] = df_fight_stats['CTRL'].apply(get_seconds)

# ctrl.apply(get_seconds)

### Append fight date

In [17]:
df_fight_stats = pd.merge(
    df_fight_stats,
    df_event_details[['EVENT', 'EVENT_DATE']],
    on='EVENT',
    how='left',
    indicator=True
)
df_fight_stats.drop(columns='_merge', inplace=True)

# Create `df_fight_all` - Append Event info to Fight Results
This is only because we will want the event date to use for feature calculation later, and this only lives in the event details table

In [18]:
from pprint import pprint
# The fight details and fight results tables have some inconsistencies in their spellings.
# However the `URL` field is unique and spelled the same for each. Will use this as a join key.

df_fight_all = pd.merge(
    df_fight_results,
    df_event_details,
    on='EVENT',
    how='left',
    suffixes=['_fight', '_event'],
    indicator=True
)

# Confirm join quality - for all rows the left-hand `EVENT` was matched by the right
print(df_fight_all['_merge'].value_counts(normalize=True))

# Drop indicator column
df_fight_all.drop(columns=['_merge'], inplace=True)

_merge
both          1.0
left_only     0.0
right_only    0.0
Name: proportion, dtype: float64


Missing Data will be further explored in the EDA notebook.

# Gender and Weightclass
These can both be derived from the `WEIGHTCLASS` column of `df_fight_all`. We want to merge it into the fighter info so it can be used along with other fighter information.

In [19]:
df_fight_all['WEIGHTCLASS'].value_counts()

WEIGHTCLASS
Lightweight Bout                                                        1298
Welterweight Bout                                                       1238
Middleweight Bout                                                        977
Featherweight Bout                                                       732
Heavyweight Bout                                                         652
Bantamweight Bout                                                        652
Light Heavyweight Bout                                                   633
Flyweight Bout                                                           325
Women's Strawweight Bout                                                 297
Women's Flyweight Bout                                                   221
Women's Bantamweight Bout                                                195
Open Weight Bout                                                         101
Catch Weight Bout                                               

There are 109 distinct values for `WEIGHTCLASS`, but there are two usable elements in each string:
1) The weight division is mentioned, following the general pattern '{..}weight'
2) If it's a women's fight, the word 'woman' or 'women' appears, both of which have the pattern 'wom'

Next we'll use these observations to extract a lower cardinality weightclass category along with a 2-class gender variable

In [20]:
import re

# Func to extract weight class from the `WEIGHTCLASS` string values
def get_weightclass(weightclass: str) -> str:
    search = re.search(r'\w+\s?weight', weightclass, flags=re.IGNORECASE)

    return search[0] if search else np.nan

# Extract gender from `WEIGHTCLASS`
df_fight_all['gender'] = df_fight_all['WEIGHTCLASS'].apply(
    lambda x: 'W' if re.search(r'wom', x, flags=re.IGNORECASE) else 'M'
)

# Extract weight class from `WEIGHTCLASS`
df_fight_all['weightclass'] = df_fight_all['WEIGHTCLASS'].apply(
    # lambda x: re.search('\w+weight', x, flags=re.IGNORECASE)[0]
    get_weightclass
)

# Check extractions
pd.crosstab(df_fight_all['weightclass'], df_fight_all['gender'], dropna=False)

gender,M,W
weightclass,Unnamed: 1_level_1,Unnamed: 2_level_1
Bantamweight,681,215
Catch Weight,66,0
Featherweight,760,29
Flyweight,349,233
Heavyweight,1407,0
Lightweight,1346,0
Middleweight,1029,0
Open Weight,101,0
Strawweight,0,316
Welterweight,1301,0


- Gender has no missings (the function didn't allow them), but it is still assumed to be a male fight unless 'wom' appeared
- There is an edge case where 15 men's fights had nan's for `weightclass`. As shown below, none of the original `WEIGHTCLASS` values for those instances gave a clue as to the actual weight class. Note also they are very old.

In [21]:
# Where is `weightclass` null?
df_fight_all[df_fight_all['weightclass'].isna()]['WEIGHTCLASS'].value_counts()

WEIGHTCLASS
UFC Superfight Championship Bout               5
Ultimate Ultimate '96 Tournament Title Bout    1
UFC 10 Tournament Title Bout                   1
UFC 8 Tournament Title Bout                    1
Ultimate Ultimate '95 Tournament Title Bout    1
UFC 7 Tournament Title Bout                    1
UFC 6 Tournament Title Bout                    1
UFC 5 Tournament Title Bout                    1
UFC 4 Tournament Title Bout                    1
UFC 3 Tournament Title Bout                    1
UFC 2 Tournament Title Bout                    1
Name: count, dtype: int64

# Imputing Fighter Gaps
Before saving our final ML dataset the missingness of this dataset needs to be addressed. This table will ultimately be merged into the ML dataset, so its missingness could have a big impact on that of the final ML dataset. There are cleverer ways of imputing these missings than simply doing a flat imputation on the final dataset. If we impute fighter stats prior to merging, we can take advantage of things like grouped averaged by gender and weightclass, providing a more accurate picture of proper imputed values.

But to start, what are the missing rates for `df_fighter`?

In [22]:
# df of missing rates
df_fighter.isna().mean().sort_values(ascending=False).rename('Missing Rate').to_frame()

Unnamed: 0,Missing Rate
REACH,0.452817
REACH_in,0.452817
NICKNAME,0.448592
STANCE,0.194601
DOB,0.17723
HEIGHT,0.071596
HEIGHT_in,0.071596
WEIGHT,0.020188
WEIGHT_lbs,0.020188
FIRST,0.003521


If we discount `NICKNAME` (probably useless as a feature), then four features have higher than a 5% missing rate. We want to 

## Height & Weight

There are a few edge cases where a 2-item list is the mode, indicating more than one weightclass or gender per fighter. Since these are very low occurrences, for now we'll address by assigning the first item in the list to the fighter. Can re-address at a later time. (@TODO)

In [23]:
# This block melts the fighter df such that fighter names are in one column, then gets the mode of both gender and weightclass

df_gender_weightclass = df_fight_all[
    ['FIGHTER_Red', 'FIGHTER_Blue', 'gender', 'weightclass']
].melt(
    id_vars=['gender', 'weightclass'],
    value_vars=['FIGHTER_Red', 'FIGHTER_Blue'],
    value_name='FIGHTER'
).groupby('FIGHTER', as_index=False).agg({
    'gender': pd.Series.mode,
    'weightclass': pd.Series.mode
})

# clean any lists by taking the first item
df_gender_weightclass['gender'] = df_gender_weightclass['gender'].apply(lambda x: x[0] if isinstance(x, np.ndarray) else x)
df_gender_weightclass['weightclass'] = df_gender_weightclass['weightclass'].apply(lambda x: x[0] if isinstance(x, np.ndarray) else x)

print("=====Value counts of gender=====")
print(df_gender_weightclass['gender'].value_counts())

print("\n=====Value counts of weightclass=====")
print(df_gender_weightclass['weightclass'].value_counts())

=====Value counts of gender=====
gender
M    2283
W     254
Name: count, dtype: int64

=====Value counts of weightclass=====
weightclass
Heavyweight      445
Lightweight      435
Welterweight     382
Bantamweight     324
Middleweight     319
Featherweight    249
Flyweight        193
Strawweight      102
Open Weight       83
Catch Weight       5
Name: count, dtype: int64


In [24]:
# Append weightclass and gender to `df_fighter`
df_fighter = pd.merge(
    df_fighter,
    df_gender_weightclass,
    on='FIGHTER'
)

# Summarize augmented df_fighter to get avg metrics by gender & weight
df_fighter_summary = df_fighter.groupby(['gender', 'weightclass'])[['HEIGHT_in',  'WEIGHT_lbs']].median().reset_index()

df_fighter_summary.sort_values(['gender', 'WEIGHT_lbs'])

Unnamed: 0,gender,weightclass,HEIGHT_in,WEIGHT_lbs
3,M,Flyweight,66.0,125.0
0,M,Bantamweight,67.0,135.0
2,M,Featherweight,69.0,145.0
5,M,Lightweight,70.0,155.0
1,M,Catch Weight,72.0,170.0
8,M,Welterweight,71.0,170.0
6,M,Middleweight,72.0,185.0
4,M,Heavyweight,74.0,215.0
7,M,Open Weight,72.0,225.0
12,W,Strawweight,63.0,115.0


We used the median to get the average weight, because there are sometimes high-end outliers, especially in the larger weight divisions. The `WEIGHTCLASS` column in the raw dataset does not distinguish between light heavyweight and heavyweight, so this category is quite broad.

The table above matches almost exactly with [UFC's weight classes](https://www.ufc.com/news/understanding-ufc-weight-classes-and-weigh-ins), especially for men. An alternative, domain-aware imputation method thus also presents itself: use the nominal weights per weight class. So this exercise was either a waste of time, or a useful confirmatory analysis, depending on your perspective. Either way, we'll now use the grouped medians to impute missings.

In [25]:
df_fighter['HEIGHT_in'] = df_fighter.groupby(['gender', 'weightclass'])['HEIGHT_in'].transform(lambda x: x.fillna(x.median()))
df_fighter['WEIGHT_lbs'] = df_fighter.groupby(['gender', 'weightclass'])['WEIGHT_lbs'].transform(lambda x: x.fillna(x.median()))

df_fighter.to_clipboard()

## Reach
To approach imputing reach, I'll start by hypothesizing that it can be reasonably well predicted by other body measurements. Let's see how well it's correlated with height & weight

In [26]:
import seaborn as sns

df_fighter[['HEIGHT_in', 'WEIGHT_lbs', 'REACH_in']].corr()

Unnamed: 0,HEIGHT_in,WEIGHT_lbs,REACH_in
HEIGHT_in,1.0,0.734441,0.889688
WEIGHT_lbs,0.734441,1.0,0.762845
REACH_in,0.889688,0.762845,1.0


The pearson correlation coefficient is high for all three of these, and height. Most likely height is the driver, and weight and reach are just covariates. Still, let's use both to build a linear regressor to impute height.

In [27]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
df_lr = df_fighter[['HEIGHT_in', 'WEIGHT_lbs', 'REACH_in']].dropna()
lr.fit(df_lr[['HEIGHT_in', 'WEIGHT_lbs']], df_lr['REACH_in'])

In [28]:
# Apply the model to impute REACH

df_fighter['REACH_in'] = np.where(
    df_fighter['REACH_in'].isna(),
    lr.predict(df_fighter[['HEIGHT_in', 'WEIGHT_lbs']]), # value if missing
    df_fighter['REACH_in'] # value if not missing
)

## Stance
Stance had a roughly 19% missing rate. Unfortunately, stance is not likely to be a function of gender

In [29]:
df_fighter['STANCE'] = df_fighter['STANCE'].fillna('Orthodox')

df_fighter['STANCE'].value_counts(normalize=True)


STANCE
Orthodox       0.775276
Southpaw       0.164692
Switch         0.056477
Open Stance    0.002370
Sideways       0.001185
Name: proportion, dtype: float64

## Missingness now

In [30]:
df_fighter.isna().mean().sort_values(ascending=False).rename('Missing Rate').to_frame()

Unnamed: 0,Missing Rate
NICKNAME,0.352291
REACH,0.259874
DOB,0.062401
HEIGHT,0.008689
WEIGHT,0.007504
FIRST,0.00316
LAST,0.0
URL,0.0
FIGHTER,0.0
STANCE,0.0


# Create `df` - Append fighter details per red and blue to fight data

In [31]:
# Create a dataframe containing the red and blue fighter tott values for each match
fighter_cols = [
    'FIGHTER', 'STANCE', 'DOB', 'HEIGHT_in', 'WEIGHT_lbs', 'REACH_in'
]

# Append fighter details for Red
df = pd.merge(
    df_fight_all,
    df_fighter[fighter_cols], 
    how='left',
    left_on='FIGHTER_Red',
    right_on='FIGHTER'
).drop(columns=['FIGHTER'])

# # Append fighter details for Blue
df = pd.merge(
    df,
    df_fighter[fighter_cols],
    how='left',
    left_on='FIGHTER_Blue',
    right_on='FIGHTER',
    suffixes=('_Red', '_Blue')
)

## Age at time of fight
We have DOB for both corners and the date of the fight, so we can calculate both ages

In [32]:
df['Age_yrs_Red'] = (df['EVENT_DATE'] - df['DOB_Red']).dt.days / 365.24
df['Age_yrs_Blue'] = (df['EVENT_DATE'] - df['DOB_Blue']).dt.days / 365.24

## Differentials
The disparity in the Red and Blue fighters with respect to the attributes we have could indicate an advantage of one fighter over the other. We'll define all of these by appending a `_diff` to the column names following the exising attribute columns

### Attributes

In [33]:
## Attribute Differentials
df['WEIGHT_lbs_diff'] = df['WEIGHT_lbs_Red'] - df['WEIGHT_lbs_Blue']
df['HEIGHT_in_diff'] = df['HEIGHT_in_Red'] - df['HEIGHT_in_Blue']
df['REACH_in_diff'] = df['REACH_in_Red'] - df['REACH_in_Blue']
df['Age_diff'] = (df['DOB_Red'] - df['DOB_Blue']).dt.days / 365.24

### Stance
What does it mean when a lefty takes on a rightie? Will this have an effect?

In [34]:
df['STANCE_diff'] = df['STANCE_Red'] + '_' + df['STANCE_Blue']

# Example
df[['STANCE_Red', 'STANCE_Blue', 'STANCE_diff']].sample(5)

Unnamed: 0,STANCE_Red,STANCE_Blue,STANCE_diff
5830,Orthodox,Orthodox,Orthodox_Orthodox
4847,Orthodox,Southpaw,Orthodox_Southpaw
4033,Orthodox,Orthodox,Orthodox_Orthodox
1116,Orthodox,Orthodox,Orthodox_Orthodox
5227,Orthodox,Orthodox,Orthodox_Orthodox


## Prior fights

Another clear set of features we can engineer from this data is each fighter's record **prior to** the fight. This feature will be dynamic, since each fighter's record will change for each fight.

To do this, we will need to do a separate analysis of the `df_fight_all` dataset created above, since this contains both fighter names, the outcome and the date. That's all we'll need.

In [35]:
# Create melted table because we need one row per bout per fighter.
# Current table is one row per bout

df_prior_fights = df_fight_all.melt(
    id_vars=['EVENT', 'BOUT', 'Red_Victory', 'EVENT_DATE'],
    value_vars=['FIGHTER_Red', 'FIGHTER_Blue'],
    var_name='Corner',
    value_name='Fighter',
)
df_prior_fights['Corner'] = df_prior_fights['Corner'].str.replace('FIGHTER_', '')

In [36]:
# Was fighter in melted row victorious?

def was_victorious(row):
    if ((row['Corner'] == 'Red') & (row['Red_Victory'] == 1)):
        result = 1
    elif ((row['Corner'] == 'Red') & (row['Red_Victory'] == 0)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 1)):
        result = 0
    elif ((row['Corner'] == 'Blue') & (row['Red_Victory'] == 0)):
        result = 1
    else:
        result = np.nan

    return result

# Apply func to calc the `Victory` column
df_prior_fights['Victory'] = df_prior_fights.apply(lambda row: was_victorious(row), axis=1)

# Confirm
df_prior_fights[['Corner', 'Red_Victory', 'Victory']].value_counts()


Corner  Red_Victory  Victory
Blue    1.0          0.0        5008
Red     1.0          1.0        5008
Blue    0.0          1.0        2702
Red     0.0          0.0        2702
Name: count, dtype: int64

In [37]:
# Simplify prior fights df to show only necessary columns
df_prior_fights.drop(columns=['BOUT', 'Red_Victory', 'Corner'], inplace=True)

### Add prior wins, losses and total fights into `df`

In [38]:
import datetime as dt

def get_record(fighter: str, date: dt.datetime) -> tuple:
    """Accept fighter name and date, return record prior to that date"""

    df_prior_fights_sub = df_prior_fights[
        (df_prior_fights['Fighter'] == fighter) &
        (df_prior_fights['EVENT_DATE'] < date)
    ]

    fights = df_prior_fights_sub['Victory'].count()
    victories = df_prior_fights_sub['Victory'].sum()
    losses = fights - victories
    f_win = victories / fights if fights != 0 else 0

    return fights, victories, losses, f_win

get_record('Brandon Royval', pd.to_datetime(['2024-10-12'])[0])

(np.int64(9), np.float64(6.0), np.float64(3.0), np.float64(0.6666666666666666))

In [None]:
# Apply prior record to fighters in df
for i, row in df.iterrows():
    event_date = row['EVENT_DATE']
    fighter_red = row['FIGHTER_Red']
    fighter_blue = row['FIGHTER_Blue']

    df.loc[i, ['Red_prior_fights', 'Red_prior_victories', 'Red_prior_losses', 'Red_prior_f_win']] = get_record(fighter_red, event_date)
    df.loc[i, ['Blue_prior_fights', 'Blue_prior_victories', 'Blue_prior_losses', 'Blue_prior_f_win']] = get_record(fighter_blue, event_date)

In [40]:
df.isna().mean().sort_values(ascending=False).rename('Missing Rate').to_frame()

Unnamed: 0,Missing Rate
Age_diff,0.038823
DOB_Blue,0.029434
Age_yrs_Blue,0.029434
Red_Victory,0.017635
DOB_Red,0.01624
Age_yrs_Red,0.01624
REACH_in_diff,0.010784
HEIGHT_in_diff,0.010784
WEIGHT_lbs_diff,0.010784
STANCE_diff,0.010784


## Prior stats

In [35]:
# Build agg dict and use to summarize fighter's stats

def summarize_fighter_stats(
        df: pd.DataFrame, fighter: str=None, date: dt.datetime=None, grouper: str='BOUT'
    ) -> pd.DataFrame:
    """
    Summarizes a fighter's statistics up to a specific date.
    This function filters the DataFrame for the specified fighter and date,
    calculates various statistics (sums and averages), and groups the results
    by the specified grouper.

    Args:
    df: Only works with `df_fight_stats` when cleaned as per this NB.
    fighter: The name of the fighter to summarize.
    date: The cutoff date for the summary.
    grouper: The column to group the statistics by (default: 'BOUT').

    Returns:
    A DataFrame with summarized statistics for the fighter, grouped by the specified grouper.
    If fighter-wide stats requested, add add'l columns denoting totals/averages per round

    Example:
        summarize_fighter_stats(
            df_fight_stats,
            'Conor McGregor',
            pd.to_datetime('2013-08-30'),
        )
    """
    
    # Only filter if fighter is not None
    if fighter:
        # Filter conditions
        fighter_match = (df['FIGHTER'] == fighter)
        date_match = (df['EVENT_DATE'] < date)

        # Apply conditions to subset df
        df = df[fighter_match & date_match]

    # Build agg dict to direct which cols are sum'd vs avg'd
    agg_dict={}
    for col in df.select_dtypes('number').columns:
        if 'f_' in col:
            agg_tuple = (col, 'mean')
        else:
            agg_tuple = (col, 'sum')
        agg_dict[col] = agg_tuple

    # Add round count metric, so stats can be normalized by it
    agg_dict['rounds'] = ('ROUND', 'nunique')
    agg_dict['fights'] = ('BOUT', 'nunique')
    
    # Add date
    agg_dict['EVENT_DATE'] = ('EVENT_DATE', 'mean')

    fighter_stats = df.groupby(grouper, as_index=False).agg(**agg_dict).sort_values('EVENT_DATE')    

    # If fighter-wide stats requested, add add'l columns denoting totals/averages per round
    if grouper == 'FIGHTER':
        for col in fighter_stats.select_dtypes('number').columns:
            fighter_stats[f'{col}_per_round'] = fighter_stats[col] / fighter_stats['rounds']
    
    return fighter_stats

#### Test this with one fighter - Jon Jones

In [58]:
# Summary of Jon Jones' fights up to the Matt Hammill fight

fighter = 'Jon Jones'
date = pd.to_datetime('2009-08-30')

summarize_fighter_stats(
    df_fight_stats,
    fighter,
    date,
    grouper='FIGHTER'
)

Unnamed: 0,FIGHTER,KD,SUB.ATT,REV.,SIGSTR_land,SIGSTR_att,f_SIGSTR,TOTAL_STR_land,TOTAL_STR_att,f_TOTAL_STR,...,f_DISTANCE_per_round,CLINCH_land_per_round,CLINCH_att_per_round,f_CLINCH_per_round,GROUND_land_per_round,GROUND_att_per_round,f_GROUND_per_round,CTRL_sec_per_round,rounds_per_round,fights_per_round
0,Jon Jones,1.0,1.0,0.0,112.0,251.0,0.488404,163.0,305.0,0.560609,...,0.115851,16.333333,26.333333,0.242828,4.333333,5.666667,0.283333,179.666667,1.0,1.0


### Apply fighter stats to ML df

In [121]:
from tqdm import tqdm

def summarize_and_concat_stats(df, df_fight_stats):
    """Summarizes fighter stats and concatenates them to the original DataFrame.

    Args:
        df (pd.DataFrame): The original DataFrame containing fight information.
        df_fight_stats (pd.DataFrame): The DataFrame containing detailed fighter statistics.
        grouper (str, optional): The column to group by when summarizing fighter stats. Default 'FIGHTER'.

    Returns:
        pd.DataFrame: A new DataFrame with additional columns for summarized fighter stats.
    """

    result_dict = {}
    for _, row in tqdm(df.iterrows(), total=len(df.index)):

        # Error handling to account for fighter-time combinations when no prior fight data exists
        # Results in an empty red_, blue_stats dataframe, so also create a flag to denote
        # previously unknown fighters.

        try:
            red_stats = summarize_fighter_stats(
                df_fight_stats,
                row['FIGHTER_Red'],
                row['EVENT_DATE']
            ).select_dtypes('number').rename(columns=lambda x: f'Red_{x}').iloc[0]

            red_stats['Red_Unknown'] = 0

        except IndexError:
            red_stats = pd.Series({'Red_Unknown': 1})

        try:
            blue_stats = summarize_fighter_stats(
                df_fight_stats,
                row['FIGHTER_Blue'],
                row['EVENT_DATE']
            ).select_dtypes('number').rename(columns=lambda x: f'Blue_{x}').iloc[0]

            blue_stats['Blue_Unknown'] = 0
    
        except IndexError:
            blue_stats = pd.Series({'Blue_Unknown': 1})

        result_dict[_] = pd.concat([row, red_stats, blue_stats])

    return pd.DataFrame.from_dict(result_dict, orient='index')

new_df = summarize_and_concat_stats(df, df_fight_stats)

100%|██████████| 7882/7882 [01:22<00:00, 95.09it/s]


In [119]:
pd.Series({'No Name': 'Yes'})

No Name    Yes
dtype: object

# Save ML dataset

In [122]:
df.to_csv('ufc_ml_dataset_2024-11-14.csv')