In [63]:
import re
import numpy as np
import pandas as pd

Our dataset:

In [64]:
# Per-fight statistics; the CSV uses ';' as its field separator
fights_stats = pd.read_csv(
    "../stats/stats.csv",
    sep=";",
)
fights_stats.tail()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_body_pct,blue_fighter_sig_str_body_pct,red_fighter_sig_str_leg_pct,blue_fighter_sig_str_leg_pct,red_fighter_sig_str_distance_pct,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct
7749,ORLANDO WIET,ROBERT LUCARELLI,11/03/1994,The Gladiator,-,W,L,KO/TKO,1,2:50,...,12,0,0,50,12,100,0,0,87,0
7750,FRANK HAMAKER,THADDEUS LUSTER,11/03/1994,-,-,W,L,Submission,1,4:52,...,0,0,0,0,50,0,0,0,50,0
7751,JOHNNY RHODES,DAVID LEVICKI,11/03/1994,-,-,W,L,KO/TKO,1,12:13,...,9,0,9,0,9,25,9,50,81,25
7752,PATRICK SMITH,RAY WIZARD,11/03/1994,-,-,W,L,Submission,1,0:58,...,100,0,0,100,0,100,100,0,0,0
7753,SCOTT MORRIS,SEAN DAUGHERTY,11/03/1994,-,-,W,L,Submission,1,0:20,...,0,0,0,0,0,0,100,0,0,0


# Merge additional athlete-based features

Let's merge additional athlete-based features (*Height*, *Reach* and *Stance*) from an external dataset.

In [65]:
# External, comma-separated table with one row of attributes per fighter
athlete_stats = pd.read_csv(
    "raw_fighter_details.csv",
    sep=",",
)
athlete_stats.tail()

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg
3591,Zhang Tiequan,"5' 8""",155 lbs.,"69""",Orthodox,"Jul 25, 1978",1.23,36%,2.14,51%,1.95,58%,75%,3.4
3592,Alex Zuniga,,145 lbs.,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
3593,George Zuniga,"5' 9""",185 lbs.,,,,7.64,38%,5.45,37%,0.0,0%,100%,0.0
3594,Allan Zuniga,"5' 7""",155 lbs.,"70""",Orthodox,"Apr 04, 1992",3.93,52%,1.8,61%,0.0,0%,57%,1.0
3595,Virgil Zwicker,"6' 2""",205 lbs.,"74""",,"Jun 26, 1982",3.34,48%,4.87,39%,1.31,30%,50%,0.0


## External dataset feature engineering

We leave only the features we are interested in:

In [66]:
# Keep the join key plus the three attributes we want to attach to each fight
athlete_stats = athlete_stats.loc[
    :,
    [
        "fighter_name",
        "Height",
        "Reach",
        "Stance",
    ],
]
athlete_stats.tail()

Unnamed: 0,fighter_name,Height,Reach,Stance
3591,Zhang Tiequan,"5' 8""","69""",Orthodox
3592,Alex Zuniga,,,
3593,George Zuniga,"5' 9""",,
3594,Allan Zuniga,"5' 7""","70""",Orthodox
3595,Virgil Zwicker,"6' 2""","74""",


### Dropping NaNs

Dropping rows that carry no information at all (height, reach and stance are all missing):

In [67]:
# Remove fighters for whom all three attributes are missing;
# rows with at least one known attribute are kept
athlete_stats.dropna(
    subset=["Height", "Reach", "Stance"],
    how="all",
    inplace=True,
)
athlete_stats.shape

(3363, 4)

Imputing the remaining NaNs with zeros:

In [68]:
# Use 0 as the "unknown" marker in every remaining gap.
# NOTE(review): this also writes the integer 0 into the text column Stance,
# so that column ends up with mixed types - confirm this is intended.
athlete_stats.fillna(value=0, inplace=True)
# Sanity check: per-column count of values that are still missing
athlete_stats.isna().sum()

fighter_name    0
Height          0
Reach           0
Stance          0
dtype: int64

Convert external dataset fighter names to uppercase, and columns to lowercase to match our format:

In [69]:
# Align with our own dataset: fighter names in upper case ...
athlete_stats['fighter_name'] = athlete_stats['fighter_name'].str.upper()
# ... and column labels in lower case
athlete_stats.columns = [label.lower() for label in athlete_stats.columns]
athlete_stats.head()

Unnamed: 0,fighter_name,height,reach,stance
1,PAPY ABEDI,"5' 11""",0,Southpaw
2,SHAMIL ABDURAKHIMOV,"6' 3""","76""",Orthodox
3,DANNY ABBADI,"5' 11""",0,Orthodox
4,HIROYUKI ABE,"5' 6""",0,Orthodox
5,RICARDO ABREU,"5' 11""",0,Orthodox


Defining a function that converts height (given in feet and inches) and reach (given in inches) to cm:

In [70]:
def conv_from_inches_to_cm(inches):
    """Convert an imperial height or reach string to centimetres.

    Accepted formats:
      * height in feet and inches, e.g. ``5' 11"``; the inches part may be
        absent (``6'``), in which case it counts as 0 inches
      * reach in inches only, e.g. ``76"``

    Missing data is passed through untouched: NaN/None and falsy
    placeholders (``0``, ``""``) are returned as they came in.

    Returns:
        float: the length in centimetres, or the original value when the
        input is missing.

    Raises:
        ValueError: if a feet or inches component is not an integer.
        AttributeError: if a non-missing, non-string value is passed.
    """
    # Missing data: NaN/None, or the 0 placeholder written by fillna
    if pd.isna(inches) or not inches:
        return inches

    # Foot to cm conversion rate
    f_cm_cr = 30.48
    # Inch to cm conversion rate
    i_cm_cr = 2.54

    # Drop surrounding whitespace and the inch mark, then split on the foot mark
    res = inches.strip().strip('"').split("'")

    # Height: "<feet>' <inches>"
    if len(res) > 1:
        # A height such as 6' has nothing after the foot mark
        inch_part = res[1].strip()
        extra_inches = int(inch_part) if inch_part else 0
        return int(res[0]) * f_cm_cr + extra_inches * i_cm_cr

    # Reach: inches only
    return int(res[0]) * i_cm_cr

Applying:

In [71]:
# Height: feet-and-inches strings -> cm
athlete_stats['height'] = athlete_stats['height'].map(conv_from_inches_to_cm)
# Reach: inch strings -> cm
athlete_stats['reach'] = athlete_stats['reach'].map(conv_from_inches_to_cm)
# Inspect the converted columns
athlete_stats.head()

Unnamed: 0,fighter_name,height,reach,stance
1,PAPY ABEDI,180.34,0.0,Southpaw
2,SHAMIL ABDURAKHIMOV,190.5,193.04,Orthodox
3,DANNY ABBADI,180.34,0.0,Orthodox
4,HIROYUKI ABE,167.64,0.0,Orthodox
5,RICARDO ABREU,180.34,0.0,Orthodox


Merging:

In [72]:
# Attach the red corner's attributes.
# The join is an inner join (the pandas default), so fights whose red fighter
# has no row in athlete_stats are dropped from fights_stats.
fights_stats = (
    pd.merge(
        left=fights_stats,
        right=athlete_stats,
        how='inner',
        left_on='red_fighter_name',
        right_on='fighter_name',
    )
    .rename(columns={
        'height': 'red_fighter_height',
        'reach': 'red_fighter_reach',
        'stance': 'red_fighter_stance',
    })
    # The join key from athlete_stats duplicates red_fighter_name
    .drop(columns='fighter_name')
)

In [73]:
# Attach the blue corner's attributes.
# The join is an inner join (the pandas default), so fights whose blue fighter
# has no row in athlete_stats are dropped from fights_stats.
fights_stats = (
    pd.merge(
        left=fights_stats,
        right=athlete_stats,
        how='inner',
        left_on='blue_fighter_name',
        right_on='fighter_name',
    )
    .rename(columns={
        'height': 'blue_fighter_height',
        'reach': 'blue_fighter_reach',
        'stance': 'blue_fighter_stance',
    })
    # The join key from athlete_stats duplicates blue_fighter_name
    .drop(columns='fighter_name')
)

In [74]:
# Preview the merged frame: each fight now carries height, reach and stance for both corners
fights_stats.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct,red_fighter_height,red_fighter_reach,red_fighter_stance,blue_fighter_height,blue_fighter_reach,blue_fighter_stance
0,ILIA TOPURIA,MAX HOLLOWAY,26/10/2024,El Matador,Blessed,W,L,KO/TKO,3,1:34,...,0,0,5,0,170.18,175.26,Orthodox,180.34,175.26,Orthodox
1,ROBERT WHITTAKER,KHAMZAT CHIMAEV,26/10/2024,The Reaper,Borz,L,W,Submission,1,3:34,...,0,0,0,100,182.88,185.42,Orthodox,187.96,190.5,Orthodox
2,MAGOMED ANKALAEV,ALEKSANDAR RAKIC,26/10/2024,-,Rocket,W,L,Decision - Unanimous,3,5:00,...,9,5,0,0,190.5,190.5,Orthodox,193.04,198.12,Orthodox
3,LERONE MURPHY,DAN IGE,26/10/2024,The Miracle,50K,W,L,Decision - Unanimous,3,5:00,...,23,13,5,17,175.26,185.42,Orthodox,170.18,180.34,Orthodox
4,GEOFF NEAL,RAFAEL DOS ANJOS,26/10/2024,Handz of Steel,-,W,L,KO/TKO,1,1:30,...,0,0,27,0,180.34,190.5,Southpaw,172.72,177.8,Southpaw


# Data preprocessing

Creating a single **winner** feature instead of two **red_fighter_result** and **blue_fighter_result** features:

In [3]:
# Label the winning corner: 'red' when the red fighter's result is 'W', otherwise 'blue'.
# NOTE(review): every non-'W' red result (e.g. a draw or no-contest, if the data
# contains any) is labelled 'blue' - confirm that is intended.
# NOTE(review): `df` is not created in the cells shown above - presumably it is
# loaded in an earlier cell; verify before running top to bottom.
df.loc[:, 'winner'] = np.where(df['red_fighter_result'] == 'W', 'red', 'blue')

In [4]:
# Quick look at the newly created winner label
df['winner'].head()

0     red
1    blue
2     red
3     red
4     red
Name: winner, dtype: object

Changing columns from **red/blue+feature name** to **winner/loser+feature name**:

In [5]:
def assign_winner_n_loser(df, columns_to_change, winner_col='winner'):
    """Add winner_*/loser_* columns derived from the red_*/blue_* columns.

    For every entry ``col`` of ``columns_to_change`` the frame must hold
    ``red_<col>`` and ``blue_<col>``. Two columns are written, named after
    ``col`` with the ``fighter_`` part removed:

      * ``winner_<name>``: the red value where ``winner_col`` is 'red',
        the blue value everywhere else
      * ``loser_<name>``: the red value where ``winner_col`` is 'blue',
        the blue value everywhere else

    The frame is modified in place and also returned.
    """
    for col in columns_to_change:
        stat = col.replace('fighter_', '')
        red_values = df[f"red_{col}"]
        blue_values = df[f"blue_{col}"]

        red_won = df[winner_col] == 'red'
        df.loc[:, f"winner_{stat}"] = red_values.where(red_won, blue_values)

        blue_won = df[winner_col] == 'blue'
        df.loc[:, f"loser_{stat}"] = red_values.where(blue_won, blue_values)

    return df

In [6]:
# Per-corner columns (named without their red_/blue_ prefix) to re-express as winner/loser
cols_to_change = [
    # identity
    'fighter_name', 'fighter_nickname',
    # headline totals
    'fighter_KD', 'fighter_sig_str', 'fighter_sig_str_pct', 'fighter_total_str',
    # grappling
    'fighter_TD', 'fighter_TD_pct', 'fighter_sub_att', 'fighter_rev', 'fighter_ctrl',
    # significant strikes by target and position
    'fighter_sig_str_head', 'fighter_sig_str_body', 'fighter_sig_str_leg',
    'fighter_sig_str_distance', 'fighter_sig_str_clinch', 'fighter_sig_str_ground',
    # the same breakdown as percentages
    'fighter_sig_str_head_pct', 'fighter_sig_str_body_pct', 'fighter_sig_str_leg_pct',
    'fighter_sig_str_distance_pct', 'fighter_sig_str_clinch_pct', 'fighter_sig_str_ground_pct',
]

df = assign_winner_n_loser(df, columns_to_change=cols_to_change)
df.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,winner_sig_str_body_pct,loser_sig_str_body_pct,winner_sig_str_leg_pct,loser_sig_str_leg_pct,winner_sig_str_distance_pct,loser_sig_str_distance_pct,winner_sig_str_clinch_pct,loser_sig_str_clinch_pct,winner_sig_str_ground_pct,loser_sig_str_ground_pct
0,ILIA TOPURIA,MAX HOLLOWAY,26/10/2024,El Matador,Blessed,W,L,KO/TKO,3,1:34,...,14,16,20,24,94,100,0,0,5,0
1,ROBERT WHITTAKER,KHAMZAT CHIMAEV,26/10/2024,The Reaper,Borz,L,W,Submission,1,3:34,...,33,0,0,100,0,100,0,0,100,0
2,MAGOMED ANKALAEV,ALEKSANDAR RAKIC,26/10/2024,-,Rocket,W,L,Decision - Unanimous,3,5:00,...,40,16,23,64,90,94,9,5,0,0
3,LERONE MURPHY,DAN IGE,26/10/2024,The Miracle,50K,W,L,Decision - Unanimous,3,5:00,...,23,10,7,13,71,69,23,13,5,17
4,SHARA MAGOMEDOV,ARMEN PETROSYAN,26/10/2024,Bullet,Superman,W,L,KO/TKO,2,4:52,...,44,12,18,58,96,97,3,2,0,0


Dropping the **red/blue** features, leaving the new **winner/loser** ones:

In [7]:
# Fight-level columns that do not depend on the corner
fight_info_cols = ['event_date', 'method', 'round', 'time', 'time_format',
                   'referee', 'details', 'bout_type', 'bonus', 'event_name', 'event_location']

# The trailing columns of df are the single `winner` label followed by one
# winner_*/loser_* pair per entry of cols_to_change. Deriving the count from
# that list (instead of hard-coding 47) keeps the slice correct if the list changes.
n_outcome_cols = 2 * len(cols_to_change) + 1

df = df.loc[:, fight_info_cols].join(df.iloc[:, -n_outcome_cols:])
df.head()


Unnamed: 0,event_date,method,round,time,time_format,referee,details,bout_type,bonus,event_name,...,winner_sig_str_body_pct,loser_sig_str_body_pct,winner_sig_str_leg_pct,loser_sig_str_leg_pct,winner_sig_str_distance_pct,loser_sig_str_distance_pct,winner_sig_str_clinch_pct,loser_sig_str_clinch_pct,winner_sig_str_ground_pct,loser_sig_str_ground_pct
0,26/10/2024,KO/TKO,3,1:34,5 Rnd (5-5-5-5-5),Marc Goddard,Punch to Head At Distance,UFC Featherweight Title Bout,belt,UFC 308: Topuria vs. Holloway,...,14,16,20,24,94,100,0,0,5,0
1,26/10/2024,Submission,1,3:34,5 Rnd (5-5-5-5-5),Jason Herzog,Rear Naked Choke,Middleweight Bout,perf,UFC 308: Topuria vs. Holloway,...,33,0,0,100,0,100,0,0,100,0
2,26/10/2024,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Vitor Ribeiro,Sal D'amato 28 - 29. Jacob Montalvo 28 - 29. T...,Light Heavyweight Bout,-,UFC 308: Topuria vs. Holloway,...,40,16,23,64,90,94,9,5,0,0
3,26/10/2024,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Mark Smith,Mike Bell 28 - 29. Ben Cartlidge 28 - 29. Hadi...,Featherweight Bout,-,UFC 308: Topuria vs. Holloway,...,23,10,7,13,71,69,23,13,5,17
4,26/10/2024,KO/TKO,2,4:52,3 Rnd (5-5-5),Kerry Hatley,Spinning Back Fist Head,Middleweight Bout,perf,UFC 308: Topuria vs. Holloway,...,44,12,18,58,96,97,3,2,0,0


# Data Cleaning

## NaN values

In [8]:
# Total number of missing (NaN/None) cells across the whole frame
df.isnull().sum().sum()

0

Replacing the placeholder strings used for missing entries (`-`, `--`, `---`) with the string `"0"`:

In [9]:
# Cells that consist solely of dashes mark "no data"; swap them for the string "0".
# NOTE(review): this applies to every column, so text fields that use '-' for
# "none" (the previews show it in nickname and bonus) become "0" as well - confirm.
df = df.replace(
    to_replace=['-', '--', '---'],
    value="0",
)

## Duplicates

In [10]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

We can see that there are no NaNs or duplicates. Let's get to Feature Engineering.

# Feature Engineering

## Standardizing Features

Standardizing "landed of attempted" counts into percentages (e.g. from **50 of 100** to **50%**):

In [11]:
def convert_total_str_to_pct(row):
    """Convert a "<landed> of <attempted>" string into a landed percentage.

    Args:
        row: a single cell value such as ``"50 of 100"`` (despite the name,
            not a whole DataFrame row).

    Returns:
        ``landed * 100 / attempted`` as a float, or ``0`` when no percentage
        can be computed: the value is not a string (e.g. NaN), the landed part
        holds no digit (placeholder), nothing was landed, the attempted part
        is missing, or the attempted count is zero.

    Raises:
        ValueError: if a part that contains digits is not a valid integer.
    """
    # Missing cells (NaN/None) carry no counts at all
    if not isinstance(row, str):
        return 0

    parts = row.split("of")

    # Placeholders such as "--" have no digits in the landed part
    if not re.search(r"\d", parts[0]):
        return 0

    landed = int(parts[0])
    # Nothing landed, or no "of <attempted>" part to divide by
    if landed == 0 or len(parts) < 2:
        return 0

    attempted = int(parts[1])
    # Guard against division by zero on inconsistent rows like "5 of 0"
    if attempted == 0:
        return 0

    return (landed * 100) / attempted

# Share of total strikes landed, computed for each side of the fight
df.loc[:, 'winner_total_str_landed_pct'] = (
    df['winner_total_str'].map(convert_total_str_to_pct)
)
df.loc[:, 'loser_total_str_landed_pct'] = (
    df['loser_total_str'].map(convert_total_str_to_pct)
)

Replacing **--**, **-** and **0** control-time entries with **0:00**:

In [12]:
# Give every "no control time" marker a proper mm:ss value so it can be parsed as a duration
df.loc[
    df['winner_ctrl'].isin(['--', '-', '0']),
    'winner_ctrl',
] = '0:00'
df.loc[
    df['loser_ctrl'].isin(['--', '-', '0']),
    'loser_ctrl',
] = '0:00'

Converting control time from **mm:ss** to a total number of seconds:

In [13]:
# Prefixing "00:" turns mm:ss into hh:mm:ss, which to_timedelta can parse;
# the resulting duration is then expressed in seconds
df.loc[:, 'winner_ctrl'] = pd.to_timedelta(
    "00:" + df['winner_ctrl']
).dt.total_seconds()
df.loc[:, 'loser_ctrl'] = pd.to_timedelta(
    "00:" + df['loser_ctrl']
).dt.total_seconds()

## Dropping Redundant Features

In [14]:
# Remove the raw strike and takedown columns for both sides
df.drop(
    columns=[
        'winner_sig_str', 'loser_sig_str',
        'winner_total_str', 'loser_total_str',
        'winner_TD', 'loser_TD',
        'winner_sig_str_head', 'loser_sig_str_head',
        'winner_sig_str_body', 'loser_sig_str_body',
        'winner_sig_str_leg', 'loser_sig_str_leg',
        'winner_sig_str_distance', 'loser_sig_str_distance',
        'winner_sig_str_clinch', 'loser_sig_str_clinch',
        'winner_sig_str_ground', 'loser_sig_str_ground',
    ],
    inplace=True,
)

## Converting Dtypes

In [15]:
def convert_dtypes(df):
    """Cast every column from 'winner_KD' to the last one to float.

    Columns positioned before 'winner_KD' are left untouched, and columns
    that are already float are skipped. The frame is modified in place and
    also returned.
    """
    first_stat_pos = pd.Index(df.columns).get_loc('winner_KD')

    for name in df.columns[first_stat_pos:]:
        if df[name].dtype != float:
            df[name] = df[name].astype(float)
    return df

# Cast every column from 'winner_KD' onwards to float
df = convert_dtypes(df)

## Engineering New Features

Let's engineer some additional features:
1. Striking dominance - a fighter's overall striking performance. Calculated as: KD + Significant strikes % + Total landed strikes %;
2. Wrestling dominance - a fighter's overall wrestling performance. Calculated as: TD % + Submission attempts + reversals;

**Striking dominance**:

In [16]:
# Striking dominance = knockdowns + significant-strike % + total-strike landed %
# NOTE(review): knockdowns are a count while the other two terms are
# percentages, so the percentages dominate the sum - confirm this weighting.
df['winner_striking_dominance'] = (
    df['winner_KD']
    + df['winner_sig_str_pct']
    + df['winner_total_str_landed_pct']
)
df['loser_striking_dominance'] = (
    df['loser_KD']
    + df['loser_sig_str_pct']
    + df['loser_total_str_landed_pct']
)

**Wrestling dominance**:

In [17]:
# Wrestling dominance = takedown % + submission attempts + reversals
df['winner_wrestling_dominance'] = (
    df['winner_TD_pct']
    + df['winner_sub_att']
    + df['winner_rev']
)
df['loser_wrestling_dominance'] = (
    df['loser_TD_pct']
    + df['loser_sub_att']
    + df['loser_rev']
)