In [55]:
import re
import numpy as np
import pandas as pd

# Load our fight-based dataset

In [56]:
fights_stats = pd.read_csv("../stats/stats.csv", sep=';')
fights_stats.tail()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_body_pct,blue_fighter_sig_str_body_pct,red_fighter_sig_str_leg_pct,blue_fighter_sig_str_leg_pct,red_fighter_sig_str_distance_pct,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct
7749,ORLANDO WIET,ROBERT LUCARELLI,11/03/1994,The Gladiator,-,W,L,KO/TKO,1,2:50,...,12,0,0,50,12,100,0,0,87,0
7750,FRANK HAMAKER,THADDEUS LUSTER,11/03/1994,-,-,W,L,Submission,1,4:52,...,0,0,0,0,50,0,0,0,50,0
7751,JOHNNY RHODES,DAVID LEVICKI,11/03/1994,-,-,W,L,KO/TKO,1,12:13,...,9,0,9,0,9,25,9,50,81,25
7752,PATRICK SMITH,RAY WIZARD,11/03/1994,-,-,W,L,Submission,1,0:58,...,100,0,0,100,0,100,100,0,0,0
7753,SCOTT MORRIS,SEAN DAUGHERTY,11/03/1994,-,-,W,L,Submission,1,0:20,...,0,0,0,0,0,0,100,0,0,0


In [57]:
fights_stats.columns

Index(['red_fighter_name', 'blue_fighter_name', 'event_date',
       'red_fighter_nickname', 'blue_fighter_nickname', 'red_fighter_result',
       'blue_fighter_result', 'method', 'round', 'time', 'time_format',
       'referee', 'details', 'bout_type', 'bonus', 'event_name',
       'event_location', 'red_fighter_KD', 'blue_fighter_KD',
       'red_fighter_sig_str', 'blue_fighter_sig_str',
       'red_fighter_sig_str_pct', 'blue_fighter_sig_str_pct',
       'red_fighter_total_str', 'blue_fighter_total_str', 'red_fighter_TD',
       'blue_fighter_TD', 'red_fighter_TD_pct', 'blue_fighter_TD_pct',
       'red_fighter_sub_att', 'blue_fighter_sub_att', 'red_fighter_rev',
       'blue_fighter_rev', 'red_fighter_ctrl', 'blue_fighter_ctrl',
       'red_fighter_sig_str_head', 'blue_fighter_sig_str_head',
       'red_fighter_sig_str_body', 'blue_fighter_sig_str_body',
       'red_fighter_sig_str_leg', 'blue_fighter_sig_str_leg',
       'red_fighter_sig_str_distance', 'blue_fighter_sig_str_distan

## Drop redundant features

Let's drop some features that are redundant and have no value for us here, for example: <br>
> `x_fighter_sig_str`, where we already have `x_fighter_sig_str_pct`. Where the prior takes the `75 of 144` form, and the latter takes the percentage version of the same value = `52%`. The latter is already scaled, and will be easier to work with.

In [58]:
fights_stats.loc[:1, ['red_fighter_sig_str', 'red_fighter_sig_str_pct']]

Unnamed: 0,red_fighter_sig_str,red_fighter_sig_str_pct
0,75 of 144,52
1,2 of 2,100


In [59]:
redundant_cols = ["fighter_sig_str", "fighter_TD"]

# Init both fighters' corners
fighters = ('red_', 'blue_')

cols_to_drop = [f"{fighter}{col}" for col in redundant_cols for fighter in fighters]
cols_to_drop

['red_fighter_sig_str',
 'blue_fighter_sig_str',
 'red_fighter_TD',
 'blue_fighter_TD']

Dropping:

In [60]:
fights_stats.drop(columns=cols_to_drop, inplace=True)

## Rename columns

Let's rename some columns to avoid name conflicts later and to better represent what they mean.

Preparing:

In [61]:
# Specify cols that will be renamed (stripped of 'red_' or 'blue_' prefixes)
sig_str_acc_cols = [
    'fighter_sig_str_head', 'fighter_sig_str_body',
    'fighter_sig_str_leg', 'fighter_sig_str_distance',
    'fighter_sig_str_clinch', 'fighter_sig_str_ground'
]

sig_str_tar_cols = ['fighter_sig_str_head_pct', 'fighter_sig_str_body_pct',
                     'fighter_sig_str_leg_pct']

sig_str_pos_cols = ['fighter_sig_str_distance_pct', 'fighter_sig_str_clinch_pct',
                     'fighter_sig_str_ground_pct']

# Init mappings
sig_str_acc_mappings = {}
sig_str_tar_mappings = {}
sig_str_pos_mappings = {}

# Init both fighters' corners
fighters = ('red_', 'blue_')

# Postfixes
sig_str_acc_postfix = "_acc"
sig_str_tar_postfix = "_tar_pct"
sig_str_pos_postfix = "_pct"

# Start mapping
for fighter in fighters:
    # For significant strikes accuracy features
    for col in sig_str_acc_cols:
        sig_str_acc_mappings[f"{fighter}{col}"] = f"{fighter}{col}{sig_str_acc_postfix}"

    # For significant strikes by target features
    for col in sig_str_tar_cols:
        # Reposition '_pct' to the end
        sig_str_tar_mappings[f"{fighter}{col}"] = f"{fighter}{col.strip("_pct")}{sig_str_tar_postfix}"

    # For significant strikes by position features
    for col in sig_str_pos_cols:
        # Reposition '_pct' to the end
        sig_str_pos_mappings[f"{fighter}{col}"] = f"{fighter}{col.strip("_pct")}{sig_str_pos_postfix}"

# Merge mappings into 1 single dict
col_names_mappings = sig_str_acc_mappings | sig_str_tar_mappings | sig_str_pos_mappings
col_names_mappings

{'red_fighter_sig_str_head': 'red_fighter_sig_str_head_acc',
 'red_fighter_sig_str_body': 'red_fighter_sig_str_body_acc',
 'red_fighter_sig_str_leg': 'red_fighter_sig_str_leg_acc',
 'red_fighter_sig_str_distance': 'red_fighter_sig_str_distance_acc',
 'red_fighter_sig_str_clinch': 'red_fighter_sig_str_clinch_acc',
 'red_fighter_sig_str_ground': 'red_fighter_sig_str_ground_acc',
 'blue_fighter_sig_str_head': 'blue_fighter_sig_str_head_acc',
 'blue_fighter_sig_str_body': 'blue_fighter_sig_str_body_acc',
 'blue_fighter_sig_str_leg': 'blue_fighter_sig_str_leg_acc',
 'blue_fighter_sig_str_distance': 'blue_fighter_sig_str_distance_acc',
 'blue_fighter_sig_str_clinch': 'blue_fighter_sig_str_clinch_acc',
 'blue_fighter_sig_str_ground': 'blue_fighter_sig_str_ground_acc',
 'red_fighter_sig_str_head_pct': 'red_fighter_sig_str_head_tar_pct',
 'red_fighter_sig_str_body_pct': 'red_fighter_sig_str_body_tar_pct',
 'red_fighter_sig_str_leg_pct': 'red_fighter_sig_str_leg_tar_pct',
 'blue_fighter_sig_str_

Renaming:

In [62]:
fights_stats.rename(columns=col_names_mappings, inplace=True)
fights_stats.columns

Index(['red_fighter_name', 'blue_fighter_name', 'event_date',
       'red_fighter_nickname', 'blue_fighter_nickname', 'red_fighter_result',
       'blue_fighter_result', 'method', 'round', 'time', 'time_format',
       'referee', 'details', 'bout_type', 'bonus', 'event_name',
       'event_location', 'red_fighter_KD', 'blue_fighter_KD',
       'red_fighter_sig_str_pct', 'blue_fighter_sig_str_pct',
       'red_fighter_total_str', 'blue_fighter_total_str', 'red_fighter_TD_pct',
       'blue_fighter_TD_pct', 'red_fighter_sub_att', 'blue_fighter_sub_att',
       'red_fighter_rev', 'blue_fighter_rev', 'red_fighter_ctrl',
       'blue_fighter_ctrl', 'red_fighter_sig_str_head_acc',
       'blue_fighter_sig_str_head_acc', 'red_fighter_sig_str_body_acc',
       'blue_fighter_sig_str_body_acc', 'red_fighter_sig_str_leg_acc',
       'blue_fighter_sig_str_leg_acc', 'red_fighter_sig_str_distance_acc',
       'blue_fighter_sig_str_distance_acc', 'red_fighter_sig_str_clinch_acc',
       'blue_fighter

Too long and too many features to look at, we will make the names shorter and make the number of features x2 less. But first, we need to merge some additional features :))

# Merge additional athlete-based features

Let's merge additional athlete-based features from an external dataset like: *Height*, *Reach*, *Stance*

In [63]:
# External dataset
athlete_stats = pd.read_csv("raw_fighter_details.csv", sep=',')
athlete_stats.tail()

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg
3591,Zhang Tiequan,"5' 8""",155 lbs.,"69""",Orthodox,"Jul 25, 1978",1.23,36%,2.14,51%,1.95,58%,75%,3.4
3592,Alex Zuniga,,145 lbs.,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
3593,George Zuniga,"5' 9""",185 lbs.,,,,7.64,38%,5.45,37%,0.0,0%,100%,0.0
3594,Allan Zuniga,"5' 7""",155 lbs.,"70""",Orthodox,"Apr 04, 1992",3.93,52%,1.8,61%,0.0,0%,57%,1.0
3595,Virgil Zwicker,"6' 2""",205 lbs.,"74""",,"Jun 26, 1982",3.34,48%,4.87,39%,1.31,30%,50%,0.0


## External dataset feature engineering

We leave only the features we are interested in:

In [64]:
athlete_stats.drop(columns=["Weight", "DOB"], inplace=True)
athlete_stats.tail()

Unnamed: 0,fighter_name,Height,Reach,Stance,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg
3591,Zhang Tiequan,"5' 8""","69""",Orthodox,1.23,36%,2.14,51%,1.95,58%,75%,3.4
3592,Alex Zuniga,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
3593,George Zuniga,"5' 9""",,,7.64,38%,5.45,37%,0.0,0%,100%,0.0
3594,Allan Zuniga,"5' 7""","70""",Orthodox,3.93,52%,1.8,61%,0.0,0%,57%,1.0
3595,Virgil Zwicker,"6' 2""","74""",,3.34,48%,4.87,39%,1.31,30%,50%,0.0


### Renaming columns

We need to rename some columns for differentiating between athlete-based and fight-based.
Let's prep the columns:

In [65]:
postfix = "_cs" # _cs is an abbreviation for career statistic
cols_to_rename = ["SLpM", "Str_Acc", "SApM", "Str_Def", "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg"]
name_mappings = {col: f"{col}{postfix}" for col in cols_to_rename}
name_mappings

{'SLpM': 'SLpM_cs',
 'Str_Acc': 'Str_Acc_cs',
 'SApM': 'SApM_cs',
 'Str_Def': 'Str_Def_cs',
 'TD_Avg': 'TD_Avg_cs',
 'TD_Acc': 'TD_Acc_cs',
 'TD_Def': 'TD_Def_cs',
 'Sub_Avg': 'Sub_Avg_cs'}

Rename:

In [66]:
athlete_stats.rename(columns=name_mappings, inplace=True)
athlete_stats.head(3)

Unnamed: 0,fighter_name,Height,Reach,Stance,SLpM_cs,Str_Acc_cs,SApM_cs,Str_Def_cs,TD_Avg_cs,TD_Acc_cs,TD_Def_cs,Sub_Avg_cs
0,Tom Aaron,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
1,Papy Abedi,"5' 11""",,Southpaw,2.8,55%,3.15,48%,3.47,57%,50%,1.3
2,Shamil Abdurakhimov,"6' 3""","76""",Orthodox,2.45,44%,2.45,58%,1.23,24%,47%,0.2


### Dropping NaNs

Dropping rows, that bring us no information whatsoever:

In [67]:
athlete_stats.dropna(how="all", subset=["Height", "Reach", "Stance"], inplace=True)
athlete_stats.shape

(3363, 12)

Impututing the rest of NaNs with zeros:

In [68]:
athlete_stats.fillna(0, inplace=True)
print(f"Number of NaN entries: {athlete_stats.isna().sum().sum()}")

Number of NaN entries: 0


Convert external dataset fighter names to uppercase, and columns to lowercase to match our format:

In [69]:
# Fighter names => upper
athlete_stats['fighter_name'] = athlete_stats['fighter_name'].str.upper()
# Columns => lower
athlete_stats.columns = athlete_stats.columns.str.lower()
athlete_stats.head()

Unnamed: 0,fighter_name,height,reach,stance,slpm_cs,str_acc_cs,sapm_cs,str_def_cs,td_avg_cs,td_acc_cs,td_def_cs,sub_avg_cs
1,PAPY ABEDI,"5' 11""",0,Southpaw,2.8,55%,3.15,48%,3.47,57%,50%,1.3
2,SHAMIL ABDURAKHIMOV,"6' 3""","76""",Orthodox,2.45,44%,2.45,58%,1.23,24%,47%,0.2
3,DANNY ABBADI,"5' 11""",0,Orthodox,3.29,38%,4.41,57%,0.0,0%,77%,0.0
4,HIROYUKI ABE,"5' 6""",0,Orthodox,1.71,36%,3.11,63%,0.0,0%,33%,0.0
5,RICARDO ABREU,"5' 11""",0,Orthodox,3.79,31%,3.98,68%,2.13,42%,100%,0.7


Defining a function that converts height and reach from inches to cm:

In [70]:
def conv_from_inches_to_cm(inches):
    """Converts from inches to cm"""
    # If the data is missing
    if not inches:
        return inches
        
    res = inches.strip('"').split("'")
    # Foot to cm conversion rate
    f_cm_cr = 30.48
    # Inch to cm convetsion rate
    i_cm_cr = 2.54

    # If converting height
    if len(res) > 1:
        return int(res[0]) * f_cm_cr + int(res[1]) * i_cm_cr
    # Or reach
    else:
        return int(res[0]) * i_cm_cr

Applying:

In [71]:
# Convert height
athlete_stats['height'] = athlete_stats['height'].apply(conv_from_inches_to_cm)
# Convert reach
athlete_stats['reach'] = athlete_stats['reach'].apply(conv_from_inches_to_cm)
# Take a look
athlete_stats.head()

Unnamed: 0,fighter_name,height,reach,stance,slpm_cs,str_acc_cs,sapm_cs,str_def_cs,td_avg_cs,td_acc_cs,td_def_cs,sub_avg_cs
1,PAPY ABEDI,180.34,0.0,Southpaw,2.8,55%,3.15,48%,3.47,57%,50%,1.3
2,SHAMIL ABDURAKHIMOV,190.5,193.04,Orthodox,2.45,44%,2.45,58%,1.23,24%,47%,0.2
3,DANNY ABBADI,180.34,0.0,Orthodox,3.29,38%,4.41,57%,0.0,0%,77%,0.0
4,HIROYUKI ABE,167.64,0.0,Orthodox,1.71,36%,3.11,63%,0.0,0%,33%,0.0
5,RICARDO ABREU,180.34,0.0,Orthodox,3.79,31%,3.98,68%,2.13,42%,100%,0.7


Prepare mappings to map features to red/blue fighters:

In [72]:
red_features = {col: f"red_fighter_{col}" for col in athlete_stats.columns if 'fighter' not in col}
blue_features = {col: f"blue_fighter_{col}" for col in athlete_stats.columns if 'fighter' not in col}

Merging:

In [73]:
stats = pd.merge(fights_stats, athlete_stats.rename(columns=red_features), left_on='red_fighter_name', right_on='fighter_name')
stats.drop(columns='fighter_name', inplace=True)

In [74]:
stats = pd.merge(stats, athlete_stats.rename(columns=blue_features), left_on='blue_fighter_name', right_on='fighter_name')
stats.drop(columns='fighter_name', inplace=True)

In [75]:
stats.columns

Index(['red_fighter_name', 'blue_fighter_name', 'event_date',
       'red_fighter_nickname', 'blue_fighter_nickname', 'red_fighter_result',
       'blue_fighter_result', 'method', 'round', 'time', 'time_format',
       'referee', 'details', 'bout_type', 'bonus', 'event_name',
       'event_location', 'red_fighter_KD', 'blue_fighter_KD',
       'red_fighter_sig_str_pct', 'blue_fighter_sig_str_pct',
       'red_fighter_total_str', 'blue_fighter_total_str', 'red_fighter_TD_pct',
       'blue_fighter_TD_pct', 'red_fighter_sub_att', 'blue_fighter_sub_att',
       'red_fighter_rev', 'blue_fighter_rev', 'red_fighter_ctrl',
       'blue_fighter_ctrl', 'red_fighter_sig_str_head_acc',
       'blue_fighter_sig_str_head_acc', 'red_fighter_sig_str_body_acc',
       'blue_fighter_sig_str_body_acc', 'red_fighter_sig_str_leg_acc',
       'blue_fighter_sig_str_leg_acc', 'red_fighter_sig_str_distance_acc',
       'blue_fighter_sig_str_distance_acc', 'red_fighter_sig_str_clinch_acc',
       'blue_fighter

# Data preprocessing

Creating a single **winner** feature instead of two **red_fighter_result** and **blue_fighter_result** features:

In [19]:
stats.loc[:, 'winner'] = stats['red_fighter_result'].apply(lambda x: 'red' if x == 'W' else 'blue')

In [20]:
stats['winner'].head()

0     red
1    blue
2     red
3     red
4     red
Name: winner, dtype: object

Changing columns from **red/blue+feature name** to **winner/loser+feature name**:

In [21]:
stats.columns[-5:]

Index(['blue_fighter_td_avg', 'blue_fighter_td_acc', 'blue_fighter_td_def',
       'blue_fighter_sub_avg', 'winner'],
      dtype='object')

In [22]:
def assign_winner_n_loser(df, winner_col='winner'):
    """Filters what columns to take into account,
       creates new columns, instead of red/blue makes winner/loser,
       gets data points from red/blue column based on 
       the value of the feature 'winner' in that same row."""
    
    # Get the columns, leave features like 'round', 'method', 'time' like they are
    cols_to_drop = [col for col in df.columns if col.startswith('red') or col.startswith('blue')]
    cols_to_change = set(col.removeprefix("red_").removeprefix("blue_") for col in cols_to_drop)

    # Change each column
    for col in cols_to_change:
        # Create new features, sort the data according to the value of the 'winner' featuer
        df.loc[:, f"winner_{col.replace('fighter_', '')}"] = \
            df[f"red_{col}"].where(df[winner_col] == 'red', df[f"blue_{col}"])
        df.loc[:, f"loser_{col.replace('fighter_', '')}"] = \
            df[f"red_{col}"].where(df[winner_col] == 'blue', df[f"blue_{col}"])

    # Drop the red/blue columns to keep only winner/loser
    df.drop(columns=cols_to_drop, inplace=True)
    return df 

In [23]:
stats = assign_winner_n_loser(stats)
stats.columns[-5:]

Index(['loser_result', 'winner_sig_str_distance', 'loser_sig_str_distance',
       'winner_rev', 'loser_rev'],
      dtype='object')

# Data Cleaning

## NaN values

In [24]:
stats.isnull().sum().sum()

0

Replacing NaN entry fillers with zeros:

In [25]:
stats.isin(['-', '--', '---']).sum().sum()

14444

In [26]:
stats = stats.replace(['-', '--', '---'], "0")
stats.isin(['-', '--', '---']).sum().sum()

0

## Duplicates

In [27]:
stats.duplicated().sum()

0

We can see that there are no NaNs or duplicates. Let's get to Feature Engineering.

# Feature Engineering

Convert columns that are in object dtype but where values just consist of intigers only to remove the pool of preprocessing:

In [28]:
stats.dtypes.value_counts()

object     69
float64    12
int64       1
Name: count, dtype: int64

In [29]:
for col in stats.columns:
    try:
        stats[col] = stats[col].astype(float)
    except:
        continue

In [30]:
stats.dtypes.value_counts()

object     47
float64    35
Name: count, dtype: int64

## Process categorical features

Find columns that need to be cleaned:

In [31]:
cols_to_clean = [] 

for col in stats.columns:
    val = stats.loc[0, col]
    if (
        stats[col].dtype == 'object'
        and (col.startswith('loser_') or col.startswith('winner_'))
        and any(char.isdigit() for char in val)
    ):
        cols_to_clean.append(col)
    
print(f"Number of categorical features to preprocess: {len(cols_to_clean)}")

Number of categorical features to preprocess: 28


Taking a look:

In [32]:
stats[cols_to_clean].loc[:2, ['loser_sig_str_body', 'winner_ctrl', 'winner_td_def']]

Unnamed: 0,loser_sig_str_body,winner_ctrl,winner_td_def
0,13 of 30,0:45,100%
1,0 of 0,3:20,0%
2,9 of 12,1:31,85%


## Standardizing Features

In total we have to standardize 3 types of features:
1. Ratio to pct: 75 of 144 => 52 (%)
2. Dropping pct symbol: 85% => 85 (%)
3. Time: 1:31	=> 91 (seconds)

But let's first group the columns into 3 different buckets for simplicity:

In [33]:
"""This way of matching is not safe, and REGEXes should be used instead,
but this is a trivial case and I think we should be fine."""
of_cols = []
no_pct_symbol_cols = []
time_cols = []

for col in cols_to_clean:
    val = stats.loc[0, col]

    if "of" in val:
        of_cols.append(col)
    elif "%" in val:
        no_pct_symbol_cols.append(col)
    elif ":" in val:
        time_cols.append(col)

Let's take a look:

In [34]:
stats.loc[:2, of_cols]

Unnamed: 0,winner_total_str,loser_total_str,winner_sig_str_body,loser_sig_str_body,winner_sig_str_leg,loser_sig_str_leg,winner_sig_str,loser_sig_str,winner_sig_str_clinch,loser_sig_str_clinch,winner_sig_str_head,loser_sig_str_head,winner_TD,loser_TD,winner_sig_str_ground,loser_sig_str_ground,winner_sig_str_distance,loser_sig_str_distance
0,78 of 147,84 of 209,11 of 14,13 of 30,15 of 16,19 of 23,75 of 144,79 of 204,0 of 0,0 of 0,49 of 114,47 of 151,2 of 2,0 of 0,4 of 4,0 of 0,71 of 140,79 of 204
1,25 of 31,2 of 2,1 of 1,0 of 0,0 of 0,2 of 2,3 of 4,2 of 2,0 of 0,0 of 0,2 of 3,0 of 0,2 of 4,0 of 0,3 of 3,0 of 0,0 of 1,2 of 2
2,75 of 142,59 of 123,22 of 29,9 of 12,13 of 13,34 of 40,55 of 114,53 of 117,5 of 9,3 of 6,20 of 72,10 of 65,0 of 0,0 of 1,0 of 0,0 of 0,50 of 105,50 of 111


In [35]:
stats.loc[:2, no_pct_symbol_cols]

Unnamed: 0,winner_str_acc,loser_str_acc,winner_td_def,loser_td_def,winner_str_def,loser_str_def,winner_td_acc,loser_td_acc
0,45%,46%,100%,84%,68%,60%,55%,71%
1,72%,40%,0%,84%,66%,60%,75%,27%
2,53%,52%,85%,90%,65%,52%,33%,25%


In [36]:
stats.loc[:2, time_cols]

Unnamed: 0,winner_ctrl,loser_ctrl
0,0:45,0:00
1,3:20,0:00
2,1:31,1:00


Looks good, let's also add _pct postfix to the column that will consist of pct values to differentiate later.

In [37]:
"""of_cols = [col + "_pct" for col in of_cols]
no_pct_symbol_cols = [col + "_pct" for col in no_pct_symbol_cols]"""

'of_cols = [col + "_pct" for col in of_cols]\nno_pct_symbol_cols = [col + "_pct" for col in no_pct_symbol_cols]'

Let's standardize.

### Standardizing fractions into pct % (e.g. from **50 of 100** to **50%**):

In [33]:
def convert_ratio_to_pct(row):
    row = row.split("of")
    return (int(row[0]) * 100) / int(row[1]) if re.search(r"\d", row[0]) and int(row[0]) != 0 else 0

Applying:

In [34]:
"""for col in of_cols:
    stats[col] = stats[col].apply(convert_ratio_to_pct)"""

# Rename
name_mappings = {col: f"{col}_pct" for col in of_cols}
stats.rename(columns=name_mappings, inplace=True)

In [46]:
stats[name_mappings.values()]

Unnamed: 0,winner_sig_str_distance_pct,winner_sig_str_distance_pct.1,loser_sig_str_distance_pct,loser_sig_str_distance_pct.1,winner_sig_str_body_pct,winner_sig_str_body_pct.1,loser_sig_str_body_pct,loser_sig_str_body_pct.1,winner_sig_str_leg_pct,winner_sig_str_leg_pct.1,...,loser_TD_pct,loser_TD_pct.1,winner_sig_str_ground_pct,winner_sig_str_ground_pct.1,loser_sig_str_ground_pct,loser_sig_str_ground_pct.1,winner_sig_str_pct,winner_sig_str_pct.1,loser_sig_str_pct,loser_sig_str_pct.1
0,94.0,71 of 140,100.0,79 of 204,14.0,11 of 14,16.0,13 of 30,20.0,15 of 16,...,0.0,0 of 0,5.0,4 of 4,0.0,0 of 0,52.0,75 of 144,38.0,79 of 204
1,0.0,0 of 1,100.0,2 of 2,33.0,1 of 1,0.0,0 of 0,0.0,0 of 0,...,0.0,0 of 0,100.0,3 of 3,0.0,0 of 0,75.0,3 of 4,100.0,2 of 2
2,90.0,50 of 105,94.0,50 of 111,40.0,22 of 29,16.0,9 of 12,23.0,13 of 13,...,0.0,0 of 1,0.0,0 of 0,0.0,0 of 0,48.0,55 of 114,45.0,53 of 117
3,71.0,37 of 80,69.0,32 of 79,23.0,12 of 17,10.0,5 of 8,7.0,4 of 6,...,33.0,1 of 3,5.0,3 of 4,17.0,8 of 11,52.0,52 of 100,43.0,46 of 106
4,72.0,13 of 24,100.0,4 of 13,16.0,3 of 3,25.0,1 of 1,16.0,3 of 3,...,0.0,0 of 0,27.0,5 of 6,0.0,0 of 0,60.0,18 of 30,30.0,4 of 13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6816,66.0,2 of 4,0.0,0 of 2,33.0,1 of 2,0.0,0 of 0,33.0,1 of 1,...,100.0,1 of 1,33.0,1 of 1,0.0,0 of 0,60.0,3 of 5,0.0,0 of 2
6817,0.0,0 of 1,100.0,1 of 1,0.0,0 of 0,0.0,0 of 0,0.0,0 of 1,...,0.0,0 of 0,100.0,4 of 5,0.0,0 of 2,66.0,4 of 6,33.0,1 of 3
6818,12.0,1 of 3,100.0,2 of 6,12.0,1 of 1,0.0,0 of 1,0.0,0 of 0,...,100.0,1 of 1,87.0,7 of 9,0.0,0 of 0,66.0,8 of 12,33.0,2 of 6
6819,9.0,1 of 1,25.0,1 of 2,9.0,1 of 1,0.0,0 of 0,9.0,1 of 1,...,0.0,0 of 0,81.0,9 of 15,25.0,1 of 1,64.0,11 of 17,80.0,4 of 5


In [35]:
stats['winner_sig_str_pct']

Unnamed: 0,winner_sig_str_pct,winner_sig_str_pct.1
0,52.0,75 of 144
1,75.0,3 of 4
2,48.0,55 of 114
3,52.0,52 of 100
4,60.0,18 of 30
...,...,...
6816,60.0,3 of 5
6817,66.0,4 of 6
6818,66.0,8 of 12
6819,64.0,11 of 17


Replacing **--** and **-** entries with zeros:

In [378]:
stats.loc[stats['winner_ctrl'].isin(['--', '-', '0']), 'winner_ctrl'] = '0:00'
stats.loc[stats['loser_ctrl'].isin(['--', '-', '0']), 'loser_ctrl'] = '0:00'

Standardizing time from **mm:ss** into total **ss**:

In [379]:
stats.loc[:, 'winner_ctrl'] = pd.to_timedelta("00:" + stats.loc[:, 'winner_ctrl']).dt.total_seconds()
stats.loc[:, 'loser_ctrl'] = pd.to_timedelta("00:" + stats.loc[:, 'loser_ctrl']).dt.total_seconds()

Find columns that need to be cleaned:

In [270]:
cols_to_clean = [] 
for col in stats.columns:
    if stats[col].dtype == 'object' and (col.startswith('loser_') or col.startswith('winner_')) \
       and any(char.isdigit() for char in stats.loc[0, col]):
        cols_to_clean.append(col)
    
cols_to_clean

['winner_sig_str',
 'loser_sig_str',
 'winner_sig_str_distance_pct',
 'loser_sig_str_distance_pct',
 'winner_sig_str_pct',
 'loser_sig_str_pct',
 'winner_td_def',
 'loser_td_def',
 'winner_str_def',
 'loser_str_def',
 'winner_sig_str_leg',
 'loser_sig_str_leg',
 'winner_TD',
 'loser_TD',
 'winner_rev',
 'loser_rev',
 'winner_KD',
 'loser_KD',
 'winner_sig_str_ground',
 'loser_sig_str_ground',
 'winner_sig_str_body_pct',
 'loser_sig_str_body_pct',
 'winner_sig_str_body',
 'loser_sig_str_body',
 'winner_sig_str_distance',
 'loser_sig_str_distance',
 'winner_sig_str_leg_pct',
 'loser_sig_str_leg_pct',
 'winner_td_acc',
 'loser_td_acc',
 'winner_total_str',
 'loser_total_str',
 'winner_sig_str_clinch',
 'loser_sig_str_clinch',
 'winner_sig_str_head',
 'loser_sig_str_head',
 'winner_sig_str_head_pct',
 'loser_sig_str_head_pct',
 'winner_sub_att',
 'loser_sub_att',
 'winner_ctrl',
 'loser_ctrl',
 'winner_str_acc',
 'loser_str_acc',
 'winner_TD_pct',
 'winner_sig_str_ground_pct',
 'loser_sig_

In [None]:
stats['winner_sig_str_head_pct'] = stats['winner_sig_str_head_pct'].astype(float)

In [352]:
%debug

> [0;32m/var/folders/99/3nnshfd56kv1_g2l04l874t00000gn/T/ipykernel_3054/386721011.py[0m(7)[0;36m<module>[0;34m()[0m
[0;32m      5 [0;31m        [0mstats[0m[0;34m[[0m[0mcol[0m[0;34m][0m[0;34m.[0m[0mdtype[0m [0;34m==[0m [0;34m'object'[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m        [0;32mand[0m [0;34m([0m[0mcol[0m[0;34m.[0m[0mstartswith[0m[0;34m([0m[0;34m'loser_'[0m[0;34m)[0m [0;32mor[0m [0mcol[0m[0;34m.[0m[0mstartswith[0m[0;34m([0m[0;34m'winner_'[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 7 [0;31m        [0;32mand[0m [0many[0m[0;34m([0m[0mchar[0m[0;34m.[0m[0misdigit[0m[0;34m([0m[0;34m)[0m [0;32mfor[0m [0mchar[0m [0;32min[0m [0mval[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m    [0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      9 [0;31m        [0mcols_to_clean[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mcol[0m[0;34m)[0m[0;34

ipdb>  val


45.0


ipdb>  col


'winner_ctrl'


ipdb>  q


In [277]:
stats['winner_sig_str_head_pct'] = stats['winner_sig_str_head_pct'].astype(float)

In [278]:
stats['winner_sig_str_head_pct'].dtype

dtype('float64')

In [276]:
stats['winner_sig_str_head_pct'].unique()

array(['65', '66', '36', '69', '63', '82', '81', '76', '46', '86', '35',
       '89', '53', '84', '67', '51', '55', '64', '75', '48', '74', '57',
       '90', '91', '52', '59', '40', '100', '70', '73', '71', '47', '68',
       '43', '61', '54', '50', '42', '83', '77', '60', '92', '62', '72',
       '78', '41', '80', '39', '79', '34', '30', '88', '58', '26', '85',
       '45', '20', '87', '37', '96', '19', '56', '44', '38', '13', '94',
       '93', '32', '18', '27', '33', '28', '0', '11', '23', '15', '95',
       '12', '25', '21', '29', '31', '10', '97', '49', '24', '22', '98',
       '16', '6', '5', '17', '14', '3', '7'], dtype=object)

## Dropping Redundant Features

In [24]:
stats.drop(['loser_sig_str', 'winner_total_str', 'loser_total_str', 'winner_sig_str',
         'winner_TD', 'loser_TD', 'winner_sig_str_head', 'loser_sig_str_head', 
         'winner_sig_str_body', 'loser_sig_str_body', 'winner_sig_str_leg', 'loser_sig_str_leg',
         'winner_sig_str_distance', 'loser_sig_str_distance', 'winner_sig_str_clinch',
         'loser_sig_str_clinch', 'winner_sig_str_ground', 'loser_sig_str_ground'], axis=1, inplace=True)

## Converting Dtypes

In [25]:
def convert_dtypes(stats):
    start_idx = pd.Index(stats.columns).get_loc('winner_KD')
    
    for i in range(start_idx, len(stats.columns)):
        if stats[stats.columns[i]].dtype != float:
            stats[stats.columns[i]] = stats[stats.columns[i]].astype(float)
    return stats

# Applying
stats = convert_dtypes(stats)

## Engineering New Features

Let's engineer some additional features:
1. Striking dominance - a fighter's overall striking performance. Calculated as: KD + Significant strikes % + Total landed strikes %;
2. Wrestling dominance - a fighter's overall wrestling performance. Calculated as: TD % + Submission attempts + reversals;

**Striking dominance**:

In [26]:
stats['winner_striking_dominance'] = stats['winner_KD'] + stats['winner_sig_str_pct'] + stats['winner_total_str_landed_pct']
stats['loser_striking_dominance'] = stats['loser_KD'] + stats['loser_sig_str_pct'] + stats['loser_total_str_landed_pct']

**Wrestling dominance**:

In [27]:
stats['winner_wrestling_dominance'] = stats['winner_TD_pct'] + stats['winner_sub_att'] + stats['winner_rev']
stats['loser_wrestling_dominance'] = stats['loser_TD_pct'] + stats['loser_sub_att'] + stats['loser_rev']

In [28]:
stats.dtypes

event_date                      object
method                          object
round                            int64
time                            object
time_format                     object
referee                         object
details                         object
bout_type                       object
bonus                           object
event_name                      object
event_location                  object
winner                          object
winner_name                     object
loser_name                      object
winner_nickname                 object
loser_nickname                  object
winner_KD                      float64
loser_KD                       float64
winner_sig_str_pct             float64
loser_sig_str_pct              float64
winner_TD_pct                  float64
loser_TD_pct                   float64
winner_sub_att                 float64
loser_sub_att                  float64
winner_rev                     float64
loser_rev                