In [37]:
import re

import numpy as np
import pandas as pd

# Loading data

## Loading fights dataset

In [38]:
fights_stats = pd.read_csv("../stats/stats_raw.csv", sep=";")
fights_stats.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_body_pct,blue_fighter_sig_str_body_pct,red_fighter_sig_str_leg_pct,blue_fighter_sig_str_leg_pct,red_fighter_sig_str_distance_pct,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct
0,ILIA TOPURIA,MAX HOLLOWAY,26/10/2024,El Matador,Blessed,W,L,KO/TKO,3,1:34,...,14,16,20,24,94,100,0,0,5,0
1,ROBERT WHITTAKER,KHAMZAT CHIMAEV,26/10/2024,The Reaper,Borz,L,W,Submission,1,3:34,...,0,33,100,0,100,0,0,0,0,100
2,MAGOMED ANKALAEV,ALEKSANDAR RAKIC,26/10/2024,-,Rocket,W,L,Decision - Unanimous,3,5:00,...,40,16,23,64,90,94,9,5,0,0
3,LERONE MURPHY,DAN IGE,26/10/2024,The Miracle,50K,W,L,Decision - Unanimous,3,5:00,...,23,10,7,13,71,69,23,13,5,17
4,SHARA MAGOMEDOV,ARMEN PETROSYAN,26/10/2024,Bullet,Superman,W,L,KO/TKO,2,4:52,...,44,12,18,58,96,97,3,2,0,0


## Drop redundant features

Let's drop some features that are redundant and have no value for us here, for example: <br>
> `x_fighter_sig_str`, where we already have `x_fighter_sig_str_pct`. Where the prior takes the `75 of 144` form, and the latter takes the percentage version of the same value = `52%`. The latter is already scaled, and will be easier to work with.

In [39]:
fights_stats.loc[:1, ["red_fighter_sig_str", "red_fighter_sig_str_pct"]]

Unnamed: 0,red_fighter_sig_str,red_fighter_sig_str_pct
0,75 of 144,52
1,2 of 2,100


In [40]:
redundant_cols = ["fighter_sig_str", "fighter_TD"]

# Init both fighters' corners
fighters = ("red_", "blue_")

# For both red/blue fighters
cols_to_drop = [f"{fighter}{col}" for col in redundant_cols for fighter in fighters]
cols_to_drop

['red_fighter_sig_str',
 'blue_fighter_sig_str',
 'red_fighter_TD',
 'blue_fighter_TD']

Dropping:

In [41]:
fights_stats.drop(columns=cols_to_drop, inplace=True)

## Rename features

Let's rename some columns to avoid name conflicts later and to better represent what they mean.

Preparing:

In [42]:
# Significant strikes accuracy
sig_str_acc_cols = [
    "fighter_sig_str_head",
    "fighter_sig_str_body",
    "fighter_sig_str_leg",
    "fighter_sig_str_distance",
    "fighter_sig_str_clinch",
    "fighter_sig_str_ground",
]

# Significant strikes target
sig_str_tar_cols = [
    "fighter_sig_str_head_pct",
    "fighter_sig_str_body_pct",
    "fighter_sig_str_leg_pct",
]

# Significant strikes position
sig_str_pos_cols = [
    "fighter_sig_str_distance_pct",
    "fighter_sig_str_clinch_pct",
    "fighter_sig_str_ground_pct",
]

# Init mappings
sig_str_acc_mappings = {}
sig_str_tar_mappings = {}
sig_str_pos_mappings = {}

# Init both fighters' corners
fighters = ("red_", "blue_")

# Postfixes
sig_str_acc_postfix = "_acc"
sig_str_tar_postfix = "_tar_pct"
sig_str_pos_postfix = "_pos_pct"

# Start mapping
for fighter in fighters:
    # For significant strikes accuracy features
    for col in sig_str_acc_cols:
        sig_str_acc_mappings[f"{fighter}{col}"] = f"{fighter}{col}{sig_str_acc_postfix}"

    # For significant strikes by target features
    for col in sig_str_tar_cols:
        # Reposition '_pct' to the end
        sig_str_tar_mappings[f"{fighter}{col}"] = (
            f"{fighter}{col.strip("_pct")}{sig_str_tar_postfix}"
        )

    # For significant strikes by position features
    for col in sig_str_pos_cols:
        # Reposition '_pct' to the end
        sig_str_pos_mappings[f"{fighter}{col}"] = (
            f"{fighter}{col.strip("_pct")}{sig_str_pos_postfix}"
        )

# Merge mappings into 1 single dict
col_names_mappings = sig_str_acc_mappings | sig_str_tar_mappings | sig_str_pos_mappings
col_names_mappings

{'red_fighter_sig_str_head': 'red_fighter_sig_str_head_acc',
 'red_fighter_sig_str_body': 'red_fighter_sig_str_body_acc',
 'red_fighter_sig_str_leg': 'red_fighter_sig_str_leg_acc',
 'red_fighter_sig_str_distance': 'red_fighter_sig_str_distance_acc',
 'red_fighter_sig_str_clinch': 'red_fighter_sig_str_clinch_acc',
 'red_fighter_sig_str_ground': 'red_fighter_sig_str_ground_acc',
 'blue_fighter_sig_str_head': 'blue_fighter_sig_str_head_acc',
 'blue_fighter_sig_str_body': 'blue_fighter_sig_str_body_acc',
 'blue_fighter_sig_str_leg': 'blue_fighter_sig_str_leg_acc',
 'blue_fighter_sig_str_distance': 'blue_fighter_sig_str_distance_acc',
 'blue_fighter_sig_str_clinch': 'blue_fighter_sig_str_clinch_acc',
 'blue_fighter_sig_str_ground': 'blue_fighter_sig_str_ground_acc',
 'red_fighter_sig_str_head_pct': 'red_fighter_sig_str_head_tar_pct',
 'red_fighter_sig_str_body_pct': 'red_fighter_sig_str_body_tar_pct',
 'red_fighter_sig_str_leg_pct': 'red_fighter_sig_str_leg_tar_pct',
 'blue_fighter_sig_str_

Renaming:

In [43]:
fights_stats.rename(columns=col_names_mappings, inplace=True)
fights_stats.columns

Index(['red_fighter_name', 'blue_fighter_name', 'event_date',
       'red_fighter_nickname', 'blue_fighter_nickname', 'red_fighter_result',
       'blue_fighter_result', 'method', 'round', 'time', 'time_format',
       'referee', 'details', 'bout_type', 'bonus', 'event_name',
       'event_location', 'red_fighter_KD', 'blue_fighter_KD',
       'red_fighter_sig_str_pct', 'blue_fighter_sig_str_pct',
       'red_fighter_total_str', 'blue_fighter_total_str', 'red_fighter_TD_pct',
       'blue_fighter_TD_pct', 'red_fighter_sub_att', 'blue_fighter_sub_att',
       'red_fighter_rev', 'blue_fighter_rev', 'red_fighter_ctrl',
       'blue_fighter_ctrl', 'red_fighter_sig_str_head_acc',
       'blue_fighter_sig_str_head_acc', 'red_fighter_sig_str_body_acc',
       'blue_fighter_sig_str_body_acc', 'red_fighter_sig_str_leg_acc',
       'blue_fighter_sig_str_leg_acc', 'red_fighter_sig_str_distance_acc',
       'blue_fighter_sig_str_distance_acc', 'red_fighter_sig_str_clinch_acc',
       'blue_fighter

Too long and too many features to look at, we will make the names shorter and make the number of features x2 less. But first, we need to merge some additional features :))

## Loading athlete stats dataset

Let's merge additional athlete-based features from an external dataset like: `Height`, `Reach`, `Stance`

In [44]:
# External dataset
athlete_stats = pd.read_csv("../external_data/raw_fighter_details.csv", sep=",")
athlete_stats.tail()

Unnamed: 0,fighter_name,Height,Weight,Reach,Stance,DOB,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg
3591,Zhang Tiequan,"5' 8""",155 lbs.,"69""",Orthodox,"Jul 25, 1978",1.23,36%,2.14,51%,1.95,58%,75%,3.4
3592,Alex Zuniga,,145 lbs.,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
3593,George Zuniga,"5' 9""",185 lbs.,,,,7.64,38%,5.45,37%,0.0,0%,100%,0.0
3594,Allan Zuniga,"5' 7""",155 lbs.,"70""",Orthodox,"Apr 04, 1992",3.93,52%,1.8,61%,0.0,0%,57%,1.0
3595,Virgil Zwicker,"6' 2""",205 lbs.,"74""",,"Jun 26, 1982",3.34,48%,4.87,39%,1.31,30%,50%,0.0


### Feature engineering

#### Dropping irrelevant features

We leave only the features we are interested in:

In [45]:
athlete_stats.drop(columns=["Weight", "DOB"], inplace=True)
athlete_stats.tail()

Unnamed: 0,fighter_name,Height,Reach,Stance,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg
3591,Zhang Tiequan,"5' 8""","69""",Orthodox,1.23,36%,2.14,51%,1.95,58%,75%,3.4
3592,Alex Zuniga,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
3593,George Zuniga,"5' 9""",,,7.64,38%,5.45,37%,0.0,0%,100%,0.0
3594,Allan Zuniga,"5' 7""","70""",Orthodox,3.93,52%,1.8,61%,0.0,0%,57%,1.0
3595,Virgil Zwicker,"6' 2""","74""",,3.34,48%,4.87,39%,1.31,30%,50%,0.0


#### Renaming features

We need to rename some columns for differentiating between athlete-based and fight-based.
Let's prep the columns:

In [46]:
postfix = "_cs"  # _cs is an abbreviation for career statistic
cols_to_rename = [
    "SLpM",
    "Str_Acc",
    "SApM",
    "Str_Def",
    "TD_Avg",
    "TD_Acc",
    "TD_Def",
    "Sub_Avg",
]
name_mappings = {col: f"{col}{postfix}" for col in cols_to_rename}
name_mappings

{'SLpM': 'SLpM_cs',
 'Str_Acc': 'Str_Acc_cs',
 'SApM': 'SApM_cs',
 'Str_Def': 'Str_Def_cs',
 'TD_Avg': 'TD_Avg_cs',
 'TD_Acc': 'TD_Acc_cs',
 'TD_Def': 'TD_Def_cs',
 'Sub_Avg': 'Sub_Avg_cs'}

Rename:

In [47]:
athlete_stats.rename(columns=name_mappings, inplace=True)
athlete_stats.head(3)

Unnamed: 0,fighter_name,Height,Reach,Stance,SLpM_cs,Str_Acc_cs,SApM_cs,Str_Def_cs,TD_Avg_cs,TD_Acc_cs,TD_Def_cs,Sub_Avg_cs
0,Tom Aaron,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0
1,Papy Abedi,"5' 11""",,Southpaw,2.8,55%,3.15,48%,3.47,57%,50%,1.3
2,Shamil Abdurakhimov,"6' 3""","76""",Orthodox,2.45,44%,2.45,58%,1.23,24%,47%,0.2


#### Imputing NaNs

Dropping rows, that bring us no information whatsoever:

In [48]:
athlete_stats.dropna(how="all", subset=["Height", "Reach", "Stance"], inplace=True)
athlete_stats.shape

(3363, 12)

Impututing the rest of NaNs with zeros:

In [49]:
athlete_stats.fillna(0, inplace=True)
print(f"Number of NaN entries: {athlete_stats.isna().sum().sum()}")

Number of NaN entries: 0


### Formatting to match the format in the fights dataset

Convert external dataset fighter names to uppercase, and columns to lowercase to match our format:

In [50]:
# Fighter names => upper
athlete_stats["fighter_name"] = athlete_stats["fighter_name"].str.upper()
# Columns => lower
athlete_stats.columns = athlete_stats.columns.str.lower()
athlete_stats.head()

Unnamed: 0,fighter_name,height,reach,stance,slpm_cs,str_acc_cs,sapm_cs,str_def_cs,td_avg_cs,td_acc_cs,td_def_cs,sub_avg_cs
1,PAPY ABEDI,"5' 11""",0,Southpaw,2.8,55%,3.15,48%,3.47,57%,50%,1.3
2,SHAMIL ABDURAKHIMOV,"6' 3""","76""",Orthodox,2.45,44%,2.45,58%,1.23,24%,47%,0.2
3,DANNY ABBADI,"5' 11""",0,Orthodox,3.29,38%,4.41,57%,0.0,0%,77%,0.0
4,HIROYUKI ABE,"5' 6""",0,Orthodox,1.71,36%,3.11,63%,0.0,0%,33%,0.0
5,RICARDO ABREU,"5' 11""",0,Orthodox,3.79,31%,3.98,68%,2.13,42%,100%,0.7


#### Converting from inches to cm

Defining a function that converts height and reach from inches to cm:

In [51]:
def conv_from_inches_to_cm(inches):
    """Converts from inches to cm"""
    # If the data is missing
    if not inches:
        return inches

    res = inches.strip('"').split("'")
    # Foot to cm conversion rate
    f_cm_cr = 30.48
    # Inch to cm convetsion rate
    i_cm_cr = 2.54

    # If converting height
    if len(res) > 1:
        return int(res[0]) * f_cm_cr + int(res[1]) * i_cm_cr
    # Or reach
    else:
        return int(res[0]) * i_cm_cr

Applying:

In [52]:
# Convert height
athlete_stats["height"] = athlete_stats["height"].apply(conv_from_inches_to_cm)
# Convert reach
athlete_stats["reach"] = athlete_stats["reach"].apply(conv_from_inches_to_cm)
# Take a look
athlete_stats.head()

Unnamed: 0,fighter_name,height,reach,stance,slpm_cs,str_acc_cs,sapm_cs,str_def_cs,td_avg_cs,td_acc_cs,td_def_cs,sub_avg_cs
1,PAPY ABEDI,180.34,0.0,Southpaw,2.8,55%,3.15,48%,3.47,57%,50%,1.3
2,SHAMIL ABDURAKHIMOV,190.5,193.04,Orthodox,2.45,44%,2.45,58%,1.23,24%,47%,0.2
3,DANNY ABBADI,180.34,0.0,Orthodox,3.29,38%,4.41,57%,0.0,0%,77%,0.0
4,HIROYUKI ABE,167.64,0.0,Orthodox,1.71,36%,3.11,63%,0.0,0%,33%,0.0
5,RICARDO ABREU,180.34,0.0,Orthodox,3.79,31%,3.98,68%,2.13,42%,100%,0.7


## Merge datasets

Prepare mappings to map features to red/blue fighters:

In [53]:
red_mappings = {
    col: f"red_fighter_{col}" for col in athlete_stats.columns if "fighter" not in col
}
blue_mappings = {
    col: f"blue_fighter_{col}" for col in athlete_stats.columns if "fighter" not in col
}

Merging:

In [54]:
# Merge reds
stats = pd.merge(
    fights_stats,
    athlete_stats.rename(columns=red_mappings),
    left_on="red_fighter_name",
    right_on="fighter_name",
)
stats.drop(columns="fighter_name", inplace=True)

In [55]:
# Merge blues
stats = pd.merge(
    stats,
    athlete_stats.rename(columns=blue_mappings),
    left_on="blue_fighter_name",
    right_on="fighter_name",
)
stats.drop(columns="fighter_name", inplace=True)

In [56]:
stats.columns

Index(['red_fighter_name', 'blue_fighter_name', 'event_date',
       'red_fighter_nickname', 'blue_fighter_nickname', 'red_fighter_result',
       'blue_fighter_result', 'method', 'round', 'time', 'time_format',
       'referee', 'details', 'bout_type', 'bonus', 'event_name',
       'event_location', 'red_fighter_KD', 'blue_fighter_KD',
       'red_fighter_sig_str_pct', 'blue_fighter_sig_str_pct',
       'red_fighter_total_str', 'blue_fighter_total_str', 'red_fighter_TD_pct',
       'blue_fighter_TD_pct', 'red_fighter_sub_att', 'blue_fighter_sub_att',
       'red_fighter_rev', 'blue_fighter_rev', 'red_fighter_ctrl',
       'blue_fighter_ctrl', 'red_fighter_sig_str_head_acc',
       'blue_fighter_sig_str_head_acc', 'red_fighter_sig_str_body_acc',
       'blue_fighter_sig_str_body_acc', 'red_fighter_sig_str_leg_acc',
       'blue_fighter_sig_str_leg_acc', 'red_fighter_sig_str_distance_acc',
       'blue_fighter_sig_str_distance_acc', 'red_fighter_sig_str_clinch_acc',
       'blue_fighter

# Data preprocessing

Creating a single `winner` feature instead of two `red_fighter_result` and `blue_fighter_result` features:

In [22]:
stats.loc[:, "winner"] = stats["red_fighter_result"].apply(
    lambda x: "red" if x == "W" else "blue"
)

In [23]:
stats["winner"].head()

0     red
1    blue
2     red
3     red
4     red
Name: winner, dtype: object

Saving the order of the columns because it will be distorted:

In [24]:
def rename_condition(col):
    if col.startswith("red_fighter_"):
        return col.replace("red_fighter_", "winner_")
    elif col.startswith("blue_fighter_"):
        return col.replace("blue_fighter_", "loser_")
    return col

In [25]:
# Setup mapping
col_mappings = {col: rename_condition(col) for col in stats.columns}
# Get the new column names
cols_order = list(col_mappings.values())

Changing columns from **red/blue+feature name** to **winner/loser+feature name**:

In [26]:
stats.columns[-5:]

Index(['blue_fighter_td_avg_cs', 'blue_fighter_td_acc_cs',
       'blue_fighter_td_def_cs', 'blue_fighter_sub_avg_cs', 'winner'],
      dtype='object')

In [27]:
def assign_winner_n_loser(df, winner_col="winner"):
    """Filters what columns to take into account,
    creates new columns, instead of red/blue makes winner/loser,
    gets data points from red/blue column based on
    the value of the feature 'winner' in that same row."""

    # Get the columns, leave features like 'round', 'method', 'time' like they are
    cols_to_drop = [
        col for col in df.columns if col.startswith("red") or col.startswith("blue")
    ]
    # Remove prefixes to replace them with winner/loser instead
    cols_to_change = set(
        col.removeprefix("red_").removeprefix("blue_") for col in cols_to_drop
    )

    # Change each column
    for col in cols_to_change:
        # Create new features, sort the data according to the value of the 'winner' featuer
        df.loc[:, f"winner_{col.replace('fighter_', '')}"] = df[f"red_{col}"].where(
            df[winner_col] == "red", df[f"blue_{col}"]
        )
        df.loc[:, f"loser_{col.replace('fighter_', '')}"] = df[f"red_{col}"].where(
            df[winner_col] == "blue", df[f"blue_{col}"]
        )

    # Drop the red/blue columns to keep only winner/loser
    df.drop(columns=cols_to_drop, inplace=True)
    return df

In [28]:
stats = assign_winner_n_loser(stats)
stats.columns[-5:]

Index(['loser_slpm_cs', 'winner_sig_str_clinch_acc',
       'loser_sig_str_clinch_acc', 'winner_sig_str_ground_pos_pct',
       'loser_sig_str_ground_pos_pct'],
      dtype='object')

In [29]:
stats.head()

Unnamed: 0,event_date,method,round,time,time_format,referee,details,bout_type,bonus,event_name,...,winner_stance,loser_stance,winner_sig_str_leg_tar_pct,loser_sig_str_leg_tar_pct,winner_slpm_cs,loser_slpm_cs,winner_sig_str_clinch_acc,loser_sig_str_clinch_acc,winner_sig_str_ground_pos_pct,loser_sig_str_ground_pos_pct
0,26/10/2024,KO/TKO,3,1:34,5 Rnd (5-5-5-5-5),Marc Goddard,Punch to Head At Distance,UFC Featherweight Title Bout,belt,UFC 308: Topuria vs. Holloway,...,Orthodox,Orthodox,20,24,2.5,7.26,0 of 0,0 of 0,5,0
1,26/10/2024,Submission,1,3:34,5 Rnd (5-5-5-5-5),Jason Herzog,Rear Naked Choke,Middleweight Bout,perf,UFC 308: Topuria vs. Holloway,...,Orthodox,Orthodox,0,100,9.03,4.52,0 of 0,0 of 0,100,0
2,26/10/2024,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Vitor Ribeiro,Sal D'amato 28 - 29. Jacob Montalvo 28 - 29. T...,Light Heavyweight Bout,-,UFC 308: Topuria vs. Holloway,...,Orthodox,Orthodox,23,64,3.41,4.23,5 of 9,3 of 6,0,0
3,26/10/2024,Decision - Unanimous,3,5:00,3 Rnd (5-5-5),Mark Smith,Mike Bell 28 - 29. Ben Cartlidge 28 - 29. Hadi...,Featherweight Bout,-,UFC 308: Topuria vs. Holloway,...,Orthodox,Orthodox,7,13,2.86,3.95,12 of 16,6 of 16,5,17
4,26/10/2024,KO/TKO,1,1:30,3 Rnd (5-5-5),Mark Smith,to At Distance Dos Anjos leg injury,Welterweight Bout,-,UFC 308: Topuria vs. Holloway,...,Southpaw,Southpaw,16,25,4.94,3.49,0 of 0,0 of 0,27,0


Setting the previous, correct order up:

In [30]:
stats = stats.loc[:, cols_order]

# Data Cleaning

## NaN values

In [31]:
stats.isnull().sum().sum()

0

Replacing NaN entry fillers with zeros:

In [32]:
stats.isin(["-", "--", "---"]).sum().sum()

14392

In [33]:
stats = stats.replace(["-", "--", "---"], "0")
stats.isin(["-", "--", "---"]).sum().sum()

0

## Duplicates

In [34]:
stats.duplicated().sum()

0

We can see that there are no NaNs or duplicates. Let's get to Feature Engineering.

# Feature Engineering

## First dtype conversion

Convert columns that are in object dtype but where values just consist of intigers only to remove the pool of preprocessing:

In [35]:
stats.dtypes.value_counts()

object     65
float64    12
int64       1
Name: count, dtype: int64

In [36]:
for col in stats.columns:
    try:
        stats[col] = stats[col].astype(float)

    # If not possible, (contains strings) we'll handle it in a minute
    except:
        continue

In [37]:
stats.dtypes.value_counts()

object     43
float64    35
Name: count, dtype: int64

## Process categorical features

Find columns that need to be cleaned:

In [38]:
cols_to_clean = []

for col in stats.columns:
    val = stats.loc[0, col]
    if (
        stats[col].dtype == "object"
        and (col.startswith("loser_") or col.startswith("winner_"))
        and any(char.isdigit() for char in val)
    ):
        cols_to_clean.append(col)

print(f"Number of categorical features to preprocess: {len(cols_to_clean)}")

Number of categorical features to preprocess: 24


Taking a look:

In [39]:
stats.loc[:2, ["winner_total_str", "winner_td_acc_cs", "winner_ctrl"]]

Unnamed: 0,winner_total_str,winner_td_acc_cs,winner_ctrl
0,78 of 147,55%,0:45
1,25 of 31,75%,3:20
2,75 of 142,33%,1:31


## Standardizing Features

In total we have to standardize 3 types of features:
1. Ratio to pct: 75 of 144 => 52 (%)
2. Dropping pct symbol: 85% => 85 (%)
3. Time: 1:31	=> 91 (seconds)

But let's first group the columns into 3 different buckets for simplicity:

In [40]:
"""This way of matching is not safe, and REGEXes should be used instead,
but this is a trivial case and I think we should be fine."""

of_cols = []
no_pct_symbol_cols = []
time_cols = []

for col in cols_to_clean:
    val = stats.loc[0, col]

    if "of" in val:
        of_cols.append(col)
    elif "%" in val:
        no_pct_symbol_cols.append(col)
    elif ":" in val:
        time_cols.append(col)

Let's take a look:

In [41]:
stats.loc[:2, of_cols]

Unnamed: 0,winner_total_str,loser_total_str,winner_sig_str_head_acc,loser_sig_str_head_acc,winner_sig_str_body_acc,loser_sig_str_body_acc,winner_sig_str_leg_acc,loser_sig_str_leg_acc,winner_sig_str_distance_acc,loser_sig_str_distance_acc,winner_sig_str_clinch_acc,loser_sig_str_clinch_acc,winner_sig_str_ground_acc,loser_sig_str_ground_acc
0,78 of 147,84 of 209,49 of 114,47 of 151,11 of 14,13 of 30,15 of 16,19 of 23,71 of 140,79 of 204,0 of 0,0 of 0,4 of 4,0 of 0
1,25 of 31,2 of 2,2 of 3,0 of 0,1 of 1,0 of 0,0 of 0,2 of 2,0 of 1,2 of 2,0 of 0,0 of 0,3 of 3,0 of 0
2,75 of 142,59 of 123,20 of 72,10 of 65,22 of 29,9 of 12,13 of 13,34 of 40,50 of 105,50 of 111,5 of 9,3 of 6,0 of 0,0 of 0


In [42]:
stats.loc[:2, no_pct_symbol_cols]

Unnamed: 0,winner_str_acc_cs,winner_str_def_cs,winner_td_acc_cs,winner_td_def_cs,loser_str_acc_cs,loser_str_def_cs,loser_td_acc_cs,loser_td_def_cs
0,45%,68%,55%,100%,46%,60%,71%,84%
1,72%,66%,75%,0%,40%,60%,27%,84%
2,53%,65%,33%,85%,52%,52%,25%,90%


In [43]:
stats.loc[:2, time_cols]

Unnamed: 0,winner_ctrl,loser_ctrl
0,0:45,0:00
1,3:20,0:00
2,1:31,1:00


Looks good, let's also add _pct postfix to the column that will consist of pct values to differentiate later.

In [44]:
"""of_cols = [col + "_pct" for col in of_cols]
no_pct_symbol_cols = [col + "_pct" for col in no_pct_symbol_cols]"""

'of_cols = [col + "_pct" for col in of_cols]\nno_pct_symbol_cols = [col + "_pct" for col in no_pct_symbol_cols]'

Let's standardize.

### Standardizing fractions into pct % (e.g. from 70 of 140 to 50 (%)):

In [45]:
def convert_ratio_to_pct(row):
    row = row.split("of")

    if re.search(r"\d", row[0]) and int(row[0]) != 0:
        return (int(row[0]) * 100) / int(row[1])
    else:
        return 0

Applying:

In [46]:
for col in of_cols:
    stats[col] = stats[col].apply(convert_ratio_to_pct)

# Rename
name_mappings = {col: f"{col}_pct" for col in of_cols}
stats.rename(columns=name_mappings, inplace=True)

In [47]:
stats[name_mappings.values()].head(3)

Unnamed: 0,winner_total_str_pct,loser_total_str_pct,winner_sig_str_head_acc_pct,loser_sig_str_head_acc_pct,winner_sig_str_body_acc_pct,loser_sig_str_body_acc_pct,winner_sig_str_leg_acc_pct,loser_sig_str_leg_acc_pct,winner_sig_str_distance_acc_pct,loser_sig_str_distance_acc_pct,winner_sig_str_clinch_acc_pct,loser_sig_str_clinch_acc_pct,winner_sig_str_ground_acc_pct,loser_sig_str_ground_acc_pct
0,53.061224,40.191388,42.982456,31.125828,78.571429,43.333333,93.75,82.608696,50.714286,38.72549,0.0,0.0,100.0,0.0
1,80.645161,100.0,66.666667,0.0,100.0,0.0,0.0,100.0,0.0,100.0,0.0,0.0,100.0,0.0
2,52.816901,47.96748,27.777778,15.384615,75.862069,75.0,100.0,85.0,47.619048,45.045045,55.555556,50.0,0.0,0.0


Looks pretty good.

### Standardizing pct features, dropping % symbol (e.g. from 50% to 50)

In [48]:
def strip_pct_symbol(row):
    return row.strip("%")

Applying:

In [49]:
for col in no_pct_symbol_cols:
    stats[col] = stats[col].apply(strip_pct_symbol).astype(float)

stats[no_pct_symbol_cols].head(3)

Unnamed: 0,winner_str_acc_cs,winner_str_def_cs,winner_td_acc_cs,winner_td_def_cs,loser_str_acc_cs,loser_str_def_cs,loser_td_acc_cs,loser_td_def_cs
0,45.0,68.0,55.0,100.0,46.0,60.0,71.0,84.0
1,72.0,66.0,75.0,0.0,40.0,60.0,27.0,84.0
2,53.0,65.0,33.0,85.0,52.0,52.0,25.0,90.0


Looks solid.

### Standardizing time features from 'mm:ss' into 'ss' (e.g. from 1:31 (minutes) to 91 (seconds))

Standardizing time from `mm:ss` into total `s`:

In [50]:
pd.to_timedelta("00:00:45").total_seconds()

45.0

Features to work with:

In [51]:
time_cols

['winner_ctrl', 'loser_ctrl']

#### Replace 0s to match the common format `0:00`

In [52]:
stats.loc[:, "winner_ctrl"] = stats.loc[:, "winner_ctrl"].replace("0", "0:00")
stats.loc[:, "loser_ctrl"] = stats.loc[:, "loser_ctrl"].replace("0", "0:00")

#### Format time schema to make it have the `hh:mm:ss` shape:

In [53]:
stats.loc[:, "winner_ctrl"][40:43]

40     0:34
41     0:44
42    12:02
Name: winner_ctrl, dtype: object

In [54]:
def format_time_schema(row):
    time = row.split(":")

    # If minutes are already in 'mm' format => skip
    if len(time[0]) == 1:
        time[0] = "0" + time[0]

    # Add hours too
    time = ["00"] + time

    return ":".join(time)

In [55]:
stats.loc[:, "winner_ctrl"] = stats.loc[:, "winner_ctrl"].apply(format_time_schema)
stats.loc[:, "loser_ctrl"] = stats.loc[:, "loser_ctrl"].apply(format_time_schema)
stats.loc[:, "winner_ctrl"][40:43]

40    00:00:34
41    00:00:44
42    00:12:02
Name: winner_ctrl, dtype: object

#### Convert to total seconds:

In [56]:
stats["winner_ctrl"].head(3)

0    00:00:45
1    00:03:20
2    00:01:31
Name: winner_ctrl, dtype: object

In [57]:
stats.loc[:, "winner_ctrl"] = pd.to_timedelta(
    stats.loc[:, "winner_ctrl"]
).dt.total_seconds()
stats.loc[:, "loser_ctrl"] = pd.to_timedelta(
    stats.loc[:, "loser_ctrl"]
).dt.total_seconds()

In [58]:
stats["winner_ctrl"].head(3)

0     45.0
1    200.0
2     91.0
Name: winner_ctrl, dtype: object

Looks good.

## Second dtype conversion

Convert some more columns that are of type object but now can now be cleanly converted to numerical dtype since the values are cleaned.

In [59]:
stats.dtypes.value_counts()

float64    57
object     21
Name: count, dtype: int64

In [60]:
for col in stats.columns:
    try:
        stats[col] = stats[col].astype(float)

    # If not possible, (contains strings) we'll handle it in a minute
    except:
        continue

In [61]:
stats.dtypes.value_counts()

float64    59
object     19
Name: count, dtype: int64

## Engineering New Features

Let's engineer some additional features:
1. Striking dominance - a fighter's overall striking performance. Calculated as: KD + Significant strikes % + Total landed strikes %;
2. Wrestling dominance - a fighter's overall wrestling performance. Calculated as: TD % + Submission attempts + reversals;

**Striking dominance**:

In [62]:
stats.columns

Index(['winner_name', 'loser_name', 'event_date', 'winner_nickname',
       'loser_nickname', 'winner_result', 'loser_result', 'method', 'round',
       'time', 'time_format', 'referee', 'details', 'bout_type', 'bonus',
       'event_name', 'event_location', 'winner_KD', 'loser_KD',
       'winner_sig_str_pct', 'loser_sig_str_pct', 'winner_total_str_pct',
       'loser_total_str_pct', 'winner_TD_pct', 'loser_TD_pct',
       'winner_sub_att', 'loser_sub_att', 'winner_rev', 'loser_rev',
       'winner_ctrl', 'loser_ctrl', 'winner_sig_str_head_acc_pct',
       'loser_sig_str_head_acc_pct', 'winner_sig_str_body_acc_pct',
       'loser_sig_str_body_acc_pct', 'winner_sig_str_leg_acc_pct',
       'loser_sig_str_leg_acc_pct', 'winner_sig_str_distance_acc_pct',
       'loser_sig_str_distance_acc_pct', 'winner_sig_str_clinch_acc_pct',
       'loser_sig_str_clinch_acc_pct', 'winner_sig_str_ground_acc_pct',
       'loser_sig_str_ground_acc_pct', 'winner_sig_str_head_tar_pct',
       'loser_sig_str

In [63]:
stats["winner_striking_dominance"] = (
    stats["winner_KD"] + stats["winner_sig_str_pct"] + stats["winner_total_str_pct"]
)
stats["loser_striking_dominance"] = (
    stats["loser_KD"] + stats["loser_sig_str_pct"] + stats["loser_total_str_pct"]
)

**Wrestling dominance**:

In [64]:
stats["winner_wrestling_dominance"] = (
    stats["winner_TD_pct"] + stats["winner_sub_att"] + stats["winner_rev"]
)
stats["loser_wrestling_dominance"] = (
    stats["loser_TD_pct"] + stats["loser_sub_att"] + stats["loser_rev"]
)

### Delta

Let's now decrease the amount of features by x2 times. We're going to use *delta* for this. For example, instead of having both `winner_striking_dominance` and `loser_striking_dominance` features, we're going to just have `delta_striking_dominance`. Which would just be `winner_striking_dominance` - `loser_striking_dominance`. Where a positive value would mean that the winner has a higher striking dominance factor and vice versa for negative.

Let's get the columns that we'll be transforming:

In [65]:
stats.columns

Index(['winner_name', 'loser_name', 'event_date', 'winner_nickname',
       'loser_nickname', 'winner_result', 'loser_result', 'method', 'round',
       'time', 'time_format', 'referee', 'details', 'bout_type', 'bonus',
       'event_name', 'event_location', 'winner_KD', 'loser_KD',
       'winner_sig_str_pct', 'loser_sig_str_pct', 'winner_total_str_pct',
       'loser_total_str_pct', 'winner_TD_pct', 'loser_TD_pct',
       'winner_sub_att', 'loser_sub_att', 'winner_rev', 'loser_rev',
       'winner_ctrl', 'loser_ctrl', 'winner_sig_str_head_acc_pct',
       'loser_sig_str_head_acc_pct', 'winner_sig_str_body_acc_pct',
       'loser_sig_str_body_acc_pct', 'winner_sig_str_leg_acc_pct',
       'loser_sig_str_leg_acc_pct', 'winner_sig_str_distance_acc_pct',
       'loser_sig_str_distance_acc_pct', 'winner_sig_str_clinch_acc_pct',
       'loser_sig_str_clinch_acc_pct', 'winner_sig_str_ground_acc_pct',
       'loser_sig_str_ground_acc_pct', 'winner_sig_str_head_tar_pct',
       'loser_sig_str

In [66]:
def deltafy_data(df):
    """Filters what columns to take into account,
    creates new columns, instead of red/blue makes winner/loser,
    gets data points from red/blue column based on
    the value of the feature 'winner' in that same row."""  # Change this docstring

    # Get the columns, leave features like 'round', 'method', 'time' like they are
    cols_to_drop = [
        col
        for col in df.columns
        if (col.startswith("winner_") or col.startswith("loser_"))
        and df[col].dtype == float
    ]
    # Remove prefixes to replace them with winner/loser instead
    cols_to_deltafy = [
        col.removeprefix("winner_") for col in cols_to_drop if col.startswith("winner_")
    ]

    # Change each column
    for col in cols_to_deltafy:
        # Create new features, sort the data according to the value of the 'winner' featuer
        df[f"delta_{col}"] = df.loc[:, f"winner_{col}"] - df.loc[:, f"loser_{col}"]

    # Drop the red/blue columns to keep only winner/loser
    df.drop(columns=cols_to_drop, inplace=True)
    return df

In [67]:
stats = deltafy_data(stats)
stats.columns

Index(['winner_name', 'loser_name', 'event_date', 'winner_nickname',
       'loser_nickname', 'winner_result', 'loser_result', 'method', 'round',
       'time', 'time_format', 'referee', 'details', 'bout_type', 'bonus',
       'event_name', 'event_location', 'winner_stance', 'loser_stance',
       'winner', 'delta_KD', 'delta_sig_str_pct', 'delta_total_str_pct',
       'delta_TD_pct', 'delta_sub_att', 'delta_rev', 'delta_ctrl',
       'delta_sig_str_head_acc_pct', 'delta_sig_str_body_acc_pct',
       'delta_sig_str_leg_acc_pct', 'delta_sig_str_distance_acc_pct',
       'delta_sig_str_clinch_acc_pct', 'delta_sig_str_ground_acc_pct',
       'delta_sig_str_head_tar_pct', 'delta_sig_str_body_tar_pct',
       'delta_sig_str_leg_tar_pct', 'delta_sig_str_distance_pos_pct',
       'delta_sig_str_clinch_pos_pct', 'delta_sig_str_ground_pos_pct',
       'delta_height', 'delta_reach', 'delta_slpm_cs', 'delta_str_acc_cs',
       'delta_sapm_cs', 'delta_str_def_cs', 'delta_td_avg_cs',
       'delta_

# Saving

Saving preprocessed, cleaned, merged, feature-engineer added, ready for EDA dataset:

In [70]:
stats.to_csv("../stats/stats_processed.csv", sep=";")