In [1]:
import pandas as pd
import re
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

### Objective

> in order to predict the outcome based on fighting style + others I need to mimic the stats that I can input for upcoming fights -- that info is available 

1. -- split columns like x of y into x_attempted and x_successful -- these are:  
- f1_sig_str + f2_sig_str
- f1_td + f2_td


2. -- get fight time in seconds   
-length_in_seconds
3. 
| Statistic                                                   | Calculation                                                           |
|-------------------------------------------------------------|-----------------------------------------------------------------------|
| SLpM - Significant Strikes Landed per Minute                | fx_slmp = (fx_sig_str_success / length_in_seconds) x 60              |
| Str. Acc. - Significant Striking Accuracy                   | fx_str_acc = (fx_sig_str_success / f1_sig_str_attempted) x 100      |
| SApM - Significant Strikes Absorbed per Minute             | fx_sapm = fy_slmp                                                    |
| Str. Def. - Significant Strike Defence                      | fx_str_def = 100 - fy_str_acc                                         |
| TD Avg. - Average Takedowns Landed per 15 minutes           | fx_td_avg = (fx_td_success / length_in_seconds) x 60 * 15           |
| TD Acc. - Takedown Accuracy                                 | fx_td_acc = fx_td_%                                                   |
| TD Def. - Takedown Defense                                  | fx_td_def = 100 - fy_td_%                                             |
| Sub. Avg. - Average Submissions Attempted per 15 minutes    | fx_sub_avg = (fx_sub_att / length_in_seconds) x 60                   |


In [2]:
# Load the dataset
data = pd.read_csv('../3_cluster_fight_style/data_with_win_percentages.csv')
data.shape

(7289, 75)

**1. -- split columns like x of y into x_attempted and x_successful -- these are:**

In [3]:
# Function to split '2 out of 3' into two separate columns
def split_success_attempted(df, column_name):
    # Split the column into a DataFrame with two columns
    split_data = df[column_name].str.strip().str.split(expand=True)
    
    # Assign the split data to new columns
    df[f'{column_name}_success'] = pd.to_numeric(split_data[0], errors='coerce')
    df[f'{column_name}_attempted'] = pd.to_numeric(split_data[2], errors='coerce')

In [4]:
columns_to_split = ['f1_sig_str', 'f2_sig_str', 'f1_td', 'f2_td']

for column in columns_to_split:
    split_success_attempted(data, column)

# Drop the original columns if needed
data.drop(columns=columns_to_split, inplace=True)

# Check the transformed DataFrame
display(data)

Unnamed: 0,end_time,f1,f2,fight_date,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_weight,gender,year,name_x,winner_cluster,name_y,loser_cluster,win_percentage,f1_sig_str_success,f1_sig_str_attempted,f2_sig_str_success,f2_sig_str_attempted,f1_td_success,f1_td_attempted,f2_td_success,f2_td_attempted
0,5:00,Lerone Murphy,Edson Barboza,2024-05-18,Decision - Unanimous,Herb Dean,5,5,featherweight,f1,27 of 34,13 of 17,5:19,205 of 344,"Jul 22, 1991",2 of 3,172 of 308,175.26,0,21 of 22,185.42,Record: 14-0-1,0,2.60,60%,5.01,Orthodox,54%,60%,0.6,0,66%,52%,1.59,46%,259 of 410,65.77,15 of 21,2 of 3,5:19,77 of 239,"Jan 21, 1986",0 of 0,38 of 189,180.34,0,26 of 32,190.50,Record: 24-12-0,0,4.65,32%,4.11,Orthodox,44%,56%,0.1,0,0,50%,0.45,72%,79 of 242,65.77,M,2024,Lerone Murphy,4,Edson Barboza,4,50.0,220,364,79,242,4,6,0,0
1,1:30,Khaos Williams,Carlston Harris,2024-05-18,KO/TKO,Dan Miragliotta,1,3,welterweight,f1,2 of 4,0 of 0,0:01,11 of 19,"Mar 30, 1994",1 of 1,5 of 9,182.88,1,5 of 7,195.58,Record: 15-3-0,0,5.42,60%,5.81,Orthodox,39%,42%,0.0,0,0,0%,0.00,80%,12 of 20,77.11,1 of 2,0 of 0,0:01,14 of 30,"Jul 09, 1987",0 of 0,10 of 25,182.88,0,3 of 3,193.04,Record: 19-6-0,0,2.66,46%,3.18,Orthodox,49%,52%,0.8,0,0,29%,1.97,55%,14 of 30,77.11,M,2024,Khaos Williams,4,Carlston Harris,2,52.0,12,20,14,30,0,0,0,0
2,5:00,Themba Gorimbo,Ramiz Brahimaj,2024-05-18,Decision - Unanimous,Mark Smith,3,3,welterweight,f1,31 of 35,35 of 38,10:13,6 of 10,"Jan 23, 1991",2 of 4,9 of 13,185.42,0,3 of 4,195.58,Record: 13-4-0,1,1.08,82%,2.53,Orthodox,69%,46%,0.0,0,83%,58%,4.04,78%,110 of 134,77.11,3 of 4,3 of 4,10:13,0 of 2,"Nov 17, 1992",0 of 0,0 of 2,177.80,0,0 of 0,182.88,Record: 10-5-0,0,3.69,50%,1.74,Orthodox,41%,45%,1.6,0,12%,35%,1.61,44%,15 of 20,77.11,M,2024,Themba Gorimbo,2,Ramiz Brahimaj,2,50.0,43,52,3,6,5,6,1,8
3,2:47,Adrian Yanez,Vinicius Salvador,2024-05-18,KO/TKO,Chris Tognoni,1,3,bantamweight,f1,1 of 2,0 of 0,0:17,15 of 28,"Nov 29, 1993",21 of 28,27 of 45,170.18,1,8 of 9,177.80,Record: 17-5-0,0,5.75,64%,6.51,Orthodox,41%,57%,0.0,0,0,0%,0.00,100%,36 of 58,61.23,6 of 6,0 of 0,0:17,17 of 34,"Jul 24, 1996",0 of 0,6 of 20,170.18,0,5 of 8,177.80,Record: 14-7-0,0,6.50,50%,5.48,Southpaw,43%,48%,0.0,0,0,11%,0.36,86%,17 of 34,61.23,M,2024,Adrian Yanez,4,Vinicius Salvador,4,50.0,36,56,17,34,0,0,0,0
4,4:12,Angela Hill,Luana Pinheiro,2024-05-18,Submission,Herb Dean,2,3,strawweight,f1,9 of 15,3 of 3,3:06,21 of 71,"Jan 12, 1985",4 of 5,19 of 64,160.02,0,0 of 0,162.56,Record: 17-13-0,1,4.86,35%,5.38,Orthodox,49%,61%,0.1,1,33%,33%,0.80,76%,39 of 91,52.16,3 of 6,2 of 6,3:06,15 of 54,"Nov 18, 1992",0 of 0,11 of 51,157.48,0,3 of 3,157.48,Record: 11-3-0,0,4.17,28%,3.57,Orthodox,40%,63%,0.2,0,14%,33%,2.24,66%,25 of 77,52.16,M,2024,Angela Hill,4,Luana Pinheiro,4,50.0,28,79,17,60,2,6,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7284,0:39,Jerry Bohlander,Nick Sanzo,1997-02-07,Submission,John McCarthy,1,1,lightweight,f1,0 of 0,2 of 2,0,0 of 2,"Feb 12, 1974",0 of 0,2 of 3,180.34,0,0 of 1,185.29,Record: 11-4-0,0,0.00,50%,0.00,Orthodox,0%,0%,0.0,1,100%,0%,0.00,0%,3 of 5,90.26,0 of 0,0 of 0,0,0 of 0,,0 of 0,0 of 0,175.26,0,0 of 0,179.62,Record: 1-1-0,0,0.00,0,0.00,Orthodox,0%,0%,0.0,0,0%,0%,0.00,0%,0 of 0,86.18,M,1997,Jerry Bohlander,3,Nick Sanzo,3,50.0,2,4,0,0,1,1,0,1
7285,1:17,Vitor Belfort,Tra Telligman,1997-02-07,KO/TKO,John McCarthy,1,1,heavyweight,f1,0 of 0,5 of 15,0,2 of 4,"Apr 01, 1977",10 of 10,17 of 29,182.88,1,0 of 0,187.96,Record: 26-14-0 (1 NC),0,2.83,58%,1.59,Southpaw,46%,50%,0.6,0,0,60%,1.02,53%,23 of 35,83.91,1 of 1,1 of 6,0,0 of 1,"Feb 07, 1965",0 of 0,0 of 6,187.96,0,0 of 0,193.07,Record: 7-5-1,0,3.02,14%,1.83,Orthodox,35%,56%,0.0,0,0,100%,0.87,80%,1 of 7,105.69,M,1997,Vitor Belfort,2,Tra Telligman,2,50.0,17,29,1,7,0,0,0,0
7286,8:02,Scott Ferrozzo,Jim Mullen,1997-02-07,KO/TKO,John McCarthy,1,1,heavyweight,f1,6 of 6,11 of 17,0,2 of 3,"Apr 26, 1965",17 of 22,23 of 35,180.34,0,1 of 1,185.29,Record: 4-2-0,0,0.00,71%,0.00,Orthodox,0%,0%,0.0,1,100%,0%,0.00,0%,63 of 78,146.51,0 of 0,2 of 2,0,1 of 3,,0 of 0,3 of 5,185.42,0,0 of 0,190.38,Record: 0-2-0,0,0.00,60%,0.00,Southpaw,0%,0%,0.0,0,0,0%,0.00,0%,4 of 7,97.52,M,1997,Scott Ferrozzo,3,Jim Mullen,3,50.0,30,42,3,5,2,2,0,0
7287,3:00,Yoshiki Takahashi,Wallid Ismail,1997-02-07,Decision - Unanimous,John McCarthy,2,1,lightweight,f1,0 of 0,3 of 6,0,9 of 27,"Mar 13, 1969",8 of 10,20 of 43,180.34,0,0 of 0,185.29,Record: 30-27-3 (1 NC),0,6.47,46%,2.26,Southpaw,51%,44%,2.3,0,100%,0%,0.00,66%,72 of 99,90.26,0 of 0,0 of 0,0,1 of 17,"Feb 23, 1968",0 of 0,1 of 17,170.18,0,0 of 0,174.24,Record: 9-3-0,0,0.83,5%,0.92,Orthodox,31%,44%,0.5,0,10%,28%,1.85,0%,19 of 38,92.99,M,1997,Yoshiki Takahashi,1,Wallid Ismail,1,50.0,20,43,1,17,1,1,1,10


In [5]:
data['gender'].value_counts()

gender
M    7289
Name: count, dtype: int64

**2. -- get fight time in seconds** 

In [6]:
def time_to_seconds(time_str):
    # Split the time
    minutes, seconds = map(int, time_str.split(':'))
    
    # Convert hours and minutes to seconds
    total_seconds = minutes * 60 + seconds
    
    return total_seconds

In [7]:
data['end_time_seconds'] = data['end_time'].apply(time_to_seconds)
# get time in seconds of last round + (rounds - one + 300) for full rounds
data['length_in_seconds'] = data['end_time_seconds'] + (data['round'] -1 ) * 300
data.drop(columns='length_in_seconds')
data[['length_in_seconds', 'end_time', 'round']]

Unnamed: 0,length_in_seconds,end_time,round
0,1500,5:00,5
1,90,1:30,1
2,900,5:00,3
3,167,2:47,1
4,552,4:12,2
...,...,...,...
7284,39,0:39,1
7285,77,1:17,1
7286,482,8:02,1
7287,480,3:00,2


In [8]:
data.isna().sum()[data.isna().sum() > 0]

f1_dob    21
f2_dob    94
dtype: int64

In [9]:
import dis
import pandas as pd

# Calculate statistics for fighter 1 (f1)
fight_stats = pd.DataFrame()

fight_stats['method'] = data['method']
fight_stats['referee'] = data['referee']
fight_stats['weightclass'] = data['weightclass']
fight_stats['winner'] = 'f1'

fight_stats['f1'] = data['f1']
fight_stats['f2'] = data['f2']

fight_stats['f1_slmp'] = (data['f1_sig_str_success'] / data['length_in_seconds']) * 60
fight_stats['f2_slmp'] = (data['f2_sig_str_success'] / data['length_in_seconds']) * 60

fight_stats['f1_str_acc_%'] = (data['f1_sig_str_success'] / data['f1_sig_str_attempted']) * 100
fight_stats['f2_str_acc_%'] = (data['f2_sig_str_success'] / data['f2_sig_str_attempted']) * 100

fight_stats['f1_sapm'] = fight_stats['f2_slmp']
fight_stats['f2_sapm'] = fight_stats['f1_slmp']

fight_stats['f1_str_def_%'] = 100 - fight_stats['f2_str_acc_%']
fight_stats['f2_str_def_%'] = 100 - fight_stats['f1_str_acc_%']

fight_stats['f1_td_avg'] = (data['f1_td_success'] / data['length_in_seconds']) * 60 * 15
fight_stats['f2_td_avg'] = (data['f2_td_success'] / data['length_in_seconds']) * 60 * 15

data['f1_td_%'] = pd.to_numeric(data['f1_td_%'].str.rstrip('%'), errors='coerce')
data['f2_td_%'] = pd.to_numeric(data['f2_td_%'].str.rstrip('%'), errors='coerce')

fight_stats['f1_td_acc_%'] = data['f1_td_%'] 
fight_stats['f2_td_acc_%'] = data['f2_td_%'] 

fight_stats['f1_td_def_%'] = 100 - data['f2_td_%']  
fight_stats['f2_td_def_%'] = 100 - data['f1_td_%']  

fight_stats['f1_sub_avg'] = (data['f1_sub_att'] / data['length_in_seconds']) * 60
fight_stats['f2_sub_avg'] = (data['f2_sub_att'] / data['length_in_seconds']) * 60

fight_stats['f1_height'] = data['f1_height'] 
fight_stats['f2_height'] = data['f2_height'] 

fight_stats['f1_weight'] = data['f1_weight'] 
fight_stats['f2_weight'] = data['f2_weight'] 

fight_stats['f1_reach'] = data['f1_reach'] 
fight_stats['f2_reach'] = data['f2_reach'] 

fight_stats['f1_stance'] = data['f1_stance'] 
fight_stats['f2_stance'] = data['f2_stance'] 

fight_stats = fight_stats.round(2)

In [10]:
fight_stats.isna().sum()[fight_stats.isna().sum() > 0]

f1_str_acc_%    20
f2_str_acc_%    45
f1_str_def_%    45
f2_str_def_%    20
dtype: int64

In [11]:
columns_with_null = ['f1_str_acc_%', 'f2_str_acc_%', 'f1_str_def_%', 'f2_str_def_%']
rows_with_null = fight_stats[fight_stats[columns_with_null].isnull().any(axis=1)]
display(rows_with_null.head(2))
columns_with_null = ['f1_str_acc_%', 'f2_str_acc_%', 'f1_str_def_%', 'f2_str_def_%']
rows_with_null_index = fight_stats[fight_stats[columns_with_null].isnull().any(axis=1)].index
rows_with_null_data = data.loc[rows_with_null_index]
display(rows_with_null_data.head(2))

Unnamed: 0,method,referee,weightclass,winner,f1,f2,f1_slmp,f2_slmp,f1_str_acc_%,f2_str_acc_%,f1_sapm,f2_sapm,f1_str_def_%,f2_str_def_%,f1_td_avg,f2_td_avg,f1_td_acc_%,f2_td_acc_%,f1_td_def_%,f2_td_def_%,f1_sub_avg,f2_sub_avg,f1_height,f2_height,f1_weight,f2_weight,f1_reach,f2_reach,f1_stance,f2_stance
495,Submission,Chris Tognoni,flyweight,f1,Karine Silva,Ketlen Souza,1.71,0.0,30.0,,0.0,1.71,,70.0,8.57,0.0,100,0,100,0,0.57,0.0,165.1,160.02,56.7,52.16,170.18,160.02,Orthodox,Orthodox
997,Submission,Mark Smith,heavyweight,f1,Jailton Almeida,Parker Porter,3.93,0.0,52.94,,0.0,3.93,,47.06,3.27,0.0,50,0,100,50,0.22,0.0,190.5,182.88,92.99,120.2,200.66,190.5,Orthodox,Orthodox


Unnamed: 0,end_time,f1,f2,fight_date,method,referee,round,time_format,weightclass,winner,f1_body,f1_clinch,f1_control,f1_distance,f1_dob,f1_ground,f1_head,f1_height,f1_kd,f1_leg,f1_reach,f1_record,f1_rev,f1_sapm,f1_sig_str_%,f1_slpm,f1_stance,f1_str__acc_,f1_str__def,f1_sub__avg_,f1_sub_att,f1_td_%,f1_td_acc_,f1_td_avg_,f1_td_def_,f1_total_str,f1_weight,f2_body,f2_clinch,f2_control,f2_distance,f2_dob,f2_ground,f2_head,f2_height,f2_kd,f2_leg,f2_reach,f2_record,f2_rev,f2_sapm,f2_sig_str_%,f2_slpm,f2_stance,f2_str__acc_,f2_str__def,f2_sub__avg_,f2_sub_att,f2_td_%,f2_td_acc_,f2_td_avg_,f2_td_def_,f2_total_str,f2_weight,gender,year,name_x,winner_cluster,name_y,loser_cluster,win_percentage,f1_sig_str_success,f1_sig_str_attempted,f2_sig_str_success,f2_sig_str_attempted,f1_td_success,f1_td_attempted,f2_td_success,f2_td_attempted,end_time_seconds,length_in_seconds
495,1:45,Karine Silva,Ketlen Souza,2023-06-03,Submission,Chris Tognoni,1,3,flyweight,f1,0 of 2,0 of 0,1:21,2 of 7,"Dec 02, 1993",1 of 3,1 of 6,165.1,0,2 of 2,170.18,Record: 18-4-0,0,3.12,30%,2.67,Orthodox,35%,49%,2.2,1,100,72%,3.59,0%,4 of 12,56.7,0 of 0,0 of 0,1:21,0 of 0,"Aug 18, 1994",0 of 0,0 of 0,160.02,0,0 of 0,160.02,Record: 14-4-0,0,2.03,0,4.78,Orthodox,68%,60%,0.0,0,0,50%,0.9,50%,1 of 1,52.16,M,2023,Karine Silva,2,Ketlen Souza,4,48.0,3,10,0,0,1,1,0,0,105,105
997,4:35,Jailton Almeida,Parker Porter,2022-05-21,Submission,Mark Smith,1,3,heavyweight,f1,2 of 2,0 of 0,4:29,1 of 1,"Jun 26, 1991",17 of 33,16 of 32,190.5,0,0 of 0,200.66,Record: 20-3-0,0,0.74,52%,2.54,Orthodox,64%,40%,2.2,1,50,59%,6.88,75%,36 of 58,92.99,0 of 0,0 of 0,4:29,0 of 0,"Apr 22, 1985",0 of 0,0 of 0,182.88,0,0 of 0,190.5,Record: 14-9-0,0,6.32,0,6.55,Orthodox,50%,51%,0.5,0,0,26%,1.35,66%,0 of 0,120.2,M,2022,Jailton Almeida,2,Parker Porter,4,48.0,18,34,0,0,1,2,0,0,275,275


In [12]:
# nan value bc dividing by 0 
fight_stats.fillna('0', inplace=True)
fight_stats.isna().sum().sum()

0

In [13]:
fight_stats.head(3)

Unnamed: 0,method,referee,weightclass,winner,f1,f2,f1_slmp,f2_slmp,f1_str_acc_%,f2_str_acc_%,f1_sapm,f2_sapm,f1_str_def_%,f2_str_def_%,f1_td_avg,f2_td_avg,f1_td_acc_%,f2_td_acc_%,f1_td_def_%,f2_td_def_%,f1_sub_avg,f2_sub_avg,f1_height,f2_height,f1_weight,f2_weight,f1_reach,f2_reach,f1_stance,f2_stance
0,Decision - Unanimous,Herb Dean,featherweight,f1,Lerone Murphy,Edson Barboza,8.8,3.16,60.44,32.64,3.16,8.8,67.36,39.56,2.4,0.0,66,0,100,34,0.0,0.0,175.26,180.34,65.77,65.77,185.42,190.5,Orthodox,Orthodox
1,KO/TKO,Dan Miragliotta,welterweight,f1,Khaos Williams,Carlston Harris,8.0,9.33,60.0,46.67,9.33,8.0,53.33,40.0,0.0,0.0,0,0,100,100,0.0,0.0,182.88,182.88,77.11,77.11,195.58,193.04,Orthodox,Orthodox
2,Decision - Unanimous,Mark Smith,welterweight,f1,Themba Gorimbo,Ramiz Brahimaj,2.87,0.2,82.69,50.0,0.2,2.87,50.0,17.31,5.0,1.0,83,12,88,17,0.0,0.0,185.42,177.8,77.11,77.11,195.58,182.88,Orthodox,Orthodox


In [14]:
fight_stats.to_csv('../feature_engineered_fight.csv', index=False)