In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import pdb
  

In [2]:
df = pd.read_csv("../datasets/merged_stats_n_scorecards/merged_stats_n_scorecards.csv", sep=';')
df.tail()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_leg_pct,blue_fighter_sig_str_leg_pct,red_fighter_sig_str_distance_pct,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct,red_fighter_total_pts,blue_fighter_total_pts
2138,FELICE HERRIG,VIRNA JANDIROBA,15/08/2020,Lil Bulldog,Carcara,L,W,Submission,1,1:44,...,100,0,100,0,0,0,0,100,- - -,- - -
2139,TJ BROWN,DANNY CHAVEZ,15/08/2020,Downtown,The Colombian Warrior,L,W,Decision - Unanimous,3,5:00,...,13,36,88,81,11,0,0,18,28 28 28,29 29 29
2140,ASHLEY YODER,LIVINHA SOUZA,15/08/2020,SpiderMonkey,The Brazilian Gangsta,L,W,Decision - Unanimous,3,5:00,...,9,50,100,78,0,10,0,10,28 27 28,29 30 29
2141,CHRIS DAUKAUS,PARKER PORTER,15/08/2020,-,-,W,L,KO/TKO,1,4:28,...,10,23,78,82,16,17,5,0,- - -,- - -
2142,KAI KAMAKA,TONY KELLEY,15/08/2020,The Fighting Hawaiian,Primetime,W,L,Decision - Unanimous,3,5:00,...,0,14,78,66,19,33,1,0,29 29 29,28 28 28


# Establishing questions

### Here are a few questions I want to be answered by the end of this notebook:
1. What are the most popular locations that UFC hosts its events in?
2. What is the historical average amount of UFC fights hosted annually?
3. What are the top methods of winning?
4. What are the main fight tactics contributing to a victory?
5. Does being in a particular corner contribute to a victory? (meme, but still)
6. What fight features contribute to the fight bonuses the most?

# Data preprocessing

### Summing scorecards from all 3 judges into one

In [4]:
df.loc[:, 'red_fighter_total_pts'] = df['red_fighter_total_pts'].apply(lambda x: sum(int(n) for n in x.split()) if x[0].isnumeric() else "-")
df.loc[:, 'blue_fighter_total_pts'] = df['blue_fighter_total_pts'].apply(lambda x: sum(int(n) for n in x.split()) if x[0].isnumeric() else "-")

### Creating a single *winner* feature instead of two *red_fighter_result* and *blue_fighter_result* features

In [13]:
df.loc[:, 'winner'] = df['red_fighter_result'].apply(lambda x: 'red' if x == 'W' else 'blue')

### Changing columns from *red/blue+feature name* to *winner/loser+feature name*

In [32]:
def define_winner_and_loser(red_fighter_feature, blue_fighter_feature):
    
    if df['winner'] == 'red':
        pdb.set_trace()
        winner_feature = red_fighter_feature
        loser_feature = blue_fighter_feature
    else:
        winner_feature = blue_fighter_feature
        loser_feature = red_fighter_feature
        
    return winner_feature, loser_feature

In [33]:
df.loc[:, 'winner_fighter_name'], df.loc[:, 'loser_fighter_name'] = 

#df.loc[:, 'winner_fighter_name'] = df.apply(lambda row: row['red_fighter_name'] if row['winner'] == 'red' else row['blue_fighter_name'], axis=1)
#df.loc[:, 'loser_fighter_name'] = df.apply(lambda row: row['red_fighter_name'] if row['winner'] == 'blue' else row['blue_fighter_name'], axis=1)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [19]:
df.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct,red_fighter_total_pts,blue_fighter_total_pts,winner,winner_fighter_name,loser_fighter_name
0,ILIA TOPURIA,MAX HOLLOWAY,26/10/2024,El Matador,Blessed,W,L,KO/TKO,3,1:34,...,100,0,0,5,0,59,55,red,ILIA TOPURIA,MAX HOLLOWAY
1,ROBERT WHITTAKER,KHAMZAT CHIMAEV,26/10/2024,The Reaper,Borz,L,W,Submission,1,3:34,...,0,0,0,0,100,-,-,blue,KHAMZAT CHIMAEV,ROBERT WHITTAKER
2,MAGOMED ANKALAEV,ALEKSANDAR RAKIC,26/10/2024,-,Rocket,W,L,Decision - Unanimous,3,5:00,...,94,9,5,0,0,87,84,red,MAGOMED ANKALAEV,ALEKSANDAR RAKIC
3,LERONE MURPHY,DAN IGE,26/10/2024,The Miracle,50K,W,L,Decision - Unanimous,3,5:00,...,69,23,13,5,17,87,84,red,LERONE MURPHY,DAN IGE
4,SHARA MAGOMEDOV,ARMEN PETROSYAN,26/10/2024,Bullet,Superman,W,L,KO/TKO,2,4:52,...,97,3,2,0,0,30,27,red,SHARA MAGOMEDOV,ARMEN PETROSYAN


# Data Cleaning

## NaN values

In [8]:
df.head()

Unnamed: 0,red_fighter_name,blue_fighter_name,event_date,red_fighter_nickname,blue_fighter_nickname,red_fighter_result,blue_fighter_result,method,round,time,...,red_fighter_sig_str_leg_pct,blue_fighter_sig_str_leg_pct,red_fighter_sig_str_distance_pct,blue_fighter_sig_str_distance_pct,red_fighter_sig_str_clinch_pct,blue_fighter_sig_str_clinch_pct,red_fighter_sig_str_ground_pct,blue_fighter_sig_str_ground_pct,red_fighter_total_pts,blue_fighter_total_pts
0,ILIA TOPURIA,MAX HOLLOWAY,26/10/2024,El Matador,Blessed,W,L,KO/TKO,3,1:34,...,20,24,94,100,0,0,5,0,59,55
1,ROBERT WHITTAKER,KHAMZAT CHIMAEV,26/10/2024,The Reaper,Borz,L,W,Submission,1,3:34,...,100,0,100,0,0,0,0,100,-,-
2,MAGOMED ANKALAEV,ALEKSANDAR RAKIC,26/10/2024,-,Rocket,W,L,Decision - Unanimous,3,5:00,...,23,64,90,94,9,5,0,0,87,84
3,LERONE MURPHY,DAN IGE,26/10/2024,The Miracle,50K,W,L,Decision - Unanimous,3,5:00,...,7,13,71,69,23,13,5,17,87,84
4,SHARA MAGOMEDOV,ARMEN PETROSYAN,26/10/2024,Bullet,Superman,W,L,KO/TKO,2,4:52,...,18,58,96,97,3,2,0,0,30,27


In [9]:
df.columns

Index(['red_fighter_name', 'blue_fighter_name', 'event_date',
       'red_fighter_nickname', 'blue_fighter_nickname', 'red_fighter_result',
       'blue_fighter_result', 'method', 'round', 'time', 'time_format',
       'referee', 'details', 'bout_type', 'bonus', 'event_name',
       'event_location', 'red_fighter_KD', 'blue_fighter_KD',
       'red_fighter_sig_str', 'blue_fighter_sig_str',
       'red_fighter_sig_str_pct', 'blue_fighter_sig_str_pct',
       'red_fighter_total_str', 'blue_fighter_total_str', 'red_fighter_TD',
       'blue_fighter_TD', 'red_fighter_TD_pct', 'blue_fighter_TD_pct',
       'red_fighter_sub_att', 'blue_fighter_sub_att', 'red_fighter_rev',
       'blue_fighter_rev', 'red_fighter_ctrl', 'blue_fighter_ctrl',
       'red_fighter_sig_str_head', 'blue_fighter_sig_str_head',
       'red_fighter_sig_str_body', 'blue_fighter_sig_str_body',
       'red_fighter_sig_str_leg', 'blue_fighter_sig_str_leg',
       'red_fighter_sig_str_distance', 'blue_fighter_sig_str_distan