In [12]:
import os
import pandas as pd
import numpy as np

In [13]:
# Directory holding the CSV files: the current working directory with 'src'
# swapped for 'data'.
# NOTE(review): this assumes the notebook is launched from the project's
# `src` directory and that 'src' appears nowhere else in the path — confirm.
DATA_DIR: str = os.getcwd().replace('src', 'data')

# Input file read by the cleaning step.
RAW_DATA_PATH: str = os.path.join(DATA_DIR, 'ufc-raw.csv')

# Output file written by the cleaning step. Built from the file name rather
# than by string-replacing 'raw' in the full path, so a directory whose name
# happens to contain 'raw' is never rewritten.
CLEAN_DATA_PATH: str = os.path.join(DATA_DIR, 'ufc-clean.csv')

In [14]:
def clean_dataframe(df_path: str, *args, **kwargs) -> pd.DataFrame:
    """Load a raw CSV file and return a cleaned DataFrame.

    Cleaning steps, applied in order:

    1. Read the CSV located at ``df_path``.
    2. Fill missing values column by column: ``0`` for columns whose dtype
       kind is one of ``'biufc'`` (bool, signed/unsigned int, float,
       complex), and ``''`` for every other column.
    3. Normalize column names: lowercase, remove every space, and replace
       ``-`` with ``_``.
    4. Cast the ``r_odds`` column to ``int64``.

    Parameters
    ----------
    df_path : str
        Path of the CSV file to clean. The file must yield an ``r_odds``
        column once the names are normalized.
    *args
        Accepted for backward compatibility; unused.
    **kwargs
        save : bool, default True
            When truthy, the cleaned frame is written (without its index)
            to the module-level ``CLEAN_DATA_PATH``.

    Returns
    -------
    pd.DataFrame
        The cleaned frame.
    """

    def _fill_missing(col: pd.Series) -> pd.Series:
        # Numeric-like columns get 0, everything else gets an empty string.
        return col.fillna(0) if col.dtype.kind in 'biufc' else col.fillna('')

    def _normalize_name(col_name: str) -> str:
        return col_name.lower().replace(' ', '').replace('-', '_')

    df: pd.DataFrame = (
        # Read the file the caller asked for, not the RAW_DATA_PATH global.
        pd.read_csv(df_path)
        .apply(_fill_missing)
        .rename(columns=_normalize_name)
        # Missing values were already filled above, so a plain cast suffices.
        .assign(r_odds=lambda frame: frame.r_odds.astype('int64'))
    )

    if kwargs.get('save', True):
        df.to_csv(CLEAN_DATA_PATH, index=False)

    return df

In [15]:
# Run the cleaning step with RAW_DATA_PATH as its argument. `save` is left at
# its default (True), so the cleaned frame is also written to CLEAN_DATA_PATH.
df_clean: pd.DataFrame = clean_dataframe(RAW_DATA_PATH)
# Display (rows, columns) of the cleaned frame.
df_clean.shape

(4896, 119)

In [19]:
# Summary statistics for every column whose name contains 'avg'.
df_clean.filter(like='avg').describe()

Unnamed: 0,b_avg_sig_str_landed,b_avg_sig_str_pct,b_avg_sub_att,b_avg_td_landed,b_avg_td_pct,r_avg_sig_str_landed,r_avg_sig_str_pct,r_avg_sub_att,r_avg_td_landed,r_avg_td_pct,avg_sub_att_dif,avg_td_dif
count,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0,4896.0
mean,21.311218,0.37525,0.418863,1.0383,0.258762,24.850377,0.421553,0.479913,1.23987,0.304719,-0.06105,-0.20157
std,21.483893,0.19622,0.647963,1.306088,0.255765,20.529765,0.156404,0.641705,1.275255,0.236352,0.848169,1.701414
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-8.3,-11.0
25%,2.68,0.33,0.0,0.0,0.0,4.555,0.377708,0.0,0.25,0.1225,-0.419643,-1.0
50%,17.0,0.429321,0.1,0.666667,0.25,23.5,0.4425,0.3,1.0,0.300208,0.0,0.0
75%,35.131159,0.5,0.666667,1.571429,0.42,38.0,0.506992,0.75,1.888889,0.45,0.25,0.6
max,154.0,1.0,7.0,10.86,1.0,141.0,1.0,8.3,12.5,1.0,6.0,10.86


In [16]:
# Print a concise summary of the cleaned frame: index range, column count,
# dtype counts and memory usage.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4896 entries, 0 to 4895
Columns: 119 entries, r_fighter to b_ko_odds
dtypes: bool(1), float64(57), int64(47), object(14)
memory usage: 4.4+ MB


In [17]:
# Preview 10 random rows. The seed is fixed so the same rows are shown on
# every Restart & Run All, which keeps the saved output reproducible.
df_clean.sample(10, random_state=42)

Unnamed: 0,r_fighter,b_fighter,r_odds,b_odds,r_ev,b_ev,date,location,country,winner,title_bout,weight_class,gender,no_of_rounds,b_current_lose_streak,b_current_win_streak,b_draw,b_avg_sig_str_landed,b_avg_sig_str_pct,b_avg_sub_att,b_avg_td_landed,b_avg_td_pct,b_longest_win_streak,b_losses,b_total_rounds_fought,b_total_title_bouts,b_win_by_decision_majority,b_win_by_decision_split,b_win_by_decision_unanimous,b_win_by_ko/tko,b_win_by_submission,b_win_by_tko_doctor_stoppage,b_wins,b_stance,b_height_cms,b_reach_cms,b_weight_lbs,r_current_lose_streak,r_current_win_streak,r_draw,r_avg_sig_str_landed,r_avg_sig_str_pct,r_avg_sub_att,r_avg_td_landed,r_avg_td_pct,r_longest_win_streak,r_losses,r_total_rounds_fought,r_total_title_bouts,r_win_by_decision_majority,r_win_by_decision_split,r_win_by_decision_unanimous,r_win_by_ko/tko,r_win_by_submission,r_win_by_tko_doctor_stoppage,r_wins,r_stance,r_height_cms,r_reach_cms,r_weight_lbs,r_age,b_age,lose_streak_dif,win_streak_dif,longest_win_streak_dif,win_dif,loss_dif,total_round_dif,total_title_bout_dif,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1,b_match_weightclass_rank,r_match_weightclass_rank,r_women'sflyweight_rank,r_women'sfeatherweight_rank,r_women'sstrawweight_rank,r_women'sbantamweight_rank,r_heavyweight_rank,r_lightheavyweight_rank,r_middleweight_rank,r_welterweight_rank,r_lightweight_rank,r_featherweight_rank,r_bantamweight_rank,r_flyweight_rank,r_pound_for_pound_rank,b_women'sflyweight_rank,b_women'sfeatherweight_rank,b_women'sstrawweight_rank,b_women'sbantamweight_rank,b_heavyweight_rank,b_lightheavyweight_rank,b_middleweight_rank,b_welterweight_rank,b_lightweight_rank,b_featherweight_rank,b_bantamweight_rank,b_flyweight_rank,b_pound_for_pound_rank,better_rank,finish,finish_details,finish_round,finish_round_time,total_fight_time_secs,r_dec_odds,b_dec_odds,r_sub_odds,b_sub_odds,r_ko_odds,b_ko_odds
1224,Maycee Barber,JJ Aldrich,-275,235,36.363636,235.0,3/23/2019,"Nashville, Tennessee, USA",USA,Red,False,Women's Flyweight,FEMALE,3,0,3,0,55.75,0.405,0.0,0.25,0.05,3,1,12,0,0,0,3,0,0,0,3,Southpaw,165.1,170.18,125,0,1,0,68.0,0.7,0.0,1.0,0.2,1,0,2,0,0,0,0,1,0,0,1,Switch,165.1,165.1,125,20,26,0,2,2,2,-1,10,0,-1,0,0.0,5.08,-6,-12.25,0.0,-0.75,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,KO/TKO,Punches,2.0,3:01,481.0,160.0,325.0,375.0,1200.0,250.0,1600.0
2532,Valerie Letourneau,Joanne Calderwood,-150,130,66.666667,130.0,6/18/2016,"Ottawa, Ontario, Canada",Canada,Blue,False,Women's Flyweight,FEMALE,3,0,1,0,70.0,0.523333,0.0,1.666667,0.416667,1,1,7,0,0,0,2,0,0,0,2,Orthodox,167.64,165.1,125,1,0,0,78.5,0.365,0.5,1.0,0.55,3,1,14,1,0,1,2,0,0,0,3,Orthodox,170.18,172.72,115,33,30,1,1,-2,-1,0,-7,-1,0,0,-2.54,-7.62,3,-8.5,-0.5,0.666667,0,1,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,KO/TKO,Kick,3.0,2:44,764.0,123.0,220.0,806.0,1175.0,765.0,675.0
3676,Max Holloway,Will Chope,-265,245,37.735849,245.0,1/4/2014,"Singapore, Singapore",Singapore,Red,False,Featherweight,MALE,3,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,Switch,193.04,193.04,145,2,0,0,62.666667,0.345,0.0,0.0,0.0,3,3,15,0,0,1,1,1,0,0,3,Orthodox,180.34,175.26,155,22,23,2,0,-3,-3,3,-15,0,-1,0,12.7,17.78,-1,-62.666667,0.0,0.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,KO/TKO,Punch,2.0,2:27,447.0,150.0,500.0,0.0,0.0,0.0,0.0
4457,Scott Jorgensen,Jeff Curran,-410,365,24.390244,365.0,10/29/2011,"Las Vegas, Nevada, USA",USA,Red,False,Bantamweight,MALE,3,1,0,0,9.0,0.6,1.0,0.0,0.0,0,1,3,0,0,0,0,0,0,0,0,Orthodox,167.64,175.26,135,0,1,0,15.0,0.53,0.0,1.0,1.0,1,0,1,0,0,0,0,1,0,0,1,Orthodox,165.1,167.64,135,29,34,-1,-1,-1,-1,-1,2,0,-1,0,2.54,7.62,-5,-6.0,1.0,-1.0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,U-DEC,,3.0,5:00,900.0,0.0,0.0,0.0,0.0,0.0,0.0
1025,Holly Holm,Raquel Pennington,-125,105,80.0,105.0,1/18/2020,"Las Vegas, Nevada, USA",USA,Red,False,Women's Bantamweight,FEMALE,3,0,1,0,0.0,0.45,0.9,1.14,0.26,4,4,32,1,0,2,3,0,2,0,7,Orthodox,170.18,170.18,135,1,0,0,0.0,0.34,0.1,0.55,0.31,3,5,35,5,0,1,2,2,0,0,5,Southpaw,172.72,175.26,135,38,31,1,1,1,2,1,-3,-4,-2,2,-2.54,-5.08,7,0.0,0.8,0.59,0,1,5.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Red,U-DEC,,3.0,5:00,900.0,115.0,170.0,1800.0,1100.0,500.0,1000.0
622,Ricardo Ramos,Lerone Murphy,-162,130,61.728395,130.0,7/15/2020,"Abu Dhabi, Abu Dhabi, United Arab Emirates",United Arab Emirates,Blue,False,Featherweight,MALE,3,0,0,1,1.87,0.35,2.0,0.0,0.0,0,0,3,0,0,0,0,0,0,0,0,Orthodox,175.26,185.42,145,0,2,0,3.05,0.39,0.7,2.13,0.64,3,1,14,0,0,1,2,1,1,0,5,Orthodox,175.26,182.88,145,24,28,0,-2,-3,-5,-1,-11,0,-1,-1,0.0,2.54,4,-1.18,1.3,-2.13,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,KO/TKO,Punches,1.0,4:18,258.0,160.0,300.0,300.0,2000.0,850.0,425.0
857,Erik Koch,Kyle Stewart,-140,120,71.428571,120.0,7/27/2019,"Edmonton, Alberta, Canada",Canada,Red,False,Welterweight,MALE,3,1,0,0,1.41,0.35,0.0,1.32,0.2,0,1,1,0,0,0,0,0,0,0,0,Orthodox,182.88,193.04,170,2,0,0,2.33,0.42,1.0,1.02,0.42,4,6,27,0,0,0,2,3,2,0,7,Southpaw,177.8,180.34,170,30,30,1,0,-4,-7,5,-26,0,-3,-2,5.08,12.7,0,-0.92,-1.0,0.3,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,U-DEC,,3.0,5:00,900.0,260.0,316.0,382.0,850.0,534.0,405.0
4458,Hatsu Hioki,George Roop,-315,285,31.746032,285.0,10/29/2011,"Las Vegas, Nevada, USA",USA,Red,False,Featherweight,MALE,3,0,1,0,38.2,0.332,0.0,0.2,0.2,1,3,12,0,0,1,0,1,0,0,2,Orthodox,185.42,182.88,135,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,Orthodox,180.34,177.8,145,28,29,0,1,1,2,-3,12,0,1,0,5.08,5.08,-1,38.2,0.0,0.2,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,S-DEC,,3.0,5:00,900.0,0.0,0.0,0.0,0.0,0.0,0.0
2631,Brendan O'Reilly,Alan Jouban,415,-525,415.0,19.047619,3/19/2016,"Brisbane, Queensland, Australia",Australia,Blue,False,Welterweight,MALE,3,1,0,0,36.8,0.51,0.0,0.2,0.2,2,2,9,0,0,0,1,2,0,0,3,Southpaw,182.88,185.42,170,0,1,0,28.5,0.395,0.5,3.0,0.3,1,1,6,0,0,0,1,0,0,0,1,Orthodox,170.18,175.26,170,28,34,-1,-1,1,2,-1,3,0,2,0,12.7,10.16,-6,8.3,-0.5,-2.8,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,KO/TKO,Elbow,1.0,2:15,135.0,751.0,350.0,1350.0,555.0,1520.0,-145.0
2163,Bradley Scott,Scott Askham,135,-155,135.0,64.516129,3/18/2017,"London, England, United Kingdom",United Kingdom,Red,False,Middleweight,MALE,3,1,0,0,23.8,0.586,0.6,0.2,0.1,1,3,11,0,0,0,0,2,0,0,2,Southpaw,190.5,190.5,185,1,0,0,39.4,0.486,0.6,0.4,0.09,1,3,12,1,0,0,0,0,2,0,2,Orthodox,185.42,193.04,170,27,28,0,0,0,0,0,-1,-1,2,-2,5.08,-2.54,-1,-15.6,0.0,-0.2,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neither,,,0.0,,0.0,230.0,197.0,804.0,660.0,840.0,340.0


In [None]:
from ydata_profiling import ProfileReport

In [None]:
# Build a profiling report over the cleaned frame.
# NOTE(review): minimal=True presumably selects the library's reduced, cheaper
# report configuration — confirm against the ydata_profiling documentation.
profile = ProfileReport(df_clean, minimal=True)

In [None]:
# Render the profiling report inline in the notebook.
profile.to_notebook_iframe()