### Importing Pandas/Matplotlib and UFC Dataset from Kaggle

In [496]:
import pandas as pd
import matplotlib as mpl

In [497]:
# Read the per-fight records from the Kaggle export, then preview the first five rows.
all_fights = pd.read_csv(filepath_or_buffer='UFC dataset/Large set/large_dataset.csv')
all_fights.head(n=5)

Unnamed: 0,event_name,r_fighter,b_fighter,winner,weight_class,is_title_bout,gender,method,finish_round,total_rounds,...,weight_diff,reach_diff,SLpM_total_diff,SApM_total_diff,sig_str_acc_total_diff,td_acc_total_diff,str_def_total_diff,td_def_total_diff,sub_avg_diff,td_avg_diff
0,UFC Fight Night: Ribas vs. Namajunas,Amanda Ribas,Rose Namajunas,Blue,Women's Flyweight,0,Women,Decision - Unanimous,5,5.0,...,0.0,2.54,0.94,-0.11,-0.01,0.04,-0.02,0.26,0.2,0.69
1,UFC Fight Night: Ribas vs. Namajunas,Karl Williams,Justin Tafa,Red,Heavyweight,0,Men,Decision - Unanimous,3,3.0,...,-13.16,12.7,-1.22,-3.32,-0.02,0.5,0.13,0.5,0.2,4.75
2,UFC Fight Night: Ribas vs. Namajunas,Edmen Shahbazyan,AJ Dobson,Red,Middleweight,0,Men,KO/TKO,1,3.0,...,0.0,-2.54,-0.69,-1.22,0.06,-0.37,-0.01,-0.02,0.3,0.57
3,UFC Fight Night: Ribas vs. Namajunas,Payton Talbott,Cameron Saaiman,Red,Bantamweight,0,Men,KO/TKO,2,3.0,...,0.0,7.62,2.73,-0.6,0.08,-0.28,0.0,0.43,-0.2,-0.91
4,UFC Fight Night: Ribas vs. Namajunas,Billy Quarantillo,Youssef Zalal,Blue,Featherweight,0,Men,Submission,2,3.0,...,0.0,-5.08,4.48,3.84,0.07,-0.11,-0.22,0.01,-0.2,-1.04


In [498]:
# Read the per-fighter career statistics, then preview the first five rows.
fighter_data = pd.read_csv(filepath_or_buffer='UFC dataset/Fighter stats/fighter_stats.csv')
fighter_data.head(n=5)

Unnamed: 0,name,wins,losses,height,weight,reach,stance,age,SLpM,sig_str_acc,SApM,str_def,td_avg,td_acc,td_def,sub_avg
0,Amanda Ribas,12.0,5.0,160.02,56.7,167.64,Orthodox,30.0,4.63,0.4,3.4,0.61,2.07,0.51,0.85,0.7
1,Rose Namajunas,13.0,6.0,165.1,56.7,165.1,Orthodox,31.0,3.69,0.41,3.51,0.63,1.38,0.47,0.59,0.5
2,Karl Williams,10.0,1.0,190.5,106.59,200.66,Orthodox,34.0,2.87,0.52,1.7,0.6,4.75,0.5,1.0,0.2
3,Justin Tafa,7.0,4.0,182.88,119.75,187.96,Southpaw,30.0,4.09,0.54,5.02,0.47,0.0,0.0,0.5,0.0
4,Edmen Shahbazyan,13.0,4.0,187.96,83.91,190.5,Orthodox,26.0,3.6,0.52,4.09,0.45,2.24,0.38,0.63,0.6


## Data Munging
### First, we will drop duplicate rows. Then we will look for any null values and drop the rows that contain them.

In [499]:
# Remove exact duplicate rows from both datasets.
# The de-duplicated frame is rebound to the same name instead of using
# inplace=True: the result is the same for this notebook, but the step stays
# chainable and follows pandas' recommended non-inplace style.
all_fights = all_fights.drop_duplicates()
fighter_data = fighter_data.drop_duplicates()

In [500]:
# For columns with null values, print the number of null entries and their ratio to the total number of rows
def print_null_ratios(df):
    """Print the null count and null ratio of every column that has missing values.

    One line is printed per column of ``df`` that contains at least one null:
    the column name (left-aligned, width 25), the number of nulls (width 15)
    and ``nulls / number_of_rows`` to three decimals (width 8). Columns
    without nulls are skipped, and no header row is printed.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        None. The report is written to stdout.
    """
    n_rows = df.shape[0]
    # Count the nulls of all columns in a single vectorised pass.
    null_counts = df.isnull().sum()
    for col, total_nulls in null_counts.items():
        if total_nulls > 0:
            # A positive null count implies n_rows > 0, so the division is only
            # performed when it is well defined (no 0/0 on an empty frame).
            ratio = total_nulls / n_rows
            print(f"{col:<25} {total_nulls:<15} {ratio:<8.3f}")

In [501]:
# Null report for the all-fights dataset: print the header row, then one line
# (null count and ratio) for each column that contains nulls
print(f"{'Column:':<25} {'Total Nulls:':<15} {'Ratio:':<8}")
print_null_ratios(all_fights)


Column:                   Total Nulls:    Ratio:  
total_rounds              31              0.004   
referee                   32              0.004   
r_age                     76              0.010   
r_reach                   412             0.055   
r_stance                  26              0.003   
b_age                     190             0.026   
b_reach                   888             0.119   
b_stance                  68              0.009   
age_diff                  213             0.029   
reach_diff                1038            0.140   


In [502]:
# Null report for the fighter-stats dataset: print the header row, then one line
# (null count and ratio) for each column that contains nulls
print(f"{'Column:':<25} {'Total Nulls:':<15} {'Ratio:':<8}")
print_null_ratios(fighter_data)

Column:                   Total Nulls:    Ratio:  
name                      1               0.000   
wins                      1               0.000   
losses                    1               0.000   
height                    1               0.000   
weight                    1               0.000   
reach                     656             0.265   
stance                    78              0.031   
age                       161             0.065   
SLpM                      1               0.000   
sig_str_acc               1               0.000   
SApM                      1               0.000   
str_def                   1               0.000   
td_avg                    1               0.000   
td_acc                    1               0.000   
td_def                    1               0.000   
sub_avg                   1               0.000   


In [503]:
# Dropping Rows with Null Values
def drop_null(df):
    """Remove, in place, every row of ``df`` that has a null in any column.

    Args:
        df: DataFrame to clean. It is mutated directly; the surviving rows
            keep their original index labels.

    Returns:
        None.
    """
    # A single dropna pass judges each row by its own values. The previous
    # per-column "collect index labels, then drop by label" loop scanned the
    # frame once per column and, when index labels were not unique, also
    # removed complete rows that shared a label with an incomplete row.
    df.dropna(inplace=True)

# Both frames are modified in place; drop_null returns nothing.
# NOTE(review): per the null ratios printed above, this discards every row that
# lacks reach data (at least ~14% of fights and ~26.5% of fighters) — confirm
# that this loss is acceptable for the analysis.
drop_null(all_fights)
drop_null(fighter_data)