### Importing Pandas/Matplotlib and UFC Dataset from Kaggle

In [183]:
import pandas as pd
import matplotlib as mpl

In [184]:
# Read the per-fight table from the local CSV and display it
# (the path is relative to the notebook's working directory).
all_fights = pd.read_csv('UFC dataset/Large set/large_dataset.csv')
all_fights

Unnamed: 0,event_name,r_fighter,b_fighter,winner,weight_class,is_title_bout,gender,method,finish_round,total_rounds,...,weight_diff,reach_diff,SLpM_total_diff,SApM_total_diff,sig_str_acc_total_diff,td_acc_total_diff,str_def_total_diff,td_def_total_diff,sub_avg_diff,td_avg_diff
0,UFC Fight Night: Ribas vs. Namajunas,Amanda Ribas,Rose Namajunas,Blue,Women's Flyweight,0,Women,Decision - Unanimous,5,5.0,...,0.00,2.54,0.94,-0.11,-0.01,0.04,-0.02,0.26,0.2,0.69
1,UFC Fight Night: Ribas vs. Namajunas,Karl Williams,Justin Tafa,Red,Heavyweight,0,Men,Decision - Unanimous,3,3.0,...,-13.16,12.70,-1.22,-3.32,-0.02,0.50,0.13,0.50,0.2,4.75
2,UFC Fight Night: Ribas vs. Namajunas,Edmen Shahbazyan,AJ Dobson,Red,Middleweight,0,Men,KO/TKO,1,3.0,...,0.00,-2.54,-0.69,-1.22,0.06,-0.37,-0.01,-0.02,0.3,0.57
3,UFC Fight Night: Ribas vs. Namajunas,Payton Talbott,Cameron Saaiman,Red,Bantamweight,0,Men,KO/TKO,2,3.0,...,0.00,7.62,2.73,-0.60,0.08,-0.28,0.00,0.43,-0.2,-0.91
4,UFC Fight Night: Ribas vs. Namajunas,Billy Quarantillo,Youssef Zalal,Blue,Featherweight,0,Men,Submission,2,3.0,...,0.00,-5.08,4.48,3.84,0.07,-0.11,-0.22,0.01,-0.2,-1.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7434,UFC 2: No Way Out,Orlando Wiet,Robert Lucarelli,Red,Open Weight,0,Men,KO/TKO,1,,...,-34.02,,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00
7435,UFC 2: No Way Out,Frank Hamaker,Thaddeus Luster,Red,Open Weight,0,Men,Submission,1,,...,15.88,,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00
7436,UFC 2: No Way Out,Johnny Rhodes,David Levicki,Red,Open Weight,0,Men,KO/TKO,1,,...,-29.49,,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00
7437,UFC 2: No Way Out,Patrick Smith,Ray Wizard,Red,Open Weight,0,Men,Submission,1,,...,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00


In [185]:
# Read the per-fighter statistics table from the local CSV and display it
# (the path is relative to the notebook's working directory).
fighter_data = pd.read_csv('UFC dataset/Fighter stats/fighter_stats.csv')
fighter_data

Unnamed: 0,name,wins,losses,height,weight,reach,stance,age,SLpM,sig_str_acc,SApM,str_def,td_avg,td_acc,td_def,sub_avg
0,Amanda Ribas,12.0,5.0,160.02,56.70,167.64,Orthodox,30.0,4.63,0.40,3.40,0.61,2.07,0.51,0.85,0.7
1,Rose Namajunas,13.0,6.0,165.10,56.70,165.10,Orthodox,31.0,3.69,0.41,3.51,0.63,1.38,0.47,0.59,0.5
2,Karl Williams,10.0,1.0,190.50,106.59,200.66,Orthodox,34.0,2.87,0.52,1.70,0.60,4.75,0.50,1.00,0.2
3,Justin Tafa,7.0,4.0,182.88,119.75,187.96,Southpaw,30.0,4.09,0.54,5.02,0.47,0.00,0.00,0.50,0.0
4,Edmen Shahbazyan,13.0,4.0,187.96,83.91,190.50,Orthodox,26.0,3.60,0.52,4.09,0.45,2.24,0.38,0.63,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2474,Thaddeus Luster,0.0,1.0,190.50,95.25,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
2475,David Levicki,1.0,3.0,195.58,124.74,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
2476,Ray Wizard,0.0,1.0,187.96,102.06,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0
2477,Sean Daugherty,0.0,2.0,182.88,79.38,,,48.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0


## Data Munging
### First, we will drop duplicate rows. Then we will remove any statistical categories that will not contribute to this study. Finally, we will remove data entries with null values.

In [186]:
# Remove exact duplicate rows from both tables.
# Rebind each name to the frame returned by drop_duplicates() instead of using
# inplace=True, so the frame objects loaded (and displayed) by the earlier
# cells are not mutated behind the reader's back.
all_fights = all_fights.drop_duplicates()
fighter_data = fighter_data.drop_duplicates()

In [187]:
# Keep only the fight columns that contribute to the analysis:
# fighter names, outcome, bout context, stances, and the *_diff statistics.
all_fights = all_fights.loc[:, [
    'r_fighter', 'b_fighter',
    'winner', 'method',
    'weight_class', 'gender', 'is_title_bout',
    'r_stance', 'b_stance',
    'age_diff', 'height_diff', 'reach_diff',
    'SLpM_total_diff', 'SApM_total_diff',
    'sig_str_acc_total_diff', 'td_acc_total_diff',
    'str_def_total_diff', 'td_def_total_diff',
    'sub_avg_diff', 'td_avg_diff',
]]
# From the fighter table, only the 'weight' column is removed.
fighter_data = fighter_data.drop('weight', axis=1)

In [188]:
# For columns with null values, report the null count and its ratio to total rows
def print_null_ratios(df):
    """Print one aligned line per column of ``df`` that has at least one null.

    Each printed line holds the column label, the number of null entries in
    that column, and that number divided by the row count (three decimals).
    Columns without nulls are skipped. Nothing is returned.
    """
    row_count = df.shape[0]
    for col in df.columns:
        null_count = df[col].isna().sum()
        null_share = null_count / row_count
        if null_count <= 0:
            # Nothing missing in this column, so it gets no output line.
            continue
        print(f"{col:<25} {null_count:<15} {null_share:<8.3f}")

In [189]:
# Null report for the fights table: print an aligned header row, then one line
# (column, null count, null ratio) for every column of all_fights that has nulls.
print(f"{'Column:':<25} {'Total Nulls:':<15} {'Ratio:':<8}")
print_null_ratios(all_fights)


Column:                   Total Nulls:    Ratio:  
r_stance                  26              0.003   
b_stance                  68              0.009   
age_diff                  213             0.029   
reach_diff                1038            0.140   


In [190]:
# Null report for the fighters table: print an aligned header row, then one line
# (column, null count, null ratio) for every column of fighter_data that has nulls.
print(f"{'Column:':<25} {'Total Nulls:':<15} {'Ratio:':<8}")
print_null_ratios(fighter_data)

Column:                   Total Nulls:    Ratio:  
name                      1               0.000   
wins                      1               0.000   
losses                    1               0.000   
height                    1               0.000   
reach                     656             0.265   
stance                    78              0.031   
age                       161             0.065   
SLpM                      1               0.000   
sig_str_acc               1               0.000   
SApM                      1               0.000   
str_def                   1               0.000   
td_avg                    1               0.000   
td_acc                    1               0.000   
td_def                    1               0.000   
sub_avg                   1               0.000   


In [191]:
# Dropping Rows with Null Values
def drop_null(df):
    """Return a new frame holding only the rows of ``df`` with no null values.

    Relies on ``DataFrame.dropna`` so the filtering happens in a single pass
    rather than re-slicing the frame once per column, and so frames with
    repeated column labels are filtered by row as well. Surviving rows keep
    their original index labels; ``df`` itself is left unmodified.
    """
    return df.dropna(axis=0, how='any')

# Keep only fully populated rows in both tables.
# NOTE(review): per the null-ratio output above, 'reach' is null for ~26.5% of
# fighter rows and 'reach_diff' for ~14% of fight rows, so this step discards a
# sizable share of each table.
fighter_data = drop_null(fighter_data)
all_fights = drop_null(all_fights)

In [192]:
# Display the cleaned fights table (row labels are the original ones; the index is not reset).
all_fights

Unnamed: 0,r_fighter,b_fighter,winner,method,weight_class,gender,is_title_bout,r_stance,b_stance,age_diff,height_diff,reach_diff,SLpM_total_diff,SApM_total_diff,sig_str_acc_total_diff,td_acc_total_diff,str_def_total_diff,td_def_total_diff,sub_avg_diff,td_avg_diff
0,Amanda Ribas,Rose Namajunas,Blue,Decision - Unanimous,Women's Flyweight,Women,0,Orthodox,Orthodox,-1.0,-5.08,2.54,0.94,-0.11,-0.01,0.04,-0.02,0.26,0.2,0.69
1,Karl Williams,Justin Tafa,Red,Decision - Unanimous,Heavyweight,Men,0,Orthodox,Southpaw,4.0,7.62,12.70,-1.22,-3.32,-0.02,0.50,0.13,0.50,0.2,4.75
2,Edmen Shahbazyan,AJ Dobson,Red,KO/TKO,Middleweight,Men,0,Orthodox,Orthodox,-6.0,2.54,-2.54,-0.69,-1.22,0.06,-0.37,-0.01,-0.02,0.3,0.57
3,Payton Talbott,Cameron Saaiman,Red,KO/TKO,Bantamweight,Men,0,Switch,Southpaw,2.0,5.08,7.62,2.73,-0.60,0.08,-0.28,0.00,0.43,-0.2,-0.91
4,Billy Quarantillo,Youssef Zalal,Blue,Submission,Featherweight,Men,0,Orthodox,Switch,8.0,0.00,-5.08,4.48,3.84,0.07,-0.11,-0.22,0.01,-0.2,-1.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7184,Chuck Liddell,Jeff Monson,Red,Decision - Unanimous,Middleweight,Men,0,Orthodox,Orthodox,1.0,12.70,5.08,2.00,-0.16,0.05,0.63,-0.03,0.15,-0.7,-0.62
7207,Tito Ortiz,Wanderlei Silva,Red,Decision - Unanimous,UFC Light Heavyweight Title,Men,1,Orthodox,Orthodox,2.0,10.16,0.00,-0.20,0.69,0.06,-0.18,0.00,-0.08,0.0,1.50
7250,Jeremy Horn,Chuck Liddell,Red,Submission,Middleweight,Men,0,Orthodox,Orthodox,-6.0,-2.54,-5.08,-1.78,-0.72,0.04,-0.37,-0.01,-0.52,1.1,1.39
7263,Vitor Belfort,Wanderlei Silva,Red,KO/TKO,Middleweight,Men,0,Southpaw,Orthodox,-1.0,2.54,0.00,-1.20,0.64,0.06,0.07,-0.10,-0.09,0.0,0.05


In [193]:
# Count the remaining fights whose weight_class text contains 'Catch Weight'.
print(all_fights['weight_class'].str.contains('Catch Weight').sum())

60
