In [1]:
import pandas as pd

# Load the UFC fight statistics dataset
df = pd.read_csv('ufc_fight_stats.csv')

# Drop the strike-target and total-strike columns that this notebook does not use
df = df.drop(columns=['HEAD', 'BODY', 'LEG', 'TOTAL STR.'])

# Clean column names: strip any trailing '.' (names that do not end in '.',
# such as 'SIG.STR. %', are left unchanged)
df.columns = df.columns.str.rstrip('.')

# Display column types and a preview of the data.
# Only the last expression of a cell is rendered automatically, so the dtypes
# must be displayed explicitly or they are silently discarded.
display(df.dtypes)
df.head(5)

Unnamed: 0,EVENT,BOUT,ROUND,FIGHTER,KD,SIG.STR,SIG.STR. %,TD,TD %,SUB.ATT,REV,CTRL,DISTANCE,CLINCH,GROUND
0,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Round 1,Jack Della Maddalena,0.0,4 of 18,22%,0 of 0,---,0.0,0.0,0:00,4 of 18,0 of 0,0 of 0
1,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Round 2,Jack Della Maddalena,0.0,3 of 6,50%,0 of 1,0%,0.0,0.0,0:00,3 of 6,0 of 0,0 of 0
2,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Round 3,Jack Della Maddalena,0.0,9 of 20,45%,0 of 0,---,0.0,0.0,0:00,5 of 15,4 of 5,0 of 0
3,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Round 4,Jack Della Maddalena,0.0,1 of 11,9%,0 of 0,---,0.0,0.0,0:00,1 of 11,0 of 0,0 of 0
4,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Round 5,Jack Della Maddalena,0.0,1 of 6,16%,0 of 0,---,0.0,0.0,0:00,1 of 6,0 of 0,0 of 0


In [2]:
# CONVERT TEXT (object) COLUMNS TO NUMERIC

# ROUND: remove the literal text 'Round' (regex=False -> plain substring match),
# trim surrounding whitespace, then parse as a number; unparseable values become NaN
df['ROUND'] = pd.to_numeric(
    df['ROUND'].str.replace('Round', '', regex=False).str.strip(),
    errors='coerce',
)

# PERCENTAGE COLUMNS: remove the literal '%', trim whitespace, parse as a number
# (unparseable values become NaN) and scale from percent to a decimal fraction
for col in ['SIG.STR. %', 'TD %']:
    df[col] = pd.to_numeric(
        df[col].str.replace('%', '', regex=False).str.strip(),
        errors='coerce',
    ) / 100


# SPLIT COLUMNS ON THE ' of ' DELIMITER (function and for loop)

## Function to split a column (creates the total column)
def split_column(df, col):
    """Add a numeric ``<col>.TOT`` column parsed from an "X of Y" text column.

    Every value of ``df[col]`` is split on the literal ``' of '`` and the
    second piece (the Y in "X of Y") is converted to a number. Values with no
    second piece, or whose second piece is not numeric, become NaN.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        col: Name of a string column holding values such as ``'4 of 18'``.

    Returns:
        None. The result is stored in ``df[f'{col}.TOT']``.
    """
    split_cols = df[col].str.split(' of ', expand=True)
    # When no value contains ' of ', the expanded split has no column 1;
    # reindex supplies an all-NaN column instead of raising KeyError.
    total_text = split_cols.reindex(columns=[1])[1]
    df[f'{col}.TOT'] = pd.to_numeric(total_text, errors='coerce')

## Loop that applies split_column to every "X of Y" column
cols_to_split = ['SIG.STR', 'TD', 'DISTANCE', 'CLINCH', 'GROUND']
for col in cols_to_split:
    split_column(df, col)


# Position share: each position's total divided by the significant-strike total
for col in ('DISTANCE', 'CLINCH', 'GROUND'):
    df[f'{col} %'] = df[f'{col}.TOT'] / df['SIG.STR.TOT']

# Drop the columns that are no longer needed, then arrange the remaining
# columns in their final order
df = df.drop(
    columns=[
        'SIG.STR', 'TD',
        'DISTANCE.TOT', 'DISTANCE',
        'CLINCH.TOT', 'CLINCH',
        'GROUND.TOT', 'GROUND',
    ]
).loc[:, [
    'EVENT', 'BOUT', 'FIGHTER', 'ROUND',
    'SIG.STR.TOT', 'SIG.STR. %',
    'DISTANCE %', 'CLINCH %', 'GROUND %',
    'KD', 'TD.TOT', 'TD %',
    'SUB.ATT', 'CTRL', 'REV',
]]

# DISPLAY AND SAVE THE CLEAN DATAFRAME
display(df)
df.to_csv('ufc_fight_stats_clean.csv', index=False)

Unnamed: 0,EVENT,BOUT,FIGHTER,ROUND,SIG.STR.TOT,SIG.STR. %,DISTANCE %,CLINCH %,GROUND %,KD,TD.TOT,TD %,SUB.ATT,CTRL,REV
0,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Jack Della Maddalena,1.0,18.0,0.22,1.00,0.00,0.0,0.0,0.0,,0.0,0:00,0.0
1,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Jack Della Maddalena,2.0,6.0,0.50,1.00,0.00,0.0,0.0,1.0,0.0,0.0,0:00,0.0
2,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Jack Della Maddalena,3.0,20.0,0.45,0.75,0.25,0.0,0.0,0.0,,0.0,0:00,0.0
3,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Jack Della Maddalena,4.0,11.0,0.09,1.00,0.00,0.0,0.0,0.0,,0.0,0:00,0.0
4,UFC 322: Della Maddalena vs. Makhachev,Jack Della Maddalena vs. Islam Makhachev,Jack Della Maddalena,5.0,6.0,0.16,1.00,0.00,0.0,0.0,0.0,,0.0,0:00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39787,UFC 2: No Way Out,Johnny Rhodes vs. David Levicki,David Levicki,1.0,5.0,0.80,0.40,0.40,0.2,0.0,0.0,,0.0,--,0.0
39788,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,Patrick Smith,1.0,1.0,1.00,0.00,1.00,0.0,0.0,1.0,0.0,1.0,--,0.0
39789,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,Ray Wizard,1.0,1.0,1.00,1.00,0.00,0.0,0.0,0.0,,0.0,--,0.0
39790,UFC 2: No Way Out,Scott Morris vs. Sean Daugherty,Scott Morris,1.0,1.0,1.00,0.00,1.00,0.0,0.0,1.0,1.0,1.0,--,0.0
