In [1]:
import pandas as pd

# Load the UFC fight statistics dataset
df = pd.read_csv('ufc_fight_stats.csv')

# Drop the columns that are not used in the rest of the notebook
df = df.drop(columns=['HEAD', 'BODY', 'LEG', 'TOTAL STR.'])

# Clean column names (remove the trailing '.')
df.columns = df.columns.str.rstrip('.')

# Display columns and types.
# Only the last expression of a cell is rendered automatically, so the dtypes
# must be displayed explicitly; a bare `df.dtypes` here would be discarded.
display(df.dtypes)
df.head(5)

Unnamed: 0,EVENT,BOUT,ROUND,FIGHTER,KD,SIG.STR,SIG.STR. %,TD,TD %,SUB.ATT,REV,CTRL,DISTANCE,CLINCH,GROUND
0,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Round 1,Merab Dvalishvili,0.0,22 of 68,32%,0 of 8,0%,0.0,0.0,1:33,18 of 63,4 of 5,0 of 0
1,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Round 2,Merab Dvalishvili,0.0,19 of 73,26%,0 of 5,0%,0.0,1.0,2:06,17 of 69,2 of 4,0 of 0
2,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Round 3,Merab Dvalishvili,0.0,21 of 62,33%,2 of 7,28%,0.0,0.0,1:07,15 of 52,6 of 10,0 of 0
3,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Round 4,Merab Dvalishvili,0.0,37 of 71,52%,0 of 4,0%,1.0,0.0,0:15,34 of 67,3 of 4,0 of 0
4,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Round 5,Merab Dvalishvili,0.0,35 of 109,32%,0 of 5,0%,1.0,0.0,0:11,32 of 102,3 of 7,0 of 0


In [2]:
# CONVERT TEXT (object) COLUMNS TO NUMERIC

# ROUND: remove the literal text 'Round' (regex=False -> plain substring),
# trim surrounding whitespace, then parse; unparseable values become NaN.
round_text = df['ROUND'].str.replace('Round', '', regex=False)
df['ROUND'] = pd.to_numeric(round_text.str.strip(), errors='coerce')

# PERCENTAGE COLUMNS: remove the literal '%', trim whitespace, parse
# (unparseable values become NaN) and rescale from percent to a decimal fraction.
percent_cols = ['SIG.STR. %', 'TD %']
for pct_col in percent_cols:
    pct_text = df[pct_col].str.replace('%', '', regex=False).str.strip()
    df[pct_col] = pd.to_numeric(pct_text, errors='coerce') / 100

# SPLIT COLUMNS ON THE ' of ' DELIMITER (helper function and for loop)

## Function to split a column and keep its total (the number after ' of ')
def split_column(df, col):
    """Add a ``'<col>.TOT'`` column holding the number after ``' of '``.

    Each value in ``df[col]`` is split on the literal ``' of '`` (for example
    ``'22 of 68'``); the second piece is converted to a number and stored in
    ``df[f'{col}.TOT']``. Values that are missing, that do not contain
    ``' of '``, or whose second piece is not numeric become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; ``df[col]`` must hold strings.
    col : str
        Name of the column to split. The column itself is left unchanged.

    Returns
    -------
    None
        The result is written to ``df`` as a new column.
    """
    pieces = df[col].str.split(' of ', expand=True)
    # If no row contains ' of ' the split produces only column 0 (and no
    # columns at all for an empty frame). Reindexing guarantees column 1
    # exists, filled with NaN, instead of raising KeyError below.
    pieces = pieces.reindex(columns=[0, 1])
    df[f'{col}.TOT'] = pd.to_numeric(pieces[1], errors='coerce')

## Loop: apply split_column to every 'X of Y' column (adds '<col>.TOT' for each)
cols_to_split = ['SIG.STR', 'TD', 'DISTANCE', 'CLINCH', 'GROUND']
for col in cols_to_split:
    split_column(df, col)


# Share of the significant-strike total for each position
for position in ('DISTANCE', 'CLINCH', 'GROUND'):
    df[f'{position} %'] = df[f'{position}.TOT'] / df['SIG.STR.TOT']

# Drop the columns that are no longer needed, then put the rest in final order
obsolete_cols = [
    'SIG.STR', 'TD',
    'DISTANCE.TOT', 'DISTANCE',
    'CLINCH.TOT', 'CLINCH',
    'GROUND.TOT', 'GROUND',
]
final_order = [
    'EVENT', 'BOUT', 'FIGHTER', 'ROUND',
    'SIG.STR.TOT', 'SIG.STR. %',
    'DISTANCE %', 'CLINCH %', 'GROUND %',
    'KD', 'TD.TOT', 'TD %',
    'SUB.ATT', 'CTRL', 'REV',
]
df = df.drop(columns=obsolete_cols)[final_order]

# DISPLAY AND SAVE CLEAN DATAFRAME
display(df)
df.to_csv('ufc_fight_stats_clean.csv', index=False)

Unnamed: 0,EVENT,BOUT,FIGHTER,ROUND,SIG.STR.TOT,SIG.STR. %,DISTANCE %,CLINCH %,GROUND %,KD,TD.TOT,TD %,SUB.ATT,CTRL,REV
0,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Merab Dvalishvili,1.0,68.0,0.32,0.926471,0.073529,0.0,0.0,8.0,0.00,0.0,1:33,0.0
1,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Merab Dvalishvili,2.0,73.0,0.26,0.945205,0.054795,0.0,0.0,5.0,0.00,0.0,2:06,1.0
2,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Merab Dvalishvili,3.0,62.0,0.33,0.838710,0.161290,0.0,0.0,7.0,0.28,0.0,1:07,0.0
3,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Merab Dvalishvili,4.0,71.0,0.52,0.943662,0.056338,0.0,0.0,4.0,0.00,1.0,0:15,0.0
4,UFC 323: Dvalishvili vs. Yan 2,Merab Dvalishvili vs. Petr Yan,Merab Dvalishvili,5.0,109.0,0.32,0.935780,0.064220,0.0,0.0,5.0,0.00,1.0,0:11,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39907,UFC 2: No Way Out,Johnny Rhodes vs. David Levicki,David Levicki,1.0,5.0,0.80,0.400000,0.400000,0.2,0.0,0.0,,0.0,--,0.0
39908,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,Patrick Smith,1.0,1.0,1.00,0.000000,1.000000,0.0,0.0,1.0,0.00,1.0,--,0.0
39909,UFC 2: No Way Out,Patrick Smith vs. Ray Wizard,Ray Wizard,1.0,1.0,1.00,1.000000,0.000000,0.0,0.0,0.0,,0.0,--,0.0
39910,UFC 2: No Way Out,Scott Morris vs. Sean Daugherty,Scott Morris,1.0,1.0,1.00,0.000000,1.000000,0.0,0.0,1.0,1.00,1.0,--,0.0
