In [7]:
import pandas as pd

# Load the UFC fight statistics and events dataset
df_fight_stats = pd.read_csv('../02.csv_scrapping/ufc_fight_stats.csv')
df_events = pd.read_csv('../04.csv_clean/ufc_event_details_clean.csv')
df_fight_results = pd.read_csv('../04.csv_clean/ufc_fight_results.csv')

# Standardize column names
df_fight_stats.rename(columns={'TD %': 'td_acc_%', 'SIG.STR. %': 'sig_str_acc_%'}, inplace=True)
df_fight_stats.columns = df_fight_stats.columns.str.lower().str.strip().str.rstrip('.').str.replace(' %', '%').str.replace('.', '_').str.replace(' ', '_')

# Merge 'date' from 'df_events' into the main dataframe
df_fight_stats = df_fight_stats.merge(df_events[['event', 'date']], on = 'event', how='left')

# Convert 'date' column to datetime format
df_fight_stats['date'] = pd.to_datetime(df_fight_stats['date'], errors='coerce')

# Filter out rows < 2016-01-01
df_fight_stats = df_fight_stats[df_fight_stats['date'] >= '2016-01-01'].reset_index(drop=True)

# Display columns and types
#df_fight_stats.dtypes
#df_events.dtypes
#df_fight_results.dtypes

df_fight_stats.head(5)
#df_events.head(2)
#df_fight_results.head(2)

Unnamed: 0,event,bout,round,fighter,kd,sig_str,sig_str_acc_%,total_str,td,td_acc_%,sub_att,rev,ctrl,head,body,leg,distance,clinch,ground,date
0,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Round 1,Alexander Volkanovski,0.0,12 of 27,44%,17 of 32,0 of 2,0%,0.0,0.0,0:27,5 of 12,0 of 1,7 of 14,12 of 26,0 of 1,0 of 0,2026-01-31
1,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Round 2,Alexander Volkanovski,0.0,19 of 30,63%,20 of 33,0 of 0,---,1.0,0.0,0:00,13 of 20,0 of 2,6 of 8,19 of 30,0 of 0,0 of 0,2026-01-31
2,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Round 3,Alexander Volkanovski,0.0,21 of 35,60%,21 of 35,0 of 2,0%,0.0,0.0,0:00,21 of 33,0 of 0,0 of 2,21 of 35,0 of 0,0 of 0,2026-01-31
3,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Round 4,Alexander Volkanovski,0.0,25 of 44,56%,25 of 44,0 of 0,---,0.0,0.0,0:00,15 of 33,2 of 2,8 of 9,25 of 44,0 of 0,0 of 0,2026-01-31
4,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Round 5,Alexander Volkanovski,0.0,21 of 24,87%,29 of 34,2 of 3,66%,0.0,0.0,2:22,18 of 20,1 of 1,2 of 3,11 of 14,3 of 3,7 of 7,2026-01-31


In [8]:
# Replace '  vs. ' by ' vs. ' into 'df_fight_results' and standardize 'bout' formatting
df_fight_results['bout'] = df_fight_results['bout'].str.replace('  vs. ', ' vs. ', regex=False).str.strip()
df_fight_stats['bout'] = df_fight_stats['bout'].str.strip()

# Merge 'round_end_time' from 'df_fight_results' into the main dataframe
df_fight_stats = df_fight_stats.merge(df_fight_results[['bout', 'round_end', 'round_end_time']], on = ['bout'], how='left')

# Remove text from 'round' column and convert to numeric
df_fight_stats['round'] = df_fight_stats['round'].astype(str).str.replace('Round', '', regex=False).str.strip()
df_fight_stats['round'] = pd.to_numeric(df_fight_stats['round'], errors='coerce')

# Calculate 'round_time' column
df_fight_stats['round_time'] = df_fight_stats.apply(lambda row: 5.0 if row['round'] <  row['round_end'] else row['round_end_time'], axis=1)

# Create function to convert 'ctrl' from object to decimal minutes
def time_to_decimal_min(time_str):
    if pd.isna(time_str) or time_str == '-':
        return 0.0
    minutes, seconds = map(int, str(time_str).split(':'))
    return round(minutes + (seconds / 60), 2)

# Apply 'time_to_decimal_min' function to the 'ctrl' column
df_fight_stats['round_min'] = df_fight_stats.apply(
    lambda row: 5.0 if row['round'] < row['round_end'] else row['round_end_time'], axis=1
)

# Remove % from columns and convert to decimal
for col in ['sig_str_acc_%', 'td_acc_%']:
    df_fight_stats[col] = df_fight_stats[col].astype(str).str.replace('%', '', regex=False).str.strip()
    df_fight_stats[col] = pd.to_numeric(df_fight_stats[col], errors='coerce')
    df_fight_stats[col] = df_fight_stats[col] / 100

# Split columns by 'of' delimiter
def split_column(df, col):
    split_cols = df[col].astype(str).str.split(' of ', expand=True)
    df[f'{col}_land'] = pd.to_numeric(split_cols[0], errors='coerce')
    df[f'{col}_att'] = pd.to_numeric(split_cols[1], errors='coerce')
    df[f'{col}_acc_%'] = df[f'{col}_land'] / df[f'{col}_att']

# Apply 'split_column' function to columns
cols_to_split = ['total_str', 'sig_str', 'td', 'distance', 'clinch', 'ground', 'head', 'body', 'leg']
for col in cols_to_split:
    split_column(df_fight_stats, col)

# Reorder columns for better readability
df_fight_stats = df_fight_stats[[
    # General information
    'date', 'event', 'bout', 'fighter', 'round', 'round_time',
    # Striking stats
    'sig_str_land', 'sig_str_att', 'sig_str_acc_%', 'total_str_land', 'total_str_att', 'total_str_acc_%', 'kd',
    # Grappling stats
    'td_land', 'td_att', 'td_acc_%', 'sub_att', 'rev', 'ctrl',
    # Fight style stats and effectiveness
    'distance_land', 'clinch_land', 'ground_land',
    # Significant strike locations
    'head_land', 'body_land', 'leg_land'
]]

# Display final dataframe
#df_fight_stats.head(5)
#df_fight_stats.columns
df_fight_stats.dtypes
#df_fight_results.dtypes

date               datetime64[ns]
event                      object
bout                       object
fighter                    object
round                       int64
round_time                float64
sig_str_land                int64
sig_str_att                 int64
sig_str_acc_%             float64
total_str_land              int64
total_str_att               int64
total_str_acc_%           float64
kd                        float64
td_land                     int64
td_att                      int64
td_acc_%                  float64
sub_att                   float64
rev                       float64
ctrl                       object
distance_land               int64
clinch_land                 int64
ground_land                 int64
head_land                   int64
body_land                   int64
leg_land                    int64
dtype: object

In [9]:
# Merge 'fighter1' and 'fighter2' from 'df_fight_results' into the main dataframe
df_fight_stats = df_fight_stats.merge(df_fight_results[['bout', 'fighter1', 'fighter2']], on = ['bout'], how='left')

# Create 'opponent' column
def get_opponent(row):
    if row['fighter'] == row['fighter1']:
        return row['fighter2']
    else:
        return row['fighter1']

# Apply 'get_opponent' function to create 'opponent' column
df_fight_stats['opponent'] = df_fight_stats.apply(get_opponent, axis=1).astype(str)

# Create copy of dataframe for processing
df_fight_stats_oponnent = df_fight_stats[['bout', 'fighter', 'round', 'sig_str_land', 'sig_str_att', 'td_land', 'td_att', 'sub_att']].copy()

# Rename columns for opponent merge
df_fight_stats_oponnent.rename(columns={'fighter': 'opponent', 'sig_str_land': 'sig_str_land_opp', 'sig_str_att': 'sig_str_att_opp', 'td_land': 'td_land_opp', 'td_att': 'td_att_opp', 'sub_att': 'sub_att_opp'}, inplace=True)

# Merge 'sig_str_landed_opp', 'sig_str_att_opp', 'td_land_opp', 'td_att_opp' 
df_fight_stats = df_fight_stats.merge(df_fight_stats_oponnent[['bout', 'round', 'opponent', 'sig_str_land_opp', 'sig_str_att_opp', 'td_land_opp', 'td_att_opp', 'sub_att_opp']], left_on=['bout', 'round', 'opponent'], right_on=['bout', 'round', 'opponent'], how='left')

# Reorder columns for better readability
df_fight_stats = df_fight_stats[[
    # General information
    'date', 'event', 'bout', 'fighter', 'opponent', 'round', 'round_time',
    # Striking stats
    'sig_str_land', 'sig_str_att', 'sig_str_land_opp', 'sig_str_att_opp', 'total_str_land', 'total_str_att', 'total_str_acc_%', 'kd',
    # Grappling stats
    'td_land', 'td_att', 'td_acc_%', 'td_land_opp', 'td_att_opp', 'sub_att', 'sub_att_opp', 'rev', 'ctrl',
    # Fight style stats and effectiveness
    'distance_land', 'clinch_land', 'ground_land',
    # Significant strike locations
    'head_land', 'body_land', 'leg_land'
]]

# Save cleaned dataframe to CSV
df_fight_stats.to_csv('../04.csv_clean/ufc_fight_stats_round.csv', index=False)

df_fight_stats.head(5)
#df_fight_stats_oponnent.head(2)


Unnamed: 0,date,event,bout,fighter,opponent,round,round_time,sig_str_land,sig_str_att,sig_str_land_opp,...,sub_att,sub_att_opp,rev,ctrl,distance_land,clinch_land,ground_land,head_land,body_land,leg_land
0,2026-01-31,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Alexander Volkanovski,Diego Lopes,1,5.0,12,27,12.0,...,0.0,0.0,0.0,0:27,12,0,0,5,0,7
1,2026-01-31,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Alexander Volkanovski,Diego Lopes,1,5.0,12,27,12.0,...,0.0,0.0,0.0,0:27,12,0,0,5,0,7
2,2026-01-31,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Alexander Volkanovski,Diego Lopes,1,5.0,12,27,12.0,...,0.0,0.0,0.0,0:27,12,0,0,5,0,7
3,2026-01-31,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Alexander Volkanovski,Diego Lopes,1,5.0,12,27,12.0,...,0.0,0.0,0.0,0:27,12,0,0,5,0,7
4,2026-01-31,UFC 325: Volkanovski vs. Lopes 2,Alexander Volkanovski vs. Diego Lopes,Alexander Volkanovski,Diego Lopes,1,5.0,12,27,8.0,...,0.0,0.0,0.0,0:27,12,0,0,5,0,7
