In [1]:
import pandas as pd
import numpy as np 
from datetime import datetime as datetime 

##### Read in data and clean data

In [2]:
# Load the raw combined fighter data.
ufc_data = pd.read_csv('../data/combined_fighter_data.csv')

# Raw column header -> snake_case name used throughout the rest of the notebook.
raw_to_clean_columns = {
    'Sig. Str.': 'significant_strikes',
    'Total Str.': 'total_strikes',
    'TD': 'takedowns',
    'TD %': 'takedown_percentage',
    'Sub. Att': 'submission_attempts',
    'Rev.': 'reversals',
    'Ctrl': 'control_time',
    'Head': 'head_strikes',
    'Body': 'body_strikes',
    'Leg': 'leg_strikes',
    'Distance': 'distance_strikes',
    'Clinch': 'clinch_strikes',
    'Ground': 'ground_strikes',
    'Method': 'method',
}
ufc_data = ufc_data.rename(columns=raw_to_clean_columns)

In [3]:
def categorize_method(method):
    """Collapse a raw fight-ending method description into a coarse category.

    Matching is case-insensitive and substring based. Rules are tried in a
    fixed order (decision, submission, knockout/TKO, CNC, disqualification,
    overturned) and the first one that matches wins.

    Args:
        method: Raw method text. Missing or non-string values (NaN, None)
            are accepted and categorised as 'other'.

    Returns:
        str: One of 'dec', 'sub', 'ko', 'dq', 'overturned' or 'other'.
    """
    # NaN/None have no .lower(); bucket them as 'other' instead of letting a
    # single missing cell abort the whole column-wide apply.
    if not isinstance(method, str):
        return 'other'

    text = method.lower()

    # (substrings to look for, resulting category), checked top to bottom.
    rules = (
        (('dec',), 'dec'),
        (('sub',), 'sub'),
        (('ko/tko', 'knockout', 'tko'), 'ko'),
        # CNC indicates a stoppage due to an inability to continue, which is treated as a TKO.
        (('cnc',), 'ko'),
        (('dq', 'disqualification'), 'dq'),
        (('overturned',), 'overturned'),
    )
    for needles, category in rules:
        if any(needle in text for needle in needles):
            return category
    return 'other'

# Overwrite every raw method description with its coarse category.
ufc_data['method'] = ufc_data['method'].map(categorize_method)

In [4]:
def extract_strike_data(row, column_name):
    """
    Parses an "<landed> of <thrown>" cell into its two integer counts.

    Args:
    row (pd.Series): The row from which to extract data.
    column_name (str): The column name from which to extract strike data.

    Returns:
    tuple: Landed and thrown counts as integers. (0, 0) is returned when the
    cell is missing, is not a string, or does not contain the " of " separator.

    Raises:
    ValueError: If the separator is present but either side is not an integer.
    """
    value = row[column_name]

    # Non-string cells (NaN, None, plain numbers) cannot hold an "X of Y" pair;
    # a membership test on them would raise TypeError.
    if not isinstance(value, str):
        return 0, 0

    # partition() never fails to unpack, unlike split() on text that merely
    # contains the letters "of" without the full " of " separator.
    landed, separator, thrown = value.partition(' of ')
    if not separator:
        return 0, 0

    return int(landed), int(thrown)

# Columns whose cells hold "<landed> of <thrown>" text
strikes_col = [
    'significant_strikes',
    'total_strikes',
    'takedowns',
    'head_strikes',
    'body_strikes',
    'leg_strikes',
    'distance_strikes',
    'clinch_strikes',
    'ground_strikes',
]

# Split each of those columns into numeric <name>_landed / <name>_thrown columns
for col in strikes_col:
    parsed_pairs = ufc_data.apply(lambda row: extract_strike_data(row, col), axis=1)
    landed_counts, thrown_counts = zip(*parsed_pairs)
    ufc_data[f'{col}_landed'] = landed_counts
    ufc_data[f'{col}_thrown'] = thrown_counts

# The raw text columns, plus the other per-row stats that are not used further, are removed
cols_drop = strikes_col + [
    'takedown_percentage',
    'submission_attempts',
    'reversals',
    'control_time',
    'Sig. Str. %',
]

ufc_data = ufc_data.drop(columns=cols_drop)

In [5]:
# Clean the date columns.

# Rows whose date of birth is the '--' placeholder cannot be aged, so discard them first.
has_known_dob = ufc_data['DOB'] != '--'
ufc_data = ufc_data.loc[has_known_dob].copy()

# Parse both columns; note the two text formats differ ('%b %d, %Y' vs '%b. %d, %Y').
ufc_data['DOB'] = pd.to_datetime(ufc_data['DOB'], format='%b %d, %Y')
ufc_data['Date'] = pd.to_datetime(ufc_data['Date'], format='%b. %d, %Y')

# Ages in whole years, approximated as elapsed days // 365 (leap days are ignored).
# 'current_age' is relative to the moment the cell runs, so it changes between runs.
days_lived_at_fight = (ufc_data['Date'] - ufc_data['DOB']).dt.days
days_lived_today = (datetime.now() - ufc_data['DOB']).dt.days
ufc_data['fight_age'] = days_lived_at_fight // 365
ufc_data['current_age'] = days_lived_today // 365

In [6]:
# Recognised weight classes. Order matters for clean_weight_class, which returns the
# first substring match: 'light heavyweight' has to be tested before 'heavyweight'.
weight_classes = [
    'strawweight',
    'flyweight',
    'bantamweight',
    'featherweight',
    'lightweight',
    'welterweight',
    'middleweight',
    'light heavyweight',
    'heavyweight',
]

# Lowercase the description so the substring checks are case-insensitive
ufc_data['Weight Class'] = ufc_data['Weight Class'].str.lower()

# Title fights: the description mentions 'title'
ufc_data['is_title_fight'] = ufc_data['Weight Class'].map(lambda label: 'title' in label)

# Male fights: every description that does not mention 'women'
ufc_data['is_male_fight'] = ufc_data['Weight Class'].map(lambda label: 'women' not in label)

# Clean weight class
def clean_weight_class(wc):
    """Return the first entry of ``weight_classes`` found inside ``wc``.

    Args:
        wc (str): Weight-class description to search (substring match).

    Returns:
        str: The matching class with spaces replaced by underscores, or
        'other' when no known class occurs in ``wc``.
    """
    matching_classes = (
        known_class.replace(' ', '_')
        for known_class in weight_classes
        if known_class in wc
    )
    return next(matching_classes, 'other')

ufc_data['weight_class'] = ufc_data['Weight Class'].map(clean_weight_class)

# Keep only the recognised classes; the raw description column is no longer needed
in_known_class = ufc_data['weight_class'] != 'other'
ufc_data = ufc_data.loc[in_known_class].drop(columns=['Weight Class'])

# Discard rows whose 'Reach' is the '--' placeholder and renumber the index
has_reach = ufc_data['Reach'] != '--'
ufc_data = ufc_data.loc[has_reach].reset_index(drop=True)

In [7]:
# Function to extract the first numerical value from a string
def extract_first_value(s):
    """Return the leading whitespace-separated token of ``s`` as an int.

    Args:
        s: Cell value to parse. Its text form is split on whitespace and the
            first token is converted.

    Returns:
        int or None: The parsed integer, or None when ``s`` is missing
        (NaN/None), blank, or the '--' placeholder.

    Raises:
        ValueError: If the first token is present but is not an integer.
    """
    if pd.isna(s):
        return None

    # str() lets already-numeric cells through; split() on a blank string
    # yields no tokens, which previously surfaced as an IndexError.
    tokens = str(s).split()
    if not tokens or tokens[0] == '--':
        return None

    return int(tokens[0])

# Derive the numeric measurement columns from their raw text columns
raw_column_by_measurement = {
    'height_inches': 'Height',
    'weight_pounds': 'Weight',
    'reach_inches': 'Reach',
}
for measurement, raw_column in raw_column_by_measurement.items():
    ufc_data[measurement] = ufc_data[raw_column].apply(extract_first_value)

# The raw text columns are no longer needed
ufc_data = ufc_data.drop(columns=list(raw_column_by_measurement.values()))

def extract_round_number(round_str):
    """Return the second whitespace-separated token of ``round_str`` as an int.

    Args:
        round_str: Round label to parse. Missing or non-string values are
            accepted.

    Returns:
        int or None: The parsed number, or None when the value is not a
        string, has fewer than two tokens, or the second token is not an
        integer.
    """
    # NaN/None have no .split(); that AttributeError was not covered by the
    # except clause below and would abort the whole column-wide apply.
    if not isinstance(round_str, str):
        return None
    try:
        return int(round_str.split()[1])
    except (IndexError, ValueError):
        return None

# Replace the raw 'Round' label with its numeric round number
ufc_data['round_number'] = ufc_data['Round'].map(extract_round_number)
ufc_data = ufc_data.drop(columns=['Round'])

In [8]:
# Remove results that are too old to be relevant: for every weight class, keep only the
# fights on or after the earliest fight of a fighter who is still active in the data
# (latest fight on or after 2023-01-01).

# Re-parse 'Date'; this leaves the column unchanged when it is already datetime.
ufc_data['Date'] = pd.to_datetime(ufc_data['Date'], format='%b. %d, %Y')

# First and last fight date per fighter
fighter_agg = ufc_data.groupby('Name').agg(
    Latest_Fight=('Date', 'max'),
    Oldest_Fight=('Date', 'min')
).reset_index()

# Fighters still active, paired with every weight class they appear in
filtered_fighters = fighter_agg[fighter_agg['Latest_Fight'] >= '2023-01-01']

filtered_fighters = filtered_fighters.merge(ufc_data[['Name', 'weight_class']].drop_duplicates(), on='Name', how='inner')

filtered_fighters_sorted = filtered_fighters.sort_values('Oldest_Fight', ascending=True)

def get_oldest_fight_date(sub_df):
    """Return the earliest 'Date' in ``sub_df`` as a one-element Series labelled 'Oldest_Fight'."""
    min_date = sub_df['Date'].min()
    return pd.Series([min_date], index=['Oldest_Fight'])

# The cutoff must come from ACTIVE fighters only. Taking it from every row makes the
# cutoff equal to each class's own minimum date, so the filter below keeps everything.
active_fighter_rows = ufc_data[ufc_data['Name'].isin(filtered_fighters['Name'])]
oldest_fight_dates = active_fighter_rows.groupby('weight_class').apply(get_oldest_fight_date).reset_index()

# Drop a cutoff column left behind by a previous run of this cell; otherwise the merge
# would produce 'Oldest_Fight_x' / 'Oldest_Fight_y' and the filter below would fail.
ufc_data = pd.merge(
    ufc_data.drop(columns=['Oldest_Fight'], errors='ignore'),
    oldest_fight_dates,
    on='weight_class',
    how='left'
)

# Weight classes without any active fighter have a missing cutoff; the comparison is
# False for them, so their rows are removed as well.
ufc_data_filtered = ufc_data[ufc_data['Date'] >= ufc_data['Oldest_Fight']]

ufc_data_filtered = ufc_data_filtered.drop(columns=['Oldest_Fight'])



In [9]:
# Rename the remaining raw headers for consistency.
# Most only need lowercasing; two get a different name altogether.
# ('Method' was already renamed when the data was loaded, so its entry only
# has an effect if a column with that header is still present.)
lowercased_headers = ['Event', 'Name', 'Wins', 'Losses', 'Draws', 'Stance', 'Date', 'Result', 'Method']
column_renames = {header: header.lower() for header in lowercased_headers}
column_renames.update({'KD': 'knockdowns', 'No Contests': 'nc'})

ufc_data_filtered = ufc_data_filtered.rename(columns=column_renames)

##### Aggregate per-round data into per-fight data

In [10]:
# Collapse the rows of each (event, fighter) combination into a single row.
# NOTE(review): groupby drops rows with a missing value in any key column by default,
# so fights with e.g. a missing stance or height would vanish here; confirm that is intended.
fight_key_columns = [
    'event', 'name', 'wins', 'losses', 'draws', 'nc', 'stance', 'DOB', 'date', 'result', 'method',
    'Fighter_1', 'Fighter_2', 'current_age', 'fight_age', 'is_title_fight', 'is_male_fight',
    'weight_class', 'height_inches', 'weight_pounds', 'reach_inches',
]

# Counted stats are summed; the fight's round number is the highest one seen.
counted_stats = [
    'significant_strikes', 'total_strikes', 'takedowns',
    'head_strikes', 'body_strikes', 'leg_strikes',
    'distance_strikes', 'clinch_strikes', 'ground_strikes',
]
fight_aggregations = {'knockdowns': 'sum'}
for stat in counted_stats:
    fight_aggregations[f'{stat}_landed'] = 'sum'
    fight_aggregations[f'{stat}_thrown'] = 'sum'
fight_aggregations['round_number'] = 'max'

ufc_fight_data = (
    ufc_data_filtered
    .groupby(fight_key_columns)
    .agg(fight_aggregations)
    .reset_index()
)

In [11]:
# One-hot encode the fight-ending method into 'method_<category>' indicator columns
# and append them to the per-fight rows (aligned on the row index).
method_column = ufc_fight_data['method']
method_dummies = pd.get_dummies(method_column, prefix='method')
ufc_fight_data = ufc_fight_data.join(method_dummies, how='left')

##### Create historical metrics (metrics leading up to each fight)

In [12]:
# Aggregate to one row per fighter, weight class and fight date, including the
# per-method indicator counts.
summed_stats = [
    'significant_strikes', 'total_strikes', 'takedowns',
    'head_strikes', 'body_strikes', 'leg_strikes',
    'distance_strikes', 'clinch_strikes', 'ground_strikes',
]

# output column -> (source column, aggregation); insertion order fixes the column order
named_aggregations = {'knockdowns': ('knockdowns', 'sum')}
for stat in summed_stats:
    for suffix in ('landed', 'thrown'):
        named_aggregations[f'{stat}_{suffix}'] = (f'{stat}_{suffix}', 'sum')
named_aggregations['total_title_fights'] = ('is_title_fight', 'sum')
named_aggregations['wins'] = ('result', lambda results: (results == 'win').sum())
named_aggregations['total_rounds'] = ('round_number', 'sum')
named_aggregations['total_unique_events'] = ('event', 'nunique')
for dummy_column in method_dummies.columns:
    named_aggregations[f'total_{dummy_column}'] = (dummy_column, 'sum')

aggregated_fighter_data = (
    ufc_fight_data
    .groupby(['name', 'weight_class', 'date'])
    .agg(**named_aggregations)
    .reset_index()
)



In [13]:
# 'fighter_data' is expected to be the 'aggregated_fighter_data' DataFrame
def calculate_cumulative_metrics(row, fighter_data):
    """Total a fighter's stats over their fights up to and including ``row``'s date.

    Only rows of ``fighter_data`` with the same 'name' and 'weight_class' as
    ``row`` and a 'date' on or before ``row['date']`` are counted.

    Args:
        row (pd.Series): Row providing 'name', 'weight_class' and 'date'.
        fighter_data (pd.DataFrame): Aggregated fight rows to total over.

    Returns:
        pd.Series: One 'cumulative_*' entry per metric, each the column sum
        over the selected rows.
    """
    same_fighter = fighter_data['name'] == row['name']
    same_class = fighter_data['weight_class'] == row['weight_class']
    # '<=' means the fight on row['date'] itself is part of the totals.
    not_later = pd.to_datetime(fighter_data['date']) <= pd.to_datetime(row['date'])
    history = fighter_data[same_fighter & same_class & not_later]

    # output name -> source column; insertion order fixes the order of the result
    source_by_output = {'cumulative_knockdowns': 'knockdowns'}
    strike_stats = (
        'significant_strikes', 'total_strikes', 'takedowns',
        'head_strikes', 'body_strikes', 'leg_strikes',
        'distance_strikes', 'clinch_strikes', 'ground_strikes',
    )
    for stat in strike_stats:
        for suffix in ('landed', 'thrown'):
            source_by_output[f'cumulative_{stat}_{suffix}'] = f'{stat}_{suffix}'
    source_by_output.update({
        'cumulative_title_fights': 'total_title_fights',
        'cumulative_rounds': 'total_rounds',
        'cumulative_unique_events': 'total_unique_events',
        'cumulative_wins': 'wins',
        'cumulative_dec': 'total_method_dec',
        'cumulative_dq': 'total_method_dq',
        'cumulative_ko': 'total_method_ko',
        'cumulative_overturned': 'total_method_overturned',
        'cumulative_sub': 'total_method_sub',
    })

    return pd.Series({
        output: history[source].sum()
        for output, source in source_by_output.items()
    })

# Compute the running totals for every aggregated row
cumulative_data = aggregated_fighter_data.apply(
    lambda fight_row: calculate_cumulative_metrics(fight_row, aggregated_fighter_data),
    axis=1,
)

# Place the totals next to the rows they were computed for
final_data_with_cumulative = pd.concat(
    (aggregated_fighter_data, cumulative_data),
    axis='columns',
)


In [14]:
# Turn the running totals into rate features and attach each fighter's next fight date.

identifier_columns = ['name', 'weight_class', 'date']
cumulative_columns = [
    'cumulative_knockdowns', 'cumulative_significant_strikes_landed', 'cumulative_significant_strikes_thrown',
    'cumulative_total_strikes_landed', 'cumulative_total_strikes_thrown', 'cumulative_takedowns_landed',
    'cumulative_takedowns_thrown', 'cumulative_head_strikes_landed', 'cumulative_head_strikes_thrown',
    'cumulative_body_strikes_landed', 'cumulative_body_strikes_thrown', 'cumulative_leg_strikes_landed',
    'cumulative_leg_strikes_thrown', 'cumulative_distance_strikes_landed', 'cumulative_distance_strikes_thrown',
    'cumulative_clinch_strikes_landed', 'cumulative_clinch_strikes_thrown', 'cumulative_ground_strikes_landed',
    'cumulative_ground_strikes_thrown', 'cumulative_title_fights', 'cumulative_rounds', 'cumulative_unique_events',
    'cumulative_wins', 'cumulative_dec', 'cumulative_dq', 'cumulative_ko', 'cumulative_overturned', 'cumulative_sub'
]

# Define the columns to keep
columns_to_keep = identifier_columns + cumulative_columns

# Select only the columns to keep. The explicit copy makes the column assignments
# below write to an independent frame (no SettingWithCopyWarning).
final_data_with_cumulative = final_data_with_cumulative[columns_to_keep].copy()


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, with 0 wherever the denominator is not positive."""
    return np.where(denominator > 0, numerator / denominator, 0)


# feature -> (numerator column, denominator column); insertion order fixes the column order
ratio_features = {
    # Strike accuracy
    'strike_accuracy': ('cumulative_total_strikes_landed', 'cumulative_total_strikes_thrown'),
    'sig_strike_accuracy': ('cumulative_significant_strikes_landed', 'cumulative_significant_strikes_thrown'),
    # Takedown accuracy
    'takedown_accuracy': ('cumulative_takedowns_landed', 'cumulative_takedowns_thrown'),
    # Strike target ratios (share of all landed strikes)
    'head_strike_ratio': ('cumulative_head_strikes_landed', 'cumulative_total_strikes_landed'),
    'body_strike_ratio': ('cumulative_body_strikes_landed', 'cumulative_total_strikes_landed'),
    'leg_strike_ratio': ('cumulative_leg_strikes_landed', 'cumulative_total_strikes_landed'),
    # Average fight rounds
    'fight_duration': ('cumulative_rounds', 'cumulative_unique_events'),
    # Win ratio
    'win_rate': ('cumulative_wins', 'cumulative_unique_events'),
    # Knockdowns per landed strike
    'knockdown_percentage': ('cumulative_knockdowns', 'cumulative_total_strikes_landed'),
    # Knockout rate
    'ko_rate': ('cumulative_ko', 'cumulative_unique_events'),
    # Submission rate: must use the submission count (it previously reused the KO
    # count, which made it an exact duplicate of 'ko_rate')
    'submission_rate': ('cumulative_sub', 'cumulative_unique_events'),
}
for feature, (numerator_column, denominator_column) in ratio_features.items():
    final_data_with_cumulative[feature] = _safe_ratio(
        final_data_with_cumulative[numerator_column],
        final_data_with_cumulative[denominator_column],
    )

# Finish rate: knockouts and submissions combined
final_data_with_cumulative['finish_rate'] = _safe_ratio(
    final_data_with_cumulative['cumulative_sub'] + final_data_with_cumulative['cumulative_ko'],
    final_data_with_cumulative['cumulative_unique_events'],
)

final_data_with_cumulative = final_data_with_cumulative.sort_values(by=['name', 'weight_class', 'date'])

# Create the next_fight_date column: the following fight date of the same fighter in
# the same weight class (missing for the last one)
final_data_with_cumulative['next_fight_date'] = final_data_with_cumulative.groupby(['name', 'weight_class'])['date'].shift(-1)

# Only the derived rates are carried forward; the raw running totals are dropped
cols_drop = list(cumulative_columns)
final_data_with_cumulative = final_data_with_cumulative.drop(columns=cols_drop)

##### Merge each fighter with their respective historical fight data 

In [15]:
import pandas as pd

# Work on independent copies so the assignments below cannot raise SettingWithCopyWarning
final_data_with_cumulative = final_data_with_cumulative.copy()
ufc_fight_data = ufc_fight_data.copy()

# Make sure both 'date' columns are datetimes; unparseable values become NaT
for frame in (final_data_with_cumulative, ufc_fight_data):
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')

# Order the historical rows by fighter, weight class and date
final_data_with_cumulative = final_data_with_cumulative.sort_values(by=['name', 'weight_class', 'date'])

# Look up the last historical row that strictly precedes a given fight
def get_most_recent_cumulative(row, cumulative_data):
    """Return the last row of ``cumulative_data`` dated before ``row``'s fight.

    Candidates share ``row``'s 'name' and 'weight_class' and have a 'date'
    strictly earlier than ``row['date']``. "Last" is by position, so
    ``cumulative_data`` must already be sorted by date for the result to be
    the most recent one.

    Args:
        row (pd.Series): Fight row providing 'name', 'weight_class' and 'date'.
        cumulative_data (pd.DataFrame): Historical rows to search.

    Returns:
        pd.Series: The matching row, or an all-NaN float64 Series indexed by
        ``cumulative_data``'s columns when there is no earlier row.
    """
    earlier_rows = cumulative_data.loc[
        (cumulative_data['name'] == row['name'])
        & (cumulative_data['weight_class'] == row['weight_class'])
        & (cumulative_data['date'] < row['date'])
    ]
    if earlier_rows.empty:
        # No prior fights: an all-NaN placeholder with an explicit dtype
        return pd.Series(index=cumulative_data.columns, dtype='float64')
    return earlier_rows.iloc[-1]

# Attach each fighter's pre-fight historical metrics to the fight rows.
#
# Every historical row carries 'next_fight_date', the date of the same fighter's
# following fight in the same weight class. Matching a fight's own 'date' against that
# key therefore pulls in the metrics accumulated up to the fighter's previous fight.
# A row-by-row lookup with get_most_recent_cumulative is deliberately not run here:
# it costs one full scan of the historical frame per fight, and its result used to be
# overwritten by this merge before anything read it.

# Renumber the fight rows so the merged frame starts from a clean 0..n-1 index
ufc_fight_data = ufc_fight_data.reset_index(drop=True)

# Left merge: fights without a matching historical row keep NaN metrics
ufc_fight_data_with_cumulative = pd.merge(
    ufc_fight_data,
    final_data_with_cumulative,
    left_on=['name', 'weight_class', 'date'],
    right_on=['name', 'weight_class', 'next_fight_date'],
    how='left',
    suffixes=('', '_cumulative')
)

# 'next_fight_date' was only the join key; 'date_cumulative' is the historical frame's
# own 'date' column, renamed by the suffix rule above
cols = ['next_fight_date', 'date_cumulative']
ufc_fight_data_with_cumulative = ufc_fight_data_with_cumulative.drop(columns=cols)

In [16]:
# A fight without historical metrics (missing 'sig_strike_accuracy') is flagged as the
# fighter's first fight and left out of the filtered frame.
ufc_fight_data_with_cumulative['is_first_fight'] = ufc_fight_data_with_cumulative['sig_strike_accuracy'].isna()

has_history = ~ufc_fight_data_with_cumulative['is_first_fight']
ufc_fight_data_filtered = (
    ufc_fight_data_with_cumulative
    .loc[has_history]
    .drop(columns='is_first_fight')
)

##### Merge fighter data to pairwise fights 

In [17]:
# Build two prefixed views of the same fight rows: one for the 'fighter_A_' role and
# one for the 'fighter_B_' role. The merge keys get their plain names back; on the B
# side Fighter_1 and Fighter_2 are swapped, so a row only pairs with a row whose two
# fighter identifiers are the other way round.
fighter_A_df = (
    ufc_fight_data_filtered.copy()
    .add_prefix('fighter_A_')
    .rename(columns={
        'fighter_A_Fighter_1': 'Fighter_1',
        'fighter_A_Fighter_2': 'Fighter_2',
        'fighter_A_event': 'event',
    })
)
fighter_B_df = (
    ufc_fight_data_filtered.copy()
    .add_prefix('fighter_B_')
    .rename(columns={
        'fighter_B_Fighter_1': 'Fighter_2',
        'fighter_B_Fighter_2': 'Fighter_1',
        'fighter_B_event': 'event',
    })
)

# Pair the rows on the event and the two fighter identifiers
merged_df = fighter_A_df.merge(fighter_B_df, on=['event', 'Fighter_1', 'Fighter_2'])

# Keep a single orientation of every pairing: Fighter_1 must not sort after Fighter_2
in_canonical_order = merged_df['Fighter_1'] <= merged_df['Fighter_2']
merged_df = merged_df.loc[in_canonical_order]

# Drop the merge-only identifiers together with the redundant date / DOB columns
columns_to_drop = ['fighter_B_date', 'fighter_A_DOB', 'fighter_B_DOB']
merged_df = merged_df.drop(columns=['Fighter_1', 'Fighter_2'] + columns_to_drop)

# Give the fight-level columns names without a fighter prefix
merged_df = merged_df.rename(columns={'event': 'fight_event', 'fighter_A_date': 'fight_date'})


In [18]:
# Columns removed from the paired frame, grouped by why they appear in the list.
# NOTE(review): only the dec/dq/ko/overturned/sub method indicators are listed; if the
# data also yields a 'method_other' indicator it would stay in the frame. Verify.
fight_level_drop = ['fight_event', 'fight_date', 'fighter_B_result']

# Dropped for both fighters
both_sides_drop = [
    f'fighter_{side}_{column}'
    for side in ('A', 'B')
    for column in ('name', 'method', 'nc', 'draws', 'current_age', 'weight_pounds')
]

# Dropped for fighter B only (fighter A's copy is kept)
fighter_b_only_drop = [
    f'fighter_B_{column}'
    for column in (
        'is_title_fight', 'is_male_fight', 'weight_class', 'round_number',
        'method_dec', 'method_dq', 'method_ko', 'method_overturned', 'method_sub',
    )
]

merged_col_drop = fight_level_drop + both_sides_drop + fighter_b_only_drop

merged_df = merged_df.drop(columns=merged_col_drop)

In [19]:
# Strip the fighter prefix from the columns kept once per fight, and name the label
# column 'target'.
fight_level_renames = {
    'fighter_A_is_title_fight': 'is_title_fight',
    'fighter_A_is_male_fight': 'is_male_fight',
    'fighter_A_weight_class': 'weight_class',
    'fighter_A_round_number': 'total_fight_rounds',
    'fighter_A_method_dec': 'win_by_decision',
    'fighter_A_method_dq': 'win_by_dq',
    'fighter_A_method_ko': 'win_by_ko',
    'fighter_A_method_overturned': 'win_by_overturn',
    'fighter_A_method_sub': 'win_by_sub',
    'fighter_A_result': 'target',
}
merged_df = merged_df.rename(columns=fight_level_renames)

# One-hot encode the categorical columns; drop_first removes one level per column
categorical_columns = [
    'fighter_A_stance', 'fighter_B_stance',
    'is_title_fight', 'is_male_fight', 'weight_class',
]
merged_df = pd.get_dummies(merged_df, columns=categorical_columns, drop_first=True)

# Binary target: 1 when fighter A's result is 'win', 0 for every other value
merged_df['target'] = merged_df['target'].map(lambda outcome: int(outcome == 'win'))


##### Get the difference of each metric between each fighter

In [20]:
# Start the modelling frame as a new, renumbered (0..n-1) frame built from merged_df
final_model_df = merged_df.reset_index(drop=True)

In [21]:
# Metrics that exist once per fighter and are replaced by an A-minus-B difference.
# NOTE(review): the entries from 'knockdowns' through 'ground_strikes_thrown' are the
# per-fight sums of the fight being labelled, not historical values, so their
# differences may leak the outcome into the features. Confirm this is intended.
numerical_columns = [
    'wins', 'losses', 'fight_age', 'height_inches', 'reach_inches', 'knockdowns',
    'significant_strikes_landed', 'significant_strikes_thrown', 'total_strikes_landed',
    'total_strikes_thrown', 'takedowns_landed', 'takedowns_thrown', 'head_strikes_landed',
    'head_strikes_thrown', 'body_strikes_landed', 'body_strikes_thrown', 'leg_strikes_landed',
    'leg_strikes_thrown', 'distance_strikes_landed', 'distance_strikes_thrown',
    'clinch_strikes_landed', 'clinch_strikes_thrown', 'ground_strikes_landed',
    'ground_strikes_thrown', 'strike_accuracy', 'sig_strike_accuracy', 'takedown_accuracy',
    'head_strike_ratio', 'body_strike_ratio', 'leg_strike_ratio', 'fight_duration', 'win_rate',
    'knockdown_percentage', 'ko_rate', 'submission_rate', 'finish_rate'
]

# Fail fast, naming every missing input at once instead of stopping at the first one.
missing_columns = [
    name
    for col in numerical_columns
    for name in (f'fighter_A_{col}', f'fighter_B_{col}')
    if name not in final_model_df.columns
]
if missing_columns:
    raise KeyError(f'Columns required for the diff features are missing: {missing_columns}')

# Every listed metric is present for both fighters at this point
columns_to_diff = list(numerical_columns)

# Calculate the differences (fighter A minus fighter B), once per metric
for col in columns_to_diff:
    final_model_df[f'diff_{col}'] = final_model_df[f'fighter_A_{col}'] - final_model_df[f'fighter_B_{col}']

# Drop original fighter_A and fighter_B columns for numerical metrics
columns_to_drop = [
    f'fighter_{side}_{col}'
    for col in columns_to_diff
    for side in ('A', 'B')
]
final_model_df = final_model_df.drop(columns=columns_to_drop)

# Drop every "way to win" indicator and the total rounds, since these describe the
# outcome and are therefore part of the target
drop_cols = [
    'total_fight_rounds',
    'win_by_decision',
    'win_by_dq',
    'win_by_ko',
    'win_by_overturn',
    'win_by_sub'
]

final_model_df = final_model_df.drop(columns=drop_cols)

##### Output final data of each dataframe

In [22]:
# Write the model-ready table to disk without the row index
final_model_df.to_csv(path_or_buf="../data/cleaned_data_ml.csv", index=False)