# Convert round-by-round stats to per-fight stats

In [13]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from datetime import datetime



## --- Data Loading and Initial Cleaning ---


In [14]:
def load_and_prepare_data():
    """Read the raw UFC CSV files and attach event dates to the fight results.

    Returns:
        A ``(fight_results, stats_df, fighter_info)`` tuple of DataFrames.
        ``fight_results`` has double spaces in ``BOUT`` collapsed to single
        spaces, a lower-cased/stripped ``EVENT`` column, and a ``DATE`` column
        left-joined from the event details (NaT where the date cannot be parsed).
    """

    def _normalise_event(column):
        # Event names are joined on, so make both sides comparable.
        return column.astype(str).str.strip().str.lower()

    bouts = pd.read_csv("../raw_data/ufc_fight_results.csv").reset_index(drop=True)
    bouts['BOUT'] = bouts['BOUT'].apply(lambda name: name.replace("  ", " "))

    round_stats = pd.read_csv("../raw_data/ufc_fight_stats.csv").reset_index(drop=True)
    fighter_details = pd.read_csv("../raw_data/ufc_fighter_tott.csv").reset_index(drop=True)
    events = pd.read_csv("../raw_data/ufc_event_details.csv").reset_index(drop=True)

    events['DATE'] = pd.to_datetime(events['DATE'], errors='coerce')
    events['EVENT'] = _normalise_event(events['EVENT'])
    bouts['EVENT'] = _normalise_event(bouts['EVENT'])

    event_dates = events[['EVENT', 'DATE']]
    bouts = bouts.merge(event_dates, on='EVENT', how='left')

    return bouts, round_stats, fighter_details


## --- Utility Functions ---



In [15]:
def safe_int(val):
    """Convert ``val`` to an int, returning 0 for anything that cannot be converted.

    Missing values (``None``/NaN), non-numeric strings, unsupported types and
    infinities all map to 0. Numeric strings such as ``"2.9"`` are parsed as
    floats and truncated toward zero.
    """
    try:
        if pd.isna(val):
            return 0
        return int(float(val))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) -- without it an infinite value would escape.
        return 0

def get_fight_time(bout, fight_results):
    """Return the total duration of ``bout`` in seconds, or ``None`` if unknown.

    The duration is ``(ROUND - 1) * 300`` (every completed round is counted as
    five minutes) plus the ``TIME`` ("M:SS") elapsed in the final round.

    Args:
        bout: Bout name, e.g. ``"A vs. B"``. Surrounding whitespace is ignored.
        fight_results: DataFrame with ``BOUT``, ``ROUND`` and ``TIME`` columns.

    Returns:
        Seconds fought in the first row whose ``BOUT`` matches, or ``None`` when
        no row matches or the round/time cannot be interpreted.
    """
    # Compare stripped names on both sides, as update_fight_results_with_fighter_metrics
    # does, instead of relying on fight_results names carrying exactly one trailing space.
    matches = fight_results[fight_results["BOUT"].str.strip() == bout.strip()]
    if matches.empty:
        return None

    final_round = matches["ROUND"].values[0]
    time_str = matches["TIME"].values[0]
    if pd.isna(final_round) or pd.isna(time_str):
        # A missing round would otherwise yield NaN and poison the running totals.
        return None

    try:
        minutes, seconds = map(int, str(time_str).split(':'))
    except ValueError:
        return None
    return (final_round - 1) * 300 + (minutes * 60 + seconds)


## --- Fighter Metrics Handling ---

In [16]:
def update_fight_results_with_fighter_metrics(bout, fighters_dict, fight_results):
    """Write both fighters' current running totals onto the rows of ``fight_results`` for ``bout``.

    Args:
        bout: Bout name of the form ``"<name> vs. <name>"``.
        fighters_dict: Mapping of fighter name -> mapping of metric name -> value.
            build_fighters_aggregate passes a nested defaultdict, so looking up
            an unseen fighter creates an empty entry rather than raising.
        fight_results: DataFrame that is modified in place; ``red_*``/``blue_*``
            columns are assigned for every row whose stripped ``BOUT`` equals
            the stripped ``bout``.

    Returns:
        None. Errors are printed instead of raised, so a failing bout is
        skipped without stopping the caller.
    """
    try:
        # The first name in the bout string is treated as red, the second as blue.
        # A bout that does not split into exactly two names raises ValueError,
        # which is reported by the generic handler below.
        red_fighter_name, blue_fighter_name = [name.strip() for name in bout.split(" vs. ")]
        # Shallow copies: the snapshot is read from here on, not the live totals.
        red_fighter = fighters_dict[red_fighter_name].copy()
        blue_fighter = fighters_dict[blue_fighter_name].copy()

        new_data = {
            "red_Name": red_fighter_name,
            "blue_Name": blue_fighter_name,
        }

        # Use the union of both fighters' metric names so each side gets every
        # column; a metric one fighter has not accumulated yet defaults to 0.
        # NOTE(review): the union is a set, so the column order is not fixed here.
        for key in red_fighter.keys() | blue_fighter.keys():
            new_data[f"red_{key}"] = red_fighter.get(key, 0)
            new_data[f"blue_{key}"] = blue_fighter.get(key, 0)

        # Only the bout name is compared, so every row with this name is overwritten.
        fight_results.loc[
            (fight_results["BOUT"].str.strip() == bout.strip()),
            new_data.keys()
        ] = new_data.values()

    except KeyError as e:
        print(f"Key error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

def aggregate_fight_stats(fighter, opponent, row_index, fighters_dict, stats_df):
    """Add one stats row to the running totals of ``fighter`` and ``opponent``.

    Reads the ``TD``, ``SIG.STR.``, ``KD``, ``CTRL``, ``SUB.ATT`` and ``REV.``
    cells of ``stats_df`` at ``row_index``. Values landed/attempted by
    ``fighter`` are also recorded as ``Opponent*`` totals on ``opponent``.
    Any exception is printed together with the offending row; totals that
    were already updated before the failure are kept.
    """

    def _landed_and_attempted(column):
        # Cells look like "<landed> of <attempted>"; any other shape counts as 0/0.
        parts = str(stats_df.loc[row_index, column]).split(" of ")
        if len(parts) != 2:
            return 0, 0
        landed, attempted = map(int, parts)
        return landed, attempted

    try:
        own = fighters_dict[fighter]
        # The value is unused; with the nested defaultdict from build_fighters_aggregate
        # this lookup makes sure a "fightTime" entry exists for the fighter.
        fight_time = max(1, own["fightTime"])

        # TAKEDOWNS
        td_landed, td_attempted = _landed_and_attempted("TD")
        own["TakedownsAttempted"] += td_attempted
        own["TakedownsLanded"] += td_landed
        theirs = fighters_dict[opponent]
        theirs["OpponentTakedownsAttempted"] += td_attempted
        theirs["OpponentTakedownsLanded"] += td_landed

        # STRIKING
        sig_landed, sig_attempted = _landed_and_attempted("SIG.STR.")
        knockdown_count = safe_int(stats_df.loc[row_index, "KD"])

        own["StrikesLanded"] += sig_landed
        own["StrikesAttempted"] += sig_attempted
        own["Knockdowns"] += knockdown_count
        theirs["OpponentStrikesLanded"] += sig_landed
        theirs["OpponentStrikesAttempted"] += sig_attempted

        # CONTROL TIME ("M:SS"; anything else counts as zero seconds)
        clock = str(stats_df.loc[row_index, "CTRL"]).split(":")
        if len(clock) == 2 and all(part.isdigit() for part in clock):
            control_seconds = int(clock[0]) * 60 + int(clock[1])
        else:
            control_seconds = 0
        own["ControlTime"] += control_seconds
        theirs["OpponentControlTime"] += control_seconds

        # Submissions & Reversals
        own["SubmissionAttempts"] += safe_int(stats_df.loc[row_index, "SUB.ATT"])
        own["Reversals"] += safe_int(stats_df.loc[row_index, "REV."])

    except Exception as e:
        print(f"Error processing fight metrics at index {row_index}: {e}")
        print(f"Problematic row: {stats_df.loc[row_index]}")


## --- Aggregation Loops ---

In [17]:
def build_fighters_aggregate(stats_df, fight_results):
    """Accumulate per-fighter career totals from the round-by-round stats.

    Rows of ``stats_df`` are visited from the last row to the first
    (presumably oldest fight first -- confirm against the raw file order).
    Whenever a new bout starts, the totals accumulated so far are written to
    ``fight_results`` before the bout's own rows are added.

    Args:
        stats_df: Per-round stats with ``FIGHTER``, ``BOUT`` and ``EVENT``
            columns and a 0..n-1 index.
        fight_results: Fight-level DataFrame, updated in place.

    Returns:
        Nested defaultdict mapping fighter name -> metric name -> running total.
    """
    fighters_dict = defaultdict(lambda: defaultdict(float))
    current_bout_key = None
    current_fighter_key = None

    for i in reversed(range(len(stats_df))):
        fighter = stats_df.loc[i, "FIGHTER"]
        bout = stats_df.loc[i, "BOUT"]
        event = stats_df.loc[i, "EVENT"]

        names = bout.split(" vs. ")
        opponent = names[1] if names[0] == fighter else names[0]

        # Identify a bout by (event, name): the same pairing can occur at several events.
        bout_key = (str(event), str(bout))
        if bout_key != current_bout_key:
            update_fight_results_with_fighter_metrics(bout, fighters_dict, fight_results)
            current_bout_key = bout_key

        # Add the fight duration once per fighter *per bout*. Comparing the fighter
        # name alone skipped the duration when one fighter's rows from two different
        # bouts were adjacent (e.g. several fights on the same night).
        fighter_key = bout_key + (fighter,)
        if fighter_key != current_fighter_key:
            fight_seconds = get_fight_time(bout, fight_results)
            if fight_seconds is not None:
                fighters_dict[fighter]["fightTime"] += fight_seconds
            current_fighter_key = fighter_key

        aggregate_fight_stats(fighter, opponent, i, fighters_dict, stats_df)

    return fighters_dict

def compute_aggregated_features(fight_results):
    """Derive rate and percentage columns for both corners, in place.

    For each of ``red`` and ``blue`` this adds per-minute / per-15-minute
    rates (based on ``<corner>_fightTime`` in seconds) and accuracy/defence
    percentages. A zero denominator is replaced by 1, and every result is
    rounded to two decimals.
    """
    # (new column, numerator column, seconds per reporting unit)
    rate_specs = (
        ("StrikesLandedPerMin", "StrikesLanded", 60),
        ("StrikesAbsorbedPerMin", "OpponentStrikesLanded", 60),
        ("TakedownsPer15Min", "TakedownsLanded", 900),
        ("OpponentTakedownsPer15Min", "OpponentTakedownsLanded", 900),
        ("SubmissionsPer15Min", "SubmissionAttempts", 900),
        ("ControlPer15Min", "ControlTime", 900),
    )
    # (new column, landed column, attempted column, report as 100 - pct?)
    pct_specs = (
        ("StrikingAccuracyPct", "StrikesLanded", "StrikesAttempted", False),
        ("StrikeDefencePct", "OpponentStrikesLanded", "OpponentStrikesAttempted", True),
        ("TakedownAccuracyPct", "TakedownsLanded", "TakedownsAttempted", False),
        ("TakedownDefencePct", "OpponentTakedownsLanded", "OpponentTakedownsAttempted", True),
    )

    for corner in ("red", "blue"):
        # Time-based stats
        for target, source, unit_seconds in rate_specs:
            numerator = fight_results[f"{corner}_{source}"]
            units_fought = (fight_results[f"{corner}_fightTime"] / unit_seconds).replace(0, 1)
            fight_results[f"{corner}_{target}"] = (numerator / units_fought).round(2)

        # Percentages
        for target, landed, attempted, inverted in pct_specs:
            attempts = fight_results[f"{corner}_{attempted}"].replace(0, 1)
            pct = fight_results[f"{corner}_{landed}"] / attempts * 100
            if inverted:
                pct = 100 - pct
            fight_results[f"{corner}_{target}"] = pct.round(2)

def build_fighters_dataframe(fighters_dict):
    """Turn the per-fighter running totals into a DataFrame with derived averages.

    Each fighter becomes one row holding a copy of their totals, a ``Name``
    column, per-minute / per-15-minute rates and accuracy/defence percentages.
    Fight time is floored at 1 second and attempt counts at 1 to avoid division
    by zero; all derived values are rounded to two decimals.
    """
    # (new key, numerator key, seconds per reporting unit)
    rate_specs = (
        ("StrikesLandedPerMin", "StrikesLanded", 60),
        ("StrikesAbsorbedPerMin", "OpponentStrikesLanded", 60),
        ("TakedownsPer15Min", "TakedownsLanded", 900),
        ("OpponentTakedownsPer15Min", "OpponentTakedownsLanded", 900),
        ("SubmissionsPer15Min", "SubmissionAttempts", 900),
        ("ControlPer15Min", "ControlTime", 900),
    )
    # (new key, landed key, attempted key, report as 100 - pct?)
    pct_specs = (
        ("StrikingAccuracyPct", "StrikesLanded", "StrikesAttempted", False),
        ("StrikeDefencePct", "OpponentStrikesLanded", "OpponentStrikesAttempted", True),
        ("TakedownAccuracyPct", "TakedownsLanded", "TakedownsAttempted", False),
        ("TakedownDefencePct", "OpponentTakedownsLanded", "OpponentTakedownsAttempted", True),
    )

    records = []
    for name, totals in fighters_dict.items():
        record = totals.copy()
        record["Name"] = name
        seconds_fought = max(1, record.get("fightTime", 1))

        # Averages
        for target, source, unit_seconds in rate_specs:
            record[target] = round(record.get(source, 0) / (seconds_fought / unit_seconds), 2)

        # Percentages
        for target, landed, attempted, inverted in pct_specs:
            share = record.get(landed, 0) / max(1, record.get(attempted, 0)) * 100
            record[target] = round(100 - share, 2) if inverted else round(share, 2)

        records.append(record)

    return pd.DataFrame(records)

## --- Dropping Raw Aggregated Features ---

In [18]:

def drop_raw_aggregated_features(fight_results):
    """Relabel outcomes and strip raw totals and bout metadata, in place.

    ``OUTCOME`` values ``'W/L'`` and ``'L/W'`` become ``"Red"`` and ``"Blue"``,
    rows whose ``OUTCOME`` equals 0 are removed, and the raw per-corner totals
    plus the listed bout metadata columns are dropped. Returns None.
    """
    raw_totals = (
        "StrikesAttempted", "OpponentStrikesAttempted", "OpponentTakedownsAttempted",
        "OpponentStrikesLanded", "Knockdowns", "TakedownsLanded", "StrikesLanded",
        "TakedownsAttempted", "ControlTime", "OpponentControlTime", "fightTime",
        "Reversals", "OpponentTakedownsLanded", "SubmissionAttempts",
    )
    bout_metadata = [
        "ROUND", "TIME", "TIME FORMAT", "REFEREE", "DETAILS", "URL", "WEIGHTCLASS", "EVENT", "BOUT"
    ]

    outcome_labels = {'W/L': "Red", 'L/W': "Blue"}
    fight_results['OUTCOME'] = fight_results['OUTCOME'].replace(outcome_labels)

    zero_outcome_rows = fight_results[fight_results['OUTCOME'] == 0].index
    fight_results.drop(zero_outcome_rows, inplace=True)

    unwanted = [f"{corner}_{total}" for total in raw_totals for corner in ("red", "blue")]
    unwanted.extend(bout_metadata)
    fight_results.drop(columns=unwanted, inplace=True)



## --- Fighter Info Cleaning and Merging ---

In [19]:
def clean_height(height_str):
    """Convert a height such as ``5' 11"`` to total inches.

    Returns None for missing values and for strings that do not start with a
    feet-and-inches pattern.
    """
    if pd.isna(height_str):
        return None
    parsed = re.match(r"(\d+)'\s*(\d+)\"?", height_str)
    if parsed is None:
        return None
    return 12 * int(parsed.group(1)) + int(parsed.group(2))

def clean_weight(weight_str):
    """Return the first run of digits in ``weight_str`` as an int.

    Returns None for missing values and for strings without any digits.
    """
    if not pd.isna(weight_str):
        digits = re.search(r"(\d+)", weight_str)
        if digits:
            return int(digits.group(1))
    return None

def clean_reach(reach_str):
    """Return the first run of digits in ``reach_str`` as an int.

    Returns None for missing values and for strings without any digits.
    """
    found = None if pd.isna(reach_str) else re.search(r"(\d+)", reach_str)
    return int(found.group(1)) if found else None

def clean_dob(dob_str):
    """Parse a date of birth such as ``"Jul 22, 1988"`` into a ``date``.

    Returns None for missing values and for strings that do not match the
    ``"%b %d, %Y"`` format.
    """
    if pd.isna(dob_str):
        return None
    try:
        parsed = datetime.strptime(dob_str, "%b %d, %Y")
    except ValueError:
        return None
    return parsed.date()

def clean_fighter_info(df):
    """Return a copy of ``df`` with both corners' physical attributes parsed.

    For the ``red_`` and ``blue_`` columns: Height, Weight and Reach are parsed
    to numbers, Stance is stripped and title-cased, and DOB is parsed to a date.
    NOTE: Stance goes through ``astype(str)``, so a missing stance becomes the
    string ``"Nan"``.
    """
    cleaned = df.copy()
    numeric_parsers = (
        ("Height", clean_height),
        ("Weight", clean_weight),
        ("Reach", clean_reach),
    )
    for corner in ('red_', 'blue_'):
        for field, parser in numeric_parsers:
            column = f'{corner}{field}'
            cleaned[column] = cleaned[column].apply(parser)

        stance_col = f'{corner}Stance'
        cleaned[stance_col] = cleaned[stance_col].astype(str).str.strip().str.title()

        dob_col = f'{corner}DOB'
        cleaned[dob_col] = cleaned[dob_col].apply(clean_dob)
    return cleaned

def merge_fighter_info_to_fight_results(fight_results, fighter_info):
    """Left-join each corner's physical attributes onto the fight results.

    ``fighter_info`` names are stripped before joining. The red corner is
    joined on ``red_Name`` first, then the blue corner on ``blue_Name``; the
    joined columns are named ``<corner>_Height``, ``_Weight``, ``_Reach``,
    ``_Stance`` and ``_DOB``. Neither input is modified.
    """
    profiles = fighter_info[["FIGHTER", "HEIGHT", "WEIGHT", "REACH", "STANCE", "DOB"]].copy()
    profiles["FIGHTER"] = profiles["FIGHTER"].str.strip()

    # Raw column -> suffix used after the corner prefix.
    suffix_for = {
        "FIGHTER": "Name",
        "HEIGHT": "Height",
        "WEIGHT": "Weight",
        "REACH": "Reach",
        "STANCE": "Stance",
        "DOB": "DOB",
    }

    merged = fight_results
    for corner in ("red", "blue"):
        corner_profiles = profiles.rename(
            columns={raw: f"{corner}_{suffix}" for raw, suffix in suffix_for.items()}
        )
        merged = merged.merge(corner_profiles, on=f"{corner}_Name", how="left")

    return merged


## --- Clean and merge fighter averages ---


In [20]:
def clean_and_merge_fighter_averages(fighters_df, fighter_info):
    """Attach cleaned physical attributes to the fighter averages and drop raw totals.

    ``fighter_info`` is left-joined on the fighter name. Height, Weight, Reach
    and Stance are written to new title-cased columns, ``DOB`` is parsed in
    place, and the raw attribute columns plus the raw running totals are
    removed from the result.
    """
    attribute_cols = ["FIGHTER", "HEIGHT", "WEIGHT", "REACH", "STANCE", "DOB"]
    attributes = fighter_info[attribute_cols].rename(columns={"FIGHTER": "Name"})
    merged = fighters_df.merge(attributes, on="Name", how="left")

    merged['Height'] = merged['HEIGHT'].apply(clean_height)
    merged['Weight'] = merged['WEIGHT'].apply(clean_weight)
    merged['Reach'] = merged['REACH'].apply(clean_reach)
    merged['Stance'] = merged['STANCE'].astype(str).str.strip().str.title()
    merged['DOB'] = merged['DOB'].apply(clean_dob)

    raw_attributes = ["HEIGHT", "WEIGHT", "REACH", "STANCE"]
    raw_totals = [
        "OpponentTakedownsAttempted", "OpponentTakedownsLanded", "OpponentStrikesLanded",
        "OpponentStrikesAttempted", "OpponentControlTime", "fightTime",
        "TakedownsAttempted", "TakedownsLanded", "StrikesLanded", "StrikesAttempted",
        "Knockdowns", "ControlTime", "SubmissionAttempts", "Reversals",
    ]
    return merged.drop(columns=raw_attributes + raw_totals)


## --- Mirror fight stats ---


In [21]:
import pandas as pd
import numpy as np



def randomly_swap_fighters(df: pd.DataFrame, seed: int = 42, track_swaps: bool = False) -> pd.DataFrame:
    """Swap the red and blue corners on a random half of the rows.

    Each row is swapped with probability 0.5, drawn from NumPy's global
    generator after seeding it with ``seed``. On swapped rows every
    ``red_*``/``blue_*`` column pair is exchanged and an ``OUTCOME`` of
    ``"Red"``/``"Blue"`` is flipped. The ``OUTCOME`` proportions are printed
    before and after.

    Args:
        df: Fight-level DataFrame with an ``OUTCOME`` column; not modified.
        seed: Seed passed to ``np.random.seed``.
        track_swaps: When true, add a boolean ``Swapped`` column.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    np.random.seed(seed)
    mirrored = df.copy()
    print(df["OUTCOME"].value_counts(normalize=True))

    # Identify all red_/blue_ feature pairs
    red_cols = [name for name in df.columns if name.startswith("red_")]
    blue_cols = [
        name.replace("red_", "blue_")
        for name in red_cols
        if "blue_" + name[4:] in df.columns
    ]

    assert len(red_cols) == len(blue_cols), "Mismatch in red_ and blue_ columns."

    swaps = np.random.rand(len(df)) < 0.5

    for red_col, blue_col in zip(red_cols, blue_cols):
        # Hold on to the red side, overwrite it with blue, then give blue the saved red.
        saved_red = mirrored.loc[swaps, red_col].copy()
        mirrored.loc[swaps, red_col] = mirrored.loc[swaps, blue_col].copy()
        mirrored.loc[swaps, blue_col] = saved_red

    # Adjust Winner column
    if "OUTCOME" in df.columns:
        flipped = df.loc[swaps, "OUTCOME"].replace({"Red": "Blue", "Blue": "Red"})
        mirrored.loc[swaps, "OUTCOME"] = flipped

    if track_swaps:
        mirrored["Swapped"] = swaps

    print(mirrored["OUTCOME"].value_counts(normalize=True))

    return mirrored.reset_index(drop=True)


## --- Elo Rankings Functions ---


In [22]:
def get_expected_score(elo_win, elo_loss):
    """Return the Elo expected score of the ``elo_win`` side against ``elo_loss``.

    The result lies between 0 and 1: 0.5 for equal ratings, higher when
    ``elo_win`` is the larger rating.
    """
    rating_gap = elo_loss - elo_win
    odds_against = 10 ** (rating_gap / 400)
    return 1 / (1 + odds_against)

# Finishes earn a larger K-factor than decisions, and earlier-round finishes earn the largest.
def get_k_factor(method, round, k=100):
    """Return the Elo K-factor for a fight, scaled by how it ended.

    Args:
        method: Result method, e.g. ``"KO/TKO"``, ``"Submission"`` or
            ``"Decision - Unanimous"``. Surrounding whitespace is ignored.
        round: Round in which the fight ended. Only used for finishes.
            (The name shadows the builtin but is kept because callers pass it
            by keyword.)
        k: Base K-factor.

    Returns:
        ``k * 1.2`` / ``k * 1.18`` / ``k * 1.16`` for a KO/TKO or submission in
        round 1 / round 2 / any other round, ``k * 1.14`` for a unanimous
        decision, and ``k`` unchanged for every other method.
    """
    if isinstance(method, str):
        method = method.strip()

    # The previous test `method == "KO/TKO" or "Submission"` was always truthy
    # (a non-empty string), so every fight received the finish bonus.
    if method in ("KO/TKO", "Submission"):
        if round == 1:
            return k * 1.2
        if round == 2:
            return k * 1.18
        return k * 1.16
    if method == "Decision - Unanimous":
        return k * 1.14
    return k
    
def get_new_elo(win_elo, loss_elo, k_factor):
    """Return the ``(winner, loser)`` ratings after a decisive result.

    The winner gains ``k_factor * (1 - expected_win)`` and the loser gives up
    the same amount; both results are rounded to two decimals.
    """
    upset_margin = 1 - get_expected_score(win_elo, loss_elo)
    transfer = k_factor * upset_margin
    return round(win_elo + transfer, 2), round(loss_elo - transfer, 2)

## --- Elo Rankings Loop ---

In [23]:
def add_Elo_rankings(fight_results, fighters_df):
    """
    Adds Elo ratings to the fight_results DataFrame based on historical fight outcomes,
    updating fighters_df with the latest Elo ratings for each fighter.

    Fights are processed from the last row of ``fight_results`` to the first.
    Every fighter starts at 1000. Each row receives the two ratings held
    *before* that fight in ``red_Elo``/``blue_Elo``; the ratings are then
    updated from ``OUTCOME``:

    * ``"Red"`` / ``"Blue"``: standard winner/loser update via ``get_new_elo``.
    * ``"D/D"`` (draw): each side is scored 0.5 against its expected score, so
      evenly rated fighters are unchanged and the favourite loses a little.
    * anything else (e.g. ``"NC/NC"``): ratings are left unchanged.

    Args:
        fight_results: DataFrame with ``red_Name``, ``blue_Name`` and
            ``OUTCOME`` columns; modified in place.
        fighters_df: DataFrame with a ``Name`` column; gains an ``Elo`` column.

    Returns:
        ``(fight_results, fighters_df)``. Fighters that appear in
        ``fight_results`` but not in ``fighters_df`` are appended as new rows
        holding only ``Name`` and ``Elo``.
    """
    default_elo = 1000.0

    # Float from the start: ratings turn fractional after the first update, and
    # writing floats into an integer column triggers pandas' incompatible-dtype warning.
    fighters_df["Elo"] = default_elo

    # Name -> current rating. A dict lookup replaces a full column scan (and a
    # possible DataFrame concat) per fight.
    known_names = set(fighters_df["Name"])
    elos = {name: default_elo for name in fighters_df["Name"]}

    fight_results["red_Elo"] = float("nan")
    fight_results["blue_Elo"] = float("nan")

    for i in fight_results.index[::-1]:
        red_fighter = fight_results.loc[i, "red_Name"].strip()
        blue_fighter = fight_results.loc[i, "blue_Name"].strip()

        # Get current ELOs (default to 1000 if missing)
        red_elo_before = elos.get(red_fighter, default_elo)
        blue_elo_before = elos.get(blue_fighter, default_elo)

        # Store current ELOs into fight_results before the fight
        fight_results.loc[i, "red_Elo"] = red_elo_before
        fight_results.loc[i, "blue_Elo"] = blue_elo_before

        # Determine outcome
        outcome = fight_results.loc[i, "OUTCOME"]
        # NOTE(review): the outcome label is passed where get_k_factor expects the
        # result method, and no real round is available here -- confirm intent.
        k_factor = get_k_factor(outcome, round=0.0)

        if outcome == "Red":
            red_elo_after, blue_elo_after = get_new_elo(red_elo_before, blue_elo_before, k_factor)
        elif outcome == "Blue":
            blue_elo_after, red_elo_after = get_new_elo(blue_elo_before, red_elo_before, k_factor)
        elif outcome == "D/D":
            # Draw: actual score 0.5 for both sides, so the adjustment is symmetric.
            shift = k_factor * (0.5 - get_expected_score(red_elo_before, blue_elo_before))
            red_elo_after = round(red_elo_before + shift, 2)
            blue_elo_after = round(blue_elo_before - shift, 2)
        else:
            # No contest / unknown outcome: nobody won, so nobody's rating moves.
            red_elo_after, blue_elo_after = red_elo_before, blue_elo_before

        elos[red_fighter] = red_elo_after
        elos[blue_fighter] = blue_elo_after

    # Write the final ratings back; fighters unseen in fighters_df are appended
    # in the order they were first encountered.
    fighters_df["Elo"] = [elos.get(name, default_elo) for name in fighters_df["Name"]]
    new_names = [name for name in elos if name not in known_names]
    if new_names:
        additions = pd.DataFrame({"Name": new_names, "Elo": [elos[name] for name in new_names]})
        fighters_df = pd.concat([fighters_df, additions], ignore_index=True)

    return fight_results, fighters_df


## --- Main ---

In [None]:
def main():
    """Run the full pipeline: load raw data, aggregate, enrich, and write the CSV outputs."""
    bouts, round_stats, bios = load_and_prepare_data()

    # Career totals per fighter; the pre-fight snapshots are written onto `bouts` on the way.
    running_totals = build_fighters_aggregate(round_stats, bouts)
    compute_aggregated_features(bouts)

    fighter_averages = build_fighters_dataframe(running_totals)

    drop_raw_aggregated_features(bouts)

    bouts = merge_fighter_info_to_fight_results(bouts, bios)
    bouts, fighter_averages = add_Elo_rankings(bouts, fighter_averages)
    bouts = clean_fighter_info(bouts)

    fighter_averages = clean_and_merge_fighter_averages(fighter_averages, bios)

    bouts = randomly_swap_fighters(bouts)

    bouts.to_csv("../processed_data/fight_results_.csv", index=False)
    fighter_averages.to_csv("../processed_data/fighter_averages.csv", index=False)


if __name__ == "__main__":
    main()

  fighters_df.loc[fighters_df['Name'] == fighter_name, 'Elo'] = new_elo


OUTCOME
Red      0.635842
Blue     0.347078
NC/NC    0.009882
D/D      0.007198
Name: proportion, dtype: float64
OUTCOME
Blue     0.492619
Red      0.490301
NC/NC    0.009882
D/D      0.007198
Name: proportion, dtype: float64
