# UFC Case Study Notebook

## Importing libraries

In [378]:
#from collections import Counter
import json
from datetime import datetime
from collections import defaultdict
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import kagglehub
from jinja2 import Environment, FileSystemLoader

## Util functions

In [379]:
def human_readable_date(date):
    """Format a date like ``"March 3rd, 2024"``.

    ``date`` may be a ``datetime`` or a ``"YYYY-MM-DD"`` string.
    Usage example: ``d = human_readable_date(datetime.today())``
    """
    # Strings are parsed as ISO dates (use "%d/%m/%Y" instead for Brazilian-style input).
    parsed = datetime.strptime(date, "%Y-%m-%d") if isinstance(date, str) else date

    day_of_month = parsed.day

    # Ordinal suffixes: st, nd, rd, th. 11, 12 and 13 are irregular and always take "th".
    if day_of_month in (11, 12, 13):
        ordinal = "th"
    elif day_of_month % 10 == 1:
        ordinal = "st"
    elif day_of_month % 10 == 2:
        ordinal = "nd"
    elif day_of_month % 10 == 3:
        ordinal = "rd"
    else:
        ordinal = "th"

    month_name = parsed.strftime("%B")
    return f"{month_name} {day_of_month}{ordinal}, {parsed.year}"

def human_readable_duration(seconds: int) -> str:
    """Describe a duration in whole hours and minutes.

    Leftover seconds are dropped; anything shorter than one minute is
    reported as ``"less than a minute"``.

    Raises:
        ValueError: if ``seconds`` is negative.
    """
    if seconds < 0:
        raise ValueError("Duration cannot be negative")

    whole_minutes = seconds // 60
    hours, minutes = divmod(whole_minutes, 60)

    chunks = []
    for amount, unit in ((hours, "hour"), (minutes, "minute")):
        if not amount:
            continue
        # Pluralise everything except exactly one unit.
        label = unit if amount == 1 else unit + "s"
        chunks.append(f"{amount} {label}")

    if not chunks:
        return "less than a minute"
    return " and ".join(chunks)

## Raw data fetch

In [None]:
# Download the Kaggle dataset and report where the files were placed.
path = kagglehub.dataset_download("neelagiriaditya/ufc-datasets-1994-2025")
print("Path to dataset files:", path)

data_folder = "/UFC DATASETS/"

# All CSV files live in the same sub-folder of the download, so build the prefix once.
dataset_dir = f"{path}{data_folder}"

raw_event_details = pd.read_csv(f"{dataset_dir}event_details.csv", parse_dates=["date"])
raw_figther_details = pd.read_csv(f"{dataset_dir}fighter_details.csv", parse_dates=["dob"])
raw_fight_details = pd.read_csv(f"{dataset_dir}fight_details.csv")
raw_ufc_details = pd.read_csv(
    f"{dataset_dir}UFC.csv",
    parse_dates=["date", "r_dob", "b_dob"],
)

## Data basic exploration

### Event details exploration

In [None]:
raw_event_details.head()

In [None]:
raw_event_details.info()

### Fighter details exploration

In [None]:
raw_figther_details.head()

In [None]:
raw_figther_details.info()

### Fight details exploration

In [None]:
raw_fight_details.head()

In [None]:
raw_fight_details.info()

# Fight details initial cleaning

> Only weight categories with short names (up to two words) and no numbers in the name were considered, as long names suggest personalized, inconsistent, or poorly categorized divisions in the raw data. Superfight Championship and Open Weight were also removed because they were contested in only 2 and 4 years, respectively.

In [None]:
# Keep only the standard weight classes:
#   * at most two words in the division name,
#   * no digits in the name,
#   * not open weight / superfight championship.
# na=False makes a missing division evaluate to False in the "contains" tests;
# without it they return NaN and the `~` negation cannot be applied. Such rows are
# still dropped, because their word count is NaN and fails the `<= 2` comparison.
# case=False removes the excluded divisions regardless of how they are capitalised.
division_names = raw_fight_details["division"]
filtered_raw_fight_details = raw_fight_details[
    (division_names.str.split().str.len() <= 2)
    & ~division_names.str.contains(r"\d", regex=True, na=False)
    & ~division_names.str.contains(
        r"open weight|superfight championship", case=False, regex=True, na=False
    )
]

# 1. Description Reports

## Historical overview of the UFC

### Events by year

In [None]:
# Alternative considered: collections.Counter over the event years. It works, but the
# result needs several conversions before it can be plotted, so the pandas form is used.
events_by_year = raw_event_details["date"].dt.year.value_counts().sort_index() # number of events per year, ordered by year

In [None]:
chartName = "images/chart-1.png"

# Series.plot returns the Axes it drew on; configure and save through that object.
axes = events_by_year.plot.bar(figsize=(12, 6), title="Number of events by year")
axes.set_xlabel("Year")
axes.set_ylabel("Number of events")
axes.grid(True)

figure = axes.get_figure()
figure.tight_layout()
figure.savefig(chartName)
plt.show()

> We can see from the chart that the number of events per year increased significantly from 2005 onwards.

### Top 5 Cities and Countries with the most events

In [None]:
event_location = raw_event_details["location"].copy()

# One column per comma-separated component, with surrounding whitespace trimmed.
split_location = event_location.str.split(',', expand=True).apply(lambda col: col.str.strip())

# Creating new data frame
event_location_df = pd.DataFrame()
# The city is the first component of the location string.
event_location_df["city"] = split_location[0]
# The country is the last component of the location string. Taking it directly
# from the right-hand side of each string works for any number of components;
# picking iloc[-1]/iloc[-2] from the expanded frame depends on how many
# components *other* rows have (it fails when there is a single column and
# returns NaN for short rows when there are more than two trailing gaps).
event_location_df["country"] = (
    event_location.str.rsplit(",", n=1).str[-1].str.strip()
)

In [None]:
countries_num_of_events = event_location_df["country"].value_counts().sort_values(ascending=False)
countries_num_of_events.head(5)

In [None]:
countries_report = countries_num_of_events.head(5).to_dict()
countries_report

In [None]:
cities_num_of_events = event_location_df["city"].value_counts().sort_values(ascending=False)
cities_num_of_events.head(5)

In [None]:
cities_report = cities_num_of_events.head(5).to_dict()
cities_report

### Number of fights by weight category over time

In [None]:
# Attach the event date to every fight's division; event_id is only the join key
# and is dropped once the merge is done.
fight_divisions = filtered_raw_fight_details[["event_id", "division"]]
event_dates = raw_event_details[["event_id", "date"]]

categories_fights = (
    fight_divisions
    .merge(event_dates, on="event_id", how="inner")
    .drop(columns=["event_id"])
)

In [None]:
categories_fights["year"] = pd.to_datetime(categories_fights["date"]).dt.year

In [None]:
categories_fights = categories_fights.drop(["date"], axis = 1)

In [None]:
fights_per_year_division = (
    categories_fights
    .groupby(["year", "division"])
    .size()
    .reset_index(name="num_fights")
)

fights_per_year_division

#### ChartJS formatting transformation

In [None]:

data = fights_per_year_division.to_dict(orient="records")

# Index the counts by (year, division) once, so each chart cell is a dictionary
# lookup instead of a scan over every record.
fights_lookup = {
    (record["year"], record["division"]): record["num_fights"]
    for record in data
}

years = sorted({year for year, _ in fights_lookup})
division_names = sorted({division for _, division in fights_lookup})

# Reorganize per division: one list of counts per division, aligned with `years`.
# A (year, division) combination without fights is reported as 0.
divisions = defaultdict(list)
for division in division_names:
    divisions[division] = [fights_lookup.get((year, division), 0) for year in years]

# Chart.js payload: shared labels plus one dataset per division.
dist_fights_per_weight_chart_data = {
    "labels": years,
    "datasets": [
        {
            "label": division,
            "data": fights,
        }
        for division, fights in divisions.items()
    ]
}

# Serialised to a JSON string for the HTML template.
dist_fights_per_weight_chart_data = json.dumps(dist_fights_per_weight_chart_data)

## General fight statistics

### Ratio of wins by knockout, submission, and decision

In [None]:
methods = filtered_raw_fight_details["method"]
methods.value_counts()

In [None]:
# Keep only fights whose method mentions KO/TKO, Submission or Decision
# (rows with a missing method are excluded via na=False).
# The explicit .copy() makes this an independent frame: a new column is assigned
# to it afterwards, which on a plain slice triggers pandas' SettingWithCopyWarning.
ko_submission_decision_methods = filtered_raw_fight_details[
    filtered_raw_fight_details["method"].str.contains("KO/TKO|Submission|Decision", na=False)
].copy()

In [None]:
ko_submission_decision_methods["method"].value_counts()

#### Grouping decision method variations

In [None]:
ko_submission_decision_methods["method_grouped"] = np.where(
    ko_submission_decision_methods["method"].str.contains("Decision", na=False),
    "Decision",
    ko_submission_decision_methods["method"]
)

In [None]:
ko_submission_decision_methods["method_grouped"].value_counts()

#### Decision methods to chartjs and stats

In [None]:
# Count of fights per grouped method, as a plain dict for the chart.
ko_submission_decision_methods_json = ko_submission_decision_methods["method_grouped"].value_counts().to_dict()
vals_sum = sum(ko_submission_decision_methods_json.values())

def apply_perc(n):
    """Return ``n`` as a percentage of ``vals_sum``, rounded to 2 decimals.

    Returns 0.0 when ``vals_sum`` is zero instead of raising ZeroDivisionError.
    """
    if not vals_sum:
        return 0.0
    return round(n/vals_sum * 100, 2)

# Percentages in the same order as the counts in ko_submission_decision_methods_json.
ko_submission_decision_methods_props = [
    apply_perc(count) for count in ko_submission_decision_methods_json.values()
]

### Average duration of fights by category

In [None]:
avg_duration_per_division = filtered_raw_fight_details.groupby("division")["match_time_sec"].mean().sort_values(ascending=False)

In [None]:
avg_duration_per_division

In [None]:
humanized = avg_duration_per_division.apply(lambda x: human_readable_duration(int(x)))

In [None]:
for division, duration in humanized.items():
    print(f"{division}: {duration}")

In [None]:
duration_by_weight_class = avg_duration_per_division.to_dict()
duration_by_weight_class = {
    k.replace("_", " ").title(): round(v, 2)
    for k, v in duration_by_weight_class.items()
}

### Most common submission techniques

> Although the initial objective included analyzing the most common submission techniques, the dataset used does not specify the type of submission (e.g., rear naked choke, triangle choke, kimura). Therefore, this analysis could not be performed with the available data.

### Average strikes per minute and per round

In [None]:
raw_ufc_details.head(5)

#### Average strikes per minute

The main formula to calculate this is 

total strikes landed / (match_time_sec / 60)

For this metric I'll consider only "complete" fights, i.e. fights that lasted more than 300 seconds (5 minutes).

In [None]:
avg_strikes_per_min_ckp = raw_ufc_details.copy()
avg_strikes_per_min_ckp = avg_strikes_per_min_ckp[
    avg_strikes_per_min_ckp["match_time_sec"] > 300
]

In [None]:
avg_strikes_per_min_ckp["avg_str_min"] = (avg_strikes_per_min_ckp["b_total_str_landed"] + avg_strikes_per_min_ckp["r_total_str_landed"]) / (avg_strikes_per_min_ckp["match_time_sec"] / 60) 

In [None]:
avg_strikes_per_min_ckp.head(5)

#### Average strikes per round

The main formula to calculate this is 

total strikes landed / total_rounds

For this metric I'll consider only "complete" fights, i.e. fights that lasted more than 300 seconds (5 minutes).

In [None]:
avg_strikes_per_min_ckp["avg_str_rnd"] = (avg_strikes_per_min_ckp["b_total_str_landed"] + avg_strikes_per_min_ckp["r_total_str_landed"]) /  avg_strikes_per_min_ckp["finish_round"]

In [None]:
avg_strikes_per_min_ckp.head(5)

# Columns exposed to the report.
top_strike_fights = avg_strikes_per_min_ckp[["event_name", "date", "location", "division", "method", "match_time_sec", "b_name", "r_name", "avg_str_min", "avg_str_rnd"]]
# "Top" means the highest strike rate: rank by strikes per minute before keeping 5
# rows. head(5) alone only returns the first 5 rows in file order. nlargest also
# returns a new frame, so the rounding below does not write into a slice.
top_strike_fights = top_strike_fights.nlargest(5, "avg_str_min")
top_strike_fights["avg_str_min"] = top_strike_fights["avg_str_min"].round(2)
top_strike_fights["avg_str_rnd"] = top_strike_fights["avg_str_rnd"].round(2)

# List of row dicts for the HTML template.
top_strike_fights = top_strike_fights.to_dict(orient="records")

## Fighter Statistics

### Top 10 fighters with the most wins

In [None]:
top_10_fighters_with_the_most_wins = raw_figther_details.sort_values(by=['wins'], ascending=False)[["name", "nick_name", "wins"]].head(10)
top_10_fighters_with_the_most_wins_dict = top_10_fighters_with_the_most_wins.to_dict(orient="records")
top_10_fighters_with_the_most_wins_dict

### Fighters with the highest KO rate

In [None]:
# Only fights that have a recorded winner take part in the KO-rate computation.
df_winners = raw_ufc_details.dropna(subset=["winner"]).copy()

# Flag wins whose method mentions "KO" (case-insensitive, so "KO/TKO" and "TKO ..." both match).
df_winners["is_ko"] = df_winners["method"].str.contains("KO", case=False, na=False)

# KO/TKO wins per fighter.
ko_only = df_winners.loc[df_winners["is_ko"]]
ko_wins = (
    ko_only.groupby("winner")["fight_id"]
    .count()
    .rename("ko_wins")
    .reset_index()
)

# All wins per fighter.
total_wins = (
    df_winners.groupby("winner")["fight_id"]
    .count()
    .rename("total_wins")
    .reset_index()
)

# Inner join: fighters without any KO win do not appear in the result.
ko_rate_df = ko_wins.merge(total_wins, on="winner")

# Share of a fighter's wins that came by KO/TKO.
ko_rate_df["ko_rate"] = ko_rate_df["ko_wins"] / ko_rate_df["total_wins"]

# Keep fighters with more than 10 wins, highest KO rate first.
enough_wins = ko_rate_df["total_wins"] > 10
ko_rate_df = ko_rate_df.loc[enough_wins].sort_values(by="ko_rate", ascending=False)

top_10_ko_rate = ko_rate_df.head(10)
ko_rate_df_dict = top_10_ko_rate.to_dict(orient="records")

top_10_ko_rate

### Performance comparison by age

#### Calculating fighters ages

In [None]:
df = raw_ufc_details.copy()

# Age in completed years on the day of the fight.
# Dividing by the average year length (365.25 days) instead of 365 keeps leap days
# from accumulating: with 365, a fighter is counted one year older during the
# last days before their birthday (about a week at age 30).
# The result is a float column (e.g. 30.0); rows with a missing date of birth stay NaN.
days_per_year = 365.25
df["r_age"] = (df["date"] - df["r_dob"]).dt.days // days_per_year
df["b_age"] = (df["date"] - df["b_dob"]).dt.days // days_per_year

#### Creating a new data frame with fight results and fighters age

In [None]:
# Long format: one row per fighter per fight, built the same way for each corner.
# "is_winner" is True only when the fight's winner is this fighter, so fights
# without a recorded winner count as not won for both corners.
corner_frames = {
    corner: (
        df[[f"{corner}_name", f"{corner}_age", "winner"]]
        .rename(columns={f"{corner}_name": "fighter", f"{corner}_age": "age"})
        .assign(is_winner=lambda frame: frame["fighter"] == frame["winner"])
    )
    for corner in ("r", "b")
}

red_fighters = corner_frames["r"]
blue_fighters = corner_frames["b"]

# Stack both corners (original row labels are kept).
fighters_df = pd.concat([red_fighters, blue_fighters])

In [None]:
fighters_df

In [None]:
performance_by_age = fighters_df.groupby("age")["is_winner"].mean().reset_index(name="win_rate")

> ⚠️ **Disclaimer: How the Win Rate by Age is Calculated**
>
> The `win_rate` (win percentage by age) in this notebook is computed by analyzing fighters' outcomes based on their age at the time of each fight.
> 
> The dataset has been transformed into a long format, where each row represents **one fighter per fight** (instead of one row per fight with two fighters). This allows us to associate each fighter with their age and match outcome individually.
>
> The win rate for each age is calculated as:
>
> **Number of fights won by fighters at a given age ÷ Total number of fights fought at that same age**
>
> For example, if fighters who were 30 years old won 60 out of 100 fights, the win rate at age 30 is 0.60.
>
> ---
>
> **Important Notes:**
> - This does **not** account for opponent strength, fight context, or event level.
> - Ages with very few fights may result in misleading rates due to low sample size.


In [None]:
chartName = "images/chart-2.png"

# Line chart of win rate per age, drawn through the object-oriented matplotlib API.
figure, axes = plt.subplots(figsize=(10, 6))
axes.plot(performance_by_age["age"], performance_by_age["win_rate"], marker="o")
axes.set_xlabel("Age")
axes.set_ylabel("Win Rate")
axes.set_title("Performance by Age (Win Rate)")
axes.grid(True)
figure.tight_layout()
figure.savefig(chartName)
plt.show()


### Average career length

In [None]:
df = raw_ufc_details.copy()

In [None]:
red_fighters = df[["r_id", "r_name", "date"]].rename(columns={
    "r_id": "fighter_id",
    "r_name": "fighter_name"
})

blue_fighters = df[["b_id", "b_name", "date"]].rename(columns={
    "b_id": "fighter_id",
    "b_name": "fighter_name"
})

all_fighters = pd.concat([red_fighters, blue_fighters], ignore_index=True)

In [None]:
career_dates = all_fighters.groupby("fighter_id")["date"].agg(["min", "max"]).reset_index()
career_dates["career_length_days"] = (career_dates["max"] - career_dates["min"]).dt.days
career_dates["career_length_years"] = career_dates["career_length_days"] / 365

In [None]:
average_career_length_years = round(career_dates["career_length_years"].mean(), 2)
average_career_length_years

# 2. Analytical & Comparative Reports

> These go deeper and highlight analytical thinking, correlations, and behavior patterns.

## What Influences a Fight Outcome?

In [None]:
df_fight_outcome = raw_ufc_details.copy()

### Correlation between height/reach and win probability

In [None]:
height_diff = df_fight_outcome["r_height"] - df_fight_outcome["b_height"]
reach_diff = df_fight_outcome["r_reach"] - df_fight_outcome["b_reach"]
is_red_win = df_fight_outcome["winner"] == df_fight_outcome["r_name"]

In [None]:
df_corr = pd.DataFrame({
    "height_diff": height_diff,
    "reach_diff": reach_diff,
    "is_red_win": is_red_win.astype(int)  # True/False -> 1/0
})

#### Evaluate correlation

In [None]:
correlation_matrix = df_corr.corr()

correlation_matrix

In [None]:
corr_result = correlation_matrix["is_red_win"].sort_values(ascending=False)

In [None]:
reach_corr = corr_result["reach_diff"]
height_corr = corr_result["height_diff"]

# Derive the comparison from the computed values instead of hard-coding it, so the
# note cannot contradict the numbers it quotes. Strength of a correlation is its
# magnitude, hence the abs().
if abs(reach_corr) > abs(height_corr):
    comparison = "reach has a slightly higher influence on fight outcomes than height"
elif abs(reach_corr) < abs(height_corr):
    comparison = "height has a slightly higher influence on fight outcomes than reach"
else:
    comparison = "reach and height have a similar influence on fight outcomes"

influences_fight_outcome = (
    f"Note: Based on the data, reach difference shows a correlation of {reach_corr:.3f} "
    f"with winning probability, while height difference shows a correlation of {height_corr:.3f}. "
    f"This suggests that {comparison}, although both "
    "correlations are weak and should not be considered strong predictors."
)

print(influences_fight_outcome)

### Does making weight increase win chances?

> Fighters who cut weight correctly can have a psychological and physical advantage (even if temporary) over those who miss weight, or at least, they don't suffer the penalties.

#### Divisions weight limits

#### Creating divisions dictionary

In [None]:
# Missing division names are skipped: they cannot be matched to a weight class
# and would break the string handling below.
divisions = list(df_fight_outcome["division"].dropna().unique())

# Weight limits (in pounds) of the divisions known so far.
weight_limits = {
    "strawweight": 115,
    "flyweight": 125,
    "bantamweight": 135,
    "featherweight": 145,
    "lightweight": 155,
    "welterweight": 170,
    "middleweight": 185,
    "light heavyweight": 205,
    "heavyweight": 265
}

# Try longer names first so that "light heavyweight" is always matched before
# "heavyweight", independently of the order of the dictionary above.
known_divisions = sorted(weight_limits, key=len, reverse=True)
for division in divisions:
    # Compare case-insensitively and treat "_" as a space, so both
    # "Light Heavyweight" and "light_heavyweight" resolve to the same class.
    normalized = division.lower().replace("_", " ")
    for kd in known_divisions:
        if kd in normalized:
            # Register the raw division name so Series.map below can find it.
            weight_limits[division] = weight_limits[kd]
            break

# Divisions that match no known weight class get NaN as their limit.
df_fight_outcome["division_limit"] = df_fight_outcome["division"].map(weight_limits)

#### Filtering divisions with limit

In [None]:
df_fight_outcome = df_fight_outcome[df_fight_outcome["division_limit"].notna()]

#### Did the fighter make weight?

In [None]:
# Long format: one row per fighter per fight, with corner-independent column names
# so that both corners can be stacked into a single frame.
red_df = df_fight_outcome[[
    "division_limit", "r_name", "r_weight", "winner"
]].rename(columns={
    "r_name": "fighter_name",
    "r_weight": "fighter_weight"
})

# True when the red-corner fighter is the recorded winner.
red_df["won"] = red_df["winner"] == red_df["fighter_name"]

# Same for the blue corner.
blue_df = df_fight_outcome[[
    "division_limit", "b_name", "b_weight", "winner"
]].rename(columns={
    "b_name": "fighter_name",
    "b_weight": "fighter_weight"
})

# True when the blue-corner fighter is the recorded winner.
blue_df["won"] = blue_df["winner"] == blue_df["fighter_name"]

# Stack both corners: all fighters regardless of corner.
full_df = pd.concat([red_df, blue_df], ignore_index=True)

# A missing weight says nothing about making weight. Without this filter,
# NaN <= limit evaluates to False and those fighters are counted as having missed weight.
full_df = full_df.dropna(subset=["fighter_weight"]).copy()

# NOTE(review): fighter_weight is presumably the fighter's listed weight rather than
# the official weigh-in result for this fight — confirm against the dataset description.
full_df["made_weight"] = full_df["fighter_weight"] <= full_df["division_limit"]

# "won" is boolean (True=1, False=0), so its mean per group is the win rate of
# fighters who made weight vs. those who did not.
win_rate_by_weight = (
    full_df
    .groupby("made_weight")["won"]
    .mean()
    .reset_index()
)

# generate_ufc_report_html passes `made_weight_stats` to the template; it must be
# defined here or the report generation fails with a NameError.
made_weight_stats = win_rate_by_weight.to_dict(orient="records")

# Print the final result: win rates grouped by whether fighters made weight or not
print(win_rate_by_weight)


### Do fighters coming off a win have a higher chance of winning again?

In [None]:
df_fight_outcome = raw_ufc_details.copy()

#### Creating full version of data frame

In [None]:
# Build one frame per corner with corner-independent column names:
#   date, fighter_name, winner, won
# "won" is True when the fight's recorded winner is that corner's fighter.
red_df, blue_df = (
    df_fight_outcome[["date", f"{corner}_name", "winner"]]
    .rename(columns={f"{corner}_name": "fighter_name"})
    .assign(won=lambda frame: frame["winner"] == frame["fighter_name"])
    for corner in ("r", "b")
)

# Stack the two corners into a single frame with a fresh 0..n-1 index, so every
# fighter appears once per fight regardless of the corner they fought from.
full_df = pd.concat([red_df, blue_df], ignore_index=True)

#### Creating the won_previous information

In [None]:
# 1. Sort the full_df dataframe by fighter and fight date (in ascending order)
full_df = full_df.sort_values(by=["fighter_name", "date"])

# 2. Create the column won_previous, which takes the value of the 'won' column from the previous fight for the same fighter
full_df["won_previous"] = full_df.groupby("fighter_name")["won"].shift(1)

# 3. A fighter's first fight has no previous fight. It is kept as <NA> rather than
#    set to False: treating debuts as "lost the previous fight" would mix every
#    debut into the after-a-loss group and distort its win rate.
#    With the nullable "boolean" dtype, comparisons such as `won_previous == True`
#    yield <NA> for those rows, and pandas treats <NA> as False when the result is
#    used as a filter, so debuts fall into neither group.
full_df["won_previous"] = full_df["won_previous"].astype("boolean")

#### Evaluating the won_previous rate

In [None]:
win_rate_after_win = full_df[full_df["won_previous"] == True]["won"].mean()
win_rate_after_loss = full_df[full_df["won_previous"] == False]["won"].mean()

print(f"Win rate after winning previous fight: {win_rate_after_win:.2%}")
print(f"Win rate after losing previous fight: {win_rate_after_loss:.2%}")

## Rivalries & Rematches

In [None]:
df_rivalries_and_rematches = raw_ufc_details.copy()

### Timeline of fights between the same opponents
> For the sake of simplicity, only the 5 pairs of opponents who fought each other the most times are shown.

In [None]:

# Step 1: Order-independent key for each pair of opponents.
# IDs are used instead of names because names may change or be spelled differently.
df_rivalries_and_rematches["pair_key"] = [
    tuple(sorted(pair))
    for pair in zip(df_rivalries_and_rematches["r_id"], df_rivalries_and_rematches["b_id"])
]

# Step 2: Number of fights per pair
pair_counts = (
    df_rivalries_and_rematches.groupby("pair_key")["fight_id"]
        .count()
        .rename("num_fights")
        .reset_index()
)

# Step 3: The 5 pairs that met most often
most_frequent_pairs = pair_counts.sort_values("num_fights", ascending=False).head(5)
top_5_pairs = most_frequent_pairs["pair_key"]

# Step 4: Keep only the fights of those rivalries
in_top_rivalry = df_rivalries_and_rematches["pair_key"].isin(top_5_pairs)
df_top_rivalries = df_rivalries_and_rematches[in_top_rivalry].copy()

# Step 5: Chronological order within each rivalry
df_top_rivalries["date"] = pd.to_datetime(df_top_rivalries["date"])
df_top_rivalries = df_top_rivalries.sort_values(["pair_key", "date"])

# Step 6: 1-based position of each fight within its rivalry
df_top_rivalries["fight_number"] = df_top_rivalries.groupby("pair_key").cumcount() + 1

# Step 7: Columns shown in the timeline
timeline_cols = [
    "pair_key",
    "fight_number",
    "date",
    "event_name",
    "r_name",
    "b_name",
    "winner",
    "method",
]

timeline_df = df_top_rivalries[timeline_cols]

timeline_df

### Performance evolution across trilogies

In [None]:
df_performance_evo_across_trilogies = raw_ufc_details.copy()

#### Create a column with a unique identifier for the pair of fighters

In [None]:
# Create column with ordered names to identify the fight pair
df_performance_evo_across_trilogies["fighter_pair"] = df_performance_evo_across_trilogies.apply(
    lambda row: tuple(sorted([row["r_id"], row["b_id"]])),
    axis=1
)

#### Filter pairs that have fought 3 or more times (trilogies or more)

In [None]:
# Count how many fights each pair had
pair_counts = df_performance_evo_across_trilogies["fighter_pair"].value_counts()

# Filter only pairs with 3 or more fights
trilogies_pairs = pair_counts[pair_counts >= 3].index.tolist()

# Filter original dataframe to keep only these fights
df_trilogies = df_performance_evo_across_trilogies[
    df_performance_evo_across_trilogies["fighter_pair"].isin(trilogies_pairs)
].copy()

#### Sort by date to analyze chronological evolution

In [None]:
df_trilogies = df_trilogies.sort_values(by=["fighter_pair", "date"])

#### For each fight, identify the winner and create simple metrics for performance

* Who won
* Fight number of the trilogy (1, 2, 3, ...)
* Important statistics (e.g., significant strikes, takedowns, control)

In [None]:
# 1-based position of each fight within its pair (frame is already sorted by pair and date).
df_trilogies["match_number"] = df_trilogies.groupby("fighter_pair").cumcount() + 1

imp_stats_colums = [
    "fighter_pair",
    "date",
    "match_number",
    "winner_id",
    "r_id",
    "b_id",
    "r_name",
    "b_name",
    "r_total_str_landed",
    "b_total_str_landed",
    "r_td_landed",
    "b_td_landed",
    "r_ctrl",
    "b_ctrl"
]

# .copy() gives an independent frame; assigning the diff columns to a plain column
# selection triggers pandas' SettingWithCopyWarning.
imp_stats_df = df_trilogies[imp_stats_colums].copy()

# Absolute gap between the two corners for each statistic (how lopsided the fight was).
imp_stats_df["strikes_diff"] = (imp_stats_df["r_total_str_landed"] - imp_stats_df["b_total_str_landed"]).abs()
imp_stats_df["td_diff"] = (imp_stats_df["r_td_landed"] - imp_stats_df["b_td_landed"]).abs()
imp_stats_df["ctrl_diff"] = (imp_stats_df["r_ctrl"] - imp_stats_df["b_ctrl"]).abs()


In [None]:
imp_stats_df

### Who tends to improve in rematches?

In [None]:
df_improve_in_rematches = raw_ufc_details.copy()

# 1. Create a unique identifier for each pair of fighters in a consistent order
# so that "Fighter A vs Fighter B" and "Fighter B vs Fighter A" are treated the same.
df_improve_in_rematches["fighter_pair"] = df_improve_in_rematches.apply(
    lambda row: tuple(sorted([row["r_id"], row["b_id"]])),
    axis=1
)

# 2. Filter pairs with more than one fight (rematches)
pair_counts = df_improve_in_rematches['fighter_pair'].value_counts()
rematch_pairs = pair_counts[pair_counts > 1].index.tolist()
rematches_df = df_improve_in_rematches[df_improve_in_rematches['fighter_pair'].isin(rematch_pairs)].copy()

# 3. Sort fights by date within each pair
rematches_df.sort_values(by=['fighter_pair', 'date'], inplace=True)

# 4. Assign a fight number (1st fight, 2nd fight, etc) within each pair
rematches_df['fight_number'] = rematches_df.groupby('fighter_pair').cumcount() + 1

# 5. Stats available for comparison
stats_cols = ['r_total_str_landed', 'b_total_str_landed', 'r_td_landed', 'b_td_landed', 'r_ctrl', 'b_ctrl']

pivot_cols = ['date', 'winner', 'r_name', 'b_name'] + stats_cols
pivot_df = rematches_df[pivot_cols + ['fighter_pair', 'fight_number']]

# Separate frames for the first and second meeting of each pair
fight1 = pivot_df[pivot_df['fight_number'] == 1].set_index('fighter_pair')
fight2 = pivot_df[pivot_df['fight_number'] == 2].set_index('fighter_pair')

# 6a. Corner-based comparison (kept for backward compatibility).
# Caveat: the red corner of fight 2 is not necessarily the same fighter as the red
# corner of fight 1, so this column can compare two different people.
improvement_df = fight2[['r_total_str_landed']].copy()
improvement_df['r_strikes_improvement'] = fight2['r_total_str_landed'] - fight1['r_total_str_landed']


# 6b. Per-fighter comparison: each fighter's second meeting is compared with the
# same fighter's first meeting against the same opponent, whatever corner they had.
def _corner_strikes(fights, own, other):
    """Return one row per fight for the `own` corner with corner-independent column names."""
    view = fights[[
        'fight_number', f'{own}_id', f'{own}_name', f'{other}_id', f'{own}_total_str_landed'
    ]].copy()
    view.columns = ['fight_number', 'fighter_id', 'fighter_name', 'opponent_id', 'total_str_landed']
    return view


per_fighter_strikes = pd.concat(
    [_corner_strikes(rematches_df, 'r', 'b'), _corner_strikes(rematches_df, 'b', 'r')],
    ignore_index=True
)
first_meeting = per_fighter_strikes[per_fighter_strikes['fight_number'] == 1]
second_meeting = per_fighter_strikes[per_fighter_strikes['fight_number'] == 2]

fighter_improvement_df = second_meeting.merge(
    first_meeting[['fighter_id', 'opponent_id', 'total_str_landed']],
    on=['fighter_id', 'opponent_id'],
    suffixes=('_fight2', '_fight1')
)
# Positive values: the fighter landed more strikes in the rematch than in the first fight.
fighter_improvement_df['strikes_improvement'] = (
    fighter_improvement_df['total_str_landed_fight2'] - fighter_improvement_df['total_str_landed_fight1']
)
fighter_improvement_df = (
    fighter_improvement_df
    .drop(columns='fight_number')
    .sort_values('strikes_improvement', ascending=False)
)

fighter_improvement_df

## Generate HTML with reports

In [None]:
def generate_ufc_report_html(output_file="ufc_report.html"):
    """Render the ``ufc_report.html`` template and write the result to ``output_file``.

    The template is loaded from ``./templates`` and filled with the results that the
    earlier notebook cells stored in module-level variables, so those cells must
    have been executed first.
    """
    template = Environment(loader=FileSystemLoader("./templates")).get_template("ufc_report.html")

    # Template variables: static links first, then the computed report sections.
    context = {
        "title": "UFC Analysis Report",
        "ufc_logo_url": "./images/UFC_Logo.png",
        "kaggle_link": "https://www.kaggle.com/datasets/neelagiriaditya/ufc-datasets-1994-2025/data",
        "github_link": "https://github.com/joaogouveia89/ufc-data-study-case",
        "countries": countries_report,
        "cities": cities_report,
        "dist_fights_per_weight_chart": dist_fights_per_weight_chart_data,
        "ko_submission_decision_methods_chart": ko_submission_decision_methods_json,
        "ko_submission_decision_methods_props": ko_submission_decision_methods_props,
        "duration_by_weight_class": duration_by_weight_class,
        "top_strike_fights": top_strike_fights,
        "top_10_fighters_with_the_most_wins": top_10_fighters_with_the_most_wins_dict,
        "top_10_fighters_with_the_most_ko_rate": ko_rate_df_dict,
        "average_career_length_years": average_career_length_years,
        "influences_fight_outcome": influences_fight_outcome,
        "made_weight_stats": made_weight_stats,
        "processed_date": human_readable_date(datetime.today()),
    }
    html = template.render(**context)

    with open(output_file, "w", encoding="utf-8") as report_file:
        report_file.write(html)

    print(f"Saved on {output_file}")

generate_ufc_report_html()