In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('../2_data_preparation/cleaned_data.csv')
clustered = pd.read_csv('data_clustered.csv')

In [3]:
clustered = clustered[['name', 'cluster']]
clustered

Unnamed: 0,name,cluster
0,Lerone Murphy,4
1,Edson Barboza,4
2,Khaos Williams,4
3,Carlston Harris,2
4,Themba Gorimbo,2
...,...,...
2391,Nick Sanzo,3
2392,Jim Mullen,3
2393,Yoshiki Takahashi,1
2394,Wallid Ismail,1


In [4]:
data['winner'].value_counts()

winner
f1    7289
0      111
Name: count, dtype: int64

In [5]:
# filter data where clear decision
data = data[data['winner'] == 'f1']
data['winner'].value_counts()

winner
f1    7289
Name: count, dtype: int64

In [6]:
data['method'].value_counts()

method
Decision - Unanimous    2645
KO/TKO                  2443
Submission              1435
Decision - Split         766
Name: count, dtype: int64

In [8]:
data = data[data['method'].isin(['Decision - Unanimous', 'Decision - Split'])]

In [9]:
data['method'].value_counts()

method
Decision - Unanimous    2645
Decision - Split         766
Name: count, dtype: int64

In [10]:

# Merge the cluster column from clustered DataFrame based on fighter names (f1 and f2)
data = pd.merge(data, clustered[['name', 'cluster']], left_on='f1', right_on='name', how='left')
data.rename(columns={'cluster': 'winner_cluster'}, inplace=True)

data = pd.merge(data, clustered[['name', 'cluster']], left_on='f2', right_on='name', how='left')
data.rename(columns={'cluster': 'loser_cluster'}, inplace=True)

In [11]:
odds_df = data[['winner_cluster', 'loser_cluster']]

In [12]:
odds_df.head(2)

Unnamed: 0,winner_cluster,loser_cluster
0,4,4
1,2,2


In [13]:
# filter data where clear decision
data = data[data['winner'] == 'f1']
data['winner'].value_counts()

winner
f1    3411
Name: count, dtype: int64

In [14]:
odds_df['winner_cluster'].value_counts()

winner_cluster
2    1669
4    1491
1     237
0       7
3       7
Name: count, dtype: int64

In [16]:
import matplotlib.cm as cm

# Group by winner and loser clusters to count the number of wins and losses
win_counts = odds_df.groupby(['winner_cluster', 'loser_cluster']).size().unstack(fill_value=0)
loss_counts = odds_df.groupby(['loser_cluster', 'winner_cluster']).size().unstack(fill_value=0)

# Calculate win percentage by cluster matchup
win_percentage = (win_counts / (win_counts + loss_counts)) * 100
# Round win percentages to nearest integer
win_percentage_rounded = win_percentage.round()

# Display win percentages by cluster matchup
win_percentage_rounded.style.background_gradient(cmap='Blues', axis=None)

loser_cluster,0,1,2,3,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,,50.0,36.0,,20.0
1,50.0,50.0,23.0,75.0,25.0
2,64.0,77.0,50.0,100.0,52.0
3,,25.0,0.0,50.0,
4,80.0,75.0,48.0,,50.0


In [17]:
(win_counts + loss_counts)

loser_cluster,0,1,2,3,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2,14,0,5
1,2,92,456,4,327
2,14,456,1626,9,941
3,0,4,9,12,0
4,5,327,941,0,1576


In [19]:
data.groupby(['weightclass', 'winner_cluster']).size().unstack(fill_value=0)

winner_cluster,0,1,2,3,4
weightclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bantamweight,0,18,184,0,241
featherweight,0,35,183,0,192
flyweight,1,14,112,0,183
heavyweight,0,26,96,3,84
light heavyweight,0,14,153,0,79
lightweight,1,39,338,3,231
middleweight,3,32,209,1,144
strawweight,0,12,49,0,135
welterweight,2,47,345,0,202


In [20]:
data['weightclass'].value_counts()

weightclass
lightweight          612
welterweight         596
bantamweight         443
featherweight        410
middleweight         389
flyweight            310
light heavyweight    246
heavyweight          209
strawweight          196
Name: count, dtype: int64

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to calculate win percentages for a given weight class
def calculate_win_percentage(df):
    win_counts = df.groupby(['winner_cluster', 'loser_cluster']).size().unstack(fill_value=0)
    loss_counts = df.groupby(['loser_cluster', 'winner_cluster']).size().unstack(fill_value=0)
    win_percentage = (win_counts / (win_counts + loss_counts)) * 100
    return win_percentage.round().fillna(0)  # Round and fill NaN with 0

# List of unique weight classes
weight_classes = data['weightclass'].unique()

# Create a dictionary to store win percentages for each weight class
win_percentage_by_weight_class = {}

# Calculate win percentages for each weight class
for weight_class in weight_classes:
    df_filtered = data[data['weightclass'] == weight_class]
    win_percentage_by_weight_class[weight_class] = calculate_win_percentage(df_filtered)

# # Plot each heatmap individually
# for weight_class, win_percentage in win_percentage_by_weight_class.items():
#     plt.figure(figsize=(6, 4))
#     sns.heatmap(win_percentage, annot=True, cmap="Blues", fmt="g", cbar=True)
#     plt.title(f'Win Percentages for Weight Class: {weight_class}')
#     plt.xlabel('Loser Cluster')
#     plt.ylabel('Winner Cluster')
#     plt.show()


In [22]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to calculate win percentages for a given weight class
def calculate_win_percentage(df):
    win_counts = df.groupby(['winner_cluster', 'loser_cluster']).size().unstack(fill_value=0)
    loss_counts = df.groupby(['loser_cluster', 'winner_cluster']).size().unstack(fill_value=0)
    win_percentage = (win_counts / (win_counts + loss_counts)) * 100
    return win_percentage.round().fillna(0)  # Round and fill NaN with 0

# List of unique weight classes in the desired order
weight_classes_ordered = ['strawweight', 'flyweight', 'bantamweight', 'featherweight', 
                          'lightweight', 'welterweight', 'middleweight', 
                          'light heavyweight', 'heavyweight']

# Create a dictionary to store win percentages DataFrames for each weight class
win_percentage_df_by_weight_class = {}

# Calculate win percentages for each weight class in the specified order
for weight_class in weight_classes_ordered:
    df_filtered = data[data['weightclass'] == weight_class]
    win_percentage_df_by_weight_class[weight_class] = calculate_win_percentage(df_filtered)

# Display the win percentages as styled DataFrames with background gradient
for weight_class, win_percentage_df in win_percentage_df_by_weight_class.items():
    print(f'Win Percentages for Weight Class: {weight_class}')
    display(win_percentage_df.style.background_gradient(cmap='Blues', axis=None))
    print('\n' + '='*50 + '\n')


Win Percentages for Weight Class: strawweight


loser_cluster,0,1,2,4
0,0.0,0.0,0.0,0.0
1,0.0,0.0,33.0,31.0
2,0.0,67.0,50.0,63.0
4,0.0,69.0,37.0,50.0




Win Percentages for Weight Class: flyweight


Unnamed: 0_level_0,0,1,2,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,0.0,0.0,0.0
1,0.0,50.0,19.0,17.0
2,0.0,81.0,50.0,47.0
4,0.0,83.0,53.0,50.0




Win Percentages for Weight Class: bantamweight


loser_cluster,1,2,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50.0,15.0,10.0
2,85.0,50.0,52.0
4,90.0,48.0,50.0




Win Percentages for Weight Class: featherweight


loser_cluster,0,1,2,4
0,0.0,0.0,0.0,0.0
1,0.0,50.0,23.0,28.0
2,0.0,77.0,50.0,45.0
4,0.0,72.0,55.0,50.0




Win Percentages for Weight Class: lightweight


loser_cluster,0,1,2,3,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,100.0,0.0,0.0,0.0
1,0.0,50.0,24.0,50.0,18.0
2,100.0,76.0,50.0,100.0,52.0
3,0.0,50.0,0.0,50.0,0.0
4,100.0,82.0,48.0,0.0,50.0




Win Percentages for Weight Class: welterweight


loser_cluster,0,1,2,3,4
0,0.0,0.0,40.0,0.0,0.0
1,100.0,50.0,32.0,0.0,32.0
2,60.0,68.0,50.0,0.0,54.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,68.0,46.0,0.0,50.0




Win Percentages for Weight Class: middleweight


loser_cluster,0,1,2,3,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0,0.0,50.0,0.0,0.0
1,0.0,50.0,17.0,0.0,29.0
2,50.0,83.0,50.0,0.0,50.0
3,0.0,0.0,0.0,50.0,0.0
4,0.0,71.0,50.0,0.0,50.0




Win Percentages for Weight Class: light heavyweight


loser_cluster,1,2,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,50.0,14.0,32.0
2,86.0,50.0,53.0
4,68.0,47.0,50.0




Win Percentages for Weight Class: heavyweight


loser_cluster,1,2,3,4
winner_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,50.0,32.0,100.0,40.0
2,68.0,50.0,100.0,57.0
3,0.0,0.0,50.0,0.0
4,60.0,43.0,0.0,50.0




