In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [6]:
df_raw = pd.read_excel('ufc_fight_details.xlsx')


In [7]:
df_raw.shape

(18777, 73)

In [8]:
df = df_raw.copy()


In [9]:
# Read the events file
events_df = pd.read_excel('ufc_events.xlsx')

events_df = events_df.drop_duplicates(subset='event_name')

# Merge the date information
df = df.merge(
    events_df[['event_name', 'event_date']], 
    on='event_name', 
    how='left'
)

df['event_date'] = pd.to_datetime(df['event_date'])

del events_df

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18777 entries, 0 to 18776
Data columns (total 74 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   event_name                18777 non-null  object        
 1   fight_type                18777 non-null  object        
 2   method                    18777 non-null  object        
 3   time_format               18777 non-null  object        
 4   referee                   18713 non-null  object        
 5   finish_details            7247 non-null   object        
 6   red_fighter_reach         18777 non-null  object        
 7   red_fighter_height        18777 non-null  object        
 8   red_fighter_weight        18777 non-null  object        
 9   red_fighter_stance        18735 non-null  object        
 10  red_fighter_dob           18777 non-null  object        
 11  blue_fighter_reach        18777 non-null  object        
 12  blue_fighter_heigh

In [11]:
df.head()

Unnamed: 0,event_name,fight_type,method,time_format,referee,finish_details,red_fighter_reach,red_fighter_height,red_fighter_weight,red_fighter_stance,...,red_sig_str_attempted,red_sig_str_landed,red_sig_str_pct,red_sub_att,red_td_attempted,red_td_landed,red_td_pct,red_total_str_attempted,red_total_str_landed,event_date
0,UFC 311: Makhachev vs. Moicano,UFC Lightweight Title Bout,Submission,5 Rnd (5-5-5-5-5),Herb Dean,D'Arce Choke On Ground,70,"5' 10""",155,Southpaw,...,19,6,31,1,2,1,50,31,18,2025-01-18
1,UFC 311: Makhachev vs. Moicano,UFC Bantamweight Title Bout,Decision - Unanimous,5 Rnd (5-5-5-5-5),Jason Herzog,,68,"5' 6""",135,Orthodox,...,47,20,42,0,1,0,0,48,21,2025-01-18
2,UFC 311: Makhachev vs. Moicano,UFC Bantamweight Title Bout,Decision - Unanimous,5 Rnd (5-5-5-5-5),Jason Herzog,,68,"5' 6""",135,Orthodox,...,49,20,40,0,3,0,0,58,27,2025-01-18
3,UFC 311: Makhachev vs. Moicano,UFC Bantamweight Title Bout,Decision - Unanimous,5 Rnd (5-5-5-5-5),Jason Herzog,,68,"5' 6""",135,Orthodox,...,54,21,38,0,5,1,20,58,25,2025-01-18
4,UFC 311: Makhachev vs. Moicano,UFC Bantamweight Title Bout,Decision - Unanimous,5 Rnd (5-5-5-5-5),Jason Herzog,,68,"5' 6""",135,Orthodox,...,70,36,51,0,9,4,44,82,48,2025-01-18


In [12]:
def missing(data):
# Identify missing data
    missing_data = data.isnull().sum()
    missing_percent = (data.isnull().sum() / len(data)) * 100

    # Create a DataFrame for missing data
    missing_df = pd.DataFrame({
        'Total': missing_data,
        'Percent': missing_percent
    })
    # Display columns with missing values
    return missing_df[missing_df['Total'] > 0].sort_values(by="Total", ascending=False)

missing(df)

Unnamed: 0,Total,Percent
finish_details,11530,61.40491
blue_fighter_stance,97,0.516589
referee,64,0.340843
red_fighter_stance,42,0.223678


In [13]:
df['finish_details'] = df['finish_details'].fillna(df['method'])

In [14]:
df = df.drop(columns=['referee', 'red_fighter_link', 'blue_fighter_link', 'fight_url'])


In [15]:
for i in ['blue_fighter_stance', 'red_fighter_stance']:
    df[i] = df[i].fillna(df[i].mode()[0])

missing(df)


Unnamed: 0,Total,Percent


In [16]:
# Function to convert percentage strings to float
def convert_pct_to_float(x):
    if x == '---' or pd.isna(x):
        return 0.0
    return float(x.strip('%')) / 100

# Find columns with 'pct' in their name
pct_columns = [col for col in df.columns if 'pct' in col.lower()]

# Convert each percentage column to float
for col in pct_columns:
    df[col] = df[col].apply(convert_pct_to_float)

# Verify the changes
print("Sample of converted columns:")
print(df[pct_columns].head())

Sample of converted columns:
   blue_sig_str_pct  blue_td_pct  red_sig_str_pct  red_td_pct
0              0.33         0.00             0.31        0.50
1              0.36         0.00             0.42        0.00
2              0.53         0.33             0.40        0.00
3              0.45         0.20             0.38        0.20
4              0.44         0.00             0.51        0.44


In [17]:
ctrl_columns = [col for col in df.columns if 'ctrl' in col.lower()]

# Function to convert time format to seconds
def convert_time_to_seconds(time_str):
    if pd.isna(time_str):
        return 0
    try:
        # Split minutes and seconds
        minutes, seconds = map(float, time_str.split(':'))
        return int(minutes * 60 + seconds)
    except:
        return 0

# Convert each time column to seconds
for col in ctrl_columns:
    df[col] = df[col].apply(convert_time_to_seconds)

print(df[ctrl_columns].head())

   blue_ctrl  red_ctrl
0          0        87
1          2         2
2         48        15
3         43         7
4          4        65


In [18]:
df['final_time'] = df['final_time'].apply(convert_time_to_seconds)

In [19]:
dob_columns = [col for col in df.columns if 'dob' in col.lower()]

for col in dob_columns:
    df[col] = pd.to_datetime(df[col], format='%b %d, %Y', errors='coerce')

print(df[dob_columns].head())

  red_fighter_dob blue_fighter_dob
0      1991-10-27       1989-05-21
1      1991-01-10       1996-01-03
2      1991-01-10       1996-01-03
3      1991-01-10       1996-01-03
4      1991-01-10       1996-01-03


In [20]:
# Find reach columns
reach_columns = [col for col in df.columns if 'reach' in col.lower()]

# Create a dictionary to store fighter reaches
fighter_reaches = {}

# First pass: collect all valid reaches for each fighter
for col in ['red_fighter_reach', 'blue_fighter_reach']:
    fighter_col = 'red_fighter_name' if col.startswith('red') else 'blue_fighter_name'
    
    # Get only valid reach values (not NaN or '--')
    valid_reaches = df[~df[col].isin(['--', np.nan])][[fighter_col, col]]
    
    # Add to dictionary
    for _, row in valid_reaches.iterrows():
        fighter = row[fighter_col]
        reach = row[col]
        # Handle both string and integer values
        if isinstance(reach, str):
            reach = int(reach.strip('"'))
        if fighter not in fighter_reaches:
            fighter_reaches[fighter] = reach

print(f"Number of fighters with known reach: {len(fighter_reaches)}")

# Second pass: fill in missing reaches
for col in ['red_fighter_reach', 'blue_fighter_reach']:
    fighter_col = 'red_fighter_name' if col.startswith('red') else 'blue_fighter_name'
    
    # Find rows with missing reach
    missing_mask = df[col].isin(['--', np.nan])
    
    # Fill in missing reaches where possible
    for idx in df[missing_mask].index:
        fighter = df.loc[idx, fighter_col]
        if fighter in fighter_reaches:
            df.loc[idx, col] = str(fighter_reaches[fighter]) + '"'

# Convert all reaches to integers
def convert_reach_to_int(x):
    if pd.isna(x):
        return None
    if isinstance(x, str):
        if x == '--':
            return None
        return int(x.strip('"'))
    return x

# Convert all reaches to integers where possible
for col in reach_columns:
    df[col] = df[col].apply(convert_reach_to_int)

# Calculate average reach from valid values
valid_reaches = []
for col in ['red_fighter_reach', 'blue_fighter_reach']:
    valid_reaches.extend(df[col].dropna().tolist())
    
average_reach = int(np.mean(valid_reaches))
print(f"Average reach: {average_reach} inches")

# Fill remaining missing values with the average
for col in reach_columns:
    df[col] = df[col].fillna(average_reach)

# Verify the changes
print("\nSample of final reach data:")
print(df[reach_columns].head())

# Verify no missing values remain
print("\nRemaining missing values:")
print(df[reach_columns].isna().sum())

Number of fighters with known reach: 1901
Average reach: 71 inches

Sample of final reach data:
   red_fighter_reach  blue_fighter_reach
0               70.0                72.0
1               68.0                69.0
2               68.0                69.0
3               68.0                69.0
4               68.0                69.0

Remaining missing values:
red_fighter_reach     0
blue_fighter_reach    0
dtype: int64


In [21]:
# Find weight columns
weight_columns = [col for col in df.columns if 'weight' in col.lower()]

# Create a dictionary to store fighter weights
fighter_weights = {}

# First pass: collect all valid weights for each fighter
for col in ['red_fighter_weight', 'blue_fighter_weight']:
    fighter_col = 'red_fighter_name' if col.startswith('red') else 'blue_fighter_name'
    
    # Get only valid weight values (not NaN or '--')
    valid_weights = df[~df[col].isin(['--', np.nan])][[fighter_col, col]]
    
    # Add to dictionary
    for _, row in valid_weights.iterrows():
        fighter = row[fighter_col]
        weight = row[col]
        # Handle both string and integer values
        if isinstance(weight, str):
            weight = int(weight.strip(' lbs'))
        if fighter not in fighter_weights:
            fighter_weights[fighter] = weight

print(f"Number of fighters with known weight: {len(fighter_weights)}")

# Second pass: fill in missing weights
for col in ['red_fighter_weight', 'blue_fighter_weight']:
    fighter_col = 'red_fighter_name' if col.startswith('red') else 'blue_fighter_name'
    
    # Find rows with missing weight
    missing_mask = df[col].isin(['--', np.nan])
    
    # Fill in missing weights where possible
    for idx in df[missing_mask].index:
        fighter = df.loc[idx, fighter_col]
        if fighter in fighter_weights:
            df.loc[idx, col] = str(fighter_weights[fighter]) + ' lbs'

# Convert all weights to integers
def convert_weight_to_int(x):
    if pd.isna(x):
        return None
    if isinstance(x, str):
        if x == '--':
            return None
        return int(x.strip(' lbs'))
    return x

# Convert all weights to integers where possible
for col in weight_columns:
    df[col] = df[col].apply(convert_weight_to_int)

# Calculate average weight from valid values
valid_weights = []
for col in ['red_fighter_weight', 'blue_fighter_weight']:
    valid_weights.extend(df[col].dropna().tolist())
    
average_weight = int(np.mean(valid_weights))
print(f"Average weight: {average_weight} lbs")

# Fill remaining missing values with the average
for col in weight_columns:
    df[col] = df[col].fillna(average_weight)

# Verify the changes
print("\nSample of final weight data:")
print(df[weight_columns].head())

# Verify no missing values remain
print("\nRemaining missing values:")
print(df[weight_columns].isna().sum())

Number of fighters with known weight: 2529
Average weight: 165 lbs

Sample of final weight data:
   red_fighter_weight  blue_fighter_weight
0               155.0                155.0
1               135.0                135.0
2               135.0                135.0
3               135.0                135.0
4               135.0                135.0

Remaining missing values:
red_fighter_weight     0
blue_fighter_weight    0
dtype: int64


In [22]:
# Find height columns
height_columns = [col for col in df.columns if 'height' in col.lower()]

# Function to convert height to centimeters
def convert_height_to_cm(height_str):
    if pd.isna(height_str) or height_str == '--':
        return None
    try:
        # Split the feet and inches parts
        feet_str, inches_str = height_str.split("'")
        feet = int(feet_str)
        inches = int(inches_str.strip('"'))
        
        # Convert to centimeters
        # 1 foot = 30.48 cm
        # 1 inch = 2.54 cm
        total_cm = (feet * 30.48) + (inches * 2.54)
        return round(total_cm)
    except:
        return None

# Create a dictionary to store fighter heights
fighter_heights = {}

# First pass: collect all valid heights for each fighter
for col in ['red_fighter_height', 'blue_fighter_height']:
    fighter_col = 'red_fighter_name' if col.startswith('red') else 'blue_fighter_name'
    
    # Get only valid height values
    valid_heights = df[~df[col].isin(['--', np.nan])][[fighter_col, col]]
    
    # Add to dictionary
    for _, row in valid_heights.iterrows():
        fighter = row[fighter_col]
        height = convert_height_to_cm(row[col])
        if height and fighter not in fighter_heights:
            fighter_heights[fighter] = height

print(f"Number of fighters with known height: {len(fighter_heights)}")

# Second pass: fill in missing heights
for col in ['red_fighter_height', 'blue_fighter_height']:
    fighter_col = 'red_fighter_name' if col.startswith('red') else 'blue_fighter_name'
    
    # Convert all heights to cm
    df[col] = df[col].apply(convert_height_to_cm)
    
    # Fill missing heights from fighter dictionary
    for idx in df[df[col].isna()].index:
        fighter = df.loc[idx, fighter_col]
        if fighter in fighter_heights:
            df.loc[idx, col] = fighter_heights[fighter]

# Calculate average height from valid values
valid_heights = []
for col in height_columns:
    valid_heights.extend(df[col].dropna().tolist())
    
average_height = round(np.mean(valid_heights))
print(f"Average height: {average_height} cm")

# Fill remaining missing values with the average
for col in height_columns:
    df[col] = df[col].fillna(average_height)

# Verify the changes
print("\nSample of final height data (in cm):")
print(df[height_columns].head())

# Verify no missing values remain
print("\nRemaining missing values:")
print(df[height_columns].isna().sum())

Number of fighters with known height: 2526
Average height: 178 cm

Sample of final height data (in cm):
   red_fighter_height  blue_fighter_height
0               178.0                180.0
1               168.0                173.0
2               168.0                173.0
3               168.0                173.0
4               168.0                173.0

Remaining missing values:
red_fighter_height     0
blue_fighter_height    0
dtype: int64


In [23]:
# Count unique fight types
fight_type_counts = df['fight_type'].value_counts()

print("Unique fight types and their counts:")
print(fight_type_counts)

print(f"\nTotal number of unique fight types: {len(fight_type_counts)}")

Unique fight types and their counts:
fight_type
Lightweight Bout               3020
Welterweight Bout              2931
Middleweight Bout              2220
Featherweight Bout             1818
Bantamweight Bout              1636
                               ... 
UFC 6 Tournament Title Bout       1
UFC 5 Tournament Title Bout       1
UFC 4 Tournament Title Bout       1
UFC 3 Tournament Title Bout       1
UFC 2 Tournament Title Bout       1
Name: count, Length: 113, dtype: int64

Total number of unique fight types: 113


In [24]:
# Count unique fight types
finish_type_counts = df['finish_details'].value_counts()

print("Unique fight types and their counts:")
print(finish_type_counts)

print(f"\nTotal number of unique fight types: {len(finish_type_counts)}")

Unique fight types and their counts:
finish_details
Decision - Unanimous                                   8816
Decision - Split                                       2409
Rear Naked Choke                                       1033
Punch to Head At Distance                              1031
Punches to Head At Distance                             557
                                                       ... 
to    \n      Fatigue                                     1
Holding Fence by Radach                                   1
Other - Lock From Bottom Guard\n      Shoulder Lock       1
Armbar After Drop to Ground                               1
Ezekiel Choke From Side Control                           1
Name: count, Length: 469, dtype: int64

Total number of unique fight types: 469


In [25]:
# Count unique fight types
method_counts = df['method'].value_counts()

print("Unique fight types and their counts:")
print(method_counts)

print(f"\nTotal number of unique fight types: {len(method_counts)}")

Unique fight types and their counts:
method
Decision - Unanimous       8901
KO/TKO                     4136
Submission                 2625
Decision - Split           2424
Decision - Majority         296
TKO - Doctor's Stoppage     186
Overturned                  124
Could Not Continue           44
DQ                           37
Other                         4
Name: count, dtype: int64

Total number of unique fight types: 10


In [26]:
df = df.drop(columns=['fight_type', # weight class, we will create a new column for this based on the weights
                      'event_name',
                      'finish_details',
                      ])


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18777 entries, 0 to 18776
Data columns (total 67 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   method                    18777 non-null  object        
 1   time_format               18777 non-null  object        
 2   red_fighter_reach         18777 non-null  float64       
 3   red_fighter_height        18777 non-null  float64       
 4   red_fighter_weight        18777 non-null  float64       
 5   red_fighter_stance        18777 non-null  object        
 6   red_fighter_dob           18701 non-null  datetime64[ns]
 7   blue_fighter_reach        18777 non-null  float64       
 8   blue_fighter_height       18777 non-null  float64       
 9   blue_fighter_weight       18777 non-null  float64       
 10  blue_fighter_stance       18777 non-null  object        
 11  blue_fighter_dob          18560 non-null  datetime64[ns]
 12  round             

In [28]:
missing(df)

Unnamed: 0,Total,Percent
blue_fighter_dob,217,1.155669
red_fighter_dob,76,0.40475


In [29]:
for i in dob_columns:
    df[i] = df[i].fillna(df[i].mean())

missing(df)

Unnamed: 0,Total,Percent


In [30]:
# Count unique fight types
blue_wins = df['blue_fighter_status'].value_counts()

print("Unique fight types and their counts:")
print(blue_wins)

print(f"\nTotal number of unique fight types: {len(blue_wins)}")

Unique fight types and their counts:
blue_fighter_status
L     11821
W      6604
D       184
NC      168
Name: count, dtype: int64

Total number of unique fight types: 4


In [31]:
red_wins = df['red_fighter_status'].value_counts()

print("Unique fight types and their counts:")
print(red_wins)

print(f"\nTotal number of unique fight types: {len(red_wins)}")


Unique fight types and their counts:
red_fighter_status
W     11821
L      6604
D       184
NC      168
Name: count, dtype: int64

Total number of unique fight types: 4


-   W: wins
-   L: losses
-   D: draws
-   NC: no contest



In [32]:
# Identify columns ending with '_status'
status_columns = [col for col in df.columns if col.endswith('_status')]

# Create a mask to filter out rows where any status column has 'D' or 'NC'
mask = df[status_columns].isin(['D', 'NC']).any(axis=1)

# Drop the rows that match the mask
df = df[~mask]

# Verify the changes
print(f"Total number of rows after dropping statuses 'D' or 'NC': {len(df)}")

Total number of rows after dropping statuses 'D' or 'NC': 18425


In [33]:
# Create a new column for red fighter win status
df.insert(0, 'red_fighter_win', df['red_fighter_status'].apply(lambda x: 1 if x == 'W' else 0))

# Drop columns ending with '_status'
df = df.drop(columns=status_columns)

In [None]:
def get_weight_class(weight_lbs):
    if weight_lbs <= 125:
        return 'Flyweight'
    elif weight_lbs <= 135:
        return 'Bantamweight'
    elif weight_lbs <= 145:
        return 'Featherweight'
    elif weight_lbs <= 155:
        return 'Lightweight'
    elif weight_lbs <= 170:
        return 'Welterweight'
    elif weight_lbs <= 185:
        return 'Middleweight'
    elif weight_lbs <= 205:
        return 'Light Heavyweight'
    else:
        return 'Heavyweight'

# Add weight class columns
df['red_fighter_weight_class'] = df['red_fighter_weight'].apply(get_weight_class)
df['blue_fighter_weight_class'] = df['blue_fighter_weight'].apply(get_weight_class)

# Verify that both fighters are in the same weight class
df['weight_class_match'] = df['red_fighter_weight_class'] == df['blue_fighter_weight_class']

# Print any mismatches
mismatches = df[~df['weight_class_match']]
if len(mismatches) > 0:
    print(f"\nFound {len(mismatches)} fights with mismatched weight classes:")
    print(mismatches[['red_fighter_name', 'red_fighter_weight', 'red_fighter_weight_class',
                      'blue_fighter_name', 'blue_fighter_weight', 'blue_fighter_weight_class']])

# Create a single weight_class column since both fighters should be in the same class
df['weight_class'] = df['red_fighter_weight_class']

# Drop the individual weight class columns and the match verification column
df = df.drop(columns=['red_fighter_weight_class', 'blue_fighter_weight_class', 'weight_class_match'])

# Convert weight_class to one-hot encoded columns
df = pd.get_dummies(df, columns=['weight_class'], prefix='weight_class')

In [34]:
# Define the order of columns
red_winner_columns = ['red_fighter_win']  # Start with the red winner column
fighter_name_columns = ['red_fighter_name', 'blue_fighter_name']  # Fighter names
non_fighter_columns = [col for col in df.columns if not (col.startswith('red') or col.startswith('blue')) and col != 'red_fighter_win']
red_columns = [col for col in df.columns if col.startswith('red') and col != 'red_fighter_win' and col != 'red_fighter_name']
blue_columns = [col for col in df.columns if col.startswith('blue') and col != 'blue_fighter_name']

# Combine all columns in the desired order
ordered_columns = red_winner_columns + fighter_name_columns + non_fighter_columns + red_columns + blue_columns

# Reorder the DataFrame
df = df[ordered_columns]

In [35]:
# Identify numeric columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Filter out columns that end with 'pct'
columns_to_convert = [col for col in numeric_columns if not col.endswith('pct')]

# Convert the selected columns to integers
df[columns_to_convert] = df[columns_to_convert].astype(int)

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18425 entries, 0 to 18776
Data columns (total 66 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   red_fighter_win           18425 non-null  int64         
 1   red_fighter_name          18425 non-null  object        
 2   blue_fighter_name         18425 non-null  object        
 3   method                    18425 non-null  object        
 4   time_format               18425 non-null  object        
 5   round                     18425 non-null  int64         
 6   final_time                18425 non-null  int64         
 7   event_date                18425 non-null  datetime64[ns]
 8   red_fighter_reach         18425 non-null  int64         
 9   red_fighter_height        18425 non-null  int64         
 10  red_fighter_weight        18425 non-null  int64         
 11  red_fighter_stance        18425 non-null  object        
 12  red_fighter_dob        

# Preprocess

In [37]:
# Change the dtype of red_fighter_win to boolean
df['red_fighter_win'] = df['red_fighter_win'].astype(bool)

In [38]:
cat_cols = [col for col in df.columns if df[col].dtype == 'object']

binary_cols = [col for col in cat_cols if df[col].nunique() == 2]
# Exclude fighters' name columns from multi_cat_cols
multi_cat_cols = [col for col in cat_cols if df[col].nunique() > 2 and col not in ['red_fighter_name',
                                                                                   'blue_fighter_name']]

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to binary categorical columns
for col in binary_cols:
    df[col] = label_encoder.fit_transform(df[col])

df = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)

In [39]:
# var_excludes = ['red_fighter_name',
#                 'blue_fighter_name',
#                 'event_date',
#                 'red_fighter_win',
#                 'blue_fighter_dob',
#                 'red_fighter_dob']
# # Exclude fighters' name columns and datetime columns before computing variance
# df_variance = df.drop(columns=var_excludes)

# # Compute variance for each column in the dataset
# variances = df_variance.var()

# low_variance_threshold = 0.01

# low_variance_vars = variances[variances < low_variance_threshold].index

# # Drop low-variance variables from the dataset
# df = df.drop(columns=low_variance_vars)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18425 entries, 0 to 18776
Data columns (total 92 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   red_fighter_win                   18425 non-null  bool          
 1   red_fighter_name                  18425 non-null  object        
 2   blue_fighter_name                 18425 non-null  object        
 3   round                             18425 non-null  int64         
 4   final_time                        18425 non-null  int64         
 5   event_date                        18425 non-null  datetime64[ns]
 6   red_fighter_reach                 18425 non-null  int64         
 7   red_fighter_height                18425 non-null  int64         
 8   red_fighter_weight                18425 non-null  int64         
 9   red_fighter_dob                   18425 non-null  datetime64[ns]
 10  red_body_attempted                18425 non-null  i

# Further todo:
-   Create a new column for weight class based on the weight
-   Create a new column for fighter age based on the date of birth and at the time of the fight
-   Create trailing wins and losses
-   Create the variables available in the upcoming fights' table
-   Days since last fight


In [41]:
df['fight_id'] = df.groupby(['event_date', 'red_fighter_name', 'blue_fighter_name']).ngroup()


In [42]:
def show_fighter_last_stats(fighter_name):
    # Find the last fight where the fighter appeared
    last_fight = df[
        (df['red_fighter_name'] == fighter_name) | 
        (df['blue_fighter_name'] == fighter_name)
    ].iloc[0]  # Get first row since df is sorted newest to oldest
    
    # Determine if fighter was red or blue in their last fight
    was_red = last_fight['red_fighter_name'] == fighter_name
    prefix = 'red' if was_red else 'blue'
    
    # Get relevant trailing statistics
    trailing_cols = [col for col in df.columns if f'{prefix}_fighter_trailing_' in col]
    
    # Create a clean dictionary of stats
    stats = {
        'Fight Date': last_fight['event_date'],
        'Fighter': fighter_name,
        'Opponent': last_fight['blue_fighter_name'] if was_red else last_fight['red_fighter_name'],
        'Result': 'Win' if (was_red and last_fight['red_fighter_win']) or 
                          (not was_red and not last_fight['red_fighter_win']) else 'Loss'
    }
    
    # Add trailing statistics
    for col in trailing_cols:
        clean_name = col.replace(f'{prefix}_fighter_trailing_', '')
        stats[clean_name] = last_fight[col]
    
    # Convert to DataFrame and transpose for better display
    stats_df = pd.DataFrame([stats])
    
    # Reorder columns to show key info first
    first_cols = ['Fight Date', 'Fighter', 'Opponent', 'Result']
    other_cols = [col for col in stats_df.columns if col not in first_cols]
    stats_df = stats_df[first_cols + other_cols]
    
    return stats_df


In [43]:
# First, let's create a fight identifier and sort the dataframe in chronological order
df = df.sort_values(['event_date', 'fight_id'], ascending=[True, True])
df = df.reset_index(drop=True)

def calculate_fighter_stats(row, fighter_name):
    # Get all previous fights for this fighter (excluding current fight)
    current_fight_id = row['fight_id']
    current_date = row['event_date']
    
    mask = ((df['event_date'] < current_date) | 
            ((df['event_date'] == current_date) & (df['fight_id'] < current_fight_id))) & \
           ((df['red_fighter_name'] == fighter_name) | 
            (df['blue_fighter_name'] == fighter_name))
    
    # Get unique fights by taking the last round of each fight
    previous_fights = df[mask].groupby('fight_id').last().reset_index()
    
    if len(previous_fights) == 0:
        return {
            'avg_fight_time': 0,
            'defense': 0,
            'striking_accuracy': 0,
            'strikes_landed_per_min': 0,
            'strikes_absorbed_per_min': 0,
            'takedown_accuracy': 0,
            'takedown_defense': 0,
            'takedowns_per_15': 0,
            'submission_per_15': 0,
            'wins': 0,
            'losses': 0,
            'last_fight_days': None
        }
    
    stats = {}
    total_fights = len(previous_fights)
    
    # Calculate days since last fight
    last_fight_date = previous_fights['event_date'].max()
    stats['last_fight_days'] = (current_date - last_fight_date).days
    
    # Initialize counters
    total_time = 0
    total_strikes_landed = 0
    total_strikes_attempted = 0
    total_strikes_received = 0
    total_takedowns_landed = 0
    total_takedowns_attempted = 0
    total_takedowns_defended = 0
    total_takedowns_attempted_against = 0
    total_submissions = 0
    wins = 0
    
    # Process each previous fight
    for _, fight in previous_fights.iterrows():
        # Determine if fighter was red or blue in this fight
        is_red = fight['red_fighter_name'] == fighter_name
        prefix = 'red' if is_red else 'blue'
        opp_prefix = 'blue' if is_red else 'red'
        
        # Get all rounds for this fight
        fight_rounds = df[df['fight_id'] == fight['fight_id']]
        
        # Accumulate fight statistics across all rounds
        total_time += fight['final_time']
        
        # Sum statistics across all rounds
        for _, round_data in fight_rounds.iterrows():
            total_strikes_landed += round_data[f'{prefix}_sig_str_landed']
            total_strikes_attempted += round_data[f'{prefix}_sig_str_attempted']
            total_strikes_received += round_data[f'{opp_prefix}_sig_str_landed']
            total_takedowns_landed += round_data[f'{prefix}_td_landed']
            total_takedowns_attempted += round_data[f'{prefix}_td_attempted']
            total_takedowns_defended += (round_data[f'{opp_prefix}_td_attempted'] - 
                                       round_data[f'{opp_prefix}_td_landed'])
            total_takedowns_attempted_against += round_data[f'{opp_prefix}_td_attempted']
            total_submissions += round_data[f'{prefix}_sub_att']
        
        # Count wins (only once per fight)
        if (is_red and fight['red_fighter_win']) or (not is_red and not fight['red_fighter_win']):
            wins += 1
    
    # Calculate averages
    total_minutes = total_time / 60
    
    stats['avg_fight_time'] = total_time / total_fights  # Now this is per fight, not per round
    stats['striking_accuracy'] = total_strikes_landed / total_strikes_attempted if total_strikes_attempted > 0 else 0
    stats['defense'] = 1 - (total_strikes_received / total_strikes_attempted) if total_strikes_attempted > 0 else 0
    stats['strikes_landed_per_min'] = total_strikes_landed / total_minutes if total_minutes > 0 else 0
    stats['strikes_absorbed_per_min'] = total_strikes_received / total_minutes if total_minutes > 0 else 0
    stats['takedown_accuracy'] = total_takedowns_landed / total_takedowns_attempted if total_takedowns_attempted > 0 else 0
    stats['takedown_defense'] = total_takedowns_defended / total_takedowns_attempted_against if total_takedowns_attempted_against > 0 else 0
    stats['takedowns_per_15'] = (total_takedowns_landed / total_minutes) * 15 if total_minutes > 0 else 0
    stats['submission_per_15'] = (total_submissions / total_minutes) * 15 if total_minutes > 0 else 0
    stats['wins'] = wins
    stats['losses'] = total_fights - wins
    
    return stats

print("Calculating trailing statistics...")

# Calculate trailing stats for each fighter in each fight
for prefix in ['red', 'blue']:
    # Calculate stats for each fight
    trailing_stats = df.apply(
        lambda row: calculate_fighter_stats(row, row[f'{prefix}_fighter_name']), 
        axis=1
    )
    
    # Convert the series of dictionaries to a DataFrame
    stats_df = pd.DataFrame(trailing_stats.tolist())
    
    # Add the columns to the main DataFrame with appropriate prefixes
    for col in stats_df.columns:
        df[f'{prefix}_fighter_trailing_{col}'] = stats_df[col]

# Sort back to original order (newest to oldest)
df = df.sort_values(['event_date', 'fight_id'], ascending=[False, False])
df = df.reset_index(drop=True)

# Test the changes with a sample fighter
fighter_name = "Sean Strickland"
show_fighter_last_stats(fighter_name)


Calculating trailing statistics...


Unnamed: 0,Fight Date,Fighter,Opponent,Result,avg_fight_time,defense,striking_accuracy,strikes_landed_per_min,strikes_absorbed_per_min,takedown_accuracy,takedown_defense,takedowns_per_15,submission_per_15,wins,losses,last_fight_days
0,2024-06-01,Sean Strickland,Paulo Costa,Win,891.809524,0.692376,0.419427,6.045493,4.434003,0.642857,0.770492,0.865015,0.192226,15,6,133.0


In [44]:
df.to_excel('ufc_preprocessed.xlsx', index=False)

# Upcoming Preprocess

In [45]:
upcoming = pd.read_excel('ufc_upcoming_fight_details.xlsx')

In [46]:
def convert_time_to_seconds(time_str):
    if pd.isna(time_str):
        return 0
    try:
        # Split minutes and seconds
        minutes, seconds = map(float, time_str.split(':'))
        return int(minutes * 60 + seconds)
    except:
        return 0

def convert_height_to_cm(height_str):
    if pd.isna(height_str) or height_str == '--':
        return None
    try:
        # Split the feet and inches parts
        feet_str, inches_str = height_str.split("'")
        feet = int(feet_str)
        inches = int(inches_str.strip('"'))
        
        # Convert to centimeters
        total_cm = (feet * 30.48) + (inches * 2.54)
        return round(total_cm)
    except:
        return None

def convert_reach_to_cm(reach_str):
    if pd.isna(reach_str) or reach_str == '--':
        return None
    try:
        # Convert inches to cm
        inches = int(reach_str.strip('"'))
        return round(inches * 2.54)
    except:
        return None

def convert_weight_to_kg(weight_str):
    if pd.isna(weight_str) or weight_str == '--':
        return None
    try:
        # Convert lbs to kg
        lbs = int(weight_str.strip(' lbs.'))
        return round(lbs * 0.453592)
    except:
        return None

def convert_percentage(pct_str):
    if pd.isna(pct_str):
        return 0.0
    try:
        return float(pct_str.strip('%')) / 100
    except:
        return 0.0

def extract_record(record_str):
    if pd.isna(record_str):
        return 0, 0
    try:
        parts = record_str.split('-')
        return int(parts[0]), int(parts[1])
    except:
        return 0, 0

# Create a copy of the upcoming DataFrame
upcoming_processed = upcoming.copy()

# Process each fighter's data
for prefix in ['fighter1', 'fighter2']:
    # Convert fight time to seconds
    upcoming_processed[f'{prefix}_avg_fight_time'] = upcoming_processed[f'{prefix}_average_fight_time'].apply(convert_time_to_seconds)
    
    # Convert height to cm
    upcoming_processed[f'{prefix}_height'] = upcoming_processed[f'{prefix}_height'].apply(convert_height_to_cm)
    
    # Convert reach to cm
    upcoming_processed[f'{prefix}_reach'] = upcoming_processed[f'{prefix}_reach'].apply(convert_reach_to_cm)
    
    # Convert weight to kg
    upcoming_processed[f'{prefix}_weight'] = upcoming_processed[f'{prefix}_weight'].apply(convert_weight_to_kg)
    upcoming_processed[f'{prefix}_dob'] = pd.to_datetime(upcoming_processed[f'{prefix}_dob'], format='%b %d, %Y')

    
    # Convert percentages
    for stat in ['striking_accuracy', 'takedown_accuracy', 'defense', 'takedown_defense']:
        col = f'{prefix}_{stat}'
        if col in upcoming_processed.columns:
            upcoming_processed[col] = upcoming_processed[col].apply(convert_percentage)
    
    # Extract wins and losses
    wins_losses = upcoming_processed[f'{prefix}_wins/losses/draws'].apply(extract_record)
    upcoming_processed[f'{prefix}_wins'] = [x[0] for x in wins_losses]
    upcoming_processed[f'{prefix}_losses'] = [x[1] for x in wins_losses]

# Rename columns to match our calculated statistics format
column_mapping = {
    'strikes_absorbed_per_min_(sapm)': 'strikes_absorbed_per_min',
    'strikes_landed_per_min_(slpm)': 'strikes_landed_per_min',
    'submission_average/15_min': 'submission_per_15',
    'takedowns_average/15_min': 'takedowns_per_15'
}

for old_suffix, new_suffix in column_mapping.items():
    for prefix in ['fighter1', 'fighter2']:
        old_col = f'{prefix}_{old_suffix}'
        new_col = f'{prefix}_{new_suffix}'
        if old_col in upcoming_processed.columns:
            upcoming_processed = upcoming_processed.rename(columns={old_col: new_col})

# Drop unnecessary columns
cols_to_drop = [
    'event_name', 'event_date', 'event_location', 'event_link',
    'fight_link', 'fighter1_link', 'fighter2_link',
    'fighter1_nickname', 'fighter2_nickname',
    'fighter1_wins/losses/draws', 'fighter2_wins/losses/draws', 'weight_class',
    'fighter1_average_fight_time', 'fighter2_average_fight_time'  # because we created avg_fight_time
]
upcoming_processed = upcoming_processed.drop(columns=cols_to_drop)

# Show the first few rows of processed data
print("\nProcessed upcoming fights data:")
upcoming_processed.head()


Processed upcoming fights data:


Unnamed: 0,fighter1,fighter2,fighter1_defense,fighter1_dob,fighter1_height,fighter1_reach,fighter1_stance,fighter1_strikes_absorbed_per_min,fighter1_strikes_landed_per_min,fighter1_striking_accuracy,...,fighter2_takedown_accuracy,fighter2_takedown_defense,fighter2_takedowns_per_15,fighter2_weight,fighter1_avg_fight_time,fighter1_wins,fighter1_losses,fighter2_avg_fight_time,fighter2_wins,fighter2_losses
0,Israel Adesanya,Nassourdine Imavov,0.56,1989-07-22,193,203,Switch,3.21,4.0,0.48,...,0.36,0.78,0.88,84,1098,24,4,916,15,4
1,Shara Magomedov,Michael Page,0.42,1994-05-16,188,185,Orthodox,4.49,6.82,0.65,...,0.25,0.63,0.44,77,785,15,0,516,22,3
2,Sergei Pavlovich,Jairzinho Rozenstruik,0.51,1992-05-13,190,213,Southpaw,5.03,5.86,0.44,...,0.0,0.75,0.0,110,219,18,3,544,15,5
3,Said Nurmagomedov,Vinicius Oliveira,0.57,1992-04-05,173,178,Orthodox,2.13,3.55,0.47,...,0.6,0.68,1.38,61,494,18,3,654,21,3
4,Fares Ziam,Mike Davis,0.66,1997-03-21,185,190,Orthodox,1.66,2.92,0.5,...,0.53,0.64,2.77,70,789,16,4,757,11,2


In [47]:
# First convert stances to one-hot encoded columns
for prefix in ['fighter1', 'fighter2']:
    # Get unique stances
    stances = ['Orthodox', 'Sideways', 'Southpaw', 'Switch']
    
    # Create one-hot encoded columns for each stance
    for stance in stances:
        col_name = f'{prefix}_stance_{stance}'
        upcoming_processed[col_name] = (upcoming_processed[f'{prefix}_stance'] == stance).astype(bool)
    
    # Drop the original stance column
    upcoming_processed = upcoming_processed.drop(columns=[f'{prefix}_stance'])

# Get all columns from both DataFrames
upcoming_cols = set(upcoming_processed.columns)
df_cols = set(df.columns)

# Create mapping dictionary
column_mapping = {}

# Map fighter1 to red_fighter and fighter2 to blue_fighter
for upcoming_col in upcoming_cols:
    if upcoming_col.startswith('fighter1'):
        df_col = upcoming_col.replace('fighter1', 'red_fighter')
        if df_col in df_cols:
            column_mapping[upcoming_col] = df_col
        elif df_col.replace('red_fighter_', 'red_fighter_trailing_') in df_cols:
            column_mapping[upcoming_col] = df_col.replace('red_fighter_', 'red_fighter_trailing_')
        else:
            column_mapping[upcoming_col] = "NO MATCH FOUND"
    elif upcoming_col.startswith('fighter2'):
        df_col = upcoming_col.replace('fighter2', 'blue_fighter')
        if df_col in df_cols:
            column_mapping[upcoming_col] = df_col
        elif df_col.replace('blue_fighter_', 'blue_fighter_trailing_') in df_cols:
            column_mapping[upcoming_col] = df_col.replace('blue_fighter_', 'blue_fighter_trailing_')
        else:
            column_mapping[upcoming_col] = "NO MATCH FOUND"
    else:
        column_mapping[upcoming_col] = "NO MATCH FOUND"

# Create a DataFrame to display the mapping
mapping_df = pd.DataFrame.from_dict(column_mapping, orient='index', columns=['Corresponding Column in df'])
mapping_df.index.name = 'Upcoming Column'

# Display the mapping
print("Column Mapping between upcoming_processed and df:")
print(mapping_df)

# Find columns without matches
no_matches = mapping_df[mapping_df['Corresponding Column in df'] == "NO MATCH FOUND"]
if len(no_matches) > 0:
    print("\nColumns without matches:")
    print(no_matches)


Column Mapping between upcoming_processed and df:
                                                       Corresponding Column in df
Upcoming Column                                                                  
fighter2_takedowns_per_15                  blue_fighter_trailing_takedowns_per_15
fighter2_avg_fight_time                      blue_fighter_trailing_avg_fight_time
fighter1_takedowns_per_15                   red_fighter_trailing_takedowns_per_15
fighter1_takedown_defense                   red_fighter_trailing_takedown_defense
fighter1_strikes_landed_per_min       red_fighter_trailing_strikes_landed_per_min
fighter2_reach                                                 blue_fighter_reach
fighter1_height                                                red_fighter_height
fighter2_striking_accuracy                blue_fighter_trailing_striking_accuracy
fighter2_losses                                      blue_fighter_trailing_losses
fighter1_dob                                    

In [48]:
upcoming_processed.to_excel('upcoming_preprocessed.xlsx', index=False)