In [1]:
# UFC fight-outcome prediction: data preparation and Random Forest model

In [2]:
import os
import pandas as pd
from torch.utils.data import DataLoader
from datetime import datetime

Load Datasets

In [3]:
# Load the per-fight statistics (later joined on fight_id and fighter_id)
fight_stats_df = pd.read_csv('datasets/ufc_fight_stat_data.csv')

# Load the fight-level records (later joined on fight_id and event_id)
fight_data_df = pd.read_csv('datasets/ufc_fight_data.csv')

# Load the fighter profiles (later joined on fighter_id)
fighter_data_df = pd.read_csv('datasets/ufc_fighter_data.csv')

# Load the event records (later joined on event_id)
event_data_df = pd.read_csv('datasets/ufc_event_data.csv')

# Quick schema check: list the fighter profile columns
print(fighter_data_df.columns)

Index(['fighter_id', 'fighter_f_name', 'fighter_l_name', 'fighter_nickname',
       'fighter_height_cm', 'fighter_weight_lbs', 'fighter_reach_cm',
       'fighter_stance', 'fighter_dob', 'fighter_w', 'fighter_l', 'fighter_d',
       'fighter_nc_dq', 'fighter_url'],
      dtype='object')


Clean and Process Data

In [4]:
# Parse the date of birth so an age can be derived from it later
fighter_data_df['fighter_dob'] = pd.to_datetime(fighter_data_df['fighter_dob'])

# Drop rows that are missing any field the merge / feature steps rely on
fight_stats_df = fight_stats_df.dropna(subset=['fighter_id', 'ctrl_time'])
fight_data_df = fight_data_df.dropna(subset=['fight_id', 'f_1', 'f_2', 'winner', 'num_rounds'])
fighter_data_df = fighter_data_df.dropna(subset=['fighter_id', 'fighter_reach_cm', 'fighter_height_cm', 'fighter_stance', 'fighter_dob'])

# A missing no-contest / DQ count is treated as zero
fighter_data_df['fighter_nc_dq'] = fighter_data_df['fighter_nc_dq'].fillna(0)

event_data_df = event_data_df.dropna(subset=['event_id'])

# Encode the stance label as an integer code.
# Rows without a stance were already dropped above, so no fill is needed
# before mapping. Series.map yields NaN for any label that is not a key of
# the dict, so stances other than the three listed here get an explicit
# "unknown" code instead of silently becoming missing values.
# NOTE: re-running this cell on already-encoded values maps every row to the
# unknown code; reload the raw CSVs before running it again.
stance_codes = {'Orthodox': 0, 'Southpaw': 1, 'Switch': 2}
unknown_stance_code = -1
fighter_data_df['fighter_stance'] = (
    fighter_data_df['fighter_stance'].map(stance_codes).fillna(unknown_stance_code)
)



Merge Datasets

In [5]:
    
# Inner-join the four tables in sequence: stats -> fights (fight_id)
# -> fighters (fighter_id) -> events (event_id). Rows without a match in any
# of the joins are discarded.
merged_df = (
    fight_stats_df
    .merge(fight_data_df, how='inner', on='fight_id')
    .merge(fighter_data_df, how='inner', on='fighter_id')
    .merge(event_data_df, how='inner', on='event_id')
)


Cleaning

In [6]:
# Convert a "minutes:seconds" clock string to a number of seconds
def time_to_seconds(time_str):
    """Return the number of seconds represented by a ``"M:SS"`` string.

    Parameters
    ----------
    time_str : str, float or None
        Clock value such as ``"4:35"``. The placeholder ``"--"``, a blank
        string, ``None`` and float NaN are all treated as "no value".

    Returns
    -------
    int or None
        ``minutes * 60 + seconds``, or ``None`` when there is no value.

    Raises
    ------
    ValueError
        If a non-blank string is not of the form ``"<int>:<int>"``.
    """
    import math  # local import so the function does not rely on another cell

    # Missing entries reach this function as None or float NaN rather than as
    # strings; calling .split on them would raise AttributeError.
    if time_str is None or (isinstance(time_str, float) and math.isnan(time_str)):
        return None

    cleaned = time_str.strip()
    if cleaned in ('', '--'):
        return None

    minutes, seconds = map(int, cleaned.split(':'))
    return minutes * 60 + seconds
    
# Clock-style columns ("M:SS") -> seconds
for clock_col in ('ctrl_time', 'finish_time'):
    merged_df[clock_col] = merged_df[clock_col].apply(time_to_seconds)

# Replace each categorical column with the integer codes from pd.factorize
categorical_cols = [
    'referee', 'event_city', 'event_state', 'event_country', 'result_details',
    'title_fight', 'weight_class', 'gender', 'result',
]
for cat_col in categorical_cols:
    merged_df[cat_col] = pd.factorize(merged_df[cat_col])[0]

# NOTE: add column that contains value age at the event date

# num_rounds -> numeric; values that cannot be parsed become NaN
merged_df['num_rounds'] = pd.to_numeric(merged_df['num_rounds'], errors='coerce')

# Replace the date of birth with the age in years on a fixed reference date
# (2024-03-25), not on the date of each fight.
hard_coded_datetime = datetime(2024, 3, 25)
reference_day = hard_coded_datetime.date()
birth_days = pd.to_datetime(merged_df['fighter_dob']).dt.date
merged_df['fighter_dob'] = (reference_day - birth_days).apply(lambda delta: delta.days / 365.25)

# Columns that are not used as features (temp drop 'event_date' *add later*)
cols_to_drop = [
    'fight_url_x', 'fight_url_y', 'fighter_url', 'event_url', 'event_name',
    'fighter_nickname', 'fighter_l_name', 'fighter_f_name',
    'event_date', 'event_state', 'fight_id', 'finish_round',
    'result_details',
]
merged_df = merged_df.drop(columns=cols_to_drop)



Create Running-Average (Cumulative) Columns

In [7]:
# Per-fighter running (expanding) mean of each per-fight statistic.
# Despite the "cumulative_" prefix these columns hold averages, not sums.
#
# NOTE(review): expanding() follows the current row order of merged_df and the
# sort below is disabled, so confirm the rows are already in the intended
# order. Each window also includes the current row's own value.
# NOTE(review): 'sig_strikes_succ' gets no running-average column here.
# merged_df.sort_values(by=["fighter_id", 'fight_stat_id'], ascending=[True, True], inplace=True)

# new column name -> source column (insertion order fixes the column order)
running_mean_columns = {
    "cumulative_ctrl_time": "ctrl_time",
    "cumulative_reversals": "reversals",
    "cumulative_submission_att": "submission_att",
    "cumulative_takedown_succ": "takedown_succ",
    "cumulative_takedown_att": "takedown_att",
    "cumulative_sig_strikes_att": "sig_strikes_att",
    "cumulative_total_strikes_succ": "total_strikes_succ",
    "cumulative_total_strikes_att": "total_strikes_att",
    "cumulative_knockdowns": "knockdowns",
    'avg_finish_time': 'finish_time',
}

for target_col, source_col in running_mean_columns.items():
    merged_df[target_col] = (
        merged_df
        .groupby("fighter_id")[source_col]
        .transform(lambda stat: stat.expanding().mean())
    )

# Show the resulting schema and export the table for the next steps
print(merged_df.columns)

merged_df.to_csv('final_set.csv', index=False)


Index(['fight_stat_id', 'fighter_id', 'knockdowns', 'total_strikes_att',
       'total_strikes_succ', 'sig_strikes_att', 'sig_strikes_succ',
       'takedown_att', 'takedown_succ', 'submission_att', 'reversals',
       'ctrl_time', 'event_id', 'referee', 'f_1', 'f_2', 'winner',
       'num_rounds', 'title_fight', 'weight_class', 'gender', 'result',
       'finish_time', 'fighter_height_cm', 'fighter_weight_lbs',
       'fighter_reach_cm', 'fighter_stance', 'fighter_dob', 'fighter_w',
       'fighter_l', 'fighter_d', 'fighter_nc_dq', 'event_city',
       'event_country', 'cumulative_ctrl_time', 'cumulative_reversals',
       'cumulative_submission_att', 'cumulative_takedown_succ',
       'cumulative_takedown_att', 'cumulative_sig_strikes_att',
       'cumulative_total_strikes_succ', 'cumulative_total_strikes_att',
       'cumulative_knockdowns', 'avg_finish_time'],
      dtype='object')


Remove original numeric stats

In [8]:
# The raw per-fight statistics are dropped here so that only their per-fighter
# running averages (built in the previous cell) remain in the exported table.
drop_cols = ['knockdowns', 'total_strikes_att', 'total_strikes_succ',
            'sig_strikes_att', 'sig_strikes_succ', 'takedown_att',
            'takedown_succ', 'submission_att', 'reversals', 'ctrl_time',
            'event_id', 'result', 'finish_time'
            ]

# Work directly from the in-memory merged_df: the previous version first read
# 'final_set.csv' into temp_df and then immediately overwrote that result.
# NOTE(review): the first 10000 rows are taken *before* sorting, i.e. in the
# current row order of merged_df - confirm this truncation is intended.
# temp_df = temp_df[temp_df['fight_stat_id'] > 1200]
temp_df = (
    merged_df
    .drop(columns=drop_cols)
    .iloc[:10000]
    .sort_values(by=["fighter_id", 'fight_stat_id'], ascending=[True, True])
)

# Overwrites the file exported by the previous cell
temp_df.to_csv('final_set.csv', index=False)

Random Forest Classifier

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings

# Silence UserWarning messages from here on
warnings.filterwarnings("ignore", category=UserWarning)

# Reload the exported feature table; `data` and `df` are further handles on it
merged_df = pd.read_csv('final_set.csv')
data = merged_df
df = pd.DataFrame(data)

# Binary target: True on rows whose fighter_id equals the fight's winner
df['is_winner'] = df['fighter_id'] == df['winner']

# Model inputs, grouped by origin. The concatenation order defines the column
# order seen by the scaler and the classifier.
# NOTE(review): f_1 / f_2 are fighter ids used as plain numeric features.
fight_context = ["f_1", "f_2", "num_rounds", "title_fight", "weight_class", "gender"]
fighter_profile = [
    "fighter_height_cm", "fighter_weight_lbs", "fighter_reach_cm",
    "fighter_stance", "fighter_w", "fighter_l", "fighter_d", "fighter_dob",
]
running_averages = [
    'cumulative_ctrl_time', 'cumulative_reversals', 'cumulative_submission_att',
    'cumulative_takedown_succ', 'cumulative_takedown_att', 'cumulative_sig_strikes_att',
    'cumulative_total_strikes_succ', 'cumulative_total_strikes_att', 'cumulative_knockdowns',
    'avg_finish_time',
]
features = fight_context + fighter_profile + running_averages

X = df[features]
y = df['is_winner']

# 80 / 20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest with a fixed seed
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Score the held-out rows
y_pred = clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)



Accuracy: 0.6505


In [10]:
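# Persistence step (currently disabled).
# The prediction cells further down load 'forest_model.joblib' and
# 'scaler.joblib', so both files must already exist on disk. Uncomment the
# lines below to (re)create them from the `clf` and `scaler` fitted above;
# doing so overwrites any existing files with those names.

# from joblib import dump, load

# # Save the model
# dump(clf, 'forest_model.joblib')

# # Save the scaler
# dump(scaler, 'scaler.joblib')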

In [16]:
import warnings  # imported here so the cell does not depend on an earlier cell

import numpy as np
from joblib import load

warnings.filterwarnings("ignore", category=UserWarning)


# Load the persisted model and scaler.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
clf_loaded = load('forest_model.joblib')
scaler_loaded = load('scaler.joblib')

# One feature row per fighter, as a 1 x 24 array.
# NOTE(review): the values are presumably in the same column order the loaded
# scaler/model were fitted with - confirm against the training feature list.
data_f1 = np.array([[512,3381,5,0,5.0,0.0,185.42,185.0,193.04,0.0,28.0,5.0,0.0,33.073237508555785,91.6,0.0,0.2,0.9,1.4,204.55,93.95,214.6,0.3,267.8]]) #sean
data_f2 = np.array([[3381,512,5,0,5.0,0.0,185.42,185.0,182.88,0.0,14.0,2.0,0.0,32.92813141683778,51.375,0.0,0.0,0.375,0.5,115.0,75.75,121.625,0.5,190.375]]) # costa

# Scale the data with the loaded scaler
data_f1_scaled = scaler_loaded.transform(data_f1)
data_f2_scaled = scaler_loaded.transform(data_f2)

# Class probabilities for each fighter's row
probability_f1 = clf_loaded.predict_proba(data_f1_scaled)
probability_f2 = clf_loaded.predict_proba(data_f2_scaled)

# predict_proba columns follow clf_loaded.classes_, so look up the column of
# the "won" class (True) instead of assuming it sits at index 1.
win_index = list(clf_loaded.classes_).index(True)
pred_f1 = probability_f1[0][win_index]
pred_f2 = probability_f2[0][win_index]

# The two rows are scored independently, so the probabilities need not sum
# to 1. The strictly higher one wins; a tie goes to f_2.
if pred_f1 > pred_f2:
    print("Winner is f_1 with probability:", pred_f1)
else:
    print("Winner is f_2 with probability:", pred_f2)


Winner is f_2 with probability: 0.47


In [12]:
import numpy as np
from joblib import load

# Load the persisted model and scaler from the working directory.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
clf_loaded = load('forest_model.joblib')
scaler_loaded = load('scaler.joblib')

def prepare_and_predict(data):
    """Scale one feature row and return the predicted probability of winning.

    Uses the module-level ``scaler_loaded`` and ``clf_loaded`` objects.

    Parameters
    ----------
    data : array-like of shape (1, n_features)
        Unscaled feature row; only the first row of the prediction is used.

    Returns
    -------
    float
        Probability assigned to the "won" class (``True``) for the first row.

    Raises
    ------
    ValueError
        If the loaded classifier has no ``True`` class.
    """
    data_scaled = scaler_loaded.transform(data)
    probability = clf_loaded.predict_proba(data_scaled)
    # predict_proba columns follow clf_loaded.classes_; resolve the column of
    # the "won" class rather than assuming it is at index 1.
    win_index = list(clf_loaded.classes_).index(True)
    return probability[0][win_index]

# One 1 x 24 feature row per fighter; consecutive entries face each other.
fighter_data = [
    np.array([[1256, 2550, 5, 1, 10, 0, 193.04, 205.0, 200.66, 0.0, 8, 2, 0, 36.71731690622861, 20.166666666666668, 0.0, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 95.16666666666667, 86.0, 124.83333333333333, 0.3333333333333333, 192.66666666666666]]), #Alex (Fighter 1)
    np.array([[2550, 1256, 5, 1, 10, 0, 193.04, 205.0, 200.66, 1.0, 12, 1, 0, 32.851471594798085, 52.285714285714285, 0.14285714285714285, 0.0, 0.0, 0.0, 136.0, 80.28571428571429, 145.28571428571428, 0.2857142857142857, 187.14285714285714]]), # Jamahal (Fighter 2)
    np.array([[16,69,5,1,2,1,162.56,115.0,160.02,2.0,24,3,0,34.614647501711154,199.6,0.2,0.3,1.9,4.5,142.1,106.4,179.5,0.3,205.4]]), # Zhang (Fighter 3)
    np.array([[69,16,5,1,2,1,165.1,115.0,160.02,0.0,17,3,0,34.77344284736482,78.6,0.0,0.0,0.8,1.2,174.0,99.6,199.4,0.1,271.8]]), # Yan (Fighter 4)
    np.array([[2902,2532,5,1,0,0,180.34,155.0,177.8,0.0,25,4,0,35.35934291581109,21.416666666666668,0.0,0.0,0.08333333333333333,0.3333333333333333,119.08333333333333,73.83333333333333,121.75,0.5,185.83333333333334]]), # Justin (Fighter 5)
    np.array([[2532,2902,5,1,0,0,180.34,145.0,175.26,0.0,25,7,0,32.3066392881588,59.76,0.0,0.4,0.32,0.6,251.64,131.28,262.52,0.32,259.12]]), # Max (Fighter 6)
    np.array([[1401,350,3,0,0,0,177.8,155.0,187.96,0.0,34,9,0,34.43668720054757,135.79166666666666,0.16666666666666666,1.2916666666666667,1.1666666666666667,2.8333333333333335,47.666666666666664,38.416666666666664,60.375,0.2916666666666667,157.0]]), # Charles (Fighter 7)
    np.array([[350,1401,3,0,0,0,170.18,155.0,182.88,0.0,20,3,0,27.45242984257358,317.3333333333333,0.0,0.0,3.111111111111111,8.555555555555555,108.33333333333333,89.44444444444444,152.66666666666666,0.2222222222222222,250.11111111111111]]), # Arman (Fighter 8)
    np.array([[1480,3646,3,0,5,0,185.42,185.0,193.04,1.0,6,0,0,28.194387405886378,76.0,0.0,1.0,0.5,2.5,7.0,5.0,8.0,0.5,106.0]]), # Bo Nickal (Fighter 9)
    np.array([[3646,1480,3,0,5,0,182.88,185.0,182.88,0.0,10,5,0,29.859000684462696,77.71428571428571,0.14285714285714285,0.5714285714285714,1.1428571428571428,2.142857142857143,20.714285714285715,23.142857142857142,35.57142857142857,0.2857142857142857,227.14285714285714]]) # Cody Brundage (Fighter 10)

]

# Win probability of every fighter, scored independently of the opponent
probabilities = [prepare_and_predict(row) for row in fighter_data]

# Index pairs into fighter_data / probabilities, one pair per bout
pairs = [(0, 1), (2, 3), (4,5), (6,7), (8,9)]
for first, second in pairs:
    # The strictly higher probability wins; a tie goes to the second fighter
    favourite = first if probabilities[first] > probabilities[second] else second
    print(f"Winner between Fighter {first+1} and Fighter {second+1} is Fighter {favourite+1} with probability: {probabilities[favourite]:.4f}")



Winner between Fighter 1 and Fighter 2 is Fighter 1 with probability: 0.4000
Winner between Fighter 3 and Fighter 4 is Fighter 3 with probability: 0.4800
Winner between Fighter 5 and Fighter 6 is Fighter 6 with probability: 0.4200
Winner between Fighter 7 and Fighter 8 is Fighter 8 with probability: 0.5100
Winner between Fighter 9 and Fighter 10 is Fighter 9 with probability: 0.5500


In [20]:
import pandas as pd


def get_fighter_stats(fighter_id, file_path):
    """Print and return one fighter's feature values as a comma-separated string.

    The CSV at ``file_path`` is read and the first row whose ``fighter_id``
    matches is used.

    Parameters
    ----------
    fighter_id : int
        Value to look up in the ``fighter_id`` column.
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    str
        The values of the selected columns, converted with ``str`` and joined
        by commas. The same string is also printed.

    Raises
    ------
    IndexError
        If no row has the requested ``fighter_id``.
    KeyError
        If the file lacks one of the expected columns.
    """
    data = pd.read_csv(file_path)

    fighter_rows = data[data['fighter_id'] == fighter_id]
    if fighter_rows.empty:
        # Same exception type as the bare .iloc[0], but with a useful message
        raise IndexError(f"no rows found for fighter_id {fighter_id!r} in {file_path!r}")
    fighter_data = fighter_rows.iloc[0]

    relevant_columns = [
        'weight_class', 'gender', 'fighter_height_cm', 'fighter_weight_lbs',
        'fighter_reach_cm', 'fighter_stance', 'fighter_w', 'fighter_l',
        'fighter_d', 'fighter_dob', 'avg_ctrl_time', 'avg_reversals',
        'avg_submission_att', 'avg_takedown_succ',
        'avg_takedown_att', 'avg_sig_strikes_att',
        'avg_total_strikes_succ', 'avg_total_strikes_att',
        'avg_knockdowns', 'avg_finish_time'
    ]

    fighter_stats = fighter_data[relevant_columns]
    stats_string = ','.join(fighter_stats.astype(str).tolist())
    print(stats_string)
    # Return the string as well, so callers that assign the result get a value
    return stats_string

# Print the comma-separated feature values of one fighter, looked up by id in
# a previously exported dataset file.
fighter_id = 1403 
file_path = 'final_set(2).csv'
stats = get_fighter_stats(fighter_id, file_path)

10.0,0.0,182.88,185.0,187.96,1.0,19.0,6.0,0.0,29.086926762491444,38.81818181818182,0.0909090909090909,0.0,0.1818181818181818,0.5454545454545454,60.81818181818182,36.18181818181818,67.63636363636364,0.6363636363636364,187.3636363636364
