In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datetime import datetime


def load_fights_data(filepath):
    """Read the fight-results CSV and normalise its text columns.

    Only the columns used later in the workflow are loaded. EVENT is
    stripped of surrounding whitespace, whitespace runs inside BOUT are
    collapsed to a single space (and the ends trimmed), and BOUT is split on
    ' vs. ' into the new FIGHTER1 and FIGHTER2 columns.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts (path, URL, buffer).

    Returns:
        The cleaned fights DataFrame.
    """
    wanted_columns = [
        'EVENT', 'BOUT', 'OUTCOME', 'METHOD',
        'TIME FORMAT', 'WEIGHTCLASS', 'ROUND', 'TIME',
    ]

    print(f"Loading fights data from {filepath}.")
    fights = pd.read_csv(filepath, usecols=wanted_columns)

    print("Cleaning fights data.")
    fights['EVENT'] = fights['EVENT'].str.strip()

    # Collapse internal whitespace first so the ' vs. ' separator is uniform.
    collapsed_bout = fights['BOUT'].str.replace(r'\s+', ' ', regex=True)
    fights['BOUT'] = collapsed_bout.str.strip()

    fighter_names = fights['BOUT'].str.split(' vs. ', expand=True)
    fights[['FIGHTER1', 'FIGHTER2']] = fighter_names

    return fights


def load_events_data(filepath):
    """Read the event-details CSV, keeping only the event name and its date.

    EVENT is stripped of surrounding whitespace, exactly as
    ``load_fights_data`` does for the fights table. Both tables are later
    joined on EVENT, so a stray leading/trailing space on either side would
    silently leave the fight without a date.

    Args:
        filepath: Anything ``pandas.read_csv`` accepts (path, URL, buffer).

    Returns:
        DataFrame with columns EVENT (str) and DATE (datetime64).
    """
    print(f"Loading events data from {filepath}.")
    events = pd.read_csv(filepath, usecols=['EVENT', 'DATE'])

    # Normalise the join key the same way the fights loader does.
    events['EVENT'] = events['EVENT'].str.strip()
    events['DATE'] = pd.to_datetime(events['DATE'])
    return events


def merge_fights_events(fights, events):
    """Attach event columns to every fight and order fights oldest-first.

    A left join on EVENT keeps every fight even when no matching event row
    exists (its DATE is then missing). The result is sorted by DATE in
    ascending order and given a fresh 0..n-1 index.

    Returns:
        The merged, chronologically ordered DataFrame.
    """
    print("Merging fights data with events data.")
    dated_fights = pd.merge(fights, events, how='left', on='EVENT')
    chronological = dated_fights.sort_values(by="DATE", ascending=True)
    return chronological.reset_index(drop=True)


def determine_winner(row):
    """Return the winning fighter's name for one fight row.

    OUTCOME "W/L" means FIGHTER1 won and "L/W" means FIGHTER2 won; any other
    outcome value yields None.
    """
    first, second = row["FIGHTER1"], row["FIGHTER2"]
    outcome = row["OUTCOME"]
    winner_for = {"W/L": first, "L/W": second}
    return winner_for[outcome] if outcome in winner_for else None


def add_winner_column(data):
    """Fill ``data['WINNER']`` by applying ``determine_winner`` to each row.

    The column is written onto the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    print("Adding WINNER column.")
    winners = data.apply(determine_winner, axis=1)
    data["WINNER"] = winners
    return data


def map_finishing_methods(data):
    """Replace ``data['METHOD']`` labels with their numeric K-factor weights.

    Labels are stripped of surrounding whitespace before the lookup, so
    "KO/TKO" and "KO/TKO " (and e.g. "Decision - Majority" with or without a
    trailing space) resolve to the same weight. Labels that are not in the
    table, and missing values, become NaN.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    print("Mapping fight finishing methods to numerical values.")
    # These values were determined via general knowledge of mma and what is considered a "good" way of winning
    fight_finishing_methods = {
        "Submission": 8,
        "SUB": 8,
        "KO/TKO": 9,
        "TKO - Doctor's Stoppage": 5,
        "Other": 3,
        "Decision - Unanimous": 9,
        "U-DEC": 9,
        "Decision - Split": 4,
        "S-DEC": 4,
        "Overturned": 1,
        "Decision - Majority": 5,
        "DQ": 3,
        "Could Not Continue": 2,
    }

    # Strip only real strings; NaN/None pass through and map to NaN below.
    normalized = data['METHOD'].map(
        lambda label: label.strip() if isinstance(label, str) else label
    )
    data['METHOD'] = normalized.map(fight_finishing_methods)
    return data


def calculate_elo_ratings(data, k_base, initial_elo=1500):
    """Run an Elo update over every fight in ``data``, in row order.

    The K-factor of each fight is ``METHOD * k_base``, where METHOD is the
    numeric finishing-method weight already stored in the row. Fighters
    start at ``initial_elo`` the first time they appear.

    Scoring:
        * WINNER equals FIGHTER1 -> fighter 1 scores 1, fighter 2 scores 0.
        * WINNER equals FIGHTER2 -> fighter 1 scores 0, fighter 2 scores 1.
        * anything else (no winner recorded) -> both score 0.5, so neither
          fighter is credited with a win they did not get.

    Rows with a missing METHOD or a missing fighter name are skipped: they
    neither change ratings nor appear in the history.

    Args:
        data: DataFrame with FIGHTER1, FIGHTER2, WINNER, METHOD, EVENT, DATE,
            TIME FORMAT, ROUND and TIME columns.
        k_base: Multiplier applied to the METHOD weight.
        initial_elo: Rating assigned to a fighter on first appearance.

    Returns:
        ``(ratings, history)``: ``ratings`` maps fighter name to final
        rating; ``history`` is a list with one dict per processed fight
        holding the ratings before/after and the fight metadata.
    """
    print(f"Calculating Elo ratings with k_base={k_base}.")
    ratings = {}
    history = []

    for _, row in data.iterrows():
        fighter1 = row['FIGHTER1']
        fighter2 = row['FIGHTER2']
        winner = row['WINNER']
        method = row['METHOD']

        if pd.isnull(fighter1) or pd.isnull(fighter2):
            # Without both names there is nobody to rate; a null key would
            # otherwise end up in the ratings table.
            print(f"Skipping fight in {row['EVENT']} due to missing fighter name.")
            continue
        if pd.isnull(method):
            # If METHOD is NaN, skip this fight as it affects k_factor
            print(f"Skipping fight between {fighter1} and {fighter2} due to missing METHOD.")
            continue
        k_factor = method * k_base

        # Assign initial elo score for first instance
        rating1 = ratings.get(fighter1, initial_elo)
        rating2 = ratings.get(fighter2, initial_elo)

        # Calculate expected scores
        exp_score1 = 1 / (1 + 10 ** ((rating2 - rating1) / 400))
        exp_score2 = 1 - exp_score1

        # Calculate actual scores; a fight without a winner is split evenly
        # instead of being counted as a win for fighter 2.
        if winner == fighter1:
            actual1 = 1
        elif winner == fighter2:
            actual1 = 0
        else:
            actual1 = 0.5
        actual2 = 1 - actual1

        # Update ratings
        new_rating1 = rating1 + k_factor * (actual1 - exp_score1)
        new_rating2 = rating2 + k_factor * (actual2 - exp_score2)

        ratings[fighter1] = new_rating1
        ratings[fighter2] = new_rating2

        # Record fight history
        history.append({
            'fighter1': fighter1,
            'fighter2': fighter2,
            'winner': winner,
            'rating1_before': rating1,
            'rating2_before': rating2,
            'rating1_after': new_rating1,
            'rating2_after': new_rating2,
            'event': row['EVENT'],
            'date': row['DATE'],
            'time_format': row['TIME FORMAT'],
            'round': row['ROUND'],
            'timestamp': row['TIME']
        })

    return ratings, history


def win_loss_numeric_f1(row):
    """Return 1 when the row's 'fighter1' is its 'winner', otherwise 0."""
    if row["fighter1"] == row['winner']:
        return 1
    return 0


def win_loss_numeric_f2(row):
    """Return 1 when the row's 'fighter2' is its 'winner', otherwise 0."""
    if row["fighter2"] == row['winner']:
        return 1
    return 0


def calculate_total_time(row):
    """Return ``(minutes, seconds)`` elapsed when the fight ended.

    'timestamp' is the "M:SS" clock within the final round and 'round' is the
    round in which the fight ended. Every earlier round is counted as a full
    5 minutes.

    On any error the problem is printed and ``(None, None)`` is returned.
    """
    try:
        clock = row['timestamp']
        minutes, seconds = map(int, clock.split(':'))
        # Each round is 5 minutes
        seconds_before_final_round = (row['round'] - 1) * 5 * 60
        seconds_in_final_round = minutes * 60 + seconds
        elapsed_seconds = seconds_before_final_round + seconds_in_final_round
        return elapsed_seconds / 60, elapsed_seconds
    except Exception as e:
        print(f"Error calculating total time for fight between {row['fighter1']} and {row['fighter2']} on {row['date']}: {e}")
        return None, None


def process_history(history):
    """Turn the per-fight history records into an analysis-ready DataFrame.

    Steps: build the frame, sort it newest-first by 'date', add the 0/1
    'fighter1_outcome' / 'fighter2_outcome' columns, add the
    'total_time_minutes' / 'total_time_seconds' columns, and finally drop
    every row that contains any missing value.

    Args:
        history: List of per-fight dicts as produced by
            ``calculate_elo_ratings``.

    Returns:
        The processed DataFrame. An empty ``history`` returns an empty
        DataFrame right away instead of failing on the missing 'date' column.
    """
    print("Processing fight history into DataFrame.")
    history_df = pd.DataFrame(history)

    # With no records the frame has no columns at all, so sorting by 'date'
    # below would raise KeyError; callers check ``.empty`` on the result.
    if history_df.empty:
        return history_df

    # Sort by date
    history_df = history_df.sort_values(by='date', ascending=False).reset_index(drop=True)

    # Generate outcome columns
    history_df['fighter1_outcome'] = history_df.apply(win_loss_numeric_f1, axis=1)
    history_df['fighter2_outcome'] = history_df.apply(win_loss_numeric_f2, axis=1)

    # Calculate total fight time
    print("Calculating total fight time.")
    history_df[['total_time_minutes', 'total_time_seconds']] = history_df.apply(
        lambda row: pd.Series(calculate_total_time(row)), axis=1
    )

    # Drop rows with missing values
    history_df = history_df.dropna().reset_index(drop=True)

    return history_df


def save_history(history_df, filepath):
    """Write the fight history to ``filepath`` as CSV with upper-cased headers.

    The upper-casing is done on a copy, so ``history_df`` keeps its original
    column names. The index is not written.
    """
    print(f"Saving fight history to {filepath}.")
    upper_names = [label.upper() for label in history_df.columns]
    exported = history_df.copy()
    exported.columns = upper_names
    exported.to_csv(filepath, index=False)
    print("Fight history saved successfully.")


def train_and_evaluate_model(history_df):
    """Train a Random Forest on pre-fight ratings and report hold-out metrics.

    Features are 'rating1_before' and 'rating2_before'; the target is
    'fighter1_outcome'. The data is split 80/20 with a fixed random state,
    and accuracy, the confusion matrix and the classification report for the
    20% hold-out set are printed.

    ``history_df`` is not modified: the integer cast of the target is done on
    a separate Series.

    Args:
        history_df: Processed fight history containing the three columns
            named above.

    Returns:
        The hold-out accuracy, so callers can compare runs (e.g. different
        k values).
    """
    print("Training and evaluating the Random Forest model.")

    # Features and target
    X = history_df[['rating1_before', 'rating2_before']]
    y = history_df['fighter1_outcome'].astype(int)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Initialize and train the classifier
    rf_classifier = RandomForestClassifier(
        n_estimators=100, random_state=42, class_weight='balanced'
    )
    rf_classifier.fit(X_train, y_train)
    print("Random Forest model trained.")

    # Predictions
    y_pred = rf_classifier.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.2f}")

    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{conf_matrix}")

    class_report = classification_report(y_test, y_pred)
    print(f"Classification Report:\n{class_report}")

    return accuracy


def find_best_k(data, k_values, initial_elo=1500):
    """Pick the ``k_base`` whose Elo ratings best predict fight outcomes.

    For each candidate k the Elo history is recomputed and processed, a
    Random Forest is trained on the pre-fight ratings (80/20 split, fixed
    random state), and its hold-out accuracy is recorded. The first k with
    the strictly highest accuracy wins.

    Candidates that produce no usable fights are skipped, both when the Elo
    pass records nothing and when processing leaves no complete rows.

    Args:
        data: Preprocessed fights DataFrame passed to
            ``calculate_elo_ratings``.
        k_values: Iterable of candidate ``k_base`` values.
        initial_elo: Starting rating for every fighter.

    Returns:
        ``(best_k, best_accuracy, best_history_df, best_ratings)``. When no
        candidate could be evaluated this is ``(None, 0, None, None)``.
    """
    best_k = None
    best_accuracy = 0
    best_history_df = None
    best_ratings = None

    for k in k_values:
        print(f"\nEvaluating k={k}")
        ratings, history = calculate_elo_ratings(data, k, initial_elo)

        # process_history sorts by 'date', a column that does not exist when
        # no fight was recorded, so bail out before calling it.
        if not history:
            print(f"No valid fights to process for k={k}. Skipping.")
            continue

        history_df = process_history(history)

        # Check if history_df is empty to avoid training on empty data
        if history_df.empty:
            print(f"No valid fights to process for k={k}. Skipping.")
            continue

        # Train and evaluate the model
        X = history_df[['rating1_before', 'rating2_before']]
        y = history_df['fighter1_outcome']

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Initialize and train the classifier
        rf_classifier = RandomForestClassifier(
            n_estimators=100, random_state=42, class_weight='balanced'
        )
        rf_classifier.fit(X_train, y_train)

        # Predictions
        y_pred = rf_classifier.predict(X_test)

        # Evaluation
        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model Accuracy for k={k}: {accuracy:.2f}")

        # ``best_k is None`` lets the first evaluated k register even when
        # its accuracy is exactly 0.0.
        if best_k is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
            best_history_df = history_df
            best_ratings = ratings

    print(f"\nBest k value: {best_k} with accuracy: {best_accuracy:.2f}")
    return best_k, best_accuracy, best_history_df, best_ratings


def display_top_fighters(history_df, ratings, top_n=40, save_csv=False, csv_path=''):
    """Print the ``top_n`` highest-rated fighters and optionally save them.

    Args:
        history_df: Not used by this function.
        ratings: Mapping of fighter name to final Elo rating.
        top_n: Number of fighters to keep after sorting by rating, highest
            first.
        save_csv: When true (and ``csv_path`` is non-empty) the table is also
            written to ``csv_path`` without the index.
        csv_path: Destination CSV path.
    """
    # One row per fighter, then keep the highest-rated ``top_n``.
    ranking_rows = list(ratings.items())
    final_ratings_df = pd.DataFrame(ranking_rows, columns=['fighter', 'final_rating'])
    ordered = final_ratings_df.sort_values(by='final_rating', ascending=False)
    final_ratings_df = ordered.head(top_n)

    # Print top fighters
    print(f"\nTop {top_n} fighters by final Elo rating:")
    print(final_ratings_df.to_string())

    # Save to CSV
    if not (save_csv and csv_path):
        return
    final_ratings_df.to_csv(csv_path, index=False)
    print(f"Final ratings saved to {csv_path}.")



# Generate scores workflow
#
# Loads fights and events, derives winners and method weights, searches for
# the k value with the best model accuracy, then saves the fight history and
# the fighter rankings for that k.

# File paths: inputs are read from GitHub, outputs are written under
# /workspaces/codespaces-jupyter/data.
fights_csv = r'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/refs/heads/main/ufc_fight_results.csv'
events_csv = r'https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/refs/heads/main/ufc_event_details.csv'
output_csv = r'/workspaces/codespaces-jupyter/data/ufc_fight_results_with_elo.csv'
rankings_csv = r'/workspaces/codespaces-jupyter/data/ufc_all_time_elo_rankings.csv'

# Load and preprocess data
fights = load_fights_data(fights_csv)
events = load_events_data(events_csv)
data = map_finishing_methods(
    add_winner_column(
        merge_fights_events(fights, events)
    )
)

# List of k values to evaluate: 1 through 10, then a few larger steps
k_values = [*range(1, 11), 15, 50, 70, 100]

# Get best k value
best_k, best_accuracy, best_history_df, best_ratings = find_best_k(data, k_values)

if best_k is None or best_history_df is None:
    print("\nNo suitable k value found to save fight history.")
else:
    # Create the directory
    import os
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    # Save the processed history with the best k value
    save_history(best_history_df, output_csv)
    print(f"\nResults saved to: {output_csv}")

    # Print history
    print("\nSample of processed fight history with the best k value:")
    print(best_history_df.head().to_string())

    # Display top fighters based on elo and save to CSV
    display_top_fighters(best_history_df, best_ratings, top_n=50, save_csv=True, csv_path=rankings_csv)
    print(f"\nTop fighters rankings saved to: {rankings_csv}")

Loading fights data from https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/refs/heads/main/ufc_fight_results.csv.
Cleaning fights data.
Loading events data from https://raw.githubusercontent.com/Greco1899/scrape_ufc_stats/refs/heads/main/ufc_event_details.csv.
Merging fights data with events data.
Adding WINNER column.
Mapping fight finishing methods to numerical values.

Evaluating k=1
Calculating Elo ratings with k_base=1.
Processing fight history into DataFrame.
Calculating total fight time.
Model Accuracy for k=1: 0.59

Evaluating k=2
Calculating Elo ratings with k_base=2.
Processing fight history into DataFrame.
Calculating total fight time.
Model Accuracy for k=2: 0.59

Evaluating k=3
Calculating Elo ratings with k_base=3.
Processing fight history into DataFrame.
Calculating total fight time.
Model Accuracy for k=3: 0.59

Evaluating k=4
Calculating Elo ratings with k_base=4.
Processing fight history into DataFrame.
Calculating total fight time.
Model Accuracy for k=4: 