# UFC Fight Outcome Prediction - Machine Learning Models

This notebook implements machine learning models to predict UFC fight outcomes based on fighter statistics and historical performance data.

## Objectives:
1. Predict fight outcomes (Red vs Blue corner winner)
2. Identify the most important features for predicting fight outcomes
3. Compare multiple machine learning algorithms
4. Evaluate model performance using appropriate metrics

In [None]:
# Verify third-party dependencies, then perform the notebook-wide imports
import sys
import warnings
warnings.filterwarnings('ignore')  # suppress all warnings for the rest of the session

# Top-level package names that must be importable before anything else runs
required_libs = ['pandas', 'numpy', 'matplotlib', 'seaborn', 'sklearn']
missing_libs = []

for lib in required_libs:
    try:
        __import__(lib)
    except ImportError:
        missing_libs.append(lib)
        print(f"‚úó {lib} is missing")
    else:
        print(f"‚úì {lib} is available")

if missing_libs:
    # Explain what is missing and how to install it, then stop via SystemExit
    print(f"\n‚ùå Missing required libraries: {missing_libs}")
    print("Please install them using: pip install", ' '.join(missing_libs))
    for note in (
        "\nNote: This notebook requires the following dependencies:",
        "- pandas: for data manipulation",
        "- numpy: for numerical computations",
        "- matplotlib & seaborn: for data visualization",
        "- scikit-learn: for machine learning models",
    ):
        print(note)
    sys.exit("Stopping execution due to missing dependencies.")

# Only reached when every dependency imported (sys.exit above raises otherwise)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.impute import SimpleImputer

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# Environment report
print("\nüéâ All libraries imported successfully!")
print(f"Python version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Scikit-learn version: {__import__('sklearn').__version__}")

## 1. Data Loading and Initial Exploration

In [None]:
# Locate and load the UFC dataset, then tidy its column names
import os

# Defensive re-check that the core data libraries can be imported
try:
    import pandas as pd
    import numpy as np
except ImportError as e:
    print(f"‚ùå Missing required library: {e}")
    print("Please install required libraries: pip install pandas numpy scikit-learn matplotlib seaborn")
    df = None
else:
    # Candidate locations, tried in order; the first one that loads wins
    data_paths = ['../data/data.csv', './data/data.csv', 'data/data.csv']

    df = None
    for path in data_paths:
        try:
            if not os.path.exists(path):
                continue
            df = pd.read_csv(path)
            print(f"‚úÖ Data loaded successfully from: {path}")
            print(f"üìä Dataset shape: {df.shape}")
            print(f"üìã Columns available: {len(df.columns)}")
            break
        except Exception as e:
            # A path that exists but cannot be parsed is reported, then the next one is tried
            print(f"‚ùå Failed to load data from {path}: {e}")

    if df is not None:
        # Show the raw header before any cleaning
        print(f"\nüìã First 10 columns: {df.columns.tolist()[:10]}")

        # Strip two literal character sequences from the column names, then trim whitespace
        df.columns = (
            df.columns
            .str.replace('‚â§', '', regex=False)
            .str.replace('Ôªø', '', regex=False)  # Remove BOM if present
            .str.strip()
        )

        print(f"üìã Cleaned columns (first 10): {df.columns.tolist()[:10]}")

        if 'Winner' not in df.columns:
            print("‚ö†Ô∏è Warning: 'Winner' column not found in dataset")
            print(f"Available columns: {df.columns.tolist()}")
        else:
            print(f"üéØ Winner column found: {df['Winner'].value_counts().to_dict()}")
    else:
        # Nothing loaded: list which candidate paths exist and describe the expected file
        print("\n‚ùå Could not find or load UFC data file.")
        print("Expected data file locations:")
        for path in data_paths:
            exists = "‚úì" if os.path.exists(path) else "‚úó"
            print(f"  {exists} {path}")

        for expectation in (
            "\nüìù This notebook expects a CSV file with UFC fight data containing:",
            "  - Fighter statistics (R_ and B_ prefixed columns)",
            "  - Winner column ('Red' or 'Blue')",
            "  - Fight metadata (date, location, weight_class, etc.)",
        ):
            print(expectation)

In [None]:
# Quick overview: dataset size, label balance, weight classes and missingness
if df is not None:
    print("=== Dataset Overview ===")
    print(f"Total fights: {len(df)}")
    print(f"Total features: {len(df.columns)}")

    # Frequency table for each categorical column of interest
    for heading, column in (("\nWinner distribution:", 'Winner'),
                            ("\nWeight classes:", 'weight_class')):
        print(heading)
        print(df[column].value_counts())

    # Percentage of null entries per column, largest first
    missing_pct = df.isna().sum().div(len(df)).mul(100).sort_values(ascending=False)
    print("\nTop 10 columns with missing values:")
    print(missing_pct.head(10))

## 2. Data Preprocessing and Feature Engineering

In [None]:
def preprocess_data(df):
    """
    Preprocess the UFC dataset for machine learning.

    Only fights whose ``Winner`` is exactly ``'Red'`` or ``'Blue'`` are kept.
    Any other outcome (for example a draw or a missing winner) is dropped,
    because the binary target would otherwise label it as a Blue win.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw fight data. Must contain a ``Winner`` column. The stance and
        weight-class columns are one-hot encoded when present.

    Returns
    -------
    df_processed : pandas.DataFrame
        Copy of the decisive fights (original index labels preserved) with a
        ``red_wins`` target column (1 = Red won, 0 = Blue won) and the dummy
        columns appended.
    feature_cols : list of str
        Names of the modelling features: every numeric column not in the
        exclusion list, followed by the generated dummy columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``Winner`` column.
    """
    # Restrict to decisive results so that 0 really means "Blue won"
    decisive = df['Winner'].isin(['Red', 'Blue'])
    n_dropped = int((~decisive).sum())
    if n_dropped:
        print(f"Dropped {n_dropped} fights without a Red/Blue winner")
    df_processed = df.loc[decisive].copy()

    # Create target variable (1 if Red wins, 0 if Blue wins)
    df_processed['red_wins'] = (df_processed['Winner'] == 'Red').astype(int)

    # Select numeric features only (excluding target and non-predictive columns)
    exclude_cols = ['R_fighter', 'B_fighter', 'Referee', 'date', 'location',
                    'Winner', 'red_wins', 'title_bout']

    # Get numeric columns
    numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()

    # Remove excluded columns from numeric columns
    feature_cols = [col for col in numeric_cols if col not in exclude_cols]

    print(f"Selected {len(feature_cols)} numeric features for modeling")

    # One-hot encode the categorical columns that are present
    categorical_cols = ['R_Stance', 'B_Stance', 'weight_class']

    for col in categorical_cols:
        if col in df_processed.columns:
            # drop_first removes one level per column to avoid a redundant dummy
            dummies = pd.get_dummies(df_processed[col], prefix=col, drop_first=True)
            df_processed = pd.concat([df_processed, dummies], axis=1)
            feature_cols.extend(dummies.columns.tolist())

    # Return processed dataframe and feature columns
    return df_processed, feature_cols

# Run preprocessing only when a dataset was actually loaded
if df is not None:
    df_processed, feature_columns = preprocess_data(df)
    print("\nProcessed dataset shape:", df_processed.shape)
    print("Number of features for modeling:", len(feature_columns))

In [None]:
# Assemble the model inputs and fill in missing feature values
if df is not None:
    # Target vector and feature matrix, both taken from the processed frame
    y = df_processed['red_wins']
    X = df_processed[feature_columns]

    print(f"Features shape: {X.shape}")
    print(f"Target shape: {y.shape}")
    print(f"Target distribution: {y.value_counts().to_dict()}")

    # Median-impute every feature; rebuild a DataFrame so labels and index survive
    imputer = SimpleImputer(strategy='median')
    imputed_values = imputer.fit_transform(X)
    X_imputed = pd.DataFrame(imputed_values, index=X.index, columns=X.columns)

    print(f"\nMissing values after imputation: {X_imputed.isna().sum().sum()}")

## 3. Train-Test Split and Feature Scaling

In [None]:
# Split the data, then fit imputation and scaling on the training fold only
if df is not None:
    # Split the raw (un-imputed) feature matrix. Fitting the median imputer on
    # the full dataset before splitting would let test-set statistics leak into
    # the training features. random_state and stratify are unchanged, so the
    # row membership of each fold is the same as when splitting imputed data.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Learn the medians from training rows only, then apply them to both folds
    imputer = SimpleImputer(strategy='median')
    X_train = pd.DataFrame(
        imputer.fit_transform(X_train_raw),
        columns=X_train_raw.columns, index=X_train_raw.index
    )
    X_test = pd.DataFrame(
        imputer.transform(X_test_raw),
        columns=X_test_raw.columns, index=X_test_raw.index
    )

    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    print(f"Training target distribution: {y_train.value_counts().to_dict()}")
    print(f"Test target distribution: {y_test.value_counts().to_dict()}")

    # Scale the features: statistics come from the training fold, the test fold is only transformed
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("\nFeatures scaled successfully!")

## 4. Model Training and Evaluation

In [None]:
# Candidate classifiers, keyed by the display name used in every later report
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42)
}

# Per-model metrics, fitted estimator and test-set predictions
results = {}

if df is not None:
    print("Training models...\n")

    for name, model in models.items():
        print(f"Training {name}...")

        # Every model is trained on the standardized features
        model.fit(X_train_scaled, y_train)

        # Hard labels plus the probability of the positive class (Red wins)
        y_pred = model.predict(X_test_scaled)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

        # Hold-out metrics
        accuracy = accuracy_score(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred_proba)

        # 5-fold cross-validated accuracy on the training split.
        # NOTE(review): the folds reuse features scaled with a scaler fit on the
        # whole training split, not one refit per fold.
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')

        entry = {
            'model': model,
            'accuracy': accuracy,
            'auc': auc_score,
            'cv_mean': cv_scores.mean(),
            'cv_std': cv_scores.std(),
            'predictions': y_pred,
            'probabilities': y_pred_proba
        }
        results[name] = entry

        print(f"  Accuracy: {entry['accuracy']:.4f}")
        print(f"  AUC: {entry['auc']:.4f}")
        print(f"  CV Score: {entry['cv_mean']:.4f} (+/- {entry['cv_std'] * 2:.4f})")
        print()

    print("Model training completed!")

## 5. Model Comparison and Results

In [None]:
# Tabulate the per-model metrics and pick the top performer
if df is not None and results:
    # One record per model, in the order the models were trained
    summary_rows = [
        {
            'Model': name,
            'Test Accuracy': metrics['accuracy'],
            'AUC Score': metrics['auc'],
            'CV Mean': metrics['cv_mean'],
            'CV Std': metrics['cv_std'],
        }
        for name, metrics in results.items()
    ]

    results_df = pd.DataFrame(summary_rows).sort_values('Test Accuracy', ascending=False)
    print("=== Model Performance Summary ===")
    print(results_df.round(4))

    # The first row after sorting is the winner.
    # NOTE(review): the winner is chosen by test-set accuracy, not by the CV score.
    best_model_name = results_df['Model'].iloc[0]
    best_model = results[best_model_name]['model']
    print(f"\nBest performing model: {best_model_name}")

In [None]:
# Four-panel figure: accuracy bars, AUC bars, ROC curves, best-model confusion matrix
if df is not None and results:
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    (ax_acc, ax_auc), (ax_roc, ax_cm) = axes

    # Panels 1 and 2: one bar per model for each headline metric
    bar_panels = (
        (ax_acc, 'Test Accuracy', 'Model Accuracy Comparison', 'Accuracy'),
        (ax_auc, 'AUC Score', 'Model AUC Comparison', 'AUC Score'),
    )
    for ax, column, title, ylabel in bar_panels:
        ax.bar(results_df['Model'], results_df[column])
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=45)

    # Panel 3: ROC curve per model, with the chance diagonal for reference
    for name, metrics in results.items():
        fpr, tpr, _ = roc_curve(y_test, metrics['probabilities'])
        ax_roc.plot(fpr, tpr, label=f"{name} (AUC = {metrics['auc']:.3f})")

    ax_roc.plot([0, 1], [0, 1], 'k--', label='Random')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC Curves')
    ax_roc.legend()

    # Panel 4: confusion matrix of the best model (class 0 = Blue wins, class 1 = Red wins)
    class_labels = ['Blue Wins', 'Red Wins']
    cm = confusion_matrix(y_test, results[best_model_name]['predictions'])
    sns.heatmap(cm, annot=True, fmt='d', ax=ax_cm,
                xticklabels=class_labels,
                yticklabels=class_labels)
    ax_cm.set_title(f'Confusion Matrix - {best_model_name}')
    ax_cm.set_ylabel('True Label')
    ax_cm.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

## 6. Feature Importance Analysis

In [None]:
# Rank and plot feature importances for the tree ensembles
if df is not None and results:
    tree_models = ['Random Forest', 'Gradient Boosting']

    for model_name in tree_models:
        if model_name not in results:
            continue

        model = results[model_name]['model']

        # Pair each feature name with its importance score, highest first
        feature_importance = pd.DataFrame({
            'feature': feature_columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        # Horizontal bar chart of the 20 strongest features
        top_features = feature_importance.head(20)
        positions = range(len(top_features))

        plt.figure(figsize=(12, 8))
        plt.barh(positions, top_features['importance'])
        plt.yticks(positions, top_features['feature'])
        plt.xlabel('Feature Importance')
        plt.title(f'Top 20 Feature Importance - {model_name}')
        plt.gca().invert_yaxis()  # put the most important feature at the top
        plt.tight_layout()
        plt.show()

        print(f"\n=== Top 10 Most Important Features ({model_name}) ===")
        print(feature_importance.head(10))

## 7. Model Hyperparameter Tuning

In [None]:
# Grid-search hyperparameters for whichever model was selected as best
if df is not None and results:
    print(f"Performing hyperparameter tuning for {best_model_name}...")

    # (fresh estimator, search grid) per model name
    tuning_setups = {
        'Random Forest': (
            RandomForestClassifier(random_state=42),
            {
                'n_estimators': [100, 200, 300],
                'max_depth': [10, 20, None],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            },
        ),
        'Gradient Boosting': (
            GradientBoostingClassifier(random_state=42),
            {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_samples_split': [2, 5, 10]
            },
        ),
        'Logistic Regression': (
            LogisticRegression(random_state=42, max_iter=1000),
            {
                'C': [0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear']
            },
        ),
    }
    # Any other name falls back to the SVM configuration
    svm_setup = (
        SVC(random_state=42, probability=True),
        {
            'C': [0.1, 1, 10, 100],
            'kernel': ['rbf', 'linear'],
            'gamma': ['scale', 'auto']
        },
    )
    base_model, param_grid = tuning_setups.get(best_model_name, svm_setup)

    # Exhaustive 5-fold search over the grid, using all available cores
    grid_search = GridSearchCV(
        base_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
    )
    grid_search.fit(X_train_scaled, y_train)

    print(f"\nBest parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    # Score the refit best estimator on the hold-out set
    tuned_model = grid_search.best_estimator_
    y_pred_tuned = tuned_model.predict(X_test_scaled)
    y_pred_proba_tuned = tuned_model.predict_proba(X_test_scaled)[:, 1]

    tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
    tuned_auc = roc_auc_score(y_test, y_pred_proba_tuned)

    print("\nTuned model performance:")
    print(f"  Accuracy: {tuned_accuracy:.4f}")
    print(f"  AUC: {tuned_auc:.4f}")

    # Difference against the untuned run recorded in `results`
    baseline = results[best_model_name]
    print("\nImprovement over default:")
    print(f"  Accuracy: {tuned_accuracy - baseline['accuracy']:.4f}")
    print(f"  AUC: {tuned_auc - baseline['auc']:.4f}")

## 8. Model Interpretation and Insights

In [None]:
# Per-class report for the best model, plus probability histogram and calibration curve
if df is not None and results:
    best_entry = results[best_model_name]

    print("=== Detailed Classification Report (Best Model) ===")
    print(classification_report(y_test, best_entry['predictions'],
                                target_names=['Blue Wins', 'Red Wins']))

    # Predicted probability of the positive class (Red wins) on the test set
    pred_proba = best_entry['probabilities']

    report_fig, (hist_ax, calib_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: how the predicted probabilities are distributed around 0.5
    hist_ax.hist(pred_proba, bins=50, alpha=0.7, edgecolor='black')
    hist_ax.set_xlabel('Predicted Probability (Red Wins)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution of Prediction Probabilities')
    hist_ax.axvline(x=0.5, color='red', linestyle='--', label='Decision Threshold')
    hist_ax.legend()

    # Right panel: observed positive rate versus mean predicted probability, 10 bins
    from sklearn.calibration import calibration_curve

    fraction_of_positives, mean_predicted_value = calibration_curve(
        y_test, pred_proba, n_bins=10
    )

    calib_ax.plot(mean_predicted_value, fraction_of_positives, "s-", label=best_model_name)
    calib_ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
    calib_ax.set_xlabel('Mean Predicted Probability')
    calib_ax.set_ylabel('Fraction of Positives')
    calib_ax.set_title('Calibration Plot')
    calib_ax.legend()

    plt.tight_layout()
    plt.show()

## 9. Summary and Conclusions

In [None]:
# Closing summary: headline metrics, canned insights and next steps
if df is None or not results:
    # Either the dataset never loaded or no model results were recorded
    print("‚ùå Error: Unable to complete analysis due to data loading issues.")
    print("Please ensure the data file exists at '../data/data.csv'")
else:
    best_metrics = results[best_model_name]

    print("=== UFC Fight Outcome Prediction - Summary ===")
    print(f"\nüìä Dataset: {len(df)} fights with {len(feature_columns)} features")
    print("üéØ Target: Predict Red corner wins (vs Blue corner wins)")
    print(f"üìà Best Model: {best_model_name}")
    print(f"üéØ Best Accuracy: {best_metrics['accuracy']:.4f}")
    print(f"üìä Best AUC Score: {best_metrics['auc']:.4f}")

    # Tuned metrics exist only if the hyperparameter-tuning cell has been run
    if 'tuned_accuracy' in locals():
        print(f"‚ö° Tuned Accuracy: {tuned_accuracy:.4f}")
        print(f"‚ö° Tuned AUC Score: {tuned_auc:.4f}")

    # Static narrative text
    for line in (
        "\n=== Key Insights ===",
        "‚Ä¢ Fight outcomes can be predicted with reasonable accuracy using fighter statistics",
        "‚Ä¢ Historical performance metrics are strong predictors of fight outcomes",
        "‚Ä¢ Tree-based models (Random Forest, Gradient Boosting) tend to perform well",
        "‚Ä¢ Feature engineering and hyperparameter tuning can improve model performance",
        "\n=== Next Steps ===",
        "‚Ä¢ Collect more recent fight data to improve model accuracy",
        "‚Ä¢ Engineer additional features (momentum, recent form, style matchups)",
        "‚Ä¢ Implement ensemble methods combining multiple models",
        "‚Ä¢ Deploy the model for real-time fight outcome predictions",
    ):
        print(line)

In [None]:
# Save the best model (optional)
if df is not None and results:
    import os
    import joblib

    models_dir = '../models'
    model_slug = best_model_name.lower().replace(" ", "_")

    try:
        # Create the output directory first; otherwise the dump fails on any
        # checkout where ../models does not exist yet.
        os.makedirs(models_dir, exist_ok=True)

        # Save the best model and scaler.
        # NOTE(review): the imputer fitted earlier in the notebook is not
        # persisted here; inference on raw data with missing values needs it too.
        joblib.dump(best_model, f'{models_dir}/best_ufc_model_{model_slug}.pkl')
        joblib.dump(scaler, f'{models_dir}/feature_scaler.pkl')

        print(f"‚úÖ Best model ({best_model_name}) saved successfully!")
        print("üìÅ Model file: ../models/best_ufc_model_*.pkl")
        print("üìÅ Scaler file: ../models/feature_scaler.pkl")

    except Exception as e:
        # Saving is best-effort: report the problem without stopping the notebook
        print(f"‚ö†Ô∏è Could not save model: {e}")
        print("Check that the '../models' directory can be created and is writable.")