In [8]:
import pandas as pd

# Import
-   scaling
-   preprocess

In [9]:
# Load the raw training and validation tables from their Excel exports.
train_df = pd.read_excel("data/train.xlsx")
validation_df = pd.read_excel("data/validation.xlsx")

# Magic
-   scaling
-   remove unnecessary columns if any, e.g. PCA, etc.

In [10]:
# Work on copies so the frames loaded from disk stay untouched by the
# transformations below.
train, validation = train_df.copy(), validation_df.copy()

In [11]:
# Display (rows, columns) of the working training frame.
train.shape

(13285, 87)

In [12]:
# Display (rows, columns) of the working validation frame.
validation.shape

(13, 86)

In [13]:
# Keep the label aside and remove it from the feature frame so it is not
# encoded or scaled together with the features.
target = train["red_fighter_win"]
train = train.drop(columns="red_fighter_win")

In [14]:
def add_fighter_ages(df):
    """
    Derive each fighter's age in years on the day of the fight.

    Parameters:
    -----------
    df : pandas.DataFrame
        Fight-level data providing the columns
        - event_date: date of the fight
        - red_fighter_dob: date of birth of the red corner fighter
        - blue_fighter_dob: date of birth of the blue corner fighter
        The difference between event_date and each date of birth must
        support the ``.dt.days`` accessor (i.e. datetime-like columns).

    Returns:
    --------
    pandas.DataFrame
        A copy of ``df`` extended with
        - red_fighter_age
        - blue_fighter_age
        expressed in years and rounded to one decimal place.
        The frame passed in is not modified.
    """
    days_per_year = 365.25  # average year length, leap years included

    # (date-of-birth column, age column), red corner first so the new
    # columns are appended in that order.
    corner_columns = (
        ('red_fighter_dob', 'red_fighter_age'),
        ('blue_fighter_dob', 'blue_fighter_age'),
    )

    with_ages = df.copy()
    for dob_column, age_column in corner_columns:
        elapsed_days = (with_ages['event_date'] - with_ages[dob_column]).dt.days
        # One decimal place is enough precision and keeps the values readable.
        with_ages[age_column] = (elapsed_days / days_per_year).round(1)

    return with_ages

# Add the age features, then discard the raw date columns they were derived from.
train = add_fighter_ages(train).drop(
    columns=["red_fighter_dob", "blue_fighter_dob", "event_date"]
)
validation = add_fighter_ages(validation).drop(
    columns=["red_fighter_dob", "blue_fighter_dob", "event_date"]
)

In [15]:
def compare_column_distributions(train_df, val_df, columns=None, verbose=True):
    """
    Compare the distributions of columns between training and validation datasets
    to check if they're in similar ranges and likely using the same units.

    Parameters:
    -----------
    train_df : pandas.DataFrame
        Training dataset
    val_df : pandas.DataFrame
        Validation dataset
    columns : list or None
        List of columns to compare. If None, compares every numeric
        (non-boolean) column of ``train_df``, whatever its integer/float width.
    verbose : bool
        Whether to print the per-column statistics, the skipped columns and
        the detected potential issues.

    Returns:
    --------
    dict
        Dictionary with column names as keys and comparison results as values.
        Each result has the keys 'train' and 'validation', each holding the
        min, max, mean, median and std of that column in the respective
        dataset. Columns that are missing from either dataset, or that are not
        numeric in both, are skipped and do not appear in the result.

    Notes:
    ------
    The potential-issue warnings are only printed (when ``verbose`` is True);
    they are not part of the return value.
    """
    import pandas as pd
    from pandas.api.types import is_bool_dtype, is_numeric_dtype

    def _is_comparable(series):
        # pandas' own dtype check also understands extension dtypes
        # (categorical, string, nullable integers, ...), on which
        # np.issubdtype raises TypeError instead of answering False.
        # Booleans count as numeric for pandas but are excluded here, as
        # they were before.
        return is_numeric_dtype(series.dtype) and not is_bool_dtype(series.dtype)

    def _summarise(series):
        return {
            'min': series.min(),
            'max': series.max(),
            'mean': series.mean(),
            'median': series.median(),
            'std': series.std()
        }

    def _format_stats(stats):
        return (
            f"min={stats['min']:.2f}, max={stats['max']:.2f}, mean={stats['mean']:.2f}, "
            f"median={stats['median']:.2f}, std={stats['std']:.2f}"
        )

    # If no columns specified, use all numeric columns
    if columns is None:
        columns = [col for col in train_df.columns if _is_comparable(train_df[col])]

    results = {}
    potential_issues = []

    for col in columns:
        # Skip if column doesn't exist in both dataframes
        if col not in train_df.columns or col not in val_df.columns:
            if verbose:
                print(f"Column '{col}' not found in both datasets. Skipping.")
            continue

        # Skip non-numeric columns
        if not (_is_comparable(train_df[col]) and _is_comparable(val_df[col])):
            if verbose:
                print(f"Column '{col}' is not numeric in one or both datasets. Skipping.")
            continue

        train_stats = _summarise(train_df[col])
        val_stats = _summarise(val_df[col])

        results[col] = {
            'train': train_stats,
            'validation': val_stats
        }

        # 1. Check if ranges are completely different
        if (val_stats['min'] > train_stats['max']) or (val_stats['max'] < train_stats['min']):
            potential_issues.append(f"WARNING: Column '{col}' has non-overlapping ranges between train and validation")

        # A standard deviation is NaN when a dataset has fewer than two
        # non-null values in the column (e.g. a single-row validation set).
        # Only defined values take part in the checks below, so the outcome
        # no longer depends on which side the NaN is on.
        defined_stds = [
            std for std in (train_stats['std'], val_stats['std']) if pd.notna(std)
        ]

        # 2. Check if means are very different (more than 3 std deviations apart)
        if defined_stds:
            mean_diff = abs(train_stats['mean'] - val_stats['mean'])
            mean_threshold = 3 * max(defined_stds)
            if mean_diff > mean_threshold:
                potential_issues.append(f"WARNING: Column '{col}' has significantly different means between train and validation")

        # 3. Check if one dataset has much higher variance (needs both stds)
        if len(defined_stds) == 2:
            # The small epsilon avoids a division by zero for constant columns.
            std_ratio = max(defined_stds) / (min(defined_stds) + 1e-10)
            if std_ratio > 5:  # Arbitrary threshold
                potential_issues.append(f"WARNING: Column '{col}' has much higher variance in one dataset (ratio: {std_ratio:.2f})")

    # Print results if verbose
    if verbose:
        print("\n=== Column Distribution Comparison ===\n")
        for col, stats in results.items():
            print(f"Column: {col}")
            print(f"  Train:      {_format_stats(stats['train'])}")
            print(f"  Validation: {_format_stats(stats['validation'])}")
            print()

        if potential_issues:
            print("\n=== Potential Issues Detected ===\n")
            for issue in potential_issues:
                print(issue)

    return results

# Compare every numeric column of the two working frames and print the report.
comparison_results = compare_column_distributions(train_df=train, val_df=validation)

# To check specific columns like height and weight:
# height_weight_comparison = compare_column_distributions(
#     train, validation, 
#     columns=['red_fighter_height', 'blue_fighter_height', 'red_fighter_weight', 'blue_fighter_weight']
# )


=== Column Distribution Comparison ===

Column: red_body_attempted_weighted_trailing
  Train:      min=0.00, max=35.82, mean=4.24, median=3.41, std=3.87
  Validation: min=1.00, max=14.86, mean=6.38, median=6.00, std=4.58

Column: red_body_landed_weighted_trailing
  Train:      min=0.00, max=30.16, mean=2.96, median=2.32, std=2.84
  Validation: min=0.00, max=9.24, mean=4.25, median=4.73, std=2.93

Column: red_clinch_attempted_weighted_trailing
  Train:      min=0.00, max=39.00, mean=2.82, median=1.73, std=3.51
  Validation: min=0.00, max=6.63, mean=2.03, median=1.10, std=2.19

Column: red_clinch_landed_weighted_trailing
  Train:      min=0.00, max=30.00, mean=1.96, median=1.09, std=2.56
  Validation: min=0.00, max=6.36, mean=1.64, median=0.84, std=1.96

Column: red_ctrl_weighted_trailing
  Train:      min=0.00, max=295.00, mean=53.18, median=30.35, std=59.89
  Validation: min=0.00, max=173.25, mean=84.57, median=84.58, std=55.66

Column: red_distance_attempted_weighted_trailing
  Train

After manually checking the results, I do not think any of these warnings indicate a difference in units of measurement between the training and validation data.

In [16]:
# List the column dtypes and non-null counts of the training features.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13285 entries, 0 to 13284
Data columns (total 85 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   weight_class                                13285 non-null  object 
 1   red_body_attempted_weighted_trailing        13285 non-null  float64
 2   red_body_landed_weighted_trailing           13285 non-null  float64
 3   red_clinch_attempted_weighted_trailing      13285 non-null  float64
 4   red_clinch_landed_weighted_trailing         13285 non-null  float64
 5   red_ctrl_weighted_trailing                  13285 non-null  float64
 6   red_distance_attempted_weighted_trailing    13285 non-null  float64
 7   red_distance_landed_weighted_trailing       13285 non-null  float64
 8   red_fighter_avg_fight_time                  13285 non-null  float64
 9   red_fighter_days_since_last                 13285 non-null  int64  
 10  red_fighte

In [17]:
# Categorical features to encode: every object-typed column except the two
# fighter-name columns.
cat_cols = [
    col
    for col in train.select_dtypes(include=['object'])
    if col not in ('red_fighter_name', 'blue_fighter_name')
]

In [18]:
# Display the categorical columns selected for encoding.
cat_cols

['weight_class', 'red_fighter_stance', 'blue_fighter_stance']

In [19]:
# Compare which stance categories occur in each dataset, and how often.
for col in ['red_fighter_stance', 'blue_fighter_stance']:
    print(train[col].value_counts(), validation[col].value_counts(), sep="\n")

red_fighter_stance
Orthodox       9937
Southpaw       2649
Switch          675
Open Stance      22
Sideways          2
Name: count, dtype: int64
red_fighter_stance
Orthodox    9
Switch      3
Southpaw    1
Name: count, dtype: int64
blue_fighter_stance
Orthodox       10021
Southpaw        2510
Switch           737
Open Stance       12
Sideways           5
Name: count, dtype: int64
blue_fighter_stance
Orthodox    11
Southpaw     2
Name: count, dtype: int64


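I remove the "Women's" prefix from the weight classes of the validation data (the training weight classes printed below contain no such prefix). I assume the GCN will only learn from edges, and I expect there won't be any edges between male and female fighters.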

In [20]:
# Remove the literal "Women's" qualifier and any whitespace left around the
# remaining class name.
validation['weight_class'] = (
    validation['weight_class']
    .str.replace("Women's", '', regex=False)
    .str.strip()
)

In [21]:
# Compare the weight classes present in each dataset.
print(
    train["weight_class"].value_counts(),
    validation["weight_class"].value_counts(),
    sep="\n",
)

weight_class
Welterweight         2395
Lightweight          2076
Middleweight         1774
Flyweight            1699
Bantamweight         1638
Featherweight        1410
Heavyweight          1180
Light Heavyweight    1113
Name: count, dtype: int64
weight_class
Lightweight          3
Welterweight         2
Strawweight          2
Light Heavyweight    1
Featherweight        1
Flyweight            1
Heavyweight          1
Middleweight         1
Bantamweight         1
Name: count, dtype: int64


In [22]:
# The training weight classes printed above contain no 'Strawweight', so those
# validation rows are relabelled as 'Flyweight'.
validation['weight_class'] = validation['weight_class'].replace({'Strawweight': 'Flyweight'})

In [23]:
# One-Hot Encoding with pandas get_dummies: each category becomes its own
# indicator column.
#
# Only the TRAINING frame drops its first (reference) level. get_dummies picks
# the level to drop from the categories present in the frame it is given, so
# encoding both frames with drop_first=True let the small validation set drop a
# different level than train (for the stance columns: "Orthodox" instead of
# "Open Stance", which never occurs in validation). After alignment the
# Orthodox indicator was then zero-filled, i.e. those fighters were silently
# encoded as the training reference level.
for cat_col in cat_cols:
    train = pd.get_dummies(train, columns=[cat_col], drop_first=True)
    # Keep every level here; the alignment below discards the surplus ones.
    validation = pd.get_dummies(validation, columns=[cat_col], drop_first=False)

    # Keep exactly the training columns, in training order:
    # - indicator columns that exist only in validation (the training reference
    #   level, or a category never seen in training) are dropped, so such rows
    #   are encoded as the reference level;
    # - training columns missing from validation are added, filled with 0.
    train, validation = train.align(validation, join='left', axis=1, fill_value=0)

In [24]:
def convert_int_to_float(df):
    """
    Convert all integer and boolean columns in a DataFrame to float.

    Every NumPy integer width is covered (int8 ... int64, signed and
    unsigned), not only int16/int32/int64. Boolean columns are converted too
    (True -> 1.0, False -> 0.0). All other columns are returned unchanged.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing columns to convert

    Returns:
    --------
    pandas.DataFrame
        New DataFrame with all integer and boolean columns converted to
        float; the DataFrame passed in is not modified.
    """
    # 'integer' selects every NumPy integer dtype regardless of width or sign.
    int_cols = df.select_dtypes(include=['integer', 'bool']).columns

    # astype with a column -> dtype mapping returns a new frame and leaves the
    # columns that are not listed as they are.
    return df.astype({col: float for col in int_cols})

# Give both frames purely floating-point numeric columns.
train, validation = (convert_int_to_float(frame) for frame in (train, validation))

In [25]:
# Re-check the training dtypes after one-hot encoding and the float conversion.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13285 entries, 0 to 13284
Data columns (total 97 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   red_body_attempted_weighted_trailing        13285 non-null  float64
 1   red_body_landed_weighted_trailing           13285 non-null  float64
 2   red_clinch_attempted_weighted_trailing      13285 non-null  float64
 3   red_clinch_landed_weighted_trailing         13285 non-null  float64
 4   red_ctrl_weighted_trailing                  13285 non-null  float64
 5   red_distance_attempted_weighted_trailing    13285 non-null  float64
 6   red_distance_landed_weighted_trailing       13285 non-null  float64
 7   red_fighter_avg_fight_time                  13285 non-null  float64
 8   red_fighter_days_since_last                 13285 non-null  float64
 9   red_fighter_defense                         13285 non-null  float64
 10  red_fighte

In [26]:
# Same check for the validation frame; its column list can be compared with the training one.
validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 97 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   red_body_attempted_weighted_trailing        13 non-null     float64
 1   red_body_landed_weighted_trailing           13 non-null     float64
 2   red_clinch_attempted_weighted_trailing      13 non-null     float64
 3   red_clinch_landed_weighted_trailing         13 non-null     float64
 4   red_ctrl_weighted_trailing                  13 non-null     float64
 5   red_distance_attempted_weighted_trailing    13 non-null     float64
 6   red_distance_landed_weighted_trailing       13 non-null     float64
 7   red_fighter_avg_fight_time                  13 non-null     float64
 8   red_fighter_days_since_last                 13 non-null     float64
 9   red_fighter_defense                         13 non-null     float64
 10  red_fighter_heig

In [27]:
def scale_features_for_gcn(train_df, validation_df):
    """
    Scale the continuous features of both datasets with a scaler fitted on
    the training data only.

    Motivation for GCN training:
    1. Bring features to comparable ranges so none of them dominates
    2. Limit the influence of outliers, which could cause gradient issues
    3. Keep the relative relationships between features

    Which columns are scaled:
    - only float64 columns of ``train_df``
    - not those whose name contains 'stance' or 'weight_class'
      (the one-hot encoded indicators)
    - not a column named 'target'

    Parameters:
    -----------
    train_df : pandas.DataFrame
        Training dataset; the scaler is fitted on it
    validation_df : pandas.DataFrame
        Validation dataset; must contain every scaled training column

    Returns:
    --------
    train_scaled : pandas.DataFrame
        Copy of the training dataset with the selected columns scaled
    validation_scaled : pandas.DataFrame
        Copy of the validation dataset, transformed with the same scaler

    A short summary (number of scaled columns, range of the per-column means
    and standard deviations of the scaled training data) is printed.
    """
    from sklearn.preprocessing import RobustScaler

    # Name fragments that identify the one-hot encoded indicator columns.
    one_hot_markers = ('stance', 'weight_class')

    def _should_scale(column_name):
        if column_name == 'target':
            return False
        return all(marker not in column_name for marker in one_hot_markers)

    scale_cols = [
        name
        for name in train_df.select_dtypes(include=['float64']).columns
        if _should_scale(name)
    ]

    # RobustScaler is less influenced by outliers than mean/variance scaling,
    # which matters for fighting statistics with extreme values.
    robust = RobustScaler()

    # Leave the caller's frames untouched.
    train_scaled = train_df.copy()
    validation_scaled = validation_df.copy()

    # Fit on the training data only, then apply the same transform to both.
    train_scaled[scale_cols] = robust.fit_transform(train_df[scale_cols])
    validation_scaled[scale_cols] = robust.transform(validation_df[scale_cols])

    # Report how the scaled training columns are distributed.
    column_means = train_scaled[scale_cols].mean()
    column_stds = train_scaled[scale_cols].std()
    print(f"Scaled {len(scale_cols)} numerical features")
    print("\nScaling summary (training data):")
    print(f"Mean range: [{column_means.min():.2f}, {column_means.max():.2f}]")
    print(f"Std range: [{column_stds.min():.2f}, {column_stds.max():.2f}]")

    return train_scaled, validation_scaled

# Scale both frames with a scaler fitted on the training features.
train_scaled, validation_scaled = scale_features_for_gcn(
    train_df=train,
    validation_df=validation,
)



Scaled 76 numerical features

Scaling summary (training data):
Mean range: [-0.40, 410.87]
Std range: [0.47, 1317.30]


# Saving

In [28]:
# Re-attach the label that was set aside before preprocessing; it is stored
# unscaled under the name "target".
train_scaled = train_scaled.assign(target=target)

# Persist both scaled datasets without the index column.
train_scaled.to_excel("data/train_scaled.xlsx", index=False)
validation_scaled.to_excel("data/validation_scaled.xlsx", index=False)