##### Goal: Write function using XGBoost to run model from start to finish

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, KFold #, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    mean_squared_error 
)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler
)

import xgboost as xgb

In [2]:
df = pd.read_csv('../data/model_dataset.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9792 entries, 0 to 9791
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   fighter          9792 non-null   object 
 1   opp_fighter      9792 non-null   object 
 2   title_bout       9792 non-null   int64  
 3   weight_class     9792 non-null   object 
 4   gender           9792 non-null   object 
 5   stance           9790 non-null   object 
 6   opp_stance       9790 non-null   object 
 7   diff_reach_cms   9792 non-null   float64
 8   diff_height_cms  9792 non-null   float64
 9   diff_weight_lbs  9792 non-null   int64  
 10  diff_age         9792 non-null   int64  
 11  winner           9792 non-null   int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 918.1+ KB


In [3]:
features_: list[str,...] = [
#     'fighter',
#     'opp_fighter',
    'title_bout',
    'weight_class',
    'gender',
    'stance',
    'diff_reach_cms',
    'diff_height_cms',
    'diff_weight_lbs',
    'diff_age',
]

target_: str = 'winner'

In [4]:
def parse_confusion_matrix(cm: np.ndarray):
    """
    Confusion matrix has form:
    
    True Positive     False Negative
    False Positive    True Negative
    
    """

    flatten = lambda nested_list: tuple([value for nested in nested_list for value in nested])    
    true_positive, false_negative, false_positive, true_negative = flatten(cm.tolist())

    return {
        'Correctly predicted winner': true_positive,
        'Incorrectly predicted winner': false_positive,
        
        'Incorrectly predicted loser': false_negative,
        'Correctly predicted loser': true_negative
    }

In [5]:
def run_model(df: pd.DataFrame, target: str, *args, **kwargs) -> None:

    # By default, consider all columns of dataframe as features except for target variable
    features: list[str,...] = kwargs.get('features', [column for column in df.columns if column != target])
    
    # Print model inputs and basic info to make sure running correctly
    target_msg: str = f'Classification model to predict UFC {target}.'
    features_msg: str = f'Input variables: {features}'
    
    print(*[target_msg, features_msg], sep='\n')
        
    num_features: list[str,...] = [feature for feature in features if 'diff' in feature]
    bin_features = ['title_bout']
    str_features: list[str,...] = [ feature for feature in features if feature not in num_features+bin_features ]
    
   # Optimize binary integer types
    for column in bin_features + [target]:
        df[column] = df[column].astype('uint8')

    # Check all features accounted for
    assert(sum([len(num_features), len(bin_features), len(str_features)]) == len(features))
    
    
    X: pd.DataFrame = df[features]
    y: pd.Series = df[target]

    # Column transformers
    ct = ColumnTransformer(
        transformers=[
            ('oh-encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False), str_features), # should not really be any unknowns, from info it seems as though one fight had issues with stance so just 'ignore' those 2 (2/9790 = 0.02% entries)
            ('scale', StandardScaler(), num_features),
            ('polynomial', PolynomialFeatures(include_bias=False), num_features)
        ],
        remainder='passthrough'
    )
    
    
    X_transformed = ct.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, train_size=0.8, random_state=42)

    # Initialize classifification model
    clf = xgb.XGBClassifier()

    # Fit model to transformed data
    clf.fit(X_train, y_train)
    
    
    # Output model results
    print(f'\nModel score for training data for {target}: {clf.score(X_train, y_train):.3f}')
    print(f'Model score for testing data for {target}: {clf.score(X_test, y_test):.3f}\n') # --> returns identical result to sklearn.metrics.accuracy_score(y_test, clf.predict(X_test))

    cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(f'Mean cross-validation score: {cv_scores.mean():.3f}')

    # Does almost the same as above but takes average of K-separate cross_validations --> ideally want pretty similar result
    kfold = KFold(n_splits=10, shuffle=True)
    kf_cv_scores = cross_val_score(clf, X_train, y_train, cv=kfold)
    print(f'K-fold CV average score: {kf_cv_scores.mean():.3f}')

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    print('\nConfusion matrix results:')
    print(parse_confusion_matrix(cm))
    
    return
    
    

In [6]:
run_model(df, target_, features=features_)

Classification model to predict UFC winner.
Input variables: ['title_bout', 'weight_class', 'gender', 'stance', 'diff_reach_cms', 'diff_height_cms', 'diff_weight_lbs', 'diff_age']

Model score for training data for winner: 0.778
Model score for testing data for winner: 0.533

Mean cross-validation score: 0.524
K-fold CV average score: 0.524

Confusion matrix results:
{'Correctly predicted winner': 537, 'Incorrectly predicted winner': 461, 'Incorrectly predicted loser': 454, 'Correctly predicted loser': 507}
