### Goal: Create a binary classification model to predict the outcome of a UFC fight

#### Kernel Layout

- *(optional)* Define which features to use for the model; make sure the dataset contains those columns.
    - You may need to go back to 1b_cleaning.ipynb and update the dataset to include them.
    - `load_data` reads the last saved `../data/model_dataset.csv` and returns it as a DataFrame (pass `show_info=True` to also print its summary).
- `load_model`: builds, fits, and returns a basic classification model; the returned model is reused by later steps:
    - Hyperparameter tuning
    - Feature importance 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold #, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    mean_squared_error 
)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler
)

import xgboost as xgb

In [2]:
class BinaryClassificationModel:
    """Bundle a DataFrame with its feature/target split for binary classification.

    Attributes set by ``__init__``:
        target: name of the column to predict.
        features: names of the input columns (never contains ``target``).
        df: private copy of the input restricted to ``features + [target]``.
        cat_features: non-numeric columns of ``df``.
        num_features: numeric columns of ``df`` (includes the target when it
            is numeric).
        bin_features: numeric columns with exactly two distinct non-null values.
        X: feature frame, ``df[features]``.
        y: target series, ``df[target]``.
    """

    def __init__(self, *args, df: pd.DataFrame, target: str, **kwargs):
        """Split ``df`` into features and target and classify its columns.

        Args:
            df: source data; it is never modified.
            target: name of the column to predict.
            **kwargs: optional ``features`` (iterable of column names). When
                omitted or ``None``, every column except ``target`` is used.
        """
        self.target: str = target

        requested = kwargs.get('features')
        if requested is None:
            requested = df.columns

        # Build a fresh list so a caller-supplied list is never mutated, and
        # keep the target out of the inputs even if it was passed explicitly.
        self.features: list[str] = [column for column in requested if column != target]

        # Explicit copy: columns are re-assigned below, and writing into a
        # slice of the caller's frame triggers pandas' SettingWithCopyWarning.
        self.df: pd.DataFrame = df[self.features + [target]].copy()

        # Categorical features
        self.cat_features: list[str] = list(self.df.select_dtypes(exclude='number').columns)
        # Numerical features --> TODO: optimize integer types, probably can use int16
        self.num_features: list[str] = list(self.df.select_dtypes(include='number').columns)
        # Binary features --> will include the target when it has two classes
        self.bin_features: list[str] = [
            feature for feature in self.num_features if self.df[feature].nunique() == 2
        ]

        # Downcast binary columns to save memory, but only genuine 0/1 flags:
        # casting e.g. -1/1 or 0.5/1.5 to uint8 would silently change the
        # values, and a column containing NaN cannot be cast at all.
        for column in self.bin_features:
            if self.df[column].isin([0, 1]).all():
                self.df[column] = self.df[column].astype('uint8')

        self.X: pd.DataFrame = self.df[self.features]
        self.y: pd.Series = self.df[target]

    def data_overview(self):
        """Print the column/dtype summary of ``self.df``.

        ``DataFrame.info`` prints the report itself and returns ``None``, so
        this method returns ``None`` as well.
        """
        return self.df.info()



In [4]:
# Load the saved modelling dataset from disk.
DF: pd.DataFrame = pd.read_csv('../data/model_dataset.csv')

# Wrap the dataset with `winner` as the column to predict; since no explicit
# feature list is passed, every other column is treated as a feature.
bcm = BinaryClassificationModel(df=DF, target='winner')

# Print the column/dtype summary of the wrapped DataFrame.
bcm.data_overview()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9788 entries, 0 to 9787
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   fighter          9788 non-null   object 
 1   opp_fighter      9788 non-null   object 
 2   title_bout       9788 non-null   uint8  
 3   weight_class     9788 non-null   object 
 4   stance           9788 non-null   object 
 5   opp_stance       9788 non-null   object 
 6   male             9788 non-null   uint8  
 7   diff_reach_cms   9788 non-null   float64
 8   diff_height_cms  9788 non-null   float64
 9   diff_weight_lbs  9788 non-null   int64  
 10  diff_age         9788 non-null   int64  
 11  diff_wins        9788 non-null   int64  
 12  diff_losses      9788 non-null   int64  
 13  winner           9788 non-null   uint8  
dtypes: float64(2), int64(4), object(5), uint8(3)
memory usage: 870.0+ KB


In [None]:
def load_data(**kwargs) -> pd.DataFrame:
    """Read the modelling dataset from CSV.

    Keyword Args:
        path: CSV file to read. Defaults to ``'../data/model_dataset.csv'``.
        show_info: when truthy, print the DataFrame summary
            (``DataFrame.info``) before returning. Defaults to ``False``.

    Returns:
        The loaded DataFrame.
    """
    df: pd.DataFrame = pd.read_csv(kwargs.get('path', '../data/model_dataset.csv'))

    if kwargs.get('show_info', False):
        # info() prints its report itself and returns None, so wrapping it in
        # print() would append a stray "None" line to the output.
        df.info()

    return df

In [None]:
# Smoke-test the loader: read the CSV and print its column/dtype summary.
load_data(show_info=True)

In [None]:
def load_model(df: pd.DataFrame, target: str, *args, **kwargs):
    """Fit an XGBoost classifier for ``target`` and print its evaluation metrics.

    Steps: pick the feature columns, split them into numeric / binary /
    categorical groups, hold out 20% of the rows, fit the preprocessing on the
    training rows only, fit the classifier, then print train/test accuracy,
    cross-validation scores and the confusion-matrix breakdown.

    Args:
        df: source data. It is not modified; dtype casts happen on a copy.
        target: name of the column to predict.
        **kwargs: optional ``features`` (iterable of column names). When
            omitted or ``None``, every column except ``target`` is used. The
            target is dropped from the inputs if it is listed.

    Returns:
        The fitted ``xgb.XGBClassifier``. It is trained on the transformed
        NumPy matrix, so its inputs are positional rather than named.

    Raises:
        KeyError: if ``target`` or a requested feature is not a column of ``df``.
    """
    # df: pd.DataFrame = load_data()

    # By default, consider all columns of dataframe as features except for target variable
    requested = kwargs.get('features')
    if requested is None:
        requested = df.columns
    features: list[str] = [column for column in requested if column != target]

    # Fail early with a readable message instead of deep inside pandas/sklearn.
    missing = [column for column in [*features, target] if column not in df.columns]
    if missing:
        raise KeyError(f'Columns not found in dataframe: {missing}')

    # Print model inputs and basic info to make sure running correctly
    target_msg: str = f'Classification model to predict UFC {target}.'
    features_msg: str = f'Input variables: {features}'

    print(*[target_msg, features_msg], sep='\n')

    # Partition the features into three disjoint groups. Doing it in a single
    # pass guarantees every feature lands in exactly one group, and a binary
    # column that was not requested is simply absent rather than an error.
    known_binary = ('title_bout',)
    num_features: list[str] = []
    bin_features: list[str] = []
    str_features: list[str] = []
    for feature in features:
        if 'diff' in feature:
            num_features.append(feature)
        elif feature in known_binary:
            bin_features.append(feature)
        else:
            str_features.append(feature)

    # Work on a copy so the caller's DataFrame keeps its original dtypes.
    data = df.copy()

    # Optimize binary integer types
    for column in bin_features + [target]:
        data[column] = data[column].astype('uint8')

    # info() prints its own report and returns None; print(info()) would add a stray "None".
    data.info()

    X: pd.DataFrame = data[features]
    y: pd.Series = data[target]

    # Column transformers
    ct = ColumnTransformer(
        transformers=[
            # Author's note: unknown categories are not really expected; one
            # fight appeared to have stance issues, so those rows are ignored.
            ('oh-encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False), str_features),
            ('scale', StandardScaler(), num_features),
            ('polynomial', PolynomialFeatures(include_bias=False), num_features)
        ],
        # Binary columns are not listed above, so they pass through unchanged.
        remainder='passthrough'
    )

    # Split BEFORE fitting the transformers: fitting the scaler/encoder on all
    # rows would leak statistics of the held-out rows into training.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, train_size=0.8, random_state=42
    )
    X_train = ct.fit_transform(X_train_raw)
    X_test = ct.transform(X_test_raw)

    # Initialize classification model
    clf = xgb.XGBClassifier()

    # Fit model to transformed data
    clf.fit(X_train, y_train)

    # Output model results
    print(f'\nModel score for training data for {target}: {clf.score(X_train, y_train):.3f}')
    # score() returns the same value as sklearn.metrics.accuracy_score(y_test, clf.predict(X_test))
    print(f'Model score for testing data for {target}: {clf.score(X_test, y_test):.3f}\n')

    cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(f'Mean cross-validation score: {cv_scores.mean():.3f}')

    # Same idea with 10 shuffled folds --> ideally gives a similar result.
    # The shuffle is seeded so repeated runs print the same number.
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    kf_cv_scores = cross_val_score(clf, X_train, y_train, cv=kfold)
    print(f'K-fold CV average score: {kf_cv_scores.mean():.3f}')

    y_pred = clf.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    print('\nConfusion matrix results:')
    print(parse_confusion_matrix(cm))

    return clf
    

In [None]:
def parse_confusion_matrix(cm: np.ndarray):
    """Label the four cells of a binary confusion matrix.

    ``sklearn.metrics.confusion_matrix`` puts true labels on the rows and
    predicted labels on the columns, with labels in sorted order. For 0/1
    labels (1 = winner) the layout is therefore:

        True Negative     False Positive
        False Negative    True Positive

    Args:
        cm: 2x2 confusion matrix (array or nested sequence) in the layout above.

    Returns:
        Dict mapping a human-readable description to the count in each cell.

    Raises:
        ValueError: if ``cm`` is not 2x2.
    """
    counts = np.asarray(cm)
    if counts.shape != (2, 2):
        raise ValueError(f'Expected a 2x2 confusion matrix, got shape {counts.shape}')

    # Row-major order matches the layout in the docstring; tolist() yields
    # plain Python numbers so the dict prints cleanly.
    true_negative, false_positive, false_negative, true_positive = counts.ravel().tolist()

    return {
        'Correctly predicted winner': true_positive,
        'Incorrectly predicted winner': false_positive,

        'Incorrectly predicted loser': false_negative,
        'Correctly predicted loser': true_negative
    }
# def output_model_results(clf, target: str) -> None:

    

In [None]:
def feature_importance(clf, figsize=(25, 15)):
    """Plot the feature importances of a fitted XGBoost model.

    Args:
        clf: fitted model accepted by ``xgb.plot_importance``.
        figsize: ``(width, height)`` of the figure in inches.

    Returns:
        The matplotlib Axes holding the importance plot.
    """
    # Create the sized figure first and draw into it. Calling plt.figure()
    # after plotting opens a new, empty figure and leaves the plot at the
    # default size.
    _, ax = plt.subplots(figsize=figsize)
    return xgb.plot_importance(clf, ax=ax)

In [None]:
def run_model(df: pd.DataFrame, target: str, *args, **kwargs) -> None:
    """Train and evaluate a model for ``target``, printing the results.

    Thin wrapper around ``load_model``: all arguments are forwarded as-is and
    the fitted model it returns is discarded.
    """
    load_model(df, target, *args, **kwargs)

    # output_model_results(clf, target)

In [None]:
# Reload the saved dataset (no summary printed) for the model runs below.
DF: pd.DataFrame = load_data()

# Input columns for the model. Every name must be a column of the dataset.
FEATURES: list[str] = [
    # Fighter names are currently left out of the inputs.
    # 'fighter',
    # 'opp_fighter',
    'title_bout',
    'weight_class',
    # The dataset stores gender as the binary `male` column; there is no
    # `gender` column, so selecting it would raise a KeyError.
    'male',
    'stance',
    'diff_reach_cms',
    'diff_height_cms',
    'diff_weight_lbs',
    'diff_age',

    'diff_wins',
    'diff_losses'
]

# Column the model predicts.
TARGET: str = 'winner'

In [None]:
# Fit the model on the selected features, then plot its feature importances.
feature_importance(load_model(DF, TARGET, features=FEATURES))

In [None]:
# Train and evaluate on the selected features; metrics are printed, nothing is returned.
run_model(DF, TARGET, features=FEATURES)