### Goal: Create a binary classification model to predict the outcome of a UFC fight

#### Kernel Layout

- *(optional)* Define which features to use for the model, just make sure dataset contains those columns.
    - May need to go back to 1b_cleaning.ipynb to update dataset to include them.
    - `load_data` returns info of last saved `../data/model_dataset.csv`
- `load_model` : Function to create and return basic classification model --> Will be used in various later functions
    - Hypertuning parameters
    - Feature importance 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold #, GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    mean_squared_error 
)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    PolynomialFeatures,
    StandardScaler
)

import xgboost as xgb

pd.set_option('display.memory_usage', False)

In [2]:
class BinaryClassificationModel:

    def __init__(self, df: pd.DataFrame, target: str, **kwargs):

        self.target: str = target
        self.features: list[str,...] = kwargs.get(
            'features',
            [column for column in df.columns if column != target]
        )

        # Ensures target treated separately from everything else
        if target in self.features:
            self.features.remove(target)

        # Include target in complete dataframe, done this way in case dont want to use all columns / if features explicitly passed
        self.df: pd.DataFrame = df[self.features + [target]]
        
        # Categorical features
        self.cat_features: list[str,...] = list(self.df.select_dtypes(exclude='number').columns)
        # Numerical features --> TODO: optimize integer types, probably can use int16
        self.num_features: list[str,...] = [ feature for feature in self.df.select_dtypes(include='number').columns if feature != target]
        # Binary features --> Will include target
        self.bin_features: list[str,...] = [ feature for feature in self.num_features if self.df[feature].value_counts().shape[0] == 2 and feature != target]

        # Optimize binary features and target
        for column in self.bin_features:
            self.df[column] = self.df[column].astype('uint8')


        self.X: pd.DataFrame = self.df[self.features]
        self.y: pd.Series = self.df[target]
        
    def overview(self):

        # Print model inputs and basic info
        target_msg: str = f'Classification model to predict UFC {self.target}.'
        features_msg: str = f'Input variables and data types:\n'
    
        print(*[target_msg, features_msg], sep='\n')

        
        return self.X.info()


    def initialize(self) -> None:

        # Column transformers
        ct = ColumnTransformer(
            transformers=[
                ('oh-encode', OneHotEncoder(handle_unknown='ignore', sparse_output=False), self.cat_features),
                ('scale', StandardScaler(), self.num_features),
                # ('polynomial', PolynomialFeatures(include_bias=False), self.num_features) # Better training data score but worse overall, be careful
            ],
            remainder='passthrough'
        )
    
        X_transformed = ct.fit_transform(self.X)
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X_transformed, self.y, test_size=0.2, train_size=0.8, random_state=42)
    
        # Initialize classifification model
        self.clf = xgb.XGBClassifier()
    
        # Fit model to transformed training data
        self.clf.fit(self.X_train, self.y_train)

        return None

    def parse_confusion_matrix(
        self,
        cm: np.ndarray
    ) -> dict[[str], int]:
        """
        Confusion matrix has form:
        
        True Positive     False Negative
        False Positive    True Negative
        
        """
    
        flatten = lambda nested_list: tuple([value for nested in nested_list for value in nested])    
        true_positive, false_negative, false_positive, true_negative = flatten(cm.tolist())
    
        return {
            'Correctly predicted winner': true_positive,
            'Incorrectly predicted winner': false_positive,
            
            'Incorrectly predicted loser': false_negative,
            'Correctly predicted loser': true_negative
        }

    def evaluate(self) -> None:
        # Output model results
        print(f'\nModel score for training data for {self.target}: {self.clf.score(self.X_train, self.y_train):.3f}')
        print(f'Model score for testing data for {self.target}: {self.clf.score(self.X_test, self.y_test):.3f}\n') # --> returns identical result to sklearn.metrics.accuracy_score(y_test, clf.predict(X_test))
    
        cv_scores = cross_val_score(self.clf, self.X_train, self.y_train, cv=5)
        print(f'Mean cross-validation score: {cv_scores.mean():.3f}')
    
        # Does almost the same as above but takes average of K-separate cross_validations --> ideally want pretty similar result
        kfold = KFold(n_splits=10, shuffle=True)
        kf_cv_scores = cross_val_score(self.clf, self.X_train, self.y_train, cv=kfold)
        print(f'K-fold CV average score: {kf_cv_scores.mean():.3f}')
    
        y_pred = self.clf.predict(self.X_test)
        cm = confusion_matrix(self.y_test, y_pred)
    
        print('\nConfusion matrix results:')
        print(self.parse_confusion_matrix(cm))

        return None



    def run(self) -> None:

        self.initialize()
        self.evaluate()

        return None
        
        


In [3]:
DF: pd.DataFrame = pd.read_csv('../data/model_dataset.csv')

bcm = BinaryClassificationModel(df=DF, target='winner')

# bcm.overview()
clf = bcm.run()


Model score for training data for winner: 0.754
Model score for testing data for winner: 0.599

Mean cross-validation score: 0.574
K-fold CV average score: 0.591

Confusion matrix results:
{'Correctly predicted winner': 583, 'Incorrectly predicted winner': 374, 'Incorrectly predicted loser': 411, 'Correctly predicted loser': 590}


#### Need to do feature importance and hypertuning