| Model iteration | Error metrics | Result |
| --- | --- | ---
| Logistic regression, selected features| 10-fold CV, accuracy | 0.655|

In [6]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pprint import pprint
from collections import defaultdict

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from ufc import constants, load_data

In [8]:
import pandas as pd
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
# Load data
df = load_data.read_prepped_data()

response = "outcome"

### Functions

In [72]:
def test_logistic_regression(df, features, response, k):
    # encode resposne
    le = LabelEncoder()
    df['outcome'] = le.fit_transform(df['outcome'])

    # split X, y
    X = df[features]
    y = df[response]

    # k-fold CV
    cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=constants.SEED)
    cross_val_metrics = []

    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        logistic_model = LogisticRegression(solver='liblinear')

        logistic_model.fit(X_train, y_train)
        y_pred = logistic_model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        class_report = classification_report(y_test, y_pred)

        cross_val_metrics.append({
            'Accuracy': accuracy,
            'Confusion Matrix': conf_matrix,
            'Classification Report': class_report
        })
        
    return cross_val_metrics

In [68]:
def summarise_mean_cv_error(cross_val_metrics):
    """Summarise error metrics across folds"""
    # Initialize lists to store individual metrics for each fold
    accuracies = []
    confusion_matrices = []

    # Initialize dictionaries to store classification report metrics for each fold
    precision = defaultdict(list)
    recall = defaultdict(list)
    f1_score = defaultdict(list)
    support = defaultdict(list)

    for fold in cross_val_metrics:
        accuracies.append(fold['Accuracy'])
        confusion_matrices.append(fold['Confusion Matrix'])

        # Parse and accumulate classification report metrics
        cr = fold['Classification Report'].split('\n')
        weighted_avg_index = next((i for i, s in enumerate(cr) if s.startswith('weighted avg')), None)
        weighted_avg_cr = cr[weighted_avg_index].split()

        label = weighted_avg_cr[0]
        precision[label].append(float(weighted_avg_cr[-4]))
        recall[label].append(float(weighted_avg_cr[-3]))
        f1_score[label].append(float(weighted_avg_cr[-2]))
        support[label].append(int(weighted_avg_cr[-1]))

    # Calculate the mean accuracy
    mean_accuracy = np.mean(accuracies)

    # Calculate the mean confusion matrix
    mean_confusion_matrix = np.mean(confusion_matrices, axis=0)

    # Calculate the mean classification report metrics
    mean_precision = {label: np.mean(precision[label]) for label in precision}['weighted']
    mean_recall = {label: np.mean(recall[label]) for label in recall}["weighted"]
    mean_f1_score = {label: np.mean(f1_score[label]) for label in f1_score}["weighted"]

    mean_classification_report = {
        "Precision": mean_precision,
        "Recall": mean_recall,
        "F1 Score": mean_f1_score,
    }

    return {
        'Mean Accuracy': mean_accuracy,
        'Mean Confusion Matrix': mean_confusion_matrix,
        'Mean Classification Report': mean_classification_report
    }

### Test

In [70]:
features = [
    'delta_age',
    'delta_sig_strikes_landed_pm', 'delta_sig_strikes_accuracy',
    'delta_sig_strikes_absorbed_pm', 'delta_sig_strikes_defended',
    'delta_takedown_avg_per15m', 'delta_takedown_accuracy',
    'delta_takedown_defence', 'delta_submission_avg_attempted_per15m',
]

In [75]:
cross_val_metrics = test_logistic_regression(df, features, response, k=10)

In [76]:
summarise_mean_cv_error(cross_val_metrics)

{'Mean Accuracy': 0.6548902821316614,
 'Mean Confusion Matrix': array([[102.4,  56. ],
        [ 54.2, 106.7]]),
 'Mean Classification Report': {'Precision': 0.6540000000000001,
  'Recall': 0.6540000000000001,
  'F1 Score': 0.6540000000000001}}