In [1]:
######## Find Optimal Decision Boundary Threshold for Models

# Loads model's weights
# Using validation dataset, find optimal threshold
# Test final accuracy on test dataset

Import All Required Modules

In [1]:
import sys

# Manually add the project root to sys.path
sys.path.append('/Users/joaquinuriarte/Documents/GitHub/sports-betting/')

# === STEP 0: Imports
print("# === STEP 0: Imports: Starting ...")
from modules.model_manager.trainer.trainer import Trainer
from modules.model_manager.predictor.predictor import Predictor
from modules.model_manager.factories.model_factory import ModelFactory
from modules.model_manager.helpers.configuration_loader import ConfigurationLoader as MMConfigLoader
from modules.model_manager.model_manager import ModelManager
print("# === STEP 0: Imports: Complete")

# === STEP 0: Imports: Starting ...


2025-01-18 11:05:00.994478: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# === STEP 0: Imports: Complete


In [2]:
# === STEP 1: Dependency Instantiations And Global Variable Declarations
print("# === STEP 1: Dependency Instantiations And Global Variable Declarations: Starting ...")
## === MODEL MANAGER
yaml_path0 = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/configs/model_v01/model_v01_000.yaml'
yaml_paths = [yaml_path0]

checkpoint = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/models'
trainer = Trainer(checkpoint)
predictor = Predictor()
model_factory = ModelFactory()
mm_configuration_loader = MMConfigLoader()
print("# === STEP 1: Dependency Instantiations And Global Variable Declarations: Complete")

# === STEP 1: Dependency Instantiations And Global Variable Declarations: Starting ...
# === STEP 1: Dependency Instantiations And Global Variable Declarations: Complete


Load Val, and Test Datasets into Memory

In [3]:
import pickle
import os

# File path to load train, test, and val datasets
train_test_val_dataset_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v0"

val_dataset_path = train_test_val_dataset_path + "/val.pkl"
test_dataset_path = train_test_val_dataset_path + "/test.pkl"

with open(val_dataset_path, "rb") as f:
    validation_dataset = pickle.load(f)
    print("Val dataset leaded to memory")
with open(test_dataset_path, "rb") as f:
    test_dataset = pickle.load(f)
    print("Test dataset leaded to memory")

Val dataset leaded to memory
Test dataset leaded to memory


Load Models

In [4]:
## For all YAMLS, get their model's signatures
import yaml

yaml_path_list = [yaml_path0]
signatures = []

for yaml_file in yaml_path_list:
    with open(yaml_file, "r") as file:
        config_data = yaml.safe_load(file)

    # Save model signature 
    signatures.append(config_data.get("model", {}).get("model_signature"))

## Using model's signatures, get model's weights path
weights_paths =[]

for signature in signatures:
    base_path = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/z-model_binaries/model_v0/models/'
    base_file_name = '/model_weights_'

    directory = base_path + signature
    file_name = base_file_name + signature + ".pth"

    final_file_path = directory + file_name

    weights_paths.append(final_file_path) 

In [5]:
# === STEP 4: MODEL MANAGER
model_manager = ModelManager(trainer, predictor, model_factory, mm_configuration_loader)
models = model_manager.load_models([yaml_path0], weights_paths)
print("Models successfully loaded ")

Models successfully loaded 


Optimize Thresholds

In [6]:
import numpy as np
import pandas as pd

def calculate_accuracy(predictions_df: pd.DataFrame) -> float:
    """
    Calculates accuracy from a DataFrame containing predictions and target labels.

    Args:
        predictions_df (pd.DataFrame): A DataFrame with 'predictions' and 'target_label' columns.

    Returns:
        float: The accuracy as a percentage.
    """
    # Compare predictions to target labels
    correct_predictions = predictions_df['predictions'] == predictions_df['target_label']
    accuracy = correct_predictions.sum() / len(predictions_df)
    return accuracy

def find_optimal_threshold(model, validation_dataset, starting_threshold, ending_threshold, step):
    # Define the range of thresholds to evaluate
    thresholds = np.arange(starting_threshold, ending_threshold, step)

    best_score = -1
    optimal_threshold = 0.5

    # Iterate through thresholds
    for threshold in thresholds:
        
        # set threshold 
        model.set_prediction_threshold(threshold)
        
        # predict using val dataset
        prediction_df = model.predict(validation_dataset.examples, return_target_labels=True)

        # assess accuracy
        score = calculate_accuracy(prediction_df)

        # Update the best score and threshold
        if score > best_score:
            best_score = score
            optimal_threshold = threshold
    
    return optimal_threshold, best_score

In [7]:
print("Starting threshold optimization loop ...")
final_results = []
for model in models:
    # Optimize threshold
    optimal_threshold, validation_accuracy = find_optimal_threshold(model, validation_dataset, 0.05, 1, .01)

    # Set base threshold
    model.set_prediction_threshold(.5)
    # Predict using val dataset
    base_prediction_df = model.predict(validation_dataset.examples, return_target_labels=True)
    # Assess accuracy
    base_accuracy = calculate_accuracy(base_prediction_df)

    # Set final optimal threshold
    model.set_prediction_threshold(optimal_threshold)
    # Predict using val dataset
    final_prediction_df = model.predict(validation_dataset.examples, return_target_labels=True)
    # Assess accuracy
    final_accuracy = calculate_accuracy(final_prediction_df)

    # Get model signature
    model_config = model.get_training_config()
    model_name = model_config.model_signature

    instance = (model_name, optimal_threshold, base_accuracy, final_accuracy)
    final_results.append(instance)
print("Threshold optimization loop complete ...")

Starting threshold optimization loop ...
Threshold optimization loop complete ...


In [8]:
final_results[0]

('302c4118ee59d656717db73a125404ab',
 0.4700000000000001,
 0.5968037725962798,
 0.6041393764736704)