In [10]:
######## Create, Train, and Predict Baseline Models

# Logistic Regressor
# Random Forest Classifier 

Import All Required Modules

In [19]:
import sys

# Manually add the project root to sys.path
sys.path.append('/Users/joaquinuriarte/Documents/GitHub/sports-betting/')

# === STEP 0: Imports
import numpy as np
from typing import Tuple
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from modules.data_structures.model_dataset import ModelDataset
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from model_binaries.utils.binary_utils import save_entity, load_entity, cross_val_train, compute_f1

Load Train into Memory

In [1]:
# File path to load train, test, and val datasets
train_test_val_folder_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v0/scaler"

In [6]:
train_dataset = load_entity(train_test_val_folder_path, "scaled_train.pkl")
train_dataset = load_entity(train_test_val_folder_path, "scaled_test.pkl")

Define Helper Methods

In [25]:
def get_features_and_labels(dataset, label_key: str = "label") -> Tuple[np.ndarray, np.ndarray]:
    """
    Extracts feature vectors (X) and labels (y) from ModelDataset.
    
    Args:
        dataset (ModelDataset): Your custom dataset containing examples.
        label_key (str): The name of the key used for your labels within features.
    
    Returns:
        X (np.ndarray): 2D array of shape (num_examples, num_features).
        y (np.ndarray): 1D array of shape (num_examples,).
    """
    X_list = []
    y_list = []
    for example in dataset.examples:
        # example.features is a dict: { feature_name: [list_of_values] }
        # We assume "label" is a single-value list for y; 
        # e.g. { "label": [1], "PTS": [10.3], "AST": [5.2], ... }
        
        # 1) Extract label
        y_value = example.features[label_key][0]  # e.g. [1] -> 1
        y_list.append(y_value)
        
        # 2) Extract numeric fields for X. You can decide which keys to skip, 
        #    or you can store them in a certain order.
        feature_vec = []
        for k, v in example.features.items():
            if k == label_key:
                continue
            # v is a list of numeric values or a single numeric value
            # Flatten or pick the first if each feature is stored as [value]
            # Make sure they are floats or ints
            feature_vec.extend(v)  
        
        X_list.append(feature_vec)
    
    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.float32)
    return X, y

def tune_logistic_regression(X, y):
    """
    Perform Randomized Search for Logistic Regression hyperparameters.
    Returns the best estimator and a summary of the results.
    """
    # Define the model
    log_reg = LogisticRegression(max_iter=10000)  # 'liblinear' often works well for small/medium datasets
    
    # Hyperparameter distributions to sample from
    param_dist = {
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
        "solver": ["liblinear", "saga"],
        "penalty": ["l1", "l2", "elasticnet"],
        "l1_ratio": [0.0, 0.5, 1.0],  # only used if penalty='elasticnet'
        "class_weight": [None, "balanced"]
    }
    
    # Create a stratified K-fold for balanced CV splits
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # RandomizedSearchCV
    random_search = RandomizedSearchCV(
        estimator=log_reg,
        param_distributions=param_dist,
        n_iter=5,                # number of parameter settings to try
        scoring={"f1": "f1", "precision": "precision", "recall": "recall", "roc_auc": "roc_auc"},            # or use multiple metrics (see below)
        refit="f1",
        cv=cv,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    
    random_search.fit(X, y)
    
    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    
    return best_estimator, best_params, best_score

def tune_logistic_regression_with_poly(X, y):
    pipe = Pipeline([
        ("poly", PolynomialFeatures(include_bias=False)),
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=1000))
    ])
    
    param_dist = {
        "poly__degree": [1, 2],   # search polynomial degrees
        "logreg__solver": ["liblinear", "saga"],
        "logreg__penalty": ["l1", "l2", "elasticnet"],
        "logreg__C": [0.001, 0.01, 0.1, 1, 10, 100],
        "logreg__l1_ratio": [0.0, 0.5, 1.0],  # only relevant if penalty='elasticnet'
        "logreg__class_weight": [None, "balanced"]
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # For multiple metrics
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=15,
        scoring={"f1": "f1", "precision": "precision", "recall": "recall", "roc_auc": "roc_auc"},
        refit="f1",
        cv=cv,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    
    search.fit(X, y)
    
    best_estimator = search.best_estimator_        # Pipeline with best-found hyperparams
    best_params = search.best_params_
    best_score = search.best_score_               # best CV 'f1' score
    return best_estimator, best_params, best_score

def tune_random_forest(X, y):
    """
    Perform Randomized Search for Random Forest hyperparameters.
    Returns the best estimator and a summary of the results.
    """
    # Define the model
    rf = RandomForestClassifier(random_state=42)
    
    # Hyperparameter distributions to sample from
    param_dist = {
        'n_estimators': [50, 100, 200, 300, 500],
        'max_depth': [None, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
        'class_weight': [None, 'balanced', 'balanced_subsample']
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=20,             
        scoring = {
            "f1": "f1",
            "recall": "recall",
            "precision": "precision",
            "roc_auc": "roc_auc"
        },           # or use multiple metrics
        refit="f1",
        cv=cv,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    
    random_search.fit(X, y)
    
    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_
    
    return best_estimator, best_params, best_score

def evaluate_model(model, X_test, y_test):
    """
    Computes standard classification metrics for the model on given test data.
    """
    y_pred = model.predict(X_test)
    
    # For AUC, you need predicted probabilities for the positive class
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None
    
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
    
    print("Accuracy:", acc)
    print("Precision:", prec)
    print("Recall:", rec)
    print("F1:", f1)
    if auc is not None:
        print("AUC:", auc)

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "auc": auc
    }

def find_optimal_threshold(y_true, y_prob, num_thresholds=101):
    """
    Finds the threshold between 0 and 1 that yields the maximum F1 score
    when converting predicted probabilities to binary predictions.

    Args:
        y_true (array-like): Ground-truth labels (0 or 1).
        y_prob (array-like): Predicted probabilities for the positive class.
        num_thresholds (int): Number of thresholds to check from 0.0 to 1.0 (inclusive).

    Returns:
        best_threshold (float): The threshold that yields the highest F1 score.
        best_f1 (float): The best F1 score obtained at that threshold.
    """
    best_threshold = 0.5
    best_f1 = 0.0
    
    # Generate equally spaced thresholds from 0 to 1
    thresholds = np.linspace(0, 1, num_thresholds)
    
    for t in thresholds:
        y_pred_custom = (y_prob >= t).astype(int)
        score = f1_score(y_true, y_pred_custom)
        if score > best_f1:
            best_f1 = score
            best_threshold = t
    
    return best_threshold, best_f1

Train & Evaluate Logistic Regressor

In [28]:
# Get your X, y from the dataset
X_train, y_train = get_features_and_labels(train_dataset, label_key="Team_A_Wins")
X_test, y_test = get_features_and_labels(train_dataset, label_key="Team_A_Wins")

# Tune Logistic Regression
best_lr, lr_params, lr_score = tune_logistic_regression(X_train, y_train)
print("Best Logistic Regression Params:", lr_params)
print("Best CV F1:", lr_score)

# Evaluate on test
print("Logistic Regression performance on test set:")
evaluate_model(best_lr, X_test, y_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits




Best Logistic Regression Params: {'solver': 'saga', 'penalty': 'l2', 'l1_ratio': 1.0, 'class_weight': None, 'C': 0.001}
Best CV F1: 0.7328354899465991
Logistic Regression performance on test set:
Accuracy: 0.5783274440518257
Precision: 0.5783274440518257
Recall: 1.0
F1: 0.7328358208955223
AUC: 0.6296957905236529




{'accuracy': 0.5783274440518257,
 'precision': 0.5783274440518257,
 'recall': 1.0,
 'f1': 0.7328358208955223,
 'auc': 0.6296957905236529}

Train & Evaluate a Polynomial (2) Logistic Regressor

In [None]:
# Get your X, y from the dataset
X_train, y_train = get_features_and_labels(train_dataset, label_key="Team_A_Wins")
X_test, y_test = get_features_and_labels(train_dataset, label_key="Team_A_Wins")

# Tune Logistic Regression
best_lr, lr_params, lr_score = tune_logistic_regression_with_poly(X_train, y_train)
print("Best Logistic Regression Params:", lr_params)
print("Best CV F1:", lr_score)

# Evaluate on test
print("Logistic Regression performance on test set:")
evaluate_model(best_lr, X_test, y_test)

In [29]:
y_prob = best_lr.predict_proba(X_test)[:, 1]
    
# Call our threshold tuning function
optimal_thresh, optimal_f1 = find_optimal_threshold(y_test, y_prob, num_thresholds=101)

print(f"Optimal threshold: {optimal_thresh:.3f}, Best F1: {optimal_f1:.4f}")

Optimal threshold: 0.560, Best F1: 0.7333


Train & Evaluate Random Forest

In [26]:
# Get your X, y from the dataset
X_train, y_train = get_features_and_labels(train_dataset, label_key="Team_A_Wins")
X_test, y_test = get_features_and_labels(train_dataset, label_key="Team_A_Wins")

# Tune Random Forest
best_rf, rf_params, rf_score = tune_random_forest(X_train, y_train)
print("Best Random Forest Params:", rf_params)
print("Best CV F1:", rf_score)

# Evaluate on test
print("Random Forest performance on test set:")
evaluate_model(best_rf, X_test, y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Random Forest Params: {'n_estimators': 50, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 5, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': True}
Best CV F1: 0.7275076000757503
Random Forest performance on test set:
Accuracy: 0.6183745583038869
Precision: 0.6027060270602707
Recall: 0.9979633401221996
F1: 0.7515337423312884
AUC: 0.8601841211325902


{'accuracy': 0.6183745583038869,
 'precision': 0.6027060270602707,
 'recall': 0.9979633401221996,
 'f1': 0.7515337423312884,
 'auc': 0.8601841211325902}

In [27]:
y_prob = best_lr.predict_proba(X_test)[:, 1]

# Call our threshold tuning function
optimal_thresh, optimal_f1 = find_optimal_threshold(y_test, y_prob, num_thresholds=101)

print(f"Optimal threshold: {optimal_thresh:.3f}, Best F1: {optimal_f1:.4f}")

Optimal threshold: 0.560, Best F1: 0.7333
