In [3]:
import sys

# Make the project root importable so that `model_binaries` can be resolved.
# NOTE(review): this is an absolute, machine-specific path — consider reading
# it from a config value or environment variable instead.
PROJECT_ROOT = '/Users/joaquinuriarte/Documents/GitHub/sports-betting/'
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from model_binaries.utils.binary_utils import load_entity

In [4]:
train_test_val_folder_path = "/Users/joaquinuriarte/Documents/GitHub/sports-betting/processed_datasets/model_v10"

# Load the previously computed train / val / test splits from the folder above.
# NOTE(review): the files are .pkl — presumably load_entity unpickles them, so
# only point this at trusted files; confirm against binary_utils.
train_dataset, val_dataset, test_dataset = (
    load_entity(train_test_val_folder_path, split_file)
    for split_file in ("train.pkl", "val.pkl", "test.pkl")
)

In [5]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score

# Assuming your ModelDataset and Example classes look roughly like this:
# @dataclass
# class Example:
#     features: Dict[str, List[Union[float, int, str]]]
#
# @dataclass
# class ModelDataset:
#     examples: List[Example]

def convert_modeldataset_to_xy(model_dataset, label_key='Team_A_Wins'):
    """
    Convert a ModelDataset into a feature matrix X and a target vector y.

    Each ``Example.features`` is expected to be a dict whose values are
    sequences; only the first element of each sequence is used. Feature
    columns follow the key order of the first example, with ``label_key``
    removed, so all examples are assumed to share the same keys.

    Args:
        model_dataset (ModelDataset): Object exposing an ``examples`` sequence.
        label_key (str): The key in ``Example.features`` that holds the label.

    Returns:
        X (np.ndarray): One row per example, one column per feature key.
        y (np.ndarray): One label per example.

    Raises:
        ValueError: If the dataset contains no examples.
        KeyError: If an example lacks ``label_key`` or one of the feature keys
            taken from the first example.
    """
    examples = model_dataset.examples
    if len(examples) == 0:
        # Fail with an explicit message instead of an opaque IndexError on
        # examples[0]; an empty split cannot define the feature columns.
        raise ValueError(
            "Cannot convert an empty ModelDataset: it contains no examples."
        )

    # The first example defines the column order; the label is not a feature.
    feature_keys = [key for key in examples[0].features if key != label_key]

    # Every value is stored as a list (of length 1), so take its first element.
    X = np.array([[ex.features[key][0] for key in feature_keys] for ex in examples])
    y = np.array([ex.features[label_key][0] for ex in examples])
    return X, y

# --- Build the numpy arrays used for modelling ---

# train_dataset, val_dataset and test_dataset are the ModelDataset objects
# loaded in an earlier cell. Each one is converted, in that order, into an
# (X, y) pair with 'Team_A_Wins' as the label.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    convert_modeldataset_to_xy(split, label_key='Team_A_Wins')
    for split in (train_dataset, val_dataset, test_dataset)
)

In [6]:
# Baseline model: standardise the features, then fit a logistic regression.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),
])

# Hyperparameters explored by the grid search.
# Both penalties are tried; l1 needs a solver such as 'liblinear' or 'saga',
# which is why only those two solvers are listed.
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga'],
}

# 5-fold cross-validation on the training set, ranked by ROC AUC,
# using all available cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data only.
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation AUC score:", grid_search.best_score_)

# Evaluate the best estimator found by the grid search on the validation set.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
# Column 1 of predict_proba is taken as the positive-class score.
y_val_proba = best_model.predict_proba(X_val)[:, 1]

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))

val_auc = roc_auc_score(y_val, y_val_proba)
print("Validation AUC:", val_auc)

# Evaluate on the test set.
# ROC AUC needs continuous scores: feeding it hard 0/1 predictions collapses
# the curve to a single operating point and makes the number incomparable with
# the validation AUC above. Use the positive-class probability instead.
y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_proba)
print("Test AUC:", test_auc)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters found: {'clf__C': 0.001, 'clf__penalty': 'l2', 'clf__solver': 'saga'}
Best cross-validation AUC score: 0.5828124975764617

Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.57      0.18      0.27      8197
         1.0       0.62      0.91      0.74     12175

    accuracy                           0.62     20372
   macro avg       0.60      0.54      0.51     20372
weighted avg       0.60      0.62      0.55     20372

Validation AUC: 0.6113829595091509


Polynomial Features

In [9]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score

# Assume convert_modeldataset_to_xy() is defined as before
# X_train, y_train = convert_modeldataset_to_xy(train_dataset, label_key='Team_A_Wins')
# X_val, y_val = convert_modeldataset_to_xy(val_dataset, label_key='Team_A_Wins')

# Pipeline: polynomial feature expansion -> standardisation -> logistic regression.
pipeline = Pipeline(steps=[
    # degree=2 is only the starting value; the grid below also tries degree 3.
    ('poly', PolynomialFeatures(include_bias=False, degree=2)),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),
])

# Hyperparameters explored by the grid search.
param_grid = {
    'poly__degree': [2, 3],                # polynomial degree of the expansion
    'clf__C': [0.001, 0.01, 0.1, 1, 10],   # lower C means stronger regularization
    'clf__penalty': ['l2'],                # L2 regularization only
    'clf__solver': ['saga'],               # saga handles l1 and l2; l2 is used here
}

# 5-fold cross-validation ranked by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training data only.
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation AUC score:", grid_search.best_score_)

# Evaluate the best estimator found by the grid search on the validation set.
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
# Column 1 of predict_proba is taken as the positive-class score.
y_val_proba = best_model.predict_proba(X_val)[:, 1]

print("\nValidation Classification Report:")
print(classification_report(y_val, y_val_pred))

val_auc = roc_auc_score(y_val, y_val_proba)
print("Validation AUC:", val_auc)

# Evaluate on the test set.
# ROC AUC needs continuous scores: feeding it hard 0/1 predictions collapses
# the curve to a single operating point and makes the number incomparable with
# the validation AUC above. Use the positive-class probability instead.
y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_proba)
print("Test AUC:", test_auc)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
