In [11]:
import psycopg2
import configparser
import os
import pandas as pd
from sqlalchemy import create_engine, text
import subprocess
import sys
import papermill as pm
import json
import math
from psycopg2.extras import execute_batch
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix
from pandas.plotting import scatter_matrix
from scipy.stats import loguniform, randint
def get_db_config(config_path='../api_data/db.ini'):
    """Read PostgreSQL connection settings from an INI file.

    Parameters
    ----------
    config_path : str or path-like, optional
        Location of the INI file. Defaults to ``'../api_data/db.ini'``,
        which is resolved relative to the current working directory.

    Returns
    -------
    dict
        The ``database``, ``user``, ``password``, ``host`` and ``port``
        values of the ``[postgresql]`` section, all as strings.

    Raises
    ------
    FileNotFoundError
        If the file does not exist or cannot be read.
    KeyError
        If the ``[postgresql]`` section or one of the five keys is missing.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it actually parsed; an empty list means nothing was loaded.
    if not config.read(config_path):
        raise FileNotFoundError(
            f"Database config file not found or unreadable: {config_path}"
        )

    section = config['postgresql']
    return {
        key: section[key]
        for key in ('database', 'user', 'password', 'host', 'port')
    }


In [None]:
class nba_win_probability_model:
    """Logistic-regression win-probability model built from ``play_by_play_q4``.

    Every sample is one play-by-play row. With ``S`` the score margin,
    ``t`` the normalised time remaining and ``L`` the betting line stored in
    the ``moneyline`` column, the design matrix has the three columns

        ``S * t**lambda``,  ``exp(-t)``,  ``L``

    and the label is 1 when the last recorded score margin of the game is
    positive, otherwise 0 (ties count as 0).

    Typical use: ``create_splits()`` -> ``optimal_lambda()`` ->
    ``model_fit()`` -> ``eval()`` -> ``tuning()`` -> ``eval()``.

    NOTE(review): for ``lambda > 0`` the first feature shrinks to 0 as
    ``t -> 0``, i.e. the score margin is down-weighted at the end of the
    game. Confirm this is the intended functional form.
    """

    # Divisor used to normalise ``time_left`` (2880 = 4 * 12 * 60).
    # NOTE(review): assumes time_left is expressed in seconds — TODO confirm.
    REGULATION_SECONDS = 2880.0

    def __init__(self, cursor):
        """Store the DB-API cursor and initialise all state to "not loaded".

        Parameters
        ----------
        cursor
            An open DB-API cursor (``execute`` / ``fetchall``) on a database
            that contains the ``play_by_play_q4`` table.
        """
        self.model = LogisticRegression(random_state=42)
        self.fitted = False
        self.lambda_parameter = 1.0
        self.cursor = cursor

        # Raw per-row data, filled by features() / fetching_outcomes().
        self.time_remaining = None
        self.spread_lines = None
        self.score_margins = None
        self.outcomes = None
        self.game_ids = None
        self.training_outcomes = None

        # Split state, filled by create_splits().
        self.train_indices = self.val_indices = self.test_indices = None
        self.score_margins_train = self.score_margins_val = self.score_margins_test = None
        self.time_remaining_train = self.time_remaining_val = self.time_remaining_test = None
        self.spread_lines_train = self.spread_lines_val = self.spread_lines_test = None

        # Design matrices / labels, filled by create_splits() and model_fit().
        self.X_train, self.X_test, self.X_val = None, None, None
        self.y_train, self.y_test, self.y_val = None, None, None

    @staticmethod
    def _parse_margin(raw_margin):
        """Convert a stored score margin to ``float``; a ``'TIE'`` string is 0.0."""
        if isinstance(raw_margin, str) and 'TIE' in raw_margin:
            return 0.0
        return float(raw_margin)

    def _require_splits(self):
        """Raise a clear error when create_splits() has not been run yet."""
        if getattr(self, 'train_indices', None) is None:
            raise RuntimeError("No data splits available; call create_splits() first.")

    def _build_design_matrices(self):
        """(Re)build X_train / X_test for the current ``lambda_parameter``."""
        self.X_train = self.feature_matrix(
            self.score_margins_train, self.time_remaining_train,
            self.lambda_parameter, self.spread_lines_train,
        )
        self.X_test = self.feature_matrix(
            self.score_margins_test, self.time_remaining_test,
            self.lambda_parameter, self.spread_lines_test,
        )

    def fetching_outcomes(self):
        """Populate ``self.outcomes`` with one binary label per game.

        For every ``game_id`` the row with the highest ``eventnum`` and a
        non-null ``scoremargin`` is taken; the label is 1 when that margin is
        strictly positive and 0 otherwise (including ``'TIE'``).
        """
        self.outcomes = {}
        query_final_margins = """
            select game_id, scoremargin
            from (
                select game_id, scoremargin,
                       row_number() over (partition by game_id order by eventnum desc) as rn
                from play_by_play_q4
                where scoremargin is not null
            ) ranked
            where rn = 1
        """
        self.cursor.execute(query_final_margins)
        for game_id, score_margin in self.cursor.fetchall():
            self.outcomes[game_id] = 1 if self._parse_margin(score_margin) > 0 else 0

    def features(self):
        """Load per-event features into parallel NumPy arrays.

        Fills ``self.game_ids``, ``self.time_remaining`` (``time_left``
        divided by ``REGULATION_SECONDS`` and clamped to [0, 1]),
        ``self.score_margins`` and ``self.spread_lines`` from period-4 rows
        that have both a score margin and a moneyline.
        """
        query = """
            select game_id, eventnum, time_left, scoremargin, moneyline from play_by_play_q4
            where scoremargin is not null and period = 4 and moneyline is not null
            order by game_id, eventnum
        """
        self.cursor.execute(query)
        game_id_list, time_list, score_difference_list, spread_list = [], [], [], []
        for game_id, _eventnum, time_left, margin, spread in self.cursor.fetchall():
            # float() also accepts Decimal values returned for numeric columns.
            normalized_time = float(time_left) / self.REGULATION_SECONDS
            normalized_time = max(0.0, min(1.0, normalized_time))
            game_id_list.append(game_id)
            score_difference_list.append(self._parse_margin(margin))
            time_list.append(normalized_time)
            spread_list.append(float(spread))

        self.time_remaining = np.array(time_list)
        self.spread_lines = np.array(spread_list)
        self.score_margins = np.array(score_difference_list)
        self.game_ids = np.array(game_id_list)

    def feature_matrix(self, score_margins, time_remaining, lambda_parameter, spread_lines):
        """Return the ``(n, 3)`` design matrix ``[S * t**lambda, exp(-t), L]``.

        All array arguments must have the same length; ``time_remaining`` is
        expected on the same normalised scale that ``features()`` produces.
        """
        feature_one = score_margins * (time_remaining ** lambda_parameter)
        feature_two = np.exp(-time_remaining)
        feature_three = spread_lines
        return np.column_stack([feature_one, feature_two, feature_three])

    def preprocessing(self):
        """Load outcomes and features and return one label per feature row.

        Rows whose game has no known outcome are removed from *all* feature
        arrays so that labels and features stay index-aligned (previously only
        the label was skipped, which shifted every later label).

        Returns
        -------
        numpy.ndarray
            ``self.training_outcomes``, aligned with ``self.game_ids``.
        """
        self.fetching_outcomes()
        self.features()

        has_outcome = np.array(
            [game_id in self.outcomes for game_id in self.game_ids], dtype=bool
        )
        if not has_outcome.all():
            self.game_ids = self.game_ids[has_outcome]
            self.time_remaining = self.time_remaining[has_outcome]
            self.spread_lines = self.spread_lines[has_outcome]
            self.score_margins = self.score_margins[has_outcome]

        self.training_outcomes = np.array(
            [self.outcomes[game_id] for game_id in self.game_ids]
        )
        return self.training_outcomes

    def model_fit(self):
        """Fit ``self.model`` on the training split with the current lambda.

        Also builds ``X_train`` / ``X_test`` and prints the coefficients.

        Returns
        -------
        nba_win_probability_model
            ``self``, to allow chaining.

        Raises
        ------
        RuntimeError
            If ``create_splits()`` has not been called.
        """
        self._require_splits()
        self._build_design_matrices()

        self.model.fit(self.X_train, self.y_train)
        self.fitted = True

        print(f"\nModel Coefficients (λ = {self.lambda_parameter:.3f}):")
        print(f"β₀ (intercept): {self.model.intercept_[0]:.4f}")
        print(f"β₁ (S × t̂^λ): {self.model.coef_[0][0]:.4f}")
        print(f"β₂ (e^(-t̂)): {self.model.coef_[0][1]:.4f}")
        print(f"β₃ (L): {self.model.coef_[0][2]:.4f}")

        return self

    def _game_level_split(self, outcomes, train_size, val_size, test_size, random_state):
        """Split row indices so that each game lands in exactly one subset.

        Games (not rows) are split, stratified on the game outcome; the row
        indices belonging to each subset of games are returned as
        ``(train_indices, val_indices, test_indices)``.
        """
        games, first_row = np.unique(self.game_ids, return_index=True)
        # Every row of a game carries the same label, so any row will do.
        game_outcomes = outcomes[first_row]

        games_temp, games_test, y_games_temp, _ = train_test_split(
            games,
            game_outcomes,
            test_size=test_size,
            random_state=random_state,
            stratify=game_outcomes
        )
        games_train, games_val = train_test_split(
            games_temp,
            test_size=val_size / (train_size + val_size),
            random_state=random_state,
            stratify=y_games_temp
        )
        return tuple(
            np.flatnonzero(np.isin(self.game_ids, subset))
            for subset in (games_train, games_val, games_test)
        )

    def create_splits(self, train_size=0.8, val_size=0.1, test_size=0.1, random_state=42,
                      group_by_game=False):
        """Load the data and create stratified train / validation / test splits.

        Parameters
        ----------
        train_size, val_size, test_size : float
            Target fractions; they are expected to sum to 1.
        random_state : int
            Seed passed to ``train_test_split``.
        group_by_game : bool, optional
            ``False`` (default) splits individual rows, exactly as before.
            ``True`` splits whole games instead, so no game contributes rows
            to more than one subset; the realised row fractions are then only
            approximately the requested ones.

        Returns
        -------
        nba_win_probability_model
            ``self``, to allow chaining.

        Notes
        -----
        NOTE(review): with the default row-level split, events of the same
        game (which share one label) appear in train, validation and test at
        once, so held-out metrics can be optimistic. Consider
        ``group_by_game=True`` for an honest estimate.
        """
        outcomes = self.preprocessing()

        if group_by_game:
            self.train_indices, self.val_indices, self.test_indices = self._game_level_split(
                outcomes, train_size, val_size, test_size, random_state
            )
            self.y_train = outcomes[self.train_indices]
            self.y_val = outcomes[self.val_indices]
            self.y_test = outcomes[self.test_indices]
        else:
            indices = np.arange(len(outcomes))

            # First carve off the test set ...
            idx_temp, self.test_indices, y_temp, self.y_test = train_test_split(
                indices,
                outcomes,
                test_size=test_size,
                random_state=random_state,
                stratify=outcomes
            )

            # ... then split the remainder; val_ratio is relative to that remainder.
            val_ratio = val_size / (train_size + val_size)

            self.train_indices, self.val_indices, self.y_train, self.y_val = train_test_split(
                idx_temp,
                y_temp,
                test_size=val_ratio,
                random_state=random_state,
                stratify=y_temp
            )

        self.score_margins_train = self.score_margins[self.train_indices]
        self.time_remaining_train = self.time_remaining[self.train_indices]
        self.spread_lines_train = self.spread_lines[self.train_indices]

        self.score_margins_val = self.score_margins[self.val_indices]
        self.time_remaining_val = self.time_remaining[self.val_indices]
        self.spread_lines_val = self.spread_lines[self.val_indices]

        self.score_margins_test = self.score_margins[self.test_indices]
        self.time_remaining_test = self.time_remaining[self.test_indices]
        self.spread_lines_test = self.spread_lines[self.test_indices]

        # Report the requested ratios instead of a hard-coded "80/10/10".
        print(f"\nDataset Split ({train_size * 100:.0f}/{val_size * 100:.0f}/{test_size * 100:.0f}):")
        print(f"Training samples: {len(self.train_indices)} ({len(self.train_indices)/len(outcomes)*100:.1f}%)")
        print(f"Validation samples: {len(self.val_indices)} ({len(self.val_indices)/len(outcomes)*100:.1f}%)")
        print(f"Test samples: {len(self.test_indices)} ({len(self.test_indices)/len(outcomes)*100:.1f}%)")

        return self

    def win_probability(self, score_differential, time_remaining, spread):
        """Predict P(label == 1) for one or more game states.

        Parameters
        ----------
        score_differential, time_remaining, spread : scalar or array-like
            Inputs are broadcast against each other, so scalars and arrays
            may be mixed. ``time_remaining`` must already be normalised the
            way ``features()`` normalises it (``time_left / 2880``, in [0, 1]).

        Returns
        -------
        numpy.ndarray
            1-D array of probabilities, one per broadcast input.
        """
        score_differential, time_remaining, spread = np.broadcast_arrays(
            np.atleast_1d(score_differential),
            np.atleast_1d(time_remaining),
            np.atleast_1d(spread),
        )
        X = self.feature_matrix(score_differential, time_remaining, self.lambda_parameter, spread)
        return self.model.predict_proba(X)[:, 1]

    def eval(self):
        """Print accuracy, log loss, classification report and confusion matrix
        of ``self.model`` on the test split.

        Raises
        ------
        RuntimeError
            If no test design matrix exists yet (``model_fit()`` not called).
        """
        if self.X_test is None:
            raise RuntimeError("No test features available; call model_fit() first.")

        test_probability = self.model.predict_proba(self.X_test)[:, 1]
        test_prediction = (test_probability > .5).astype(int)
        accuracy = accuracy_score(self.y_test, test_prediction)
        logloss = log_loss(self.y_test, test_probability)

        print(f"\nModel Evaluation on Test Set")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test Log Loss: {logloss:.4f}")
        print(f"Test Samples: {len(self.y_test)}")

        print(f"\nClassification Report:")
        print(classification_report(self.y_test, test_prediction, target_names=['Loss', 'Win']))

        print(f"\nConfusion Matrix:")
        cm = confusion_matrix(self.y_test, test_prediction)
        print(f"Predicted")
        print(f"Loss Win")
        print(f"Actual Loss {cm[0,0]:4d} {cm[0,1]:4d}")
        print(f"Win {cm[1,0]:4d} {cm[1,1]:4d}")

    def optimal_lambda(self, lambda_range=np.arange(0.01, 10, .01)):
        """Grid-search ``lambda`` by validation log loss and store the winner.

        For every candidate a fresh ``LogisticRegression`` is fitted on the
        training split and scored on the validation split; the candidate with
        the lowest loss becomes ``self.lambda_parameter``.

        Parameters
        ----------
        lambda_range : array-like of float
            Candidate exponents. If the selected value sits on an edge of
            this range, consider widening it.

        Returns
        -------
        float
            The selected lambda.

        Raises
        ------
        ValueError
            If ``lambda_range`` is empty.
        RuntimeError
            If ``create_splits()`` has not been called.
        """
        lambda_values = np.asarray(lambda_range)
        if lambda_values.size == 0:
            raise ValueError("lambda_range must contain at least one candidate value.")
        self._require_splits()

        best_lambda = None
        lowest_loss = float('inf')

        print(f"Testing {len(lambda_values)} lambda values...")

        for lambda_val in lambda_values:
            X_train_lambda = self.feature_matrix(self.score_margins_train, self.time_remaining_train, lambda_val, self.spread_lines_train)
            X_val_lambda = self.feature_matrix(self.score_margins_val, self.time_remaining_val, lambda_val, self.spread_lines_val)

            temp_model = LogisticRegression(random_state=42)
            temp_model.fit(X_train_lambda, self.y_train)

            val_probs = temp_model.predict_proba(X_val_lambda)[:, 1]
            val_loss = log_loss(self.y_val, val_probs)

            if val_loss < lowest_loss:
                lowest_loss = val_loss
                best_lambda = lambda_val

        self.lambda_parameter = best_lambda
        print(f"\nOptimal lambda found: {best_lambda:.3f} (validation loss: {lowest_loss:.4f})")
        return best_lambda

    def tuning(self):
        """Randomised hyper-parameter search over LogisticRegression settings.

        Runs a 5-fold ``RandomizedSearchCV`` (150 candidates, scored by
        negative log loss) on the training design matrix and replaces
        ``self.model`` with the refitted best estimator.

        Returns
        -------
        dict
            The best parameter combination found.

        Raises
        ------
        RuntimeError
            If ``create_splits()`` has not been called.
        """
        self._require_splits()
        # Allow tuning() to be called without a preceding model_fit().
        if self.X_train is None or self.X_test is None:
            self._build_design_matrices()

        param_distrib = [
            {'penalty': ['l1'], 'solver': ['liblinear', 'saga'], 'C': loguniform(1e-4, 1e4), 'class_weight': [None, 'balanced']},
            {'penalty': ['l2'], 'solver': ['newton-cg', 'newton-cholesky', 'lbfgs', 'liblinear', 'sag', 'saga'], 'C': loguniform(1e-4, 1e4), 'class_weight': [None, 'balanced']},
            {'penalty': ['elasticnet'], 'solver': ['saga'], 'C': loguniform(1e-4, 1e4), 'l1_ratio': np.linspace(0, 1, 10), 'class_weight': [None, 'balanced']},
            {'penalty': [None], 'solver': ['newton-cg', 'newton-cholesky', 'lbfgs', 'sag', 'saga'], 'C': loguniform(1e-4, 1e4)},
        ]
        rnd_search = RandomizedSearchCV(
            estimator=LogisticRegression(random_state=42, max_iter=1000),
            param_distributions=param_distrib,
            n_iter=150,
            cv=5,
            random_state=42,
            n_jobs=1,
            scoring='neg_log_loss'
        )
        rnd_search.fit(self.X_train, self.y_train)
        best_params = rnd_search.best_params_
        print(f"Parameters: {best_params}")
        final_model = rnd_search.best_estimator_
        print(f"final model: {final_model}")
        # best_estimator_ is already refitted on the full training set with the
        # winning parameters *and* the max_iter=1000 used during the search;
        # rebuilding from best_params_ alone would silently drop max_iter.
        self.model = final_model
        self.fitted = True
        return best_params

def money_line_to_db(conn, cursor, csv_path="../api_data/rotowire_nba_games_archive.csv"):
    """Copy betting lines from a CSV archive into ``play_by_play_q4.moneyline``.

    For every CSV row, all ``play_by_play_q4`` rows of the matching game
    (same home team, away team and game date) whose ``moneyline`` is still
    null are set to the row's ``line`` value. All updates run in one
    transaction that is committed at the end.

    Parameters
    ----------
    conn
        Open DB-API connection; it is committed (or rolled back on error) and
        then closed by this function.
    cursor
        Open cursor on ``conn``; it is closed by this function.
    csv_path : str or path-like, optional
        CSV file with the columns ``home_team_abbrev``, ``visit_team_abbrev``,
        ``game_date`` and ``line``. Defaults to the archive path used before.

    Raises
    ------
    Exception
        Any error from reading the CSV or from the database is re-raised
        after the transaction has been rolled back.
    """
    update_query = """
        update play_by_play_q4
        set moneyline = %s
        where home_team_abbrev = %s and away_team_abbrev = %s
          and game_date = %s and moneyline is null
    """
    try:
        moneyline_df = pd.read_csv(csv_path)
        for _, row in moneyline_df.iterrows():
            moneyline = row['line']
            # A missing line would be written as NaN, which is NOT null and
            # would later slip through the "moneyline is not null" filter.
            if pd.isna(moneyline):
                continue
            # Unwrap NumPy scalars so the DB driver receives plain Python values.
            if hasattr(moneyline, 'item'):
                moneyline = moneyline.item()
            # Keep only the date part of a "YYYY-MM-DD hh:mm:ss" style value.
            game_date = str(row['game_date']).split(" ")[0]

            cursor.execute(
                update_query,
                (moneyline, row['home_team_abbrev'], row['visit_team_abbrev'], game_date),
            )
            print(f"Updated {cursor.rowcount} rows successfully")
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        # Always release the handles, even when an update fails midway.
        cursor.close()
        conn.close()
    
def main():
    """Run the full pipeline: connect, split, pick lambda, fit, evaluate, tune.

    Connects to PostgreSQL with the settings from ``get_db_config()``, then
    drives ``nba_win_probability_model`` through data splitting, the lambda
    grid search, fitting, evaluation, hyper-parameter tuning and a second
    evaluation. The cursor and connection are always closed on exit.
    """
    # connect to database
    db_config = get_db_config()
    conn = psycopg2.connect(
        database=db_config['database'],
        user=db_config['user'],
        password=db_config['password'],
        host=db_config['host'],
        port=db_config['port']
    )
    try:
        cursor = conn.cursor()
        try:
            print("Database connected successfully")
            nba_model = nba_win_probability_model(cursor)
            nba_model.create_splits()
            print("Finding optimal lambda")
            # The chosen value is stored on the model as lambda_parameter.
            nba_model.optimal_lambda()
            print("Training model")
            nba_model.model_fit()
            print("Model evaluation")
            nba_model.eval()
            print('Hyperparameter tuning')
            nba_model.tuning()
            print("Eval after hyperparameter tuning")
            nba_model.eval()
        finally:
            cursor.close()
    finally:
        # Release the connection even if any pipeline step raises.
        conn.close()


# Execute the whole pipeline when this cell runs; main() opens a PostgreSQL
# connection, so the database configured in db.ini must be reachable.
main()




Database connected successfully

Dataset Split (80/10/10):
Training samples: 957640 (80.0%)
Validation samples: 119706 (10.0%)
Test samples: 119706 (10.0%)
Finding optimal lambda
Testing 999 lambda values...

Optimal lambda found: 0.010 (validation loss: 0.2911)
Training model

Model Coefficients (λ = 0.010):
β₀ (intercept): 0.4385
β₁ (S × t̂^λ): 0.2846
β₂ (e^(-t̂)): -0.4481
β₃ (L): -0.0613
Model evaluation

Model Evaluation on Test Set
Test Accuracy: 0.8730
Test Log Loss: 0.2889
Test Samples: 119706

Classification Report:
              precision    recall  f1-score   support

        Loss       0.86      0.85      0.85     52499
         Win       0.88      0.89      0.89     67207

    accuracy                           0.87    119706
   macro avg       0.87      0.87      0.87    119706
weighted avg       0.87      0.87      0.87    119706


Confusion Matrix:
Predicted
Loss Win
Actual Loss 44529 7970
Win 7235 59972
Hyperparameter tuning


InvalidParameterError: The 'n_iter' parameter of RandomizedSearchCV must be an int in the range [1, inf). Got <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001CDD3A42AD0> instead.