In [1]:
import psycopg2
import configparser
import os
import pandas as pd
from sqlalchemy import create_engine, text
import subprocess
import sys
import papermill as pm
import json
import math
from psycopg2.extras import execute_batch
from sklearn.cluster import KMeans
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, classification_report, confusion_matrix

def get_db_config():
    """Load the PostgreSQL connection settings from ``../api_data/db.ini``.

    Returns:
        dict: the ``database``, ``user``, ``password``, ``host`` and ``port``
        entries of the ``[postgresql]`` section, as strings.

    Raises:
        KeyError: if the section or one of the keys is absent (this is also
            what happens when the file is missing, because
            ``ConfigParser.read`` silently skips unreadable paths).
    """
    parser = configparser.ConfigParser()
    parser.read('../api_data/db.ini')

    section = parser['postgresql']
    wanted_keys = ('database', 'user', 'password', 'host', 'port')
    return {key: section[key] for key in wanted_keys}


In [2]:
class nba_win_probability_model:
    """Logistic-regression win-probability model built from ``play_by_play_q4`` rows.

    Every training sample is one play-by-play row. Its three features are
    (see ``_build_features``):

        1. ``score_margin * normalized_time ** lambda_parameter``
        2. ``exp(-normalized_time)``
        3. the value of the ``moneyline`` column

    The label is shared by all rows of a game: 1 when the last recorded
    score margin of that game is positive, otherwise 0.
    """

    # Divisor applied to ``time_left`` before clipping to [0, 1].
    # Presumably 48 min x 60 s (regulation length in seconds) -- TODO confirm units.
    TIME_NORMALIZER = 2880.0

    def __init__(self, cursor):
        """Store the DB cursor and initialise an unfitted model.

        Args:
            cursor: DB-API cursor exposing ``execute`` and ``fetchall``; used
                for every query issued by this class.
        """
        self.model = LogisticRegression(random_state=42)
        self.fitted = False
        self.lambda_parameter = 1.0
        self.time_remaining = None
        self.spread_lines = None
        self.score_margins = None
        self.outcomes = None
        self.game_ids = None
        self.cursor = cursor
        # Populated by preprocessing() / model_fit(); declared here so that a
        # premature eval() can be detected instead of raising AttributeError.
        self.training_outcomes = None
        self.X_test = None
        self.y_test = None

    @staticmethod
    def _parse_margin(raw_margin):
        """Convert a raw ``scoremargin`` value to float, mapping 'TIE' to 0.0."""
        # str() makes this work whether the driver returns text or a number.
        margin_text = str(raw_margin).strip()
        if 'TIE' in margin_text:
            return 0.0
        return float(margin_text)

    @staticmethod
    def _build_features(score_margins, time_remaining, spread_lines, lambda_parameter):
        """Assemble the (n_samples, 3) design matrix shared by training and prediction."""
        decayed_margin = score_margins * (time_remaining ** lambda_parameter)
        time_term = np.exp(-time_remaining)
        return np.column_stack([decayed_margin, time_term, spread_lines])

    def fetching_outcomes(self):
        """Query the last non-null score margin of every game and derive labels.

        Returns:
            dict: ``game_id -> 1`` if that final margin is positive, else ``0``.
        """
        self.outcomes = {}
        query_final_margins = """
            select game_id, scoremargin
            from (
                select game_id, scoremargin,
                       row_number() over (partition by game_id order by eventnum desc) as rn
                from play_by_play_q4
                where scoremargin is not null
            ) ranked
            where rn = 1
        """
        self.cursor.execute(query_final_margins)
        for game_id, score_margin in self.cursor.fetchall():
            # NOTE(review): a final margin of 'TIE' is labelled 0 here; presumably
            # such games continued past this table's rows -- confirm the labelling.
            self.outcomes[game_id] = 1 if self._parse_margin(score_margin) > 0 else 0
        return self.outcomes

    def features(self):
        """Load per-row time, line and score-margin values for period-4 rows.

        Only rows with a non-null ``scoremargin`` and ``moneyline`` are read.

        Returns:
            tuple: ``(time_remaining, spread_lines, score_margins)`` as float
            arrays of equal length; ``self.game_ids`` holds the matching ids.
        """
        query = """
            select game_id, eventnum, time_left, scoremargin, moneyline from play_by_play_q4
            where scoremargin is not null and period = 4 and moneyline is not null
            order by game_id, eventnum
        """
        self.cursor.execute(query)

        game_id_list = []
        time_list = []
        score_difference_list = []
        spread_list = []
        for game_id, _eventnum, time_left, margin, spread in self.cursor.fetchall():
            # float() guards against Decimal / str values coming back from the
            # driver, which would otherwise produce object-dtype arrays.
            normalized_time = float(time_left) / self.TIME_NORMALIZER
            normalized_time = max(0.0, min(1.0, normalized_time))
            game_id_list.append(game_id)
            score_difference_list.append(self._parse_margin(margin))
            time_list.append(normalized_time)
            spread_list.append(float(spread))

        self.time_remaining = np.array(time_list, dtype=float)
        self.spread_lines = np.array(spread_list, dtype=float)
        self.score_margins = np.array(score_difference_list, dtype=float)
        self.game_ids = game_id_list
        return self.time_remaining, self.spread_lines, self.score_margins

    def preprocessing(self):
        """Fetch outcomes and features and return labels aligned with the features.

        Rows whose game has no entry in ``self.outcomes`` are removed from
        *all* feature arrays (and from ``self.game_ids``), so features and
        labels always have the same length and order.

        Returns:
            numpy.ndarray: one label per remaining feature row.
        """
        self.fetching_outcomes()
        self.features()

        has_outcome = np.array(
            [game_id in self.outcomes for game_id in self.game_ids], dtype=bool
        )
        if not has_outcome.all():
            dropped = int((~has_outcome).sum())
            print(f"Dropping {dropped} feature rows with no recorded outcome")
            self.time_remaining = self.time_remaining[has_outcome]
            self.spread_lines = self.spread_lines[has_outcome]
            self.score_margins = self.score_margins[has_outcome]
            self.game_ids = [
                game_id for game_id, keep in zip(self.game_ids, has_outcome) if keep
            ]

        self.training_outcomes = np.array(
            [self.outcomes[game_id] for game_id in self.game_ids]
        )
        return self.training_outcomes

    def feature_matrix(self):
        """Return the design matrix for the loaded rows at the current lambda."""
        return self._build_features(
            self.score_margins,
            self.time_remaining,
            self.spread_lines,
            self.lambda_parameter,
        )

    def model_fit(self, test_size=.2, random_state=42):
        """Reload data, split it, fit the logistic regression and print a summary.

        Args:
            test_size: fraction of rows held out as the test set.
            random_state: seed forwarded to ``train_test_split``.

        Returns:
            nba_win_probability_model: ``self``, to allow chaining.
        """
        training = self.preprocessing()
        feature_matrix = self.feature_matrix()
        # NOTE(review): the split is per row while labels are per game, so rows
        # of one game can land in both train and test. Consider a group-wise
        # split keyed on self.game_ids if held-out metrics must be leak-free.
        X_train, X_test, y_train, y_test = train_test_split(
            feature_matrix,
            training,
            test_size=test_size,
            random_state=random_state,
            stratify=training
        )
        self.X_test = X_test
        self.y_test = y_test
        self.model.fit(X_train, y_train)
        self.fitted = True

        print("\nDataset Split:")
        print(f"Training samples: {len(X_train)}")
        print(f"Test samples: {len(X_test)}")
        print(f"Training win rate: {y_train.mean():.3f}")
        print(f"Test win rate: {y_test.mean():.3f}")

        print("\nModel Coefficients:")
        print(f"β₀ (intercept): {self.model.intercept_[0]:.4f}")
        print(f"β₁ (S × t̂^λ): {self.model.coef_[0][0]:.4f}")
        print(f"β₂ (e^(-t̂)): {self.model.coef_[0][1]:.4f}")
        print(f"β₃ (L): {self.model.coef_[0][2]:.4f}")

        return self

    def win_probability(self, score_differential, time_remaining, spread):
        """Predict the positive-class probability for one or more game states.

        Scalars and arrays may be mixed; inputs are broadcast to a common
        length. ``time_remaining`` is used as given -- unlike ``features`` it
        is not divided by ``TIME_NORMALIZER`` or clipped here.

        Args:
            score_differential: score margin(s).
            time_remaining: time value(s) on the scale the model was trained on.
            spread: betting-line value(s).

        Returns:
            numpy.ndarray: 1-D array of probabilities for class 1.
        """
        score_differential, time_remaining, spread = np.broadcast_arrays(
            np.atleast_1d(np.asarray(score_differential, dtype=float)),
            np.atleast_1d(np.asarray(time_remaining, dtype=float)),
            np.atleast_1d(np.asarray(spread, dtype=float)),
        )
        X = self._build_features(
            score_differential, time_remaining, spread, self.lambda_parameter
        )
        return self.model.predict_proba(X)[:, 1]

    def eval(self):
        """Print accuracy, log loss, classification report and confusion matrix.

        Raises:
            RuntimeError: if ``model_fit`` has not been run yet.
        """
        if not self.fitted or self.X_test is None or self.y_test is None:
            raise RuntimeError("eval() requires a fitted model; call model_fit() first")

        test_probability = self.model.predict_proba(self.X_test)[:, 1]
        test_prediction = (test_probability > .5).astype(int)
        accuracy = accuracy_score(self.y_test, test_prediction)
        logloss = log_loss(self.y_test, test_probability)

        print("\nModel Evaluation on Test Set")
        print(f"Test Accuracy: {accuracy:.4f}")
        print(f"Test Log Loss: {logloss:.4f}")
        print(f"Test Samples: {len(self.y_test)}")

        print("\nClassification Report:")
        print(classification_report(self.y_test, test_prediction, target_names=['Loss', 'Win']))

        print("\nConfusion Matrix:")
        cm = confusion_matrix(self.y_test, test_prediction)
        print("Predicted")
        print("Loss Win")
        print(f"Actual Loss {cm[0,0]:4d} {cm[0,1]:4d}")
        print(f"Win {cm[1,0]:4d} {cm[1,1]:4d}")

    def optimal_lambda(self, lambda_range = np.arange(.1,3.1,.1), validation_split = .1):
        """Grid-search ``lambda_parameter`` by validation log loss.

        For every candidate a fresh logistic regression is fit on a stratified
        split of the loaded rows; the candidate with the lowest validation
        log loss is stored in ``self.lambda_parameter``. If the search fails,
        ``self.lambda_parameter`` keeps its previous value.

        Args:
            lambda_range: iterable of candidate exponents.
            validation_split: fraction of rows held out for validation.

        Returns:
            The selected candidate (same element type as ``lambda_range``).

        Raises:
            ValueError: if ``lambda_range`` is empty.
            RuntimeError: if no candidate yields a finite validation loss.
        """
        candidates = list(lambda_range)
        if not candidates:
            raise ValueError("lambda_range must contain at least one candidate")

        outcomes = self.preprocessing()
        best_lambda = None
        lowest_loss = float('inf')
        # NOTE(review): this split uses the same seed as model_fit's default, so
        # validation rows may overlap the later test set -- confirm acceptable.
        for lambda_val in candidates:
            # Build the matrix for this candidate directly instead of
            # temporarily mutating self.lambda_parameter.
            matrix = self._build_features(
                self.score_margins, self.time_remaining, self.spread_lines, lambda_val
            )
            X_train, X_val, y_train, y_val = train_test_split(
                matrix,
                outcomes,
                test_size=validation_split,
                random_state=42,
                stratify=outcomes
            )
            temp_model = LogisticRegression(random_state=42)
            temp_model.fit(X_train, y_train)

            val_probs = temp_model.predict_proba(X_val)[:, 1]
            val_loss = log_loss(y_val, val_probs)

            if val_loss < lowest_loss:
                lowest_loss = val_loss
                best_lambda = lambda_val

        if best_lambda is None:
            raise RuntimeError("no lambda candidate produced a finite validation loss")

        self.lambda_parameter = best_lambda
        print(f"Optimal lambda found: {best_lambda:.2f} (validation loss: {lowest_loss:.4f})")
        return best_lambda
        
    
def money_line_to_db(conn, cursor):
    """Copy betting lines from the Rotowire CSV into ``play_by_play_q4.moneyline``.

    For every CSV row, the play-by-play rows matching home team, away team and
    game date that still have a null ``moneyline`` are looked up and updated
    with the CSV ``line`` value. Each CSV row is committed on its own.

    CSV rows with a missing team, date or line are skipped; writing a NaN
    line would otherwise store a non-null value that passes the
    ``moneyline is not null`` filter used when loading model features.

    The function takes ownership of ``cursor`` and ``conn``: both are closed
    before returning, on success and on failure alike.

    Args:
        conn: open psycopg2 connection (committed, rolled back and closed here).
        cursor: cursor created from ``conn`` (closed here).

    Raises:
        Any exception from pandas or the database driver; the pending
        transaction is rolled back before it propagates.
    """
    select_query = """
            select game_id, eventnum from play_by_play_q4
            where home_team_abbrev = %s and away_team_abbrev = %s and game_date = %s and moneyline is null
            group by game_id, eventnum
        """
    update_query = """
                update play_by_play_q4
                set moneyline = %s 
                where game_id = %s and eventnum = %s
        """
    skipped_rows = 0
    try:
        moneyline_df = pd.read_csv("../api_data/rotowire_nba_games_archive.csv")
        for _, row in moneyline_df.iterrows():
            home_abbrev = row['home_team_abbrev']
            away_abbrev = row['visit_team_abbrev']
            raw_date = row['game_date']
            moneyline = row['line']
            if any(pd.isna(value) for value in (home_abbrev, away_abbrev, raw_date, moneyline)):
                skipped_rows += 1
                continue

            # Keep only the part before the first space (drops a trailing time component).
            game_date = str(raw_date).split(" ")[0]
            cursor.execute(select_query, (home_abbrev, away_abbrev, game_date))
            updates = [
                (moneyline, game_id, eventnum)
                for game_id, eventnum in cursor.fetchall()
            ]
            if updates:
                execute_batch(cursor, update_query, updates, page_size=1000)
                conn.commit()
            print(f"Updated {len(updates)} rows successfully")

        if skipped_rows:
            print(f"Skipped {skipped_rows} CSV rows with missing team, date or line")
    except Exception:
        # Do not leave a half-applied batch pending on the connection.
        conn.rollback()
        raise
    finally:
        cursor.close()
        conn.close()
    
def main():
    """Connect to PostgreSQL, tune lambda, fit the model and print its evaluation.

    The connection and cursor are always closed, including when a step fails.
    """
    # connect to database
    db_config = get_db_config()
    conn = psycopg2.connect(
        database=db_config['database'],
        user=db_config['user'],
        password=db_config['password'],
        host=db_config['host'],
        port=db_config['port']
    )
    try:
        cursor = conn.cursor()
        try:
            print("Database connected successfully")
            nba_model = nba_win_probability_model(cursor)
            print("Finding optimal lambda")
            # The selected value is stored on the model, so the return value is not needed.
            nba_model.optimal_lambda()
            print("Training model")
            nba_model.model_fit()
            print("Model evaluation")
            nba_model.eval()
        finally:
            cursor.close()
    finally:
        conn.close()


# Run the pipeline: connect to the database, search lambda, fit and evaluate the model.
main()




Database connected successfully
Finding optimal lambda
Optimal lambda found: 0.10 (validation loss: 0.2964)
Training model

Dataset Split:
Training samples: 957641
Test samples: 239411
Training win rate: 0.561
Test win rate: 0.561

Model Coefficients:
β₀ (intercept): 0.4294
β₁ (S × t̂^λ): 0.3433
β₂ (e^(-t̂)): -0.4380
β₃ (L): -0.0620
Model evaluation

Model Evaluation on Test Set
Test Accuracy: 0.8717
Test Log Loss: 0.2974
Test Samples: 239411

Classification Report:
              precision    recall  f1-score   support

        Loss       0.86      0.85      0.85    104998
         Win       0.88      0.89      0.89    134413

    accuracy                           0.87    239411
   macro avg       0.87      0.87      0.87    239411
weighted avg       0.87      0.87      0.87    239411


Confusion Matrix:
Predicted
Loss Win
Actual Loss 88858 16140
Win 14567 119846
