<a href="https://colab.research.google.com/github/krishna-kenny/nbaWinNeuralNetModel/blob/main/nba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
pip install nba_api



In [7]:
pip install tensorflow



In [15]:
import time
import pandas as pd
from nba_api.stats.endpoints import TeamInfoCommon, TeamGameLogs, PlayerGameLogs
from nba_api.stats.static import teams

# Maximum number of retries for each API call
MAX_RETRIES = 3


def fetch_with_retries(func, *args, **kwargs):
    """Attempts a function call up to MAX_RETRIES with exponential backoff."""
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            wait_time = 2**attempt  # Exponential backoff
            print(f"Error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    print(f"Failed after {MAX_RETRIES} attempts.")
    return None


def get_team_info(seasons):
    print("Fetching team information...")
    nba_teams = teams.get_teams()
    team_data = []

    for season in seasons:
        for team in nba_teams:
            team_info = fetch_with_retries(
                TeamInfoCommon,
                team_id=team["id"],
                season_type_nullable="Regular Season",
                timeout=60,
            )
            if team_info:
                df_team = team_info.get_data_frames()[0]
                team_data.append(df_team)
                time.sleep(0.6)  # Delay to avoid API rate limits
            else:
                print(
                    f"Skipping team {team['full_name']} for season {season} after failed attempts."
                )

    if team_data:
        df_teams = pd.concat(team_data, ignore_index=True)
        df_teams.to_csv("data/raw/nba_team_data.csv", index=False)
    else:
        print("No team data fetched.")


def get_game_logs(seasons):
    print("Fetching game logs...")
    game_log_data = []

    for season in seasons:
        game_logs = fetch_with_retries(
            TeamGameLogs,
            season_nullable=season,
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if game_logs:
            df_game_logs = game_logs.get_data_frames()[0]
            game_log_data.append(df_game_logs)
            time.sleep(0.6)
        else:
            print(f"Skipping game logs for season {season} after failed attempts.")

    if game_log_data:
        df_all_game_logs = pd.concat(game_log_data, ignore_index=True)
        df_all_game_logs.to_csv("data/raw/nba_game_logs.csv", index=False)
    else:
        print("No game log data fetched.")


def get_player_game_logs(seasons):
    print("Fetching player game logs...")
    player_game_log_data = []

    for season in seasons:
        player_game_logs = fetch_with_retries(
            PlayerGameLogs,
            season_nullable=season,
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if player_game_logs:
            df_player_game_logs = player_game_logs.get_data_frames()[0]
            player_game_log_data.append(df_player_game_logs)
            time.sleep(0.6)
        else:
            print(
                f"Skipping player game logs for season {season} after failed attempts."
            )

    if player_game_log_data:
        df_all_player_game_logs = pd.concat(player_game_log_data, ignore_index=True)
        df_all_player_game_logs.to_csv("data/raw/nba_player_game_logs.csv", index=False)
    else:
        print("No player game log data fetched.")


# Define the list of seasons
seasons = ["2023-24", "2024-25"]

# Run functions to save data to CSV files
get_team_info(seasons[-1])
print("Team information data stored.")

get_game_logs(seasons)
print("Game logs data stored.")

get_player_game_logs(seasons[-2:])  # Fetching for the most recent two seasons only
print("Player game logs data stored.")


Fetching team information...
Error: HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=60). Retrying in 1 seconds...
Team information data stored.
Fetching game logs...
Game logs data stored.
Fetching player game logs...
Player game logs data stored.


In [16]:
import pandas as pd
import numpy as np


# Function to preprocess team data
def preprocess_team_data(input_file, output_file):
    """Load, clean, and save team data by removing unnecessary columns."""
    team_data = pd.read_csv(input_file).dropna()
    columns_to_drop = [
        "TEAM_NAME",
        "TEAM_CITY",
        "SEASON_YEAR",
        "TEAM_CODE",
        "TEAM_DIVISION",
        "MIN_YEAR",
        "MAX_YEAR",
    ]
    team_data = team_data.drop(columns=columns_to_drop)
    team_data = team_data.drop_duplicates()
    team_data.to_csv(output_file, index=False)
    print(f"Team data preprocessed and saved. rows: {team_data.shape}")


# Function to preprocess game logs
def preprocess_game_logs(input_file, output_file):
    """Load, clean, and save game log data with specific transformations."""
    game_logs = pd.read_csv(input_file).dropna()

    # Convert 'GAME_DATE' to datetime and sort by date
    game_logs["GAME_DATE"] = pd.to_datetime(game_logs["GAME_DATE"])
    game_logs_sorted = game_logs.sort_values(by="GAME_DATE", ascending=True)

    # Extract 'TEAM1' and 'TEAM2' from 'MATCHUP' column
    game_logs_sorted["TEAM1"] = game_logs_sorted["MATCHUP"].str.split().str[0]
    game_logs_sorted["TEAM2"] = game_logs_sorted["MATCHUP"].str.split().str[2]

    # Drop columns that are not required
    columns_to_drop = [
        "MATCHUP",
        "AVAILABLE_FLAG",
        "TEAM_NAME",
        "TEAM_ABBREVIATION",
        "GAME_ID",
    ]
    game_logs_cleaned = game_logs_sorted.drop(columns=columns_to_drop)

    # Convert 'WL' to binary format: 'W' becomes 1, 'L' becomes 0
    game_logs_cleaned["WL"] = game_logs_cleaned["WL"].apply(
        lambda result: 1 if result == "W" else 0
    )

    # Convert 'SEASON_YEAR' to integer format, using only the starting year
    game_logs_cleaned["SEASON_YEAR"] = game_logs_cleaned["SEASON_YEAR"].apply(
        lambda year: int(year[:4])
    )

    game_logs_cleaned.to_csv(output_file, index=False)
    print(f"Game logs preprocessed and saved. rows: {game_logs_cleaned.shape}")


# Function to preprocess player game logs
def preprocess_player_game_logs(input_file, output_file):
    """Load, clean, and save player game log data with specific transformations."""
    player_game_logs = pd.read_csv(input_file).dropna()

    # Drop unnecessary columns
    columns_to_drop = [
        "PLAYER_NAME",
        "NICKNAME",
        "TEAM_NAME",
        "TEAM_ABBREVIATION",
        "MATCHUP",
        "GAME_ID",
        "MIN_SEC",
    ]
    player_game_logs = player_game_logs.drop(columns=columns_to_drop)
    player_game_logs["WL"] = player_game_logs["WL"].apply(
        lambda x: 1 if x == "W" else 0
    )
    player_game_logs["SEASON_YEAR"] = player_game_logs["SEASON_YEAR"].apply(
        lambda x: x[:4]
    )

    player_game_logs.to_csv(output_file, index=False)
    print(f"Player game logs preprocessed and saved. rows: {player_game_logs.shape}")


# Function to compute weighted averages
def compute_weighted_avg(player_id, df):
    """Compute weighted averages for a given player."""
    player_rows = df[df["PLAYER_ID"] == player_id].copy()

    # Retain TEAM_ID
    team_id = player_rows["TEAM_ID"].iloc[0]

    # Convert GAME_DATE to a timestamp for weighting
    player_rows["GAME_TIMESTAMP"] = pd.to_datetime(player_rows["GAME_DATE"]).apply(
        lambda x: x.timestamp()
    )
    max_timestamp = player_rows["GAME_TIMESTAMP"].max()

    # Calculate weights based on recency
    player_rows["WEIGHTS"] = np.exp(
        (player_rows["GAME_TIMESTAMP"] - max_timestamp) / 1e7
    )
    player_rows["WEIGHTS"] /= player_rows["WEIGHTS"].sum()

    # Compute weighted average for all columns after WL
    weighted_avg = (
        player_rows.iloc[:, df.columns.get_loc("WL") + 1 :]  # Select columns after WL
        .mul(player_rows["WEIGHTS"], axis=0)  # Multiply each column by weights
        .sum()  # Sum the weighted values for each column
    )
    weighted_avg["TEAM_ID"] = team_id  # Include TEAM_ID in the output
    return weighted_avg


# Function to create feature data
def create_feature_data(input_csv, output_csv):
    """Generate feature data where each player is represented by a single row."""
    # Load the dataset
    df = pd.read_csv(input_csv)
    df.fillna(0, inplace=True)

    # Ensure GAME_DATE is parsed correctly
    df["GAME_DATE"] = pd.to_datetime(df["GAME_DATE"])

    # Get unique players
    unique_players = df["PLAYER_ID"].unique()

    # Create a new dataframe to store the features
    feature_data = []

    for player_id in unique_players:
        weighted_avg = compute_weighted_avg(player_id, df)
        weighted_avg["PLAYER_ID"] = player_id  # Retain the player ID
        feature_data.append(weighted_avg)

    # Convert the list to a DataFrame
    feature_df = pd.DataFrame(feature_data)

    # Save to CSV
    feature_df.to_csv(output_csv, index=False)
    print(f"Feature data saved to {output_csv}. rows: {feature_df.shape}")


# Function to compute team-level aggregated features
def create_team_features(player_features_file, team_features_output):
    """
    Generate aggregated features for each team by averaging player statistics.
    """
    import pandas as pd

    # Load the player features data
    player_data = pd.read_csv(player_features_file)

    # Compute team features
    team_features = player_data.groupby("TEAM_ID").mean().reset_index()

    # Load mapping of TEAM_ID to TEAM_ABBREVIATION and additional features
    preprocessed_nba_team_data = pd.read_csv(
        "data/processed/preprocessed_nba_team_data.csv"
    )
    id_to_abbr_map = preprocessed_nba_team_data.set_index("TEAM_ID")[
        "TEAM_ABBREVIATION"
    ].to_dict()

    # Apply a function to map TEAM_ID to ABBR
    team_features["TEAM_ID"] = team_features["TEAM_ID"].map(id_to_abbr_map)

    # Rename the column
    team_features.rename(columns={"TEAM_ID": "TEAM_ABBREVIATION"}, inplace=True)

    # Merge additional team features
    additional_features = preprocessed_nba_team_data.drop(
        columns=["TEAM_ID", "TEAM_CONFERENCE", "TEAM_SLUG"]
    )
    team_features = team_features.merge(
        additional_features,
        left_on="TEAM_ABBREVIATION",
        right_on="TEAM_ABBREVIATION",
        how="left",
    )

    # Save the resulting team features to a CSV file
    team_features.to_csv(team_features_output, index=False)
    print(
        f"Team feature data saved to {team_features_output}. rows: {team_features.shape}"
    )


# File paths for input and output data
team_data_file = "data/raw/nba_team_data.csv"
team_data_output = "data/processed/preprocessed_nba_team_data.csv"

game_data_file = "data/raw/nba_game_logs.csv"
game_data_output = "data/processed/preprocessed_nba_game_logs.csv"

player_game_logs_file = "data/raw/nba_player_game_logs.csv"
player_game_logs_output = "data/processed/preprocessed_nba_player_game_logs.csv"

# Run the preprocessing functions
preprocess_team_data(team_data_file, team_data_output)
preprocess_game_logs(game_data_file, game_data_output)
preprocess_player_game_logs(player_game_logs_file, player_game_logs_output)
create_feature_data(player_game_logs_output, "data/nba_player_features.csv")

# File paths for player features and team output
player_features_file = "data/nba_player_features.csv"
team_features_output = "data/nba_team_features.csv"

# Run the team feature creation function
create_team_features(player_features_file, team_features_output)


Team data preprocessed and saved. rows: (30, 9)
Game logs preprocessed and saved. rows: (3238, 54)
Player game logs preprocessed and saved. rows: (34803, 62)
Feature data saved to data/nba_player_features.csv. rows: (660, 61)
Team feature data saved to data/nba_team_features.csv. rows: (30, 66)


In [17]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib  # To save and load the scaler


def prepare_dataset(game_logs_file, features_file):
    """
    Prepare dataset for training using team-specific features.

    Args:
        game_logs_file: CSV file containing game logs with TEAM1, TEAM2, and WL columns.
        features_file: CSV file to save aggregated team features.

    Returns:
        X: Feature matrix for training.
        y: Target vector (win/loss).
    """
    # Load game logs
    game_logs = pd.read_csv(game_logs_file)

    # Aggregate features by team
    team_features = (
        game_logs.groupby("TEAM1")
        .mean(numeric_only=True)
        .reset_index()
        .rename(columns={"TEAM1": "TEAM"})
    )

    # Save the aggregated features to features_file
    team_features.to_csv(features_file, index=False)

    # Merge team features for TEAM1 and TEAM2
    game_logs = game_logs.merge(
        team_features,
        how="left",
        left_on="TEAM1",
        right_on="TEAM",
        suffixes=("", "_TEAM1"),
    ).merge(
        team_features,
        how="left",
        left_on="TEAM2",
        right_on="TEAM",
        suffixes=("", "_TEAM2"),
    )

    # Drop unnecessary columns
    columns_to_drop = [
        "TEAM",
        "TEAM_TEAM2",
        "TEAM_CONFERENCE",
        "TEAM_SLUG",
        "TEAM_CONFERENCE_TEAM2",
        "TEAM_SLUG_TEAM2",
        "PLAYER_ID",
        "PLAYER_ID_TEAM2",
        "AVAILABLE_FLAG",
        "AVAILABLE_FLAG_TEAM2",
        "GAME_TIMESTAMP",
        "GAME_TIMESTAMP_TEAM2",
    ]
    game_logs.drop(
        columns=[col for col in columns_to_drop if col in game_logs.columns],
        inplace=True,
    )

    # Feature engineering: Create new features for differences and ratios
    numeric_columns = game_logs.filter(regex="_TEAM1$").columns
    diff_features = {}
    ratio_features = {}
    for col in numeric_columns:
        base_col = col.replace("_TEAM1", "")
        diff_features[f"{base_col}_DIFF"] = (
            game_logs[f"{base_col}_TEAM1"] - game_logs[f"{base_col}_TEAM2"]
        )
        ratio_features[f"{base_col}_RATIO"] = game_logs[f"{base_col}_TEAM1"] / (
            game_logs[f"{base_col}_TEAM2"] + 1e-5
        )

    # Add all new features at once to optimize performance
    new_features = pd.concat(
        [pd.DataFrame(diff_features), pd.DataFrame(ratio_features)], axis=1
    )
    game_logs = pd.concat([game_logs, new_features], axis=1)

    # Handle missing values
    game_logs.fillna(0, inplace=True)

    # Extract features and target
    feature_columns = game_logs.select_dtypes(include=np.number).columns.difference(
        ["WL"]
    )
    X = game_logs[feature_columns].to_numpy()
    y = game_logs["WL"].astype(int).to_numpy()

    return X, y


def train_model(X, y):
    """
    Train a neural network model on the given features and labels.

    Args:
        X: Feature matrix for training.
        y: Target vector (win/loss).

    Returns:
        model: Trained neural network model.
        scaler: Fitted scaler for feature normalization.
    """
    # Normalize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Handle class imbalance
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

    # Define the neural network
    model = Sequential()
    model.add(Dense(256, activation="relu", input_shape=(X_resampled.shape[1],)))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))

    # Compile the model
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

    # Train the model
    model.fit(X_resampled, y_resampled, epochs=16, batch_size=32, validation_split=0.2)

    return model, scaler


def load_trained_model(
    model_path="saved_model/model.h5", scaler_path="saved_model/scaler.pkl"
):
    """
    Load the pre-trained neural network model and the scaler.

    Args:
        model_path: Path to the saved model file.
        scaler_path: Path to the saved scaler file.

    Returns:
        model: The pre-trained model.
        scaler: The scaler used for feature normalization.
    """
    # Load the trained model
    model = load_model(model_path)

    # Load the scaler
    scaler = joblib.load(scaler_path)

    return model, scaler


if __name__ == "__main__":
    game_logs_file = "data/processed/preprocessed_nba_game_logs.csv"
    features_file = "data/features.csv"
    model_save_path = "saved_model/model.h5"
    scaler_save_path = "saved_model/scaler.pkl"

    # Prepare the dataset
    X, y = prepare_dataset(game_logs_file, features_file)

    # Check if data is valid for training
    if X.size == 0 or y.size == 0:
        print("No data available to train the model.")
    else:
        # Split the dataset into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train the neural network
        model, scaler = train_model(X_train, y_train)

        # Save the trained model
        model.save(model_save_path)
        print(f"Neural network model saved to {model_save_path}")

        # Save the scaler
        joblib.dump(scaler, scaler_save_path)
        print(f"Scaler saved to {scaler_save_path}")

        # Evaluate the neural network
        X_test_scaled = scaler.transform(X_test)
        test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
        print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

        # Train and evaluate a Random Forest for comparison
        rf = RandomForestClassifier(random_state=42)
        rf.fit(X_train, y_train)
        y_pred_rf = rf.predict(X_test)
        rf_accuracy = accuracy_score(y_test, y_pred_rf)
        print(f"Random Forest Accuracy: {rf_accuracy}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/16
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7058 - loss: 0.5275 - val_accuracy: 0.9675 - val_loss: 0.0899
Epoch 2/16
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9738 - loss: 0.0705 - val_accuracy: 0.9790 - val_loss: 0.0450
Epoch 3/16
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9828 - loss: 0.0399 - val_accuracy: 0.9809 - val_loss: 0.0537
Epoch 4/16
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9900 - loss: 0.0260 - val_accuracy: 0.9790 - val_loss: 0.0535
Epoch 5/16
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9905 - loss: 0.0274 - val_accuracy: 0.9943 - val_loss: 0.0160
Epoch 6/16
[1m66/66[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9903 - loss: 0.0297 - val_accuracy: 0.9924 - val_loss: 0.0347
Epoch 7/16
[1m66/66[0m [32m━━━━━━━━━━



Neural network model saved to saved_model/model.h5
Scaler saved to saved_model/scaler.pkl
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9951 - loss: 0.0114     
Test Loss: 0.01557663083076477, Test Accuracy: 0.9938271641731262
Random Forest Accuracy: 1.0


In [18]:
import numpy as np
import pandas as pd
from nba_api.stats.static import teams
from nbaModel import (
    load_trained_model,
)  # Ensure this is correctly implemented in your nbaModel.py


# Get Team ID based on abbreviation
def get_team_id_by_abbreviation(team_abbreviation):
    """Retrieve the team ID by abbreviation."""
    nba_teams = teams.get_teams()
    for team in nba_teams:
        if team["abbreviation"].lower() == team_abbreviation.lower():
            return team["id"]
    raise ValueError(
        f"Team '{team_abbreviation}' not found! Please enter a valid abbreviation."
    )


# Fetch team-specific features
def fetch_team_features(team_abbreviation, features_file):
    """
    Retrieve the team-specific features for the given team abbreviation.
    Args:
        team_abbreviation: Abbreviation of the NBA team (e.g., 'LAL').
        features_file: CSV file containing the aggregated team features.
    Returns:
        numpy array of the team's features.
    """
    team_features = pd.read_csv(features_file)
    team_row = team_features[team_features["TEAM"] == team_abbreviation.upper()]
    if team_row.empty:
        raise ValueError(
            f"Features for team '{team_abbreviation}' not found in {features_file}."
        )
    return team_row.drop(columns=["TEAM"]).to_numpy().flatten()


# Predict win probability
def predict_matchup_win_probability(
    team1_abbreviation, team2_abbreviation, features_file
):
    """
    Predict the win probability of Team 1 beating Team 2.
    Args:
        team1_abbreviation: Abbreviation of Team 1 (e.g., 'LAL').
        team2_abbreviation: Abbreviation of Team 2 (e.g., 'BOS').
        features_file: Path to the CSV file containing aggregated team features.
    """
    # Load the trained model and scaler
    model, scaler = load_trained_model()

    # Fetch features for both teams
    team1_features = fetch_team_features(team1_abbreviation, features_file)
    team2_features = fetch_team_features(team2_abbreviation, features_file)

    # Calculate difference and ratio features
    diff_features = team1_features - team2_features
    ratio_features = team1_features / (team2_features + 1e-5)

    # Combine features for the model
    matchup_features = np.concatenate([diff_features, ratio_features]).reshape(1, -1)

    # Debug: Check input shape
    print(f"Matchup features shape: {matchup_features.shape}")
    print(f"Scaler expects: {scaler.n_features_in_}")

    # Ensure consistent feature count
    if matchup_features.shape[1] != scaler.n_features_in_:
        raise ValueError(
            f"Feature count mismatch. Got {matchup_features.shape[1]} features, "
            f"but scaler expects {scaler.n_features_in_}. Check feature engineering consistency."
        )

    # Scale the features
    scaled_features = scaler.transform(matchup_features)

    # Predict win probability for Team 1
    win_probability = model.predict(scaled_features)[0][0]
    print(
        f"Predicted probability of {team1_abbreviation} beating {team2_abbreviation}: {win_probability * 100:.2f}%"
    )
    return win_probability


def display_team_data():
    """Display team names, abbreviations, and IDs in a 2D array."""
    nba_teams = teams.get_teams()
    team_data = np.array(
        [[team["full_name"], team["abbreviation"], team["id"]] for team in nba_teams]
    )
    print("\nAvailable Teams:")
    print(pd.DataFrame(team_data, columns=["Team Name", "Abbreviation", "Team ID"]))


def main():
    """Main function to handle user input and prediction."""
    features_file = "data/features.csv"  # Path to the features file

    # Display team data before taking user input
    display_team_data()

    team1_abbreviation = input(
        "Enter Team 1 abbreviation (e.g., 'LAL' for Los Angeles Lakers): "
    ).strip()
    team2_abbreviation = input(
        "Enter Team 2 abbreviation (e.g., 'BOS' for Boston Celtics): "
    ).strip()

    try:
        predict_matchup_win_probability(
            team1_abbreviation, team2_abbreviation, features_file
        )
    except ValueError as e:
        print(e)


if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'nbaModel'