<a href="https://colab.research.google.com/github/krishna-kenny/nbaWinNeuralNetModel/blob/main/nba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
!pip install nba_api



In [41]:
import time
import pandas as pd
from nba_api.stats.endpoints import TeamInfoCommon, TeamGameLogs, PlayerGameLogs, LeagueGameFinder, LeagueLeaders, PlayerCareerStats
from nba_api.stats.static import teams

# Maximum number of retries for each API call
MAX_RETRIES = 3
# Define the list of seasons
# Generate all seasons from 2013 onwards
start_year = 2024
end_year = 2024  # Adjust to your desired year
seasons = [f"{year}-{(year + 1)%100}" for year in range(start_year, end_year + 1)]

# Printing seasons to verify
print(seasons)


['2024-25']


In [42]:
def fetch_with_retries(func, *args, **kwargs):
    """Attempts a function call up to MAX_RETRIES with exponential backoff."""
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            wait_time = 2**attempt  # Exponential backoff
            print(f"Error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    print(f"Failed after {MAX_RETRIES} attempts.")
    return None

In [43]:
def get_team_info(seasons):
    """Fetches relevant team information for the specified seasons."""
    print("Fetching team information...")
    nba_teams = teams.get_teams()
    team_data = []

    for team in nba_teams:
        team_info = fetch_with_retries(
            TeamInfoCommon,
            team_id=team["id"],
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if team_info:
            df_team = team_info.get_data_frames()[0]
            df_team = df_team[["TEAM_ID", "TEAM_ABBREVIATION"]]  # Only keep relevant features
            team_data.append(df_team)
            time.sleep(0.6)  # Delay to avoid API rate limits

    if team_data:
        df_teams = pd.concat(team_data, ignore_index=True)
        df_teams.to_csv("nba_team_data.csv", index=False)
    else:
        print("No team data fetched.")

# Run functions to save data to CSV files
get_team_info(seasons)
print("Team information data stored.")

Fetching team information...
Team information data stored.


In [44]:
def get_league_game_data():
    """Fetches league-wide game data with relevant features for a neural network."""
    print("Fetching league game data for NN...")
    game_data = fetch_with_retries(LeagueGameFinder, timeout=60)
    if game_data:
        df_game_data = game_data.get_data_frames()[0]
        # Relevant columns for neural network input
        relevant_columns = [
            "SEASON_ID", "TEAM_ID", "TEAM_ABBREVIATION", "TEAM_NAME", "GAME_ID",
            "GAME_DATE", "MATCHUP", "WL", "MIN", "PTS", "FGM", "FGA", "FG_PCT",
            "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "OREB", "DREB",
            "REB", "AST", "STL", "BLK", "TOV", "PF", "PLUS_MINUS"
        ]
        df_nn_data = df_game_data[relevant_columns]
        df_nn_data.to_csv("nba_league_game.csv", index=False)
    else:
        print("No league game data fetched.")

get_league_game_data()
print("League game data stored.")

Fetching league game data for NN...
League game data stored.


In [53]:
import pandas as pd

def process_league_game_data(input_file):
    """
    Processes the league game data from nba_league_game.csv to create TEAM1, TEAM2, and SEASON_YEAR columns.
    """
    try:
        # Load the input CSV file
        df_game_logs = pd.read_csv(input_file, parse_dates=["GAME_DATE"])

        # Process MATCHUP column to create TEAM1 and TEAM2 columns
        matchups_split = df_game_logs['MATCHUP'].str.split(' @ | vs. ', expand=True)
        df_game_logs['TEAM1'] = matchups_split[0]
        df_game_logs['TEAM2'] = matchups_split[1]

        # Extract SEASON_YEAR from SEASON_ID
        df_game_logs['SEASON_YEAR'] = df_game_logs['SEASON_ID'].apply(lambda x: str(x)[1:])  # Extract year from ID

        # Save the processed DataFrame to a CSV file
        df_game_logs.to_csv("nba_game_logs.csv", index=False)
        print("Processed game logs saved to 'nba_game_logs.csv'.")

    except Exception as e:
        print(f"An error occurred while processing league game data: {e}")

# Use the nba_league_game.csv file
process_league_game_data("nba_league_game.csv")


Processed game logs saved to 'nba_game_logs.csv'.


In [46]:
def get_league_leaders():
    """Fetches league leaders data with relevant columns for analysis."""
    print("Fetching league leaders data...")
    leaders_data = fetch_with_retries(LeagueLeaders, timeout=60)
    if leaders_data:
        df_leaders = leaders_data.get_data_frames()[0]
        # Select only relevant columns
        relevant_columns = ['PLAYER_ID', 'RANK', 'PLAYER', 'TEAM', 'GP', 'MIN', 'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'EFF', 'AST_TOV', 'STL_TOV']
        df_relevant_leaders = df_leaders[relevant_columns]
        df_relevant_leaders.to_csv("nba_league_leaders_relevant.csv", index=False)
    else:
        print("No league leaders data fetched.")

get_league_leaders()
print("League leaders data stored.")

Fetching league leaders data...
League leaders data stored.


In [47]:
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv("nba_league_leaders_relevant.csv")

# Calculate weights based on rank
data['WEIGHT'] = np.exp(-0.1 * (data['RANK'] - 1))

# Features to aggregate (excluding categorical and non-numeric ones)
features_to_aggregate = [
    "GP", "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT",
    "FTM", "FTA", "FT_PCT", "OREB", "DREB", "REB", "AST", "STL",
    "BLK", "TOV", "PF", "PTS", "EFF", "AST_TOV", "STL_TOV"
]

# Multiply features by weight
for feature in features_to_aggregate:
    data[f"{feature}_WEIGHTED"] = data[feature] * data['WEIGHT']

# Group by team and aggregate
aggregated = data.groupby("TEAM").apply(
    lambda group: pd.Series({
        feature: group[f"{feature}_WEIGHTED"].sum() / group['WEIGHT'].sum()
        for feature in features_to_aggregate
    })
).reset_index()

# Rename columns for clarity
aggregated.columns = ["TEAM"] + [f"WEIGHTED_{col}" for col in features_to_aggregate]

aggregated.to_csv("nba_team_aggregated_data.csv", index=False)
# Save or display the result
print(aggregated.head())


  TEAM  WEIGHTED_GP  WEIGHTED_MIN  WEIGHTED_FGM  WEIGHTED_FGA  \
0  ATL    36.838847   1318.397229    256.729700    634.263741   
1  BKN    32.813097   1064.688368    213.090651    427.692427   
2  BOS    36.659713   1329.114258    335.276766    731.035964   
3  CHA    26.025705    888.898472    267.063531    631.054941   
4  CHI    37.115083   1237.350781    313.964466    600.830471   

   WEIGHTED_FG_PCT  WEIGHTED_FG3M  WEIGHTED_FG3A  WEIGHTED_FG3_PCT  \
0         0.405343     107.021101     310.363742          0.344705   
1         0.498236     106.247643     248.520050          0.427198   
2         0.458451     130.708655     366.056318          0.356018   
3         0.423443     113.705064     337.698199          0.336489   
4         0.522721     107.127711     241.695484          0.442370   

   WEIGHTED_FTM  ...  WEIGHTED_REB  WEIGHTED_AST  WEIGHTED_STL  WEIGHTED_BLK  \
0    226.476688  ...    132.754987    430.393776     47.987375      7.792965   
1    112.947460  ...    136.

  aggregated = data.groupby("TEAM").apply(


In [57]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import tensorflow.keras.backend as K

def custom_accuracy(y_true, y_pred):
    """
    Custom accuracy metric to evaluate the model.
    """
    condition_1 = K.cast(K.less(y_pred, 0.5), dtype="float32") * K.cast(K.equal(y_true, 0), dtype="float32")
    condition_2 = K.cast(K.greater_equal(y_pred, 0.5), dtype="float32") * K.cast(K.equal(y_true, 1), dtype="float32")
    return K.mean(condition_1 + condition_2)

def build_neural_network(input_shape):
    """
    Build a neural network model with added regularization and improved architecture.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(1, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss="binary_crossentropy",
                  metrics=[custom_accuracy])

    print(model.summary())
    return model

def train_model(X, y):
    """
    Train a neural network model.
    """
    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Build the model
    model = build_neural_network(X_train.shape[1])

    # Early stopping and learning rate scheduler
    early_stopping = EarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True)

    # Train the model
    model.fit(X_train, y_train,
              epochs=150,
              batch_size=5,
              validation_data=(X_val, y_val),
              callbacks=[early_stopping],)

    return model, scaler

def prepare_dataset(game_logs_file, features_file):
    """
    Prepare dataset for training using transformed features (difference and ratio).
    """
    try:
        # Load game logs and team features
        game_logs = pd.read_csv(game_logs_file, parse_dates=["GAME_DATE"])
        team_features = pd.read_csv(features_file)

        # Normalize key columns
        game_logs["TEAM1"] = game_logs["TEAM1"].str.strip().str.upper()
        game_logs["TEAM2"] = game_logs["TEAM2"].str.strip().str.upper()

        # Merge features for TEAM1 and TEAM2
        game_logs = game_logs.merge(
            team_features.add_suffix("_TEAM1"),
            left_on=["TEAM1"],
            right_on=["TEAM_TEAM1"],
            how="left"
        ).merge(
            team_features.add_suffix("_TEAM2"),
            left_on=["TEAM2"],
            right_on=["TEAM_TEAM2"],
            how="left"
        )

        # Drop unnecessary columns
        columns_to_drop = ["TEAM_TEAM1", "TEAM_TEAM2", "GAME_DATE"]
        game_logs.drop(columns=[col for col in columns_to_drop if col in game_logs.columns], inplace=True)

        # Handle missing values
        game_logs.fillna(0, inplace=True)

        # Extract features and target
        feature_columns_team1 = [col for col in game_logs.columns if col.endswith("_TEAM1")]
        feature_columns_team2 = [col.replace("_TEAM1", "_TEAM2") for col in feature_columns_team1]

        # Ensure column alignment
        feature_columns_team2 = [col for col in feature_columns_team2 if col in game_logs.columns]

        # Standardize team features before transformations
        scaler = StandardScaler()
        team_features_team1 = scaler.fit_transform(game_logs[feature_columns_team1])
        team_features_team2 = scaler.fit_transform(game_logs[feature_columns_team2])

        # Compute transformed features
        X = (team_features_team1 - team_features_team2)  # Simpler transformation
        y = (game_logs["WL"] == "W").astype(int).to_numpy()

        return X, y, feature_columns_team1

    except Exception as e:
        print("An error occurred in prepare_dataset:", e)
        raise


def main():
    game_logs_file = "nba_game_logs.csv"
    features_file = "nba_team_aggregated_data.csv"

    # Prepare the dataset
    X, y, feature_columns = prepare_dataset(game_logs_file, features_file)

    # Train the model
    model, scaler = train_model(X, y)

    # Evaluate the model
    X_scaled = scaler.transform(X)
    loss, accuracy = model.evaluate(X_scaled, y)
    print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

if __name__ == "__main__":
    main()


None
Epoch 1/150
[1m4800/4800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1ms/step - custom_accuracy: 0.5451 - loss: 0.6906 - val_custom_accuracy: 0.5536 - val_loss: 0.6766
Epoch 2/150
[1m4800/4800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - custom_accuracy: 0.5555 - loss: 0.6746 - val_custom_accuracy: 0.5508 - val_loss: 0.6753
Epoch 3/150
[1m4800/4800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - custom_accuracy: 0.5509 - loss: 0.6741 - val_custom_accuracy: 0.5505 - val_loss: 0.6739
Epoch 4/150
[1m4800/4800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1ms/step - custom_accuracy: 0.5553 - loss: 0.6723 - val_custom_accuracy: 0.5517 - val_loss: 0.6764
Epoch 5/150
[1m4800/4800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2ms/step - custom_accuracy: 0.5483 - loss: 0.6741 - val_custom_accuracy: 0.5502 - val_loss: 0.6758
Epoch 6/150
[1m4800/4800[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - custom_a

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import joblib
from nba_api.stats.static import teams
from keras.models import load_model
import os


def prepare_features(team1, team2, features_file, scaler, feature_columns):
    """
    Prepare input features for prediction by combining team-specific features.
    """
    try:
        # Load team features
        team_features = pd.read_csv(features_file)

        # Normalize team names
        team_features["TEAM"] = team_features["TEAM"].str.strip().str.upper()
        team1 = team1.strip().upper()
        team2 = team2.strip().upper()

        # Extract features for the two teams
        features_team1 = team_features[team_features["TEAM"] == team1].add_suffix("_TEAM1")
        features_team2 = team_features[team_features["TEAM"] == team2].add_suffix("_TEAM2")

        if features_team1.empty or features_team2.empty:
            raise ValueError(f"Features for {team1} or {team2} not found in the file.")

        # Combine features
        combined_features = pd.concat([features_team1.reset_index(drop=True),
                                        features_team2.reset_index(drop=True)], axis=1)

        # Align with feature_columns and fill missing values with 0
        combined_features = combined_features.reindex(columns=feature_columns, fill_value=0)

        # Scale features
        X = combined_features.to_numpy()
        X_scaled = scaler.transform(X)

        return X_scaled

    except Exception as e:
        print("An error occurred in prepare_features:", e)
        raise


def predict(team1, team2, model_path, scaler_path, features_file, feature_columns):
    """
    Predict the probability of Team 1 beating Team 2.
    """
    try:
        # Validate file paths
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file '{model_path}' not found.")
        if not os.path.exists(scaler_path):
            raise FileNotFoundError(f"Scaler file '{scaler_path}' not found.")

        # Load the model and scaler
        model = load_model(model_path, custom_objects={"custom_accuracy": custom_accuracy})
        scaler = joblib.load(scaler_path)

        # Prepare the input features
        X_scaled = prepare_features(team1, team2, features_file, scaler, feature_columns)

        # Make predictions
        probability = model.predict(X_scaled).flatten()[0]

        print(f"Probability of {team1} beating {team2}: {probability:.2%}")
        return probability

    except Exception as e:
        print("An error occurred in predict:", e)
        raise


def custom_accuracy(y_true, y_pred):
    """
    Custom accuracy metric to evaluate the model based on given conditions.
    """
    import tensorflow.keras.backend as K
    condition_1 = K.cast(y_pred < 0.5, dtype="float32") * K.cast(y_true == 0, dtype="float32")
    condition_2 = K.cast(y_pred >= 0.5, dtype="float32") * K.cast(y_true == 1, dtype="float32")
    return K.mean(condition_1 + condition_2)


def display_team_data(features_file):
    """
    Display available team-season combinations for user reference.
    """
    try:
        team_features = pd.read_csv(features_file)
        print("Available TEAM combinations:")
        for team in team_features["TEAM"].unique():
            print(team)
    except Exception as e:
        print("An error occurred in display_team_data:", e)


if __name__ == "__main__":
    features_file = "nba_team_aggregated_data.csv"
    model_path = "model.keras"
    scaler_path = "scaler.pkl"

    # Load feature columns from the training process
    try:
        feature_columns = pd.read_csv("train_data_X.csv", nrows=0).columns.tolist()
    except FileNotFoundError:
        print("Feature column file 'train_data_X.csv' not found.")
        exit(1)

    display_team_data(features_file)

    # Example usage
    team1 = input("Enter TEAM like team:season (e.g., GSW:2024): ")
    team2 = input("Enter TEAM like team:season (e.g., PHI:2024): ")

    predict(team1, team2, model_path, scaler_path, features_file, feature_columns)
