<a href="https://colab.research.google.com/github/krishna-kenny/nbaWinNeuralNetModel/blob/main/nba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
!pip install nba_api



In [42]:
import time
import pandas as pd
from nba_api.stats.endpoints import TeamInfoCommon, TeamGameLogs, PlayerGameLogs, LeagueGameFinder, LeagueLeaders, PlayerCareerStats
from nba_api.stats.static import teams

# Maximum number of retries for each API call
MAX_RETRIES = 3
# Define the list of seasons
seasons = ["2024-25"]

In [43]:
def fetch_with_retries(func, *args, **kwargs):
    """Attempts a function call up to MAX_RETRIES with exponential backoff."""
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            wait_time = 2**attempt  # Exponential backoff
            print(f"Error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    print(f"Failed after {MAX_RETRIES} attempts.")
    return None

In [44]:
def get_team_info(seasons):
    """Fetches relevant team information for the specified seasons."""
    print("Fetching team information...")
    nba_teams = teams.get_teams()
    team_data = []

    for team in nba_teams:
        team_info = fetch_with_retries(
            TeamInfoCommon,
            team_id=team["id"],
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if team_info:
            df_team = team_info.get_data_frames()[0]
            df_team = df_team[["TEAM_ID", "TEAM_ABBREVIATION"]]  # Only keep relevant features
            team_data.append(df_team)
            time.sleep(0.6)  # Delay to avoid API rate limits

    if team_data:
        df_teams = pd.concat(team_data, ignore_index=True)
        df_teams.to_csv("nba_team_data.csv", index=False)
    else:
        print("No team data fetched.")

# Run functions to save data to CSV files
get_team_info(seasons)
print("Team information data stored.")

Fetching team information...
Team information data stored.


In [45]:
def get_team_game_logs(seasons):
    """Fetches team game logs for the specified seasons and processes the MATCHUP column."""
    print("Fetching team game logs...")
    game_log_data = []

    for season in seasons:
        game_logs = fetch_with_retries(
            TeamGameLogs,
            season_nullable=season,
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if game_logs:
            df_game_logs = game_logs.get_data_frames()[0]
            # Keep only relevant columns
            df_game_logs = df_game_logs[["GAME_ID", "GAME_DATE", "MATCHUP", "WL"]]
            game_log_data.append(df_game_logs)
            time.sleep(0.6)  # Delay to respect rate limits

    if game_log_data:
        # Concatenate all game logs
        df_all_game_logs = pd.concat(game_log_data, ignore_index=True)

        # Process MATCHUP column to create team1 and team2 columns
        matchups_split = df_all_game_logs['MATCHUP'].str.split(' @ | vs. ', expand=True)
        df_all_game_logs['TEAM1'] = matchups_split[0]
        df_all_game_logs['TEAM2'] = matchups_split[1]

        # Drop the original MATCHUP column if no longer needed
        df_all_game_logs.drop(columns=['MATCHUP'], inplace=True)

        # Extract and add SEASON_YEAR
        df_all_game_logs['SEASON_YEAR'] = pd.to_datetime(df_all_game_logs['GAME_DATE']).dt.year.astype(str)

        # Create combined TEAM_SEASON columns
        df_all_game_logs['TEAM_SEASON1'] = df_all_game_logs['TEAM1'] + ':' + df_all_game_logs['SEASON_YEAR']
        df_all_game_logs['TEAM_SEASON2'] = df_all_game_logs['TEAM2'] + ':' + df_all_game_logs['SEASON_YEAR']

        # Drop the original TEAM1, TEAM2, and SEASON_YEAR columns if no longer needed
        df_all_game_logs.drop(columns=['TEAM1', 'TEAM2', 'SEASON_YEAR'], inplace=True)

        # Save the processed DataFrame to a CSV file
        df_all_game_logs.to_csv("nba_game_logs.csv", index=False)
        print("Processed game logs saved to 'nba_game_logs.csv'.")
    else:
        print("No game log data fetched.")

get_team_game_logs(seasons)
print("Team game logs data stored.")


Fetching team game logs...
Processed game logs saved to 'nba_game_logs.csv'.
Team game logs data stored.


In [46]:
def get_player_game_logs(seasons):
    """Fetches player game logs for the specified seasons."""
    print("Fetching player game logs...")
    player_game_log_data = []

    for season in seasons:
        player_game_logs = fetch_with_retries(
            PlayerGameLogs,
            season_nullable=season,
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if player_game_logs:
            df_player_game_logs = player_game_logs.get_data_frames()[0]
            # Keep only relevant columns
            df_player_game_logs = df_player_game_logs[[
                "SEASON_YEAR", "GAME_ID", "TEAM_ID", "PLAYER_ID", "PLAYER_NAME", "PTS", "REB", "AST", "STL", "BLK",
                "MIN", "FG_PCT", "FG3_PCT", "FT_PCT", "TOV", "PF"
            ]]
            # Modify SEASON_YEAR to keep only the first 4 characters
            df_player_game_logs["SEASON_YEAR"] = df_player_game_logs["SEASON_YEAR"].str[:4]
            player_game_log_data.append(df_player_game_logs)
            time.sleep(0.6)

    if player_game_log_data:
        df_all_player_game_logs = pd.concat(player_game_log_data, ignore_index=True)
        df_all_player_game_logs.to_csv("nba_player_game_logs.csv", index=False)
    else:
        print("No player game log data fetched.")

# Call the function with the specified seasons
get_player_game_logs(seasons)
print("Player game logs data stored.")

Fetching player game logs...
Player game logs data stored.


In [47]:
def get_league_game_data():
    """Fetches league-wide game data with relevant features for a neural network."""
    print("Fetching league game data for NN...")
    game_data = fetch_with_retries(LeagueGameFinder, timeout=60)
    if game_data:
        df_game_data = game_data.get_data_frames()[0]
        # Relevant columns for neural network input
        relevant_columns = [
            "SEASON_ID", "TEAM_ID", "TEAM_ABBREVIATION", "TEAM_NAME", "GAME_ID",
            "GAME_DATE", "MATCHUP", "WL", "MIN", "PTS", "FGM", "FGA", "FG_PCT",
            "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "OREB", "DREB",
            "REB", "AST", "STL", "BLK", "TOV", "PF", "PLUS_MINUS"
        ]
        df_nn_data = df_game_data[relevant_columns]
        df_nn_data.to_csv("nba_league_game.csv", index=False)
    else:
        print("No league game data fetched.")

get_league_game_data()
print("League game data stored.")

Fetching league game data for NN...
League game data stored.


In [48]:
def get_league_leaders():
    """Fetches league leaders data with relevant columns for analysis."""
    print("Fetching league leaders data...")
    leaders_data = fetch_with_retries(LeagueLeaders, timeout=60)
    if leaders_data:
        df_leaders = leaders_data.get_data_frames()[0]
        # Select only relevant columns
        relevant_columns = [
            "PLAYER_ID", "PLAYER", "TEAM_ID", "TEAM", "GP", "MIN", "FGM", "FGA",
            "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "OREB",
            "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "EFF"
        ]
        df_relevant_leaders = df_leaders[relevant_columns]
        df_relevant_leaders.to_csv("nba_league_leaders_relevant.csv", index=False)
    else:
        print("No league leaders data fetched.")

get_league_leaders()
print("League leaders data stored.")

Fetching league leaders data...
League leaders data stored.


In [49]:
def get_player_career_stats():
    """Fetches career stats for players."""
    print("Fetching player career stats...")
    career_stats_data = []
    nba_teams = teams.get_teams()
    for team in nba_teams:
        players = team.get("players", [])
        for player in players:
            career_stats = fetch_with_retries(PlayerCareerStats, player_id=player["id"], timeout=60)
            if career_stats:
                df_career_stats = career_stats.get_data_frames()[0]
                # Keep only relevant columns
                df_career_stats = df_career_stats[[
                    "PLAYER_ID", "PLAYER_NAME", "GP", "PTS", "REB", "AST", "FG_PCT", "FG3_PCT", "FT_PCT"
                ]]
                career_stats_data.append(df_career_stats)
                time.sleep(0.6)

    if career_stats_data:
        df_all_career_stats = pd.concat(career_stats_data, ignore_index=True)
        df_all_career_stats.to_csv("nba_player_career_stats.csv", index=False)
    else:
        print("No player career stats data fetched.")

get_player_career_stats()
print("Player career stats data stored.")

Fetching player career stats...
No player career stats data fetched.
Player career stats data stored.


In [50]:
import pandas as pd

# Load player game logs from the CSV file
file_path = "nba_player_game_logs.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# Exclude non-numerical columns explicitly
no_aggregate_columns = ['PLAYER_ID', 'SEASON_YEAR', 'PLAYER_NAME', 'TEAM_ID']  # Adjust as necessary
numerical_columns = [col for col in df.columns if col not in no_aggregate_columns]

# Group by PLAYER_ID and SEASON_YEAR
grouped = df.groupby(['PLAYER_ID', 'SEASON_YEAR'])

# Aggregate numerical columns using mean and count the number of games
aggregated_data = grouped[numerical_columns].mean().reset_index()

# Add non-numerical columns using the first value in the group (like TEAM_ID)
aggregated_data['TEAM_ID'] = grouped['TEAM_ID'].first().values

# Add games played as a new column
aggregated_data['GAMES_PLAYED'] = grouped.size().values

# Save the aggregated data for further use
output_path = "nba_player_aggregated_data.csv"
aggregated_data.to_csv(output_path, index=False)

print(f"Aggregated data saved to '{output_path}'.")


Aggregated data saved to 'nba_player_aggregated_data.csv'.


In [51]:
import pandas as pd

# Load aggregated player data
player_aggregated_file = "nba_player_aggregated_data.csv"  # Update with your actual file path
team_abbreviation_file = "nba_team_data.csv"  # File containing TEAM_ID to TEAM_ABBREVIATION mapping

# Load player data and team abbreviation mapping
player_df = pd.read_csv(player_aggregated_file)
team_data_df = pd.read_csv(team_abbreviation_file)

# Merge team abbreviations into player data
player_df = player_df.merge(team_data_df, on="TEAM_ID", how="left")

# Combine TEAM_ABBREVIATION and SEASON_YEAR into a new column
player_df['TEAM_SEASON'] = player_df['TEAM_ABBREVIATION'] + ":" + player_df['SEASON_YEAR'].astype(str)

# Drop the original TEAM_ABBREVIATION and SEASON_YEAR columns
player_df.drop(columns=['TEAM_ABBREVIATION', 'SEASON_YEAR', 'GAME_ID'], inplace=True)

# Define non-numerical columns to exclude
no_aggregate_columns = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_SEASON']
numerical_columns = [col for col in player_df.columns if col not in no_aggregate_columns]

# Multiply each player's stats by their 'MIN' to weight the statistics
for col in numerical_columns:
    player_df[f"{col}_WEIGHTED"] = player_df[col] * player_df['MIN']

# Group by TEAM_SEASON
grouped = player_df.groupby(['TEAM_SEASON'])

# Compute team-level weighted stats as the sum of weighted stats divided by the total 'MIN'
team_aggregated_data = grouped[[f"{col}_WEIGHTED" for col in numerical_columns]].sum()
team_aggregated_data.columns = numerical_columns  # Rename back to original column names

# Compute total minutes played by the team
team_aggregated_data['TOTAL_MIN'] = grouped['MIN'].sum()

# Normalize weighted stats by dividing by TOTAL_MIN
for col in numerical_columns:
    team_aggregated_data[col] = team_aggregated_data[col] / team_aggregated_data['TOTAL_MIN']

# Add additional columns
team_aggregated_data['TEAM_GAMES_PLAYED'] = grouped['GAMES_PLAYED'].sum()  # Total games played by players in the team

# Reset index to flatten the DataFrame
team_aggregated_data.reset_index(inplace=True)

# Save the aggregated data for further use
output_path = "nba_team_aggregated_data.csv"
team_aggregated_data.to_csv(output_path, index=False)

print(f"Team aggregated data saved to '{output_path}'.")
print(team_aggregated_data.head())


Team aggregated data saved to 'nba_team_aggregated_data.csv'.
  TEAM_SEASON        PTS       REB       AST       STL       BLK        MIN  \
0    ATL:2024  12.517602  4.603445  3.263720  1.106791  0.523868  25.093177   
1    BKN:2024  11.159203  3.935153  2.490732  0.698381  0.351855  23.434451   
2    BOS:2024  14.026085  4.899728  2.913664  0.838082  0.591792  26.306693   
3    CHA:2024  11.674453  4.483149  2.554467  0.774399  0.501070  24.043847   
4    CHI:2024  11.855969  4.358064  3.096378  0.742207  0.404245  23.668120   

     FG_PCT   FG3_PCT    FT_PCT       TOV        PF  GAMES_PLAYED   TOTAL_MIN  \
0  0.449480  0.287977  0.482819  1.688143  1.883960     24.575455  334.073158   
1  0.414251  0.246777  0.479832  1.454625  2.010277     24.030250  298.116057   
2  0.424780  0.289157  0.447614  1.286885  1.740272     25.912215  317.803788   
3  0.408512  0.255962  0.456263  1.621471  2.083410     19.883753  390.016917   
4  0.434351  0.330585  0.453331  1.496984  1.688131     26

In [52]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

In [53]:
def extract_season_year(game_date):
    """
    Extracts the year from a game date in the format 'YYYY-MM-DDTHH:MM:SS'.

    Args:
        game_date (str): The game date string.

    Returns:
        str: The year as a string.
    """
    try:
        return pd.to_datetime(game_date).year
    except Exception as e:
        print(f"Error parsing date '{game_date}': {e}")
        return None

In [54]:
import pandas as pd
import numpy as np

def prepare_dataset(game_logs_file, features_file):
    """
    Prepare dataset for training using team-specific features.

    Args:
        game_logs_file: CSV file containing game logs with TEAM1, TEAM2, GAME_DATE, and WL columns.
        features_file: CSV file containing aggregated team features.

    Returns:
        X: Feature matrix for training.
        y: Target vector (win/loss).
        feature_columns: List of feature names used in the dataset.
    """
    try:
        # Load game logs
        game_logs = pd.read_csv(game_logs_file, parse_dates=["GAME_DATE"])  # Ensure GAME_DATE is parsed


        # Load team features
        team_features = pd.read_csv(features_file)

        # Normalize key columns
        game_logs["TEAM_SEASON1"] = game_logs["TEAM_SEASON1"].str.strip().str.upper()
        game_logs["TEAM_SEASON2"] = game_logs["TEAM_SEASON2"].str.strip().str.upper()

        # Merge features for TEAM1
        game_logs = game_logs.merge(
            team_features.add_suffix("_TEAM1"),
            left_on=["TEAM_SEASON1"],
            right_on=["TEAM_SEASON_TEAM1"],
            how="left"
        )

        # Merge features for TEAM2
        game_logs = game_logs.merge(
            team_features.add_suffix("_TEAM2"),
            left_on=["TEAM_SEASON2"],
            right_on=["TEAM_SEASON_TEAM2"],
            how="left"
        )

        # Drop unnecessary columns
        columns_to_drop = ["TEAM_SEASON_TEAM1", "TEAM_SEASON_TEAM2", "GAME_DATE"]
        game_logs.drop(columns=[col for col in columns_to_drop if col in game_logs.columns], inplace=True)

        # Handle missing values
        game_logs.fillna(0, inplace=True)

        # Extract features and target
        feature_columns = game_logs.select_dtypes(include=np.number).columns.difference(["WL"])
        X = game_logs[feature_columns].to_numpy()
        y = (game_logs["WL"] == "W").astype(int).to_numpy()  # Convert "W"/"L" to 1/0

        return X, y, feature_columns

    except Exception as e:
        print("An error occurred in prepare_dataset:", e)
        raise


In [67]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
import joblib
import tensorflow.keras.backend as K
from keras.callbacks import EarlyStopping
from keras.regularizers import l2

def custom_accuracy(y_true, y_pred):
    """
    Custom accuracy metric to evaluate the model based on given conditions.
    """
    condition_1 = K.cast(y_pred < 0.5, dtype="float32") * K.cast(y_true == 0, dtype="float32")
    condition_2 = K.cast(y_pred >= 0.5, dtype="float32") * K.cast(y_true == 1, dtype="float32")
    return K.mean(condition_1 + condition_2)


def prepare_dataset(game_logs_file, features_file):
    """
    Prepare dataset for training using team-specific features.
    """
    try:
        # Load game logs and team features
        game_logs = pd.read_csv(game_logs_file, parse_dates=["GAME_DATE"])
        team_features = pd.read_csv(features_file)

        # Normalize key columns
        game_logs["TEAM_SEASON1"] = game_logs["TEAM_SEASON1"].str.strip().str.upper()
        game_logs["TEAM_SEASON2"] = game_logs["TEAM_SEASON2"].str.strip().str.upper()

        # Merge features for TEAM1 and TEAM2
        game_logs = game_logs.merge(
            team_features.add_suffix("_TEAM1"),
            left_on=["TEAM_SEASON1"],
            right_on=["TEAM_SEASON_TEAM1"],
            how="left"
        ).merge(
            team_features.add_suffix("_TEAM2"),
            left_on=["TEAM_SEASON2"],
            right_on=["TEAM_SEASON_TEAM2"],
            how="left"
        )

        # Drop unnecessary columns
        columns_to_drop = ["TEAM_SEASON_TEAM1", "TEAM_SEASON_TEAM2", "GAME_DATE"]
        game_logs.drop(columns=[col for col in columns_to_drop if col in game_logs.columns], inplace=True)

        # Handle missing values
        game_logs.fillna(0, inplace=True)

        # Extract features and target
        feature_columns = game_logs.select_dtypes(include=np.number).columns.difference(["WL"])
        X = game_logs[feature_columns].to_numpy()
        y = (game_logs["WL"] == "W").astype(int).to_numpy()

        return X, y, feature_columns

    except Exception as e:
        print("An error occurred in prepare_dataset:", e)
        raise

def build_neural_network(input_shape):
    """
    Build a neural network model with added regularization.
    """
    model = Sequential([
        Input(shape=(input_shape,)),
        Dense(32, activation="relu"),  # Added L2 regularization
        Dropout(0.75),  # Increased dropout rate
        Dense(16, activation="relu"),
        Dropout(0.5),
        Dense(8, activation="relu"),
        Dropout(0.25),
        Dense(4, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),  # Reduced learning rate
                  loss="binary_crossentropy",
                  metrics=[custom_accuracy])

    print(model.summary())

    return model


def train_model(X, y, save_data_prefix="train_data"):
    """
    Train a neural network model and save the processed data.
    """
    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Handle class imbalance
    X_resampled, y_resampled = SMOTE().fit_resample(X_scaled, y)

    # Save processed training data
    pd.DataFrame(X_resampled).to_csv(f"{save_data_prefix}_X.csv", index=False, header=False)
    pd.DataFrame(y_resampled).to_csv(f"{save_data_prefix}_y.csv", index=False, header=False)

    # Debugging output
    print(f"Processed training data saved to {save_data_prefix}_X.csv and {save_data_prefix}_y.csv.")
    print(f"Resampled Features Shape: {X_resampled.shape}, Resampled Target Shape: {y_resampled.shape}")

    # Build the model
    model = build_neural_network(X_resampled.shape[1])

    # Early stopping to prevent overfitting
    early_stopping = EarlyStopping(monitor="val_loss", patience=16, restore_best_weights=True)

    # Train the model
    model.fit(X_resampled, y_resampled,
              epochs=64,
              batch_size=1,
              validation_split=0.2,
              callbacks=[early_stopping])

    return model, scaler



def main():
    game_logs_file = "nba_game_logs.csv"
    features_file = "nba_team_aggregated_data.csv"

    # Prepare the dataset
    X, y, feature_columns = prepare_dataset(game_logs_file, features_file)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Validate split
    assert X_train.shape[0] == y_train.shape[0], "Mismatch in training data sizes."
    assert X_test.shape[0] == y_test.shape[0], "Mismatch in test data sizes."

    # Train neural network
    model, scaler = train_model(X_train, y_train)

    # Save the model and scaler
    model.save("model.keras")
    joblib.dump(scaler, "scaler.pkl")

    # Evaluate the model
    X_test_scaled = scaler.transform(X_test)
    test_loss, test_custom_accuracy = model.evaluate(X_test_scaled, y_test)
    print(f"Test Loss: {test_loss}, Test Custom Accuracy: {test_custom_accuracy}")

    # Predictions
    y_pred = model.predict(X_test_scaled)
    pd.DataFrame({"Actual": y_test.flatten(), "Predicted": y_pred.flatten()}).to_csv("predictions.csv", index=False)
    print("Predictions saved to predictions.csv")


if __name__ == "__main__":
    main()


Processed training data saved to train_data_X.csv and train_data_y.csv.
Resampled Features Shape: (804, 29), Resampled Target Shape: (804,)




None
Epoch 1/64
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - custom_accuracy: 0.5173 - loss: 1.0686 - val_custom_accuracy: 0.5404 - val_loss: 0.6892
Epoch 2/64
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - custom_accuracy: 0.5115 - loss: 0.7348 - val_custom_accuracy: 0.5404 - val_loss: 0.6891
Epoch 3/64
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - custom_accuracy: 0.5041 - loss: 0.7283 - val_custom_accuracy: 0.5404 - val_loss: 0.6895
Epoch 4/64
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - custom_accuracy: 0.5414 - loss: 0.7173 - val_custom_accuracy: 0.5404 - val_loss: 0.6899
Epoch 5/64
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - custom_accuracy: 0.5042 - loss: 0.7048 - val_custom_accuracy: 0.5404 - val_loss: 0.6899
Epoch 6/64
[1m643/643[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - custom_accuracy: 0.5826 - l

In [60]:
import pandas as pd
import numpy as np
import joblib
from nba_api.stats.static import teams
from keras.models import load_model


def prepare_features(team_season1, team_season2, features_file, scaler, feature_columns):
    """
    Prepare input features for prediction by combining team-specific features.
    """
    try:
        # Load team features
        team_features = pd.read_csv(features_file)

        # Normalize team names
        team_features["TEAM_SEASON"] = team_features["TEAM_SEASON"].str.strip().str.upper()
        team_season1 = team_season1.strip().upper()
        team_season2 = team_season2.strip().upper()

        # Extract features for the two teams
        features_team1 = team_features[team_features["TEAM_SEASON"] == team_season1].add_suffix("_TEAM1")
        features_team2 = team_features[team_features["TEAM_SEASON"] == team_season2].add_suffix("_TEAM2")

        if features_team1.empty or features_team2.empty:
            raise ValueError(f"Features for {team_season1} or {team_season2} not found in the file.")

        # Combine features
        combined_features = pd.concat([features_team1.reset_index(drop=True),
                                        features_team2.reset_index(drop=True)], axis=1)

        # Align with feature_columns and fill missing values with 0
        combined_features = combined_features.reindex(columns=feature_columns, fill_value=0)

        # Scale features
        X = combined_features.to_numpy()
        X_scaled = scaler.transform(X)

        return X_scaled

    except Exception as e:
        print("An error occurred in prepare_features:", e)
        raise


def predict(team_season1, team_season2, model_path, scaler_path, features_file, feature_columns):
    """
    Predict the probability of Team 1 beating Team 2.
    """
    try:
        # Load the model and scaler
        model = load_model(model_path, custom_objects={"custom_accuracy": custom_accuracy})
        scaler = joblib.load(scaler_path)

        # Prepare the input features
        X_scaled = prepare_features(team_season1, team_season2, features_file, scaler, feature_columns)

        # Make predictions
        probability = model.predict(X_scaled).flatten()[0]

        print(f"Probability of {team_season1} beating {team_season2}: {probability:.2%}")
        return probability

    except Exception as e:
        print("An error occurred in predict:", e)
        raise


def custom_accuracy(y_true, y_pred):
    """
    Custom accuracy metric to evaluate the model based on given conditions.
    """
    import tensorflow.keras.backend as K
    condition_1 = K.cast(y_pred < 0.5, dtype="float32") * K.cast(y_true == 0, dtype="float32")
    condition_2 = K.cast(y_pred >= 0.5, dtype="float32") * K.cast(y_true == 1, dtype="float32")
    return K.mean(condition_1 + condition_2)


def display_team_data():
    """
    Display available team abbreviations and names for user reference.
    """
    nba_teams = teams.get_teams()
    print("Available NBA Teams:")
    for team in nba_teams:
        print(f"{team['abbreviation']} - {team['full_name']}")


if __name__ == "__main__":
    display_team_data()
    # Example usage
    team_season1 = input("Enter TEAM_SEASON like team:season (e.g., GSW:2024): ")
    team_season2 = input("Enter TEAM_SEASON like team:season (e.g., PHI:2024): ")
    model_path = "model.keras"
    scaler_path = "scaler.pkl"
    features_file = "nba_team_aggregated_data.csv"

    # Load feature columns from the training process
    feature_columns = pd.read_csv("train_data_X.csv", nrows=0).columns.tolist()

    predict(team_season1, team_season2, model_path, scaler_path, features_file, feature_columns)


Enter TEAM_SEASON like team:season (e.g., GSW:2024): GSW:2024
Enter TEAM_SEASON like team:season (e.g., PHI:2024): PHI:2024
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Probability of GSW:2024 beating PHI:2024: 0.00%


In [None]:
import numpy as np
import pandas as pd
import joblib
from tensorflow.keras.models import load_model


def load_feature_names(feature_names_file="feature_names.pkl"):
    """
    Load saved feature names for feature alignment during prediction.
    """
    feature_names = joblib.load(feature_names_file)

    # Print feature names
    print("Feature Names Used in the Model:")
    print(feature_names)

    return feature_names


def fetch_team_features(team_season, features_file, feature_names):
    """
    Retrieve the team-specific features for the given team_season.

    Args:
        team_season: Combination of team abbreviation and season (e.g., 'LAL_2023').
        features_file: CSV file containing the aggregated team features.
        feature_names: List of features expected by the model.

    Returns:
        numpy array of the team's features.
    """
    team_features = pd.read_csv(features_file)
    team_row = team_features[team_features["TEAM_SEASON"] == team_season.upper()]

    if team_row.empty:
        raise ValueError(f"Features for '{team_season}' not found in {features_file}.")

    # Ensure only numeric features are returned
    numeric_features = team_row[feature_names].apply(pd.to_numeric, errors='coerce')

    return numeric_features.to_numpy().flatten()


from tensorflow.keras.models import load_model

def predict_matchup_win_probability(team1_season, team2_season, features_file, model_path="model.keras", scaler_path="scaler.pkl"):
    """
    Predict the win probability for Team 1 in a matchup against Team 2.

    Args:
        team1_season: TEAM_SEASON string for Team 1 (e.g., 'LAL_2023').
        team2_season: TEAM_SEASON string for Team 2 (e.g., 'BOS_2023').
        features_file: CSV file containing aggregated team features.
        model_path: Path to the trained neural network model file (saved as .keras).
        scaler_path: Path to the scaler file for feature normalization.
    """
    # Load model, scaler, and feature names
    model = load_model(model_path)  # Updated to reflect .keras model format
    scaler = joblib.load(scaler_path)
    feature_names = load_feature_names()

    # Fetch features for both teams
    team1_features = fetch_team_features(team1_season, features_file, feature_names)
    team2_features = fetch_team_features(team2_season, features_file, feature_names)

    # Create matchup feature differences and ratios
    matchup_features = np.concatenate([
        team1_features - team2_features,
        team1_features / (team2_features + 1e-5)  # Avoid division by zero
    ]).reshape(1, -1)

    # Scale the matchup features
    matchup_features_scaled = scaler.transform(matchup_features)

    # Predict win probability for Team 1
    win_probability = model.predict(matchup_features_scaled)[0][0]

    print(f"\nWin Probability for {team1_season} vs {team2_season}: {win_probability * 100:.2f}%")





def main():
    """Main function to handle user input and prediction."""
    features_file = "features.csv"  # Path to the features file

    # Display team data before taking user input
    display_team_data()

    team1_abbreviation = input("Enter Team 1 abbreviation (e.g., 'LAL' for Los Angeles Lakers): ").strip()
    season1 = input("Enter Team 1 season (e.g., '2023'): ").strip()
    team1_season = f"{team1_abbreviation.upper()}_{season1}"

    team2_abbreviation = input("Enter Team 2 abbreviation (e.g., 'BOS' for Boston Celtics): ").strip()
    season2 = input("Enter Team 2 season (e.g., '2023'): ").strip()
    team2_season = f"{team2_abbreviation.upper()}_{season2}"

    try:
        predict_matchup_win_probability(team1_season, team2_season, features_file)
    except ValueError as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
