<a href="https://colab.research.google.com/github/krishna-kenny/nbaWinNeuralNetModel/blob/main/nba.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install nba_api



In [21]:
import time
import pandas as pd
from nba_api.stats.endpoints import TeamInfoCommon, TeamGameLogs, PlayerGameLogs, LeagueGameFinder, LeagueLeaders, PlayerCareerStats
from nba_api.stats.static import teams

# Maximum number of retries for each API call
MAX_RETRIES = 3
# Define the list of seasons
seasons = ["2021-22", "2022-23", "2023-24", "2024-25"]

In [22]:
def fetch_with_retries(func, *args, **kwargs):
    """Attempts a function call up to MAX_RETRIES with exponential backoff."""
    for attempt in range(MAX_RETRIES):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            wait_time = 2**attempt  # Exponential backoff
            print(f"Error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
    print(f"Failed after {MAX_RETRIES} attempts.")
    return None

In [23]:
def get_team_info(seasons):
    """Fetches relevant team information for the specified seasons."""
    print("Fetching team information...")
    nba_teams = teams.get_teams()
    team_data = []

    for team in nba_teams:
        team_info = fetch_with_retries(
            TeamInfoCommon,
            team_id=team["id"],
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if team_info:
            df_team = team_info.get_data_frames()[0]
            df_team = df_team[["TEAM_ID", "TEAM_ABBREVIATION"]]  # Only keep relevant features
            team_data.append(df_team)
            time.sleep(0.6)  # Delay to avoid API rate limits

    if team_data:
        df_teams = pd.concat(team_data, ignore_index=True)
        df_teams.to_csv("nba_team_data.csv", index=False)
    else:
        print("No team data fetched.")

# Run functions to save data to CSV files
get_team_info(seasons)
print("Team information data stored.")

Fetching team information...
Team information data stored.


In [24]:
def get_team_game_logs(seasons):
    """Fetches team game logs for the specified seasons and processes the MATCHUP column."""
    print("Fetching team game logs...")
    game_log_data = []

    for season in seasons:
        game_logs = fetch_with_retries(
            TeamGameLogs,
            season_nullable=season,
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if game_logs:
            df_game_logs = game_logs.get_data_frames()[0]
            # Keep only relevant columns
            df_game_logs = df_game_logs[["GAME_ID", "GAME_DATE", "MATCHUP", "WL"]]
            game_log_data.append(df_game_logs)
            time.sleep(0.6)  # Delay to respect rate limits

    if game_log_data:
        # Concatenate all game logs
        df_all_game_logs = pd.concat(game_log_data, ignore_index=True)

        # Process MATCHUP column to create team1 and team2 columns
        matchups_split = df_all_game_logs['MATCHUP'].str.split(' @ | vs. ', expand=True)
        df_all_game_logs['TEAM1'] = matchups_split[0]
        df_all_game_logs['TEAM2'] = matchups_split[1]

        # Drop the original MATCHUP column if no longer needed
        df_all_game_logs.drop(columns=['MATCHUP'], inplace=True)

        # Extract and add SEASON_YEAR
        df_all_game_logs['SEASON_YEAR'] = pd.to_datetime(df_all_game_logs['GAME_DATE']).dt.year.astype(str)

        # Create combined TEAM_SEASON columns
        df_all_game_logs['TEAM_SEASON1'] = df_all_game_logs['TEAM1'] + ':' + df_all_game_logs['SEASON_YEAR']
        df_all_game_logs['TEAM_SEASON2'] = df_all_game_logs['TEAM2'] + ':' + df_all_game_logs['SEASON_YEAR']

        # Drop the original TEAM1, TEAM2, and SEASON_YEAR columns if no longer needed
        df_all_game_logs.drop(columns=['TEAM1', 'TEAM2', 'SEASON_YEAR'], inplace=True)

        # Save the processed DataFrame to a CSV file
        df_all_game_logs.to_csv("nba_game_logs.csv", index=False)
        print("Processed game logs saved to 'nba_game_logs.csv'.")
    else:
        print("No game log data fetched.")

get_team_game_logs(seasons)
print("Team game logs data stored.")


Fetching team game logs...
Processed game logs saved to 'nba_game_logs.csv'.
Team game logs data stored.


In [25]:
def get_player_game_logs(seasons):
    """Fetches player game logs for the specified seasons."""
    print("Fetching player game logs...")
    player_game_log_data = []

    for season in seasons:
        player_game_logs = fetch_with_retries(
            PlayerGameLogs,
            season_nullable=season,
            season_type_nullable="Regular Season",
            timeout=60,
        )
        if player_game_logs:
            df_player_game_logs = player_game_logs.get_data_frames()[0]
            # Keep only relevant columns
            df_player_game_logs = df_player_game_logs[[
                "SEASON_YEAR", "GAME_ID", "TEAM_ID", "PLAYER_ID", "PLAYER_NAME", "PTS", "REB", "AST", "STL", "BLK",
                "MIN", "FG_PCT", "FG3_PCT", "FT_PCT", "TOV", "PF"
            ]]
            # Modify SEASON_YEAR to keep only the first 4 characters
            df_player_game_logs["SEASON_YEAR"] = df_player_game_logs["SEASON_YEAR"].str[:4]
            player_game_log_data.append(df_player_game_logs)
            time.sleep(0.6)

    if player_game_log_data:
        df_all_player_game_logs = pd.concat(player_game_log_data, ignore_index=True)
        df_all_player_game_logs.to_csv("nba_player_game_logs.csv", index=False)
    else:
        print("No player game log data fetched.")

# Call the function with the specified seasons
get_player_game_logs(seasons)
print("Player game logs data stored.")

Fetching player game logs...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_player_game_logs["SEASON_YEAR"] = df_player_game_logs["SEASON_YEAR"].str[:4]


Player game logs data stored.


In [26]:
def get_league_game_data():
    """Fetches league-wide game data with relevant features for a neural network."""
    print("Fetching league game data for NN...")
    game_data = fetch_with_retries(LeagueGameFinder, timeout=60)
    if game_data:
        df_game_data = game_data.get_data_frames()[0]
        # Relevant columns for neural network input
        relevant_columns = [
            "SEASON_ID", "TEAM_ID", "TEAM_ABBREVIATION", "TEAM_NAME", "GAME_ID",
            "GAME_DATE", "MATCHUP", "WL", "MIN", "PTS", "FGM", "FGA", "FG_PCT",
            "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "OREB", "DREB",
            "REB", "AST", "STL", "BLK", "TOV", "PF", "PLUS_MINUS"
        ]
        df_nn_data = df_game_data[relevant_columns]
        df_nn_data.to_csv("nba_league_game.csv", index=False)
    else:
        print("No league game data fetched.")

get_league_game_data()
print("League game data stored.")

Fetching league game data for NN...
League game data stored.


In [27]:
def get_league_leaders():
    """Fetches league leaders data with relevant columns for analysis."""
    print("Fetching league leaders data...")
    leaders_data = fetch_with_retries(LeagueLeaders, timeout=60)
    if leaders_data:
        df_leaders = leaders_data.get_data_frames()[0]
        # Select only relevant columns
        relevant_columns = [
            "PLAYER_ID", "PLAYER", "TEAM_ID", "TEAM", "GP", "MIN", "FGM", "FGA",
            "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "OREB",
            "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "EFF"
        ]
        df_relevant_leaders = df_leaders[relevant_columns]
        df_relevant_leaders.to_csv("nba_league_leaders_relevant.csv", index=False)
    else:
        print("No league leaders data fetched.")

get_league_leaders()
print("League leaders data stored.")

Fetching league leaders data...
League leaders data stored.


In [28]:
def get_player_career_stats():
    """Fetches career stats for players."""
    print("Fetching player career stats...")
    career_stats_data = []
    nba_teams = teams.get_teams()
    for team in nba_teams:
        players = team.get("players", [])
        for player in players:
            career_stats = fetch_with_retries(PlayerCareerStats, player_id=player["id"], timeout=60)
            if career_stats:
                df_career_stats = career_stats.get_data_frames()[0]
                # Keep only relevant columns
                df_career_stats = df_career_stats[[
                    "PLAYER_ID", "PLAYER_NAME", "GP", "PTS", "REB", "AST", "FG_PCT", "FG3_PCT", "FT_PCT"
                ]]
                career_stats_data.append(df_career_stats)
                time.sleep(0.6)

    if career_stats_data:
        df_all_career_stats = pd.concat(career_stats_data, ignore_index=True)
        df_all_career_stats.to_csv("nba_player_career_stats.csv", index=False)
    else:
        print("No player career stats data fetched.")

get_player_career_stats()
print("Player career stats data stored.")

Fetching player career stats...
No player career stats data fetched.
Player career stats data stored.


In [29]:
import pandas as pd

# Load player game logs from the CSV file
file_path = "nba_player_game_logs.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# Load league leaders data (assumed to have 'PLAYER_ID' column)
league_leaders_path = "nba_league_leaders_relevant.csv"  # Update with your actual file path
league_leaders_df = pd.read_csv(league_leaders_path)

# Exclude non-numerical columns explicitly
no_aggregate_columns = ['PLAYER_ID', 'SEASON_YEAR', 'PLAYER_NAME', 'TEAM_ID']  # Adjust as necessary
numerical_columns = [col for col in df.columns if col not in no_aggregate_columns]

# Group by PLAYER_ID and SEASON_YEAR
grouped = df.groupby(['PLAYER_ID', 'SEASON_YEAR'])

# Aggregate numerical columns using mean and count the number of games
aggregated_data = grouped[numerical_columns].mean().reset_index()

# Add non-numerical columns using the first value in the group (like TEAM_ID)
aggregated_data['TEAM_ID'] = grouped['TEAM_ID'].first().values

# Add games played as a new column
aggregated_data['GAMES_PLAYED'] = grouped.size().values

# Mark league leaders (ignoring SEASON_YEAR)
league_leader_set = set(league_leaders_df['PLAYER_ID'])

# Add a column to indicate whether the player is a league leader
aggregated_data['LEAGUE_LEADER'] = aggregated_data['PLAYER_ID'].apply(
    lambda player_id: 1 if player_id in league_leader_set else 0
)

# Save the aggregated data for further use
output_path = "nba_player_aggregated_data.csv"
aggregated_data.to_csv(output_path, index=False)

print(f"Aggregated data saved to '{output_path}'.")



Aggregated data saved to 'nba_player_aggregated_data.csv'.


In [30]:
import pandas as pd

# Load aggregated player data
player_aggregated_file = "nba_player_aggregated_data.csv"  # Update with your actual file path
team_abbreviation_file = "nba_team_data.csv"  # File containing TEAM_ID to TEAM_ABBREVIATION mapping

# Load player data and team abbreviation mapping
player_df = pd.read_csv(player_aggregated_file)
team_data_df = pd.read_csv(team_abbreviation_file)

# Merge team abbreviations into player data
player_df = player_df.merge(team_data_df, on="TEAM_ID", how="left")

# Combine TEAM_ABBREVIATION and SEASON_YEAR into a new column
player_df['TEAM_SEASON'] = player_df['TEAM_ABBREVIATION'] + ":" + player_df['SEASON_YEAR'].astype(str)

# Drop the original TEAM_ABBREVIATION and SEASON_YEAR columns
player_df.drop(columns=['TEAM_ABBREVIATION', 'SEASON_YEAR'], inplace=True)

# Define non-numerical columns to exclude
no_aggregate_columns = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_ID', 'TEAM_SEASON']
numerical_columns = [col for col in player_df.columns if col not in no_aggregate_columns]

# Multiply each player's stats by their 'MIN' to weight the statistics
for col in numerical_columns:
    player_df[f"{col}_WEIGHTED"] = player_df[col] * player_df['MIN']

# Group by TEAM_SEASON
grouped = player_df.groupby(['TEAM_SEASON'])

# Compute team-level weighted stats as the sum of weighted stats divided by the total 'MIN'
team_aggregated_data = grouped[[f"{col}_WEIGHTED" for col in numerical_columns]].sum()
team_aggregated_data.columns = numerical_columns  # Rename back to original column names

# Compute total minutes played by the team
team_aggregated_data['TOTAL_MIN'] = grouped['MIN'].sum()

# Normalize weighted stats by dividing by TOTAL_MIN
for col in numerical_columns:
    team_aggregated_data[col] = team_aggregated_data[col] / team_aggregated_data['TOTAL_MIN']

# Add additional columns
team_aggregated_data['TEAM_GAMES_PLAYED'] = grouped['GAMES_PLAYED'].sum()  # Total games played by players in the team

# Reset index to flatten the DataFrame
team_aggregated_data.reset_index(inplace=True)

# Save the aggregated data for further use
output_path = "nba_team_aggregated_data.csv"
team_aggregated_data.to_csv(output_path, index=False)

print(f"Team aggregated data saved to '{output_path}'.")
print(team_aggregated_data.head())


Team aggregated data saved to 'nba_team_aggregated_data.csv'.
  TEAM_SEASON       GAME_ID        PTS       REB       AST       STL  \
0    ATL:2021  2.210059e+07  11.151654  4.278361  2.202601  0.619072   
1    ATL:2022  2.220061e+07  12.611278  4.476498  2.664374  0.741768   
2    ATL:2023  2.230063e+07  12.870024  4.654347  3.075042  0.815135   
3    ATL:2024  2.240027e+07  12.458611  4.606468  3.245496  1.066465   
4    BKN:2021  2.210061e+07  12.753156  4.383505  2.636392  0.741647   

        BLK        MIN    FG_PCT   FG3_PCT    FT_PCT       TOV        PF  \
0  0.388194  23.924721  0.454031  0.270012  0.441140  0.984951  1.973915   
1  0.452792  24.990882  0.459390  0.237846  0.510258  1.316131  1.838686   
2  0.460030  25.954890  0.442187  0.280925  0.491170  1.439518  1.953551   
3  0.521157  25.086680  0.447500  0.286389  0.489835  1.714349  1.896854   
4  0.520330  25.668016  0.448659  0.260247  0.435397  1.412261  2.072285   

   GAMES_PLAYED  LEAGUE_LEADER   TOTAL_MIN  TEAM

In [31]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

In [32]:
def extract_season_year(game_date):
    """
    Extracts the year from a game date in the format 'YYYY-MM-DDTHH:MM:SS'.

    Args:
        game_date (str): The game date string.

    Returns:
        str: The year as a string.
    """
    try:
        return pd.to_datetime(game_date).year
    except Exception as e:
        print(f"Error parsing date '{game_date}': {e}")
        return None

In [33]:
import pandas as pd
import numpy as np

def prepare_dataset(game_logs_file, features_file):
    """
    Prepare dataset for training using team-specific features.

    Args:
        game_logs_file: CSV file containing game logs with TEAM1, TEAM2, GAME_DATE, and WL columns.
        features_file: CSV file containing aggregated team features.

    Returns:
        X: Feature matrix for training.
        y: Target vector (win/loss).
        feature_columns: List of feature names used in the dataset.
    """
    try:
        # Load game logs
        game_logs = pd.read_csv(game_logs_file, parse_dates=["GAME_DATE"])  # Ensure GAME_DATE is parsed

        # Load team features
        team_features = pd.read_csv(features_file)

        # Normalize key columns
        game_logs["TEAM_SEASON1"] = game_logs["TEAM_SEASON1"].str.strip().str.upper()
        game_logs["TEAM_SEASON2"] = game_logs["TEAM_SEASON2"].str.strip().str.upper()

        # Merge features for TEAM1
        game_logs = game_logs.merge(
            team_features.add_suffix("_TEAM1"),
            left_on=["TEAM_SEASON1"],
            right_on=["TEAM_SEASON_TEAM1"],
            how="left"
        )

        # Merge features for TEAM2
        game_logs = game_logs.merge(
            team_features.add_suffix("_TEAM2"),
            left_on=["TEAM_SEASON2"],
            right_on=["TEAM_SEASON_TEAM2"],
            how="left"
        )

        # Drop unnecessary columns
        columns_to_drop = ["TEAM_SEASON_TEAM1", "TEAM_SEASON_TEAM2", "GAME_DATE"]
        game_logs.drop(columns=[col for col in columns_to_drop if col in game_logs.columns], inplace=True)

        # Handle missing values
        game_logs.fillna(0, inplace=True)

        # Extract features and target
        feature_columns = game_logs.select_dtypes(include=np.number).columns.difference(["WL"])
        X = game_logs[feature_columns].to_numpy()
        y = (game_logs["WL"] == "W").astype(int).to_numpy()  # Convert "W"/"L" to 1/0

        return X, y, feature_columns

    except Exception as e:
        print("An error occurred in prepare_dataset:", e)
        raise


In [38]:
import tensorflow.keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib

def custom_accuracy(y_true, y_pred):
    """
    Custom accuracy metric to evaluate the model based on given conditions.
    If the prediction is in [0, 0.5) and true label is 0, it's correct.
    If the prediction is in [0.5, 1] and true label is 1, it's correct.
    """
    condition_1 = K.cast(y_pred < 0.5, dtype="float32") * K.cast(y_true == 0, dtype="float32")
    condition_2 = K.cast(y_pred >= 0.5, dtype="float32") * K.cast(y_true == 1, dtype="float32")
    return K.mean(condition_1 + condition_2)

def build_neural_network(input_shape):
    """
    Build a neural network model.

    Args:
        input_shape: Number of input features.

    Returns:
        model: Compiled neural network model.
    """
    model = Sequential([
        Input(shape=(input_shape,)),  # Explicitly define input shape
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=[custom_accuracy])  # Use custom accuracy
    return model

def save_model_and_scaler(model, scaler, model_path="model.keras", scaler_path="scaler.pkl"):
    """
    Save trained model and scaler.
    """
    # Save the model in the recommended .keras format
    model.save(model_path)
    # Save the scaler using joblib
    joblib.dump(scaler, scaler_path)
    print(f"Model saved to {model_path}, Scaler saved to {scaler_path}")

def save_feature_names(feature_names, feature_names_file="feature_names.pkl"):
    """
    Save feature names for later use during prediction.
    """
    joblib.dump(feature_names, feature_names_file)
    print(f"Feature names saved to {feature_names_file}")

import pandas as pd

def train_model(X, y, save_data_prefix="train_data"):
    """
    Train a neural network model and save the processed data.

    Args:
        X: Feature matrix for training.
        y: Target vector (win/loss).
        save_data_prefix: Prefix for saving processed data.

    Returns:
        model: Trained neural network model.
        scaler: Fitted scaler for feature normalization.
    """
    # Normalize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Handle class imbalance
    X_resampled, y_resampled = SMOTE(random_state=42).fit_resample(X_scaled, y)

    # Save processed training data
    pd.DataFrame(X_resampled).to_csv(f"{save_data_prefix}_X.csv", index=False, header=False)
    pd.DataFrame(y_resampled).to_csv(f"{save_data_prefix}_y.csv", index=False, header=False)
    print(f"Processed training data saved to {save_data_prefix}_X.csv and {save_data_prefix}_y.csv.")

    # Build and train the model
    model = build_neural_network(X_resampled.shape[1])
    model.fit(X_resampled, y_resampled, epochs=16, batch_size=128, validation_split=0.2)

    return model, scaler

import pandas as pd

def save_predictions_and_actuals(y_true, y_pred, output_file="predictions.csv"):
    """
    Save predictions and actual values to a CSV file.

    Args:
        y_true: Array of actual target values.
        y_pred: Array of predicted target values.
        output_file: Name of the CSV file to save the data.
    """
    data = pd.DataFrame({
        "Actual": y_true.flatten(),
        "Predicted": y_pred.flatten()
    })
    data.to_csv(output_file, index=False)
    print(f"Predictions and actuals saved to {output_file}")

def main():
    game_logs_file = "nba_game_logs.csv"
    features_file = "nba_team_aggregated_data.csv"
    model_save_path = "model.keras"
    scaler_save_path = "scaler.pkl"
    predictions_file = "predictions.csv"

    # Prepare the dataset
    X, y, feature_columns = prepare_dataset(game_logs_file, features_file)

    if X.size == 0 or y.size == 0:
        print("No data available to train the model.")
        return

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Save test data
    pd.DataFrame(X_test).to_csv("test_data_X.csv", index=False, header=False)
    pd.DataFrame(y_test).to_csv("test_data_y.csv", index=False, header=False)
    print("Test data saved to test_data_X.csv and test_data_y.csv.")

    # Train neural network
    model, scaler = train_model(X_train, y_train)

    # Save the model and scaler
    save_model_and_scaler(model, scaler, model_save_path, scaler_save_path)

    # Save feature names after preparing the dataset
    save_feature_names(feature_columns.tolist())

    # Evaluate the model
    X_test_scaled = scaler.transform(X_test)
    test_loss, test_custom_accuracy = model.evaluate(X_test_scaled, y_test)
    print(f"Neural Network - Test Loss: {test_loss}, Test Custom Accuracy: {test_custom_accuracy}")

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Save predictions and actual values
    save_predictions_and_actuals(y_test, y_pred, predictions_file)

if __name__ == "__main__":
    main()


Test data saved to test_data_X.csv and test_data_y.csv.




Processed training data saved to train_data_X.csv and train_data_y.csv.
Epoch 1/16
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - custom_accuracy: 0.5004 - loss: 0.6855 - val_custom_accuracy: 0.5046 - val_loss: 0.6645
Epoch 2/16
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - custom_accuracy: 0.5017 - loss: 0.6606 - val_custom_accuracy: 0.5023 - val_loss: 0.6585
Epoch 3/16
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - custom_accuracy: 0.5011 - loss: 0.6535 - val_custom_accuracy: 0.5007 - val_loss: 0.6572
Epoch 4/16
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - custom_accuracy: 0.5011 - loss: 0.6452 - val_custom_accuracy: 0.5022 - val_loss: 0.6533
Epoch 5/16
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - custom_accuracy: 0.5011 - loss: 0.6476 - val_custom_accuracy: 0.4985 - val_loss: 0.6551
Epoch 6/16
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [35]:
import numpy as np
import pandas as pd
from nba_api.stats.static import teams
import joblib
from tensorflow.keras.models import load_model


def load_feature_names(feature_names_file="feature_names.pkl"):
    """
    Load saved feature names for feature alignment during prediction.
    """
    feature_names = joblib.load(feature_names_file)

    # Print feature names
    print("Feature Names Used in the Model:")
    print(feature_names)

    return feature_names


def get_team_id_by_abbreviation(team_abbreviation):
    """Retrieve the team ID by abbreviation."""
    nba_teams = teams.get_teams()
    for team in nba_teams:
        if team["abbreviation"].lower() == team_abbreviation.lower():
            return team["id"]
    raise ValueError(f"Team '{team_abbreviation}' not found! Please enter a valid abbreviation.")


def fetch_team_features(team_abbreviation, features_file, feature_names):
    """
    Retrieve the team-specific features for the given team abbreviation.

    Args:
        team_abbreviation: Abbreviation of the NBA team (e.g., 'LAL').
        features_file: CSV file containing the aggregated team features.
        feature_names: List of features expected by the model.

    Returns:
        numpy array of the team's features.
    """
    team_features = pd.read_csv(features_file)
    team_row = team_features[team_features["TEAM"] == team_abbreviation.upper()]

    if team_row.empty:
        raise ValueError(f"Features for team '{team_abbreviation}' not found in {features_file}.")

    # Ensure only numeric features are returned
    numeric_features = team_row[feature_names].apply(pd.to_numeric, errors='coerce')

    return numeric_features.to_numpy().flatten()


def predict_matchup_win_probability(team1_abbreviation, team2_abbreviation, features_file, model_path="model.h5", scaler_path="scaler.pkl"):
    """
    Predict the win probability for Team 1 in a matchup against Team 2.

    Args:
        team1_abbreviation: Abbreviation of Team 1 (e.g., 'LAL').
        team2_abbreviation: Abbreviation of Team 2 (e.g., 'BOS').
        features_file: CSV file containing aggregated team features.
        model_path: Path to the trained neural network model file.
        scaler_path: Path to the scaler file for feature normalization.
    """
    # Load model, scaler, and feature names
    model = load_model(model_path)
    scaler = joblib.load(scaler_path)
    feature_names = load_feature_names()

    # Fetch features for both teams
    team1_features = fetch_team_features(team1_abbreviation, features_file, feature_names)
    team2_features = fetch_team_features(team2_abbreviation, features_file, feature_names)

    # Create matchup feature differences and ratios
    matchup_features = np.concatenate([
        team1_features - team2_features,
        team1_features / (team2_features + 1e-5)  # Avoid division by zero
    ]).reshape(1, -1)

    # Scale the matchup features
    matchup_features_scaled = scaler.transform(matchup_features)

    # Predict win probability for Team 1
    win_probability = model.predict(matchup_features_scaled)[0][0]

    print(f"\nWin Probability for {team1_abbreviation} vs {team2_abbreviation}: {win_probability * 100:.2f}%")


def display_team_data():
    """
    Display available team abbreviations and names for user reference.
    """
    nba_teams = teams.get_teams()
    print("Available NBA Teams:")
    for team in nba_teams:
        print(f"{team['abbreviation']} - {team['full_name']}")


def main():
    """Main function to handle user input and prediction."""
    features_file = "features.csv"  # Path to the features file

    # Display team data before taking user input
    display_team_data()

    team1_abbreviation = input("Enter Team 1 abbreviation (e.g., 'LAL' for Los Angeles Lakers): ").strip()
    team2_abbreviation = input("Enter Team 2 abbreviation (e.g., 'BOS' for Boston Celtics): ").strip()

    try:
        predict_matchup_win_probability(team1_abbreviation, team2_abbreviation, features_file)
    except ValueError as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()


Available NBA Teams:
ATL - Atlanta Hawks
BOS - Boston Celtics
CLE - Cleveland Cavaliers
NOP - New Orleans Pelicans
CHI - Chicago Bulls
DAL - Dallas Mavericks
DEN - Denver Nuggets
GSW - Golden State Warriors
HOU - Houston Rockets
LAC - Los Angeles Clippers
LAL - Los Angeles Lakers
MIA - Miami Heat
MIL - Milwaukee Bucks
MIN - Minnesota Timberwolves
BKN - Brooklyn Nets
NYK - New York Knicks
ORL - Orlando Magic
IND - Indiana Pacers
PHI - Philadelphia 76ers
PHX - Phoenix Suns
POR - Portland Trail Blazers
SAC - Sacramento Kings
SAS - San Antonio Spurs
OKC - Oklahoma City Thunder
TOR - Toronto Raptors
UTA - Utah Jazz
MEM - Memphis Grizzlies
WAS - Washington Wizards
DET - Detroit Pistons
CHA - Charlotte Hornets
Enter Team 1 abbreviation (e.g., 'LAL' for Los Angeles Lakers): POR
Enter Team 2 abbreviation (e.g., 'BOS' for Boston Celtics): SAS


FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)