Inputs used to predict a game:
- Teams
- Who is at home
- Date
- The players playing in the next game
- Performance from the last ten games

In [1]:
import pandas as pd

# Load the games table from disk; later cells read it through the global name `df`.
# NOTE(review): unpickling can execute arbitrary code -- only load a file you trust.
df = pd.read_pickle('df_nba_games.pkl')

In [2]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,mp,fg,fga,fg%,3p,3pa,3p%,ft,fta,orb,...,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,won_10_y,season_10_y,team_opp_next_y,team_y
0,0.0,0.581395,0.480769,0.567961,0.555556,0.470588,0.659574,0.295455,0.415094,0.464286,...,0.454457,0.167784,0.336735,0.554444,0.532632,0.5,0.8,2024.0,NOP,DAL
1,0.0,0.465116,0.461538,0.449029,0.62963,0.627451,0.594108,0.227273,0.226415,0.392857,...,0.320761,0.173196,0.360204,0.473333,0.495789,0.4,0.6,2024.0,SAS,OKC
2,0.0,0.232558,0.307692,0.264563,0.259259,0.333333,0.394435,0.590909,0.584906,0.571429,...,0.38663,0.279253,0.370408,0.454444,0.377895,0.5,0.8,2024.0,PHO,MIN
3,0.0,0.418605,0.288462,0.512136,0.296296,0.294118,0.492635,0.477273,0.471698,0.214286,...,0.357609,0.162629,0.396429,0.414444,0.602105,0.5,0.3,2024.0,OKC,SAS
4,0.0,0.651163,0.673077,0.51699,0.666667,0.803922,0.504092,0.363636,0.396226,0.285714,...,0.390217,0.190206,0.481633,0.414444,0.517895,0.5,0.4,2024.0,DAL,NOP


In [3]:
import joblib
import os

def load_model(model_name, folder="saved_models"):
    """
    Read a saved model bundle from ``<folder>/<model_name>.pkl``.

    The file is expected to hold a dict with at least the keys ``"model"``
    and ``"predictors"``; ``"feature_importance"`` is optional.

    Returns:
        model : the object stored under "model"
        predictors : the object stored under "predictors"
        feature_importance : the value stored under "feature_importance",
            or None when the bundle has no such key

    Raises:
        FileNotFoundError : the .pkl file does not exist
        ValueError : the loaded object is not a dict with both required keys
    """
    filepath = os.path.join(folder, f"{model_name}.pkl")

    # Guard: fail early with the full path when the file is missing
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"Model file not found: {filepath}")

    bundle = joblib.load(filepath)

    # Guard: the bundle must be a dict carrying both mandatory entries
    is_valid_bundle = (
        isinstance(bundle, dict)
        and "model" in bundle
        and "predictors" in bundle
    )
    if not is_valid_bundle:
        raise ValueError("Saved file does not have the expected format. "
                         "It must contain 'model' and 'predictors'.")

    return bundle["model"], bundle["predictors"], bundle.get("feature_importance", None)

In [4]:
def predict_win_probability(model, predictors, df, team_home, team_away, home=1):
    """
    Predict win probability for an NBA game using exact predictors.

    One feature row is assembled from two rows of ``df``: the last row whose
    ``team_x`` equals ``team_home`` (source of the ``*_x`` features) and the
    last row whose ``team_y`` equals ``team_away`` (source of the ``*_y``
    features).

    Parameters
    ----------
    model : trained estimator
        Any object exposing ``predict_proba`` (preferred) or ``predict``.
    predictors : list of str
        Exact feature names used during training, in training order.
    df : pd.DataFrame
        Preprocessed NBA dataset with rolling stats (_x, _y, _opp).
        NOTE(review): "last row" is positional, so this presumably expects
        ``df`` to be ordered oldest-to-newest -- confirm against the
        preprocessing step.
    team_home : str
        Abbreviation of home team
    team_away : str
        Abbreviation of away team
    home : int
        1 if team_home is at home, 0 otherwise. Only used when ``'home'`` is
        one of the predictors.

    Returns
    -------
    dict
        ``{team_home: pct, team_away: pct}`` with values on a 0-100 scale.
        With ``predict_proba`` the values are rounded to 2 decimals; with the
        ``predict`` fallback they are a hard 100 / 0 split.

    Raises
    ------
    ValueError
        If ``df`` has no row for ``team_home`` in ``team_x`` or no row for
        ``team_away`` in ``team_y``.
    """

    # Get last row for each team, failing with a readable message instead of
    # the IndexError that .iloc[-1] raises on an empty selection.
    home_rows = df[df['team_x'] == team_home]
    if home_rows.empty:
        raise ValueError(f"No rows with team_x == {team_home!r}; cannot build home features.")
    away_rows = df[df['team_y'] == team_away]
    if away_rows.empty:
        raise ValueError(f"No rows with team_y == {team_away!r}; cannot build away features.")
    last_home = home_rows.iloc[-1]
    last_away = away_rows.iloc[-1]

    # Build feature dict
    game_input = {}
    for col in predictors:
        # _x / _y are suffixes, so test the end of the name rather than
        # matching the substring anywhere inside it.
        from_home = col.endswith('_x')
        from_away = col.endswith('_y')
        if col in df.columns:
            if from_home:
                game_input[col] = last_home.get(col, 0)
            elif from_away:
                game_input[col] = last_away.get(col, 0)
            else:
                # unsuffixed column: prefer the home row, then the away row
                game_input[col] = last_home.get(col, last_away.get(col, 0))
        else:
            # fallback: look the feature up without its _x/_y suffix, 0 if absent
            col_base = col[:-2] if (from_home or from_away) else col
            game_input[col] = last_home.get(col_base, last_away.get(col_base, 0))

    # Set home indicator if present (overrides whatever the row lookup found)
    if 'home' in predictors:
        game_input['home'] = home

    # Convert to DataFrame in exact predictor order
    X_new = pd.DataFrame([game_input])[predictors].astype(float)

    # Predict probabilities. Only the predict_proba call is guarded, so an
    # AttributeError means "this model has no predict_proba".
    try:
        prob = model.predict_proba(X_new)[0]
    except AttributeError:
        prob = None

    if prob is not None:
        # NOTE(review): treats column 1 as "team_home wins" and column 0 as
        # "team_away wins" -- confirm the label encoding used in training.
        return {team_home: round(prob[1]*100, 2), team_away: round(prob[0]*100, 2)}

    # Fallback for models without predict_proba. Regressors return a
    # continuous score, so compare against 0.5 instead of requiring exactly 1
    # (hard 0/1 labels give the same answer either way).
    pred = model.predict(X_new)[0]
    try:
        home_wins = float(pred) >= 0.5
    except (TypeError, ValueError):
        # non-numeric label: keep the plain equality check
        home_wins = pred == 1
    return {team_home: 100, team_away: 0} if home_wins else {team_home: 0, team_away: 100}


In [5]:
def get_prediction(model_name,team_home,team_away):
    """Load the saved model `model_name` and predict team_home (at home) vs team_away.

    Reads the notebook-level `df` as the feature source and returns whatever
    `predict_win_probability` returns for that matchup.
    """
    model, predictors, _unused_importance = load_model(model_name)
    return predict_win_probability(
        model,
        predictors,
        df,
        team_home,
        team_away,
        home=1,
    )

In [6]:
import os

model_dir = "saved_models"  # replace with your directory path

# List all files in the directory. os.listdir returns entries in arbitrary
# order, so sort them to keep the listing (and anything that iterates over it)
# reproducible from run to run.
all_files = sorted(os.listdir(model_dir))

# Keep only serialized-model files (.pkl, .cbm)
model_files = [f for f in all_files if f.endswith(('.pkl', '.cbm'))]

print(model_files)

['CatBoost.pkl', 'Decision Tree Model.pkl', 'Gradient Boosting Model.pkl', 'K-Means Model.pkl', 'KNN Model.pkl', 'LightGBM.pkl', 'Linear Regression Model.pkl', 'Logistic Regression Model.pkl', 'MLP Model.pkl', 'Naive Bayes Model.pkl', 'Random Forest Model.pkl', 'SVM Model.pkl']


In [7]:
# Ask every saved model to predict teams[0] (home) vs teams[1] (away) and
# count how many models favour each side.
teams = ["LAL","GSW"]
tally = [0,0]  # tally[i] = number of models favouring teams[i]
for model_file in model_files:
    # Split off the real extension instead of deleting ".pkl" anywhere in the name
    model_name, extension = os.path.splitext(model_file)
    if extension != ".pkl":
        # load_model only reads "<name>.pkl"; other listed formats would raise
        print(f"{model_file} - skipped (unsupported format)")
        continue
    prediction = get_prediction(model_name,teams[0],teams[1])
    print(f"{model_name} - {prediction}")
    if prediction[teams[0]] > prediction[teams[1]]:
        tally[0] += 1
    elif prediction[teams[1]] > prediction[teams[0]]:
        tally[1] += 1
    # an exact tie is a vote for neither team
print(tally)

CatBoost - {'LAL': 47.81, 'GSW': 52.19}
Decision Tree Model - {'LAL': 0.0, 'GSW': 100.0}
Gradient Boosting Model - {'LAL': 90.0, 'GSW': 10.0}
K-Means Model - {'LAL': 0, 'GSW': 100}
KNN Model - {'LAL': 33.33, 'GSW': 66.67}
LightGBM - {'LAL': 44.78, 'GSW': 55.22}
Linear Regression Model - {'LAL': 0, 'GSW': 100}
Logistic Regression Model - {'LAL': 69.45, 'GSW': 30.55}
MLP Model - {'LAL': 76.37, 'GSW': 23.63}
Naive Bayes Model - {'LAL': 97.8, 'GSW': 2.2}
Random Forest Model - {'LAL': 61.51, 'GSW': 38.49}
SVM Model - {'LAL': 77.69, 'GSW': 22.31}
[6, 6]
