In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import TimeSeriesSplit
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

# Load the main dataset of NBA games, sorted by date and reindexed
data_frame = pd.read_csv("nba_games.csv", index_col=0)
data_frame = data_frame.sort_values("date").reset_index(drop=True)

# Remove unnecessary columns
for col_to_remove in ["mp.1", "mp_opp.1", "index_opp"]:
    if col_to_remove in data_frame.columns:
        del data_frame[col_to_remove]

In [2]:
def append_target_column(team_data):
    """
    Append a 'target' column indicating future win outcome for the team.
    The target is the next game's 'won' value shifted by -1.
    """
    team_data["target"] = team_data["won"].shift(-1)
    return team_data

In [3]:
# Apply the target column creation by team grouping
data_frame = data_frame.groupby("team", group_keys=False).apply(append_target_column)

# Replace missing target values with 2 (indicating no next game)
data_frame.loc[pd.isnull(data_frame["target"]), "target"] = 2
data_frame["target"] = data_frame["target"].astype(int, errors="ignore")

# Identify and remove columns that contain null values
missing_counts = pd.isnull(data_frame).sum()
missing_columns = missing_counts[missing_counts > 0]
valid_columns_mask = ~data_frame.columns.isin(missing_columns.index)
data_frame = data_frame[data_frame.columns[valid_columns_mask]].copy()

# Time-series based feature selection setup
time_splitter = TimeSeriesSplit(n_splits=5)
model = RidgeClassifier(alpha=1)

feature_selector = SequentialFeatureSelector(
    model,
    n_features_to_select=30,
    direction="forward",
    cv=time_splitter,
    n_jobs=1
)

# Define columns to exclude from modeling
exclude_cols = ["season", "date", "won", "target", "team", "team_opp"]
candidate_features = data_frame.columns[~data_frame.columns.isin(exclude_cols)]

  data_frame = data_frame.groupby("team", group_keys=False).apply(append_target_column)


In [4]:
#List of stats used to make predictions
candidate_features

Index(['fg', 'fga', 'fg%', '3p', '3pa', '3p%', 'ft', 'fta', 'ft%', 'orb',
       ...
       'trb%_max_opp', 'ast%_max_opp', 'stl%_max_opp', 'blk%_max_opp',
       'tov%_max_opp', 'usg%_max_opp', 'ortg_max_opp', 'drtg_max_opp',
       'total_opp', 'home_opp'],
      dtype='object', length=137)

In [5]:
# Normalize the candidate features with MinMax scaling
scaler = MinMaxScaler()
data_frame[candidate_features] = scaler.fit_transform(data_frame[candidate_features])

# Fit the feature selector to identify the best 30 predictors
feature_selector.fit(data_frame[candidate_features], data_frame["target"])
feature_subset = candidate_features[feature_selector.get_support()]
feature_subset = list(feature_subset)

In [6]:
def rolling_backtest(dataset, trained_model, predictors_list, start_index=2, step_value=1):
    """
    Perform a rolling backtest by training on all seasons before the current one,
    and then predicting the current season's outcomes.
    Returns a DataFrame with actual and predicted values for evaluation.
    """
    all_preds = []
    all_seasons = sorted(dataset["season"].unique())
    for idx in range(start_index, len(all_seasons), step_value):
        current_season = all_seasons[idx]
        training_data = dataset[dataset["season"] < current_season]
        testing_data = dataset[dataset["season"] == current_season]

        trained_model.fit(training_data[predictors_list], training_data["target"])
        predictions = trained_model.predict(testing_data[predictors_list])
        predictions_series = pd.Series(predictions, index=testing_data.index)

        combined_df = pd.concat([testing_data["target"], predictions_series], axis=1)
        combined_df.columns = ["actual", "prediction"]
        all_preds.append(combined_df)

    return pd.concat(all_preds)

In [7]:
# Execute backtest with the currently chosen model and selected predictors
predicted_results = rolling_backtest(data_frame, model, feature_subset)
predicted_results = predicted_results[predicted_results["actual"] != 2]

In [8]:
predicted_results

Unnamed: 0,actual,prediction
4354,0,1
4355,0,0
4356,0,1
4357,0,1
4358,0,1
...,...,...
28739,0,0
28740,0,0
28741,1,1
28742,1,0


In [9]:
# Calculate accuracy excluding cases where target = 2
print("Initial Accuracy:", accuracy_score(predicted_results["actual"], predicted_results["prediction"]))

Initial Accuracy: 0.5516236298698632


In [10]:
# Check the home field advantage statistics
print("Home Win Rate:", data_frame.groupby("home").apply(lambda group: group[group["won"] == 1].shape[0] / group.shape[0]))

Home Win Rate: home
0.0    0.421763
1.0    0.578237
dtype: float64


  print("Home Win Rate:", data_frame.groupby("home").apply(lambda group: group[group["won"] == 1].shape[0] / group.shape[0]))


In [11]:
# Create a rolling dataset including selected_columns + 'won', 'team', 'season'
df_rolling_input = data_frame[list(candidate_features) + ["won", "team", "season"]]

In [12]:
def compute_team_rolling_average(team_data):
    """
    Compute 10-game rolling averages for numeric columns on a per-team basis.
    """
    numeric_subset = team_data.select_dtypes(include='number')
    rolled = numeric_subset.rolling(10).mean()
    return rolled

In [13]:
df_rolling_out = df_rolling_input.groupby(["team", "season"], group_keys=False).apply(compute_team_rolling_average)

# Rename rolling columns to have a consistent suffix
rolling_columns = [f"{col_name}_10" for col_name in df_rolling_out.columns]
df_rolling_out.columns = rolling_columns

# Concatenate rolling averages with the main DataFrame and drop missing rows
data_frame = pd.concat([data_frame, df_rolling_out], axis=1).dropna()

  df_rolling_out = df_rolling_input.groupby(["team", "season"], group_keys=False).apply(compute_team_rolling_average)


In [14]:
def shift_column_within_team(subset, column_name):
    """
    Shift the specified column by -1 within a team group.
    This effectively retrieves the next game's value for that column.
    """
    return subset[column_name].shift(-1)

In [15]:
def add_shifted_column(data, column_name):
    """
    Add a column to the dataset with the next game's value for the specified column.
    Uses groupby to ensure shifting is done on a per-team basis.
    """
    return data.groupby("team", group_keys=False).apply(lambda grp: shift_column_within_team(grp, column_name))

data_frame["home_next"] = add_shifted_column(data_frame, "home")
data_frame["team_opp_next"] = add_shifted_column(data_frame, "team_opp")
data_frame["date_next"] = add_shifted_column(data_frame, "date")

data_frame = data_frame.copy()

  return data.groupby("team", group_keys=False).apply(lambda grp: shift_column_within_team(grp, column_name))
  return data.groupby("team", group_keys=False).apply(lambda grp: shift_column_within_team(grp, column_name))
  return data.groupby("team", group_keys=False).apply(lambda grp: shift_column_within_team(grp, column_name))


In [16]:
# Merge the rolling columns and next-game info back into a full DataFrame for modeling
merged_data = data_frame.merge(
    data_frame[rolling_columns + ["team_opp_next", "date_next", "team"]],
    left_on=["team", "date_next"],
    right_on=["team_opp_next", "date_next"]
)

# Displaying a subset of columns to verify merging
merged_data[["team_x", "team_opp_next_x", "team_y", "team_opp_next_y", "date_next"]]

Unnamed: 0,team_x,team_opp_next_x,team_y,team_opp_next_y,date_next
0,ORL,OKC,OKC,ORL,2009-11-18
1,OKC,ORL,ORL,OKC,2009-11-18
2,PHO,NOH,NOH,PHO,2009-11-19
3,NOH,PHO,PHO,NOH,2009-11-19
4,BOS,ORL,ORL,BOS,2009-11-20
...,...,...,...,...,...
24739,MIA,BOS,BOS,MIA,2023-05-25
24740,MIA,BOS,BOS,MIA,2023-05-27
24741,BOS,MIA,MIA,BOS,2023-05-27
24742,MIA,BOS,BOS,MIA,2023-05-29


In [17]:
#Full DataFrame after merging
merged_data

Unnamed: 0,fg,fga,fg%,3p,3pa,3p%,ft,fta,ft%,orb,...,blk%_max_opp_10_y,tov%_max_opp_10_y,usg%_max_opp_10_y,ortg_max_opp_10_y,drtg_max_opp_10_y,total_opp_10_y,home_opp_10_y,season_10_y,team_opp_next_y,team_y
0,0.304348,0.257143,0.400000,0.379310,0.432836,0.386952,0.372549,0.476190,0.585764,0.342105,...,0.0594,0.398428,0.138846,0.321801,0.449438,0.298333,0.5,2010.0,ORL,OKC
1,0.391304,0.257143,0.520930,0.172414,0.164179,0.401575,0.392157,0.333333,0.947491,0.236842,...,0.0822,0.461950,0.148462,0.322749,0.486517,0.331667,0.4,2010.0,OKC,ORL
2,0.478261,0.285714,0.609302,0.206897,0.268657,0.321710,0.431373,0.492063,0.672112,0.131579,...,0.0738,0.406394,0.150513,0.405213,0.475281,0.407500,0.6,2010.0,PHO,NOH
3,0.391304,0.314286,0.465116,0.275862,0.373134,0.321710,0.529412,0.507937,0.822637,0.500000,...,0.0678,0.369602,0.126538,0.421327,0.541573,0.423333,0.7,2010.0,NOH,PHO
4,0.500000,0.300000,0.623256,0.275862,0.238806,0.473566,0.313725,0.412698,0.568261,0.263158,...,0.0744,0.469392,0.150128,0.312796,0.469663,0.321667,0.4,2010.0,BOS,ORL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24739,0.326087,0.271429,0.416279,0.275862,0.432836,0.281215,0.431373,0.428571,0.791132,0.315789,...,0.1113,0.401677,0.299872,0.538389,0.595506,0.412500,0.5,2023.0,MIA,BOS
24740,0.456522,0.271429,0.595349,0.310345,0.298507,0.439820,0.137255,0.142857,0.766628,0.263158,...,0.1253,0.395807,0.305641,0.523697,0.598876,0.420833,0.5,2023.0,MIA,BOS
24741,0.456522,0.285714,0.579070,0.551724,0.537313,0.461192,0.254902,0.285714,0.693116,0.315789,...,0.1156,0.243711,0.260000,0.474882,0.574157,0.409167,0.5,2023.0,BOS,MIA
24742,0.304348,0.485714,0.227907,0.482759,0.402985,0.525309,0.431373,0.444444,0.758460,0.447368,...,0.1196,0.374214,0.298974,0.472986,0.591011,0.421667,0.5,2023.0,MIA,BOS


In [18]:
# Update removed_columns to include any object columns from merged_data
new_removed = list(merged_data.columns[merged_data.dtypes == "object"]) + exclude_cols
final_features = merged_data.columns[~merged_data.columns.isin(new_removed)]

In [19]:
# Re-run feature selection with the merged data
feature_selector.fit(merged_data[final_features], merged_data["target"])
final_predictors = list(final_features[feature_selector.get_support()])

In [20]:
# Perform a final backtest on the merged dataset using the updated predictors
final_predictions = rolling_backtest(merged_data, model, final_predictors)
print("Final Accuracy:", accuracy_score(final_predictions["actual"], final_predictions["prediction"]))

Final Accuracy: 0.6420600858369099
