# Load data

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

df = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/Basketball Projected Wins/nba_win_prediction_features_2023_24.csv",
    parse_dates=['GAME_DATE']
)

df = df.sort_values('GAME_DATE').reset_index(drop=True) #sort
df.info()



Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2310 entries, 0 to 2309
Data columns (total 37 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   GAME_ID            2310 non-null   int64         
 1   GAME_DATE          2310 non-null   datetime64[ns]
 2   TEAM_ID            2310 non-null   int64         
 3   TEAM_ABBREVIATION  2310 non-null   object        
 4   MATCHUP            2310 non-null   object        
 5   WL                 2310 non-null   object        
 6   PTS                2310 non-null   int64         
 7   REB                2310 non-null   int64         
 8   OREB               2310 non-null   int64         
 9   DREB               2310 non-null   int64         
 10  AST                2310 non-null   int64         
 11  TOV                2310 non-null   int64         
 12  FGA                2310 non-null   int64         
 13  FGM                2310 non-null   in

# Create Features/Target

In [2]:
feature_cols = [
    # Season-to-date strength
    'PTS_SEASON_AVG',
    'PTS_ALLOWED_AVG',
    'REB_AVG',
    'AST_AVG',
    'TOV_AVG',

    # Rolling form
    'PTS_L5',
    'PTS_ALLOWED_L5',
    'REB_L5',
    'AST_L5',
    'TOV_L5',
    'WIN_L5',

    # Context
    'HOME'
]


In [3]:
X = df[feature_cols]
y = df['WIN']


# Train/Test Split

NEVER random split sports time-series data

In [4]:
split_date = df['GAME_DATE'].quantile(0.8) #80,20

X_train = X[df['GAME_DATE'] <= split_date]
X_test  = X[df['GAME_DATE'] > split_date]

y_train = y[df['GAME_DATE'] <= split_date]
y_test  = y[df['GAME_DATE'] > split_date]


# Feature Sacling

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


# Train a baseline logistic regression

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=1000,
    solver='lbfgs'
)

model.fit(X_train_scaled, y_train)


# Evaluate

In [12]:
from sklearn.metrics import accuracy_score #Model Performance

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
accuracy


0.6084070796460177

In [8]:
from sklearn.metrics import confusion_matrix #Confusion Matrix

confusion_matrix(y_test, y_pred)


array([[134,  92],
       [ 85, 141]])

In [9]:
import pandas as pd #Feature Importance

coef_df = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': model.coef_[0]
}).sort_values(by='coefficient', ascending=False)

coef_df


Unnamed: 0,feature,coefficient
0,PTS_SEASON_AVG,0.442472
11,HOME,0.219257
5,PTS_L5,0.154718
9,TOV_L5,0.149982
8,AST_L5,0.079798
7,REB_L5,-0.00145
6,PTS_ALLOWED_L5,-0.050852
2,REB_AVG,-0.061769
4,TOV_AVG,-0.111843
3,AST_AVG,-0.112755


#Create Baseline Benchmark

In [10]:
y_test.mean()


np.float64(0.5)

# Baseline Conclusion

Using a time-aware logistic regression model, I achieved an accuracy of 60.8%. This outperforms a naive baseline shows that rolling and season-to-date team metrics have meaningful predictive signal for NBA game outcomes.

The confusion matrix shows a balanced performance across wins and losses. The model is not overly biased toward predicting one versus the other. This suggests the model is learning genuine patterns rather than exploiting class imbalance or home-court bias alone.

Feature coefficients align well with basketball intuition. Season-to-date scoring margin proxies—particularly average points scored and points allowed—were the strongest predictors of winning, while home-court advantage and recent offensive performance also contributed positively. Defensive strength, as captured by points allowed, had a strong negative relationship with win probability, reinforcing its importance in outcome prediction.

Overall, the baseline confirms that a simple model can capture real structure in NBA game outcomes. Using these results, there is a strong foundation for further improvements. I will incorporate opponent-relative features, experimenting with non-linear models, or extending predictions to future games.

# Build Random Forest
How does tree-based modeling affect the outcome?

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    min_samples_leaf=20,
    random_state=42
)

rf.fit(X_train, y_train)

rf_preds = rf.predict(X_test)

rf_accuracy = accuracy_score(y_test, rf_preds)
rf_accuracy


0.6592920353982301

Feature Importance

In [14]:
rf_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf.feature_importances_
}).sort_values(by='importance', ascending=False)

rf_importance


Unnamed: 0,feature,importance
1,PTS_ALLOWED_AVG,0.183224
0,PTS_SEASON_AVG,0.126106
4,TOV_AVG,0.100359
2,REB_AVG,0.090729
5,PTS_L5,0.089964
3,AST_AVG,0.082518
6,PTS_ALLOWED_L5,0.068258
8,AST_L5,0.065398
10,WIN_L5,0.056946
7,REB_L5,0.05337


# Gradient boosted trees

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gb.fit(X_train, y_train)

gb_preds = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_preds)
gb_accuracy


0.6305309734513275

# Compare Baseline Models

In [17]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Gradient Boosting'],
    'Accuracy': [accuracy, rf_accuracy, gb_accuracy]
})

results


Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.608407
1,Random Forest,0.659292
2,Gradient Boosting,0.630531


Logistic Regression, Random Forest, and Gradient Boosting give similar accuracies. While parameters are a main reason, how much farther can we go to improve these results? I am going to test a neural network to see how far we can go with our scores

# Neural Networks

Prep Data

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df[feature_cols]
y = df['WIN']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Build MLP (Multi Layer Perception)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Train

In [21]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


Epoch 1/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.5489 - loss: 0.7144 - val_accuracy: 0.6135 - val_loss: 0.6730
Epoch 2/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.5569 - loss: 0.6843 - val_accuracy: 0.5892 - val_loss: 0.6756
Epoch 3/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6072 - loss: 0.6588 - val_accuracy: 0.5919 - val_loss: 0.6770
Epoch 4/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6258 - loss: 0.6614 - val_accuracy: 0.5459 - val_loss: 0.6815
Epoch 5/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6259 - loss: 0.6421 - val_accuracy: 0.5649 - val_loss: 0.6796
Epoch 6/50
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6474 - loss: 0.6328 - val_accuracy: 0.5622 - val_loss: 0.6812


Evaluate

In [22]:
loss, accuracy = model.evaluate(X_test_scaled, y_test)
accuracy


[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6238 - loss: 0.6472 


0.5844155550003052

#Neural Network Evaluation

After trying neural networks, we see that tree-based models outperform neural networks on structured, tabular NBA rolling-stat data. After some research, this aligns with industry findings that boosted decision trees remain the dominant approach for sportsbook-style prediction tasks. Trying ensembling or even more advnaced techniques is my next step.

Resplit data for this task

In [None]:
split_date = df['GAME_DATE'].quantile(0.8) #80,20

X_train = X[df['GAME_DATE'] <= split_date]
X_test  = X[df['GAME_DATE'] > split_date]

y_train = y[df['GAME_DATE'] <= split_date]
y_test  = y[df['GAME_DATE'] > split_date]


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)



# Save Best Model
After evaluating results, we are going to focus on teh random forest model and applying it on current season wins predictions

In [24]:
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=8,
    min_samples_leaf=20,
    random_state=42
)

rf_model.fit(X_train, y_train)

rf_preds = rf_model.predict(X_test)


In [25]:
import joblib

joblib.dump(
    rf_model,
    "/content/drive/My Drive/Colab Notebooks/Basketball Projected Wins/rf_win_model.pkl"
)

joblib.dump(
    feature_cols,
    "/content/drive/My Drive/Colab Notebooks/Basketball Projected Wins/model_features.pkl"
)


['/content/drive/My Drive/Colab Notebooks/Basketball Projected Wins/model_features.pkl']