In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

np.random.seed(42)
num_games = 1500
nba_data = pd.DataFrame({
    "game_date": pd.date_range(start="2023-01-01", periods=num_games, freq="D"),
    "home_team_rating": np.random.normal(110, 5, num_games),
    "away_team_rating": np.random.normal(108, 5, num_games),
    "home_win_streak": np.random.randint(0, 5, num_games),
    "away_win_streak": np.random.randint(0, 5, num_games),
    "home_advantage": np.random.randint(0, 2, num_games),
    "home_team_win": np.random.randint(0, 2, num_games)
})

In [4]:
nba_data["rating_diff"] = nba_data["home_team_rating"] - nba_data["away_team_rating"]
nba_data["streak_diff"] = nba_data["home_win_streak"] - nba_data["away_win_streak"]

features = ["rating_diff", "streak_diff", "home_advantage"]
target = "home_team_win"

X = nba_data[features]
y = nba_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)

log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
log_preds = log_clf.predict(X_test)
print("Logistic Regression Accuracy:", accuracy_score(y_test, log_preds))

rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train, y_train)
rf_preds = rf_clf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_preds))

Logistic Regression Accuracy: 0.5466666666666666
Random Forest Accuracy: 0.4766666666666667


In [5]:
seq_len = 10
sequences, labels = [], []

for i in range(seq_len, len(X)):
    sequences.append(X.iloc[i-seq_len:i].values)
    labels.append(y.iloc[i])

X_seq = np.array(sequences)
y_seq = np.array(labels)

split = int(0.8 * len(X_seq))
X_seq_train, X_seq_test = X_seq[:split], X_seq[split:]
y_seq_train, y_seq_test = y_seq[:split], y_seq[split:]

model = Sequential([
    LSTM(32, input_shape=(seq_len, X_seq.shape[2]), return_sequences=False),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_seq_train, y_seq_train, epochs=5, batch_size=32, verbose=1)

  super().__init__(**kwargs)


Epoch 1/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5060 - loss: 0.6977
Epoch 2/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5349 - loss: 0.6896
Epoch 3/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5214 - loss: 0.6928
Epoch 4/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5257 - loss: 0.6950
Epoch 5/5
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5477 - loss: 0.6815


<keras.src.callbacks.history.History at 0x135dda6d0>

In [6]:
lstm_preds = (model.predict(X_seq_test) > 0.5).astype(int).flatten()
print("LSTM Accuracy:", accuracy_score(y_seq_test, lstm_preds))

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
LSTM Accuracy: 0.5268456375838926
