# NBA Game Prediction: OKC Thunder Next 5 Games

## Project Overview
This project uses ensemble learning techniques to predict the next 5 game results for the Oklahoma City Thunder (OKC).

**Current Status**: OKC Thunder has a 22-1 record (22 wins, 1 loss)

## Dataset
- Source: NBA team statistics dataset
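- Target Variable (y): Game outcome (Win/Loss). Because no game-by-game results are available, the outcome of each matchup is approximated by comparing the two teams' season win percentages.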
- Input Features (X): Team statistics, opponent statistics, and derived features

## 1. Data Loading and Exploration

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import warnings
# Silence warnings for the rest of the session. No category or module is
# given, so the filter is not restricted to any particular kind of warning;
# library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

# Set style for plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Load datasets
# The three CSV files are read through paths relative to the current working
# directory, so the notebook has to be run from the folder that contains
# `dataset/`.
# NOTE(review): pandas' default NA handling reads the literal text "NA" as a
# missing value, while later cells compare `abbreviation` against the string
# 'NA' — confirm how league-average rows are encoded in these files.
team_stats = pd.read_csv('dataset/Team Stats Per Game.csv')
team_summaries = pd.read_csv('dataset/Team Summaries.csv')
opponent_stats = pd.read_csv('dataset/Opponent Stats Per Game.csv')

# Report the (rows, columns) shape of every loaded table as a quick sanity check.
print("Datasets loaded successfully!")
print(f"\nTeam Stats shape: {team_stats.shape}")
print(f"Team Summaries shape: {team_summaries.shape}")
print(f"Opponent Stats shape: {opponent_stats.shape}")

In [None]:
# Display current OKC Thunder stats (2026 season)
# Select the summary rows whose abbreviation is 'OKC' and whose season is 2026;
# if no such row exists the printed table is simply empty.
okc_2026 = team_summaries[(team_summaries['abbreviation'] == 'OKC') & (team_summaries['season'] == 2026)]
print("OKC Thunder 2026 Season Stats:")
# Only the record and rating columns are shown, rendered without truncation.
print(okc_2026[['season', 'w', 'l', 'mov', 'srs', 'o_rtg', 'd_rtg', 'n_rtg']].to_string())

# Display first few rows of team stats
print("\n\nSample Team Stats:")
print(team_stats.head())

## 2. Data Preprocessing and Feature Engineering

In [None]:
# Merge datasets to create comprehensive team statistics
def _matchup_side_features(prefix, stats_row, summary_row):
    """Return the per-team feature columns for one side of a matchup.

    Args:
        prefix: Column prefix, ``'team1'`` or ``'team2'``.
        stats_row: Row of per-game statistics for the team.
        summary_row: Row of season-summary (advanced) statistics for the team.

    Returns:
        dict mapping ``f"{prefix}_<stat>"`` to the value, per-game stats
        first and advanced ratings second.
    """
    return {
        # Per-game box-score stats
        f'{prefix}_pts_per_game': stats_row['pts_per_game'],
        f'{prefix}_fg_percent': stats_row['fg_percent'],
        f'{prefix}_3p_percent': stats_row['x3p_percent'],
        f'{prefix}_ft_percent': stats_row['ft_percent'],
        f'{prefix}_reb_per_game': stats_row['trb_per_game'],
        f'{prefix}_ast_per_game': stats_row['ast_per_game'],
        f'{prefix}_stl_per_game': stats_row['stl_per_game'],
        f'{prefix}_blk_per_game': stats_row['blk_per_game'],
        f'{prefix}_tov_per_game': stats_row['tov_per_game'],
        # Advanced season-level ratings
        f'{prefix}_o_rtg': summary_row['o_rtg'],
        f'{prefix}_d_rtg': summary_row['d_rtg'],
        f'{prefix}_n_rtg': summary_row['n_rtg'],
        f'{prefix}_srs': summary_row['srs'],
        f'{prefix}_mov': summary_row['mov'],
    }


def create_game_dataset(team_stats, team_summaries, opponent_stats):
    """
    Create a dataset where each row represents a potential game matchup.

    Since we don't have actual game-by-game data, matchups are synthesised:
    within every season from 2021 onwards, each unordered pair of teams
    yields one row, with the team that appears first in ``team_stats`` as
    team1.

    Args:
        team_stats: Per-game team statistics with ``season``,
            ``abbreviation`` and the per-game stat columns.
        team_summaries: Season summaries with ``season``, ``abbreviation``,
            ``w``, ``l``, ``o_rtg``, ``d_rtg``, ``n_rtg``, ``srs``, ``mov``.
        opponent_stats: Accepted for interface compatibility; no feature is
            currently derived from it.

    Returns:
        DataFrame with ``season``, 14 ``team1_*`` columns, 14 ``team2_*``
        columns, four difference features and the target ``team1_wins``.
        The target is a proxy: 1 when team1's season win percentage is
        strictly higher than team2's, otherwise 0 (ties count as 0).
        An empty DataFrame is returned when no matchup can be built.
    """
    # Filter for recent seasons (last 5 years for better relevance)
    recent_seasons = team_stats[team_stats['season'] >= 2021]

    games = []

    for season in recent_seasons['season'].unique():
        season_teams = recent_seasons[recent_seasons['season'] == season]
        season_summaries = team_summaries[team_summaries['season'] == season]

        # Remove the league-average row. pandas parses the text "NA" as a
        # missing value by default, so the abbreviation may arrive either as
        # the string 'NA' or as NaN; both have to be excluded.
        teams = [
            t for t in season_teams['abbreviation'].unique()
            if pd.notna(t) and t != 'NA'
        ]

        # Resolve every team's rows once per season instead of re-filtering
        # the frames for each pair in the quadratic matchup loop below.
        stats_rows = {}
        summary_rows = {}
        for team in teams:
            stats_rows[team] = season_teams[season_teams['abbreviation'] == team].iloc[0]
            team_summary = season_summaries[season_summaries['abbreviation'] == team]
            if len(team_summary) > 0:
                summary_rows[team] = team_summary.iloc[0]

        # Create matchups (each team plays every other team)
        for i, team1 in enumerate(teams):
            for team2 in teams[i + 1:]:
                # A matchup needs summary data for both sides.
                if team1 not in summary_rows or team2 not in summary_rows:
                    continue

                team1_data = stats_rows[team1]
                team2_data = stats_rows[team2]
                team1_summary = summary_rows[team1]
                team2_summary = summary_rows[team2]

                # Determine winner based on win percentage
                team1_wpct = team1_summary['w'] / (team1_summary['w'] + team1_summary['l'])
                team2_wpct = team2_summary['w'] / (team2_summary['w'] + team2_summary['l'])

                # Create features from team1's perspective
                game_features = {'season': season}
                game_features.update(_matchup_side_features('team1', team1_data, team1_summary))
                game_features.update(_matchup_side_features('team2', team2_data, team2_summary))
                game_features.update({
                    # Derived features (differences)
                    'pts_diff': team1_data['pts_per_game'] - team2_data['pts_per_game'],
                    'rtg_diff': team1_summary['n_rtg'] - team2_summary['n_rtg'],
                    'srs_diff': team1_summary['srs'] - team2_summary['srs'],
                    'off_def_diff': team1_summary['o_rtg'] - team2_summary['d_rtg'],

                    # Target: 1 if team1 wins, 0 if team2 wins
                    'team1_wins': 1 if team1_wpct > team2_wpct else 0,
                })

                games.append(game_features)

    return pd.DataFrame(games)

# Create the game dataset
# `game_data` holds one row per synthetic matchup; every later cell
# (cleaning, feature selection, training) works from this frame.
game_data = create_game_dataset(team_stats, team_summaries, opponent_stats)
# Quick inspection: dimensions, column names and the first rows.
print(f"\nGame dataset created: {game_data.shape}")
print(f"\nColumns: {list(game_data.columns)}")
print(f"\nFirst few rows:")
print(game_data.head())

In [None]:
# Check for missing values
print("Missing values:")
print(game_data.isnull().sum())

# Handle missing values: drop every matchup row that contains a NaN
game_data = game_data.dropna()

# Check for outliers (using IQR method)
# Every numeric column except the target is inspected. The check is
# report-only: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are counted per
# column, but no rows are removed.
numeric_cols = [
    col for col in game_data.select_dtypes(include=[np.number]).columns
    if col != 'team1_wins'  # Don't check target variable
]
numeric_values = game_data[numeric_cols]
q1 = numeric_values.quantile(0.25)
q3 = numeric_values.quantile(0.75)
iqr = q3 - q1
outlier_mask = (
    numeric_values.lt(q1 - 1.5 * iqr, axis='columns')
    | numeric_values.gt(q3 + 1.5 * iqr, axis='columns')
)
outlier_counts = outlier_mask.sum()

print("\nOutlier counts per column (IQR method, columns with outliers only):")
print(outlier_counts[outlier_counts > 0])

print(f"\nDataset after cleaning: {game_data.shape}")
print(f"\nClass distribution:")
print(game_data['team1_wins'].value_counts())

## 3. Prepare Features and Target

In [None]:
# Separate features and target
# Every column except the season label and the target itself is a feature.
feature_cols = [col for col in game_data.columns if col not in ['season', 'team1_wins']]
X = game_data[feature_cols]
y = game_data['team1_wins']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {feature_cols}")

# Split into training and testing sets
# 80/20 split with a fixed seed, stratified on the target so both parts keep
# the same win/loss ratio.
# NOTE(review): rows are individual matchups, so the same team-season can end
# up on both sides of this split — confirm that is acceptable for the
# accuracies reported below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining set: {X_train.shape}")
print(f"Testing set: {X_test.shape}")

## 4. Model Implementation - Ensemble Learning Methods

### 4.1 Random Forest

In [None]:
# Random Forest Classifier
# 100 trees, depth capped at 10, with minimum split/leaf sizes to limit
# overfitting; the seed is fixed and all CPU cores are used.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Fit on the training split and score on the held-out test split.
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

print(f"Random Forest Accuracy: {rf_accuracy:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, rf_pred))

# Cross-validation score
# 5-fold cross-validation on the training split only; the printed +/- value
# is two standard deviations of the fold scores.
rf_cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5)
print(f"\nCross-validation scores: {rf_cv_scores}")
print(f"Mean CV score: {rf_cv_scores.mean():.4f} (+/- {rf_cv_scores.std() * 2:.4f})")

### 4.2 Gradient Boosting

In [None]:
# Gradient Boosting Classifier
# 100 boosting stages of depth-5 trees with a learning rate of 0.1 and a
# fixed seed.
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# Fit on the training split and score on the held-out test split.
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print(f"Gradient Boosting Accuracy: {gb_accuracy:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, gb_pred))

# Cross-validation score
# 5-fold cross-validation on the training split only; the printed +/- value
# is two standard deviations of the fold scores.
gb_cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5)
print(f"\nCross-validation scores: {gb_cv_scores}")
print(f"Mean CV score: {gb_cv_scores.mean():.4f} (+/- {gb_cv_scores.std() * 2:.4f})")

### 4.3 Bagging

In [None]:
# Bagging Classifier
# 50 bagged decision trees (depth capped at 10) with a fixed seed, using all
# CPU cores. The base tree is passed positionally on purpose: its keyword
# was renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases, and the positional form is accepted by both old and new versions.
bagging_model = BaggingClassifier(
    DecisionTreeClassifier(max_depth=10),
    n_estimators=50,
    random_state=42,
    n_jobs=-1
)

# Fit on the training split and score on the held-out test split.
bagging_model.fit(X_train, y_train)
bagging_pred = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_pred)

print(f"Bagging Accuracy: {bagging_accuracy:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, bagging_pred))

# Cross-validation score
# 5-fold cross-validation on the training split only; the printed +/- value
# is two standard deviations of the fold scores.
bagging_cv_scores = cross_val_score(bagging_model, X_train, y_train, cv=5)
print(f"\nCross-validation scores: {bagging_cv_scores}")
print(f"Mean CV score: {bagging_cv_scores.mean():.4f} (+/- {bagging_cv_scores.std() * 2:.4f})")

### 4.4 Voting Classifier

In [None]:
# Voting Classifier (combines RF, GB, and Bagging)
# Three freshly configured (smaller) ensembles vote on the predicted label;
# `voting='hard'` selects hard voting on the members' class predictions.
# The bagging member receives its base tree positionally because the keyword
# was renamed from `base_estimator` to `estimator` in newer scikit-learn
# releases; the positional form works with both.
voting_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42, n_jobs=-1)),
        ('gb', GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=5, random_state=42)),
        ('bag', BaggingClassifier(DecisionTreeClassifier(max_depth=10), n_estimators=30, random_state=42, n_jobs=-1))
    ],
    voting='hard'
)

# Fit on the training split and score on the held-out test split.
voting_model.fit(X_train, y_train)
voting_pred = voting_model.predict(X_test)
voting_accuracy = accuracy_score(y_test, voting_pred)

print(f"Voting Classifier Accuracy: {voting_accuracy:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, voting_pred))

# Cross-validation score
# 5-fold cross-validation on the training split only; the printed +/- value
# is two standard deviations of the fold scores.
voting_cv_scores = cross_val_score(voting_model, X_train, y_train, cv=5)
print(f"\nCross-validation scores: {voting_cv_scores}")
print(f"Mean CV score: {voting_cv_scores.mean():.4f} (+/- {voting_cv_scores.std() * 2:.4f})")

## 5. Model Comparison and Analysis

In [None]:
# Compare all models
# One entry per model: hold-out accuracy plus mean and standard deviation of
# its cross-validation scores.
models_comparison = {
    model_name: {
        'accuracy': test_accuracy,
        'cv_mean': fold_scores.mean(),
        'cv_std': fold_scores.std(),
    }
    for model_name, test_accuracy, fold_scores in [
        ('Random Forest', rf_accuracy, rf_cv_scores),
        ('Gradient Boosting', gb_accuracy, gb_cv_scores),
        ('Bagging', bagging_accuracy, bagging_cv_scores),
        ('Voting Classifier', voting_accuracy, voting_cv_scores),
    ]
}

# Transpose so that models become rows and metrics become columns.
comparison_df = pd.DataFrame(models_comparison).T
print("\nModel Comparison:")
print(comparison_df)

# Visualize model comparison: accuracy on the left, CV mean (with std error
# bars) on the right, one colour per model shared by both panels.
bar_colors = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12']
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
accuracy_ax, cv_ax = axes

accuracy_ax.bar(comparison_df.index, comparison_df['accuracy'], color=bar_colors)
cv_ax.bar(
    comparison_df.index,
    comparison_df['cv_mean'],
    yerr=comparison_df['cv_std'],
    color=bar_colors,
    capsize=5,
)

# Both panels share the same formatting apart from title and axis label.
for panel, panel_title, y_label in (
    (accuracy_ax, 'Model Accuracy Comparison', 'Accuracy'),
    (cv_ax, 'Cross-Validation Mean Scores', 'CV Mean Score'),
):
    panel.set_title(panel_title, fontsize=14, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=12)
    panel.set_ylim([0.5, 1.0])
    panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Select best model: the one with the highest mean cross-validation score.
best_model_name = comparison_df['cv_mean'].idxmax()
print(f"\nBest Model: {best_model_name}")
print(f"Best CV Score: {comparison_df.loc[best_model_name, 'cv_mean']:.4f}")

## 6. Feature Importance Analysis

In [None]:
# Feature importance from Random Forest (best model typically)
# NOTE: the importances always come from `rf_model`, independent of which
# model the comparison cell reported as best.
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

# Visualize feature importance
# Horizontal bars for the 15 highest-ranked features; the y axis is inverted
# so the most important feature is drawn at the top.
plt.figure(figsize=(12, 8))
top_features = feature_importance.head(15)
plt.barh(range(len(top_features)), top_features['importance'], color='steelblue')
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Importance', fontsize=12)
plt.title('Top 15 Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Predictions for OKC Thunder Next 5 Games

In [None]:
# Get OKC Thunder current stats (2026 season)
# `.iloc[0]` takes the first matching row as a Series; it raises IndexError
# when the data contains no OKC row for 2026.
okc_stats = team_stats[(team_stats['abbreviation'] == 'OKC') & (team_stats['season'] == 2026)].iloc[0]
okc_summary = team_summaries[(team_summaries['abbreviation'] == 'OKC') & (team_summaries['season'] == 2026)].iloc[0]

# Get potential opponents (other teams from 2026 season)
# NOTE(review): `opponents_2026` is not referenced again in the visible part
# of the notebook — confirm whether it is still needed.
opponents_2026 = team_stats[(team_stats['season'] == 2026) & (team_stats['abbreviation'] != 'OKC') & (team_stats['abbreviation'] != 'NA')]

# Select 5 opponents (we'll use teams with varying strengths)
# Let's pick a mix: strong teams, average teams, and weaker teams
# The list is fixed by hand here; it is not read from any schedule data.
opponent_abbrevs = ['DEN', 'LAL', 'HOU', 'NYK', 'SAC']  # Mix of strong and weaker teams

print("OKC Thunder Current Stats (2026):")
print(f"Wins: {okc_summary['w']}, Losses: {okc_summary['l']}")
print(f"Net Rating: {okc_summary['n_rtg']:.2f}")
print(f"Points Per Game: {okc_stats['pts_per_game']:.1f}")
print(f"\nPredicting next 5 games against:")
# Opponents without a 2026 summary row are left out of this listing.
for opp in opponent_abbrevs:
    opp_data = team_summaries[(team_summaries['abbreviation'] == opp) & (team_summaries['season'] == 2026)]
    if len(opp_data) > 0:
        print(f"  - {opp}: {opp_data.iloc[0]['w']}-{opp_data.iloc[0]['l']} record")

def create_prediction_features(okc_stats, okc_summary, opponent_stats, opponent_summary):
    """Build the single-row feature frame for one OKC-vs-opponent game.

    OKC fills the ``team1_*`` columns and the opponent the ``team2_*``
    columns. Per side, the nine per-game stats come first and the five
    advanced ratings second; the four difference features close the row.

    Args:
        okc_stats: Mapping/Series with OKC's per-game stats.
        okc_summary: Mapping/Series with OKC's advanced ratings.
        opponent_stats: Mapping/Series with the opponent's per-game stats.
        opponent_summary: Mapping/Series with the opponent's advanced ratings.

    Returns:
        A one-row DataFrame with 32 feature columns.
    """
    # (feature suffix, source column) pairs for the per-game stats.
    per_game_fields = (
        ('pts_per_game', 'pts_per_game'),
        ('fg_percent', 'fg_percent'),
        ('3p_percent', 'x3p_percent'),
        ('ft_percent', 'ft_percent'),
        ('reb_per_game', 'trb_per_game'),
        ('ast_per_game', 'ast_per_game'),
        ('stl_per_game', 'stl_per_game'),
        ('blk_per_game', 'blk_per_game'),
        ('tov_per_game', 'tov_per_game'),
    )
    rating_fields = ('o_rtg', 'd_rtg', 'n_rtg', 'srs', 'mov')

    sides = (
        ('team1', okc_stats, okc_summary),
        ('team2', opponent_stats, opponent_summary),
    )

    row = {}
    for prefix, side_stats, side_summary in sides:
        for suffix, source_col in per_game_fields:
            row[f'{prefix}_{suffix}'] = side_stats[source_col]
        for rating in rating_fields:
            row[f'{prefix}_{rating}'] = side_summary[rating]

    # Difference features, always OKC minus opponent.
    row['pts_diff'] = okc_stats['pts_per_game'] - opponent_stats['pts_per_game']
    row['rtg_diff'] = okc_summary['n_rtg'] - opponent_summary['n_rtg']
    row['srs_diff'] = okc_summary['srs'] - opponent_summary['srs']
    row['off_def_diff'] = okc_summary['o_rtg'] - opponent_summary['d_rtg']

    return pd.DataFrame([row])

# Make predictions for each opponent
predictions = []
for opp_abbrev in opponent_abbrevs:
    opp_stats = team_stats[(team_stats['abbreviation'] == opp_abbrev) & (team_stats['season'] == 2026)]
    opp_summary = team_summaries[(team_summaries['abbreviation'] == opp_abbrev) & (team_summaries['season'] == 2026)]

    if opp_stats.empty or opp_summary.empty:
        # Say so explicitly instead of silently predicting fewer games.
        print(f"Skipping {opp_abbrev}: no 2026 data available")
        continue

    opp_record = opp_summary.iloc[0]
    game_features = create_prediction_features(okc_stats, okc_summary, opp_stats.iloc[0], opp_record)

    # Predictions always come from the fitted Random Forest (`rf_model`),
    # whichever model the comparison step reported as best.
    # Column 1 of predict_proba is read as the probability of an OKC win
    # (assumes the model was trained with classes 0 and 1).
    win_prob = rf_model.predict_proba(game_features)[0][1]
    prediction = rf_model.predict(game_features)[0]

    predictions.append({
        'Opponent': opp_abbrev,
        'Opponent Record': f"{opp_record['w']}-{opp_record['l']}",
        'Prediction': 'WIN' if prediction == 1 else 'LOSS',
        'Win Probability': f"{win_prob*100:.1f}%"
    })

predictions_df = pd.DataFrame(predictions)
print("\n\nPredictions for OKC Thunder Next 5 Games:")
print("=" * 60)
print(predictions_df.to_string(index=False))

# Summary
# Count over the games that were actually predicted, so skipped opponents
# are not reported as losses and do not distort the win rate.
games_predicted = len(predictions)
wins = sum(1 for p in predictions if p['Prediction'] == 'WIN')
losses = games_predicted - wins
print(f"\n\nPredicted Record: {wins}-{losses}")
if games_predicted:
    print(f"Win Rate: {wins/games_predicted*100:.1f}%")
else:
    print("Win Rate: n/a (no games could be predicted)")

## 8. Visualizations

In [None]:
# Confusion matrices for all four models, drawn on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Test-set confusion matrix of every model
cm_rf = confusion_matrix(y_test, rf_pred)
cm_gb = confusion_matrix(y_test, gb_pred)
cm_bag = confusion_matrix(y_test, bagging_pred)
cm_voting = confusion_matrix(y_test, voting_pred)

# (target axes, matrix, colour map, title) for each panel
heatmap_panels = [
    (axes[0, 0], cm_rf, 'Blues', 'Random Forest Confusion Matrix'),
    (axes[0, 1], cm_gb, 'Greens', 'Gradient Boosting Confusion Matrix'),
    (axes[1, 0], cm_bag, 'Reds', 'Bagging Confusion Matrix'),
    (axes[1, 1], cm_voting, 'Oranges', 'Voting Classifier Confusion Matrix'),
]
for panel_ax, matrix, cmap_name, panel_title in heatmap_panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap_name, ax=panel_ax)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_ylabel('Actual')
    panel_ax.set_xlabel('Predicted')

plt.tight_layout()
plt.savefig('confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

# Prediction visualization: one bar per opponent, green for a predicted win
# and red for a predicted loss
fig, ax = plt.subplots(figsize=(10, 6))
colors = ['green' if outcome == 'WIN' else 'red' for outcome in predictions_df['Prediction']]
# The probabilities are stored as "NN.N%" strings; convert them back once.
win_pcts = predictions_df['Win Probability'].str.rstrip('%').astype(float)
bars = ax.bar(predictions_df['Opponent'], win_pcts, color=colors, alpha=0.7)
ax.set_title('OKC Thunder Next 5 Games Predictions', fontsize=14, fontweight='bold')
ax.set_ylabel('Win Probability (%)', fontsize=12)
ax.set_xlabel('Opponent', fontsize=12)
ax.set_ylim([0, 100])
ax.axhline(y=50, color='black', linestyle='--', alpha=0.3, label='50% threshold')
ax.legend()
ax.grid(axis='y', alpha=0.3)

# Add value labels on bars, centred just above each bar
for bar, prob in zip(bars, win_pcts):
    label_x = bar.get_x() + bar.get_width()/2.
    ax.text(label_x, bar.get_height(),
            f'{prob:.1f}%',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('okc_predictions.png', dpi=300, bbox_inches='tight')
plt.show()