In [22]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV
import joblib


In [23]:
# Load Data
stats = pd.read_csv("player_mvp_stats.csv")


In [24]:
# Handle Missing Values
stats.fillna(stats.mean(numeric_only=True), inplace=True)


In [25]:
# Define Predictors
predictors = ["Age", "G_y", "GS", "MP", "FG", "FGA", 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%', 'PS/G', 'PA/G', 'SRS']


In [37]:
# Split Data
years = list(range(1995, 2025))
ridge_mse = []
rf_mse = []
ridge_r2 = []
rf_r2 = []
ridge_mae = []
rf_mae = []

for year in years:
    # Split Data
    train = stats[stats["Year"] < year].copy()
    test = stats[stats["Year"] == year].copy()


In [38]:
# Feature Scaling
scaler = StandardScaler()
train[predictors] = scaler.fit_transform(train[predictors])
test[predictors] = scaler.transform(test[predictors])


In [39]:
# Ridge Regression Model
ridge = Ridge()

# Hyperparameter Tuning for Ridge
ridge_params = {'alpha': [0.01, 0.1, 1, 10]}
ridge_grid = GridSearchCV(ridge, ridge_params, cv=5)
ridge_grid.fit(train[predictors], train["Share"])

# Best Model
ridge_best = ridge_grid.best_estimator_

# Predictions
predictions = ridge_best.predict(test[predictors])
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

# Combine and Evaluate
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
print("MSE:", mean_squared_error(combination["Share"], combination["predictions"]))
print("R²:", r2_score(combination["Share"], combination["predictions"]))
print("MAE:", mean_absolute_error(combination["Share"], combination["predictions"]))


MSE: 0.0017017920307472578
R²: 0.06720237406718432
MAE: 0.010929158258906045


In [40]:
# RandomForest Regressor
rf = RandomForestRegressor(random_state=1)

# Hyperparameter Tuning for RandomForest
rf_params = {'n_estimators': [50, 100, 200], 'min_samples_split': [2, 5, 10]}
rf_grid = GridSearchCV(rf, rf_params, cv=5)
rf_grid.fit(train[predictors], train["Share"])

# Best Model
rf_best = rf_grid.best_estimator_

# Predictions
rf_predictions = rf_best.predict(test[predictors])
rf_predictions = pd.DataFrame(rf_predictions, columns=["predictions"], index=test.index)

# Combine and Evaluate
rf_combination = pd.concat([test[["Player", "Share"]], rf_predictions], axis=1)
print("RandomForest MSE:", mean_squared_error(rf_combination["Share"], rf_combination["predictions"]))
print("RandomForest R²:", r2_score(rf_combination["Share"], rf_combination["predictions"]))
print("RandomForest MAE:", mean_absolute_error(rf_combination["Share"], rf_combination["predictions"]))


RandomForest MSE: 0.0010050626780250599
RandomForest R²: 0.4490983251556372
RandomForest MAE: 0.003306522054257553


In [41]:
# Additional Evaluation Metrics
def calculate_metrics(combination, share_col='Share'):
    print("Top 5 Actual MVPs:")
    print(combination.sort_values(share_col, ascending=False).head(5))
    print("Top 5 Predicted MVPs:")
    print(combination.sort_values("predictions", ascending=False).head(5))

calculate_metrics(combination)
calculate_metrics(rf_combination)


Top 5 Actual MVPs:
                       Player     Share  predictions
711              Nikola Jokić  0.935000     0.265513
1349  Shai Gilgeous-Alexander  0.646000     0.239367
260               Luka Dončić  0.572000     0.241445
150     Giannis Antetokounmpo  0.194000     0.272402
9601              Nic Claxton  0.177661     0.195556
Top 5 Predicted MVPs:
                      Player     Share  predictions
150    Giannis Antetokounmpo  0.194000     0.272402
711             Nikola Jokić  0.935000     0.265513
4182        Domantas Sabonis  0.003000     0.257844
12777            Joel Embiid  0.177661     0.243985
260              Luka Dončić  0.572000     0.241445
Top 5 Actual MVPs:
                       Player     Share  predictions
711              Nikola Jokić  0.935000     0.450596
1349  Shai Gilgeous-Alexander  0.646000     0.257481
260               Luka Dončić  0.572000     0.244483
150     Giannis Antetokounmpo  0.194000     0.295841
9601              Nic Claxton  0.177661     0

In [34]:
# Save Model
joblib.dump(ridge_best, 'ridge_best_model.pkl')
joblib.dump(rf_best, 'rf_best_model.pkl')


['rf_best_model.pkl']