In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the player statistics table from a single CSV file.
# NOTE(review): no merge is performed in this cell; the file is presumably the
# already-merged stats + MVP-voting table. Confirm it contains the 'Player',
# 'Year', 'Team' and 'Share' columns that later cells reference.
# NOTE(review): 'path_to_sorted_file.csv' looks like a placeholder path; confirm.
data = pd.read_csv('path_to_sorted_file.csv')

In [4]:
# Order the rows by season first, then by team within each season.
data = data.sort_values(['Year', 'Team'])

In [26]:
# Replace every missing value with 0.
# The result is reassigned instead of using inplace=True, so the step reads as
# an explicit transformation and does not mutate the frame object in place.
# NOTE(review): the fill applies to every column, so any text column with gaps
# receives the integer 0 as a value; confirm that is intended before encoding.
data = data.fillna(0)

In [38]:
# The label to predict, and the feature list: every column except the
# identifier columns ('Player', 'Year') and the label itself.
target = 'Share'
predictors = [name for name in data.columns if name not in ('Player', 'Year', target)]

In [37]:
# Select predictors and target variable.
# NOTE(review): this cell repeats the predictor/target definition from the
# cell directly above it; the two produce the same values, so one of them is
# redundant and can be removed when the notebook is cleaned up.
predictors = [col for col in data.columns if col not in ['Player', 'Year', 'Share']]
target = 'Share'

In [29]:

# One-hot encode every predictor stored with the 'object' dtype so that the
# feature matrix handed to the models is numeric.
categorical_cols = [name for name in predictors if data[name].dtype == 'object']
data = pd.get_dummies(data, columns=categorical_cols)

In [30]:
# Rebuild the feature list: one-hot encoding replaced the text columns with
# new indicator columns, so the earlier list no longer matches data.columns.
predictors = [name for name in data.columns if name not in ('Player', 'Year', 'Share')]

In [31]:
# Standardize the features.
# The scaler's statistics are estimated from the training rows only, so that
# nothing about the held-out test rows leaks into preprocessing. The row
# partition is reproduced here with the same test_size / random_state as the
# split cell that follows; with identical settings and row count both calls
# select the same rows. Keep the two cells in sync if either value changes.
# NOTE(review): the recorded outputs further down show 'Share' values such as
# -0.095031 and 14.796134, which look standardized even though 'Share' is
# excluded from `predictors`; those outputs presumably come from an earlier
# kernel state. Re-run from a fresh kernel to confirm.
row_positions = np.arange(len(data))
train_positions, test_positions = train_test_split(row_positions, test_size=0.2, random_state=1)

scaler = StandardScaler()
scaler.fit(data.iloc[train_positions][predictors])
data[predictors] = scaler.transform(data[predictors])

In [32]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# reproducible between runs.
# NOTE(review): this is a random row-level split, so rows from the same 'Year'
# can land on both sides; confirm that is acceptable for this analysis.
train, test = train_test_split(data, random_state=1, test_size=0.2)

In [39]:
# Base learners for the ensemble, each seeded so repeated fits are reproducible.
gbr = GradientBoostingRegressor(random_state=1)
rf = RandomForestRegressor(random_state=1)

In [40]:

# Hyperparameter search spaces, one candidate list per setting.

# Random forest: 3 * 3 * 3 * 3 = 81 combinations.
rf_params = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Gradient boosting: 3 * 3 * 3 * 3 * 3 = 243 combinations.
gbr_params = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [41]:
# Wrap each base learner in a 5-fold cross-validated grid search over its
# parameter grid. No `scoring` is passed, so each estimator's default scorer
# is used.
rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5, n_jobs=-1, verbose=1)
gbr_grid = GridSearchCV(estimator=gbr, param_grid=gbr_params, cv=5, n_jobs=-1, verbose=1)

In [42]:
# Run both grid searches on the training split only; the test split is not
# touched here. With cv=5 this is 405 fits for the random forest grid and
# 1215 fits for the gradient boosting grid, so this cell is expensive to re-run.
rf_grid.fit(train[predictors], train[target])
gbr_grid.fit(train[predictors], train[target])

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [43]:
# Pull the winning estimator out of each completed grid search.
rf_best, gbr_best = rf_grid.best_estimator_, gbr_grid.best_estimator_

In [44]:

# (name, estimator) pairs that form the first layer of the stacking ensemble.
base_models = [('rf', rf_best), ('gbr', gbr_best)]


In [45]:
# Second-layer learner that combines the base models' predictions; seeded for
# reproducibility.
meta_model = GradientBoostingRegressor(random_state=1, n_estimators=100)

In [46]:
# Assemble the stacking ensemble from the tuned base models and the meta-model,
# with cv=5 passed for the stacking step.
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)

In [47]:
# Fit the stacking ensemble on the training split only, keeping the test split
# unseen for the evaluation cells that follow.
stacking_regressor.fit(train[predictors], train[target])

In [48]:
# Predict the held-out rows and keep the result as a one-column frame that
# shares the test split's index, so it can be lined up with the true values.
stacking_predictions = pd.DataFrame(
    {"predictions": stacking_regressor.predict(test[predictors])},
    index=test.index,
)

In [50]:
# Put the true shares and the predictions side by side (aligned on the test
# index), then report the three error metrics on the held-out rows.
stacking_combination = pd.concat([test[["Player", "Share"]], stacking_predictions], axis=1)
for label, metric in (
    ("Stacking MSE:", mean_squared_error),
    ("Stacking R²:", r2_score),
    ("Stacking MAE:", mean_absolute_error),
):
    print(label, metric(stacking_combination["Share"], stacking_combination["predictions"]))

Stacking MSE: 0.0021394583355989834
Stacking R²: 0.9975620857971954
Stacking MAE: 0.0038285338287963747


In [53]:
# R² (coefficient of determination) on the held-out test rows.
r2_stacking = r2_score(stacking_combination["Share"], stacking_combination["predictions"])

# Express R² as a percentage. R² measures the share of target variance the
# model explains; it is not a classification-style accuracy, so the printed
# label names the metric explicitly. The variable name is kept unchanged in
# case other cells reference it.
accuracy_percentage_stacking = r2_stacking * 100

print(f"Stacking Model R² (variance explained): {accuracy_percentage_stacking:.2f}%")

Stacking Model Accuracy: 99.76%


In [55]:
# Score every row of the dataset and store the result as a new column.
# NOTE(review): `data` contains the training rows as well, so most of these
# predictions are in-sample and are not an estimate of held-out performance.
data['Predictions'] = stacking_regressor.predict(data[predictors])

# Actual vs predicted share per player, restricted to seasons 1995-2024
# (both bounds inclusive).
comparison = data.loc[
    (data['Year'] >= 1995) & (data['Year'] <= 2024),
    ['Year', 'Player', 'Share', 'Predictions'],
]

# Show the comparison table.
print(comparison)


       Year               Player     Share  Predictions
0      1995          Andrew Lang -0.095031    -0.094916
1      1995           Craig Ehlo -0.095031    -0.094916
2      1995         Doug Edwards -0.095031    -0.094916
3      1995        Ennis Whatley -0.095031    -0.094916
4      1995          Fred Vinson -0.095031    -0.094916
...     ...                  ...       ...          ...
14239  2024    Marvin Bagley III -0.095031    -0.094916
14240  2024  Patrick Baldwin Jr. -0.095031    -0.094916
14241  2024       Richaun Holmes -0.095031    -0.094916
14242  2024     Tristan Vukcevic -0.095031    -0.094916
14243  2024           Tyus Jones -0.095031    -0.094916

[14244 rows x 4 columns]


In [59]:
def calculate_top_5_mvp(comparison, n=5):
    """Return the top-``n`` rows per year by actual and by predicted share.

    Parameters
    ----------
    comparison : pandas.DataFrame
        Must contain the columns 'Year', 'Share' and 'Predictions'. Any other
        columns (for example 'Player') are carried through unchanged.
    n : int, default 5
        Number of rows to keep for each year. The default matches the
        function's original fixed behavior.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(top_actual, top_predicted)``. Each frame holds, for every year in
        order of first appearance, up to ``n`` rows sorted in descending
        order of 'Share' (first frame) or 'Predictions' (second frame). The
        original index labels are preserved. If there are no rows to rank,
        two empty frames with the input's columns are returned.

    Raises
    ------
    KeyError
        If a required column is missing from ``comparison``.
    """
    top_actual = []
    top_predicted = []

    # A single groupby pass replaces re-filtering the whole frame once per
    # year; sort=False keeps the years in the order they first appear.
    for _, yearly_data in comparison.groupby('Year', sort=False):
        top_actual.append(yearly_data.sort_values('Share', ascending=False).head(n))
        top_predicted.append(yearly_data.sort_values('Predictions', ascending=False).head(n))

    # pd.concat raises on an empty list, so handle "nothing to rank" explicitly.
    if not top_actual:
        empty = comparison.iloc[0:0]
        return empty.copy(), empty.copy()

    return pd.concat(top_actual), pd.concat(top_predicted)

# Rank each season's players by actual and by predicted share.
top_5_actual, top_5_predicted = calculate_top_5_mvp(comparison)

# Print each table under its heading.
print("Top 5 Actual MVPs:", top_5_actual, sep="\n")

print("Top 5 Predicted MVPs:", top_5_predicted, sep="\n")


Top 5 Actual MVPs:
       Year                   Player      Share  Predictions
350    1995           David Robinson  14.796134    15.725012
286    1995         Shaquille O'Neal   9.901835     9.648797
387    1995              Karl Malone   8.704294     8.720148
271    1995            Patrick Ewing   3.705861     3.593256
145    1995          Hakeem Olajuwon   2.334763     2.196643
...     ...                      ...        ...          ...
13815  2024             Nikola Jokić  16.132521    17.181794
14070  2024  Shai Gilgeous-Alexander  11.116732    11.557831
13796  2024              Luka Dončić   9.832413     9.642465
13986  2024    Giannis Antetokounmpo   3.271969     2.985376
14042  2024            Jalen Brunson   2.386830     2.196643

[150 rows x 4 columns]
Top 5 Predicted MVPs:
       Year                   Player      Share  Predictions
350    1995           David Robinson  14.796134    15.725012
286    1995         Shaquille O'Neal   9.901835     9.648797
387    1995         

In [61]:
# 'comparison' is expected to hold the columns 'Year', 'Player', 'Share' and
# 'Predictions' (as built in the earlier comparison cell).

# Order by season, then alphabetically by player, before exporting.
comparison = comparison.sort_values(['Year', 'Player'])

# Write the table to disk without the index column.
comparison.to_csv('predictions_vs_actual.csv', index=False)

# Confirm to the reader that the export happened.
print("Predictions vs Actual results have been saved to 'predictions_vs_actual.csv'.")


Predictions vs Actual results have been saved to 'predictions_vs_actual.csv'.


In [62]:
# Fit the stacking ensemble on the training split.
# NOTE(review): the same model was already fitted on the same data in an
# earlier cell, so this call repeats that work.
stacking_regressor.fit(train[predictors], train[target])

# In-sample metrics (MSE, R², MAE) on the rows the model was trained on.
train_predictions = stacking_regressor.predict(train[predictors])
train_mse, train_r2, train_mae = (
    metric(train[target], train_predictions)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

print("Training MSE:", train_mse)
print("Training R²:", train_r2)
print("Training MAE:", train_mae)

# The same three metrics on the held-out test rows.
test_predictions = stacking_regressor.predict(test[predictors])
test_mse, test_r2, test_mae = (
    metric(test[target], test_predictions)
    for metric in (mean_squared_error, r2_score, mean_absolute_error)
)

print("Testing MSE:", test_mse)
print("Testing R²:", test_r2)
print("Testing MAE:", test_mae)

# Heuristic overfitting flag: training R² exceeding test R² by more than 0.1.
# The 0.1 margin is an arbitrary threshold; adjust as needed.
print(
    "The model may be overfitting."
    if train_r2 > test_r2 + 0.1
    else "The model does not appear to be overfitting."
)


Training MSE: 0.0018584305642238919
Training R²: 0.9981966324084313
Training MAE: 0.003043291123075095
Testing MSE: 0.0021394583355989834
Testing R²: 0.9975620857971954
Testing MAE: 0.0038285338287963747
The model does not appear to be overfitting.


In [63]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Refit the stacking ensemble on growing subsets of the training split and
# score each subset size with 5-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    stacking_regressor,
    train[predictors],
    train[target],
    cv=5,
    n_jobs=-1,
)

# Average across the folds: one mean score per training-set size.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

# Plot both curves on a single set of axes.
fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, label="Training score")
ax.plot(train_sizes, test_scores_mean, label="Cross-validation score")
ax.set_title("Learning Curve")
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.legend(loc="best")
plt.show()
