# NBA MVP Prediction
### Regression problem

In [3]:
import numpy as np
import pandas as pd
import time
import re
import os

In [9]:
df = pd.read_csv('mvp_votings.csv', index_col=0)

## Training and Validation

Cross-validation by using all seasons for training, except one season, used for validation.

### Pipeline

In [5]:
from sklearn.preprocessing import PolynomialFeatures, normalize, StandardScaler, MinMaxScaler
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
import logging

In [6]:
def pipeline(data_frame, estimators, params, filename, poly_fit=None, scaler=None):
    seasons = data_frame.season.unique()
    features = ['ts_pct', 'bpm', 'usg_pct', 'pts_per_g', 'trb_per_g', 'per', 'ws_per_48', 'win_pct']
    target = ['award_share']
    
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.basicConfig(filename=filename, filemode='w', level=logging.INFO)
    logger = logging.getLogger()
    
    minimal_error, best_estimator = None, None
    
    for estimator in estimators:
        print(f"Starting with estimator: {estimator.__name__}")
        logging.info(f"Starting with estimator: {estimator.__name__}")
            
        for index, curr_params in enumerate(params[estimator.__name__]):
            regressor = estimator(**curr_params)
            errors = [] # MSE for each split
            accuracies = [] # accuracies for each split
            best_accuracies = []
                
            for season in seasons:
                train_data = data_frame.loc[data_frame.season != season]
                validation_data = data_frame.loc[data_frame.season == season]
                validation_data = validation_data.sort_values(by='award_share', ascending=False)
                    
                # Get train data
                train_x = train_data[features].to_numpy()
                train_y = train_data[target].to_numpy()
                train_y = train_y.reshape(train_y.shape[0], )
                    
                # Validate over one season only
                val_x = validation_data[features].to_numpy()
                val_y = validation_data[target].to_numpy()
                val_y = val_y.reshape(val_y.shape[0], )
                
                if poly_fit:
                    train_x = poly_fit.fit_transform(train_x)
                    val_x = poly_fit.fit_transform(val_x)
                        
                if scaler:
                    train_x = scaler.fit_transform(train_x)
                    val_x = scaler.fit_transform(val_x)
                        
                shuffle_x, shuffle_y = shuffle(train_x, train_y)
                    
                regressor.fit(shuffle_x, shuffle_y)
                predicted_y = regressor.predict(val_x)
                    
                sorted_indices = np.argsort(predicted_y)[::-1]
                correct_indices = np.arange(len(val_y))
                    
                accuracy = np.sum(sorted_indices[:5] == correct_indices[:5] / len(correct_indices[:5]))
                accuracies.append(accuracy)
                    
                best_accuracies.append(np.sum(sorted_indices[:1] == correct_indices[:5]) / len(correct_indices[:1]))
                    
                curr_error = mean_squared_error(val_y, predicted_y)
                errors.append(curr_error)
                
            mean_error = np.average(errors)
            mean_accuracy = np.average(accuracies)
            mean_acc_at_1 = np.average(best_accuracies)
            logging.info(f"Parameters: {curr_params}, " \
                         f"MSE over all splits is: {mean_error:.4f}, " \
                         f"Mean accuracy at 5: {mean_accuracy:.4f}, " \
                         f"Mean accuracy at 1: {mean_acc_at_1:.4f} "
                        )
            print(f"Parameters: {curr_params}, " \
                  f"MSE over all splits is: {mean_error:.4f}, " \
                  f"Mean accuracy at 5: {mean_accuracy:.4f}, " \
                  f"Mean accuracy at 1: {mean_acc_at_1:.4f} ")
            
            if minimal_error is None or mean_error < minimal_error:
                minimal_error = mean_error
                best_estimator = estimator(*curr_params)
        
    return best_estimator

#### Defining estimators and their parameters

In [7]:
estimators = [LinearRegression]
params = {
    LinearRegression.__name__: [
        {
            'n_jobs': -1,
        },
        {
            'n_jobs': -1,
            'normalize': True
        },
    ]
}

#### Run pipeline to find best estimator to predict the NBA MVP

In [10]:
best_estimator = pipeline(
    data_frame=df,
    estimators=estimators,
    params=params,
    filename="temp.txt",
    poly_fit=PolynomialFeatures(degree=2, interaction_only=False)
)

Starting with estimator: LinearRegression
Parameters: {'n_jobs': -1}, MSE over all splits is: 0.0361, Mean accuracy at 5: 0.6579, Mean accuracy at 1: 1.0000 
Parameters: {'n_jobs': -1, 'normalize': True}, MSE over all splits is: 0.0397, Mean accuracy at 5: 0.6053, Mean accuracy at 1: 0.9474 


## Predicting the MVP

In [11]:
test_data = pd.read_csv('test_data.csv')
top_players = pd.read_csv('top_players.csv')

In [12]:
features = ['ts_pct', 'bpm', 'mp_per_g', 'pts_per_g', 'trb_per_g', 'ast_per_g',
            'stl_per_g', 'blk_per_g', 'ws', 'win_pct']
target = ['award_share']

In [13]:
def get_predictions_for_regression_model(model, poly_fit, train_data, val_data, scaler=None, should_print=True):
    train_x = train_data[features]
    train_y = train_data[target]
    
    train_x, train_y = shuffle(train_x, train_y)
    
    test_x = test_data[features].to_numpy()
    test_x = np.nan_to_num(test_x)
    
    if poly_fit:
        train_x = poly_fit.fit_transform(train_x)
        test_x = poly_fit.fit_transform(test_x)
        
    if scaler:
        train_x = scaler.fit_transform(train_x)
        test_x = scaler.fit_transform(test_x)
        
    model.fit(train_x, train_y.values.ravel())
    predict_y = model.predict(test_x)
    sorted_indices = np.argsort(predict_y)[::-1]
    predictions = predict_y[sorted_indices]
    
    formatted_preds = []
    if should_print:
        print(f"Predictions")
    for i in range(10):
        if predictions[i] < 0:
            break
        if should_print:
            print(f"{i+1}. {top_players.iloc[sorted_indices[i]].Player}: {predictions[i]}")
        formatted_preds.append((top_players.iloc[sorted_indices[i]].Player, predictions[i]))
    return formatted_preds

In [14]:
get_predictions_for_regression_model(
    model=GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, subsample=1.0),
    poly_fit=PolynomialFeatures(degree=2, interaction_only=False),
    train_data=df,
    val_data=test_data
)

Predictions
1. James Harden: 0.7376242046085356
2. Giannis Antetokounmpo: 0.725515666993111
3. Joel Embiid: 0.29815014201398854
4. Kawhi Leonard: 0.26369955851669674
5. Paul George: 0.17653668514623136
6. Stephen Curry: 0.1740185306574057
7. Nikola Jokic: 0.1725925733183824
8. Rudy Gobert: 0.1546860022795456
9. Russell Westbrook: 0.1306408948130648
10. Damian Lillard: 0.10123934904712832


[('James Harden', 0.7376242046085356),
 ('Giannis Antetokounmpo', 0.725515666993111),
 ('Joel Embiid', 0.29815014201398854),
 ('Kawhi Leonard', 0.26369955851669674),
 ('Paul George', 0.17653668514623136),
 ('Stephen Curry', 0.1740185306574057),
 ('Nikola Jokic', 0.1725925733183824),
 ('Rudy Gobert', 0.1546860022795456),
 ('Russell Westbrook', 0.1306408948130648),
 ('Damian Lillard', 0.10123934904712832)]