In [2]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
from sklearn.linear_model import LinearRegression
import pickle
# from Model_Build.Functions import *

In [41]:
# Target seasons; each one is paired with the advanced stats of the season before it.
years = [2019, 2018, 2017, 2016, 2015, 2014]
dataset_columns = ['Player', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 
                  'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48',
                  'OBPM', 'DBPM', 'BPM', 'VORP_Prior_Year']

# How each prior-season stat is combined when a player has several rows in one season.
# NOTE(review): if the source table lists a traded player once per team AND once as a
# combined total, the 'sum' entries below (and the VORP target) would over-count, and
# summing per-possession rates such as OBPM/DBPM/BPM looks questionable -- confirm
# against the source table before changing. Behaviour is intentionally left as it was.
prior_year_aggregations = {
    'G': 'sum', 'MP': 'sum', 'PER': 'mean',
    'TS%': 'mean', '3PAr': 'mean', 'FTr': 'mean',
    'ORB%': 'mean', 'DRB%': 'mean', 'TRB%': 'mean',
    'AST%': 'mean', 'STL%': 'mean', 'BLK%': 'mean',
    'TOV%': 'mean', 'USG%': 'mean', 'OWS': 'sum',
    'DWS': 'sum', 'WS': 'sum', 'WS/48': 'mean',
    'OBPM': 'sum', 'DBPM': 'sum', 'BPM': 'sum',
    'VORP_Prior_Year': 'sum',
}

# Raw column names of the predictive stats as they appear on the page
# (the page calls the last one 'VORP'; it is renamed after loading).
prior_stat_columns = dataset_columns[1:-1] + ['VORP']


def _load_advanced_stats(year, stat_columns):
    """Download one season's advanced-stats table and return it with numeric stats.

    Parameters
    ----------
    year : int
        Season identifier used in the basketball-reference URL.
    stat_columns : list of str
        Names of the stat columns to keep and convert to numeric.

    Returns
    -------
    pandas.DataFrame
        Columns ``Player``, ``Team`` and ``stat_columns``. Rows whose team value is
        the literal ``'Tm'`` (presumably header rows repeated inside the table) are
        dropped.

    Raises
    ------
    KeyError
        If an expected column is missing from the downloaded table.
    ValueError
        If a stat value cannot be parsed as a number.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_advanced.html'
    table = pd.read_html(url)[0]
    table = table[['Player', 'Tm'] + list(stat_columns)].rename(columns={'Tm': 'Team'})
    # .copy() so the numeric conversion below writes to an independent frame
    # instead of a view of the filtered table (avoids SettingWithCopyWarning).
    table = table[table.Team != 'Tm'].copy()
    table[stat_columns] = table[stat_columns].apply(pd.to_numeric)
    return table


# Collect one merged frame per season and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and re-copied the data on every call.
season_frames = []

for year in years:
    # Getting VORP for a given year
    player_vorp = (
        _load_advanced_stats(year, ['VORP'])
        .groupby('Player')['VORP']
        .sum()
        .reset_index()
    )
    player_vorp['Year'] = year

    # Getting predictive advanced stats from the prior year
    player_predictive = (
        _load_advanced_stats(year - 1, prior_stat_columns)
        .rename(columns={'VORP': 'VORP_Prior_Year'})
        .groupby('Player')
        .agg(prior_year_aggregations)
        .reset_index()
    )

    # Merging VORP with prior year's predictive stats
    # (inner join: only players present in both seasons are kept)
    merged = pd.merge(player_vorp, player_predictive, on='Player', how='inner')
    season_frames.append(merged)

# Appending to full dataset, keeping the column order the original append produced
if season_frames:
    dataset = pd.concat(season_frames)[dataset_columns + ['VORP', 'Year']]
else:
    dataset = pd.DataFrame(columns=dataset_columns)
    

In [43]:
    # Predictor columns: the prior-season stats held in `dataset`.
    feature_names = [
        'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
        'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48',
        'OBPM', 'DBPM', 'BPM', 'VORP_Prior_Year',
    ]
    x = dataset[feature_names]
    # Regression target: the VORP column of `dataset`.
    y = dataset['VORP']

    # Fit a linear regression; fit() returns the estimator itself.
    model = LinearRegression().fit(x, y)
    # R^2 is computed on the same rows the model was fitted on (no hold-out set here).
    r_squared = model.score(x, y)

    # Report the fitted parameters; coef_ holds one coefficient per feature column.
    print('The y-intercept is: ', model.intercept_)
    print('The slope is: ', model.coef_)
    print('The r-squared is: ', r_squared)

The y-intercept is:  -0.3333749401270538
The slope is:  [-7.43465733e-03  2.47456113e-04  6.27825867e-02 -1.89825549e+00
  4.51246345e-01  3.40079986e-01 -2.95758986e-02  1.10759602e-02
  9.85909313e-03  1.07487963e-04  1.01704747e-01  5.39183238e-02
  9.16884158e-03  7.77248797e-03  8.19926264e-01  7.71525087e-01
 -8.06070692e-01  6.06415441e-01  2.40724719e-01  2.39743028e-01
 -2.68998642e-01  5.77639472e-01]
The r-squared is:  0.5761333188317022


In [44]:
# Serialise the fitted model to a pickle file under the project's data folder.
# The target directory must already exist; open() does not create it.
file_name = 'Model_Build/Data/predict_vorp_regression.pickle'
with open(file_name, mode='wb') as pickle_file:
    pickle.dump(model, pickle_file)