In [95]:
import requests
import bs4
import pandas as pd
import numpy as np

In [5]:
url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html"

In [150]:
res = requests.get(url.format(2015))
soup = bs4.BeautifulSoup(res.text,'lxml')
table = soup.find('table')
final_df = pd.read_html(str(table))[0]
    
for col in ['Age','G','FGA','FG','3P','FT','FTA','TRB','AST','STL','BLK','TOV','PTS']:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')
    
final_df['2015_score'] = (final_df['FGA']*(-0.45) + final_df['FG']*(1.0) + final_df['FTA']*(-0.75) + final_df['FT']*(1.0) 
                        + final_df['3P']*(3.0) + final_df['PTS']*(0.5) + final_df['TRB']*(1.5) + final_df['AST']*(1.5)
                        + final_df['STL']*(2.0) + final_df['BLK']*(3.0) + final_df['TOV']*(-2.0) + final_df['G']*(0.2) 
                        + final_df['Age']*(1))
                     
final_df = final_df.drop(columns=['Rk', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
                          '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
                          'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'], axis=1)
                     
final_df.set_index('Player')

for i in range(2016,2021):
    
    res = requests.get(url.format(i))
    soup = bs4.BeautifulSoup(res.text,'lxml')
    table = soup.find('table')
    df = pd.read_html(str(table))[0]
    
    for col in ['Age','G','FGA','FG','3P','FT','FTA','TRB','AST','STL','BLK','TOV','PTS']:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    df[f'{i}_score'] = (df['FGA']*(-0.45) + df['FG']*(1.0) + df['FTA']*(-0.75) + df['FT']*(1.0) 
                        + df['3P']*(3.0) + df['PTS']*(0.5) + df['TRB']*(1.5) + df['AST']*(1.5)
                        + df['STL']*(2.0) + df['BLK']*(3.0) + df['TOV']*(-2.0) + df['G']*(0.2) 
                        + df['Age']*(1))
                     
    df = df.drop(columns=['Rk', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
                          '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
                          'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'], axis=1)
                     
    df.set_index('Player')
    
    final_df = pd.merge(final_df,df,on='Player',how='outer')
    
#columns before drop:
#Index['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       #'3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       #'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      #dtype='object'
# useful cat: 'Age','G','FGA','FG','3P','FT','FTA','TRB','AST','STL','BLK','TOV','PTS'
# Age factor is not apply to the score format right now, but will be considered in the future

In [151]:
final_df = final_df.groupby('Player').mean()

In [152]:
final_df.columns

Index(['2015_score', '2016_score', '2017_score', '2018_score', '2019_score',
       '2020_score'],
      dtype='object')

Feature Engineering

In [153]:
# remove players that don't have stats during 2018-2020
final_df = final_df.drop(final_df[final_df['2018_score'].isna() & final_df['2019_score'].isna() 
                                  & final_df['2020_score'].isna()].index)

#fill na with average scoring
final_df = final_df.fillna(final_df.mean(axis=0))


In [154]:
final_df.sort_values(by='2020_score',ascending=False).head(15)

Unnamed: 0_level_0,2015_score,2016_score,2017_score,2018_score,2019_score,2020_score
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
James Harden,80.655,82.985,90.22,87.98,95.525,93.615
LeBron James,79.45,82.605,88.16,93.84,87.645,91.245
Damian Lillard,72.455,74.835,77.815,79.82,82.11,86.72
Giannis Antetokounmpo,57.755,66.61,76.91,78.26,83.64,83.785
Hassan Whiteside,62.83,75.54,77.58,68.335,72.12,80.875
Anthony Davis,76.18,71.93,80.365,83.875,83.69,80.46
Nikola Vučević,68.49,66.68,70.86,70.935,83.995,79.71
Kyle Lowry,72.27,79.63,79.24,81.13,78.07,79.515
Chris Paul,83.09,81.155,79.92,80.69,77.895,79.435
Kawhi Leonard,66.315,73.655,75.535,53.915,76.315,78.97


In [155]:
final_df.sort_values(by='2019_score',ascending=False).head(15)

Unnamed: 0_level_0,2015_score,2016_score,2017_score,2018_score,2019_score,2020_score
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
James Harden,80.655,82.985,90.22,87.98,95.525,93.615
Russell Westbrook,77.45,82.705,93.15,86.68,88.81,77.3
Paul George,38.61,75.77,75.05,76.7,87.85,72.84
Stephen Curry,82.49,91.385,84.515,81.47,87.82,64.62
LeBron James,79.45,82.605,88.16,93.84,87.645,91.245
Nikola Vučević,68.49,66.68,70.86,70.935,83.995,79.71
Anthony Davis,76.18,71.93,80.365,83.875,83.69,80.46
Giannis Antetokounmpo,57.755,66.61,76.91,78.26,83.64,83.785
Kevin Durant,68.14,83.785,83.125,84.475,83.41,49.475935
Damian Lillard,72.455,74.835,77.815,79.82,82.11,86.72


Load Predict Score from computer

In [74]:
predict_df = pd.read_csv('predict_table.csv',index_col='Player')

In [75]:
predict_df.head()

Unnamed: 0_level_0,GP,FG,FGA,FT,FTA,3PM,PTS,TREB,AST,STL,BLK,TO,Predict_Score
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
James Harden,68,9.5,21.2,10.2,11.8,4.6,33.8,6.5,7.5,1.8,0.9,4.5,81.61
Anthony Davis,65,9.0,17.9,7.3,8.6,1.3,26.6,9.4,3.2,1.5,2.3,2.5,68.695
Karl-Anthony Towns,69,8.4,16.8,5.0,6.2,3.3,25.1,10.6,3.7,1.1,1.5,3.4,68.09
Stephen Curry,65,8.0,17.5,6.2,6.8,4.1,26.2,5.4,6.4,1.2,0.5,3.8,63.825
Damian Lillard,67,9.0,19.6,6.8,7.6,4.0,28.7,4.2,7.7,1.1,0.3,3.0,67.38


In [156]:
completed_df = pd.merge(final_df,predict_df,on='Player',how='outer')
completed_df = completed_df.drop(columns=['GP', 'FG', 'FGA', 'FT', 'FTA', '3PM', 'PTS', 'TREB',
                                          'AST', 'STL', 'BLK', 'TO'], axis=1)
completed_df.dropna(inplace=True)
completed_df['Predict_Score'] = pd.to_numeric(completed_df['Predict_Score'], errors='coerce')

In [157]:
completed_df.head()

Unnamed: 0_level_0,2015_score,2016_score,2017_score,2018_score,2019_score,2020_score,Predict_Score
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aaron Gordon,38.995,55.945,57.765,64.07,67.32,64.02,48.305
Aaron Holiday,54.042135,54.881162,53.741132,49.89386,42.06,52.95,34.715
Al Horford,70.86,76.39,74.69,75.2,75.73,74.98,43.075
Alec Burks,46.955,45.965,43.17,50.5,46.93875,59.656667,33.475
Alex Caruso,54.042135,54.881162,53.741132,39.005,45.145,49.4,29.17


In [158]:
completed_df.sort_values(by='Predict_Score',ascending=False).head(15)

Unnamed: 0_level_0,2015_score,2016_score,2017_score,2018_score,2019_score,2020_score,Predict_Score
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
James Harden,80.655,82.985,90.22,87.98,95.525,93.615,81.61
Giannis Antetokounmpo,57.755,66.61,76.91,78.26,83.64,83.785,76.14
Anthony Davis,76.18,71.93,80.365,83.875,83.69,80.46,68.695
Karl-Anthony Towns,54.042135,69.005,78.25,79.49,81.755,77.365,68.09
Damian Lillard,72.455,74.835,77.815,79.82,82.11,86.72,67.38
Trae Young,54.042135,54.881162,53.741132,49.89386,63.85,72.065,65.945
Joel Embiid,54.042135,54.881162,59.015,71.24,81.51,72.36,64.44
Stephen Curry,82.49,91.385,84.515,81.47,87.82,64.62,63.825
LeBron James,79.45,82.605,88.16,93.84,87.645,91.245,63.055
Kawhi Leonard,66.315,73.655,75.535,53.915,76.315,78.97,62.11


In [159]:
completed_df.to_csv('final_data.csv')

In [160]:
completed_df.columns

Index(['2015_score', '2016_score', '2017_score', '2018_score', '2019_score',
       '2020_score', 'Predict_Score'],
      dtype='object')

In [161]:
completed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 240 entries, Aaron Gordon to Zion Williamson
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   2015_score     240 non-null    float64
 1   2016_score     240 non-null    float64
 2   2017_score     240 non-null    float64
 3   2018_score     240 non-null    float64
 4   2019_score     240 non-null    float64
 5   2020_score     240 non-null    float64
 6   Predict_Score  240 non-null    float64
dtypes: float64(7)
memory usage: 15.0+ KB


Training Model

In [168]:
from sklearn.model_selection import train_test_split

X = completed_df[['2015_score', '2016_score', '2017_score', '2018_score', '2019_score','2020_score']] 

y = completed_df['Predict_Score'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [171]:
#Linera Regression Model

from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train,y_train) 
lm_predictions = lm.predict(X_test)

from sklearn import metrics

print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, lm_predictions)))

RMSE: 7.423105650439219


In [174]:
#Random Forests

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))


RMSE: 7.329156453614735


In [185]:
#Neural Networks

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.layers import Dropout

model = Sequential()

model.add(Dense(8,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(4,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2,activation='relu'))

model.add(Dense(1))

model.compile(optimizer='adam',loss='mse')

from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=25)
model.fit(X_train,y_train,epochs=800,validation_data=(X_test, y_test),callbacks=[early_stop])

training_score = model.evaluate(X_train,y_train,verbose=0)
test_score = model.evaluate(X_test,y_test,verbose=0)

test_predictions = model.predict(X_test)

print('training_score: ', training_score)
print('test_score: ', test_score)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, test_predictions)))

Epoch 1/800


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800
Epoch 36/800
Epoch 37/800
Epoch 38/800
Epoch 39/800
Epoch 40/800
Epoch 41/800
Epoch 42/800
Epoch 43/800
Epoch 44/800
Epoch 45/800
Epoch 46/800
Epoch 47/800
Epoch 48/800
Epoch 49/800
Epoch 50/800
Epoch 51/800
Epoch 52/800
Epoch 53/800
Epoch 54/800
Epoch 

Epoch 74/800
Epoch 75/800
Epoch 76/800
Epoch 77/800
Epoch 78/800
Epoch 79/800
Epoch 80/800
Epoch 81/800
Epoch 82/800
Epoch 83/800
Epoch 84/800
Epoch 85/800
Epoch 86/800
Epoch 87/800
Epoch 88/800
Epoch 89/800
Epoch 00089: early stopping
training_score:  628.5396728515625
test_score:  697.5608520507812
RMSE: 26.41137678449719
