<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
# Import modules and packages
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import json
import pandas as pd
import seaborn as sns
import re
from scipy.stats import gaussian_kde
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import unidecode
import unicodedata
%matplotlib inline

In [None]:
# Use seaborn's "whitegrid" style for the figures in this notebook.
sns.set_style(style="whitegrid")

In [None]:
# Load the compiled NBA player statistics CSV (path is relative to the notebook)
# and echo the resulting frame under a short heading.
df_orig = pd.read_csv(filepath_or_buffer='../CompleteNBAPlayerStats.csv')
print(
    "Table of BBall Player Stats:\n\n",
    df_orig,
    sep=" ",
)

In [None]:
# Build the working frame `df`: keep only rows that pass the games-played and
# minutes-per-game cuts, then derive shot-volume and shot-frequency columns.
#
# NOTE(review): `min_num_games` and `min_MPG` are not assigned in any cell shown
# in this notebook -- they must be defined (e.g. in a config cell) before this
# cell runs, otherwise it raises NameError on a fresh kernel.
#
# Both cuts are applied through one mask and the result is copied explicitly so
# the column assignments below write to an independent frame rather than to a
# filtered slice (which triggers SettingWithCopyWarning).
passes_cuts = (df_orig.GP > min_num_games) & (df_orig.MPG > min_MPG)
df = df_orig.loc[passes_cuts].copy()

# Add some additional variables
## Attempt volumes: each *_PT column scaled by MPG / 36.
## Presumably the *_PT inputs are per-36-minute rates, which would make the
## results per-game values -- TODO confirm against the CSV schema.
df['2PA_PG'] = df['2PA_PT'] * df.MPG / 36.
df['3PA_PG'] = df['3PA_PT'] * df.MPG / 36.
df['FGA_PG'] = df.FGA_PT * df.MPG / 36.

## 2PA_PH and 3PA_PH expressed as fractions of FGA_PH.
df["2PR"] = df["2PA_PH"] / df["FGA_PH"]
df["3PR"] = df["3PA_PH"] / df["FGA_PH"]

## Shot-frequency features: attempts of each kind divided by FGA_PG.
shot_attempts_by_type = {
    'FG_FREQ_RIM': df.FGA_RA,  # restricted area
    'FG_FREQ_MR_AND_PT': df.FGA_MR + df.FGA_NONRA,  # combined paint and midrange
    'FG_FREQ_MR': df.FGA_MR,
    'FG_FREQ_CORNERS': df.FGA_LC + df.FGA_RC,
    'FG_FREQ_AB': df.FGA_AB,
    'FG_FREQ_01DRIB': df['FGA_0DRIB'] + df['FGA_1DRIB'],
    'FG_FREQ_GT1DRIB': df['FGA_2DRIB'] + df['FGA_36DRIB'] + df['FGA_GT7DRIB'],
    'FG_FREQ_CANDS': df['FGA_CANDS'],
}
for freq_col, attempts in shot_attempts_by_type.items():
    # Missing ratios (e.g. 0 / 0) become 0. The filled Series is assigned back
    # to the column: the previous `df[col].fillna(0, inplace=True)` form is a
    # chained in-place update, which pandas warns about (2.2+) and which no
    # longer modifies `df` when Copy-on-Write is enabled.
    df[freq_col] = (attempts / df.FGA_PG).fillna(0)


In [None]:
# Split players by how many rows they have in `df`, and build per-position
# subsets.
#   df_vets  - players whose name appears more than once
#   df_rooks - players whose name appears exactly once
#   df_med   - per-player median of the numeric columns of df_vets
# NOTE(review): "veteran" / "rookie" presumably assumes one row per player per
# season -- confirm against the CSV.
rows_per_player = df["name"].value_counts()  # computed once, reused below
multi_row_names = rows_per_player[rows_per_player > 1].index
single_row_names = rows_per_player[rows_per_player == 1].index

df_vets = df[df["name"].isin(multi_row_names)]
df_rooks = df[df["name"].isin(single_row_names)]

# numeric_only=True: the frame carries string columns (e.g. 'pos'); since
# pandas 2.0 the median of those raises TypeError instead of being skipped.
df_med = df_vets.groupby("name").median(numeric_only=True).reset_index()

# Dataframes by player position. Matching is by substring, so a player whose
# 'pos' contains several letters lands in several frames. na=False leaves rows
# with a missing 'pos' out of every frame instead of failing on a NaN mask.
# Centers
dfc = df[df['pos'].str.contains('C', na=False)]
# Forwards
dff = df[df['pos'].str.contains('F', na=False)]
# Guards
dfg = df[df['pos'].str.contains('G', na=False)]


In [None]:
# Linear regression of OFFRTG on six *_PH columns.
# Rows with year != 2020 are used for fitting; rows with year == 2020 are held
# out and scored with a root-mean-square error.
features_tot = ['2PM_PH', '3PM_PH', 'FTM_PH', 'AST_PH', 'ORB_PH', 'DRB_PH']
is_2020_tot = df.year == 2020

x_train = df.loc[~is_2020_tot, features_tot]
y_train = df.loc[~is_2020_tot, ['OFFRTG']]  # kept as a one-column DataFrame

reg = LinearRegression().fit(x_train, y_train)

x_test = df.loc[is_2020_tot, features_tot]
y_test_tot = df.loc[is_2020_tot, 'OFFRTG']
y_pred_tot = reg.predict(x_test)

rmse_tot = np.sqrt(metrics.mean_squared_error(y_test_tot, y_pred_tot))
print('Made', len(y_pred_tot), 'predictions with a RMS error of', rmse_tot)


In [None]:
# Linear regression of OFFRTG on the EFGP / 3PR / FTM_PH / AST_PH / TOR / REBR
# columns. Rows with year != 2020 are used for fitting; rows with year == 2020
# are held out and scored with a root-mean-square error.
# Note: this rebinds x_train, y_train, x_test and reg from any earlier cell.
features_eff = ['EFGP', '3PR', 'FTM_PH', 'AST_PH', 'TOR', 'REBR']
is_2020_eff = df.year == 2020

x_train = df.loc[~is_2020_eff, features_eff]
y_train = df.loc[~is_2020_eff, ['OFFRTG']]  # kept as a one-column DataFrame

reg = LinearRegression().fit(x_train, y_train)

x_test = df.loc[is_2020_eff, features_eff]
y_test_eff = df.loc[is_2020_eff, 'OFFRTG']
y_pred_eff = reg.predict(x_test)

rmse_eff = np.sqrt(metrics.mean_squared_error(y_test_eff, y_pred_eff))
print('Made', len(y_pred_eff), 'predictions with a RMS error of', rmse_eff)


In [None]:
# Predicted vs. actual offensive rating for the two linear models
# (y_*_tot and y_*_eff), with the y = x line as the perfect-prediction reference.
fig, ax = plt.subplots(figsize=(10, 8))

# Label each series so the two models can be told apart in the legend.
ax.scatter(y_test_tot, y_pred_tot, label="Linear model (tot)")
ax.scatter(y_test_eff, y_pred_eff, label="Linear model (eff)")

# Capture the data-driven x-range before drawing the diagonal, so the reference
# line (which runs out to 130) cannot widen the x-axis.
xmin, xmax = ax.get_xlim()
ymin, ymax = 90, 125  # fixed y-window for the predictions
ax.plot([90, 130], [90, 130], 'r--', label="Perfect prediction")

ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.set_xlabel("Offensive Rating")
ax.set_ylabel("Predicted Offensive Rating")
ax.set_title("Predicted vs. actual offensive rating")
ax.legend()
plt.show()


In [None]:
# Multi-layer-perceptron comparison model. It is fitted on whatever x_train /
# y_train the most recently run regression cell left behind and scored on that
# cell's x_test (the year == 2020 rows of `df`).

# Rebuild the hold-out frame and target here: `df_test` and `y_test` are not
# assigned by any earlier cell, so the cell previously raised NameError on a
# fresh kernel. The row filter matches the one used to build x_test, and the
# target column matches y_train (OFFRTG).
df_test = df[df.year == 2020]
y_test = df_test[['OFFRTG']]

# Fixed seed so the fitted network, and therefore the reported error, is
# repeatable between runs.
mlp_reg = MLPRegressor(random_state=0)
mlp_reg.fit(x_train, y_train.values.ravel())  # flatten the one-column target to 1-D
y_mlp_pred = mlp_reg.predict(x_test)

# The target is OFFRTG, so the message names offensive rating (it previously
# said "+/-").
for player, pred, actual in zip(df_test.name.values, y_mlp_pred, y_test.values[:, 0]):
    print(str(player) + ' has a predicted offensive rating of ' + str(pred) + ' compared to real offensive rating of ' + str(actual))

print('Made', len(y_mlp_pred), 'predictions with a RMS error of', np.sqrt(metrics.mean_squared_error(y_test, y_mlp_pred)))
