In [None]:
#Dependencies and Setup
import os

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import pyplot as plt
from patsy import dmatrices
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
# Raise pandas' display limits so wide/long frames are shown more fully
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [None]:
# Load Main NBA CSV
# The location can be overridden with the NBA_DATA_CSV environment variable;
# when it is not set, the original hard-coded path is used unchanged.
NBAData = os.environ.get(
    "NBA_DATA_CSV",
    "C:/Users/john1/Documents/GitHub/WakeTechCapstone/NBAFinalExport.csv",
)

In [None]:
# Parse the CSV at NBAData into a DataFrame (whole-file type inference, no chunked parsing)
nba_data_df = pd.read_csv(
    filepath_or_buffer=NBAData,
    low_memory=False,
)

In [None]:
# Start the feature table from every loaded column.
# A copy is taken so that later in-place edits to `features` do not also
# modify nba_data_df; re-running from this cell then starts from the
# unmodified loaded data.
features = nba_data_df.copy()

In [None]:
# Display the feature frame as loaded (before any columns are dropped)
features

In [None]:
# Display the feature frame again (same frame as the previous cell)
features

In [None]:
# Drop columns that are not used as model inputs.
# The result is reassigned instead of using inplace=True so that any other
# reference to the same frame (e.g. nba_data_df) is left unmodified.
# NOTE(review): the GM*/GMTM* columns look like same-game box-score stats — confirm.
columns_to_drop = [
    'GAME-ID', 'PlayerCleaned', 'DATE',
    'GMIN', 'GMFG', 'GMFGA', 'GM3P', 'GM3PA', 'GMFT', 'GMFTA',
    'GMOR', 'GMDR', 'GMREB', 'GMAST', 'GMPF', 'GMST', 'GMTO', 'GMBL',
    'GMPTS', 'GMUSAGE', 'GMREST',
    'GMTM1Q', 'GMTM2Q', 'GMTM3Q', 'GMTM4Q',
    'GMTMOT1', 'GMTMOT2', 'GMTMOT3', 'GMTMOT4', 'GMTMOT5',
    'GMTMF', 'GMTMMIN', 'GMTMFG', 'GMTMFGA', 'GMTM3P', 'GMTM3PA',
    'GMTMFT', 'GMTMFTA', 'GMTMOR', 'GMTMDR', 'GMTMREB', 'GMTMAST',
    'GMTMPF', 'GMTMST', 'GMTMTO', 'GMTMTOTO', 'GMTMBL', 'GMTMPTS',
    'GMTMPOSS', 'GMTMPACE', 'GMTMOEFF', 'GMTMDEFF', 'GMTMREST',
    'MAIN REF', 'CREW', 'OPENING ODDS', 'CLOSING_ODDS',
]
features = features.drop(columns=columns_to_drop)

In [None]:
# Show the dtype of every remaining column
features.dtypes

In [None]:
# Summary statistics for the remaining columns (before null rows are removed)
features.describe()

In [None]:
# Count missing values per column
features.isna().sum()

Because the data set is large and the number of null values is relatively low, I am going to drop every row that contains a null value before setting up the model.

In [None]:
# Drop every row that contains at least one null value.
# Reassigned rather than done in place so any other reference to the same
# frame keeps its rows.
features = features.dropna()

In [None]:
# Summary statistics again, now that rows with null values have been dropped
features.describe()

In [None]:
# Re-count missing values per column after the drop
features.isna().sum()

In [None]:
# One-hot encoding for categorical values.
# dtype is pinned to uint8 so the dummy columns are numeric on every pandas
# version; pandas 2.x defaults to bool, which would turn the later
# np.array(features) conversion into an object array.
features = pd.get_dummies(features, dtype=np.uint8)

In [None]:
# Display the frame after one-hot encoding
features

In [None]:
# The prediction target is the DKPTS column, pulled out as a standalone array
labels = features['DKPTS'].to_numpy(copy=True)

In [None]:
# Take the target column out of the feature frame so it is not used as an input
features = features.drop(columns='DKPTS')

In [None]:
# Display the feature frame with DKPTS removed
features

In [None]:
# Keep the column names, in order, before the frame is converted to a bare array
feature_list = features.columns.tolist()

In [None]:
# Convert to numpy array; from here on `features` has no column labels,
# so feature_list is the only record of which column is which.
features = np.array(features)

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Report the dimensions of each piece of the train/test split
for split_name, split_array in (
    ('Training Features', train_features),
    ('Training Labels', train_labels),
    ('Testing Features', test_features),
    ('Testing Labels', test_labels),
):
    print(f'{split_name} Shape:', split_array.shape)

In [None]:
# Instantiate model with 1000 decision trees.
# n_jobs=-1 builds the trees on all available CPU cores; with random_state
# fixed, the fitted forest does not depend on the number of jobs.
rf = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1)

In [None]:
# Fit the forest on the training split (trailing ';' hides the estimator repr)
rf.fit(X=train_features, y=train_labels);

In [None]:
# Predict the target for every row of the held-out test split
predictions = rf.predict(X=test_features)

In [None]:
# Element-wise absolute difference between predicted and actual values
errors = np.abs(predictions - test_labels)

In [None]:
# Report the mean absolute error (mae), rounded to two decimals
mae = np.mean(errors)
print('Mean Absolute Error:', round(mae, 2))

In [None]:
# Display the per-row absolute errors
errors

In [None]:
# Unpack the fitted forest's per-feature importance scores into a plain list
importances = [*rf.feature_importances_]

In [None]:
# Pair each feature name with its importance rounded to 5 decimals
feature_importances = list(
    zip(feature_list, (round(score, 5) for score in importances))
)

In [None]:
# Order the (feature, importance) pairs from most to least important
feature_importances.sort(key=lambda pair: pair[1], reverse=True)

In [None]:
# Print out the feature and importances, one line per feature.
# A plain loop is used instead of a list comprehension, which would build a
# throwaway list of None values just to call print().
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

In [None]:
# Display the predictions returned by rf.predict
predictions

In [None]:
# Dimensions of the test feature array
test_features.shape

In [None]:
# Wrap the test feature array in a DataFrame (default integer row/column labels)
test_features = pd.DataFrame(data=test_features)

In [None]:
# Display the test features as a DataFrame
test_features

In [None]:
# Wrap the predictions in a DataFrame so they can be joined to the test features
predictions = pd.DataFrame(data=predictions)

In [None]:
# Display the predictions DataFrame
predictions

In [None]:
# Wrap the actual test-set target values in a DataFrame
DKPTS = pd.DataFrame(data=test_labels)

In [None]:
# Display the actual target values for the test rows
DKPTS

In [None]:
# Pieces of the results table, in output order: inputs, predicted value, actual value
frames = [
    test_features,
    predictions,
    DKPTS,
]

In [None]:
# Join the three frames side by side, keeping only row labels present in all of them
finalTestResults = pd.concat(frames, axis=1, join="inner")

In [None]:
# Display the combined results (columns still carry default integer labels)
finalTestResults

In [None]:
# Display the saved feature names
feature_list

In [None]:
# Label the combined frame: one column per model feature, in the order saved
# in feature_list, followed by the prediction ('Pred') and the actual target
# value ('Act').
# The names are derived from feature_list so they always match the columns the
# model was trained on, even when the data or the one-hot categories change.
# NOTE(review): this replaces a hand-typed list of names that appears to have
# been pasted from the feature_list output — confirm the two were identical.
finalTestResults.columns = feature_list + ['Pred', 'Act']

In [None]:
# Display the combined results with the column names applied
finalTestResults

In [None]:
# Write the labelled results to a CSV file in the current working directory
finalTestResults.to_csv(path_or_buf='NBAAnalysisResults.csv')