In [202]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [203]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,0,5,3.0,3.0,3.0,3.0,3.0
1,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,1,0,0,0,4,4.5,4.5,4.5,4.5,13.688889
2,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,0,1,-6,11.7,8.1,8.1,8.1,13.688889
3,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.688889
4,AJ Green,MIL,MIL @ BOS,2024-10-28,L,3,3,1,2,50.0,...,0,0,0,0,4,4.2,7.133333,6.475,6.475,13.688889


In [204]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1611,Cody Martin,22.1,22.1,22.1,22.1,22.472414
1612,Cody Martin,10.5,16.3,16.3,16.3,22.472414
1613,Cody Martin,20.6,17.733333,17.733333,17.733333,22.472414
1614,Cody Martin,35.8,22.3,22.25,22.25,22.472414
1615,Cody Martin,17.1,24.5,21.22,21.22,22.472414


In [205]:
datasetCM.tail()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1635,Cody Martin,9.4,20.966667,18.18,21.242857,22.472414
1636,Cody Martin,27.0,20.7,20.2,20.685714,22.472414
1637,Cody Martin,21.9,19.433333,22.36,19.971429,22.472414
1638,Cody Martin,26.5,25.133333,22.1,21.342857,22.472414
1639,Cody Martin,28.0,25.466667,22.56,23.757143,22.472414


For reference, features are the raw input data provided to the model to generate predictions, while the label is the target outcome that the model aims to predict. These above features can and probably should be tweaked and adjusted to improve the model's performance.

In [206]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']
labelName = ['FP']
dfFeatures = dataset[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,3.0,3.0,3.0,3.0
1,4.5,4.5,4.5,13.688889
2,8.1,8.1,8.1,13.688889
3,7.233333,7.233333,7.233333,13.688889
4,7.133333,6.475,6.475,13.688889


In [207]:
dfLabels = dataset[labelName]
dfLabels.head()

Unnamed: 0,FP
0,3.0
1,4.5
2,11.7
3,5.5
4,4.2


In [208]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
labels

array([[ 3. ],
       [ 4.5],
       [11.7],
       ...,
       [31.4],
       [59.2],
       [46.6]])

** Note numpy array is only numbers while pandas dataframe can be a mix of numbers and strings.

In [209]:
# Train is your training data while test is your testing data
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=9)
# In general a test_size should be small, always less than 50%



In [210]:
train

array([[19.5       , 19.5       , 19.5       , 20.66111111],
       [16.2       , 20.82      , 16.97142857, 16.31481481],
       [ 9.66666667,  9.66666667,  9.66666667,  8.4125    ],
       ...,
       [ 3.13333333,  7.6       ,  5.7       , 11.4       ],
       [ 1.63333333,  2.16      ,  2.16      ,  5.85882353],
       [14.33333333, 13.5       , 11.65714286, 13.052     ]])

In [211]:
test

array([[51.25      , 51.25      , 51.25      , 47.29642857],
       [-0.33333333, -0.25      , -0.25      ,  0.65555556],
       [ 7.9       ,  8.4       ,  9.7       , 10.23333333],
       ...,
       [12.8       ,  9.32      ,  9.32      ,  8.9       ],
       [17.83333333, 21.76      , 18.32857143, 15.4       ],
       [16.43333333, 15.58      , 19.68571429, 19.025     ]])

In [212]:
# Single decision tree
tree = DecisionTreeRegressor(random_state=9, max_depth=20)
tree.fit(train,trainLabels)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}')

Decision tree has 10093 nodes with maximum depth 20


In [213]:
train_predictions = tree.predict(train)
predictions = tree.predict(test)
df1 = pd.DataFrame(test, columns = featureNames)

In [214]:
df1.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
count,1878.0,1878.0,1878.0,1878.0
mean,21.354162,21.37205,21.269892,21.389384
std,13.162736,12.795192,12.662075,12.0152
min,-1.0,-1.0,-1.0,0.0
25%,11.275,11.805,11.746429,12.515385
50%,19.9,19.96,19.971429,20.017857
75%,29.683333,29.6,29.385714,28.272375
max,76.433333,70.66,70.742857,63.8125


In [215]:
df1['actual'] = testLabels
df1['predicted'] = predictions
df1['error'] = abs(df1['actual'] - df1['predicted'])

In [216]:
# Error shows whether the algorithm is good or not
df1.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
0,51.25,51.25,51.25,47.296429,62.4,58.6,3.8
1,-0.333333,-0.25,-0.25,0.655556,0.0,0.0,0.0
2,7.9,8.4,9.7,10.233333,6.9,5.907692,0.992308
3,5.0,5.0,5.0,14.071429,5.6,5.0,0.6
4,21.9,24.84,25.171429,17.882143,13.3,27.0,13.7


In [217]:
# Scaling the data
x = features
x = StandardScaler().fit_transform(x)
x

array([[-1.39597554, -1.4321091 , -1.44456323, -1.54469437],
       [-1.28196756, -1.31463136, -1.32565462, -0.64890727],
       [-1.00834842, -1.03268477, -1.04027394, -0.64890727],
       ...,
       [ 0.81071219,  1.20722424,  1.22691703,  1.73209856],
       [ 1.71264196,  1.55965747,  1.58364288,  1.73209856],
       [ 1.85198504,  1.4954363 ,  1.65498805,  1.73209856]])

In [218]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=9)

In [219]:
reg = DecisionTreeRegressor(random_state=9)
reg.fit(train,trainLabels)

In [220]:
train_predictions = reg.predict(train)
predictions = reg.predict(test)
df2 = pd.DataFrame(test, columns = featureNames)
df2['actual'] = testLabels
df2['predicted'] = predictions
df2['error'] = abs(df2['actual'] - df2['predicted'])
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0
mean,-0.000962,0.006762,0.003735,-0.003564,21.364271,21.280671,7.824707
std,1.000438,1.0021,1.003753,1.006939,14.701911,15.199706,6.876309
min,-1.699997,-1.745383,-1.761653,-1.796111,-2.0,-2.0,0.0
25%,-0.767032,-0.742515,-0.751213,-0.747253,9.4,8.925,2.5
50%,-0.111486,-0.103827,-0.099197,-0.118505,20.05,19.3,6.0
75%,0.6321,0.651163,0.647096,0.573268,30.8,31.2,11.3
max,4.185348,3.86692,3.925576,3.551724,86.1,98.3,39.3


In [221]:
# Earlier data for reference
# Note that describe() is different from head()
df1.describe() 


Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0
mean,21.354162,21.37205,21.269892,21.389384,21.364271,21.409557,7.501203
std,13.162736,12.795192,12.662075,12.0152,14.701911,15.102374,6.656033
min,-1.0,-1.0,-1.0,0.0,-2.0,-2.0,0.0
25%,11.275,11.805,11.746429,12.515385,9.4,9.2,2.3
50%,19.9,19.96,19.971429,20.017857,20.05,18.666667,5.775
75%,29.683333,29.6,29.385714,28.272375,30.8,31.092857,10.824027
max,76.433333,70.66,70.742857,63.8125,86.1,89.7,47.1


In [222]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()


Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,0,5,3.0,3.0,3.0,3.0,3.0
1,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,1,0,0,0,4,4.5,4.5,4.5,4.5,13.688889
2,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,0,1,-6,11.7,8.1,8.1,8.1,13.688889
3,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.688889
4,AJ Green,MIL,MIL @ BOS,2024-10-28,L,3,3,1,2,50.0,...,0,0,0,0,4,4.2,7.133333,6.475,6.475,13.688889


In [223]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1611,Cody Martin,22.1,22.1,22.1,22.1,22.472414
1612,Cody Martin,10.5,16.3,16.3,16.3,22.472414
1613,Cody Martin,20.6,17.733333,17.733333,17.733333,22.472414
1614,Cody Martin,35.8,22.3,22.25,22.25,22.472414
1615,Cody Martin,17.1,24.5,21.22,21.22,22.472414


In [224]:
datasetCM.tail()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1635,Cody Martin,9.4,20.966667,18.18,21.242857,22.472414
1636,Cody Martin,27.0,20.7,20.2,20.685714,22.472414
1637,Cody Martin,21.9,19.433333,22.36,19.971429,22.472414
1638,Cody Martin,26.5,25.133333,22.1,21.342857,22.472414
1639,Cody Martin,28.0,25.466667,22.56,23.757143,22.472414


In [225]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']
labelName = ['FP']
dfFeatures = dataset[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,3.0,3.0,3.0,3.0
1,4.5,4.5,4.5,13.688889
2,8.1,8.1,8.1,13.688889
3,7.233333,7.233333,7.233333,13.688889
4,7.133333,6.475,6.475,13.688889


In [226]:
dfLabels = dataset[labelName]
dfLabels.head()

Unnamed: 0,FP
0,3.0
1,4.5
2,11.7
3,5.5
4,4.2


In [227]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=9)

In [228]:
rf = RandomForestRegressor(random_state=9)

In [229]:
rf.fit(train, trainLabels)

  return fit_method(estimator, *args, **kwargs)


In [230]:
rfPredNoStandard = rf.predict(test)

In [231]:
df3 = pd.DataFrame(test, columns = featureNames)
df3['actual'] = testLabels
df3['predicted'] = rfPredNoStandard
df3['error'] = abs(df3['actual'] - df3['predicted'])

In [232]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=9)
rf = RandomForestRegressor(random_state=9)
rf.fit(train, trainLabels)
rfPredNoStandard = rf.predict(test)

  return fit_method(estimator, *args, **kwargs)


In [233]:
df4 = pd.DataFrame(test, columns = featureNames)
df4['actual'] = testLabels
df4['predicted'] = rfPredNoStandard
df4['error'] = abs(df4['actual'] - df4['predicted'])

In [234]:
df4.describe()
# df3.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0
mean,-0.000962,0.006762,0.003735,-0.003564,21.364271,21.413126,5.574799
std,1.000438,1.0021,1.003753,1.006939,14.701911,13.420939,4.809124
min,-1.699997,-1.745383,-1.761653,-1.796111,-2.0,-1.016,0.0
25%,-0.767032,-0.742515,-0.751213,-0.747253,9.4,11.4375,1.84475
50%,-0.111486,-0.103827,-0.099197,-0.118505,20.05,19.669,4.3365
75%,0.6321,0.651163,0.647096,0.573268,30.8,29.633,8.06075
max,4.185348,3.86692,3.925576,3.551724,86.1,77.904,35.304


In [235]:
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0,1878.0
mean,-0.000962,0.006762,0.003735,-0.003564,21.364271,21.280671,7.824707
std,1.000438,1.0021,1.003753,1.006939,14.701911,15.199706,6.876309
min,-1.699997,-1.745383,-1.761653,-1.796111,-2.0,-2.0,0.0
25%,-0.767032,-0.742515,-0.751213,-0.747253,9.4,8.925,2.5
50%,-0.111486,-0.103827,-0.099197,-0.118505,20.05,19.3,6.0
75%,0.6321,0.651163,0.647096,0.573268,30.8,31.2,11.3
max,4.185348,3.86692,3.925576,3.551724,86.1,98.3,39.3


In [236]:
dfCheck = pd.DataFrame()
dfCheck['df1'] = df1['error']
dfCheck['df2'] = df2['error']
dfCheck['df3'] = df3['error']
dfCheck['df4'] = df4['error']
dfCheck.describe()

Unnamed: 0,df1,df2,df3,df4
count,1878.0,1878.0,1878.0,1878.0
mean,7.501203,7.824707,5.574605,5.574799
std,6.656033,6.876309,4.80923,4.809124
min,0.0,0.0,0.0,0.0
25%,2.3,2.5,1.854,1.84475
50%,5.775,6.0,4.344,4.3365
75%,10.824027,11.3,8.07725,8.06075
max,47.1,39.3,35.304,35.304


Note: Scaling and preprocessing data matters more on larger datasets

In [238]:
dfHighError = dfCheck[dfCheck['df3']>8.853]
dfHighError.describe()

Unnamed: 0,df1,df2,df3,df4
count,393.0,393.0,393.0,393.0
mean,13.873663,14.224682,13.099618,13.09856
std,8.014334,8.401766,3.885417,3.87639
min,0.0,0.0,8.855,8.843
25%,7.9,7.2,10.168,10.174
50%,12.9,14.3,12.07,12.094
75%,18.7,20.0,14.965,14.835
max,47.1,39.3,35.304,35.304
