In [74]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [75]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,0,5,3.0,3.0,3.0,3.0,5.7
1,A.J. Lawson,TOR,TOR @ MEM,2024-12-26,L,5,6,2,3,66.7,...,0,0,0,0,-4,8.4,5.7,5.7,5.7,5.7
2,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,1,0,0,0,4,4.5,4.5,4.5,4.5,13.85
3,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,0,1,-6,11.7,8.1,8.1,8.1,13.85
4,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.85


In [76]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1663,Cody Martin,22.1,22.1,22.1,22.1,21.933333
1664,Cody Martin,10.5,16.3,16.3,16.3,21.933333
1665,Cody Martin,20.6,17.733333,17.733333,17.733333,21.933333
1666,Cody Martin,35.8,22.3,22.25,22.25,21.933333
1667,Cody Martin,17.1,24.5,21.22,21.22,21.933333


In [77]:
datasetCM.tail()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1688,Cody Martin,27.0,20.7,20.2,20.685714,21.933333
1689,Cody Martin,21.9,19.433333,22.36,19.971429,21.933333
1690,Cody Martin,26.5,25.133333,22.1,21.342857,21.933333
1691,Cody Martin,28.0,25.466667,22.56,23.757143,21.933333
1692,Cody Martin,6.3,20.266667,21.94,20.685714,21.933333


For reference, features are the raw input data provided to the model to generate predictions, while the label is the target outcome that the model aims to predict. These above features can and probably should be tweaked and adjusted to improve the model's performance.

In [78]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']
labelName = ['FP']
dfFeatures = dataset[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,3.0,3.0,3.0,5.7
1,5.7,5.7,5.7,5.7
2,4.5,4.5,4.5,13.85
3,8.1,8.1,8.1,13.85
4,7.233333,7.233333,7.233333,13.85


In [79]:
dfLabels = dataset[labelName]
dfLabels.head()

Unnamed: 0,FP
0,3.0
1,8.4
2,4.5
3,11.7
4,5.5


In [80]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
labels

array([[ 3. ],
       [ 8.4],
       [ 4.5],
       ...,
       [31.4],
       [59.2],
       [46.6]])

** Note numpy array is only numbers while pandas dataframe can be a mix of numbers and strings.

In [81]:
# Train is your training data while test is your testing data
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=30)
# In general a test_size should be small, always less than 50%



In [82]:
train

array([[32.73333333, 34.82      , 30.51428571, 27.76206897],
       [33.93333333, 40.82      , 35.74285714, 34.62692308],
       [ 7.33333333,  8.06      , 10.72857143, 13.675     ],
       ...,
       [28.26666667, 26.74      , 25.9       , 19.646875  ],
       [31.66666667, 36.76      , 36.37142857, 40.02      ],
       [13.76666667, 16.82      , 20.07142857, 18.38888889]])

In [83]:
test

array([[20.46666667, 20.34      , 19.44285714, 22.66785714],
       [20.4       , 20.4       , 20.4       , 11.4       ],
       [38.4       , 39.94      , 36.42857143, 39.54333333],
       ...,
       [12.06666667,  9.58      , 10.61428571, 16.79666667],
       [12.63333333, 15.92      , 16.41428571, 15.73157895],
       [29.65      , 29.65      , 29.65      , 22.7       ]])

In [84]:
# Single decision tree
tree = DecisionTreeRegressor(random_state=30, max_depth=20)
tree.fit(train,trainLabels)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}')

Decision tree has 10309 nodes with maximum depth 20


In [85]:
train_predictions = tree.predict(train)
predictions = tree.predict(test)
df1 = pd.DataFrame(test, columns = featureNames)

In [86]:
df1.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
count,1933.0,1933.0,1933.0,1933.0
mean,21.328212,21.243431,21.200515,21.456634
std,12.991187,12.537719,12.355876,11.742534
min,-1.0,-1.0,-1.0,0.0
25%,11.4,11.75,11.825,12.475
50%,19.733333,19.76,19.771429,19.941667
75%,29.733333,29.46,29.457143,28.288
max,72.2,72.2,72.2,63.26


In [87]:
df1['actual'] = testLabels
df1['predicted'] = predictions
df1['error'] = abs(df1['actual'] - df1['predicted'])

In [88]:
# Error shows whether the algorithm is good or not
df1.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0
mean,21.328212,21.243431,21.200515,21.456634,21.46896,21.41968,7.462489
std,12.991187,12.537719,12.355876,11.742534,15.089416,14.757167,6.619419
min,-1.0,-1.0,-1.0,0.0,-2.0,-2.0,0.0
25%,11.4,11.75,11.825,12.475,9.4,10.1,2.3
50%,19.733333,19.76,19.771429,19.941667,19.3,19.8,5.636364
75%,29.733333,29.46,29.457143,28.288,31.3,30.4,10.635211
max,72.2,72.2,72.2,63.26,98.3,82.1,40.1


In [89]:
# Scaling the data
x = features
x = StandardScaler().fit_transform(x)
x

array([[-1.40052859, -1.43718408, -1.4500083 , -1.32580168],
       [-1.19510672, -1.22547284, -1.23566715, -1.32580168],
       [-1.28640533, -1.31956672, -1.33092988, -0.64120523],
       ...,
       [ 0.80839045,  1.20528592,  1.22528675,  1.73178249],
       [ 1.71123222,  1.558138  ,  1.58252199,  1.73178249],
       [ 1.8507162 ,  1.49384051,  1.65396904,  1.73178249]])

In [90]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=30)

In [91]:
reg = DecisionTreeRegressor(random_state=30)
reg.fit(train,trainLabels)

In [92]:
train_predictions = reg.predict(train)
predictions = reg.predict(test)
df2 = pd.DataFrame(test, columns = featureNames)
df2['actual'] = testLabels
df2['predicted'] = predictions
df2['error'] = abs(df2['actual'] - df2['predicted'])
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0
mean,-0.006078,-0.006688,-0.005149,-0.002251,21.46896,21.329333,7.938282
std,0.988398,0.983102,0.980879,0.986368,15.089416,14.997632,6.923474
min,-1.704857,-1.75083,-1.767551,-1.804599,-2.0,-2.0,0.0
25%,-0.761438,-0.751083,-0.74943,-0.756705,9.4,9.7,2.5
50%,-0.12742,-0.123006,-0.118598,-0.129508,19.3,19.6,6.1
75%,0.633401,0.637586,0.650308,0.57158,31.3,30.5,11.6
max,3.864358,3.988897,4.043476,3.509213,98.3,72.0,44.8


In [93]:
# Earlier data for reference
# Note that describe() is different from head()
df1.describe() 


Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0
mean,21.328212,21.243431,21.200515,21.456634,21.46896,21.41968,7.462489
std,12.991187,12.537719,12.355876,11.742534,15.089416,14.757167,6.619419
min,-1.0,-1.0,-1.0,0.0,-2.0,-2.0,0.0
25%,11.4,11.75,11.825,12.475,9.4,10.1,2.3
50%,19.733333,19.76,19.771429,19.941667,19.3,19.8,5.636364
75%,29.733333,29.46,29.457143,28.288,31.3,30.4,10.635211
max,72.2,72.2,72.2,63.26,98.3,82.1,40.1


In [94]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()


Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,0,5,3.0,3.0,3.0,3.0,5.7
1,A.J. Lawson,TOR,TOR @ MEM,2024-12-26,L,5,6,2,3,66.7,...,0,0,0,0,-4,8.4,5.7,5.7,5.7,5.7
2,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,1,0,0,0,4,4.5,4.5,4.5,4.5,13.85
3,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,0,1,-6,11.7,8.1,8.1,8.1,13.85
4,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.85


In [95]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1663,Cody Martin,22.1,22.1,22.1,22.1,21.933333
1664,Cody Martin,10.5,16.3,16.3,16.3,21.933333
1665,Cody Martin,20.6,17.733333,17.733333,17.733333,21.933333
1666,Cody Martin,35.8,22.3,22.25,22.25,21.933333
1667,Cody Martin,17.1,24.5,21.22,21.22,21.933333


In [96]:
datasetCM.tail()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1688,Cody Martin,27.0,20.7,20.2,20.685714,21.933333
1689,Cody Martin,21.9,19.433333,22.36,19.971429,21.933333
1690,Cody Martin,26.5,25.133333,22.1,21.342857,21.933333
1691,Cody Martin,28.0,25.466667,22.56,23.757143,21.933333
1692,Cody Martin,6.3,20.266667,21.94,20.685714,21.933333


In [97]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']
labelName = ['FP']
dfFeatures = dataset[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,3.0,3.0,3.0,5.7
1,5.7,5.7,5.7,5.7
2,4.5,4.5,4.5,13.85
3,8.1,8.1,8.1,13.85
4,7.233333,7.233333,7.233333,13.85


In [98]:
dfLabels = dataset[labelName]
dfLabels.head()

Unnamed: 0,FP
0,3.0
1,8.4
2,4.5
3,11.7
4,5.5


In [99]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=30)

In [100]:
rf = RandomForestRegressor(random_state=30)

In [101]:
rf.fit(train, trainLabels)

  return fit_method(estimator, *args, **kwargs)


In [102]:
rfPredNoStandard = rf.predict(test)

In [103]:
df3 = pd.DataFrame(test, columns = featureNames)
df3['actual'] = testLabels
df3['predicted'] = rfPredNoStandard
df3['error'] = abs(df3['actual'] - df3['predicted'])

In [104]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=30)
rf = RandomForestRegressor(random_state=30)
rf.fit(train, trainLabels)
rfPredNoStandard = rf.predict(test)

  return fit_method(estimator, *args, **kwargs)


In [105]:
df4 = pd.DataFrame(test, columns = featureNames)
df4['actual'] = testLabels
df4['predicted'] = rfPredNoStandard
df4['error'] = abs(df4['actual'] - df4['predicted'])

In [106]:
df4.describe()
# df3.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0
mean,-0.006078,-0.006688,-0.005149,-0.002251,21.46896,21.411699,5.791686
std,0.988398,0.983102,0.980879,0.986368,15.089416,13.021771,4.951151
min,-1.704857,-1.75083,-1.767551,-1.804599,-2.0,-0.99,0.0
25%,-0.761438,-0.751083,-0.74943,-0.756705,9.4,11.565,1.97
50%,-0.12742,-0.123006,-0.118598,-0.129508,19.3,19.73,4.444
75%,0.633401,0.637586,0.650308,0.57158,31.3,29.99,8.318
max,3.864358,3.988897,4.043476,3.509213,98.3,70.788,28.961


In [107]:
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0
mean,-0.006078,-0.006688,-0.005149,-0.002251,21.46896,21.329333,7.938282
std,0.988398,0.983102,0.980879,0.986368,15.089416,14.997632,6.923474
min,-1.704857,-1.75083,-1.767551,-1.804599,-2.0,-2.0,0.0
25%,-0.761438,-0.751083,-0.74943,-0.756705,9.4,9.7,2.5
50%,-0.12742,-0.123006,-0.118598,-0.129508,19.3,19.6,6.1
75%,0.633401,0.637586,0.650308,0.57158,31.3,30.5,11.6
max,3.864358,3.988897,4.043476,3.509213,98.3,72.0,44.8


In [108]:
dfCheck = pd.DataFrame()
dfCheck['df1'] = df1['error']
dfCheck['df2'] = df2['error']
dfCheck['df3'] = df3['error']
dfCheck['df4'] = df4['error']
dfCheck.describe()

Unnamed: 0,df1,df2,df3,df4
count,1933.0,1933.0,1933.0,1933.0
mean,7.462489,7.938282,5.79068,5.791686
std,6.619419,6.923474,4.950985,4.951151
min,0.0,0.0,0.0,0.0
25%,2.3,2.5,1.951,1.97
50%,5.636364,6.1,4.437,4.444
75%,10.635211,11.6,8.336,8.318
max,40.1,44.8,29.062,28.961


Note: Scaling and preprocessing data matters more on larger datasets

In [109]:
dfHighError = dfCheck[dfCheck['df3']>8.853]
dfHighError.describe()

Unnamed: 0,df1,df2,df3,df4
count,435.0,435.0,435.0,435.0
mean,13.630424,13.923678,13.32051,13.320575
std,7.672573,8.268889,3.81962,3.819336
min,0.0,0.0,8.865,8.769
25%,8.1,7.8,10.3875,10.358
50%,12.7,13.3,12.401,12.338
75%,18.359211,19.15,15.662,15.635
max,40.1,44.8,29.062,28.961
