In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [22]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,0,5,3.0,3.0,3.0,3.0,3.0
1,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,1,0,0,0,4,4.5,4.5,4.5,4.5,13.688889
2,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,0,1,-6,11.7,8.1,8.1,8.1,13.688889
3,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.688889
4,AJ Green,MIL,MIL @ BOS,2024-10-28,L,3,3,1,2,50.0,...,0,0,0,0,4,4.2,7.133333,6.475,6.475,13.688889


In [23]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1611,Cody Martin,22.1,22.1,22.1,22.1,22.472414
1612,Cody Martin,10.5,16.3,16.3,16.3,22.472414
1613,Cody Martin,20.6,17.733333,17.733333,17.733333,22.472414
1614,Cody Martin,35.8,22.3,22.25,22.25,22.472414
1615,Cody Martin,17.1,24.5,21.22,21.22,22.472414


In [24]:
datasetCM.tail()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1635,Cody Martin,9.4,20.966667,18.18,21.242857,22.472414
1636,Cody Martin,27.0,20.7,20.2,20.685714,22.472414
1637,Cody Martin,21.9,19.433333,22.36,19.971429,22.472414
1638,Cody Martin,26.5,25.133333,22.1,21.342857,22.472414
1639,Cody Martin,28.0,25.466667,22.56,23.757143,22.472414


For reference, features are the raw input data provided to the model to generate predictions, while the label is the target outcome that the model aims to predict. These above features can and probably should be tweaked and adjusted to improve the model's performance.

In [25]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']
labelName = ['FP']
dfFeatures = datasetCM[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1611,22.1,22.1,22.1,22.472414
1612,16.3,16.3,16.3,22.472414
1613,17.733333,17.733333,17.733333,22.472414
1614,22.3,22.25,22.25,22.472414
1615,24.5,21.22,21.22,22.472414


In [26]:
dfLabels = datasetCM[labelName]
dfLabels.head()

Unnamed: 0,FP
1611,22.1
1612,10.5
1613,20.6
1614,35.8
1615,17.1


In [27]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
labels

array([[22.1],
       [10.5],
       [20.6],
       [35.8],
       [17.1],
       [18.3],
       [16.1],
       [19.3],
       [30. ],
       [24.7],
       [12.6],
       [11.9],
       [29.2],
       [21.2],
       [29.1],
       [31.2],
       [16.2],
       [33.7],
       [30.9],
       [26.9],
       [16.9],
       [11.1],
       [27.8],
       [25.7],
       [ 9.4],
       [27. ],
       [21.9],
       [26.5],
       [28. ]])

** Note numpy array is only numbers while pandas dataframe can be a mix of numbers and strings.

In [28]:
# Train is your training data while test is your testing data
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=9)
# In general a test_size should be small, always less than 50%



In [29]:
train

array([[26.93333333, 28.22      , 27.35714286, 22.47241379],
       [30.5       , 27.78      , 27.02857143, 22.47241379],
       [17.16666667, 21.58      , 20.07142857, 22.47241379],
       [17.73333333, 17.73333333, 17.73333333, 22.47241379],
       [17.9       , 21.68      , 20.54285714, 22.47241379],
       [20.76666667, 19.92      , 21.27142857, 22.47241379],
       [25.5       , 25.38      , 21.62857143, 22.47241379],
       [22.43333333, 20.54      , 19.72857143, 22.47241379],
       [25.13333333, 22.1       , 21.34285714, 22.47241379],
       [16.4       , 19.7       , 18.98571429, 22.47241379],
       [26.5       , 20.8       , 22.67142857, 22.47241379],
       [22.1       , 22.1       , 22.1       , 22.47241379],
       [21.53333333, 21.68      , 24.71428571, 22.47241379],
       [27.03333333, 26.28      , 24.64285714, 22.47241379],
       [21.8       , 20.16      , 22.45714286, 22.47241379],
       [24.9       , 24.92      , 26.41428571, 22.47241379],
       [20.7       , 20.

In [30]:
test

array([[23.73333333, 20.46      , 20.73333333, 22.47241379],
       [24.66666667, 21.68      , 23.04285714, 22.47241379],
       [27.16666667, 24.52      , 22.84285714, 22.47241379],
       [24.5       , 21.22      , 21.22      , 22.47241379],
       [22.3       , 22.25      , 22.25      , 22.47241379],
       [17.9       , 21.32      , 19.67142857, 22.47241379]])

In [31]:
# Single decision tree
tree = DecisionTreeRegressor(random_state=9)
tree.fit(train,trainLabels)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}')

Decision tree has 45 nodes with maximum depth 10


In [32]:
train_predictions = tree.predict(train)
predictions = tree.predict(test)
df1 = pd.DataFrame(test, columns = featureNames)

In [33]:
df1.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
count,6.0,6.0,6.0,6.0
mean,23.377778,21.908333,21.626746,22.47241
std,3.116741,1.407642,1.316027,3.891803e-15
min,17.9,20.46,19.671429,22.47241
25%,22.658333,21.245,20.855,22.47241
50%,24.116667,21.5,21.735,22.47241
75%,24.625,22.1075,22.694643,22.47241
max,27.166667,24.52,23.042857,22.47241


In [34]:
df1['actual'] = testLabels
df1['predicted'] = predictions
df1['error'] = abs(df1['actual'] - df1['predicted'])

In [35]:
# Error shows whether the algorithm is good or not
df1.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
0,23.733333,20.46,20.733333,22.472414,18.3,26.5,8.2
1,24.666667,21.68,23.042857,22.472414,24.7,26.5,1.8
2,27.166667,24.52,22.842857,22.472414,31.2,33.7,2.5
3,24.5,21.22,21.22,22.472414,17.1,26.5,9.4
4,22.3,22.25,22.25,22.472414,35.8,22.1,13.7


In [36]:
# Scaling the data
x = features
x = StandardScaler().fit_transform(x)
x

array([[-0.03210591,  0.03401108,  0.04499076,  0.        ],
       [-1.58986015, -2.11931294, -2.25381364,  0.        ],
       [-1.2048979 , -1.58716965, -1.6857183 ,  0.        ],
       [ 0.02160975,  0.0897005 ,  0.10444259,  0.        ],
       [ 0.61248205, -0.29270015, -0.30379336,  0.        ],
       [ 0.406572  , -0.57485985, -0.49668154,  0.        ],
       [-1.35709228, -0.15904555, -0.75902457,  0.        ],
       [-1.16013484, -0.25557387, -0.91756281,  0.        ],
       [-0.11267941, -0.68623868,  0.18654275,  0.        ],
       [ 0.6572451 , -0.12191928,  0.41868802,  0.        ],
       [ 0.05742019, -0.54515883, -0.89491449,  0.        ],
       [-1.56300232, -0.85701955, -1.18934264,  0.        ],
       [-1.16013484, -0.12191928, -0.57217594,  0.        ],
       [-0.39021034, -0.77534174, -0.28340987,  0.        ],
       [ 1.14963868, -0.44863051,  0.27147395,  0.        ],
       [ 1.32869089,  0.93246697,  0.3394189 ,  0.        ],
       [ 0.88106036,  1.

In [37]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=9)

In [38]:
reg = DecisionTreeRegressor(random_state=9)
reg.fit(train,trainLabels)

In [39]:
train_predictions = reg.predict(train)
predictions = reg.predict(test)
df2 = pd.DataFrame(test, columns = featureNames)
df2['actual'] = testLabels
df2['predicted'] = predictions
df2['error'] = abs(df2['actual'] - df2['predicted'])
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.311077,-0.037148,-0.142581,0.0,24.4,25.233333,6.466667
std,0.837089,0.522605,0.521602,0.0,7.652451,5.826377,4.735258
min,-1.160135,-0.57486,-0.917563,0.0,17.1,16.1,1.8
25%,0.11785,-0.283419,-0.448459,0.0,18.55,23.2,2.675
50%,0.509527,-0.188747,-0.099675,0.0,22.0,26.5,5.7
75%,0.646054,0.036796,0.280675,0.0,29.575,26.5,9.1
max,1.328691,0.932467,0.418688,0.0,35.8,33.7,13.7


In [40]:
# Earlier data for reference
# Note that describe() is different from head()
df1.describe() 


Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,23.377778,21.908333,21.626746,22.47241,24.4,25.233333,6.466667
std,3.116741,1.407642,1.316027,3.891803e-15,7.652451,5.826377,4.735258
min,17.9,20.46,19.671429,22.47241,17.1,16.1,1.8
25%,22.658333,21.245,20.855,22.47241,18.55,23.2,2.675
50%,24.116667,21.5,21.735,22.47241,22.0,26.5,5.7
75%,24.625,22.1075,22.694643,22.47241,29.575,26.5,9.1
max,27.166667,24.52,23.042857,22.47241,35.8,33.7,13.7


In [41]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1611,Cody Martin,22.1,22.1,22.1,22.1,22.472414
1612,Cody Martin,10.5,16.3,16.3,16.3,22.472414
1613,Cody Martin,20.6,17.733333,17.733333,17.733333,22.472414
1614,Cody Martin,35.8,22.3,22.25,22.25,22.472414
1615,Cody Martin,17.1,24.5,21.22,21.22,22.472414


In [42]:
datasetCM.tail()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1635,Cody Martin,9.4,20.966667,18.18,21.242857,22.472414
1636,Cody Martin,27.0,20.7,20.2,20.685714,22.472414
1637,Cody Martin,21.9,19.433333,22.36,19.971429,22.472414
1638,Cody Martin,26.5,25.133333,22.1,21.342857,22.472414
1639,Cody Martin,28.0,25.466667,22.56,23.757143,22.472414


In [43]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']
labelName = ['FP']
dfFeatures = datasetCM[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
1611,22.1,22.1,22.1,22.472414
1612,16.3,16.3,16.3,22.472414
1613,17.733333,17.733333,17.733333,22.472414
1614,22.3,22.25,22.25,22.472414
1615,24.5,21.22,21.22,22.472414


In [44]:
dfLabels = datasetCM[labelName]
dfLabels.head()

Unnamed: 0,FP
1611,22.1
1612,10.5
1613,20.6
1614,35.8
1615,17.1


In [45]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=9)

In [46]:
rf = RandomForestRegressor(random_state=9)

In [47]:
rf.fit(train, trainLabels)

  return fit_method(estimator, *args, **kwargs)


In [48]:
rfPredNoStandard = rf.predict(test)

In [49]:
df3 = pd.DataFrame(test, columns = featureNames)
df3['actual'] = testLabels
df3['predicted'] = rfPredNoStandard
df3['error'] = abs(df3['actual'] - df3['predicted'])

In [50]:
df3.describe()
# df3.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,23.377778,21.908333,21.626746,22.47241,24.4,23.871167,4.553167
std,3.116741,1.407642,1.316027,3.891803e-15,7.652451,3.003767,4.220163
min,17.9,20.46,19.671429,22.47241,17.1,20.004,0.704
25%,22.658333,21.245,20.855,22.47241,18.55,22.5395,1.94175
50%,24.116667,21.5,21.735,22.47241,22.0,22.9875,3.5605
75%,24.625,22.1075,22.694643,22.47241,29.575,25.60525,5.1395
max,27.166667,24.52,23.042857,22.47241,35.8,28.38,12.426


In [51]:
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,actual,predicted,error
count,6.0,6.0,6.0,6.0,6.0,6.0,6.0
mean,0.311077,-0.037148,-0.142581,0.0,24.4,25.233333,6.466667
std,0.837089,0.522605,0.521602,0.0,7.652451,5.826377,4.735258
min,-1.160135,-0.57486,-0.917563,0.0,17.1,16.1,1.8
25%,0.11785,-0.283419,-0.448459,0.0,18.55,23.2,2.675
50%,0.509527,-0.188747,-0.099675,0.0,22.0,26.5,5.7
75%,0.646054,0.036796,0.280675,0.0,29.575,26.5,9.1
max,1.328691,0.932467,0.418688,0.0,35.8,33.7,13.7


In [52]:
dfCheck = pd.DataFrame()
dfCheck['df1'] = df1['error']
dfCheck['df2'] = df2['error']
dfCheck['df3'] = df3['error']
dfCheck.describe()

Unnamed: 0,df1,df2,df3
count,6.0,6.0,6.0
mean,6.466667,6.466667,4.553167
std,4.735258,4.735258,4.220163
min,1.8,1.8,0.704
25%,2.675,2.675,1.94175
50%,5.7,5.7,3.5605
75%,9.1,9.1,5.1395
max,13.7,13.7,12.426
