In [209]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [210]:
dataset = pd.read_excel('../data/boxScores.xlsx')
# Filter out rows where the 'MIN' column is zero
dataset = dataset[dataset['MIN'] != 0]
dataset['FPPM'] = dataset['FP'] / dataset['MIN']
dataset.sort_values(by=['GAME DATE'], ascending = [True], inplace=True)
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,5,3.0,3.0,3.0,3.0,5.7,0.75
1,A.J. Lawson,TOR,TOR @ MEM,2024-12-26,L,5,6,2,3,66.7,...,0,0,0,-4,8.4,5.7,5.7,5.7,5.7,1.68
2,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,0,0,0,4,4.5,4.5,4.5,4.5,13.85,1.125
3,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,1,-6,11.7,8.1,8.1,8.1,13.85,1.063636
4,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.85,0.423077


In [211]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM']]
datasetCM = dataset[dataset['PLAYER'] == 'Cody Martin']
datasetCM.head()

Unnamed: 0,PLAYER,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM
1663,Cody Martin,22.1,22.1,22.1,22.1,21.933333,0.920833
1664,Cody Martin,10.5,16.3,16.3,16.3,21.933333,0.456522
1665,Cody Martin,20.6,17.733333,17.733333,17.733333,21.933333,0.762963
1666,Cody Martin,35.8,22.3,22.25,22.25,21.933333,1.432
1667,Cody Martin,17.1,24.5,21.22,21.22,21.933333,0.657692


In [212]:
dataset.describe()

Unnamed: 0,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM
count,9635.0,9635.0,9635.0,9635.0,9635.0,9635.0
mean,21.549642,21.461673,21.381588,21.317007,21.530204,0.902938
std,14.945291,13.128096,12.736635,12.580747,11.891276,0.468288
min,-3.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,9.6,11.458333,11.66,11.7,12.362069,0.625
50%,19.8,20.066667,20.04,19.942857,19.964286,0.888462
75%,31.1,29.866667,29.66,29.477381,28.423333,1.165451
max,98.3,80.5,75.36,72.2,63.26,7.2


For reference, features are the raw input data provided to the model to generate predictions, while the label is the target outcome that the model aims to predict. These above features can and probably should be tweaked and adjusted to improve the model's performance.

In [213]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM']
labelName = ['FP']
dfFeatures = dataset[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM
0,3.0,3.0,3.0,5.7,0.75
1,5.7,5.7,5.7,5.7,1.68
2,4.5,4.5,4.5,13.85,1.125
3,8.1,8.1,8.1,13.85,1.063636
4,7.233333,7.233333,7.233333,13.85,0.423077


In [214]:
dfLabels = dataset[labelName]
dfLabels.head()

Unnamed: 0,FP
0,3.0
1,8.4
2,4.5
3,11.7
4,5.5


In [215]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
labels

array([[ 3. ],
       [ 8.4],
       [ 4.5],
       ...,
       [31.4],
       [59.2],
       [46.6]])

** Note numpy array is only numbers while pandas dataframe can be a mix of numbers and strings.

In [216]:
# Train is your training data while test is your testing data
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=30)
# In general a test_size should be small, always less than 50%


In [217]:
train

array([[34.76666667, 30.2       , 30.2       , 40.11071429,  0.68484848],
       [29.96666667, 25.02      , 26.47142857, 30.18      ,  0.75294118],
       [26.        , 21.16      , 17.58571429, 14.87916667,  0.92592593],
       ...,
       [21.66666667, 23.4       , 24.75714286, 19.646875  ,  0.678125  ],
       [10.5       , 10.5       , 10.5       , 18.63870968,  0.45652174],
       [33.26666667, 31.18      , 28.25714286, 28.252     ,  0.92571429]])

In [218]:
test

array([[14.63333333, 17.84      , 16.81428571, 16.52413793,  0.66052632],
       [ 4.7       ,  4.7       ,  4.7       ,  7.15      ,  0.26666667],
       [11.46666667, 13.6       , 16.24285714, 13.85      ,  0.15384615],
       ...,
       [ 0.        ,  0.        ,  0.        ,  5.62105263,  0.        ],
       [32.03333333, 32.82      , 34.32857143, 32.45882353,  1.34827586],
       [17.06666667, 19.94      , 21.5       , 18.63103448,  0.9875    ]])

In [219]:
# Single decision tree
tree = DecisionTreeRegressor(random_state=30, max_depth=20)
tree.fit(train,trainLabels)
print(f'Decision tree has {tree.tree_.node_count} nodes with maximum depth {tree.tree_.max_depth}')

Decision tree has 13103 nodes with maximum depth 20


In [220]:
train_predictions = tree.predict(train)
predictions = tree.predict(test)
df1 = pd.DataFrame(test, columns = featureNames)

In [221]:
df1['actual'] = testLabels
df1['predicted'] = predictions
df1['error'] = abs(df1['actual'] - df1['predicted'])

In [222]:
# Error shows whether the algorithm is good or not
df1.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,actual,predicted,error
count,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0
mean,21.400251,21.344522,21.276503,21.473866,0.915745,21.643954,21.633655,5.146333
std,12.933505,12.59313,12.392549,11.637394,0.481962,14.570644,14.78137,4.806147
min,-1.0,-1.0,-1.0,0.0,-0.5,-2.0,-2.0,0.0
25%,11.558333,11.92,11.952381,12.6,0.639231,9.75,10.2,1.4
50%,20.1,20.04,19.985714,20.103226,0.9,20.4,19.7,3.8
75%,29.916667,29.71,29.685714,28.455556,1.166667,31.8,31.35,7.6
max,72.033333,71.86,69.042857,63.26,5.0,89.7,82.1,29.7


In [223]:
# Scaling the data
x = features
x = StandardScaler().fit_transform(x)
x

array([[-1.40634494, -1.44328092, -1.456031  , -1.33131425, -0.32660614],
       [-1.20066848, -1.231283  , -1.24140622, -1.33131425,  1.65945612],
       [-1.29208024, -1.3255043 , -1.33679501, -0.64590227,  0.47422541],
       ...,
       [ 0.80531172,  1.20276716,  1.22280427,  1.72991225,  0.1037397 ],
       [ 1.70927242,  1.55609702,  1.58051225,  1.72991225,  1.48861205],
       [ 1.84892926,  1.49171247,  1.65205384,  1.72991225,  1.50333999]])

In [224]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=30)

In [225]:
reg = DecisionTreeRegressor(random_state=30)
reg.fit(train,trainLabels)

In [226]:
train_predictions = reg.predict(train)
predictions = reg.predict(test)
df2 = pd.DataFrame(test, columns = featureNames)
df2['actual'] = testLabels
df2['predicted'] = predictions
df2['error'] = abs(df2['actual'] - df2['predicted'])
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,actual,predicted,error
count,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0
mean,-0.004679,-0.00291,-0.00322,-0.004738,0.02735,21.643954,21.616243,5.224598
std,0.985229,0.988784,0.985092,0.978701,1.029254,14.570644,14.792381,4.935804
min,-1.711051,-1.757352,-1.773994,-1.810682,-2.996045,-2.0,-2.0,0.0
25%,-0.754401,-0.742903,-0.7444,-0.751027,-0.563159,9.75,10.1,1.4
50%,-0.103727,-0.105339,-0.105825,-0.120008,-0.006274,20.4,19.4,3.9
75%,0.644072,0.653928,0.665234,0.58242,0.563207,31.8,31.4,7.6
max,3.85237,3.963451,3.793759,3.50946,8.749485,89.7,78.7,30.2


In [227]:
# Earlier data for reference
# Note that describe() is different from head()
df1.describe() 


Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,actual,predicted,error
count,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0
mean,21.400251,21.344522,21.276503,21.473866,0.915745,21.643954,21.633655,5.146333
std,12.933505,12.59313,12.392549,11.637394,0.481962,14.570644,14.78137,4.806147
min,-1.0,-1.0,-1.0,0.0,-0.5,-2.0,-2.0,0.0
25%,11.558333,11.92,11.952381,12.6,0.639231,9.75,10.2,1.4
50%,20.1,20.04,19.985714,20.103226,0.9,20.4,19.7,3.8
75%,29.916667,29.71,29.685714,28.455556,1.166667,31.8,31.35,7.6
max,72.033333,71.86,69.042857,63.26,5.0,89.7,82.1,29.7


In [229]:
dataset = dataset[['PLAYER', 'FP', 'Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM']]

In [230]:
featureNames = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM']
labelName = ['FP']
dfFeatures = dataset[featureNames]
dfFeatures.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM
0,3.0,3.0,3.0,5.7,0.75
1,5.7,5.7,5.7,5.7,1.68
2,4.5,4.5,4.5,13.85,1.125
3,8.1,8.1,8.1,13.85,1.063636
4,7.233333,7.233333,7.233333,13.85,0.423077


In [231]:
dfLabels = dataset[labelName]
dfLabels.head()

Unnamed: 0,FP
0,3.0
1,8.4
2,4.5
3,11.7
4,5.5


In [232]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)
train, test, trainLabels, testLabels = train_test_split(features, labels, test_size=0.2, random_state=30)

In [233]:
rf = RandomForestRegressor(random_state=30)

In [234]:
rf.fit(train, trainLabels)

  return fit_method(estimator, *args, **kwargs)


In [235]:
rfPredNoStandard = rf.predict(test)

In [236]:
df3 = pd.DataFrame(test, columns = featureNames)
df3['actual'] = testLabels
df3['predicted'] = rfPredNoStandard
df3['error'] = abs(df3['actual'] - df3['predicted'])

In [237]:
train, test, trainLabels, testLabels = train_test_split(x, labels, test_size=0.2, random_state=30)
rf = RandomForestRegressor(random_state=30)
rf.fit(train, trainLabels)
rfPredNoStandard = rf.predict(test)

  return fit_method(estimator, *args, **kwargs)


In [238]:
df4 = pd.DataFrame(test, columns = featureNames)
df4['actual'] = testLabels
df4['predicted'] = rfPredNoStandard
df4['error'] = abs(df4['actual'] - df4['predicted'])

In [239]:
df4.describe()
# df3.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,actual,predicted,error
count,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0
mean,-0.004679,-0.00291,-0.00322,-0.004738,0.02735,21.643954,21.698947,3.753736
std,0.985229,0.988784,0.985092,0.978701,1.029254,14.570644,14.059849,3.515409
min,-1.711051,-1.757352,-1.773994,-1.810682,-2.996045,-2.0,-1.57,0.0
25%,-0.754401,-0.742903,-0.7444,-0.751027,-0.563159,9.75,10.5465,1.178
50%,-0.103727,-0.105339,-0.105825,-0.120008,-0.006274,20.4,20.103,2.753
75%,0.644072,0.653928,0.665234,0.58242,0.563207,31.8,30.5475,5.4015
max,3.85237,3.963451,3.793759,3.50946,8.749485,89.7,79.611,36.399


In [240]:
df2.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,actual,predicted,error
count,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0,1927.0
mean,-0.004679,-0.00291,-0.00322,-0.004738,0.02735,21.643954,21.616243,5.224598
std,0.985229,0.988784,0.985092,0.978701,1.029254,14.570644,14.792381,4.935804
min,-1.711051,-1.757352,-1.773994,-1.810682,-2.996045,-2.0,-2.0,0.0
25%,-0.754401,-0.742903,-0.7444,-0.751027,-0.563159,9.75,10.1,1.4
50%,-0.103727,-0.105339,-0.105825,-0.120008,-0.006274,20.4,19.4,3.9
75%,0.644072,0.653928,0.665234,0.58242,0.563207,31.8,31.4,7.6
max,3.85237,3.963451,3.793759,3.50946,8.749485,89.7,78.7,30.2


In [241]:
dfCheck = pd.DataFrame()
dfCheck['df1'] = df1['error']
dfCheck['df2'] = df2['error']
dfCheck['df3'] = df3['error']
dfCheck['df4'] = df4['error']
dfCheck.describe()

Unnamed: 0,df1,df2,df3,df4
count,1927.0,1927.0,1927.0,1927.0
mean,5.146333,5.224598,3.753905,3.753736
std,4.806147,4.935804,3.513366,3.515409
min,0.0,0.0,0.0,0.0
25%,1.4,1.4,1.1685,1.178
50%,3.8,3.9,2.759,2.753
75%,7.6,7.6,5.4095,5.4015
max,29.7,30.2,36.399,36.399


Note: Scaling and preprocessing data matters more on larger datasets

In [242]:
dfHighError = dfCheck[dfCheck['df3']>8.853]
dfHighError.describe()

Unnamed: 0,df1,df2,df3,df4
count,162.0,162.0,162.0,162.0
mean,12.182072,12.225926,11.978883,11.977858
std,6.262485,6.142546,3.601163,3.607959
min,0.0,0.0,8.855,8.784
25%,7.6,7.425,9.6215,9.621
50%,11.9,12.1,10.9845,10.9845
75%,16.0,16.35,12.87275,12.9895
max,29.7,28.6,36.399,36.399
