In [225]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [226]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,Naji Marshall,DAL,DAL vs. MIN,2024-12-25,L,14,4,1,3,33.3,...,0,0,1,1,-7,5.4,5.4,5.4,5.4,19.916
1,Chris Paul,SAS,SAS @ NYK,2024-12-25,L,36,13,4,8,50.0,...,1,0,1,1,4,32.7,32.7,32.7,32.7,30.266667
2,Harrison Barnes,SAS,SAS @ NYK,2024-12-25,L,17,3,1,3,33.3,...,0,1,1,1,-10,7.7,7.7,7.7,7.7,17.956667
3,Karl-Anthony Towns,NYK,NYK vs. SAS,2024-12-25,W,30,21,9,16,56.3,...,0,0,1,5,-13,33.8,33.8,33.8,33.8,49.057143
4,Cameron Payne,NYK,NYK vs. SAS,2024-12-25,W,5,2,1,3,33.3,...,0,0,2,1,-10,1.5,1.5,1.5,1.5,16.279167


In [227]:
dataset['AveDiff'] = dataset['FP'] - dataset['Season_FP_Avg']

In [228]:
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff
0,Naji Marshall,DAL,DAL vs. MIN,2024-12-25,L,14,4,1,3,33.3,...,0,1,1,-7,5.4,5.4,5.4,5.4,19.916,-14.516
1,Chris Paul,SAS,SAS @ NYK,2024-12-25,L,36,13,4,8,50.0,...,0,1,1,4,32.7,32.7,32.7,32.7,30.266667,2.433333
2,Harrison Barnes,SAS,SAS @ NYK,2024-12-25,L,17,3,1,3,33.3,...,1,1,1,-10,7.7,7.7,7.7,7.7,17.956667,-10.256667
3,Karl-Anthony Towns,NYK,NYK vs. SAS,2024-12-25,W,30,21,9,16,56.3,...,0,1,5,-13,33.8,33.8,33.8,33.8,49.057143,-15.257143
4,Cameron Payne,NYK,NYK vs. SAS,2024-12-25,W,5,2,1,3,33.3,...,0,2,1,-10,1.5,1.5,1.5,1.5,16.279167,-14.779167


In [229]:
clusterdf = pd.read_excel('../data/clusteredPlayers.xlsx')

In [230]:
clusterdf = clusterdf.drop('Unnamed: 0', axis=1)
clusterdf.head()

Unnamed: 0,Player,Cluster
0,Anthony Black,8
1,Ayo Dosunmu,8
2,Bennedict Mathurin,11
3,Chris Paul,6
4,Clint Capela,2


In [231]:
clusterDict = {}

In [232]:
clusterDict.update(pd.Series(clusterdf['Cluster'].values, index=clusterdf['Player']).to_dict())
clusterDict

{'Anthony Black': 8,
 'Ayo Dosunmu': 8,
 'Bennedict Mathurin': 11,
 'Chris Paul': 6,
 'Clint Capela': 2,
 "De'Aaron Fox": 6,
 'Georges Niang': 1,
 'Harrison Barnes': 13,
 'Ivica Zubac': 2,
 'Jake LaRavia': 10,
 'Jalen Brunson': 6,
 'Jarace Walker': 4,
 'Jarrett Allen': 2,
 'Jaylen Wells': 13,
 'Jonathan Mogbo': 10,
 'Julian Champagnie': 13,
 'Kentavious Caldwell-Pope': 1,
 'Kris Dunn': 10,
 'Malik Beasley': 1,
 'Mikal Bridges': 13,
 'Moritz Wagner': 7,
 'Nicolas Batum': 1,
 'OG Anunoby': 13,
 'Ochai Agbaji': 13,
 'Pascal Siakam': 11,
 'Payton Pritchard': 11,
 'Ronald Holland II': 12,
 'Santi Aldama': 7,
 'Scotty Pippen Jr.': 8,
 'Tyrese Haliburton': 6,
 'Zaccharie Risacher': 13,
 'Alperen Sengun': 7,
 'Amen Thompson': 10,
 'Amir Coffey': 13,
 'Anthony Edwards': 6,
 'Brandon Clarke': 10,
 'Cody Martin': 8,
 'Dalton Knecht': 13,
 'Daniel Gafford': 2,
 'Darius Garland': 6,
 'Deni Avdija': 0,
 'Derrick White': 11,
 'Donovan Mitchell': 6,
 'Donte DiVincenzo': 1,
 'Gabe Vincent': 1,
 'Jabari

In [233]:
dataset['CLUSTER'] = dataset['PLAYER'].map(clusterDict)

In [234]:
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff,CLUSTER
0,Naji Marshall,DAL,DAL vs. MIN,2024-12-25,L,14,4,1,3,33.3,...,1,1,-7,5.4,5.4,5.4,5.4,19.916,-14.516,8.0
1,Chris Paul,SAS,SAS @ NYK,2024-12-25,L,36,13,4,8,50.0,...,1,1,4,32.7,32.7,32.7,32.7,30.266667,2.433333,6.0
2,Harrison Barnes,SAS,SAS @ NYK,2024-12-25,L,17,3,1,3,33.3,...,1,1,-10,7.7,7.7,7.7,7.7,17.956667,-10.256667,13.0
3,Karl-Anthony Towns,NYK,NYK vs. SAS,2024-12-25,W,30,21,9,16,56.3,...,1,5,-13,33.8,33.8,33.8,33.8,49.057143,-15.257143,7.0
4,Cameron Payne,NYK,NYK vs. SAS,2024-12-25,W,5,2,1,3,33.3,...,2,1,-10,1.5,1.5,1.5,1.5,16.279167,-14.779167,5.0


In [235]:
dataset['CLUSTER'] = dataset['CLUSTER'].replace(clusterDict)
dataset.head(50)

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff,CLUSTER
0,Naji Marshall,DAL,DAL vs. MIN,2024-12-25,L,14,4,1,3,33.3,...,1,1,-7,5.4,5.4,5.4,5.4,19.916,-14.516,8.0
1,Chris Paul,SAS,SAS @ NYK,2024-12-25,L,36,13,4,8,50.0,...,1,1,4,32.7,32.7,32.7,32.7,30.266667,2.433333,6.0
2,Harrison Barnes,SAS,SAS @ NYK,2024-12-25,L,17,3,1,3,33.3,...,1,1,-10,7.7,7.7,7.7,7.7,17.956667,-10.256667,13.0
3,Karl-Anthony Towns,NYK,NYK vs. SAS,2024-12-25,W,30,21,9,16,56.3,...,1,5,-13,33.8,33.8,33.8,33.8,49.057143,-15.257143,7.0
4,Cameron Payne,NYK,NYK vs. SAS,2024-12-25,W,5,2,1,3,33.3,...,2,1,-10,1.5,1.5,1.5,1.5,16.279167,-14.779167,5.0
5,OG Anunoby,NYK,NYK vs. SAS,2024-12-25,W,36,8,3,10,30.0,...,0,2,13,18.5,18.5,18.5,18.5,32.523333,-14.023333,13.0
6,Josh Hart,NYK,NYK vs. SAS,2024-12-25,W,38,12,4,8,50.0,...,1,4,1,40.4,40.4,40.4,40.4,35.42069,4.97931,11.0
7,Mikal Bridges,NYK,NYK vs. SAS,2024-12-25,W,43,41,17,25,68.0,...,0,2,10,60.2,60.2,60.2,60.2,30.426667,29.773333,13.0
8,Draymond Green,GSW,GSW vs. LAL,2024-12-25,,7,0,0,4,0.0,...,0,1,7,14.4,14.4,14.4,14.4,27.35,-12.95,13.0
9,Andrew Wiggins,GSW,GSW vs. LAL,2024-12-25,,18,10,4,5,80.0,...,0,0,5,14.2,14.2,14.2,14.2,29.032,-14.832,13.0


In [236]:
dataset.info()
len(dataset)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9459 entries, 0 to 9458
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PLAYER         9459 non-null   object        
 1   TEAM           9459 non-null   object        
 2   MATCH UP       9459 non-null   object        
 3   GAME DATE      9459 non-null   datetime64[ns]
 4   W/L            9440 non-null   object        
 5   MIN            9459 non-null   int64         
 6   PTS            9459 non-null   int64         
 7   FGM            9459 non-null   int64         
 8   FGA            9459 non-null   int64         
 9   FG%            9459 non-null   object        
 10  3PM            9459 non-null   int64         
 11  3PA            9459 non-null   int64         
 12  3P%            9459 non-null   object        
 13  FTM            9459 non-null   int64         
 14  FTA            9459 non-null   int64         
 15  FT%            9459 n

9459

In [237]:
# Remove players without a cluster
dataset = dataset.dropna(subset=['CLUSTER'])
dataset.info()
len(dataset)

<class 'pandas.core.frame.DataFrame'>
Index: 8885 entries, 0 to 9458
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PLAYER         8885 non-null   object        
 1   TEAM           8885 non-null   object        
 2   MATCH UP       8885 non-null   object        
 3   GAME DATE      8885 non-null   datetime64[ns]
 4   W/L            8866 non-null   object        
 5   MIN            8885 non-null   int64         
 6   PTS            8885 non-null   int64         
 7   FGM            8885 non-null   int64         
 8   FGA            8885 non-null   int64         
 9   FG%            8885 non-null   object        
 10  3PM            8885 non-null   int64         
 11  3PA            8885 non-null   int64         
 12  3P%            8885 non-null   object        
 13  FTM            8885 non-null   int64         
 14  FTA            8885 non-null   int64         
 15  FT%            8885 non-nu

8885

In [238]:
cluster0 = dataset[dataset['CLUSTER'] == 0]
cluster1 = dataset[dataset['CLUSTER'] == 1]
cluster2 = dataset[dataset['CLUSTER'] == 2] 
cluster3 = dataset[dataset['CLUSTER'] == 3]
cluster4 = dataset[dataset['CLUSTER'] == 4]
cluster5 = dataset[dataset['CLUSTER'] == 5]
cluster6 = dataset[dataset['CLUSTER'] == 6]
cluster7 = dataset[dataset['CLUSTER'] == 7]
cluster8 = dataset[dataset['CLUSTER'] == 8]
cluster9 = dataset[dataset['CLUSTER'] == 9]
cluster10 = dataset[dataset['CLUSTER'] == 10]
cluster11 = dataset[dataset['CLUSTER'] == 11]
cluster12 = dataset[dataset['CLUSTER'] == 12]
cluster13 = dataset[dataset['CLUSTER'] == 13]
cluster14 = dataset[dataset['CLUSTER'] == 14]

In [239]:
cluster0.describe()

Unnamed: 0,GAME DATE,MIN,PTS,FGM,FGA,3PM,3PA,FTM,FTA,OREB,...,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff,CLUSTER
count,582,582.0,582.0,582.0,582.0,582.0,582.0,582.0,582.0,582.0,...,582.0,582.0,582.0,582.0,582.0,582.0,582.0,582.0,582.0,582.0
mean,2024-11-20 02:55:40.206185728,29.190722,15.687285,5.654639,12.857388,1.847079,5.245704,2.530928,3.180412,0.757732,...,2.458763,2.213058,-6.022337,28.824914,28.914691,28.98197,29.012118,28.824914,-3.662591e-16,0.0
min,2024-10-23 00:00:00,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-43.0,1.5,3.9,3.9,3.9,21.151852,-25.88462,0.0
25%,2024-11-04 00:00:00,25.0,10.0,3.0,9.0,1.0,3.0,0.0,1.0,0.0,...,1.0,1.0,-15.0,20.95,23.475,24.265,24.946429,24.991667,-6.92963,0.0
50%,2024-11-18 12:00:00,30.0,15.0,5.0,12.0,1.5,5.0,2.0,2.0,1.0,...,2.0,2.0,-6.0,27.8,28.533333,28.9325,28.65,28.584615,-0.4100267,0.0
75%,2024-12-05 00:00:00,34.0,21.0,8.0,16.0,3.0,7.0,4.0,5.0,1.0,...,3.0,3.0,2.0,36.35,34.191667,33.3,33.207143,30.977778,6.953704,0.0
max,2024-12-25 00:00:00,44.0,43.0,16.0,28.0,9.0,20.0,11.0,13.0,4.0,...,10.0,6.0,32.0,68.1,68.1,68.1,68.1,40.804,29.01429,0.0
std,,6.434608,7.833364,2.921103,4.909712,1.591387,2.965047,2.425816,2.901141,0.926601,...,1.669051,1.38692,12.054431,11.276563,8.209132,7.392767,7.104916,5.011683,10.10168,0.0


In [240]:
dfFeatures = cluster0[['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
dfLabels = cluster0['FP']

In [241]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)

In [242]:
train, test, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)

In [243]:
reg = DecisionTreeRegressor(random_state=42)
reg.fit(train, train_labels)

In [244]:
train_predictions = reg.predict(train)
predictions = reg.predict(test)

In [245]:
df0 = pd.DataFrame(test, columns = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg'])
df0['Actual'] = test_labels
df0['Predicted'] = predictions
df0['Error'] = abs(df0['Actual'] - df0['Predicted'])

In [266]:
df0.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,Actual,Predicted,Error
count,78.0,78.0,78.0,78.0,78.0,78.0,78.0
mean,5.653846,5.463718,5.529054,5.655416,5.826923,5.446154,4.239744
std,3.919244,3.30589,3.128626,2.377626,5.564794,5.483309,3.748605
min,0.5,0.675,0.675,2.252632,-1.0,-1.0,0.0
25%,2.675,2.97,3.085714,4.042857,1.275,1.2,1.5
50%,4.7,4.27,5.142857,5.904545,4.1,4.9,3.0
75%,8.408333,7.9375,7.546429,7.26,9.775,7.475,6.15
max,19.566667,16.8,16.8,9.888235,26.3,24.6,17.2


Fantasy Points Per Minute may be better than the raw Fantasy Points.  This is because players who play more minutes are more likely to score more Fantasy Points.  This is a way to normalize the data. 

In [247]:
dfgood = df0[(df0['Error']<5)]
dfgood.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,Actual,Predicted,Error
count,53.0,53.0,53.0,53.0,53.0,53.0,53.0
mean,27.836164,28.666698,28.377516,28.277995,27.875472,28.233962,2.132075
std,7.453075,7.18697,6.954949,5.017373,8.98445,9.124712,1.340174
min,9.4,9.4,9.4,21.151852,9.4,10.6,0.0
25%,22.3,25.16,25.16,25.086667,21.1,21.8,1.0
50%,27.6,28.6,28.314286,27.133333,26.0,25.9,2.3
75%,31.3,31.64,30.714286,30.566667,33.7,34.3,3.1
max,52.45,52.45,52.45,40.804,53.0,52.2,4.9


In [248]:
# Identfy unique list of cluster to loop over
clusterdf.head()

Unnamed: 0,Player,Cluster
0,Anthony Black,8
1,Ayo Dosunmu,8
2,Bennedict Mathurin,11
3,Chris Paul,6
4,Clint Capela,2


In [249]:
clusterList = clusterdf['Cluster'].tolist()

In [250]:
clusterList

[8,
 8,
 11,
 6,
 2,
 6,
 1,
 13,
 2,
 10,
 6,
 4,
 2,
 13,
 10,
 13,
 1,
 10,
 1,
 13,
 7,
 1,
 13,
 13,
 11,
 11,
 12,
 7,
 8,
 6,
 13,
 7,
 10,
 13,
 6,
 10,
 8,
 13,
 2,
 6,
 0,
 11,
 6,
 1,
 1,
 13,
 8,
 11,
 11,
 6,
 14,
 11,
 4,
 11,
 13,
 4,
 13,
 1,
 7,
 13,
 2,
 8,
 0,
 13,
 5,
 2,
 10,
 10,
 7,
 13,
 1,
 13,
 10,
 0,
 5,
 0,
 11,
 8,
 13,
 8,
 7,
 8,
 7,
 4,
 7,
 11,
 13,
 11,
 4,
 7,
 13,
 1,
 1,
 8,
 13,
 1,
 1,
 6,
 4,
 5,
 1,
 13,
 6,
 11,
 1,
 4,
 7,
 7,
 8,
 0,
 0,
 4,
 12,
 6,
 10,
 2,
 0,
 2,
 3,
 5,
 1,
 4,
 6,
 13,
 4,
 9,
 7,
 0,
 8,
 13,
 13,
 4,
 6,
 6,
 6,
 9,
 13,
 2,
 13,
 14,
 0,
 14,
 13,
 2,
 13,
 11,
 0,
 6,
 13,
 6,
 11,
 1,
 1,
 2,
 1,
 2,
 7,
 8,
 11,
 1,
 9,
 1,
 1,
 11,
 0,
 5,
 11,
 2,
 13,
 11,
 13,
 1,
 11,
 3,
 0,
 0,
 5,
 1,
 11,
 8,
 7,
 0,
 5,
 4,
 7,
 13,
 9,
 13,
 11,
 5,
 10,
 7,
 4,
 10,
 2,
 4,
 2,
 7,
 11,
 13,
 4,
 5,
 10,
 8,
 4,
 1,
 2,
 8,
 10,
 1,
 1,
 3,
 8,
 8,
 4,
 13,
 11,
 5,
 2,
 11,
 12,
 1,
 6,
 8,
 9,
 12,
 13,
 4,
 8,
 13,

In [251]:
newClusterList = []
for cluster in clusterList:
    if not cluster in newClusterList:
        newClusterList.append(cluster)
newClusterList

[8, 11, 6, 2, 1, 13, 10, 4, 7, 12, 0, 14, 5, 3, 9]

In [252]:
newClusterSet = set(clusterList)
newClusterSet

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [253]:
uniqueClusterList = list(set(clusterList))
uniqueClusterList

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [262]:
for cluster in uniqueClusterList:
    clusterdf = dataset[dataset['CLUSTER'] == cluster]
    dfFeatures = clusterdf[['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg']]
    dfLabels = clusterdf[['FP']]

    labels = np.array(dfLabels)
    features = np.array(dfFeatures)

    train, test, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=42)

    reg = DecisionTreeRegressor(random_state=42)
    reg.fit(train, train_labels)

    train_predictions = reg.predict(train)
    predictions = reg.predict(test)

    df0 = pd.DataFrame(test, columns = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg'])
    df0['Actual'] = test_labels
    df0['Predicted'] = predictions
    df0['Error'] = abs(df0['Actual'] - df0['Predicted'])

    print(f"Cluster {cluster} average error is {df0['Error'].mean()}")

Cluster 0 average error is 8.117808219178082
Cluster 1 average error is 7.375527426160337
Cluster 2 average error is 9.969999999999999
Cluster 3 average error is 6.523076923076922
Cluster 4 average error is 7.2835897435897445
Cluster 5 average error is 5.497692307692308
Cluster 6 average error is 10.194904458598726
Cluster 7 average error is 10.16842105263158
Cluster 8 average error is 8.848630136986301
Cluster 9 average error is 7.521649484536082
Cluster 10 average error is 7.625396825396827
Cluster 11 average error is 9.21780104712042
Cluster 12 average error is 6.6436170212765955
Cluster 13 average error is 8.166319444444445
Cluster 14 average error is 4.23974358974359
