In [758]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [759]:
dataset = pd.read_excel('../data/boxScores.xlsx')
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,STL,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,0,5,3.0,3.0,3.0,3.0,5.7
1,A.J. Lawson,TOR,TOR @ MEM,2024-12-26,L,5,6,2,3,66.7,...,0,0,0,0,-4,8.4,5.7,5.7,5.7,5.7
2,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,1,0,0,0,4,4.5,4.5,4.5,4.5,13.85
3,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,0,1,-6,11.7,8.1,8.1,8.1,13.85
4,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.85


In [760]:
dataset['AveDiff'] = abs(dataset['FP'] - dataset['Season_FP_Avg'])

In [761]:
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,BLK,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,0,5,3.0,3.0,3.0,3.0,5.7,2.7
1,A.J. Lawson,TOR,TOR @ MEM,2024-12-26,L,5,6,2,3,66.7,...,0,0,0,-4,8.4,5.7,5.7,5.7,5.7,2.7
2,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,0,0,0,4,4.5,4.5,4.5,4.5,13.85,9.35
3,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,0,1,-6,11.7,8.1,8.1,8.1,13.85,2.15
4,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,0,1,0,-3,5.5,7.233333,7.233333,7.233333,13.85,8.35


In [762]:
clusterdf = pd.read_excel('../data/clusteredPlayers.xlsx')

In [763]:
clusterdf = clusterdf.drop('Unnamed: 0', axis=1)
clusterdf.head()

Unnamed: 0,Player,Cluster
0,Anthony Black,13
1,Bennedict Mathurin,4
2,Clint Capela,3
3,De'Aaron Fox,8
4,Jake LaRavia,2


In [764]:
clusterDict = {}

In [765]:
clusterDict.update(pd.Series(clusterdf['Cluster'].values, index=clusterdf['Player']).to_dict())

In [766]:
dataset['CLUSTER'] = dataset['PLAYER'].map(clusterDict)

In [767]:
dataset.head()

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff,CLUSTER
0,A.J. Lawson,TOR,TOR @ NYK,2024-12-23,L,4,3,1,1,100,...,0,0,5,3.0,3.0,3.0,3.0,5.7,2.7,
1,A.J. Lawson,TOR,TOR @ MEM,2024-12-26,L,5,6,2,3,66.7,...,0,0,-4,8.4,5.7,5.7,5.7,5.7,2.7,
2,AJ Green,MIL,MIL @ PHI,2024-10-23,W,4,0,0,0,-,...,0,0,4,4.5,4.5,4.5,4.5,13.85,9.35,11.0
3,AJ Green,MIL,MIL vs. CHI,2024-10-25,L,11,9,3,4,75.0,...,0,1,-6,11.7,8.1,8.1,8.1,13.85,2.15,11.0
4,AJ Green,MIL,MIL @ BKN,2024-10-27,L,13,5,1,4,25.0,...,1,0,-3,5.5,7.233333,7.233333,7.233333,13.85,8.35,11.0


In [768]:
dataset['CLUSTER'] = dataset['CLUSTER'].replace(clusterDict)
dataset.sort_values(by=['GAME DATE'], ascending = [True], inplace=True)
dataset.head(5)

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,TOV,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff,CLUSTER
3925,Jalen Brunson,NYK,NYK @ BOS,2024-10-22,L,25,22,9,14,64.3,...,4,3,-23,22.2,22.2,22.2,22.2,39.543333,17.343333,8.0
612,Ariel Hukporti,NYK,NYK @ BOS,2024-10-22,L,6,0,0,0,-,...,0,0,9,7.8,7.8,7.8,7.8,5.95,1.85,
7067,Naz Reid,MIN,MIN @ LAL,2024-10-22,L,26,12,3,8,37.5,...,1,3,-6,17.3,17.3,17.3,17.3,23.872414,6.572414,2.0
6811,Mike Conley,MIN,MIN @ LAL,2024-10-22,L,20,5,1,7,14.3,...,3,1,-22,12.8,12.8,12.8,12.8,22.548,9.748,14.0
7096,Neemias Queta,BOS,BOS vs. NYK,2024-10-22,W,4,0,0,0,-,...,0,1,0,0.0,0.0,0.0,0.0,14.879167,14.879167,3.0


In [769]:
# dataset.info()
len(dataset)

9665

In [770]:
# Remove players without a cluster
dataset = dataset.dropna(subset=['CLUSTER'])
dataset.info()
len(dataset)

<class 'pandas.core.frame.DataFrame'>
Index: 7280 entries, 3925 to 8799
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   PLAYER         7280 non-null   object        
 1   TEAM           7280 non-null   object        
 2   MATCH UP       7280 non-null   object        
 3   GAME DATE      7280 non-null   datetime64[ns]
 4   W/L            7280 non-null   object        
 5   MIN            7280 non-null   int64         
 6   PTS            7280 non-null   int64         
 7   FGM            7280 non-null   int64         
 8   FGA            7280 non-null   int64         
 9   FG%            7280 non-null   object        
 10  3PM            7280 non-null   int64         
 11  3PA            7280 non-null   int64         
 12  3P%            7280 non-null   object        
 13  FTM            7280 non-null   int64         
 14  FTA            7280 non-null   int64         
 15  FT%            7280 non

7280

In [771]:
# Add additional features
dataset['FPPM'] = dataset['FP'] / dataset['MIN']

In [772]:
cluster0 = dataset[dataset['CLUSTER'] == 0]
cluster1 = dataset[dataset['CLUSTER'] == 1]
cluster2 = dataset[dataset['CLUSTER'] == 2] 
cluster3 = dataset[dataset['CLUSTER'] == 3]
cluster4 = dataset[dataset['CLUSTER'] == 4]
cluster5 = dataset[dataset['CLUSTER'] == 5]
cluster6 = dataset[dataset['CLUSTER'] == 6]
cluster7 = dataset[dataset['CLUSTER'] == 7]
cluster8 = dataset[dataset['CLUSTER'] == 8]
cluster9 = dataset[dataset['CLUSTER'] == 9]
cluster10 = dataset[dataset['CLUSTER'] == 10]
cluster11 = dataset[dataset['CLUSTER'] == 11]
cluster12 = dataset[dataset['CLUSTER'] == 12]
cluster13 = dataset[dataset['CLUSTER'] == 13]
cluster14 = dataset[dataset['CLUSTER'] == 14]

In [773]:
cluster2.describe()
cluster0.head(150)

Unnamed: 0,PLAYER,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,...,PF,+/-,FP,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,AveDiff,CLUSTER,FPPM
5053,Jordan Clarkson,UTA,UTA vs. MEM,2024-10-23,L,25,17,7,16,43.8,...,1,-3,34.5,34.500000,34.50,34.500000,27.133333,7.366667,0.0,1.380000
8615,T.J. McConnell,IND,IND @ DET,2024-10-23,W,16,14,7,8,87.5,...,0,-11,24.4,24.400000,24.40,24.400000,24.053333,0.346667,0.0,1.525000
2304,Dejounte Murray,NOP,NOP vs. CHI,2024-10-23,W,30,14,4,15,26.7,...,0,5,40.6,40.600000,40.60,40.600000,39.621429,0.978571,0.0,1.353333
5173,Jose Alvarado,NOP,NOP vs. CHI,2024-10-23,W,13,3,1,3,33.3,...,3,-5,10.5,10.500000,10.50,10.500000,24.209091,13.709091,0.0,0.807692
5185,Josh Giddey,CHI,CHI @ NOP,2024-10-23,L,30,14,5,11,45.5,...,1,-22,21.5,21.500000,21.50,21.500000,31.485185,9.985185,0.0,0.716667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5064,Jordan Clarkson,UTA,UTA @ LAL,2024-11-19,L,26,18,6,15,40.0,...,2,-1,25.8,29.900000,31.90,30.700000,27.133333,1.333333,0.0,0.992308
8124,Russell Westbrook,DEN,DEN @ MEM,2024-11-19,W,32,12,5,12,41.7,...,3,-2,49.0,30.466667,27.74,31.728571,30.807143,18.192857,0.0,1.531250
5890,Kevin Porter Jr.,LAC,LAC vs. ORL,2024-11-20,W,20,10,5,10,50.0,...,3,9,29.8,23.366667,18.04,19.300000,20.529630,9.270370,0.0,1.490000
8628,T.J. McConnell,IND,IND @ HOU,2024-11-20,L,19,17,8,12,66.7,...,1,-6,29.2,29.800000,26.54,23.957143,24.053333,5.146667,0.0,1.536842


In [774]:
dfFeatures = cluster0[['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM']]
dfLabels = cluster0['FP']

In [775]:
labels = np.array(dfLabels)
features = np.array(dfFeatures)

In [776]:
train, test, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=4)

In [777]:
reg = RandomForestRegressor(
    random_state=4,
)
reg.fit(train, train_labels)

In [778]:
train_predictions = reg.predict(train)
predictions = reg.predict(test)

In [779]:
df0 = pd.DataFrame(test, columns = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM'])
df0['Actual'] = test_labels
df0['Predicted'] = predictions
df0['Error'] = abs(df0['Actual'] - df0['Predicted'])

In [780]:
df0.describe()
# df0.head()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,Actual,Predicted,Error
count,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0
mean,23.548,23.102267,23.169727,23.650437,1.012098,24.290667,25.32696,4.769253
std,8.95156,8.132993,7.887497,6.015015,0.349378,10.784075,10.233881,3.983192
min,7.433333,8.35,8.35,11.028571,0.363636,2.5,7.988,0.065
25%,16.816667,16.47,16.741429,20.52963,0.748074,17.3,16.158,1.603
50%,23.466667,23.85,24.4,24.053333,1.04,23.1,27.84,3.988
75%,30.45,29.73,29.7,28.233333,1.263305,30.55,32.0775,7.412
max,46.233333,41.12,42.028571,39.621429,2.007692,52.2,44.84,16.49


Fantasy Points Per Minute may be better than the raw Fantasy Points.  This is because players who play more minutes are more likely to score more Fantasy Points.  This is a way to normalize the data. 

In [781]:
dfgood = df0[(df0['Error']<5)]
dfgood.describe()

Unnamed: 0,Last3_FP_Avg,Last5_FP_Avg,Last7_FP_Avg,Season_FP_Avg,FPPM,Actual,Predicted,Error
count,47.0,47.0,47.0,47.0,47.0,47.0,47.0,47.0
mean,23.159929,23.23234,23.481074,23.074253,0.940134,23.334043,23.655894,2.252574
std,9.234566,8.185443,8.045307,5.935141,0.345868,10.582271,10.562244,1.573934
min,7.433333,8.35,8.35,11.028571,0.363636,4.0,7.988,0.065
25%,15.466667,16.95,16.755714,17.937931,0.676026,14.55,14.1025,0.7545
50%,23.366667,24.3,25.314286,24.053333,0.996429,22.4,26.363,2.352
75%,30.025,29.8,29.953571,28.233333,1.17511,30.15,31.1165,3.6125
max,42.866667,35.62,34.3,31.485185,1.733333,45.3,44.84,4.874


In [782]:
# Identfy unique list of cluster to loop over
clusterdf.head()

Unnamed: 0,Player,Cluster
0,Anthony Black,13
1,Bennedict Mathurin,4
2,Clint Capela,3
3,De'Aaron Fox,8
4,Jake LaRavia,2


In [783]:
clusterList = clusterdf['Cluster'].tolist()

In [784]:
newClusterSet = set(clusterList)
newClusterSet

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [785]:
uniqueClusterList = list(set(clusterList))
uniqueClusterList

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [786]:
ClusterError = 0
for cluster in uniqueClusterList:
    clusterdf = dataset[dataset['CLUSTER'] == cluster]
    dfFeatures = clusterdf[['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM']]
    dfLabels = clusterdf[['FP']]

    labels = np.array(dfLabels).ravel()
    features = np.array(dfFeatures)

    train, test, train_labels, test_labels = train_test_split(features, labels, test_size=0.25, random_state=4)

    reg = RandomForestRegressor(random_state=4)
    reg.fit(train, train_labels)

    train_predictions = reg.predict(train)
    predictions = reg.predict(test)

    df0 = pd.DataFrame(test, columns = ['Last3_FP_Avg', 'Last5_FP_Avg', 'Last7_FP_Avg', 'Season_FP_Avg', 'FPPM'])
    df0['Actual'] = test_labels
    df0['Predicted'] = predictions
    df0['Error'] = abs(df0['Actual'] - df0['Predicted'])
    ClusterError += df0['Error'].mean()

    print(f"Cluster {cluster} average error is {df0['Error'].mean()}")
avgClusterError = ClusterError/len(uniqueClusterList)
print(f"Average error across clusters is: {avgClusterError}")

ValueError: Shape of passed values is (75, 5), indices imply (75, 4)