In [149]:
 import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [150]:
# Read the team game statistics CSV (path is relative to the notebook) into a DataFrame.
csvpath = Path('../Project-03/teamGameStats.csv')
teamstats_df = pd.read_csv(filepath_or_buffer=csvpath)

In [151]:
# Print the per-column summary (dtype and non-null count) for the raw frame.
# `DataFrame.info` takes no `include` argument (that option belongs to
# `describe`), so the full per-column listing is requested via `verbose=True`.
teamstats_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13658 entries, 0 to 13657
Data columns (total 24 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TEAM      13658 non-null  object 
 1   MATCHUP   13658 non-null  object 
 2   GAMEDATE  13658 non-null  object 
 3   W/L       13658 non-null  object 
 4   MIN       13658 non-null  int64  
 5   PTS       13658 non-null  int64  
 6   FGM       13658 non-null  int64  
 7   FGA       13658 non-null  int64  
 8   FG%       13658 non-null  float64
 9   3PM       13658 non-null  int64  
 10  3PA       13658 non-null  int64  
 11  3P%       13658 non-null  float64
 12  FTM       13658 non-null  int64  
 13  FTA       13658 non-null  int64  
 14  FT%       13658 non-null  float64
 15  OREB      13658 non-null  int64  
 16  DREB      13658 non-null  int64  
 17  REB       13658 non-null  int64  
 18  AST       13658 non-null  int64  
 19  TOV       13658 non-null  int64  
 20  STL       13658 non-null  in

In [152]:
# Remove the identifier columns (TEAM, MATCHUP, GAMEDATE), the outcome column
# (W/L) and the PTS and +/- columns so they are not passed to the scaler.
# errors='ignore' keeps this cell idempotent: the result is assigned back to
# `teamstats_df`, so re-running the cell would otherwise raise KeyError on the
# columns that were already removed.
teamstats_df = teamstats_df.drop(
    columns=['TEAM', 'MATCHUP', 'GAMEDATE', 'W/L', '+/-', 'PTS'],
    errors='ignore',
)

In [153]:
# Standardise every remaining column, then preview the first five scaled rows.
feature_scaler = StandardScaler()
teamstats_scaled = feature_scaler.fit_transform(teamstats_df)
print(teamstats_scaled[:5])

[[-0.22894488  0.98348849  0.69748943  0.53604426  0.58908519  0.28071198
   0.5308945   1.41574728  1.53075361 -0.03627464  1.05428061  0.5449045
   1.04771713 -0.68886744 -1.06487493  0.10304907  0.85174609 -0.28069469]
 [-0.22894488 -0.17458426 -0.5445868   0.18617842  1.86170003  1.67996901
   0.67340049 -0.73810329 -0.77749535 -0.03627464 -0.83745679  0.91454321
   0.28290042  0.47289727 -0.05108653 -0.57638335  1.25266794  1.10884067]
 [ 3.30032054  0.21143999  1.93956567 -0.99231703 -1.19257558 -0.35531394
  -1.48611341  1.08438565  0.98763621  0.30392597  2.67576981 -0.37919226
   1.20068047  0.27926982 -0.30453363 -0.57638335 -0.35101943 -0.97546237]
 [-0.22894488 -0.36759638  0.00744708 -0.42148329  1.35265409  1.17114827
   0.5747425  -0.07538004 -0.23437794  0.40112614  1.59477701  0.5449045
   1.35364381  0.86015217  0.70925476  0.44276529 -1.55378496 -1.67023005]
 [-0.22894488 -0.75362063  0.55948096 -1.17645694 -0.17448371  0.78953272
  -0.87224143  0.25598159  0.7160775

In [154]:
# (disabled) build the feature set X from every column except the 'W/L' label
#X = teamstats_df.copy()
#X.drop('W/L', axis=1, inplace=True)
#X.head(20)

In [155]:
# converting 'W/L' to 0/1's
#y=teamstats_df['W/L'].apply(lambda x: 1 if x=="W" else 0)


In [156]:
# Project the standardised features onto their first five principal components.
# random_state pins the result when scikit-learn picks a randomized SVD solver
# (possible with the default svd_solver='auto', depending on the installed
# version); the deterministic solvers ignore it.
pca = PCA(n_components=5, random_state=0)

teamstats_pca = pca.fit_transform(teamstats_scaled)

In [157]:
# Wrap the PCA output in a DataFrame. Column names (pc01, pc02, ...) are derived
# from the number of components actually returned, so changing n_components in
# the PCA cell does not require editing a hard-coded list here.
pc_columns = [f"pc{i:02d}" for i in range(1, teamstats_pca.shape[1] + 1)]
df_teamstats_pca = pd.DataFrame(data=teamstats_pca, columns=pc_columns)
df_teamstats_pca.head()

Unnamed: 0,pc01,pc02,pc03,pc04,pc05
0,0.533969,1.555589,1.641546,0.301951,0.425619
1,1.984152,0.035597,0.128879,-0.258863,-2.34186
2,-1.094815,3.790584,0.148203,1.182956,2.108965
3,1.047679,1.325305,-0.440368,0.419597,-1.565845
4,-1.498792,1.340667,-0.885499,1.138336,-0.502115


In [158]:
# Share of the total variance explained by each retained principal component
# (one entry per component, in component order).
pca.explained_variance_ratio_

array([0.18651123, 0.14644611, 0.11806125, 0.08541942, 0.07958945])

In [159]:
# Elbow analysis: fit K-Means for k = 1..9 and record each model's inertia.
#
# Cluster on the principal-component columns only. A later cell adds a "class"
# label column to `df_teamstats_pca` in place; without this guard, re-running
# this cell afterwards would feed those labels back in as a feature.
# errors="ignore" makes the drop a no-op when the column is not there yet.
elbow_features = df_teamstats_pca.drop(columns="class", errors="ignore")

inertia = []
k = list(range(1, 10))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [160]:
# Predicting clusters with k=3

# Cluster on the principal-component columns only. This cell writes a "class"
# column back into `df_teamstats_pca`, so on a re-run that column must be
# excluded or the previous labels would be used as a feature.
# errors="ignore" makes the drop a no-op on the first run.
cluster_features = df_teamstats_pca.drop(columns="class", errors="ignore")

# Initialize the K-Means model
# NOTE(review): k=3 is presumably read off the elbow curve - confirm.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model and get the cluster label of every row in a single pass
# (replaces a separate fit() followed by an unused predict() on the same data).
predictions = model.fit_predict(cluster_features)

# Add the predicted class column
df_teamstats_pca["class"] = predictions
df_teamstats_pca.head()

Unnamed: 0,pc01,pc02,pc03,pc04,pc05,class
0,0.533969,1.555589,1.641546,0.301951,0.425619,0
1,1.984152,0.035597,0.128879,-0.258863,-2.34186,1
2,-1.094815,3.790584,0.148203,1.182956,2.108965,0
3,1.047679,1.325305,-0.440368,0.419597,-1.565845,0
4,-1.498792,1.340667,-0.885499,1.138336,-0.502115,2


In [161]:
# Plot the clusters on the first two principal components, one colour per
# K-Means class. A title is set so the figure stands on its own, matching the
# elbow-curve plot.
df_teamstats_pca.hvplot.scatter(
    x="pc01",
    y="pc02",
    by="class",
    hover_cols=["class"],
    title="K-Means Clusters (pc01 vs pc02)",
)