In [129]:
 import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [130]:
# Load the team game statistics CSV; the relative path is resolved against
# the notebook's current working directory.
csvpath = Path("../Project-03/teamGameStats.csv")
teamstats_df = pd.read_csv(filepath_or_buffer=csvpath)

In [131]:
# Column-by-column summary (non-null counts and dtypes).
# The first positional parameter of DataFrame.info is `verbose`; 'include=all' is a
# DataFrame.describe option and was only acting as a truthy `verbose` value here,
# so request the full listing explicitly instead.
teamstats_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Data columns (total 24 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TEAM      1700 non-null   object 
 1   MATCHUP   1700 non-null   object 
 2   GAMEDATE  1700 non-null   object 
 3   W/L       1700 non-null   object 
 4   MIN       1700 non-null   int64  
 5   PTS       1700 non-null   int64  
 6   FGM       1700 non-null   int64  
 7   FGA       1700 non-null   int64  
 8   FG%       1700 non-null   float64
 9   3PM       1700 non-null   int64  
 10  3PA       1700 non-null   int64  
 11  3P%       1700 non-null   float64
 12  FTM       1700 non-null   int64  
 13  FTA       1700 non-null   int64  
 14  FT%       1700 non-null   float64
 15  OREB      1700 non-null   int64  
 16  DREB      1700 non-null   int64  
 17  REB       1700 non-null   int64  
 18  AST       1700 non-null   int64  
 19  TOV       1700 non-null   int64  
 20  STL       1700 non-null   int6

In [132]:
# Remove the identifier columns, the 'W/L' outcome, and the 'PTS' and '+/-' columns
# so that only the remaining per-game stats are passed on to scaling.
# NOTE(review): 'PTS' and '+/-' are presumably excluded because they track the game
# result too directly - confirm.
# errors="ignore" keeps this cell re-runnable: it rebinds `teamstats_df`, so without
# it a second execution raises KeyError because the columns are already gone.
teamstats_df = teamstats_df.drop(
    columns=['TEAM', 'MATCHUP', 'GAMEDATE', 'W/L', '+/-', 'PTS'],
    errors="ignore",
)

In [133]:
# Standardize every remaining column (StandardScaler centres each feature and
# scales it to unit variance), then show the first five scaled rows as a sanity check.
scaler = StandardScaler()
teamstats_scaled = scaler.fit_transform(teamstats_df)
print(teamstats_scaled[:5])

[[-0.22424044  0.76382311  0.56422903  0.42595961  0.06949194 -0.37449775
   0.47067759  1.55063575  1.72481131 -0.13135184  1.21146975  0.48435507
   1.06245855 -0.9517666  -0.99793281  0.13949991  0.88429842 -0.09780737]
 [-0.22424044 -0.40652325 -0.79847092  0.07156016  1.2949722   1.1745197
   0.62190645 -0.67534997 -0.67676158 -0.13135184 -0.8150259   0.86025263
   0.27682152  0.26706822  0.01008013 -0.54392906  1.30421667  1.35647125]
 [ 3.66564477 -0.0164078   1.92692898 -1.12220641 -1.64618043 -1.07859658
  -1.66979243  1.20817641  1.15973533  0.20889345  2.94846602 -0.45538884
   1.21958596  0.06392908 -0.24192311 -0.54392906 -0.37545631 -0.82494667]
 [-0.22424044 -0.60158098 -0.1928265  -0.54397573  0.8047801   0.61124063
   0.51720954  0.00956872 -0.1116856   0.30610639  1.7904685   0.48435507
   1.37671337  0.67334649  0.76608983  0.4812144  -1.63521105 -1.55208598]
 [-0.22424044 -0.99169643  0.41281792 -1.30873244 -0.66579622  0.18878132
  -1.01834503  0.35202806  0.877197

In [134]:
# defining the feature set X: every column except the 'W/L' target
#X = teamstats_df.copy()
#X.drop('W/L', axis=1, inplace=True)
#X.head(20)

In [135]:
# converting 'W/L' to 0/1's
#y=teamstats_df['W/L'].apply(lambda x: 1 if x=="W" else 0)


In [136]:
# Project the standardized features onto 5 principal components.
# random_state is pinned (same seed as the KMeans models in this notebook) because,
# depending on the scikit-learn version, the default solver choice for an input of
# this size can be the randomized SVD, whose output is not exactly reproducible
# without a seed.
pca = PCA(n_components=5, random_state=0)

teamstats_pca = pca.fit_transform(teamstats_scaled)

In [137]:
# Wrap the PCA output in a DataFrame with one column per component.
# Column names (pc01, pc02, ...) are generated from the actual number of components
# so this cell keeps working if n_components is changed in the PCA cell.
pc_columns = [f"pc{idx:02d}" for idx in range(1, teamstats_pca.shape[1] + 1)]
df_teamstats_pca = pd.DataFrame(data=teamstats_pca, columns=pc_columns)
df_teamstats_pca.head()

Unnamed: 0,pc01,pc02,pc03,pc04,pc05
0,0.542188,1.274213,1.962342,0.97841,-0.393242
1,-1.335047,-0.05686,0.128731,-2.131515,-0.963441
2,2.502751,3.597007,0.862637,2.863725,0.299488
3,-0.204543,1.468486,-0.261556,-0.741929,-0.534874
4,2.529707,1.032873,-0.889897,0.402533,-0.507443


In [138]:
# Share of the total variance explained by each retained principal component,
# listed in component order.
pca.explained_variance_ratio_

array([0.1895116 , 0.13577759, 0.12038145, 0.08962069, 0.0857849 ])

In [144]:
# Elbow analysis: fit K-Means for k = 1..9 on the principal components and record
# each model's inertia.
#
# Only the component columns are used as features. A later cell adds a "class"
# label column to df_teamstats_pca; dropping it here (when present) means that
# re-running this cell after clustering does not feed the labels back in as a feature.
pca_features = df_teamstats_pca.drop(columns="class", errors="ignore")

inertia = []
k = list(range(1, 10))

# Calculate the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_features)
    inertia.append(km.inertia_)

# Creating the Elbow Curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [147]:
 # Predicting clusters with k = 3

# Cluster on the principal-component columns only. This cell writes a "class"
# column into df_teamstats_pca, so on a re-run that column is dropped from the
# feature matrix first; otherwise the previous labels would be used as a feature.
cluster_features = df_teamstats_pca.drop(columns="class", errors="ignore")

# Initialize the K-Means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model and get the cluster label of every row in one step
# (fit_predict returns the fitted model's labels_).
predictions = model.fit_predict(cluster_features)

# Add the predicted class column
df_teamstats_pca["class"] = predictions
df_teamstats_pca.head()

Unnamed: 0,pc01,pc02,pc03,pc04,pc05,class
0,0.542188,1.274213,1.962342,0.97841,-0.393242,1
1,-1.335047,-0.05686,0.128731,-2.131515,-0.963441,2
2,2.502751,3.597007,0.862637,2.863725,0.299488,1
3,-0.204543,1.468486,-0.261556,-0.741929,-0.534874,1
4,2.529707,1.032873,-0.889897,0.402533,-0.507443,1


In [148]:
# Plotting the clusters
# Scatter of the first two principal components, one colour per K-Means cluster.
# A title is set (as on the elbow plot) so the figure stands on its own.
df_teamstats_pca.hvplot.scatter(
    x="pc01",
    y="pc02",
    hover_cols=["class"],
    by="class",
    title="K-Means Clusters (pc01 vs pc02)",
)