In [31]:
### Setup Environment ###
import numpy as np
import pandas as pd
import plotly_express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Clustering
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from factor_analyzer import FactorAnalyzer
np.set_printoptions(suppress=True, formatter={'float_kind':'{:0.4f}'.format})
pd.set_option('display.float_format', lambda x: '%.4f' % x)
pd.set_option('display.max_rows', None)

teams = pd.read_csv('pipe_historical_teams.csv')
ratings = pd.read_csv('ware_historical_team_ratings.csv')
df = teams.merge(ratings, left_on='team_year', right_on='team_year')
four_factors = pd.read_csv('ware_historical_four_factors.csv')
df = df.merge(four_factors, left_on='team_year', right_on='team_year')

# I. K-Means Model

In [32]:
cluster_df = df[['ovr_rating_bpi',
                 'ovr_rating_kenpom',
                 'ovr_rating_torvik',
                 'wab',
                 'off_rating_bpi',
                 'off_rating_kenpom',
                 'off_rating_torvik',
                 'def_rating_bpi',
                 'def_rating_kenpom',
                 'def_rating_torvik',
                 'reb%_off', 
                 'efg%_def', 
                 'ftr_def', 
                 'reb%_def', 
           ]]

In [33]:
# Create PCA Data Frame
sc = StandardScaler()
shape = cluster_df.shape
pca_array = sc.fit_transform(cluster_df)
pca = PCA(n_components=shape[1])
model_array = pca.fit_transform(pca_array)
model_df = pd.DataFrame(model_array)
model_df = model_df.loc[:, 0:4]

In [34]:
# Fit K-Means Clustering Mode
kmeans6 = KMeans(n_clusters=5, init='k-means++', algorithm='lloyd',
                 max_iter=500, random_state=123)
kmeans6.fit_predict(model_df)
df['cluster'] = kmeans6.labels_



# II. Apply to 2024 Teams

In [35]:
teams_2024 = pd.read_csv('ware_current_team_info.csv')
ratings_2024 = pd.read_csv('ware_current_team_ratings.csv')
df_2024 = teams_2024.merge(ratings_2024, left_on='team_year', right_on='team_year')
four_factors_2024 = pd.read_csv('ware_current_four_factors.csv')
df_2024 = df_2024.merge(four_factors_2024, left_on='team_year', right_on='team_year')

cluster_df_2024 = df_2024[['ovr_rating_bpi',
                           'ovr_rating_kenpom',
                           'ovr_rating_torvik',
                           'wab',
                           'off_rating_bpi',
                           'off_rating_kenpom',
                           'off_rating_torvik',
                           'def_rating_bpi',
                           'def_rating_kenpom',
                           'def_rating_torvik',
                           'reb%_off', 
                           'efg%_def', 
                           'ftr_def', 
                           'reb%_def', 
                           ]]

pca_array_2024 = sc.transform(cluster_df_2024)
model_array_2024 = pca.transform(pca_array_2024)
model_df_2024 = pd.DataFrame(model_array_2024)
model_df_2024 = model_df_2024.loc[:, 0:4]
kmeans6_2024 = kmeans6.predict(model_df_2024)
df_2024['cluster'] = kmeans6_2024

In [36]:
df_merge_historical = df[['team_year',
                          'year',
                          'team',
                          'conference',
                          'seed',
                          'finish',
                          'ovr_rating_bpi',
                          'ovr_rating_kenpom',
                          'ovr_rating_torvik',
                          'wab',
                          'off_rating_bpi',
                          'off_rating_kenpom',
                          'off_rating_torvik',
                          'def_rating_bpi',
                          'def_rating_kenpom',
                          'def_rating_torvik',
                          'reb%_off', 
                          'efg%_def', 
                          'ftr_def', 
                          'reb%_def',
                          'cluster'
                        ]]

df_merge_current = df_2024[['team_year',
                          'year',
                          'team',
                          'conference',
                          'seed',
                          'finish',
                          'ovr_rating_bpi',
                          'ovr_rating_kenpom',
                          'ovr_rating_torvik',
                          'wab',
                          'off_rating_bpi',
                          'off_rating_kenpom',
                          'off_rating_torvik',
                          'def_rating_bpi',
                          'def_rating_kenpom',
                          'def_rating_torvik',
                          'reb%_off', 
                          'efg%_def', 
                          'ftr_def', 
                          'reb%_def',
                          'cluster'
                        ]]

df_merge_all = pd.concat([df_merge_historical, df_merge_current], ignore_index=True)
df = df_merge_all

In [37]:
# Label Tiers
df['tier'] = ''

### S Tier ###
df.loc[(df['cluster']==2) & (df['ovr_rating_kenpom']>25) & (df['wab']>5), 'tier'] = 'S'

### A Tier ###
df.loc[(df['cluster']==2) & (df['ovr_rating_kenpom']<=25) & (df['wab']>5), 'tier'] = 'A'

### B Tier ###
df.loc[(df['cluster']==2) & (df['ovr_rating_kenpom']>25) & (df['wab']<=5), 'tier'] = 'B'
df.loc[(df['cluster']==1) & (df['ovr_rating_kenpom']>20), 'tier'] = 'B'
df.loc[(df['cluster']==4) & (df['ovr_rating_kenpom']>17) & (df['wab']>4), 'tier'] = 'B'

### C Tier ###
df.loc[(df['cluster']==2) & (df['ovr_rating_kenpom']<=25) & (df['wab']<=5), 'tier'] = 'C'
df.loc[(df['cluster']==1) & (df['ovr_rating_kenpom']<=20), 'tier'] = 'C'
df.loc[(df['cluster']==4) & (df['ovr_rating_kenpom']>17) & (df['wab']<=4), 'tier'] = 'C'
df.loc[(df['cluster']==4) & (df['ovr_rating_kenpom']<=17) & (df['ovr_rating_kenpom']>10), 'tier'] = 'C'
df.loc[(df['cluster']==3) & (df['ovr_rating_kenpom']>12) & (df['ovr_rating_torvik']>.78), 'tier'] = 'C'

### D Tier ###
df.loc[(df['cluster']==4) & (df['ovr_rating_kenpom']<=10), 'tier'] = 'D'
df.loc[(df['cluster']==3) & (df['ovr_rating_kenpom']<=12) | (df['ovr_rating_torvik']<=.78), 'tier'] = 'D'

### F Tier ###
df.loc[(df['cluster']==0), 'tier'] = 'F'

In [38]:
pd.crosstab(index=df['finish'], columns=df['tier'])

tier,A,B,C,D,F,S
finish,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Champion,1,1,0,0,0,13
Elite Eight,11,10,12,2,0,22
Final Four,7,4,9,1,0,9
First Round,4,29,234,86,175,5
In Progress,0,11,34,9,13,1
Runner Up,4,1,3,0,0,7
Second Round,14,30,144,19,14,21
Sweet 16,17,23,52,1,2,25


# III. Create Composite Scores

In [39]:
scale_df = df[['ovr_rating_bpi',
               'ovr_rating_kenpom',
               'ovr_rating_torvik',
               'off_rating_bpi',
               'off_rating_kenpom',
               'off_rating_torvik',
               'def_rating_bpi',
               'def_rating_kenpom',
               'def_rating_torvik'
              ]]

scale_df['def_rating_kenpom_scaled'] = (1/(df[['def_rating_kenpom']]/100))*100
scale_df['def_rating_torvik_scaled'] = (1/(df[['def_rating_torvik']]/100))*100
mm = MinMaxScaler(feature_range=(0,10))
scale_array = mm.fit_transform(scale_df)
scale_df = pd.DataFrame(scale_array, columns=['ovr_rating_bpi',
                                              'ovr_rating_kenpom',
                                              'ovr_rating_torvik',
                                              'off_rating_bpi',
                                              'off_rating_kenpom',
                                              'off_rating_torvik',
                                              'def_rating_bpi',
                                              'def_rating_kenpom',
                                              'def_rating_torvik',
                                              'def_rating_kenpom_scaled',
                                              'def_rating_torvik_scaled'])
df['overall'] = round(.5*scale_df['ovr_rating_bpi'] + .3*scale_df['ovr_rating_kenpom'] + .2*scale_df['ovr_rating_torvik'], 2)
df['offense'] = round(.5*scale_df['off_rating_bpi'] + .3*scale_df['off_rating_kenpom'] + .2*scale_df['off_rating_torvik'], 2)
df['defense'] = round(.5*scale_df['def_rating_bpi'] + .3*scale_df['def_rating_kenpom_scaled'] + .2*scale_df['def_rating_torvik_scaled'], 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scale_df['def_rating_kenpom_scaled'] = (1/(df[['def_rating_kenpom']]/100))*100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scale_df['def_rating_torvik_scaled'] = (1/(df[['def_rating_torvik']]/100))*100


# IV. Export Data Mart Data Frame

In [40]:
mart_page1_teams = df[['team_year',
                       'year',
                       'team',
                       'conference',
                       'seed',
                       'overall',
                       'offense',
                       'defense',
                       'tier',
                       'finish']]
mart_page1_teams.to_csv('mart_page1_teams.csv')