In [1]:
#Import packages/libraries
from sklearn.cluster import KMeans
import pandas as pd
from sklearn import preprocessing, metrics
from sklearn.cluster import DBSCAN
from ipywidgets import interact, widgets
from IPython.core.display import HTML
from IPython.display import clear_output

In [3]:
#Dataset Processing
# Load the 2019 player stats, discard every row that has a missing value in
# ANY column of the raw file (dropna runs before the column selection below,
# so gaps in columns outside `factors` also remove a row), then renumber the
# surviving rows 0..n-1.
nba19 = pd.read_csv('nba_2019.csv')
nba19 = nba19.dropna().reset_index(drop=True)

factors = ['Player', 'FG%', 'FGA', '3PA', '3P%', 'FTA', 'FT%', 'PTS', 'AST', 'ORB', 'DRB', 'STL', 'BLK']
# Explicit copy: the column assignments below must write to an independent
# frame, not to a slice of the raw data (avoids SettingWithCopyWarning).
nba19 = nba19[factors].copy()

#Normalization
# Min-max scale the counting stats to [0, 1]. 'Player' is a label, and the
# percentage columns are left untouched (presumably already on a 0-1 scale;
# confirm against the CSV).
skip_scaling = {'Player', 'FG%', '3P%', 'FT%'}
scaled_cols = [x for x in factors if x not in skip_scaling]
col_min = nba19[scaled_cols].min()
# A constant column has zero range; divide by 1 instead so it maps to 0
# rather than NaN (NaN features would make the clustering fits fail).
col_range = (nba19[scaled_cols].max() - col_min).replace(0, 1)
nba19[scaled_cols] = (nba19[scaled_cols] - col_min) / col_range

In [4]:
#K-Means Clustering
# Number of clusters to fit.
k = 10

# random_state pins the centroid initialisation so the labels are the same on
# every run; n_init=10 keeps the historical scikit-learn default regardless of
# the installed version.
k_model = KMeans(n_clusters=k, n_init=10, random_state=0)
# Fit on the stat columns listed in `factors` only. Dropping just 'Player'
# from nba19 would also feed in any cluster-label columns that already exist
# when this cell is re-run.
k_model.fit(nba19[factors].drop(columns='Player'))
labels = k_model.labels_

# Store each player's cluster id next to their stats.
nba19['K-Means'] = labels

In [5]:
#DBSCAN Clustering
# Fit on the stat columns listed in `factors` only. At this point nba19 also
# holds the 'K-Means' label column, so dropping just 'Player' would feed those
# labels into DBSCAN as a feature and distort the neighbourhood distances.
d_model = DBSCAN(eps=0.5, min_samples=5).fit(nba19[factors].drop(columns='Player').values)

# One label per row; scikit-learn assigns -1 to samples it treats as noise.
labels = d_model.labels_
nba19['DBSCAN'] = labels

In [6]:
#Hierarchical Clustering
# Loads cluster assignments from 'hierarchical.csv' instead of computing them
# in this notebook; the link below is presumably where that clustering was
# produced (confirm). A later cell reads the 'Hierarchical' column of `h`.
#http://rpubs.com/jdumalig/nbaclustering
h = pd.read_csv('hierarchical.csv')

In [7]:
#Merged dataframes
# The hierarchical labels are matched to players purely by row position, so
# the two frames must have the same number of rows. Fail loudly instead of
# letting a mismatch produce NaN-padded rows.
if len(h) != len(nba19):
    raise ValueError(
        'hierarchical.csv has {} rows but nba19 has {}; cannot align labels by position'.format(
            len(h), len(nba19)
        )
    )
# Plain column assignment (rather than concat) keeps this cell idempotent:
# re-running it overwrites 'Hierarchical' instead of appending a duplicate column.
nba19['Hierarchical'] = h['Hierarchical'].values

In [8]:
#Display dataframes side-by-side
def multi_table(table_list):
    """Render several tables next to each other in a single HTML row.

    Parameters
    ----------
    table_list : iterable
        Objects exposing ``_repr_html_()``; each one is placed in its own
        ``<td>`` cell, in the order given.

    Returns
    -------
    HTML
        A one-row HTML table wrapping the individual renderings.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')

    markup = '<table><tr style="background-color:white;">' + ''.join(cells) + '</tr></table>'
    return HTML(markup)

#Display clusters by player
def clusters(Player):
    """Show, side by side, the clusters that `Player` belongs to.

    For each clustering method ('K-Means', 'DBSCAN', 'Hierarchical') the
    player's cluster id is looked up in ``nba19`` and every player sharing
    that id is listed, sorted by PTS in descending order.

    Parameters
    ----------
    Player : str
        Value to match exactly against the 'Player' column of ``nba19``.

    Raises
    ------
    IndexError
        If no row of ``nba19`` has this player name.
    """
    methods = ['K-Means', 'DBSCAN', 'Hierarchical']

    matches = nba19.loc[nba19['Player'] == Player, methods]
    if matches.empty:
        raise IndexError('Player {!r} not found in nba19'.format(Player))
    # If a name occurs more than once, the first matching row decides.
    assigned = matches.iloc[0]

    # Look cluster ids up by column label; integer keys on a label-indexed
    # Series are deprecated as positional access in recent pandas.
    tables = [
        nba19[nba19[method] == assigned[method]]
        .sort_values(by='PTS', ascending=False)[['Player', method]]
        for method in methods
    ]

    #Player may not appear in displayed dataframes if not PTS leader
    # (presumably because long frames are truncated when rendered - confirm)
    display(multi_table(tables))

In [9]:
#Number of players in each cluster by type of clustering
# One single-column frame of cluster sizes per method.
# NOTE: `k` and `h` rebind names assigned in earlier cells (the K-Means
# cluster count and the frame loaded from hierarchical.csv).
k, d, h = (
    nba19[method].value_counts().to_frame()
    for method in ('K-Means', 'DBSCAN', 'Hierarchical')
)

In [10]:
# Dropdown of player names, ordered from highest to lowest PTS; choosing a
# name calls clusters() for that player.
interact(
    clusters,
    Player=nba19.sort_values(by='PTS', ascending=False)['Player'].tolist(),
)

# Non-interactive alternative:
#clusters('Stephen Curry')

interactive(children=(Dropdown(description='Player', options=('James Harden', 'Paul George', 'Giannis Antetoko…

<function __main__.clusters(Player)>

In [None]:
#Works best in GOOGLE COLAB

#Display model + cluster
def cluster_display(Cluster, Model=None):
    """Show the top 10 players by PTS in one cluster of one clustering model.

    Parameters
    ----------
    Cluster
        Cluster id to match in the model's label column of ``nba19``.
    Model : str, optional
        Name of the label column ('K-Means', 'DBSCAN' or 'Hierarchical').
        When omitted, the model most recently selected through
        ``cluster_list`` (the module-level ``cluster_type``) is used.
    """
    if Model is None:
        # Legacy call style: rely on the selection recorded by cluster_list.
        Model = cluster_type
    members = nba19[nba19[Model] == Cluster].sort_values(by='PTS', ascending=False)
    display(members[['Player', Model]].head(10))

def cluster_list(Model):
    """Build a cluster dropdown for the chosen clustering model.

    Parameters
    ----------
    Model : str
        Name of the label column in ``nba19`` whose clusters are listed.
    """
    clear_output()

    # Still recorded globally for any code that reads `cluster_type`; the
    # display callback itself receives the model explicitly below.
    global cluster_type
    cluster_type = Model

    # Cluster ids in the order value_counts() returns them. The model is
    # passed as a fixed (widget-less) argument so cluster_display does not
    # depend on global state.
    interact(
        cluster_display,
        Cluster=list(nba19[Model].value_counts().index),
        Model=widgets.fixed(Model),
    )

interact(cluster_list, Model=['K-Means', 'DBSCAN', "Hierarchical"])