In [5]:
# 라이브러리 임포트
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import adjusted_rand_score, silhouette_score, calinski_harabaz_score

In [7]:
# Iris 데이터 데이터 파일(Iris.csv) 불러오기
x_df = pd.read_csv('dataset/Iris.csv')
x_df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
# 데이터를 numpy array 형태로 추출
x = x_df.iloc[:, 1:5].values

In [15]:
# 실제 label 도출
unique_species = np.unique(x_df['Species'])

labels_true = np.zeros(x.shape[0])
for i, species in enumerate(unique_species):
    labels_true[x_df['Species'] == species] = i

print(labels_true)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2.]


In [13]:
# 군집 개수 후보 리스트
num_clusters_set = np.arange(2, 21)

In [17]:
for num_clusters in num_clusters_set:
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++')
    kmeans.fit(x)
    
    labels_pred = kmeans.labels_
    
    print('num_clusters: ', num_clusters, 
          '| adjusted RI: ', adjusted_rand_score(labels_true, labels_pred))

num_clusters:  2 | adjusted RI:  0.5399218294207123
num_clusters:  3 | adjusted RI:  0.7302382722834697
num_clusters:  4 | adjusted RI:  0.6460787233460947
num_clusters:  5 | adjusted RI:  0.6078964652364223
num_clusters:  6 | adjusted RI:  0.45370649337406077
num_clusters:  7 | adjusted RI:  0.41632746212476474
num_clusters:  8 | adjusted RI:  0.4650880794385831
num_clusters:  9 | adjusted RI:  0.3905498994929889
num_clusters:  10 | adjusted RI:  0.32177845382904574
num_clusters:  11 | adjusted RI:  0.3632039367569716
num_clusters:  12 | adjusted RI:  0.3157509559617249
num_clusters:  13 | adjusted RI:  0.29445136434006886
num_clusters:  14 | adjusted RI:  0.29772898937509445
num_clusters:  15 | adjusted RI:  0.2671974058767182
num_clusters:  16 | adjusted RI:  0.23709500907425524
num_clusters:  17 | adjusted RI:  0.26206167736191743
num_clusters:  18 | adjusted RI:  0.24229566274980058
num_clusters:  19 | adjusted RI:  0.23096213940556448
num_clusters:  20 | adjusted RI:  0.204453504

In [18]:
for num_clusters in num_clusters_set:
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++')
    kmeans.fit(x)
    
    labels_pred = kmeans.labels_
    
    print('num_clusters: ', num_clusters, 
          '| silhouette: ', silhouette_score(x, labels_pred, 'euclidean'))

num_clusters:  2 | silhouette:  0.6808136202936816
num_clusters:  3 | silhouette:  0.5525919445499757
num_clusters:  4 | silhouette:  0.4978256901095472
num_clusters:  5 | silhouette:  0.4885175508886279
num_clusters:  6 | silhouette:  0.36650401502359653
num_clusters:  7 | silhouette:  0.3508759606304935
num_clusters:  8 | silhouette:  0.3651645360269737
num_clusters:  9 | silhouette:  0.34038584534991545
num_clusters:  10 | silhouette:  0.32054197294326947
num_clusters:  11 | silhouette:  0.327658887271523
num_clusters:  12 | silhouette:  0.27853979949299645
num_clusters:  13 | silhouette:  0.30980238241696384
num_clusters:  14 | silhouette:  0.2797588604311969
num_clusters:  15 | silhouette:  0.294082124876406
num_clusters:  16 | silhouette:  0.27980014987429946
num_clusters:  17 | silhouette:  0.27623162175663324
num_clusters:  18 | silhouette:  0.26373149947693497
num_clusters:  19 | silhouette:  0.28058951419674877
num_clusters:  20 | silhouette:  0.27683503356154987


In [20]:
for num_clusters in num_clusters_set:
    kmeans = KMeans(n_clusters=num_clusters, init='k-means++')
    kmeans.fit(x)
    
    labels_pred = kmeans.labels_
    
    print('num_clusters: ', num_clusters, 
          '| CH-index: ', calinski_harabaz_score(x, labels_pred))

num_clusters:  2 | CH-index:  513.3038433517568
num_clusters:  3 | CH-index:  560.3999242466401
num_clusters:  4 | CH-index:  529.3982941434156
num_clusters:  5 | CH-index:  494.0943819140986
num_clusters:  6 | CH-index:  474.6603308371146
num_clusters:  7 | CH-index:  447.5611102512434
num_clusters:  8 | CH-index:  433.035942442285
num_clusters:  9 | CH-index:  409.1685946509984
num_clusters:  10 | CH-index:  387.7898121018463
num_clusters:  11 | CH-index:  370.99819355075203
num_clusters:  12 | CH-index:  354.1500533791419
num_clusters:  13 | CH-index:  356.07578902793995
num_clusters:  14 | CH-index:  338.01199458369643
num_clusters:  15 | CH-index:  332.0913431668949
num_clusters:  16 | CH-index:  321.04471924236867
num_clusters:  17 | CH-index:  320.3389150494265
num_clusters:  18 | CH-index:  318.4217677018849
num_clusters:  19 | CH-index:  319.9059435608507
num_clusters:  20 | CH-index:  319.419021307929


In [None]:
""" 실습
    syn_unbalanced.xlsx 파일을 불러와 
    군집수 3~15까지에 대해 적합한 군집수 후보를 5개 찾고, 
    군집 분석 방법별로 5개의 분석 결과 plot을 그리기
"""