## [통계적머신러닝] 과제11
통계학과 2019150419 기다연<br>
2021.11.09

### 문제1.
Sklearn에서 제공하는 make_circles data에 적절한 noise를 첨가한 후, K-means, DBSCAN, HDBSCAN을 적용하여 비교하라.

In [127]:
from sklearn import datasets
import plotly.express as px

# Noisy two-ring toy data: 500 points, inner/outer scale factor 0.5,
# noise level 0.062, fixed seed so every run sees the same sample.
X, y = datasets.make_circles(n_samples=500, factor=0.5, noise=0.062, random_state=42)

# Scatter of the raw points (true ring membership `y` is not shown).
fig = px.scatter(x=X[:, 0], y=X[:, 1])
fig.update_layout(width=500, height=450).show()

### K-means clustering

In [128]:
# Elbow method으로 최적의 K값 찾기
from sklearn.cluster import KMeans
import numpy as np
import plotly.graph_objects as go

# Elbow method: fit K-means for K = 1..10 and record the within-cluster
# sum of squared distances (`inertia_`) of each fit.
k_values = range(1, 11)
distortions = []
for i in k_values:
    # n_init=10 reruns K-means from 10 different centroid initialisations and
    # keeps the best run; it is unrelated to the number of clusters.
    km = KMeans(n_clusters=i, n_init=10, max_iter=300, random_state=42)
    distortions.append(km.fit(X).inertia_)

# SSE against K; the bend ("elbow") of the curve suggests a reasonable K.
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.array(k_values), y=distortions, mode='lines+markers'))
fig.update_layout(width=500, height=400)
fig.update_xaxes(title_text='number of cluster')
fig.update_yaxes(title_text='SSE(k)')
fig.show()

In [129]:
# 최적의 K값을 5로 설정
from sklearn.cluster import KMeans

# Final K-means model with K=5.
# init='k-means++' selects K-means++ seeding (init='random' would be plain
# random seeding); n_init=10 keeps the best of 10 initialisations.
kmeans_params = dict(n_clusters=5,
                     init='k-means++',
                     n_init=10,
                     max_iter=300,
                     tol=1e-04,
                     random_state=42)
km = KMeans(**kmeans_params)
y_km = km.fit_predict(X)

In [130]:
# Plotting results
# One scatter trace per K-means cluster. The number of traces follows the
# fitted model (`km.n_clusters`) instead of five copy-pasted calls, so the
# plot stays correct if K is changed in the cell that builds `km`.
fig = go.Figure()
for k in range(km.n_clusters):
    in_cluster = (y_km == k)
    fig.add_trace(go.Scatter(x=X[in_cluster, 0],
                             y=X[in_cluster, 1],
                             mode='markers',
                             name='cluster%d' % (k + 1)))

# Overlay the fitted cluster centres as larger markers.
fig.add_trace(go.Scatter(x=km.cluster_centers_[:, 0],
                         y=km.cluster_centers_[:, 1],
                         mode='markers',
                         marker=dict(size=12, colorscale='Viridis'), name='centroid'))
fig.show()

In [82]:
# Accuracy score
import numpy as np
from scipy.stats import mode

# Give every K-means cluster the most frequent true class among its members,
# so the clustering can be scored against `y` like a classifier.
# `labels_` holds the assignment from the last fit, so no re-fit is needed.
clusters = km.labels_
labels = np.zeros_like(clusters)
print(labels.shape)
# Visit only the cluster ids that actually occur: a fixed range(10) would hand
# empty selections to scipy's `mode` and would ignore ids >= 10.
for i in np.unique(clusters):
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]  # majority true class of this cluster

(500,)


In [83]:
from sklearn.metrics import accuracy_score
# Fraction of points whose assigned label in `labels` equals the true class `y`.
km_accuracy = accuracy_score(y, labels)
print('Accruacy of K-means clustering:', km_accuracy)

Accruacy of K-means clustering: 0.51


### DBSCAN

In [131]:
from sklearn.cluster import DBSCAN

# Baseline DBSCAN: neighbourhood radius eps=0.2, at least 10 samples per
# neighbourhood for a core point, Euclidean distance.
db = DBSCAN(eps=0.2,
            min_samples=10,
            metric='euclidean')
y_db = db.fit_predict(X)

# Plot every cluster that was found rather than only labels 0 and 1, and show
# noise points (label -1) as their own trace so no data point silently
# disappears from the figure. Noise is added last so the colours of the
# cluster traces are not shifted.
fig = go.Figure()
for k in np.unique(y_db):
    if k == -1:
        continue
    fig.add_trace(go.Scatter(x=X[y_db == k, 0], y=X[y_db == k, 1],
                             mode='markers', name='cluster%d' % (k + 1)))
if (y_db == -1).any():
    fig.add_trace(go.Scatter(x=X[y_db == -1, 0], y=X[y_db == -1, 1],
                             mode='markers', name='noise'))
fig.update_layout(width=500, height=450)
fig.show()

In [141]:
# eps 값 변화
from plotly.subplots import make_subplots

# Effect of eps (neighbourhood radius) on DBSCAN with min_samples fixed at 10.
eps_list = [.1, .2, .3, .4, .5]
for eps in eps_list:
    # Loop-local names: the baseline `db` / `y_db` are read again by later
    # cells (DBSCAN accuracy, method comparison) and must not be overwritten
    # by this sweep.
    db_eps = DBSCAN(eps=eps,
                    min_samples=10,
                    metric='euclidean')
    y_db_eps = db_eps.fit_predict(X)
    # Real clusters only; -1 is DBSCAN's noise label.
    cluster_ids = [k for k in np.unique(y_db_eps) if k != -1]

    # Draw every cluster found (not just labels 0 and 1) plus the noise points.
    fig = go.Figure()
    for k in cluster_ids:
        fig.add_trace(go.Scatter(x=X[y_db_eps == k, 0], y=X[y_db_eps == k, 1],
                                 mode='markers', name='cluster%d' % (k + 1)))
    if (y_db_eps == -1).any():
        fig.add_trace(go.Scatter(x=X[y_db_eps == -1, 0], y=X[y_db_eps == -1, 1],
                                 mode='markers', name='noise'))
    fig.update_layout(width=400, height=380)
    fig.show()

    # Majority-vote scoring; reuse the assignment above instead of fitting the
    # same model a second time.
    clusters = y_db_eps
    labels = np.zeros_like(clusters)
    # NOTE: noise points are not a cluster; they keep the initial label 0 and
    # are therefore scored as class 0.
    for i in cluster_ids:
        mask = (clusters == i)
        labels[mask] = mode(y[mask])[0]
    print('Accruacy of DBSCAN (eps=%.1f):' % eps, accuracy_score(y, labels))

Accruacy of DBSCAN (eps=0.1): 0.97


Accruacy of DBSCAN (eps=0.2): 1.0


Accruacy of DBSCAN (eps=0.3): 0.5


Accruacy of DBSCAN (eps=0.4): 0.5


Accruacy of DBSCAN (eps=0.5): 0.5


In [124]:
# min_samples 값 변화
from plotly.subplots import make_subplots

# Effect of min_samples on DBSCAN with eps fixed at 0.2.
sample_list = [5, 10, 15, 20, 25]
for min_pts in sample_list:
    # Loop-local names: the baseline `db` / `y_db` are read again by later
    # cells (DBSCAN accuracy, method comparison) and must not be overwritten
    # by this sweep.
    db_ms = DBSCAN(eps=0.2,
                   min_samples=min_pts,
                   metric='euclidean')
    y_db_ms = db_ms.fit_predict(X)
    # Real clusters only; -1 is DBSCAN's noise label.
    cluster_ids = [k for k in np.unique(y_db_ms) if k != -1]

    # Draw every cluster found (not just labels 0 and 1) plus the noise points.
    fig = go.Figure()
    for k in cluster_ids:
        fig.add_trace(go.Scatter(x=X[y_db_ms == k, 0], y=X[y_db_ms == k, 1],
                                 mode='markers', name='cluster%d' % (k + 1)))
    if (y_db_ms == -1).any():
        fig.add_trace(go.Scatter(x=X[y_db_ms == -1, 0], y=X[y_db_ms == -1, 1],
                                 mode='markers', name='noise'))
    fig.update_layout(width=500, height=450)
    fig.show()

    # Majority-vote scoring; reuse the assignment above instead of fitting the
    # same model a second time.
    clusters = y_db_ms
    labels = np.zeros_like(clusters)
    # NOTE: noise points are not a cluster; they keep the initial label 0 and
    # are therefore scored as class 0.
    for i in cluster_ids:
        mask = (clusters == i)
        labels[mask] = mode(y[mask])[0]
    print('Accruacy of DBSCAN (min_samples=%d):' % min_pts, accuracy_score(y, labels))

Accruacy of DBSCAN (min_samples=5): 1.0


Accruacy of DBSCAN (min_samples=10): 1.0


Accruacy of DBSCAN (min_samples=15): 1.0


Accruacy of DBSCAN (min_samples=20): 1.0


Accruacy of DBSCAN (min_samples=25): 1.0


In [75]:
# Accuracy score
import numpy as np
from scipy.stats import mode

# Give every DBSCAN cluster the most frequent true class among its members.
# Uses whichever fitted estimator `db` currently refers to; `labels_` holds
# its assignment, so no re-fit is needed.
clusters = db.labels_
labels = np.zeros_like(clusters)
print(labels.shape)
# Visit only the cluster ids that actually occur (a fixed range(10) would hand
# empty selections to scipy's `mode` and ignore ids >= 10).
# NOTE: noise points (label -1) are skipped; they keep the initial label 0 and
# are therefore scored as class 0.
for i in np.unique(clusters):
    if i == -1:
        continue
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]  # majority true class of this cluster

(500,)


In [76]:
from sklearn.metrics import accuracy_score
# Fraction of points whose assigned label in `labels` equals the true class `y`.
db_accuracy = accuracy_score(y, labels)
print('Accruacy of DBSCAN:', db_accuracy)

Accruacy of DBSCAN: 1.0


### HDBSCAN

In [28]:
pip install hdbscan

Note: you may need to restart the kernel to use updated packages.


In [132]:
import hdbscan
# Baseline HDBSCAN with min_samples=10 (all other parameters at their defaults).
hdb = hdbscan.HDBSCAN(min_samples=10)
y_hdb = hdb.fit_predict(X)

# Plot every cluster that was found rather than only labels 0 and 1, and show
# noise points (label -1) as their own trace so no data point silently
# disappears from the figure. Noise is added last so the colours of the
# cluster traces are not shifted.
fig = go.Figure()
for k in np.unique(y_hdb):
    if k == -1:
        continue
    fig.add_trace(go.Scatter(x=X[y_hdb == k, 0], y=X[y_hdb == k, 1],
                             mode='markers', name='cluster%d' % (k + 1)))
if (y_hdb == -1).any():
    fig.add_trace(go.Scatter(x=X[y_hdb == -1, 0], y=X[y_hdb == -1, 1],
                             mode='markers', name='noise'))
fig.update_layout(width=500, height=450)
fig.show()

In [126]:
# min_samples 값 변화
from plotly.subplots import make_subplots

# Effect of min_samples on HDBSCAN.
sample_list = [5, 10, 15, 20, 25]
for min_pts in sample_list:
    # Loop-local names: storing HDBSCAN results in `db` / `y_db` would replace
    # the DBSCAN assignment that the method-comparison figure reads from
    # `y_db`, so its "DBSCAN" panel would show HDBSCAN output instead.
    hdb_ms = hdbscan.HDBSCAN(min_samples=min_pts)
    y_hdb_ms = hdb_ms.fit_predict(X)
    # Real clusters only; -1 is the noise label.
    cluster_ids = [k for k in np.unique(y_hdb_ms) if k != -1]

    # Draw every cluster found (not just labels 0 and 1) plus the noise points.
    fig = go.Figure()
    for k in cluster_ids:
        fig.add_trace(go.Scatter(x=X[y_hdb_ms == k, 0], y=X[y_hdb_ms == k, 1],
                                 mode='markers', name='cluster%d' % (k + 1)))
    if (y_hdb_ms == -1).any():
        fig.add_trace(go.Scatter(x=X[y_hdb_ms == -1, 0], y=X[y_hdb_ms == -1, 1],
                                 mode='markers', name='noise'))
    fig.update_layout(width=500, height=450)
    fig.show()

    # Majority-vote scoring; reuse the assignment above instead of fitting the
    # same model a second time.
    clusters = y_hdb_ms
    labels = np.zeros_like(clusters)
    # NOTE: noise points are not a cluster; they keep the initial label 0 and
    # are therefore scored as class 0.
    for i in cluster_ids:
        mask = (clusters == i)
        labels[mask] = mode(y[mask])[0]
    print('Accruacy of HDBSCAN (min_samples=%d):' % min_pts, accuracy_score(y, labels))

Accruacy of HDBSCAN (min_samples=5): 1.0


Accruacy of HDBSCAN (min_samples=10): 0.998


Accruacy of HDBSCAN (min_samples=15): 0.998


Accruacy of HDBSCAN (min_samples=20): 1.0


Accruacy of HDBSCAN (min_samples=25): 0.896


In [95]:
# Accuracy score
import numpy as np
from scipy.stats import mode

# Give every HDBSCAN cluster the most frequent true class among its members.
# `labels_` holds the assignment from the last fit of `hdb`, so no re-fit is
# needed.
clusters = hdb.labels_
labels = np.zeros_like(clusters)
print(labels.shape)
# Visit only the cluster ids that actually occur (a fixed range(10) would hand
# empty selections to scipy's `mode` and ignore ids >= 10).
# NOTE: noise points (label -1) are skipped; they keep the initial label 0 and
# are therefore scored as class 0.
for i in np.unique(clusters):
    if i == -1:
        continue
    mask = (clusters == i)
    labels[mask] = mode(y[mask])[0]  # majority true class of this cluster

(500,)


In [96]:
from sklearn.metrics import accuracy_score
# Fraction of points whose assigned label in `labels` equals the true class `y`.
hdb_accuracy = accuracy_score(y, labels)
print('Accruacy of HDBSCAN:', hdb_accuracy)

Accruacy of HDBSCAN: 0.998


### Clustering method 비교

In [136]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Side-by-side comparison of the three clusterings on the same data.
# Reads the assignments `y_km`, `y_db` and `y_hdb`; each must still hold the
# result of the corresponding method when this cell runs.
fig = make_subplots(rows=1, cols=3,
                    subplot_titles=('K-means clustering', 'DBSCAN', 'HDBSCAN'))

# (legend prefix, cluster assignment, subplot column)
panels = [('km', y_km, 1), ('dbscan', y_db, 2), ('hdbscan', y_hdb, 3)]
for prefix, assignment, col in panels:
    # One trace per cluster actually present, named <prefix><1-based id>.
    for k in np.unique(assignment):
        if k == -1:
            continue
        fig.add_trace(go.Scatter(x=X[assignment == k, 0], y=X[assignment == k, 1],
                                 mode='markers', name='%s%d' % (prefix, k + 1)),
                      row=1, col=col)
    # Density-based methods mark noise with -1; draw it instead of dropping it.
    if (assignment == -1).any():
        fig.add_trace(go.Scatter(x=X[assignment == -1, 0], y=X[assignment == -1, 1],
                                 mode='markers', name='%s noise' % prefix),
                      row=1, col=col)

fig.update_layout(height=400, width=1000, title_text='Clustering method Comparison', title_x=0.5)
fig.show()