# Clustering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
!pip install --upgrade xlrd
!pip install --upgrade openpyxl

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Download the spreadsheet behind the short link (-L follows redirects) and save it as power_data.xls.
!curl -L "https://goo.gl/Cx8Rzw" -o power_data.xls

In [None]:
# Load the downloaded workbook into a DataFrame and preview its first rows.
df = pd.read_excel('power_data.xls')
df.head()

In [None]:
# Show the column labels and the row index of the raw frame.
print(df.columns)
print(df.index)

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Last rows of the frame.
df.tail()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Use the '구분' column as the row index, then remove the '합계' and '개성' rows
# when they exist (errors='ignore' skips labels that are not present).
df = df.set_index('구분').drop(['합계', '개성'], errors='ignore')
df.shape

In [None]:
# Preview the frame after re-indexing and dropping rows.
df.head()

In [None]:
# Colab 에서 한글 폰트 설정 - 설중 후에 꼭 다시 runtime restart 해 주어야 함
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()


In [None]:
import platform
import matplotlib
from matplotlib import font_manager, rc

# Keep the '-' (minus) sign renderable on the axes.
rc('axes', unicode_minus=False)

In [None]:
import platform
import matplotlib
from matplotlib import font_manager, rc

# Keep the '-' (minus) sign renderable on the axes.
matplotlib.rcParams['axes.unicode_minus'] = False

# Pick a font family that can display Korean text, depending on the operating system.
if platform.system() == 'Windows':
    # Raw string: the backslashes are path separators, not escape sequences
    # (a plain literal here triggers invalid-escape warnings for \W, \F and \m).
    path = r"c:\Windows\Fonts\malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system() == 'Darwin':
    rc('font', family='AppleGothic')
elif platform.system() == 'Linux':
    rc('font', family='NanumBarunGothic')

In [None]:
# First two rows, to check the column layout before dropping a column.
df.head(2)

In [None]:
# Remove the '합계' column. errors='ignore' keeps the cell safe to re-run once the column
# is already gone, matching the other drop calls in this notebook.
df = df.drop(columns='합계', errors='ignore')

In [None]:
# Horizontal stacked bar chart with one bar per row and one segment per column.
df.plot.barh(figsize=(10, 6), stacked=True)

In [None]:
# Restrict the stacked bar chart to the two columns of interest.
see_c = ['서비스업', '제조업']
df[see_c].plot.barh(figsize=(10, 6), stacked=True)

In [None]:
# Work on an explicit copy of the two selected columns so that later in-place changes to
# df2 (adding/dropping a column, overwriting values) act on an independent frame.
df2 = df[see_c].copy()
df2.head(5)

## Scatter plot

In [None]:
# Scatter plot of the two selected columns, with every point annotated by its row label.
plt.figure(figsize=(6,6))
plt.scatter(df2['서비스업'], df2['제조업'], c='k', marker='o')
plt.xlabel('서비스업')
plt.ylabel('제조업')

# Walk over (row label, x, y) directly. Looking values up with `series[n]` on a
# string-labelled Series relies on positional fallback, which recent pandas deprecates.
for name, x_val, y_val in zip(df2.index, df2['서비스업'], df2['제조업']):
    # Shift the text slightly away from the marker.
    plt.text(x_val*1.03, y_val*0.98, name)
    

In [None]:
# Remove the '경기' and '서울' rows. errors='ignore' makes the cell safe to re-run after the
# rows are already gone, as the later drop of the same rows from df does.
df2 = df2.drop(index=['경기','서울'], errors='ignore')
df2.shape

In [None]:
# Same labelled scatter plot, now on the frame without the two dropped rows.
plt.figure(figsize=(6,6))
plt.scatter(df2['서비스업'], df2['제조업'], c='k', marker='o')
plt.xlabel('서비스업')
plt.ylabel('제조업')

# Walk over (row label, x, y) directly. Looking values up with `series[n]` on a
# string-labelled Series relies on positional fallback, which recent pandas deprecates.
for name, x_val, y_val in zip(df2.index, df2['서비스업'], df2['제조업']):
    # Shift the text slightly away from the marker.
    plt.text(x_val*1.03, y_val*0.98, name)

## 병합 군집과 덴드로그램

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative clustering of the rows of df2 (centroid linkage, Euclidean metric).
# No figure is opened here: nothing is drawn in this cell, so a figure created at this
# point would only be rendered as an empty plot.
link_dist = linkage(df2,metric='euclidean',method='centroid')
link_dist

In [None]:
# Open the figure in the cell that draws into it, so the requested size actually
# applies to the dendrogram.
plt.figure(figsize=(10,5))
dendrogram(link_dist,labels=list(df2.index))
plt.show()

## KMeans

In [None]:
from sklearn.cluster import KMeans

# Fix random_state so the cluster assignment is reproducible across runs, and set n_init
# explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df2)
print(km.n_clusters)

In [None]:
# Cluster id assigned to each row, and the coordinates of the fitted cluster centres.
km.labels_,km.cluster_centers_

In [None]:
# Keep the labels of the unscaled two-feature fit for the comparison printed near the end.
km_labels_two = km.labels_

In [None]:
# Attach the cluster id as a '클러스터' column and display the whole frame.
df2['클러스터'] = km.labels_
df2

In [None]:
# Remove the '클러스터' column again (in place) so df2 holds only the two feature columns.
df2.drop(columns='클러스터', inplace=True)
df2.head()

In [None]:
# Keep the fitted cluster centres for plotting and display them.
centers = km.cluster_centers_
centers

In [None]:
# Marker and colour per cluster id; only the first km.n_clusters entries are used.
my_markers = ['*','^','o','^','.',',','1','2']
my_color = ['r','c','g','b','g','k','r','y']

plt.figure(figsize=(10,8))
plt.xlabel('서비스업')
plt.ylabel('제조업')
# Walk over (row label, x, y, cluster id) directly. Looking values up with `series[n]` on a
# string-labelled Series relies on positional fallback, which recent pandas deprecates.
for name, x_val, y_val, label in zip(df2.index, df2['서비스업'], df2['제조업'], km.labels_):
    plt.scatter(x_val, y_val, c=my_color[label], marker=my_markers[label], s=100)
    # Shift the text slightly away from the marker.
    plt.text(x_val*1.03, y_val*0.98, name)

# Overlay the cluster centres.
for center in centers:
    plt.scatter(center[0], center[1], c='b', s=50)
    

## Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the two feature columns and write the result back into df2.
scaler = StandardScaler()
feature_cols = ['서비스업','제조업']
df2[feature_cols] = scaler.fit_transform(df2[feature_cols])
df2.head(3)

## Clustering after scaling

In [None]:
# Hierarchical clustering (centroid linkage, Euclidean metric) on the standardized columns.
Z = linkage(df2,metric='euclidean',method='centroid')
plt.figure(figsize=(10,5))
plt.title('Dendrogram')
# Pass the labels as a plain list, as the earlier dendrogram call does; some SciPy
# versions evaluate the truth value of `labels`, which is ambiguous for a pandas Index.
dendrogram(Z,labels=list(df2.index))
plt.show()

In [None]:
# Refit KMeans on the standardized columns. random_state makes the assignment reproducible;
# n_init is set explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df2)

In [None]:
# Centres of the refitted model, and its labels kept for the comparison printed near the end.
centers = km.cluster_centers_
km_labels_two_scaled = km.labels_

In [None]:
# Cluster plot on the standardized columns.
# No plt.clf() first: with no current figure it would create (and render) an extra empty one.
plt.figure(figsize=(8,6))
plt.xlabel('서비스업')
plt.ylabel('제조업')

# Walk over (row label, x, y, cluster id) directly. Looking values up with `series[n]` on a
# string-labelled Series relies on positional fallback, which recent pandas deprecates.
for name, x_val, y_val, label in zip(df2.index, df2['서비스업'], df2['제조업'], km.labels_):
    plt.scatter(x_val, y_val, c=my_color[label], marker=my_markers[label], s=100)
    # Standardized values can be zero or negative, so a multiplicative offset would vanish
    # or flip direction; offset the label by a fixed number of points instead.
    plt.annotate(name, (x_val, y_val), xytext=(6, -3), textcoords='offset points')
# Overlay the cluster centres.
for center in centers:
    plt.scatter(center[0], center[1], c='k', s=50)

## 모든 feature 다 사용해버리기

In [None]:
# Back to the full frame with all feature columns.
df.head()

In [None]:
# Drop the listed columns, then the '경기' and '서울' rows, modifying df in place.
# Labels that are not present are skipped (errors='ignore').
df.drop(columns=['업무용합계','산업용합계','합계'], errors='ignore', inplace=True)
df.drop(index=['경기','서울'], errors='ignore', inplace=True)

In [None]:
# Preview the frame after the drops.
df.head()

In [None]:
# Keep the row labels and column names; index_ is used later as the dendrogram labels.
index_ = df.index
column_ = df.columns

In [None]:
# Row labels of the remaining rows.
index_

In [None]:
# Names of the remaining feature columns.
column_

In [None]:
# Type of the object returned by df.columns.
type(column_)

In [None]:
# The same column names as a plain Python list.
list(column_)

In [None]:
# Positions of the '제조업' and '서비스업' columns within the column list.
list(column_).index('제조업'), list(column_).index('서비스업')

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on every remaining column, then transform the same frame.
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)

In [None]:
# Compare the type of the input frame with the type of the scaler output.
print(type(df), type(df_scaled))

In [None]:
# Hierarchical clustering (centroid linkage, Euclidean metric) on all standardized features.
Z = linkage(df_scaled,metric='euclidean',method='centroid')
plt.figure(figsize=(10,5))
plt.title('Dendrogram for all features (scaled)')
# Pass the labels as a plain list, as the first dendrogram call does; some SciPy
# versions evaluate the truth value of `labels`, which is ambiguous for a pandas Index.
dendrogram(Z,labels=list(index_))
plt.show()

In [None]:
# KMeans on all standardized features. random_state makes the assignment reproducible;
# n_init is set explicitly because its default differs between scikit-learn versions.
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(df_scaled)
print(km.cluster_centers_)
print(km.labels_)

In [None]:
# Print the three KMeans label arrays underneath each other for comparison.
# NOTE(review): cluster ids are arbitrary per fit, so the same number in two rows
# does not necessarily refer to the same group of rows.
print("all features, scaled: ", km.labels_)
print("two features:         ", km_labels_two)
print("two features, scaled  ", km_labels_two_scaled)

## DBSCAN

In [None]:
from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
# Generate a 300-sample two-moons toy dataset with noise=0.1; random_state=11 fixes the draw.
X,y = make_moons(n_samples=300,noise=0.1,random_state=11)
# X: samples y: label

# Peek at the first five samples and their labels.
X[:5],y[:5]

In [None]:
# Plot the raw samples: first column on the x axis, second column on the y axis.
plt.scatter(X[:,0],X[:,1],c='b')
plt.show()

In [None]:
# KMeans with two clusters on the moons data. random_state makes the assignment reproducible;
# n_init is set explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
predict = kmeans.fit_predict(X)

predict

In [None]:
# Colour every sample by the cluster id currently stored in `predict`.
plt.scatter(X[:,0],X[:,1],c=predict)

In [None]:
# DBSCAN with eps=0.2 and min_samples=5; samples are coloured by the resulting labels.
dbscan = DBSCAN(eps=0.2, min_samples=5, metric='euclidean').fit(X)
predict = dbscan.labels_
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
# Label assigned to each sample by the most recent fit.
predict

In [None]:
# DBSCAN with a smaller neighbourhood radius (eps=0.1, min_samples=5).
dbscan = DBSCAN(eps=0.1, min_samples=5, metric='euclidean').fit(X)
predict = dbscan.labels_
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
# Distinct labels produced by the last fit (DBSCAN marks noise samples with -1).
set(predict)

In [None]:
# Same parameters as the previous DBSCAN run (eps=0.1, min_samples=5), this time also
# printing the label array.
dbscan = DBSCAN(eps=0.1, min_samples=5, metric='euclidean').fit(X)
predict = dbscan.labels_
print(predict)
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
# DBSCAN with a larger neighbourhood radius (eps=0.5, min_samples=5); labels are printed and plotted.
dbscan = DBSCAN(eps=0.5, min_samples=5, metric='euclidean').fit(X)
predict = dbscan.labels_
print(predict)
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
# DBSCAN with eps=0.5 and a higher density threshold (min_samples=10).
dbscan = DBSCAN(eps=0.5, min_samples=10, metric='euclidean').fit(X)
predict = dbscan.labels_
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
# DBSCAN with eps=0.2 and min_samples=6.
dbscan = DBSCAN(eps=0.2, min_samples=6, metric='euclidean').fit(X)
predict = dbscan.labels_
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
# DBSCAN with eps=0.2 and a much higher density threshold (min_samples=15).
dbscan = DBSCAN(eps=0.2, min_samples=15, metric='euclidean').fit(X)
predict = dbscan.labels_
plt.scatter(X[:, 0], X[:, 1], c=predict)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances,cosine_distances,manhattan_distances

# NOTE(review): this rebinds X, which previously held the make_moons samples; re-running the
# DBSCAN cells after this one would operate on these two points instead.
X = [[0,1],[1,1]]
# Pairwise Euclidean distances between the two points (2x2 matrix).
print(euclidean_distances(X,X))