# K MEANS

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Six toy 2-D observations written as (x, y) pairs, split into one list per axis.
x, y = (
    list(axis)
    for axis in zip((1, 2), (5, 8), (1.5, 1.8), (8, 8), (1, 0.6), (9, 11))
)

In [None]:
# NumPy version: stack the two coordinate lists as rows, then transpose so
# that every row of X is one (x, y) sample.
X = np.array([x, y]).T
X

In [None]:
# Scatter plot straight from the NumPy array: first column against second column.
plt.scatter(*X.T)

In [None]:
# Same plot, this time going through a pandas DataFrame.
df = pd.DataFrame(dict(x=x, y=y))
plt.scatter(df['x'], df['y'])

In [None]:
# Classes and objects
class Calculator:
    """Minimal two-operation calculator used to illustrate classes and instances."""

    def addition(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def subtract(self, x, y):
        """Return the difference ``x - y``."""
        return x - y

    # Original (misspelled) method name, kept so existing callers keep working.
    substract = subtract

In [None]:
# Instantiate the class, then call a method on the resulting object.
calculator = Calculator()
calculator.addition(x=2, y=3)

In [None]:
from sklearn.cluster import KMeans

# Partition the toy points in df into two clusters.
# n_init and random_state are pinned so the labels and centres come out the
# same on every run and do not depend on the installed scikit-learn version's
# default for n_init.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(df)
label = kmeans.labels_            # cluster index assigned to each row of df
center = kmeans.cluster_centers_  # one row of coordinates per cluster

In [None]:
# Raw points first, then the cluster centres drawn on top as black crosses.
plt.scatter(x, y)
plt.scatter(*center.T, marker='x', s=100, c='black', linewidth=3)
plt.show()

# US ARREST PCA

In [None]:
# Read the arrests table and index it by the column labelled 'Unnamed: 0'
# (presumably the state names — later cells use the index as such).
arrest = pd.read_csv('/content/USArrests.csv').set_index('Unnamed: 0')
arrest.columns

In [None]:
# Project the data onto two principal components before plotting (see slide 66)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
# NOTE(review): the columns are passed in unscaled here, whereas the k-means
# section standardizes the same table first — confirm this is intentional.
result = pca.fit_transform(arrest)

In [None]:
# Scatter the two principal-component scores and label every point with its
# row label from `arrest` (the state).
df = pd.DataFrame(result, columns=['c1', 'c2'])
plt.scatter(df.c1, df.c2)
# Dedicated loop names: using x / y here would overwrite the notebook-level
# x and y that other cells read.
for state, pc1, pc2 in zip(arrest.index, df.c1, df.c2):
    plt.annotate(state, (pc1, pc2))  # state name next to its point
plt.show()

In [None]:
# Rotation matrix, explained-variance ratio per component, and its running
# total (see slide 67)
ratio = pca.explained_variance_ratio_.cumsum()
print('rotation matrix:', pca.components_.T, sep='\n')
print('explained ratio: ', pca.explained_variance_ratio_, sep='\n')
print('accumulated explained ratio: ', ratio, sep='\n')

In [None]:
# Plot the accumulated explained ratio against the number of components.
# The x-axis is sized from `ratio` itself, so the cell keeps working when the
# PCA is fitted with a different n_components.
plt.plot(np.arange(1, len(ratio) + 1), ratio)

# IRIS DATA PCA

In [None]:
# Load the iris data set into a DataFrame: one column per feature name, plus
# the target stored as `species`.
from sklearn import datasets
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(species=iris.target)
df

In [None]:
# Split into the feature table (x) and the target column (y)
x = df.drop(columns='species')
y = df['species']

In [None]:
# Box plot of the raw feature columns (before any scaling)
x.boxplot()
plt.show()

In [None]:
# Scaling: standardize the features, then redraw the box plot on the scaled values
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
plt.boxplot(x_scaled)
plt.show()

In [None]:
# PCA: reduce the scaled features to two components
# (PCA itself is imported in the earlier US-arrests PCA cell)
pca = PCA(n_components=2)
result = pca.fit_transform(x_scaled)
df_result = pd.DataFrame(result, columns=['c1', 'c2'])  # scores as named columns

In [None]:
# 2D projection: the two component scores, coloured by species
plt.scatter(df_result['c1'], df_result['c2'], c=df['species'])
plt.show()

In [None]:
# DENDROGRAM
# Ten 2-D sample points (a 10 x 2 array) used by the cluster-count cells.
X = np.array([
    [5, 3],
    [10, 15],
    [15, 12],
    [24, 10],
    [30, 30],
    [85, 70],
    [71, 80],
    [60, 78],
    [70, 55],
    [80, 91],
])
X.shape[0]

# 군집 갯수를 정할 때 사용하는 방법들

In [None]:
import warnings
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage dendrogram of X; two clusters look about right.
dendrogram(linkage(X, method='ward'), distance_sort='descending')
# Silence all warnings from this point on (this also affects every later cell).
warnings.filterwarnings('ignore')

In [None]:
# Elbow method
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np

# Total within-cluster sum of squares (KMeans.inertia_) for k = 1..10.
wss = []
k_range = range(1, 11)
for k in k_range:
    # n_init / random_state are pinned so the curve is identical on every run
    # and does not depend on the library's version-specific n_init default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(X)
    wss.append(kmeans.inertia_)

In [None]:
# Elbow curve: within-cluster sum of squares for each candidate k.
plt.plot(k_range, wss, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.ylabel('Total within-clusters sum of squares')
plt.xlabel('Number of clusters K')
plt.show()  # two clusters is a reasonable choice

In [None]:
# Silhouette method
from sklearn.metrics import silhouette_score

# Average silhouette score of a k-means fit on X for k = 2..9.
silhouette = []
k_range = range(2, 10)
for k in k_range:
    # n_init / random_state are pinned so the scores are identical on every run.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(X)
    silhouette.append(silhouette_score(X, kmeans.labels_))

In [None]:
# Average silhouette score for each candidate k.
plt.plot(k_range, silhouette, 'bx-')
plt.title('The Silhouette Method showing the optimal k')
plt.ylabel('Average Silhouettes')
plt.xlabel('Number of clusters K')
plt.show()

# US ARREST KMEANS

In [None]:
# Load the data and index it by the column labelled 'Unnamed: 0'.
arrest = pd.read_csv('/content/USArrests.csv').set_index('Unnamed: 0')
arrest.columns

In [None]:
# Data preparation (standardization): box plot of the raw columns first
arrest.boxplot()

In [None]:
# Standardize the columns and box-plot the scaled values
# (StandardScaler is imported in the iris scaling cell earlier in the notebook)
scaler = StandardScaler()
arrest_scaled = scaler.fit_transform(arrest)
plt.boxplot(arrest_scaled)
plt.show()

In [None]:
# Choose the number of clusters (one of: dendrogram, elbow, silhouette)
dendrogram(linkage(arrest_scaled, 'ward'))
warnings.filterwarnings('ignore') # dendrogram suggests 2 or 4 clusters

In [None]:
# KMEANS
# fit() rather than fit_transform(): the value fit_transform() returns was
# never used. n_init / random_state are pinned so the cluster labels are the
# same on every run.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(arrest_scaled)
print(kmeans.labels_)
kmeans.cluster_centers_

In [None]:
# Add the cluster labels as a new column
arrest['cluster'] = kmeans.labels_
arrest.head()

In [None]:
# Profiling: mean of every column within each cluster
arrest.groupby('cluster').mean()

In [None]:
# States per cluster: the high-crime group.
# K-means numbers its clusters arbitrarily, so the high-crime cluster is looked
# up from the per-cluster column means instead of being assumed to be label 0.
# Each column is ranked across clusters and the ranks are summed per cluster.
# NOTE(review): assumes larger values in the feature columns mean more crime —
# confirm against the actual columns of the table.
cluster_means = arrest.groupby('cluster').mean()
high_crime_cluster = cluster_means.rank().sum(axis=1).idxmax()
print('범죄발생이 높은 주들: ')
arrest[arrest.cluster == high_crime_cluster].index

In [None]:
# States per cluster: the low-crime group.
# K-means numbers its clusters arbitrarily, so the low-crime cluster is looked
# up from the per-cluster column means instead of being assumed to be label 1.
# Each column is ranked across clusters and the ranks are summed per cluster.
# NOTE(review): assumes larger values in the feature columns mean more crime —
# confirm against the actual columns of the table.
cluster_means = arrest.groupby('cluster').mean()
low_crime_cluster = cluster_means.rank().sum(axis=1).idxmin()
print('범죄발생이 낮은 주들: ')
arrest[arrest.cluster == low_crime_cluster].index