In [None]:
"""
(차원 축소)
PCA: 데이터에 가장 가까운 d개의 초평면을 정의한 다음 데이터의 분산이 최대가 되도록
그 평면에 투영시켜 데이터셋의 차원을 d차원으로 축소시킨다.
"""

from sklearn.decomposition import PCA

# 2차원으로 줄이기
pca = PCA(n_compoents=2)
X2D = pca.fit_transform(X)

# 보존하려는 분산의 비율을 n_components 파라미터로 설정할 수도 있다.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

# 압축 후 복원. MNIST의 데이터로 784 -> 154 ->784차원으로 변환
pca = PCA(n_components=154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

# 점진적 PCA: 데이터의 크기가 너무 클 때, 훈련 세트를 미니 배치로 나눈 뒤 PCA 적용
from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_trian, n_batches):
    inc_pca.partial_fix(X_batch)
X_reduced = inc_pca.transform(X_train)

In [9]:
import warnings
warnings.filterwarnings('ignore')
from sklearn import datasets

iris = datasets.load_iris()
X = iris['data'][:, (0, 1, 2, 3)]   # 꽃잎의 특성
y = iris['target']

In [10]:
"""군집: k-평균"""

from sklearn.cluster import KMeans

# 알고리즘이 찾을 클러스터 개수 k. cluster_centers_로 k수 만큼 센트로이드의 좌표가 출력됨
k = 3
kmeans = KMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)
kmeans.cluster_centers_

array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])

In [11]:
# 센트로이드 위치 대충 알때 초기화할 수 있는 방법
import numpy as np

good_init = np.array([[5,3,4,1], [5,3,1,0], [7,3,6,2]])
kmeans = KMeans(n_clusters=3, init=good_init, n_init=1)   # n_init=1로 설정
y_pred = kmeans.fit_predict(X)
kmeans.inertia_   # 이너셔: 가장 가까운 센트로이드 사이의 평균 제곱 거리

78.85566582597727

In [12]:
# 군집을 사용한 전처리: 숫자 데이터셋 분류

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

0.9688888888888889

In [13]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('kmeans', KMeans(n_clusters=50)),
    ('log_reg', LogisticRegression()),
]) 
pipeline.fit(X_train, y_train)
pipeline.score(X_test, y_test)

0.9711111111111111

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = dict(kmeans__n_clusters=range(2, 100))
grid_clf = GridSearchCV(pipeline, param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)

print(grid_clf.best_params_)
print(grid_clf.score(X_test, y_test))

Fitting 3 folds for each of 98 candidates, totalling 294 fits
[CV] END ...............................kmeans__n_clusters=2; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=2; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=2; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=3; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=3; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=3; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=4; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=4; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=4; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=5; total time=   0.1s
[CV] END ...............................kmeans__n_clusters=5; total time=   0.1s
[CV] END ...............................kmeans_