유방암 데이터

10, 5, 2 차원으로 차원축소
원본과 차원축소한 데이터의 분류 정확도 측정
2차원으로 축소한 것은 시각화
군집화하기
군집화 성능 평가

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

데이터 불러오기 및 정규화 (StandardScaler를 이용한 표준화)

In [None]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset and wrap it in a DataFrame: one column per
# feature name, plus a trailing 'target' column holding the class labels.
cancer = load_breast_cancer()
df = pd.DataFrame(cancer.data, columns=cancer.feature_names).assign(
    target=cancer.target
)
df.head(3)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature; all later steps (PCA, classification, clustering)
# consume cancer_std rather than the raw values.
cancer_std = StandardScaler().fit(cancer.data).transform(cancer.data)

차원 축소

In [None]:
from sklearn.decomposition import PCA

# One PCA model per target dimensionality (2, 5 and 10 components).
# random_state is pinned to the same seed used by the other estimators in this
# notebook: depending on the scikit-learn version, svd_solver='auto' may select
# the randomized solver for this data shape, and without a seed the projections
# would then vary from run to run.
pca2 = PCA(n_components=2, random_state=2021)
pca5 = PCA(n_components=5, random_state=2021)
pca10 = PCA(n_components=10, random_state=2021)

In [None]:
# Fit each PCA model on the standardized data and keep the projected
# features, in the same order as before: 2, 5, then 10 components.
cancer_pca2, cancer_pca5, cancer_pca10 = (
    model.fit_transform(cancer_std) for model in (pca2, pca5, pca10)
)

분류 정확도

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: logistic regression on the standardized original data.
# The split is stratified on the labels and seeded so the score is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std,
    cancer.target,
    random_state=2021,
    stratify=cancer.target,
)
lrc = LogisticRegression(random_state=2021).fit(X_train, y_train)
lrc.score(X_test, y_test)

In [None]:
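# 0.993006993006993  <- appears to be the output of the previous cell (test accuracy on the standardized original data)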
def pca_accuracy(X, y):
    """Return the test accuracy of logistic regression on (X, y).

    The data is split with train_test_split (stratified on ``y``,
    random_state=2021), a LogisticRegression(random_state=2021) is fitted on
    the training part, and its score on the test part is returned rounded to
    4 decimal places.
    """
    splits = train_test_split(X, y, stratify=y, random_state=2021)
    train_X, test_X, train_y, test_y = splits
    classifier = LogisticRegression(random_state=2021).fit(train_X, train_y)
    return np.round(classifier.score(test_X, test_y), 4)

In [None]:
# Report the accuracy for each reduced representation, then for the
# standardized original data, in the same order as a row-by-row listing.
for caption, features in [
    ('PCA 2 정확도:', cancer_pca2),
    ('PCA 5 정확도:', cancer_pca5),
    ('PCA 10 정확도:', cancer_pca10),
    ('원데이터 정확도:', cancer_std),
]:
    print(caption, pca_accuracy(features, cancer.target))

globals() : 현재 모듈의 전역 변수들을 담고 있는 딕셔너리를 반환한다. 이 딕셔너리에 키를 추가하면 변수 이름을 문자열로 만들어 전역 변수를 동적으로 생성·참조할 수 있다.

In [None]:
# Same experiment as the explicit cells above, written as a loop that
# creates the global names pca{dim} / cancer_pca{dim} through globals().
# Each model and projection is built in a local name first and then published,
# so globals() is not looked up repeatedly for the same key.
for dim in [2, 5, 10]:
    model = PCA(n_components=dim, random_state=2021)  # seeded like the other estimators
    reduced = model.fit_transform(cancer_std)
    globals()[f'pca{dim}'] = model
    globals()[f'cancer_pca{dim}'] = reduced
    acc = pca_accuracy(reduced, cancer.target)
    print(f'PCA {dim} 정확도: {acc}')
print('원데이터 정확도:', pca_accuracy(cancer_std, cancer.target))

군집화

In [None]:
from sklearn.cluster import KMeans

# Cluster the standardized data into two groups and store the cluster id of
# each sample next to its true label.
# n_init is set explicitly: its default differs between scikit-learn releases
# (10 in older ones, 'auto' in newer ones), so leaving it implicit can give
# different labels, or a FutureWarning, depending on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=2021)
kmeans.fit(cancer_std)
df['cluster'] = kmeans.labels_

In [None]:
# Keep only the true label and the cluster id, then attach the two PCA
# coordinates used for plotting.
# .copy() makes the narrowed frame independent of the original one, so the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
df = df[['target', 'cluster']].copy()
df['PC1'] = cancer_pca2[:, 0]
df['PC2'] = cancer_pca2[:, 1]
df.head(3)

In [None]:
# Visualization: two side-by-side scatter plots in the 2D PCA space,
# the left one split by true label, the right one split by cluster id.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12,4))
panel_titles = {'target': 'Original data', 'cluster': 'Clustered data'}
for ax, column in zip(axs, ['target', 'cluster']):
    for i, marker in enumerate(['^', 'o']):
        # Rows whose label / cluster id equals i, filtered once per group.
        subset = df[df[column] == i]
        ax.scatter(subset['PC1'], subset['PC2'], marker=marker, label=cancer.target_names[i])

    ax.set_title(panel_titles[column], fontsize=15)
    ax.set_xlabel('PCA Component 1')
    if column == 'target':
        # Only the left panel gets the y-axis label and the legend.
        ax.set_ylabel('PCA Component 2')
        ax.legend()
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the cluster assignment stored in df, computed on the
# standardized features.
silhouette_score(cancer_std, df['cluster'])

In [None]:
from visualize import visualize_silhouette

# NOTE(review): visualize_silhouette comes from a local `visualize` module that
# is not shown here; presumably it clusters the given features for each listed
# cluster count and plots the silhouettes - confirm against the helper.
# The standardized data is passed so this plot uses the same features as the
# KMeans fit and silhouette_score above, rather than the raw, unscaled values.
visualize_silhouette([2,3], cancer_std)