In [1]:
import numpy as np
from sklearn.decomposition import PCA

# Toy dataset: 6 samples (rows) x 18 features (columns).
X = np.array([
    [1,1,1,1,1,1,1,1,1,1,3,3,3,3,3,4,5,6],
    [1,2,1,1,1,1,1,1,1,1,3,4,3,3,3,4,5,6],
    [3,3,3,3,3,1,1,1,1,1,1,1,1,1,1,5,4,6],
    [3,4,3,3,3,1,2,1,1,1,1,1,1,1,1,5,4,5],
    [1,1,1,1,1,3,3,3,3,3,1,1,1,1,1,6,4,5],
    [1,2,1,1,1,3,3,3,2,3,1,1,1,1,1,5,4,5]])

pca = PCA(n_components=3) # principal component analysis (reduce to 3 principal components)
# NOTE(review): despite the "2D" in its name, X2D has 3 columns, one per retained component.
X2D = pca.fit_transform(X)
X2D

array([[ 3.84432202,  0.20850925,  0.46823094],
       [ 4.09935419, -0.13332384, -0.49268129],
       [-1.70050345, -3.04924012,  0.76869479],
       [-2.21379239, -3.06986255, -0.66041977],
       [-2.102115  ,  3.36337224,  0.55345806],
       [-1.92726537,  2.68054501, -0.63728274]])

In [2]:
# Show how much of the total variance each of the 3 retained components explains.

two_decimals = '{:.2f}'.format
for ratio in pca.explained_variance_ratio_:
    print(two_decimals(ratio))

0.54
0.42
0.02


In [3]:
# Information lost by the projection: the fraction of the total variance that the
# retained components do NOT explain (1 minus the sum of the explained variance ratios).
# NOTE(review): this is a relative, unitless indicator of the reconstruction error; it is
# not itself the mean squared distance between the original and the reconstructed data.

1 - pca.explained_variance_ratio_.sum()

0.015189685531841413

In [4]:
# 

In [5]:
from sklearn.datasets import fetch_openml

# Download the 784-pixel MNIST dataset from OpenML.
# version=1 pins the dataset revision so every run fetches the same data.
# as_frame=False requests plain NumPy arrays, so the later positional slicing and
# np.array_split calls behave the same regardless of the installed scikit-learn version.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

In [6]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

# Random train/test split with the default test size.
# A fixed seed keeps the split, and every number derived from it, identical across
# Restart & Run All; 42 matches the seed used by the classifiers later in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
# (n_samples, n_features) of the training split
X_train.shape

(52500, 784)

In [8]:
from sklearn.decomposition import PCA
import numpy as np

# Fit PCA keeping every component, then count how many are needed to reach 95% of the variance.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum() # running total of explained variance
# argmax over a boolean array yields the index of the first True; +1 turns that index into a count
d = 1 + np.argmax(cumsum >= 0.95)

d

154

In [9]:
# A float n_components between 0 and 1 asks PCA to keep as many components as are
# needed to explain at least that share of the variance.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)
# First line: number of components kept; second line: total explained variance ratio.
print(pca.n_components_, pca.explained_variance_ratio_.sum(), sep="\n")

154
0.9504411498697515


In [10]:
pca = PCA(n_components=154) # how to set the number of components explicitly (154)
X_reduced = pca.fit_transform(X_train) # fit and project the training data
X_recovered = pca.inverse_transform(X_reduced) # map the 154-dimensional data back to 784 dimensions (lossy)

In [11]:
from sklearn.decomposition import IncrementalPCA

# Batch vs. mini-batch learning
# Batch: all samples are loaded into memory and learned from at once (memory hungry).
# Mini-batch: the samples are split into chunks that are processed one after another.
# Online: one sample at a time, i.e. the extreme form of mini-batch.

n_batches = 100 # number of mini-batches the training set is split into (not the size of each one)
inc_pca = IncrementalPCA(n_components=154) # IncrementalPCA: PCA fitted incrementally, chunk by chunk

for X_batch in np.array_split(X_train, n_batches):
    print(".", end="")  # one dot per processed mini-batch as a progress indicator
    inc_pca.partial_fit(X_batch)

# Project the whole training set with the incrementally fitted model, then reconstruct it.
X_reduced = inc_pca.transform(X_train)
X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)

....................................................................................................

In [12]:
np.allclose(pca.mean_, inc_pca.mean_)
# Compares the mean_ attributes of the regular PCA and the IncrementalPCA fitted on MNIST
# (this checks the estimated means, not the transformed data).
# allclose() checks that the values are equal within a tolerance.

True

In [13]:
# Training-time comparison: classifiers trained on the original vs. the PCA-compressed MNIST data

In [14]:
# Positional split: the first 60,000 rows are used for training, the remaining rows for testing.
# This rebinds X_train / X_test / y_train / y_test that were created earlier by train_test_split.
X_train, X_test = mnist['data'][:60000], mnist['data'][60000:]
y_train, y_test = mnist['target'][:60000], mnist['target'][60000:]

In [15]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees; the fixed random_state makes training repeatable.
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)

In [16]:
%%time

# Train on the original (uncompressed) training data; %%time reports how long the cell takes.
rnd_clf.fit(X_train,y_train)

CPU times: user 4.13 s, sys: 117 ms, total: 4.25 s
Wall time: 4.29 s


RandomForestClassifier(n_estimators=10, random_state=42)

In [17]:
from sklearn.metrics import accuracy_score

# Accuracy of the random forest trained on the original data, measured on the test set.
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9492

In [18]:
from sklearn.decomposition import PCA

# Compress the training data, keeping enough components to explain 95% of the variance.
# The fitted pca is reused later to transform the test set.
pca = PCA(n_components=0.95)
X_train_reduced = pca.fit_transform(X_train)

In [19]:
# Random forest trained on the PCA-compressed data

import time

rnd_clf2 = RandomForestClassifier(n_estimators=10, random_state=42)
t0 = time.time()
rnd_clf2.fit(X_train_reduced, y_train)
t1 = time.time()
# The printed message reads "training time on the (compressed) training set".
print("학습용 데이터셋(압축)의 학습 시간 {:.2f}s".format(t1 - t0))

# The dimensionality was reduced, yet the training time actually went up.

학습용 데이터셋(압축)의 학습 시간 12.53s


In [20]:
# Project the test set with the PCA fitted on the training data (transform only, no refit),
# then score the random forest that was trained on the compressed data.
X_test_reduced = pca.transform(X_test)
y_pred = rnd_clf2.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

0.9009

In [None]:
# Logistic regression (training on the original data takes a very long time)

In [22]:
from sklearn.linear_model import LogisticRegression

# Baseline: multinomial logistic regression trained on the original (uncompressed) training data.
log_clf = LogisticRegression(multi_class="multinomial", max_iter=500, random_state=42)
t0 = time.time()
log_clf.fit(X_train, y_train)
t1 = time.time()
# The label says "(원본)" = original data: this model is fit on X_train, not X_train_reduced,
# so the timing must not be reported under the "(압축)" (compressed) label used for the PCA runs.
print("학습용 데이터셋(원본)의 학습 시간 {:.2f}s".format(t1 - t0))

학습용 데이터셋(압축)의 학습 시간 381.06s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# Test-set accuracy of the logistic regression trained on the original data.
y_pred = log_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9208

In [24]:
# Same logistic regression settings as log_clf, but trained on the PCA-compressed data.
log_clf2 = LogisticRegression(multi_class="multinomial", max_iter=500, random_state=42)
t0 = time.time()
log_clf2.fit(X_train_reduced, y_train)
t1 = time.time()
# The printed message reads "training time on the (compressed) training set".
print("학습용 데이터셋(압축)의 학습 시간 {:.2f}s".format(t1 - t0))

학습용 데이터셋(압축)의 학습 시간 30.55s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
# Test-set accuracy of the logistic regression trained on the compressed data,
# evaluated on the PCA-transformed test set.
y_pred = log_clf2.predict(X_test_reduced)
accuracy_score(y_test, y_pred)

0.9241