# PCA

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA as lib_PCA

In [2]:
class PCA:
    def __init__(self, n_components, whiten, random_state=41, gram_iter=100):
        self.n_components = n_components
        self.whiten = whiten
        self.gram_iter = gram_iter

    def whitening(self, data):
        centered_data = self.centering(data)
        std_data = np.std(centered_data, axis=0)
        whitened_data = centered_data / std_data

        return whitened_data

    def centering(self, data):
        n_samples, n_features = data.shape

        mean_data = np.mean(data, axis=0)
        centered_data = data - mean_data

        return centered_data
   
    def covariance(self, data):
        centered_data = self.centering(data)
        n_samples, n_features = centered_data.shape
        cov_data = (centered_data.T @ centered_data) / (n_samples - 1)
        
        return cov_data
     
    def gram_schmidt(self, A):
        # data의 차원 확인
        n, m = A.shape  # n: 행 수 (벡터 수), m: 열 수 (차원 수)
        Q = np.zeros((n, m))  # 직교화된 벡터를 저장할 행렬
    
        for i in range(m):  # 열 벡터를 기준으로 반복
            # 현재 열 벡터를 복사
            q = A[:, i]  # A의 i번째 열 벡터
        
            # 이전에 구한 직교 벡터에 대한 정사영을 뺌.
            for j in range(i):
                q -= np.dot(Q[:, j], A[:, i]) * Q[:, j]  # j번째 직교 벡터에 대한 정사영 제거

            # 벡터 정규화
            Q[:, i] = q / np.linalg.norm(q)  # 정규화하여 Q에 저장

        R = Q.T @ A

        return Q, R
    
    def eig(self, cov_data):
        n, m = cov_data.shape  # n: 행 수 (벡터 수), m: 열 수 (차원 수)
        A = cov_data.copy()

        # 고유행렬 누적곱
        eigenvectors = np.eye(n)
        for _ in range(self.gram_iter):
            Q, R = self.gram_schmidt(A)
            # Q, R = np.linalg.qr(A)
            A = R @ Q
            eigenvectors = eigenvectors @ Q

        eigenvalues = np.diagonal(A)
        
        return eigenvalues, eigenvectors
    
    def fit(self, data):
        if self.whiten == True:
            adj_data = self.whitening(data)
        else:
            adj_data = self.centering(data)

        cov_data = self.covariance(adj_data)
        eigvalues, eigvectors = self.eig(cov_data)
    
        # 대각 요소를 크기순으로 정렬한 인덱스 저장
        sorted_indices = np.argsort(-eigvalues)  # 내림차순 정렬 (- 붙여서 내림차순)

        # 고유값 정렬
        eigvalues = eigvalues[sorted_indices]
    
        # 대각 요소 크기순으로 열 벡터 재정렬
        eigvectors = eigvectors[:, sorted_indices]

        pca_output = adj_data @ (eigvectors[:, :self.n_components])

        return pca_output

In [3]:
people = fetch_lfw_people(min_faces_per_person=100, resize=0.1)
image_shape = people.images[0].shape
print("dataset keys        : ", people.keys())
print("dataset.images shape: ", people.images.shape)
print("dataset.data shape  : ", people.data.shape)
print("dataset.target shape: ", people.target.shape)

dataset keys        :  dict_keys(['data', 'images', 'target', 'target_names', 'DESCR'])
dataset.images shape:  (1140, 37, 28)
dataset.data shape  :  (1140, 1036)
dataset.target shape:  (1140,)


# centering 메서드 TEST

In [4]:
pca = PCA(200, False)
data = people.data
print("before centering:")
print(data)
print()
centered_data = pca.centering(people.data)
print("after centering:")
print(centered_data)
print()
centered_data_sum = np.sum(centered_data, axis=0)
print("sum of vectors:")
print(centered_data_sum)

before centering:
[[0.35555556 0.23660131 0.3385621  ... 0.43790853 0.41699347 0.627451  ]
 [0.18039216 0.26666668 0.47189543 ... 0.89673203 0.92026144 0.87973857]
 [0.14509805 0.15947713 0.32156864 ... 0.41045752 0.36601308 0.2379085 ]
 ...
 [0.83660126 0.77777773 0.7359478  ... 0.54771245 0.5647059  0.6039216 ]
 [0.41830066 0.530719   0.57385623 ... 0.74771243 0.87712413 0.7098039 ]
 [0.21176471 0.46013072 0.4379085  ... 0.0379085  0.09673203 0.12418302]]

after centering:
[[-0.00345391 -0.15751739 -0.11793375 ... -0.04740888 -0.03488389
   0.21804026]
 [-0.17861731 -0.12745202  0.01539958 ...  0.41141462  0.4683841
   0.47032782]
 [-0.21391143 -0.23464157 -0.13492721 ... -0.07485989 -0.08586428
  -0.17150225]
 ...
 [ 0.47759178  0.38365903  0.27945194 ...  0.06239504  0.11282855
   0.19451085]
 [ 0.05929118  0.13660029  0.11736038 ...  0.26239502  0.42524678
   0.30039313]
 [-0.14724477  0.06601202 -0.01858735 ... -0.4474089  -0.35514534
  -0.28522772]]

sum of vectors:
[-1.9983947e

# whitening 메서드 TEST

In [5]:
pca = PCA(200, False)
data = people.data
print("before whitening:")
print(data)
print()
whitened_data = pca.whitening(data)
print("after centering:")
print(whitened_data)
print()
whitened_data_sum = np.sum(whitened_data, axis=0)
print("sum of vectors:")
print(whitened_data_sum)

before whitening:
[[0.35555556 0.23660131 0.3385621  ... 0.43790853 0.41699347 0.627451  ]
 [0.18039216 0.26666668 0.47189543 ... 0.89673203 0.92026144 0.87973857]
 [0.14509805 0.15947713 0.32156864 ... 0.41045752 0.36601308 0.2379085 ]
 ...
 [0.83660126 0.77777773 0.7359478  ... 0.54771245 0.5647059  0.6039216 ]
 [0.41830066 0.530719   0.57385623 ... 0.74771243 0.87712413 0.7098039 ]
 [0.21176471 0.46013072 0.4379085  ... 0.0379085  0.09673203 0.12418302]]

after centering:
[[-0.02082666 -0.9850557  -0.7460987  ... -0.1607969  -0.11549429
   0.7235281 ]
 [-1.0770406  -0.79703796  0.09742422 ...  1.3953967   1.5507357
   1.5606998 ]
 [-1.2898597  -1.4673619  -0.8536065  ... -0.25390258 -0.28428122
  -0.5690999 ]
 ...
 [ 2.8798199   2.3992622   1.7679309  ...  0.21162552  0.3735551
   0.6454499 ]
 [ 0.35751858  0.85424787  0.74247134 ...  0.8899663   1.4079158
   0.99680156]
 [-0.8878679   0.4128149  -0.11759143 ... -1.5174787  -1.1758225
  -0.94647783]]

sum of vectors:
[-0.00121415  0

# covariance 메서드 TEST

In [6]:
pca = PCA(200, False)
data = people.data
print("PCA:")
print(pca.covariance(data))
print()
print("NUMPY: ")
print(np.cov(data, rowvar=False))

PCA:
[[ 0.02752733  0.02335889  0.01677731 ... -0.0016957  -0.00105826
  -0.00114592]
 [ 0.02335889  0.02559271  0.02097222 ... -0.00104826 -0.00082488
  -0.00131569]
 [ 0.01677731  0.02097222  0.0250072  ... -0.00031834 -0.00076869
  -0.00196164]
 ...
 [-0.0016957  -0.00104826 -0.00031834 ...  0.08700515  0.07547966
   0.05279473]
 [-0.00105826 -0.00082488 -0.00076869 ...  0.07547966  0.09130815
   0.07754881]
 [-0.00114592 -0.00131569 -0.00196164 ...  0.05279473  0.07754881
   0.09089578]]

NUMPY: 
[[ 0.02752732  0.02335889  0.01677732 ... -0.0016957  -0.00105826
  -0.00114592]
 [ 0.02335889  0.02559271  0.02097222 ... -0.00104826 -0.00082487
  -0.00131569]
 [ 0.01677732  0.02097222  0.0250072  ... -0.00031834 -0.00076869
  -0.00196164]
 ...
 [-0.0016957  -0.00104826 -0.00031834 ...  0.08700516  0.07547964
   0.05279473]
 [-0.00105826 -0.00082487 -0.00076869 ...  0.07547964  0.09130815
   0.07754879]
 [-0.00114592 -0.00131569 -0.00196164 ...  0.05279473  0.07754879
   0.09089579]]


# gram_schmidt 메서드 TEST

In [7]:
A = np.array([[ 1.04716195, -0.18351747, -0.10934386, -0.00601054, -0.07288471],
                       [-0.18351747,  0.94827502,  0.00726028, -0.0830421 ,  0.08130832],
                       [-0.10934386,  0.00726028,  1.08134424,  0.0548622 , -0.03538415],
                       [-0.00601054, -0.0830421 ,  0.0548622 ,  0.91862992, -0.05356764],
                       [-0.07288471,  0.08130832, -0.03538415, -0.05356764,  1.08806177]])


pca = PCA(200, False)
Q, R = pca.gram_schmidt(A)
print("PCA")
print("Q:\n", Q)
print("R:\n", R)
print()

print("NUMPY")
Q, R = np.linalg.qr(A)
print("Q:\n", Q)
print("R:\n", R)
print()


PCA
Q:
 [[ 0.97753339  0.1720481   0.10439437  0.0138482   0.06113887]
 [-0.17131491  0.97808922  0.02037106  0.09368345 -0.06929004]
 [-0.1020733  -0.03106457  0.99230642 -0.05100842  0.03663518]
 [-0.00561088 -0.09353995  0.04571164  0.99329719  0.04989916]
 [-0.06803841  0.06345235 -0.04387759 -0.04231388  0.99379518]]
R:
 [[ 1.07122883e+00  1.12558858e-17 -2.23392779e-17  4.27399188e-18
  -9.44794220e-19]
 [ 2.44172212e-17  9.08625164e-01  9.08967993e-18  3.98729912e-17
  -5.08121850e-17]
 [-1.90171613e-17  8.16724616e-18  1.06581827e+00 -2.02573374e-17
   3.43961751e-17]
 [ 4.82707684e-18  4.50304903e-17 -2.23810211e-17  9.04077836e-01
   1.95240862e-17]
 [ 4.36403032e-18 -2.82753252e-17  2.97022610e-17  1.07566129e-17
   1.06725132e+00]]

NUMPY
Q:
 [[-0.97753339 -0.1720481  -0.10439437 -0.0138482   0.06113887]
 [ 0.17131491 -0.97808922 -0.02037106 -0.09368345 -0.06929004]
 [ 0.1020733   0.03106457 -0.99230642  0.05100842  0.03663518]
 [ 0.00561088  0.09353995 -0.04571164 -0.99329

# eig 메서드 TEST

In [8]:
A = np.array([[ 1.04716195, -0.18351747, -0.10934386, -0.00601054, -0.07288471],
                       [-0.18351747,  0.94827502,  0.00726028, -0.0830421 ,  0.08130832],
                       [-0.10934386,  0.00726028,  1.08134424,  0.0548622 , -0.03538415],
                       [-0.00601054, -0.0830421 ,  0.0548622 ,  0.91862992, -0.05356764],
                       [-0.07288471,  0.08130832, -0.03538415, -0.05356764,  1.08806177]])

pca = PCA(200, False)
print("pca")
val, vec = pca.eig(A)
print("val: ", val)
print("vec: ", vec)

print("numpy")
val, vec = np.linalg.eigh(A)
print("val: ", val)
print("vec: ", vec)

pca
val:  [1.04716195 0.91611348 0.92200251 0.90735179 0.902919  ]
vec:  [[-0.61923945  0.57670058 -0.08190873  0.33057157  0.4098443 ]
 [-0.3577804   0.25306905  0.46540826 -0.74085479 -0.20610333]
 [ 0.39033199  0.0388916   0.69264682  0.09354104  0.5980124 ]
 [-0.50813577 -0.55097421  0.46251516  0.41248405 -0.23272749]
 [-0.27923944 -0.5461569  -0.288108   -0.40368751  0.61462847]]
numpy
val:  [0.77049726 0.89675976 0.97375554 1.16066394 1.28179641]
vec:  [[ 0.5400817  -0.38979931  0.26310731 -0.21628385 -0.66359935]
 [ 0.73305079 -0.08112912 -0.42103024 -0.1159412   0.51511725]
 [ 0.10402753 -0.45917653  0.38619767  0.74810337  0.26368141]
 [ 0.39986526  0.79378087  0.28837288  0.32998821 -0.13404689]
 [ 0.01530818  0.0233111   0.72194219 -0.5207956   0.45474572]]


# PCA 분류

In [None]:
pca = PCA(200, False, gram_iter=1)
n_component_data = pca.fit(people.data)
pca_w = PCA(200, True)
n_component_data_w = pca_w.fit(people.data)
x_train, x_test, y_train, y_test = train_test_split(n_component_data, people.target, test_size=0.25, random_state=41)
x_train_w, x_test_w, y_train_w, y_test_w = train_test_split(n_component_data_w, people.target, test_size=0.25, random_state=41)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier as knc

In [None]:
knn = knc(n_neighbors=5)
knn_w = knc(n_neighbors=5)

In [None]:
knn.fit(x_train, y_train)
knn_w.fit(x_train_w, y_train_w)

In [None]:
y_pred = knn.predict(x_test)
y_pred_w = knn_w.predict(x_test_w)

# Results
#### -F1 score를 각 채점 항목 별로 출력하면 됩니다

In [None]:
from sklearn.metrics import f1_score

In [None]:
print(f"Naive PCA: {f1_score(y_test, y_pred, average='micro')}\nWhitening PCA: {f1_score(y_test_w, y_pred_w, average='micro')}")