# PCA

In [1]:
import pandas as pd
import numpy as np

In [24]:
data = {"x1":[10, 20],
       "x2":[20, 5]
       }
df = pd.DataFrame(data)
df

Unnamed: 0,x1,x2
0,10,20
1,20,5


In [25]:
np.cov(df)

array([[ 50. , -75. ],
       [-75. , 112.5]])

In [26]:
df.mean()

x1    15.0
x2    12.5
dtype: float64

In [27]:
df = df-df.mean()
df

Unnamed: 0,x1,x2
0,-5.0,7.5
1,5.0,-7.5


In [35]:
np.cov(df.T)

array([[ 50. , -75. ],
       [-75. , 112.5]])

In [29]:
df.cov()

Unnamed: 0,x1,x2
x1,50.0,-75.0
x2,-75.0,112.5


In [34]:
np.dot(df.T, df)

array([[ 50. , -75. ],
       [-75. , 112.5]])

## PCA 예제

In [11]:
data = {
    "calory":[1.2, 2.8, 5, 3, 4.5, 2.],
    "breakfast":[0, 3, 5, 2, 7, 5],
    "lunch":[4, 5, 7, 6, 7, 3],
    "dinner":[5, 6, 7, 6, 7, 6],
    "weight":[65, 75, 100, 71, 97, 60]
}

In [12]:
df = pd.DataFrame(data)
df

Unnamed: 0,calory,breakfast,lunch,dinner,weight
0,1.2,0,4,5,65
1,2.8,3,5,6,75
2,5.0,5,7,7,100
3,3.0,2,6,6,71
4,4.5,7,7,7,97
5,2.0,5,3,6,60


In [13]:
y = df["weight"]
y

0     65
1     75
2    100
3     71
4     97
5     60
Name: weight, dtype: int64

In [14]:
df = df.drop(["weight"], axis = 1)
df

Unnamed: 0,calory,breakfast,lunch,dinner
0,1.2,0,4,5
1,2.8,3,5,6
2,5.0,5,7,7
3,3.0,2,6,6
4,4.5,7,7,7
5,2.0,5,3,6


In [18]:
X = df
X

Unnamed: 0,calory,breakfast,lunch,dinner
0,-1.883333,-3.666667,-1.333333,-1.166667
1,-0.283333,-0.666667,-0.333333,-0.166667
2,1.916667,1.333333,1.666667,0.833333
3,-0.083333,-1.666667,0.666667,-0.166667
4,1.416667,3.333333,1.666667,0.833333
5,-1.083333,1.333333,-2.333333,-0.166667


In [19]:
X_cen = X - X.mean()
X_cen

Unnamed: 0,calory,breakfast,lunch,dinner
0,-1.883333,-3.666667,-1.333333,-1.166667
1,-0.283333,-0.666667,-0.333333,-0.166667
2,1.916667,1.333333,1.666667,0.833333
3,-0.083333,-1.666667,0.666667,-0.166667
4,1.416667,3.333333,1.666667,0.833333
5,-1.083333,1.333333,-2.333333,-0.166667


In [20]:
X_cen.mean()

calory       0.000000e+00
breakfast   -3.700743e-17
lunch        0.000000e+00
dinner      -4.625929e-18
dtype: float64

In [39]:
X_cov = np.cov(X_cen.T)
X_cov

array([[2.09766667, 2.61333333, 2.12666667, 1.04333333],
       [2.61333333, 6.26666667, 1.73333333, 1.66666667],
       [2.12666667, 1.73333333, 2.66666667, 0.93333333],
       [1.04333333, 1.66666667, 0.93333333, 0.56666667]])

- 고유값 : 축의 길이
- 고유벡터 : 좌표

In [40]:
eigenvalue, eigenvector = np.linalg.eig(X_cov)
print("eigenvalue : ", eigenvalue)
print("eigenvector : \n", eigenvector)

eigenvalue :  [9.14780875e+00 2.36702072e+00 8.08883672e-02 1.94882794e-03]
eigenvector : 
 [[ 0.43937091  0.34911627 -0.7283658  -0.39313393]
 [ 0.77295605 -0.58010996  0.23904478 -0.09417519]
 [ 0.3862551   0.73448569  0.55523568  0.05523686]
 [ 0.24555883  0.04605465 -0.32257433  0.91297625]]


In [41]:
sum(eigenvalue)

11.597666666666676

In [42]:
eigenvalue[0]

9.147808752147373

In [43]:
eigenvalue[0]/sum(eigenvalue)

0.7887628619676975

In [44]:
eigenvector.T

array([[ 0.43937091,  0.77295605,  0.3862551 ,  0.24555883],
       [ 0.34911627, -0.58010996,  0.73448569,  0.04605465],
       [-0.7283658 ,  0.23904478,  0.55523568, -0.32257433],
       [-0.39313393, -0.09417519,  0.05523686,  0.91297625]])

In [45]:
eigenvector.T[0]

array([0.43937091, 0.77295605, 0.3862551 , 0.24555883])

In [46]:
projected_X = X_cov.dot(eigenvector.T[0])
projected_X

array([4.0192811 , 7.07085414, 3.53338781, 2.24632517])

# 파이썬을 활용한 PCA 구현2

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {
    "x1":[0.2, 0.45, 0.33, 0.54, 0.77],
    "x2" : [5.6, 5.87, 6.37, 7.9, 7.87],
    "x3" : [3.56, 2.4, 1.95, 1.32, 0.98],
    "y" : [100, 70, 60, 45, 30]
}
df = pd.DataFrame(data)
df

Unnamed: 0,x1,x2,x3,y
0,0.2,5.6,3.56,100
1,0.45,5.87,2.4,70
2,0.33,6.37,1.95,60
3,0.54,7.9,1.32,45
4,0.77,7.87,0.98,30


In [3]:
X = df[["x1", "x2", "x3"]]
X

Unnamed: 0,x1,x2,x3
0,0.2,5.6,3.56
1,0.45,5.87,2.4
2,0.33,6.37,1.95
3,0.54,7.9,1.32
4,0.77,7.87,0.98


In [4]:
X = (X-X.mean())/X.std()
X

Unnamed: 0,x1,x2,x3
0,-1.192988,-1.022721,1.501191
1,-0.036992,-0.776612,0.354036
2,-0.59187,-0.320854,-0.090981
3,0.379167,1.073766,-0.714005
4,1.442683,1.04642,-1.05024


In [6]:
X.describe()

Unnamed: 0,x1,x2,x3
count,5.0,5.0,5.0
mean,0.0,4.884981e-16,-2.664535e-16
std,1.0,1.0,1.0
min,-1.192988,-1.022721,-1.05024
25%,-0.59187,-0.7766117,-0.7140051
50%,-0.036992,-0.3208537,-0.09098125
75%,0.379167,1.04642,0.3540358
max,1.442683,1.073766,1.501191


In [35]:
print("numpy covariance : \n", X.cov())
X_cov = np.dot(X.T, X) / 4
print("수식을 통한 covariance : \n", X_cov )

numpy covariance : 
           x1        x2        x3
x1  1.000000  0.838879 -0.884010
x2  0.838879  1.000000 -0.911681
x3 -0.884010 -0.911681  1.000000
수식을 통한 covariance : 
 [[ 1.          0.83887873 -0.88401004]
 [ 0.83887873  1.         -0.91168074]
 [-0.88401004 -0.91168074  1.        ]]


In [15]:
eigenvalue, eigenvector = np.linalg.eig(X_cov)
print("고유값 : ", eigenvalue)
print("고유벡터 : \n", eigenvector)

고유값 :  [0.73512513 0.04377061 0.02110426]
고유벡터 : 
 [[-0.56993665 -0.7754158   0.27185024]
 [-0.57618187  0.61301734  0.54057764]
 [ 0.58582135 -0.15145983  0.79616158]]


## 전체에서 77%이상의 고유값을 갖는 벡터가 있으면 해당 벡터를 이용해서 회귀 분석 실행 가능

In [17]:
print("eigenvalue 비율 : \n", eigenvalue/sum(eigenvalue))

eigenvalue 비율 : 
 [0.91890641 0.05471326 0.02638033]


In [19]:
z1 = X.dot(eigenvector[:, 0])
z1

0    2.148630
1    0.675954
2    0.468900
3   -1.253065
4   -2.040419
dtype: float64

In [20]:
X.dot(eigenvector.T[0])

0    2.148630
1    0.675954
2    0.468900
3   -1.253065
4   -2.040419
dtype: float64

In [23]:
from sklearn import linear_model

linear_regression = linear_model.LinearRegression()
linear_regression.fit(X=X, y=df["y"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
linear_regression.score(X = X, y=df["y"])

0.9978441052442628

In [25]:
linear_regression.fit(X = pd.DataFrame(z1), y = df["y"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
linear_regression.score(X = pd.DataFrame(z1), y = df["y"])

0.959290206663969

## 고유값이 낮은 것으로 한다면 어떻게 될까


In [27]:
z2 = X.dot(eigenvector[:, 1])
z2

0    0.070746
1   -0.501015
2    0.276036
3    0.472368
4   -0.318136
dtype: float64

In [29]:
linear_regression.fit(X = pd.DataFrame(z2), y = df["y"])
score = linear_regression.score(X = pd.DataFrame(z2), y = df["y"])
print("score : ", score)

score :  4.181500875222177e-05


## 고유값의 비율이 77%가 넘지 않는 나머지 것 고유벡터를 이용한다면 어떻게 될까

In [30]:
z3 = X.dot(eigenvector[:, 1:])
z3

Unnamed: 0,0,1
0,0.070746,0.318016
1,-0.501015,-0.148005
2,0.276036,-0.406782
3,0.472368,0.115067
4,-0.318136,0.121704


In [31]:
linear_regression.fit(X = pd.DataFrame(z3), y = df["y"])
score = linear_regression.score(X = pd.DataFrame(z3), y = df["y"])
print("score : ", score)

score :  0.038553898580293566


# PCA를 이용한 보스턴 집값 데이터 분석

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [37]:
from sklearn.datasets import load_boston
boston_dataset = load_boston()

boston = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


## 데이터의 평균을 0으로, 분산을 1로 표준화

In [38]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(boston)
X = scaler.transform(boston)
X

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

In [40]:
print("X mean : ", X.mean())
print("X std : ", X.std())

X mean :  -1.1147462804871136e-15
X std :  0.9999999999999994


In [41]:
X = pd.DataFrame(X, columns = boston_dataset.feature_names)
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


In [51]:
y = boston_dataset.target
y

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [42]:
from sklearn.decomposition import PCA
pca = PCA(n_components=13)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=13, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [43]:
print("고유값 : \n", pca.singular_values_)
print("고유 벡터 :\n", pca.components_)

고유값 : 
 [55.6793095  26.93022859 25.07516773 20.83105866 20.55278239 18.23864114
 16.45874174 14.15716218 11.83779223 10.55653065  9.70171478  9.25566343
  5.66883461]
고유 벡터 :
 [[ 0.2509514  -0.25631454  0.34667207  0.00504243  0.34285231 -0.18924257
   0.3136706  -0.32154387  0.31979277  0.33846915  0.20494226 -0.20297261
   0.30975984]
 [-0.31525237 -0.3233129   0.11249291  0.45482914  0.21911553  0.14933154
   0.31197778 -0.34907    -0.27152094 -0.23945365 -0.30589695  0.23855944
  -0.07432203]
 [ 0.24656649  0.29585782 -0.01594592  0.28978082  0.12096411  0.59396117
  -0.01767481 -0.04973627  0.28725483  0.22074447 -0.32344627 -0.3001459
  -0.26700025]
 [ 0.06177071  0.12871159  0.01714571  0.81594136 -0.12822614 -0.28059184
  -0.17520603  0.21543585  0.13234996  0.10333509  0.28262198  0.1684985
   0.06941441]
 [ 0.08215692  0.32061699 -0.00781119  0.08653094  0.13685356 -0.4234472
   0.01669085  0.09859225 -0.20413162 -0.13046057 -0.58400223 -0.34560695
   0.39456113]
 [-0.219659

In [44]:
pca.singular_values_/sum(pca.singular_values_)

array([0.22731479, 0.1099446 , 0.10237118, 0.0850443 , 0.08390821,
       0.07446057, 0.067194  , 0.05779763, 0.04832864, 0.0430978 ,
       0.03960795, 0.03778691, 0.02314343])

In [45]:
sum(pca.singular_values_[0:8])/sum(pca.singular_values_)

0.8080352767158057

In [46]:
z = pca.transform(X)
z

array([[-2.09829747,  0.77311275,  0.34294273, ..., -0.03300036,
         0.01944023,  0.36597533],
       [-1.45725167,  0.59198521, -0.69519931, ..., -0.64080983,
        -0.12579741, -0.07071949],
       [-2.07459756,  0.5996394 ,  0.1671216 , ..., -0.48755672,
         0.13332653, -0.0140218 ],
       ...,
       [-0.31236047,  1.15524644, -0.40859759, ...,  0.11565634,
         0.28196407,  0.06247358],
       [-0.27051907,  1.04136158, -0.58545406, ...,  0.0870124 ,
         0.30170082,  0.05436991],
       [-0.12580322,  0.76197805, -1.294882  , ...,  0.18432101,
         0.23273318,  0.01970872]])

In [47]:
z.shape

(506, 13)

In [49]:
zDF = pd.DataFrame(z[:, 0:8])
zDF

Unnamed: 0,0,1,2,3,4,5,6,7
0,-2.098297,0.773113,0.342943,-0.891774,0.423070,-0.315338,0.318641,0.295832
1,-1.457252,0.591985,-0.695199,-0.487459,-0.195876,0.264223,0.553861,-0.223670
2,-2.074598,0.599639,0.167122,-0.739204,-0.934534,0.448095,0.484560,0.105166
3,-2.611504,-0.006871,-0.100284,-0.343721,-1.104956,0.664649,0.622641,0.255941
4,-2.458185,0.097712,-0.075348,-0.427907,-1.065924,0.617047,0.705086,-0.134524
...,...,...,...,...,...,...,...,...
501,-0.314968,0.724285,-0.860896,-0.434740,-1.121040,0.508064,0.199056,-0.249896
502,-0.110513,0.759308,-1.255979,-0.309376,-0.891542,0.408208,0.191779,-0.146502
503,-0.312360,1.155246,-0.408598,-0.786304,-1.595185,0.467947,0.294119,-0.638660
504,-0.270519,1.041362,-0.585454,-0.678134,-1.416024,0.482259,0.271597,-0.579344


In [52]:
from sklearn.linear_model import LinearRegression
linear_regression = LinearRegression()
linear_regression.fit(zDF, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [53]:
linear_regression.score(X = zDF, y = y)

0.7055977712036576

In [56]:
np.linalg.inv(np.array([[1, 2], [1, 3]]))

array([[ 3., -2.],
       [-1.,  1.]])