In [1]:
pip install dmba

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.2.4


In [3]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
# 주성분 분석(PCA)과 데이터 전처리를 수행하는데 필요한 모듈을 가져오는 것

from sklearn import preprocessing
# 데이터 전처리와 스케일링과 같은 데이터 준비 작업을 수행하는 데 사용, 데이터를 정규화하거나 표준화하는 등의 작업을 수행

import matplotlib.pylab as plt

import dmba

%matplotlib inline

In [4]:
# Load the Wine data set shipped with dmba and drop the 'Type' column
# so that it is not passed to PCA as an input variable.
wine_df = dmba.load_data('Wine.csv').drop(columns=['Type'])

In [5]:
# Fit PCA on the raw (unscaled) columns, skipping any row with a missing value.
pcs = PCA().fit(wine_df.dropna(axis=0))

# Summary table: one row per statistic, one column per principal component.
pcsSummary_df = pd.DataFrame({
    'Standard deviation': np.sqrt(pcs.explained_variance_),
    'Proportion of variance': pcs.explained_variance_ratio_,
    'Cumulative proportion': pcs.explained_variance_ratio_.cumsum(),
}).T
# Label the components PC1, PC2, ... in order.
pcsSummary_df.columns = ['PC{}'.format(k + 1) for k in range(pcsSummary_df.shape[1])]
pcsSummary_df.round(4)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13
Standard deviation,314.9632,13.1353,3.0722,2.2341,1.1085,0.9171,0.5282,0.3891,0.3348,0.2678,0.1938,0.1452,0.0906
Proportion of variance,0.9981,0.0017,0.0001,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cumulative proportion,0.9981,0.9998,0.9999,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


- PC1 요인이 변동의 99% 이상을 차지하는 것으로 나타나므로 정규화(표준화)를 통해 확인할 필요가 있다.

In [9]:
# Extract the loadings on the first five principal components.
# components_ stores one component per row, so transpose it to get
# one row per original variable and one column per component.
pcsComponents_df = pd.DataFrame(
    pcs.components_.T,
    index=wine_df.columns,
    columns=pcsSummary_df.columns,
)
pcsComponents_df.iloc[:, :5]

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
Alcohol,0.001659,0.001203,0.016874,0.141447,-0.020337
Malic_Acid,-0.000681,0.002155,0.122003,0.16039,0.612883
Ash,0.000195,0.004594,0.051987,-0.009773,-0.020176
Ash_Alcalinity,-0.004671,0.02645,0.938593,-0.330965,-0.064352
Magnesium,0.017868,0.999344,-0.02978,-0.005394,0.006149
Total_Phenols,0.00099,0.000878,-0.040485,-0.074585,-0.315245
Flavanoids,0.001567,-5.2e-05,-0.085443,-0.169087,-0.524761
Nonflavanoid_Phenols,-0.000123,-0.001354,0.013511,0.010806,0.029648
Proanthocyanins,0.000601,0.005004,-0.024659,-0.050121,-0.251183
Color_Intensity,0.002327,0.0151,0.291398,0.878894,-0.331747


- PC1 요인에 기여하는 변수들의 가중치를 확인하면 Proline이 0.999823으로 압도적이다.

In [7]:
# Per-column summary statistics, used to compare the scales of the variables.
wine_summary = wine_df.describe()
wine_summary.round(3)

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcalinity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280_OD315,Proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.001,2.336,2.367,19.495,99.742,2.295,2.029,0.362,1.591,5.058,0.957,2.612,746.893
std,0.812,1.117,0.274,3.34,14.282,0.626,0.999,0.124,0.572,2.318,0.229,0.71,314.907
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.362,1.602,2.21,17.2,88.0,1.742,1.205,0.27,1.25,3.22,0.782,1.938,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.678,3.082,2.558,21.5,107.0,2.8,2.875,0.438,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


- 각 변수의 평균과 표준편차를 비교하면 변수 간 척도 차이가 크며, 특히 Proline은 표준편차가 314.907로 다른 변수들보다 훨씬 크다. 이 차이를 조정하기 위해 각 변수를 분산이 1인 표준화된 변수로 대체하여 모든 변수의 변동에 동등한 중요성을 부여한다.

- 각 변수에서 평균을 뺀 뒤 표준편차로 나누어 준다.

정규화한 이후 주성분 점수 출력하기




In [23]:
# Standardize the selected columns with preprocessing.scale, then refit PCA on the result.
# Despite its name, `scores` holds the standardized input data, not PCA scores.
# NOTE(review): iloc[:, 3:] skips the first three columns (Alcohol, Malic_Acid, Ash
# per the describe() output), so this fit uses 10 variables while the unscaled PCA
# earlier in the notebook used all 13 — confirm the exclusion is intentional before
# comparing the two summaries.
scores = preprocessing.scale(wine_df.iloc[:, 3:].dropna(axis=0))
pcs = PCA().fit(scores)

# Summary table: one row per statistic, one column per principal component.
pcsSummary_df = pd.DataFrame({
    'Standard deviation': np.sqrt(pcs.explained_variance_),
    'Proportion of variance': pcs.explained_variance_ratio_,
    'Cumulative proportion': pcs.explained_variance_ratio_.cumsum(),
}).T
# Label the components PC1, PC2, ... in order.
pcsSummary_df.columns = ['PC{}'.format(k + 1) for k in range(pcsSummary_df.shape[1])]
pcsSummary_df.round(4)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
Standard deviation,2.0975,1.3304,0.9762,0.9056,0.84,0.6971,0.5869,0.5328,0.4285,0.3319
Proportion of variance,0.4375,0.176,0.0948,0.0815,0.0702,0.0483,0.0343,0.0282,0.0183,0.011
Cumulative proportion,0.4375,0.6135,0.7083,0.7898,0.86,0.9083,0.9426,0.9708,0.989,1.0


- PC1으로 99% 이상의 변동을 설명하던 이전과 달리 표준화 이후 90% 이상의 변동을 설명하기 위해서는 6개 또는 그 이상의 요인이 필요하다는 것을 알 수 있다.

In [20]:
# Loadings of the standardized variables on the first five principal components.
# The row labels must be the same column subset (from the fourth column on)
# that was standardized and passed to PCA.
pcsComponents_df = pd.DataFrame(
    pcs.components_.T,
    index=wine_df.columns[3:],
    columns=pcsSummary_df.columns,
)
pcsComponents_df.iloc[:, :5]

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
Ash_Alcalinity,-0.237343,-0.129058,0.78277,0.114388,0.085797
Magnesium,0.14452,0.418493,0.197506,0.78582,0.140403
Total_Phenols,0.413607,0.061257,0.190674,-0.245869,0.061567
Flavanoids,0.443197,-0.021184,0.14232,-0.16243,0.016772
Nonflavanoid_Phenols,-0.312604,-0.044262,0.130303,-0.285818,0.757712
Proanthocyanins,0.335454,0.084487,0.419892,-0.196278,-0.081966
Color_Intensity,-0.108832,0.652336,0.075205,-0.32814,-0.001694
Hue,0.303669,-0.34686,-0.166158,0.219961,0.510884
OD280_OD315,0.400652,-0.241555,0.100647,-0.059956,-0.058645
Proline,0.283647,0.436699,-0.234453,-0.051922,0.351515


- 가중치 또한 조정이 되어서 실제로 Proline 변수는 PC1 요인과 강하지 않은 양의 상관관계를 가졌다는 것을 확인해볼 수 있다.

In [21]:
# Project the standardized observations onto the fitted components to get the
# principal component scores, with columns labelled PC1, PC2, ...
scores2 = pd.DataFrame(
    pcs.transform(scores),
    columns=['PC{}'.format(k + 1) for k in range(len(pcsSummary_df.columns))],
)

# Show the scores on the first five components only.
scores2.iloc[:, :5]

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,3.074800,1.146801,0.099626,0.784978,0.067697
1,2.103100,0.159028,-2.272774,-0.119457,-0.259125
2,2.601263,0.781845,0.685887,-0.772424,0.113146
3,3.537905,2.401738,0.271687,-0.809211,-0.058558
4,1.084485,0.078511,1.023671,0.768285,0.564539
...,...,...,...,...,...
173,-2.916124,1.172950,-0.246170,-0.541361,0.287388
174,-2.443097,1.245600,0.568392,0.075814,-0.003877
175,-2.454070,2.966482,0.107039,0.547222,-0.070364
176,-2.511831,2.670503,0.278091,0.388572,0.553840


In [26]:
# Pairwise correlations between the principal component scores.
score_corr = scores2.corr()
score_corr.round(3)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
PC1,1.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,-0.0
PC2,-0.0,1.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0
PC3,0.0,-0.0,1.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0
PC4,-0.0,-0.0,-0.0,1.0,-0.0,0.0,0.0,0.0,-0.0,0.0
PC5,0.0,0.0,-0.0,-0.0,1.0,0.0,0.0,0.0,0.0,0.0
PC6,-0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.0
PC7,-0.0,0.0,-0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.0
PC8,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,1.0,-0.0,0.0
PC9,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.0,1.0,0.0
PC10,-0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,1.0


- 주성분 점수 간 상관관계가 모두 0으로 나타난다. 이는 표준화의 결과가 아니라 주성분들이 서로 직교하도록 구성되는 PCA의 성질에 따른 것으로, 각 요인이 서로 상관되어 있지 않음(무상관)을 보여준다. 단, 무상관이 곧 통계적 독립을 의미하는 것은 아니다.