# Dimension Reduction - 차원 축소

---

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

## 1. PCA (Principal Component Analysis) - 주성분 분석

- 피처 선택(Feature selection)
- 피처 추출(Feature extraction)

### 1.1 PCA 적용 - Iris 데이터 셋

#### 1.1.1 데이터 로딩

In [None]:
# Visual Python: Machine Learning > Data Sets
# Load the Iris dataset that ships with scikit-learn.
from sklearn.datasets import load_iris

ldata = load_iris()

# Build the feature table and attach the label column in a single expression;
# `ldata.feature_names` supplies the column headers.
df_ldata = pd.DataFrame(ldata.data, columns=ldata.feature_names).assign(target=ldata.target)

# Bare expression: the notebook renders the frame as the cell output.
df_ldata

#### 1.1.2 데이터 시각화

In [None]:
# Visual Python: Visualization > Seaborn
# Scatter sepal length against sepal width, coloured by the `target` label.
# `target` holds integer codes, which seaborn would otherwise treat as a
# continuous quantity and draw on a sequential colour ramp. Naming a
# qualitative palette forces categorical mapping, one distinct colour per class.
sns.scatterplot(
    data=df_ldata,
    x='sepal length (cm)',
    y='sepal width (cm)',
    hue='target',
    palette='deep',
)
plt.show()

#### 1.1.3 데이터 스케일링

In [None]:
# Visual Python: Machine Learning > Data Prep
from sklearn.preprocessing import StandardScaler

# Standardizer used before PCA. The keyword arguments are the library
# defaults, written out so the configuration is visible at a glance:
# centre each feature on its mean and scale it to unit variance.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Fit the scaler on the measurement columns and standardize them in one call.
# df_ldata consists of the four measurement columns plus 'target', so dropping
# the label column selects exactly the features, in their original order.
X_scaled = scaler.fit_transform(df_ldata.drop(columns='target'))

#### 1.1.4 PCA 적용

In [None]:
# Visual Python: Machine Learning > Dimension
from sklearn.decomposition import PCA

# PCA estimator configured to project the data onto two components.
pca = PCA(
    n_components=2,  # number of principal components to keep
)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Fit the PCA model on the standardized features and project them in one call.
X_pca = pca.fit_transform(X_scaled)
# Bare expression: the notebook displays the projected array as the cell output.
X_pca

In [None]:
# Report the array shapes before and after the PCA projection.
print('원본 데이터 형태:', np.shape(X_scaled))
print('PCA 적용된 데이터 형태:', np.shape(X_pca))

#### 1.1.5 PCA 성분
- 각 주성분이 원본 데이터의 분산(정보)을 설명하는 비율

In [None]:
# Visual Python: Machine Learning > Model Info
# Read the fitted model's explained-variance ratio, one entry per kept component.
ratio = pca.explained_variance_ratio_
# Bare expression: the notebook displays the ratios as the cell output.
ratio

#### 1.1.6 PCA 적용된 DataFrame 생성

In [None]:
# Assemble the two PCA components and the class label into one frame.
# The label Series is aligned on its index, which matches the row order of X_pca.
df_pca = pd.DataFrame({
    'col1': X_pca[:, 0],
    'col2': X_pca[:, 1],
    'target': df_ldata['target'],
})

# Bare expression: the notebook renders the frame as the cell output.
df_pca

#### 1.1.7 PCA 적용된 데이터 시각화

In [None]:
# Visual Python: Visualization > Seaborn
# Scatter the two PCA components, coloured by the `target` label.
# `target` holds integer codes, which seaborn would otherwise treat as a
# continuous quantity and draw on a sequential colour ramp. Naming a
# qualitative palette forces categorical mapping, one distinct colour per class.
sns.scatterplot(
    data=df_pca,
    x='col1',
    y='col2',
    hue='target',
    palette='deep',
)
plt.show()

#### 1.1.8 분류 예측: 원본 데이터 사용

In [None]:
# Visual Python: Machine Learning > Pipeline
# Train and evaluate a random-forest classifier on the four original
# (unreduced) measurement columns of df_ldata.

# [1] Data Split
from sklearn.model_selection import train_test_split

# A fixed seed makes the split repeatable from run to run. Stratifying on the
# label keeps the class proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df_ldata[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']],
    df_ldata['target'],
    stratify=df_ldata['target'],
    random_state=42,
)

# [2] Classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest as well: its bootstrap and feature sampling are random,
# so an unseeded model gives different scores on every run.
model = RandomForestClassifier(random_state=42)

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics
from IPython.display import display, Markdown
# Confusion Matrix
display(Markdown('### Confusion Matrix'))
# Name both axes explicitly; by default the prediction axis is labelled 'col_0'.
display(pd.crosstab(y_test, pred, rownames=['actual'], colnames=['predicted'], margins=True))
# Classification report
print(metrics.classification_report(y_test, pred))

#### 1.1.9 분류 예측: PCA 적용된 데이터 사용

In [None]:
# Visual Python: Machine Learning > Pipeline
# Train and evaluate a random-forest classifier on the two PCA components
# stored in df_pca.

# [1] Data Split
from sklearn.model_selection import train_test_split

# A fixed seed makes the split repeatable from run to run. Stratifying on the
# label keeps the class proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df_pca[['col1', 'col2']],
    df_pca['target'],
    stratify=df_pca['target'],
    random_state=42,
)

# [2] Classifier
from sklearn.ensemble import RandomForestClassifier

# Seed the forest as well: its bootstrap and feature sampling are random,
# so an unseeded model gives different scores on every run.
model = RandomForestClassifier(random_state=42)

# [3] Fit
model.fit(X_train, y_train)

# [4] Predict
pred = model.predict(X_test)

# [5] Evaluation
from sklearn import metrics
from IPython.display import display, Markdown
# Confusion Matrix
display(Markdown('### Confusion Matrix'))
# Name both axes explicitly; by default the prediction axis is labelled 'col_0'.
display(pd.crosstab(y_test, pred, rownames=['actual'], colnames=['predicted'], margins=True))
# Classification report
print(metrics.classification_report(y_test, pred))

---

In [None]:
# End of file