# 1. PCA를 이용해서 diabetes 데이터셋에 적용해보기

In [3]:
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 1. Load the data
# Fetch the scikit-learn Diabetes dataset and wrap its feature matrix in a
# DataFrame whose columns carry the dataset's feature names.
diabetes_data = load_diabetes()
diabetes_df = pd.DataFrame(
    data=diabetes_data.data,
    columns=diabetes_data.feature_names,
)

# Bare expression: displays the frame as the notebook cell's output.
diabetes_df

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [9]:
# Use the 'bp' column as the regression target and every other column as a
# feature.
# NOTE(review): the dataset's own target (diabetes_data.target) is not used
# here; confirm that predicting 'bp' from the other features is intended.
X = diabetes_df.drop(columns='bp')
y = diabetes_df['bp']

# Print the features first, then the target.
for part in (X, y):
    print(part)

          age       sex       bmi        s1        s2        s3        s4  \
0    0.038076  0.050680  0.061696 -0.044223 -0.034821 -0.043401 -0.002592   
1   -0.001882 -0.044642 -0.051474 -0.008449 -0.019163  0.074412 -0.039493   
2    0.085299  0.050680  0.044451 -0.045599 -0.034194 -0.032356 -0.002592   
3   -0.089063 -0.044642 -0.011595  0.012191  0.024991 -0.036038  0.034309   
4    0.005383 -0.044642 -0.036385  0.003935  0.015596  0.008142 -0.002592   
..        ...       ...       ...       ...       ...       ...       ...   
437  0.041708  0.050680  0.019662 -0.005697 -0.002566 -0.028674 -0.002592   
438 -0.005515  0.050680 -0.015906  0.049341  0.079165 -0.028674  0.034309   
439  0.041708  0.050680 -0.015906 -0.037344 -0.013840 -0.024993 -0.011080   
440 -0.045472 -0.044642  0.039062  0.016318  0.015283 -0.028674  0.026560   
441 -0.045472 -0.044642 -0.073030  0.083740  0.027809  0.173816 -0.039493   

           s5        s6  
0    0.019907 -0.017646  
1   -0.068332 -0.092204

In [11]:
# 2. Standardize the data
# Rescale every feature column to zero mean and unit variance before PCA.
# The scaler is fitted on all rows of X in this cell.
scaler = StandardScaler().fit(X)
X_scaled_data = scaler.transform(X)

# Bare expression: displays the standardized array as the cell's output.
X_scaled_data

array([[ 0.80050009,  1.06548848,  1.29708846, ..., -0.05449919,
         0.41853093, -0.37098854],
       [-0.03956713, -0.93853666, -1.08218016, ..., -0.83030083,
        -1.43658851, -1.93847913],
       [ 1.79330681,  1.06548848,  0.93453324, ..., -0.05449919,
         0.06015558, -0.54515416],
       ...,
       [ 0.87686984,  1.06548848, -0.33441002, ..., -0.23293356,
        -0.98564884,  0.32567395],
       [-0.9560041 , -0.93853666,  0.82123474, ...,  0.55838411,
         0.93616291, -0.54515416],
       [-0.9560041 , -0.93853666, -1.53537419, ..., -0.83030083,
        -0.08875225,  0.06442552]])

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Split the data 60/20/20 into train / validation / test sets.
# The raw features ``X`` are split (not ``X_scaled_data``, which was
# standardized using every row) so that all preprocessing statistics used
# below visibly come from the training rows only and the data is scaled once.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Scaling: fit on the training set only, then apply the same transform to the
# validation and test sets.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# PCA: learn the 2 principal components from the training set only, then
# project the validation and test sets onto them.
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Supervised model: random forest regressor trained on the PCA features.
# random_state is fixed so the reported score is reproducible between runs.
# (The variable is a regressor despite the name ``clf``; the name is kept so
# that any other cell referring to it keeps working.)
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train_pca, y_train)

# Predict on the validation set.
y_val_pred = clf.predict(X_val_pca)

# Evaluate with mean squared error on the validation set.
# X_test_pca / y_test are not used in this cell; presumably they are held
# back for a final evaluation elsewhere.
mse = mean_squared_error(y_val, y_val_pred)
print("Validation MSE:", mse)

Validation MSE: 0.0022471040229311737
