In [1]:
import pandas as pd


In [2]:
df = pd.read_csv("./데이터/abalone.csv")

In [3]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# 특징과 라벨 분리
X = df.drop(['Sex'], axis = 1)
Y = df['Sex']

In [5]:
# 학습 데이터와 평가 데이터 분리
from sklearn.model_selection import train_test_split
Train_X, Test_X, Train_Y, Test_Y = train_test_split(X, Y)

In [6]:
Train_X.corr() # 특징 간 상관 행렬 출력 => 얼핏봐도 특징 간 선형 관계가 존재

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
Length,1.0,0.986902,0.819458,0.924268,0.897061,0.903226,0.897456,0.565495
Diameter,0.986902,1.0,0.825762,0.925003,0.893228,0.900361,0.904921,0.582394
Height,0.819458,0.825762,1.0,0.806752,0.763903,0.786599,0.803864,0.553198
Whole weight,0.924268,0.925003,0.806752,1.0,0.969892,0.966092,0.956372,0.547796
Shucked weight,0.897061,0.893228,0.763903,0.969892,1.0,0.932513,0.885259,0.430906
Viscera weight,0.903226,0.900361,0.786599,0.966092,0.932513,1.0,0.908703,0.512835
Shell weight,0.897456,0.904921,0.803864,0.956372,0.885259,0.908703,1.0,0.632364
Rings,0.565495,0.582394,0.553198,0.547796,0.430906,0.512835,0.632364,1.0


#### VIF 기준 특징 선택

In [7]:
# VIF 계산
from sklearn.linear_model import LinearRegression as LR
VIF_dict = dict()
for col in Train_X.columns:
    model = LR().fit(Train_X.drop([col], axis = 1), Train_X[col])
    r2 = model.score(Train_X.drop([col], axis = 1), Train_X[col]) 
    # LinearRegression의 score가 r2 점수임
    VIF = 1 / (1 - r2)
    VIF_dict[col] = VIF

In [8]:
VIF_dict 
# Height를 제외하곤 VIF가 모두 높으므로, 이러한 상황에서는 사실 PCA를 사용하는 것이 바람직

{'Length': 40.82597764588323,
 'Diameter': 42.47917359607689,
 'Height': 3.3883370642167328,
 'Whole weight': 112.11092981307453,
 'Shucked weight': 32.22709869372162,
 'Viscera weight': 17.30065714301831,
 'Shell weight': 21.59410782655036,
 'Rings': 2.119675662261056}

In [8]:
from sklearn.neural_network import MLPRegressor as MLP
from sklearn.metrics import mean_absolute_error as MAE

In [9]:
# 전체 특징을 모두 사용하였을 때
model = MLP(random_state = 2313, max_iter = 500)
model.fit(Train_X, Train_Y)
pred_Y = model.predict(Test_X)
score = MAE(Test_Y, pred_Y)
print(score)

1.5276345561460773


In [10]:
# VIF 점수가 30점 미만인 특징만 사용하였을 때 
selected_features = [key for key, val in VIF_dict.items() if val < 30] 
model = MLP(random_state = 2313, max_iter = 500)
model.fit(Train_X[selected_features], Train_Y)
pred_Y = model.predict(Test_X[selected_features])
score = MAE(Test_Y, pred_Y)
print(score)

1.5076713980631804


#### PCA 사용

In [12]:
from sklearn.decomposition import PCA
PCA_model = PCA(n_components = 3).fit(Train_X)

Train_Z = PCA_model.transform(Train_X)
Test_Z = PCA_model.transform(Test_X)

print(Train_Z.shape)

(3132, 3)


In [13]:
model = MLP(random_state = 2313, max_iter = 500)
model.fit(Train_Z, Train_Y)
pred_Y = model.predict(Test_Z)
score = MAE(Test_Y, pred_Y)
print(score)

1.4484133888721893
