### Abalone(전복) 나이 예측
- 피처: 'Shell_weight', 'Diameter', 'Height', 'Shucked_weight'

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#### 데이터 로드 & 전처리 & EDA

In [2]:
# Absolute locations of the training and test CSV files.
DATA_PATH_train = r'C:\Users\KDP-43\Desktop\머신러닝_플젝\dataset\mix_abalone_train.csv'
DATA_PATH_test = r'C:\Users\KDP-43\Desktop\머신러닝_플젝\dataset\abalone_test.csv'

# Load both files into DataFrames using pandas' default CSV parsing.
raw_trainDF = pd.read_csv(DATA_PATH_train)
raw_testDF = pd.read_csv(DATA_PATH_test)

# Preview the first three rows of the training data.
raw_trainDF.head(3)

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6


In [3]:
# Summarize the training frame: column names, non-null counts and dtypes.
raw_trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94792 entries, 0 to 94791
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             94792 non-null  object 
 1   Length          94792 non-null  float64
 2   Diameter        94792 non-null  float64
 3   Height          94792 non-null  float64
 4   Whole_weight    94792 non-null  float64
 5   Shucked_weight  94792 non-null  float64
 6   Viscera_weight  94792 non-null  float64
 7   Shell_weight    94792 non-null  float64
 8   Rings           94792 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.5+ MB


In [4]:
# Look for placeholder values that may stand in for missing data.
# The first column is skipped because it holds the categorical 'Sex' labels.

# Numeric columns: count of negative entries (e.g. a -1 sentinel).
check_minus = {col: raw_trainDF[col].lt(0).sum() for col in raw_trainDF.columns[1:]}
print(check_minus)

# Numeric columns: count of entries that are exactly zero.
check_zero = {col: raw_trainDF[col].eq(0).sum() for col in raw_trainDF.columns[1:]}
print(check_zero)

# Categorical column [Sex]: frequency of each category.
print("\n[SEX value counts]")
print(raw_trainDF['Sex'].value_counts())

{'Length': 0, 'Diameter': 0, 'Height': 0, 'Whole_weight': 0, 'Shucked_weight': 0, 'Viscera_weight': 0, 'Shell_weight': 0, 'Rings': 0}
{'Length': 0, 'Diameter': 0, 'Height': 8, 'Whole_weight': 0, 'Shucked_weight': 0, 'Viscera_weight': 0, 'Shell_weight': 0, 'Rings': 0}

[SEX value counts]
Sex
I    34435
M    32555
F    27802
Name: count, dtype: int64


In [5]:
# Rows with Height == 0 are treated as missing measurements and removed.
mask = raw_trainDF['Height'].eq(0)

# Boolean filtering plus reset_index yields a new frame with a fresh 0..n-1
# index; raw_trainDF itself is left untouched.
drop_trainDF = raw_trainDF[~mask].reset_index(drop=True)
drop_trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94784 entries, 0 to 94783
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             94784 non-null  object 
 1   Length          94784 non-null  float64
 2   Diameter        94784 non-null  float64
 3   Height          94784 non-null  float64
 4   Whole_weight    94784 non-null  float64
 5   Shucked_weight  94784 non-null  float64
 6   Viscera_weight  94784 non-null  float64
 7   Shell_weight    94784 non-null  float64
 8   Rings           94784 non-null  int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 6.5+ MB


In [6]:
# One-hot encode the categorical 'Sex' column; all other columns pass through.
tmpDF = pd.get_dummies(drop_trainDF, columns=['Sex'])
tmpDF.head(5)

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Sex_F,Sex_I,Sex_M
0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11,True,False,False
1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11,True,False,False
2,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6,False,True,False
3,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10,False,False,True
4,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9,False,True,False


In [7]:
# Move the target column 'Rings' to the right-most position:
# drop it from the encoded frame, then append it again as the last column.
rings_data = tmpDF['Rings']
encod_DF = tmpDF.drop(columns='Rings').assign(Rings=rings_data)

encod_DF.head(5)

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M,Rings
0,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,True,False,False,11
1,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,True,False,False,11
2,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,False,True,False,6
3,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,False,False,True,10
4,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,False,True,False,9


In [10]:
# Histograms of the 8 features, split by [Sex] category.
# NOTE(review): this cell contains no plotting code — restore it or remove the cell.

* Height ==> 높은 값의 이상치 때문에 상관계수가 높게 나온 듯


#### 다중회귀

##### 타겟/피쳐 분리
- 타겟: Rings
- 피쳐: 'Shell_weight', 'Diameter', 'Height', 'Shucked_weight' (Sex 및 나머지 칼럼은 사용하지 않음)

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [13]:
# List the available column names before choosing the feature subset.
drop_trainDF.columns

Index(['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
       'Viscera_weight', 'Shell_weight', 'Rings'],
      dtype='object')

In [14]:
# 타겟/피쳐 분리
featureDF = drop_trainDF[['Shell_weight', 'Diameter', 'Height', 'Shucked_weight']]
targetSR = drop_trainDF['Rings']

print(f'featureDF: {featureDF.shape}, {featureDF.ndim}D')
print(f'targetSR: {targetSR.shape}, {targetSR.ndim}D')

featureDF: (94784, 4), 2D
targetSR: (94784,), 1D


In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    featureDF,
    targetSR,
    test_size=0.2,
    random_state=12,
)

# Sanity-check the shapes of both splits.
print(f'X_train: {X_train.shape}, {X_train.ndim}D', f'y_train: {y_train.shape}, {y_train.ndim}D')
# This line reports the test split, so the target is labelled y_test
# (it was previously mislabelled as y_train).
print(f'X_test: {X_test.shape}, {X_test.ndim}D', f'y_test: {y_test.shape}, {y_test.ndim}D')


X_train: (75827, 4), 2D y_train: (75827,), 1D
X_test: (18957, 4), 2D y_train: (18957,), 1D


In [16]:
# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the multiple linear regression (fit() returns the estimator itself).
linear_model = LinearRegression().fit(X_train_scaled, y_train)

# Predict on the held-out split.
y_pred_linear = linear_model.predict(X_test_scaled)

# Evaluation metrics on the held-out split.
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

# One print call, one metric per line; the trailing "\n" keeps the blank line
# after the report.
print(
    "다중 회귀 성능 평가:",
    f"MSE: {mse_linear}",
    f"RMSE: {rmse_linear}",
    f"MAE: {mae_linear}",
    f"R²: {r2_linear}\n",
    sep="\n",
)

다중 회귀 성능 평가:
MSE: 4.013565486838061
RMSE: 2.0033885012243786
MAE: 1.381579052887035
R²: 0.5932987299765347



In [17]:
# 다항 회귀를 위한 다항 피처 생성 (2차)
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# 다항 회귀 모델 학습
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# 예측
y_pred_poly = poly_model.predict(X_test_poly)

# 다항 회귀 성능 평가
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
mae_poly = mean_absolute_error(y_test, y_pred_poly)
r2_poly = r2_score(y_test, y_pred_poly)

print("다항 회귀 성능 평가:")
print(f"MSE: {mse_poly}")
print(f"RMSE: {rmse_poly}")
print(f"MAE: {mae_poly}")
print(f"r2: {r2_poly}")

다항 회귀 성능 평가:
MSE: 3.781050098790855
RMSE: 1.9444922470379908
MAE: 1.3391548868576988
r2: 0.6168599011917318


In [18]:
degrees = [2, 3, 4, 5]

# Repeat the polynomial-regression experiment once per degree.
# The names assigned inside the loop are overwritten on every pass, so after
# the loop they hold the results for the last degree in the list.
for degree in degrees:
    # Polynomial feature expansion of the current degree.
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train_scaled)
    X_test_poly = poly.transform(X_test_scaled)

    # Fit on the expanded training features, then predict the held-out split.
    poly_model = LinearRegression().fit(X_train_poly, y_train)
    y_pred_poly = poly_model.predict(X_test_poly)

    # Evaluation metrics on the held-out split.
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    rmse_poly = np.sqrt(mse_poly)
    mae_poly = mean_absolute_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)

    # The leading empty string yields the blank separator line before each report.
    print(
        "",
        f"[{degree}차] 다항 회귀 성능 평가:",
        f"MSE: {mse_poly}",
        f"RMSE: {rmse_poly}",
        f"MAE: {mae_poly}",
        f"r2: {r2_poly}",
        sep="\n",
    )


[2차] 다항 회귀 성능 평가:
MSE: 3.781050098790855
RMSE: 1.9444922470379908
MAE: 1.3391548868576988
r2: 0.6168599011917318

[3차] 다항 회귀 성능 평가:
MSE: 3.7107943763619486
RMSE: 1.926342227217674
MAE: 1.3179881738826353
r2: 0.623979030462689

[4차] 다항 회귀 성능 평가:
MSE: 3.677718200870389
RMSE: 1.917737782093889
MAE: 1.3070703768349743
r2: 0.6273306943695196

[5차] 다항 회귀 성능 평가:
MSE: 3.9440097154618265
RMSE: 1.985953100015664
MAE: 1.3067639297985074
r2: 0.6003469320424895


4차 다항식에서 MSE, RMSE가 가장 낮고 r2-score가 가장 높음 (MAE는 5차가 근소하게 가장 낮지만, 5차에서는 MSE와 r2가 다시 나빠져 과적합이 의심됨)
다만 MSE, RMSE, MAE가 아직 0에 가깝지 않아 추가 튜닝이 필요한 모델