# 선형회귀 모델 작성, 예측, 평가

# LinearRegression & RandomForestRegressor

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [34]:
# Read the premium data set from its relative path and display the frame.
df = pd.read_csv(filepath_or_buffer='./data1/premium.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [35]:
# Split the raw frame into a target vector and a feature matrix (NumPy arrays),
# then show the frame's dimensions.
# NOTE(review): `df` still holds its string columns (sex, smoker, region) here,
# and the models further down are trained on `X1` instead of this `X`.
y = df['charges'].to_numpy()
X = df.drop(columns='charges').to_numpy()
df.shape

(1338, 7)

In [36]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## 결측치 제거

In [48]:
# Impute missing BMI values with the column mean.
# NOTE(review): the mean is taken over the whole frame, i.e. before the
# train/test split performed further down.
bmi_mean = df['bmi'].mean()
df['bmi'] = df['bmi'].fillna(value=bmi_mean)

# Sanity check: number of BMI values still missing (0 means the fill worked).
print(df['bmi'].isna().sum())

0


## 문자형 변수 숫자로 변환

In [49]:
# Encode the string-valued columns (sex, region, smoker) as integers on a copy,
# leaving the source frame `df` untouched.
# NOTE(review): `region` is a nominal category; coding it 0-3 imposes an
# artificial order on a linear model. The original coding is kept so that the
# results further down stay comparable — consider one-hot encoding instead.
category_codes = {
    'sex': {'female': 0, 'male': 1},
    'region': {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3},
    'smoker': {'no': 0, 'yes': 1},
}

df_encoded = df.copy()
for column, codes in category_codes.items():
    # Series.map() silently turns any label missing from the dict into NaN,
    # which would only surface later as an opaque fit() error. Fail here with
    # the offending labels instead. Genuine missing values are passed through.
    unexpected = set(df_encoded[column].dropna().unique()) - set(codes)
    if unexpected:
        raise ValueError(
            f"Column '{column}' contains unmapped labels: {sorted(map(str, unexpected))}"
        )
    df_encoded[column] = df_encoded[column].map(codes)

print(df_encoded.head())

   age  sex     bmi  children  smoker  region      charges
0   19    0  27.900         0       1       3  16884.92400
1   18    1  33.770         1       0       2   1725.55230
2   28    1  33.000         3       0       2   4449.46200
3   33    1  22.705         0       0       1  21984.47061
4   32    1  28.880         0       0       1   3866.85520


In [45]:
# Summarise column dtypes and non-null counts of the encoded frame.
# NOTE(review): this cell's execution count (45) is lower than those of the
# imputation (48) and encoding (49) cells above, and its saved output still
# reports 1333 non-null `bmi` values — the output looks stale. Re-run the
# notebook top to bottom to refresh it.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1333 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 73.3 KB


## X, y 다시 정의

In [50]:
# Target vector and feature frame taken from the encoded data.
y = df_encoded.loc[:, 'charges']
X1 = df_encoded.drop(columns='charges')

# 단독모델

In [51]:
# Hold out 30% of the rows as a test set; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.3,
    random_state=156,
)

# Fit ordinary least squares on the training split and predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Show the first three predictions.
y_pred[:3]

array([14474.70246359, -1367.94174448, 11182.0795591 ])

 위 결과
 - LinearRegression 모델이 예측한 의료비(charges) 추정값 중 처음 3개 (테스트 세트의 앞 3개 샘플)
 - 선형 회귀는 출력값의 범위를 제한하지 않기 때문에, 입력값 조합에 따라 예측값이 0보다 작아져 음수가 나올 수 있음. 하지만 의료비는 음수가 될 수 없는 값이므로, 개선이 필요

In [52]:
# Evaluation: mean squared error on the test split and its square root (RMSE).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
(rmse, mse)

(np.float64(5892.287122021437), np.float64(34719047.52833968))

위 결과
- y_test: 실제 의료비 (실제 값)
- y_pred: 예측된 의료비
- mean_squared_error: 오차의 제곱 평균 → MSE
- np.sqrt(mse): MSE의 제곱근 → RMSE
- RMSE ≈ 5892이면, 의료비 예측 오차의 전형적인 크기가 charges와 같은 단위로 약 5,892 정도라는 뜻 (오차를 제곱하므로 큰 오차일수록 더 크게 반영됨)
- 일반적으로 RMSE가 작을수록 성능이 좋음

In [53]:
# Coefficient of determination (R²) on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

np.float64(0.7314050294401666)

위 결과
- 모델이 **전체 변동의 약 73.1%**를 설명하고 있다는 뜻
- 1에 가까울수록 좋은 모델
- 0이면 항상 평균값으로 예측하는 것과 같은 수준이고, 음수면 평균값으로 예측하는 것보다도 성능이 나쁜 모델

In [54]:
# Regression equation: the intercept (W0) followed by the coefficients (W1..Wn).
(lr.intercept_, lr.coef_)

(np.float64(-12749.427561095079),
 array([  257.5385329 ,  -339.97677884,   369.66261678,   471.40493778,
        23624.46109983,  -375.59873801]))

위 결과
- intercept_: 선형 회귀식의 y절편 (기준값)
    - 모든 독립변수가 0일 때 예측되는 charges 값
    - 여기선 -12749.4 → 음수지만, 나이·BMI 등이 모두 0인 경우는 현실에 없으므로 절편 자체의 해석보다는 계수 간 상대비교가 중요

- coef_: 각 독립변수의 회귀계수 (즉, 영향력)
    - charges = W0 + W1*age + W2*sex + ... 와 같은 식에서 W1~Wn


In [55]:
# Intercept and coefficients rounded to one decimal place, for readability only.
np.round(lr.intercept_, decimals=1), np.round(lr.coef_, decimals=1)

(np.float64(-12749.4),
 array([  257.5,  -340. ,   369.7,   471.4, 23624.5,  -375.6]))

위 결과 
- 위에서 출력된 절편과 계수를 소수점 1자리로 반올림한 결과
    - 보기 쉽게 출력하기 위한 가공


In [58]:
# Label each rounded coefficient with its feature name and sort descending.
# The labels come from `X_train`, the frame the model was fitted on, so they
# cannot drift out of sync with `lr.coef_` if `df` gains or reorders columns.
pd.Series(
    data=np.round(lr.coef_, 1),
    index=X_train.columns,
).sort_values(ascending=False)

smoker      23624.5
children      471.4
bmi           369.7
age           257.5
sex          -340.0
region       -375.6
dtype: float64

In [60]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model; scikit-learn reports MSE negated
# so that larger scores are always better.
neg_mse_scores = cross_val_score(
    estimator=lr,
    X=X1,
    y=y,
    cv=5,
    scoring='neg_mean_squared_error',
)
neg_mse_scores

array([-37353966.147801  , -38018280.71475136, -32981193.39000173,
       -39560881.14778336, -37174240.90789755])

In [61]:
# Flip the sign to recover per-fold MSE, then take the square root for per-fold RMSE.
RMSE = np.sqrt(-neg_mse_scores)
RMSE.mean(), RMSE

(np.float64(6081.4847105593835),
 array([6111.78911186, 6165.89658645, 5742.92550796, 6289.74412419,
        6097.06822234]))

In [62]:
# Per-fold R² from 5-fold cross-validation, followed by the mean.
r2_scores = cross_val_score(estimator=lr, X=X1, y=y, cv=5, scoring='r2')
r2_scores, r2_scores.mean()

(array([0.75962321, 0.70729102, 0.77528105, 0.73350581, 0.7552539 ]),
 np.float64(0.7461909971637162))

In [68]:
# Linear regression: fit on the training split, score on the held-out split.
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

lr_r2 = r2_score(y_true=y_test, y_pred=lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=lr_pred))

# Report both metrics under a model heading, one per line.
print(
    "Linear Regression",
    f"RMSE: {lr_rmse:.2f}",
    f"R²: {lr_r2:.4f}",
    sep="\n",
)

Linear Regression
RMSE: 5892.29
R²: 0.7314


In [77]:
# Random forest (100 trees, fixed seed): fit on the training split, score on the held-out split.
rf = RandomForestRegressor(n_estimators=100, random_state=200).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

rf_r2 = r2_score(y_true=y_test, y_pred=rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=rf_pred))

# Report both metrics under a model heading, one per line.
print(
    "Random Forest Regression",
    f"RMSE: {rf_rmse:.2f}",
    f"R²: {rf_r2:.4f}",
    sep="\n",
)

Random Forest Regression
RMSE: 4760.46
R²: 0.8247


RMSE: 예측값과 실제값 사이의 평균 오차 크기 (작을수록 좋음)

R² (결정계수): 모델이 데이터를 얼마나 잘 설명하는지 (1에 가까울수록 좋음)

# 교차검증

In [79]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
import numpy as np

# Model under evaluation; the fixed seed makes every fold's forest reproducible.
rf = RandomForestRegressor(random_state=42)

# Score both metrics in a single 5-fold pass. Calling cross_val_score once per
# metric would fit the forest on every fold twice; cross_validate fits each fold
# once and evaluates all requested scorers on it.
cv_results = cross_validate(
    rf,
    X1,
    y,
    scoring=('neg_mean_squared_error', 'r2'),
    cv=5,
)

# --- (1) MSE-based scores: negate to recover MSE, then take the square root ---
neg_mse_scores = cv_results['test_neg_mean_squared_error']
rmse_scores = np.sqrt(-neg_mse_scores)

print("RMSE (각 fold):", rmse_scores)
print("평균 RMSE:", np.mean(rmse_scores))

# --- (2) R² scores ---
r2_scores = cv_results['test_r2']

print("R² (각 fold):", r2_scores)
print("평균 R²:", np.mean(r2_scores))


RMSE (각 fold): [4748.69945332 5508.3730006  4650.41986061 5052.94671873 4790.65796783]
평균 RMSE: 4950.21940021638
R² (각 fold): [0.85488738 0.76639064 0.85264745 0.82800682 0.84890016]
평균 R²: 0.8301664881701845


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
import numpy as np

# Model under evaluation.
lr = LinearRegression()

# Score both metrics in a single 5-fold pass. Calling cross_val_score once per
# metric would fit the model on every fold twice; cross_validate fits each fold
# once and evaluates all requested scorers on it.
cv_results = cross_validate(
    lr,
    X1,
    y,
    scoring=('neg_mean_squared_error', 'r2'),
    cv=5,
)

# --- (1) MSE-based scores: negate to recover MSE, then take the square root ---
neg_mse_scores = cv_results['test_neg_mean_squared_error']
rmse_scores = np.sqrt(-neg_mse_scores)

print("RMSE (각 fold):", rmse_scores)
print("평균 RMSE:", np.mean(rmse_scores))

# --- (2) R² scores ---
r2_scores = cv_results['test_r2']

print("R² (각 fold):", r2_scores)
print("평균 R²:", np.mean(r2_scores))

RMSE (각 fold): [6111.78911186 6165.89658645 5742.92550796 6289.74412419 6097.06822234]
평균 RMSE: 6081.4847105593835
R² (각 fold): [0.75962321 0.70729102 0.77528105 0.73350581 0.7552539 ]
평균 R²: 0.7461909971637162


# 다항회귀

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [84]:
# Same 70/30 split as before (identical seed).
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.3,
    random_state=156,
)

# Degree-2 polynomial regression: expand the features, then fit ordinary least
# squares. The bias column is left out of the expansion.
poly_steps = [
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('linear', LinearRegression()),
]
model_poly = Pipeline(steps=poly_steps)
model_poly.fit(X_train, y_train)

In [85]:
# Predict the held-out rows with the polynomial model and report the test MSE.
pred_poly = model_poly.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=pred_poly)

np.float64(21741384.61782171)

In [86]:
# Test-set R² of the polynomial model.
r2_score(y_true=y_test, y_pred=pred_poly)

np.float64(0.8318033766166195)

## 다항회귀 시뮬레이션

In [88]:
def _score_poly_degree(degree):
    """Fit a polynomial regression of the given degree and score it on the test split.

    Uses the notebook-level `X_train`, `y_train`, `X_test`, `y_test` split.
    Everything is kept in local names so the sweep does not overwrite the
    degree-2 `model_poly` / `pred_poly` (or `mse`, `rmse`, `r2`) defined by
    earlier cells; re-running those cells afterwards stays consistent.

    Parameters
    ----------
    degree : int
        Degree passed to `PolynomialFeatures`.

    Returns
    -------
    dict
        Keys 'degree', 'MSE', 'RMSE' and 'R2' for this degree.
    """
    candidate = Pipeline([
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
        ('linear', LinearRegression()),
    ])
    candidate.fit(X_train, y_train)
    candidate_pred = candidate.predict(X_test)
    candidate_mse = mean_squared_error(y_test, candidate_pred)
    return {
        'degree': degree,
        'MSE': candidate_mse,
        'RMSE': np.sqrt(candidate_mse),
        'R2': r2_score(y_test, candidate_pred),
    }


# Sweep polynomial degrees 1 through 9 and tabulate the test-set metrics.
# NOTE(review): the degrees are compared on the test split itself, so the best
# row is an optimistic estimate; a validation split or cross-validation would
# give a fairer choice of degree.
results = [_score_poly_degree(degree) for degree in range(1, 10)]

pd.DataFrame(results)

Unnamed: 0,degree,MSE,RMSE,R2
0,1,34719050.0,5892.287,0.731405
1,2,21741380.0,4662.766,0.8318034
2,3,22760610.0,4770.808,0.8239184
3,4,29928430.0,5470.688,0.7684664
4,5,43945280.0,6629.123,0.6600287
5,6,948632100000.0,973977.4,-7337.848
6,7,249857700000000.0,15806890.0,-1932959.0
7,8,20497270000000.0,4527391.0,-158570.8
8,9,744969400000000.0,27294130.0,-5763263.0
