## 데이터 불러오기

In [1]:
import pandas as pd
# Read the dataset from the CSV file into a DataFrame.
df = pd.read_csv('data/auto.csv')
# A bare last expression makes the notebook render the frame.
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504.0,12.0,70,1,18.0
1,8,350.0,165.0,3693.0,11.5,70,1,15.0
2,8,318.0,150.0,3436.0,11.0,70,1,18.0
3,8,304.0,150.0,3433.0,12.0,70,1,16.0
4,8,302.0,140.0,3449.0,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,1,27.0
388,4,97.0,52.0,2130.0,24.6,82,2,44.0
389,4,135.0,84.0,2295.0,11.6,82,1,32.0
390,4,120.0,79.0,2625.0,18.6,82,1,28.0


## 전처리

In [2]:
# One-hot encode the `origin` column into origin_* indicator columns.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 defaults to
# bool (True/False), which would no longer match the 0/1 output shown here.
df = pd.get_dummies(df, columns=['origin'], dtype=int)
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,mpg,origin_1,origin_2,origin_3
0,8,307.0,130.0,3504.0,12.0,70,18.0,1,0,0
1,8,350.0,165.0,3693.0,11.5,70,15.0,1,0,0
2,8,318.0,150.0,3436.0,11.0,70,18.0,1,0,0
3,8,304.0,150.0,3433.0,12.0,70,16.0,1,0,0
4,8,302.0,140.0,3449.0,10.5,70,17.0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,27.0,1,0,0
388,4,97.0,52.0,2130.0,24.6,82,44.0,0,1,0
389,4,135.0,84.0,2295.0,11.6,82,32.0,1,0,0
390,4,120.0,79.0,2625.0,18.6,82,28.0,1,0,0


## X, y 나누기

In [3]:
# Regression target: the `mpg` column.
y = df['mpg']

In [4]:
# Feature matrix: every column except the `mpg` target.
X = df.drop(columns='mpg')

## 회귀분석 모듈 불러오기

In [5]:
from sklearn.linear_model import LinearRegression
# Create a linear regression estimator with its default settings.
reg = LinearRegression()

## 학습

In [6]:
# Fit the model on the feature matrix X and the target y.
reg.fit(X, y)

LinearRegression()

## 회귀모형의 정확도 평가

## RMSE

$$ 
RMSE = \sqrt {\sum_{i = 1}^{n} \dfrac {(\hat y_{i}-y_{i})^{2}} n}
$$  

In [7]:
# Predict on the same X the model was fitted on, so every metric computed
# from y_pred in this notebook is an in-sample (training) score.
y_pred = reg.predict(X)
# Preview the first five predictions only.
y_pred[:5]

array([14.95325212, 14.04009845, 15.23055101, 14.99408418, 14.90194083])

In [8]:
# Per-row error: predicted value minus observed value.
y_pred - y

0     -3.046748
1     -0.959902
2     -2.769449
3     -1.005916
4     -2.098059
         ...   
387    1.108037
388   -8.534024
389   -0.970261
390    1.100271
391   -2.552462
Name: mpg, Length: 392, dtype: float64

In [9]:
import numpy as np

In [10]:
# RMSE by hand: square the errors, sum them, divide by the number of
# observations, then take the square root.
np.sqrt(((y_pred - y) ** 2).sum() / y.count())

3.268351515330416

## R2_score

$$ 
R^{2} = \dfrac {TSS-RSS} {TSS} = 1 - \dfrac {RSS} {TSS}
$$

$$ 
TSS = \sum_{i = 1}^{n}(y_{i}-\overline{y})^{2}
$$

$$ 
RSS = \sum_{i = 1}^{n}(y_{i}-\hat{y}_{i})^{2}
$$

In [11]:
# Mean of the target; TSS is measured around this value.
y.mean()


23.44591836734694

In [12]:
# Total sum of squares: squared deviations of y from its mean.
tss = ((y - y.mean()) ** 2).sum()
tss

23818.99346938775

In [13]:
# Residual sum of squares: squared differences between y and the predictions.
rss = ((y - y_pred) ** 2).sum()
rss

4187.39167808295

In [15]:
# R^2 = 1 - RSS/TSS, computed by hand from the two sums of squares.
# Stored as `r2_manual` rather than `r2_score`: that name is later bound to
# sklearn.metrics.r2_score, and re-running this cell after that import would
# replace the function with a float and make `r2_score(y, y_pred)` fail.
r2_manual = 1 - rss / tss
r2_manual

0.8241994699119172

## sklearn 패키지를 이용한 평가지표 구하기

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html
* https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html

### RMSE

In [17]:
# Mean squared error between the observed and predicted values.
mean_squared_error(y_true=y, y_pred=y_pred)

10.682121627762628

In [18]:
# RMSE is the square root of the MSE.
np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))

3.268351515330416

### R2_Score

In [19]:
# Coefficient of determination from sklearn, for comparison with 1 - RSS/TSS.
r2_score(y_true=y, y_pred=y_pred)

0.8241994699119172