In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Boston housing data, fetched from the CMU StatLib archive.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Whitespace-delimited file without a header row; the first 22 lines are skipped
# (presumably the descriptive preamble of the file — TODO confirm).
# The separator is a raw string so that "\s" is passed to the regex engine as-is
# instead of being treated as an (invalid) Python string escape.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Each record spans two consecutive rows of raw_df: every column of the even rows
# plus the first two columns of the odd rows are stacked side by side as features.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
# The third column of the odd rows is kept separately as the target.
target = raw_df.values[1::2, 2]

In [4]:
# Attach the feature names to the stacked array, in column order.
feature_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
boston = pd.DataFrame(data=data, columns=feature_names)

In [5]:
# Show the feature DataFrame (the notebook renders a truncated preview).
boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


| 컬럼명  | 설명                                           |
|---------|------------------------------------------------|
| CRIM    | 지역별 범죄 발생률                             |
| ZN      | 25,000평방피트를 초과하는 거주 지역의 비율    |
| INDUS   | 비소매 상업 지역이 차지하는 토지 비율          |
| CHAS    | 찰스강에 대한 더미 변수 (1: 강의 경계에 위치, 0: 그 외) |
| NOX     | 일산화질소 농도                                |
| RM      | 주택당 평균 방 개수                            |
| AGE     | 1940년 이전에 건축된 소유 주택의 비율          |
| DIS     | 5개 주요 고용센터까지의 가중 거리              |
| RAD     | 고속도로 접근 용이도                           |
| TAX     | 10,000달러당 재산세율                          |
| PTRATIO | 지역의 교사와 학생 수 비율                     |
| B       | 1000(Bk − 0.63)², Bk는 지역의 흑인 거주 비율   |
| LSTAT   | 하위 계층의 비율                               |
| MEDV    | 본인 소유 주택 가격의 중앙값 (단위: 1,000달러, 예측 대상) |

In [6]:
# Summarise each column's dtype and non-null count, plus total memory usage.
boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    float64
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    float64
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
dtypes: float64(13)
memory usage: 51.5 KB


In [7]:
boston.describe().T
# Transpose (.T): swap rows and columns so that each feature becomes one row of the summary.

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,506.0,3.613524,8.601545,0.00632,0.082045,0.25651,3.677083,88.9762
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


In [8]:
from scipy.stats import shapiro, kstest, norm
# scipy.stats : statistical testing utilities
# shapiro     : Shapiro-Wilk normality test (commonly recommended for small samples)
# kstest      : Kolmogorov-Smirnov test, used below to compare each column with a normal distribution
# norm        : the normal-distribution object

# Normality-test results, stored as {column: (test statistic, p-value)}.
results = {}
for column in boston.columns:
    values = boston[column]
    # Compare the column against a normal distribution whose mean and standard
    # deviation are taken from the column itself.
    # NOTE(review): because the parameters are estimated from the same sample,
    # the plain KS p-value is only approximate — confirm whether a
    # Lilliefors-type correction is needed if exact p-values matter.
    stat, p = kstest(values, 'norm', args=(values.mean(), values.std()))
    results[column] = (stat, p)

# Print every result; p > 0.05 means the normality hypothesis is not rejected.
for column, (stat, p) in results.items():
    print(f'{column} - Statistics={stat}, p-value={p}')
    if p > 0.05:
        print(f'{column}는 정규분포를 따릅니다.')
    else:
        print(f'{column}는 정규분포를 따르지 않습니다.')

# => The columns are judged non-normal, so MinMaxScaler is chosen for scaling.

CRIM - Statistics=0.3374739947048539, p-value=6.577508906954499e-52
CRIM는 정규분포를 따르지 않습니다.
ZN - Statistics=0.4221338010042449, p-value=2.8343702405949505e-82
ZN는 정규분포를 따르지 않습니다.
INDUS - Statistics=0.2184635911385101, p-value=1.1014658708932236e-21
INDUS는 정규분포를 따르지 않습니다.
CHAS - Statistics=0.5381455356507261, p-value=1.3281824544994916e-137
CHAS는 정규분포를 따르지 않습니다.
NOX - Statistics=0.1055243833372041, p-value=2.3242525676841285e-05
NOX는 정규분포를 따르지 않습니다.
RM - Statistics=0.08192106668316346, p-value=0.002111544733332488
RM는 정규분포를 따르지 않습니다.
AGE - Statistics=0.1480902806403489, p-value=3.7664493497864345e-10
AGE는 정규분포를 따르지 않습니다.
DIS - Statistics=0.13230506189505215, p-value=3.486049121482753e-08
DIS는 정규분포를 따르지 않습니다.
RAD - Statistics=0.31633292265651747, p-value=1.6333405316186914e-45
RAD는 정규분포를 따르지 않습니다.
TAX - Statistics=0.20766993462806982, p-value=1.282869737976958e-19
TAX는 정규분포를 따르지 않습니다.
PTRATIO - Statistics=0.1771664696913211, p-value=2.3032517826397184e-14
PTRATIO는 정규분포를 따르지 않습니다.
B - Stati

In [10]:
# 거리/수치 기반 알고리즘의 경우 특정 컬럼이 영향을 많이 주지 않게끔 스케일링 작업 필요!
from sklearn.preprocessing import MinMaxScaler   # 0 ~ 1 사이의 값으로 범위 조정

In [11]:
# Initialise the scaler.
mm_scaler = MinMaxScaler()

X_scaled = mm_scaler.fit_transform(boston)   # learn the scaling reference and apply it in one step
# fit: learns the scaling reference from the data it is given.
# Caution: once the data is split into train and test, call fit / fit_transform on the train set only and use transform (never fit) on the test set.
# NOTE(review): here the scaler is fitted on the full dataset before the train/test split, which contradicts the caution above (test rows influence the scaling) — consider splitting first.

In [12]:
# Wrap the scaled array back into a DataFrame, restoring the original column labels.
X_scaled = pd.DataFrame(data=X_scaled, columns=boston.columns)

In [13]:
# Preview the scaled feature DataFrame.
X_scaled

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.000000,0.18,0.067815,0.0,0.314815,0.577505,0.641607,0.269203,0.000000,0.208015,0.287234,1.000000,0.089680
1,0.000236,0.00,0.242302,0.0,0.172840,0.547998,0.782698,0.348962,0.043478,0.104962,0.553191,1.000000,0.204470
2,0.000236,0.00,0.242302,0.0,0.172840,0.694386,0.599382,0.348962,0.043478,0.104962,0.553191,0.989737,0.063466
3,0.000293,0.00,0.063050,0.0,0.150206,0.658555,0.441813,0.448545,0.086957,0.066794,0.648936,0.994276,0.033389
4,0.000705,0.00,0.063050,0.0,0.150206,0.687105,0.528321,0.448545,0.086957,0.066794,0.648936,1.000000,0.099338
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.000633,0.00,0.420455,0.0,0.386831,0.580954,0.681771,0.122671,0.000000,0.164122,0.893617,0.987619,0.219095
502,0.000438,0.00,0.420455,0.0,0.386831,0.490324,0.760041,0.105293,0.000000,0.164122,0.893617,1.000000,0.202815
503,0.000612,0.00,0.420455,0.0,0.386831,0.654340,0.907312,0.094381,0.000000,0.164122,0.893617,1.000000,0.107892
504,0.001161,0.00,0.420455,0.0,0.386831,0.619467,0.889804,0.114514,0.000000,0.164122,0.893617,0.991301,0.131071


In [15]:
# train, test 분리
from sklearn.model_selection import train_test_split

In [16]:
# Inspect the target array that is used as the labels for the split.
target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

In [22]:
# Split features and labels into train and test sets (30% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    target,             # ground-truth labels
    test_size=0.3,
    random_state=2024,  # fixed seed so the split is reproducible
)

In [24]:
# Check that the feature and label shapes of each split line up.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((354, 13), (354,), (152, 13), (152,))

In [23]:
# 일반 선형 모델로 예측을 수행해보자
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score  # 교차 검증 수행

In [26]:
# 5-fold cross-validation on the training set. No `scoring` argument is given, so
# the estimator's own score is used, which for LinearRegression is R^2: the result
# is the share of target variance the model explains, not a classification accuracy.
fold_scores = cross_val_score(LinearRegression(), X_train, y_train, cv=5)
fold_scores.mean()

0.6573103472280185

![image.png](attachment:4a44472e-9b38-41a9-865a-bee11b4f06cd.png)
- 회귀의 평가를 위한 지표는 실제 값과 회귀 예측값의 차이를 기반으로 합니다.
- 회귀 평가지표 MAE, MSE, RMSE, MSLE, RMSLE는 값이 작을수록 회귀 성능이 좋은 것입니다.
- 값이 작을수록 예측값과 실제값의 차이가 작다는 뜻이기 때문입니다. 반면, R²는 값이 클수록(1에 가까울수록) 성능이 좋습니다.

- 평균 절대 오차 (MAE) : 실제 값과 예측 값의 차이를 절댓값으로 변환해 평균한 것
- 평균 제곱 오차 (MSE) : 실제 값과 예측 값의 차이를 제곱해 평균한 것. 제곱을 통해 오차의 부호를 없애 항상 0 이상의 값이 됨
  - 단점: 절댓값이 1보다 작은 오차는 제곱하면 더 작아지고, 1보다 큰 오차는 더 커지므로 큰 오차(이상치)에 민감하다.
- 평균 제곱근 오차 (RMSE) : MSE에 제곱근을 취한 것. 제곱으로 인해 커진 값을 원래 데이터와 같은 단위로 되돌리기 위해 루트를 씌움
  - MSE 값은 오류의 제곱을 구하므로 실제 오류 평균보다 더 커지는 특성이 있어 MSE에 루트를 씌운 RMSE 값을 쓰는 것.