### 회귀
- 범위형 데이터(키, 몸무게, 연봉 등)의 데이터 예측


In [2]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 불필요한 경고 안뜨게
import warnings
warnings.filterwarnings('ignore')

# 데이터 전처리 알고리즘(비지도학습)
# 문자열 데이터를 숫자로 변환한다.
from sklearn.preprocessing import LabelEncoder
# 표준편차 기반으로 표준화(컬럼 당 숫자 편차가 다르니까.)
# 잘못된 학습을 정상화 시키는 목적으로 사용함
from sklearn.preprocessing import StandardScaler # 젤 유명 표준화

# train data랑 test data 나눔
from sklearn.model_selection import train_test_split

#교차검증
from sklearn.model_selection import cross_val_score

# 평가 함수
# 정확도 평가 함수
from sklearn.metrics import accuracy_score

# mse 평가 함수
from sklearn.metrics import mean_squared_error

# 학습 알고리즘 - 분류
# 최 근접 이웃
# 학습시 : 주어진 데이터를 저장만 한다
# 예측시 : 주변의 데이터를 보고 가장 많은 결과로 결정한다
from sklearn.neighbors import KNeighborsClassifier

# 선형
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# 트리
# 학습시 : 주어진 데이터를 갖고 질문을 생성한다
# 예측시 : 질문을 통해 최종 결과 예측
from sklearn.tree import DecisionTreeClassifier

# 앙상블 - 다수의 알고리즘이 던지는 결과를 취합하여 최종 결과를 결정함
# 트리들을 사용한다
# 학습시 : 데이터를 랜덤하게 섞어서 80%를 추출한다.
# 추출된 데이터를 트리의 개수만큼 생성해 각 트리들에게 주고 학습 수행
# 예측시 : 각 트리가 던지는 결과를 취합하여 다수결의 원칙으로 최종결과 결정
from sklearn.ensemble import RandomForestClassifier

# 부스팅 - 앙상블 알고리즘이 잘못 예측한 데이터를 다시 학습하는 방식
# 학습과 예측 원리는 앙상블과 동일함
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 학습 알고리즘 - 회귀
# 최근접이웃
from sklearn.neighbors import KNeighborsRegressor

# 선형
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR

# 트리
from sklearn.tree import DecisionTreeRegressor
# 앙상블
from sklearn.ensemble import RandomForestRegressor
# 부스팅
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

### 데이터 불러오기

In [3]:
df1 = pd.read_csv('data/boston.csv')
df1

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0,0.5380,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.4690,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.4690,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.4580,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.4580,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.05023,35.0,6.06,0,0.4379,5.706,28.4,6.6407,1,304,16.9,394.02,12.43,17.1
332,0.03466,35.0,6.06,0,0.4379,6.031,23.3,6.6407,1,304,16.9,362.25,7.83,19.4
333,0.05083,0.0,5.19,0,0.5150,6.316,38.1,6.4584,5,224,20.2,389.71,5.68,22.2
334,0.03738,0.0,5.19,0,0.5150,6.310,38.5,6.4584,5,224,20.2,389.40,6.75,20.7


### 데이터 전처리

In [4]:
# 결측치 확인
df1.isna().sum()
# 모든 결과가 0이라는건 결측치가 없다

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
target     0
dtype: int64

In [5]:
# 컬럼의 타입을 확인한다.
# int : 정수, float : 실수, datetime : 날짜, object : 문자열
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 336 entries, 0 to 335
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     336 non-null    float64
 1   ZN       336 non-null    float64
 2   INDUS    336 non-null    float64
 3   CHAS     336 non-null    int64  
 4   NOX      336 non-null    float64
 5   RM       336 non-null    float64
 6   AGE      336 non-null    float64
 7   DIS      336 non-null    float64
 8   RAD      336 non-null    int64  
 9   TAX      336 non-null    int64  
 10  PTRATIO  336 non-null    float64
 11  B        336 non-null    float64
 12  LSTAT    336 non-null    float64
 13  target   336 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 36.9 KB


In [6]:
# 입력과 결과로 나눠준다.
X = df1.drop('target', axis=1)
y = df1['target']

display(X)
display(y)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.5380,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.4690,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.4690,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.4580,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.4580,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,0.05023,35.0,6.06,0,0.4379,5.706,28.4,6.6407,1,304,16.9,394.02,12.43
332,0.03466,35.0,6.06,0,0.4379,6.031,23.3,6.6407,1,304,16.9,362.25,7.83
333,0.05083,0.0,5.19,0,0.5150,6.316,38.1,6.4584,5,224,20.2,389.71,5.68
334,0.03738,0.0,5.19,0,0.5150,6.310,38.5,6.4584,5,224,20.2,389.40,6.75


0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
331    17.1
332    19.4
333    22.2
334    20.7
335    21.1
Name: target, Length: 336, dtype: float64

In [7]:
# 입력데이터 표준화
# 모든 컬럼의 값의 범위를 비슷한 수준으로 맞춰서
# 학습의 정상화를 보장받을 수 있게 한다.
scaler1 = StandardScaler()
X = scaler1.fit_transform(X)
X

array([[-0.63326028,  0.12485095, -1.03628034, ..., -1.07088823,
         0.41031054, -0.92655695],
       [-0.60097475, -0.58885279, -0.25831126, ...,  0.06047401,
         0.41031054, -0.22972861],
       [-0.60100552, -0.58885279, -0.25831126, ...,  0.06047401,
         0.31149419, -1.08568842],
       ...,
       [-0.56479774, -0.58885279, -0.56557636, ...,  1.14658176,
         0.23574309, -0.80930218],
       [-0.5854857 , -0.58885279, -0.56557636, ...,  1.14658176,
         0.22821654, -0.63006989],
       [-0.58205565, -0.58885279, -0.56557636, ...,  1.14658176,
         0.41031054, -0.41901131]])

### 데이터에 적합한 학습 알고리즘을 선택한다.


In [8]:
# 검증 첫 번째 방식
# 주어진 데이터를 학습용과 검증용으로 나누어 모든 모델을 테스트한다
# 전체 데이터를 랜덤하게 섞어서 80%의 학습용, 20%의 검증용을 추출한다.
# random_state를 지정해주면 랜덤의 패턴이 고정된다.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 1)

In [9]:
# 학습 모델 생성
model1 = KNeighborsRegressor()
model2 = LinearRegression()
model3 = Ridge()
model4 = Lasso()
model5 = ElasticNet()
model6 = DecisionTreeRegressor()
model7 = RandomForestRegressor()
model8 = AdaBoostRegressor()
model9 = GradientBoostingRegressor()

In [10]:
# 학습용 데이터를 학습한다.
model1.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)
model5.fit(X_train, y_train)
model6.fit(X_train, y_train)
model7.fit(X_train, y_train)
model8.fit(X_train, y_train)
model9.fit(X_train, y_train)

GradientBoostingRegressor()

In [11]:
# 검증용 데이터를 통해 결과를 예측한다.
pred1 = model1.predict(X_test)
pred2 = model2.predict(X_test)
pred3 = model3.predict(X_test)
pred4 = model4.predict(X_test)
pred5 = model5.predict(X_test)
pred6 = model6.predict(X_test)
pred7 = model7.predict(X_test)
pred8 = model8.predict(X_test)
pred9 = model9.predict(X_test)

In [12]:
# 평가지표 MSE
r1 = mean_squared_error(y_test, pred1)
r2 = mean_squared_error(y_test, pred2)
r3 = mean_squared_error(y_test, pred3)
r4 = mean_squared_error(y_test, pred4)
r5 = mean_squared_error(y_test, pred5)
r6 = mean_squared_error(y_test, pred6)
r7 = mean_squared_error(y_test, pred7)
r8 = mean_squared_error(y_test, pred8)
r9 = mean_squared_error(y_test, pred9)

In [13]:
print(r1)
print(r2)
print(r3)
print(r4)
print(r5)
print(r6)
print(r7)
print(r8)
print(r9)
# 그래디언트 부스팅이 mse제일 낮다

9.635623809523807
8.896911833790798
8.901413611096782
10.882990391103737
13.296078377014382
13.382142857142853
5.137522452380951
6.835716924102218
3.807461867205309


In [14]:
# 학습 모델 생성
model1 = KNeighborsRegressor()
model2 = LinearRegression()
model3 = Ridge()
model4 = Lasso()
model5 = ElasticNet()
model6 = DecisionTreeRegressor()
model7 = RandomForestRegressor()
model8 = AdaBoostRegressor()
model9 = GradientBoostingRegressor()

In [17]:
# 교차검증 수행
# cross_val_score(검증모델, 독립변수, 종속변수, 평가지표, 검증횟수)
r1 = cross_val_score(model1, X, y, scoring='neg_mean_squared_error', cv=10)
r2 = cross_val_score(model2, X, y, scoring='neg_mean_squared_error', cv=10)
r3 = cross_val_score(model3, X, y, scoring='neg_mean_squared_error', cv=10)
r4 = cross_val_score(model4, X, y, scoring='neg_mean_squared_error', cv=10)
r5 = cross_val_score(model5, X, y, scoring='neg_mean_squared_error', cv=10)
r6 = cross_val_score(model6, X, y, scoring='neg_mean_squared_error', cv=10)
r7 = cross_val_score(model7, X, y, scoring='neg_mean_squared_error', cv=10)
r8 = cross_val_score(model8, X, y, scoring='neg_mean_squared_error', cv=10)
r9 = cross_val_score(model9, X, y, scoring='neg_mean_squared_error', cv=10)
# 'neg_mean_squared_error' mse에 음수를 붙여서 높을수록 좋은애로 가자 한거임

In [19]:
# 각 모델의 성능 평균 수치를 출력
# 위에 neg_mse여서 절댓값 붙임
print(abs(r1.mean()))
print(abs(r2.mean()))
print(abs(r3.mean()))
print(abs(r4.mean()))
print(abs(r5.mean()))
print(abs(r6.mean()))
print(abs(r7.mean()))
print(abs(r8.mean()))
print(abs(r9.mean()))
# 교차검증 하니까 랜포 회귀 더 높게 나옴

30.414012869875222
12.744797887893485
12.676775756379831
15.032171935302582
19.293499518400147
17.1457495543672
9.790910440641714
10.82998622624301
11.106622489775392


In [20]:
model7

RandomForestRegressor()

In [21]:
# 선정된 모델에 전체 데이터를 학습시킨다 (몰랐노.... 데이터 많으면 학습데이터만 해도 된다)
best_model = RandomForestRegressor()
best_model.fit(X,y)

RandomForestRegressor()

### 미래데이터에 예측

In [23]:
# 예측할 데이터를 불러온다.
df2 = pd.read_csv('data/boston_new.csv')
df2

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.03427,0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.90,9.80
1,0.03041,0,5.19,0,0.515,5.895,59.6,5.6150,5,224,20.2,394.81,10.56
2,0.03306,0,5.19,0,0.515,6.059,37.3,4.8122,5,224,20.2,396.14,8.51
3,0.05497,0,5.19,0,0.515,5.985,45.4,4.8122,5,224,20.2,396.90,9.74
4,0.06151,0,5.19,0,0.515,5.968,58.5,4.8122,5,224,20.2,396.90,9.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,0.06263,0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
166,0.04527,0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
167,0.06076,0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64
168,0.10959,0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48


In [24]:
# 전치리
X = scaler1.transform(df2)
X

array([[-0.59026931, -0.58885279, -0.56557636, ...,  1.14658176,
         0.41031054, -0.11917412],
       [-0.59620652, -0.58885279, -0.56557636, ...,  1.14658176,
         0.35956701,  0.00813106],
       [-0.59213046, -0.58885279, -0.56557636, ...,  1.14658176,
         0.39185835, -0.3352579 ],
       ...,
       [-0.54952402, -0.58885279,  0.5360017 , ...,  1.50861767,
         0.41031054, -0.81600245],
       [-0.47441672, -0.58885279,  0.5360017 , ...,  1.50861767,
         0.3265473 , -0.67529673],
       [-0.57005817, -0.58885279,  0.5360017 , ...,  1.50861767,
         0.41031054, -0.4407872 ]])

In [25]:
# 예측한다.
pred = best_model.predict(X)
pred

array([20.097, 20.766, 21.501, 20.975, 20.863, 33.503, 25.355, 27.426,
       30.476, 20.704, 19.976, 25.718, 27.072, 30.102, 25.551, 27.12 ,
       22.179, 29.59 , 22.047, 22.642, 16.134, 18.999, 20.378, 19.848,
       23.106, 19.138, 20.06 , 17.717, 44.297, 22.04 , 17.642, 17.806,
       22.286, 28.066, 36.448, 22.58 , 21.469, 13.479, 13.479, 34.794,
       16.292, 24.663, 15.18 , 15.058, 32.553, 16.464, 15.205, 15.199,
       14.945, 14.502, 13.835, 15.42 , 14.103, 14.889, 15.878, 16.253,
       14.541, 17.879, 15.903, 16.385, 15.88 , 15.046, 14.746, 16.212,
       14.809, 15.471, 15.145, 14.776, 15.863, 14.989, 13.802, 18.712,
       14.673, 25.699, 20.652, 16.244, 14.31 , 15.192, 14.087, 14.897,
       23.436, 15.404, 14.667, 25.73 , 17.397, 17.121, 18.228, 15.791,
       16.593, 14.351, 18.173, 18.216, 15.883, 15.01 , 16.33 , 24.932,
       20.572, 16.433, 17.551, 16.282, 16.007, 14.782, 15.378, 15.36 ,
       15.539, 15.622, 16.392, 16.338, 14.607, 15.11 , 16.092, 16.437,
      

In [27]:
# 결과를 붙여주고 저장한다.
df2['target'] = pred
df2.to_csv('data/boston_result.csv', index=False)