# 회귀문제

## 분석 데이터 검토

In [3]:
import pandas as pd

# Load the housing dataset from the CSV file in the working directory.
CSV_PATH = 'house_price.csv'
data = pd.read_csv(CSV_PATH, encoding='utf-8')

# Preview the first five rows to check the column layout.
data.head(n=5)

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
0,23,6.777,0.141112,2.442244,8.10396,500000
1,49,6.0199,0.160984,2.726688,5.752412,500000
2,35,5.1155,0.249061,1.902676,3.888078,500000
3,32,4.7109,0.231383,1.913669,4.508393,500000
4,21,4.5625,0.255583,3.092664,4.667954,500000


In [4]:
# Report the dataset dimensions as a (rows, columns) tuple.
dataset_shape = data.shape
print(dataset_shape)

(17689, 6)


In [5]:
# Per-column summary statistics: count, mean, std, min, quartiles, max.
summary_stats = data.describe()
summary_stats

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value
count,17689.0,17689.0,17689.0,17689.0,17689.0,17689.0
mean,27.378823,3.671141,0.213278,2.952117,5.244001,189043.439313
std,11.28023,1.525937,0.051167,0.731573,1.184922,95487.122628
min,1.0,0.4999,0.1,0.75,1.64,14999.0
25%,18.0,2.5329,0.177464,2.47027,4.426829,114400.0
50%,28.0,3.4539,0.204104,2.854962,5.190779,171100.0
75%,36.0,4.5918,0.240157,3.316092,5.953728,242700.0
max,51.0,9.9055,0.498127,6.954023,11.901869,500000.0


In [None]:
# Draw 50-bin histograms of the columns on a 20x15 figure to inspect
# their distributions.
hist_options = {'bins': 50, 'figsize': (20, 15)}
data.hist(**hist_options)

## 특성(x)과 레이블(y) 나누기

In [10]:
# Three equivalent ways to pick the five feature columns (everything
# except the 'house_value' label).
feature_names = ['housing_age', 'income', 'bedrooms', 'households', 'rooms']

# 1) Explicit list of column names.
X1 = data[feature_names]
# 2) Positional slice of the column index (first five columns).
X2 = data[data.columns[:5]]
# 3) Label-based slice; .loc slices include both end points.
X3 = data.loc[:, 'housing_age':'rooms']

In [13]:
# All three selections should report the same (rows, 5) shape.
for feature_frame in (X1, X2, X3):
    print(feature_frame.shape)

(17689, 5)
(17689, 5)
(17689, 5)


In [14]:
# Keep the label as a one-column DataFrame (list selection) rather than
# a Series.
target_column = 'house_value'
y = data[[target_column]]

In [15]:
# Confirm the label frame has one column and as many rows as the data.
label_shape = y.shape
print(label_shape)

(17689, 1)


## train-test 데이터셋 나누기

In [16]:
from sklearn.model_selection import train_test_split

# random_state=1 fixes the shuffle so the split is reproducible. No
# test_size is passed, so scikit-learn's default hold-out share is used.
split_parts = train_test_split(X1, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Compare the mean label of both splits as a quick check that the
# train and test sets have similar target distributions.
for label_frame in (y_train, y_test):
    print(label_frame.mean())

house_value    189198.688226
dtype: float64
house_value    188577.797875
dtype: float64


## 정규화

In [20]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# One scaler per strategy: min-max scaling and standardization.
scaler_minmax, scaler_standard = MinMaxScaler(), StandardScaler()

### train data의 정규화

In [28]:
# Learn the scaling parameters from the training features and scale them
# in a single call (equivalent to fit() followed by transform()).
X_scaled_minmax_train = scaler_minmax.fit_transform(X_train)

In [29]:
# Learn the standardization parameters from the training features and
# apply them in a single call (equivalent to fit() followed by transform()).
X_scaled_standard_train = scaler_standard.fit_transform(X_train)

In [30]:
# The scaler returns a plain array; wrap it in a DataFrame to summarize
# each scaled column.
minmax_train_frame = pd.DataFrame(data=X_scaled_minmax_train)
minmax_train_frame.describe()

Unnamed: 0,0,1,2,3,4
count,13266.0,13266.0,13266.0,13266.0,13266.0
mean,0.528456,0.337348,0.284772,0.354756,0.351075
std,0.224891,0.16278,0.129692,0.118051,0.115554
min,0.0,0.0,0.0,0.0,0.0
25%,0.34,0.214861,0.194125,0.276713,0.271474
50%,0.54,0.313521,0.261454,0.339383,0.345944
75%,0.7,0.437101,0.351973,0.413497,0.420886
max,1.0,1.0,1.0,1.0,1.0


In [31]:
# The scaler returns a plain array; wrap it in a DataFrame to summarize
# each standardized column.
standard_train_frame = pd.DataFrame(data=X_scaled_standard_train)
standard_train_frame.describe()

Unnamed: 0,0,1,2,3,4
count,13266.0,13266.0,13266.0,13266.0,13266.0
mean,-3.749283e-17,-7.766372e-18,2.1424480000000002e-17,5.281133e-16,5.706945e-16
std,1.000038,1.000038,1.000038,1.000038,1.000038
min,-2.349918,-2.072498,-2.195848,-3.005221,-3.038316
25%,-0.8380196,-0.7524962,-0.6989694,-0.661121,-0.6888916
50%,0.0513325,-0.1463823,-0.1798046,-0.1302303,-0.0444062
75%,0.7628142,0.612836,0.5181721,0.4976055,0.6041656
max,2.096842,4.071007,5.515037,5.466011,5.616


### test data의 정규화

In [32]:
# Only transform() is called here: the test set is scaled with the
# parameters learned from the training set, never refit.
X_scaled_minmax_test = scaler_minmax.transform(X_test)

minmax_test_frame = pd.DataFrame(data=X_scaled_minmax_test)
minmax_test_frame.describe()

Unnamed: 0,0,1,2,3,4
count,4423.0,4423.0,4423.0,4423.0,4423.0
mean,0.524938,0.336617,0.283791,0.355531,0.351587
std,0.227736,0.160616,0.124947,0.117534,0.115225
min,0.0,0.0,0.038642,0.054685,0.007239
25%,0.34,0.219779,0.196146,0.278629,0.272149
50%,0.54,0.315057,0.261776,0.338958,0.346515
75%,0.7,0.430605,0.352273,0.413766,0.419226
max,1.0,0.978545,0.962714,0.948575,0.973105


In [34]:
# Only transform() is called here: the test set is standardized with the
# parameters learned from the training set, never refit.
X_scaled_standard_test = scaler_standard.transform(X_test)

standard_test_frame = pd.DataFrame(data=X_scaled_standard_test)
standard_test_frame.describe()

Unnamed: 0,0,1,2,3,4
count,4423.0,4423.0,4423.0,4423.0,4423.0
mean,-0.015645,-0.004487,-0.007567,0.006567,0.004428
std,1.012687,0.986744,0.963453,0.99566,0.997192
min,-2.349918,-2.072498,-1.89788,-2.54197,-2.975668
25%,-0.83802,-0.722287,-0.683391,-0.644889,-0.683049
50%,0.051332,-0.136944,-0.177326,-0.133829,-0.039462
75%,0.762814,0.572927,0.520485,0.499884,0.5898
max,2.096842,3.939196,5.227527,5.030378,5.383241


## 모델 학습

In [44]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the min-max scaled training
# features against the house_value labels.
model = LinearRegression()
model.fit(X=X_scaled_minmax_train, y=y_train)

In [45]:
# Predict on the training features and show the model's score (R^2 for
# scikit-learn regressors) on the same data.
train_features = X_scaled_minmax_train
pred_train = model.predict(train_features)
model.score(train_features, y_train)

0.5703486808769582

In [52]:
# Predict on the held-out features and show the model's score (R^2 for
# scikit-learn regressors) on unseen data.
test_features = X_scaled_minmax_test
pred_test = model.predict(test_features)
model.score(test_features, y_test)

0.5836850955113921

In [56]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Mean squared error on the test set; its square root (RMSE) is in the
# same units as house_value.
MSE = mean_squared_error(y_true=y_test, y_pred=pred_test)
np.sqrt(MSE)

61561.773394320095

In [58]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the test set, in the same units as house_value.
MAE = mean_absolute_error(y_true=y_test, y_pred=pred_test)
MAE

45901.08580055117

## 예측값 병합 및 저장

In [62]:
# Recompute the training-set predictions and show them next to the labels.
pred_train = model.predict(X_scaled_minmax_train)

# Build a new frame with assign() instead of writing a column into the
# object returned by train_test_split: in-place column assignment on a
# frame derived from another frame can emit SettingWithCopyWarning
# (depending on the pandas / scikit-learn versions in use).
# ravel() flattens the predictions to 1-D so the new column does not rely
# on pandas accepting an (n, 1) array (the model was fitted on a
# one-column DataFrame, so predict() returns a 2-D array).
# NOTE: y_train now carries an extra 'y_pred' column; re-split the data
# before re-running the fitting cell.
y_train = y_train.assign(y_pred=pred_train.ravel())
y_train

Unnamed: 0,house_value,y_pred
6013,214700,193871.382632
4484,241400,185582.480582
2419,298900,275631.419802
14025,104200,159585.461629
3860,256100,205996.153033
...,...,...
10955,146400,193567.327413
17289,55500,135868.762962
5192,228000,308319.475852
12172,130900,225533.840743


In [63]:
# Recompute the test-set predictions and show them next to the labels.
pred_test = model.predict(X_scaled_minmax_test)

# Build a new frame with assign() instead of writing a column into the
# object returned by train_test_split: in-place column assignment on a
# frame derived from another frame can emit SettingWithCopyWarning
# (depending on the pandas / scikit-learn versions in use).
# ravel() flattens the predictions to 1-D so the new column does not rely
# on pandas accepting an (n, 1) array (the model was fitted on a
# one-column DataFrame, so predict() returns a 2-D array).
# NOTE: y_test now carries an extra 'y_pred' column; re-split the data
# before re-running the scoring / metric cells.
y_test = y_test.assign(y_pred=pred_test.ravel())
y_test

Unnamed: 0,house_value,y_pred
3006,278100,154095.706614
8149,180700,182868.398536
4426,242700,113317.502861
11603,138100,253302.754810
13864,107000,110527.337210
...,...,...
13851,107200,156292.086817
8714,173100,203815.553226
9795,160400,143735.926581
11913,134800,198553.272830


In [65]:
# Place the unscaled test features next to the labels and predictions;
# concat aligns the two frames on their shared row index.
test_parts = [X_test, y_test]
total_test = pd.concat(test_parts, axis='columns')
total_test

Unnamed: 0,housing_age,income,bedrooms,households,rooms,house_value,y_pred
3006,16,3.0187,0.204797,2.090909,5.796791,278100,154095.706614
8149,17,3.8465,0.207249,2.895806,5.134993,180700,182868.398536
4426,40,3.1250,0.190299,4.700000,5.360000,242700,113317.502861
11603,9,5.6856,0.177462,2.855967,9.275720,138100,253302.754810
13864,31,1.9219,0.280615,3.959693,3.871401,107000,110527.337210
...,...,...,...,...,...,...,...
13851,45,2.8351,0.189500,2.774510,4.780392,107200,156292.086817
8714,45,4.0985,0.176536,3.317308,5.737179,173100,203815.553226
9795,26,3.0078,0.201908,2.912088,5.184066,160400,143735.926581
11913,41,3.8618,0.161687,2.453532,6.345725,134800,198553.272830


In [66]:
# Save the merged test results; the row index is written as the first
# CSV column.
output_path = 'regression_test.csv'
total_test.to_csv(output_path)