In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [3]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [4]:
# 문제 데이터
housing_df=pd.DataFrame(housing.data,columns=housing.feature_names)
# 주택에 대한 일반적인 정보

In [5]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# 정답 데이터
housing_price=housing.target

## 1. train test로 분리

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train,X_test,y_train,y_test=train_test_split(housing_df,housing_price, test_size=3)

## 2. LinearRegression

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
linear_model = LinearRegression()

In [11]:
linear_model.fit(X_train,y_train)

In [12]:
# 회귀 모델의 Score -> R square
linear_model.score(X_test, y_test) 

0.3012213575143684

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
# 교차 검증
result = cross_val_score(linear_model,X_train,y_train,cv=5)

In [15]:
result

array([-0.10847506,  0.6156134 ,  0.58597485,  0.60569997,  0.61641945])

In [16]:
result.mean()

0.46304652146669395

## 3. 특성 확장

In [17]:
# 복사!
extended_X_train = X_train.copy()
extended_X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
15206,4.4219,9.0,5.784543,0.976581,904.0,2.117096,33.01,-117.06
20524,4.1060,10.0,5.781801,1.080780,2909.0,2.701021,38.55,-121.55
70,1.7719,26.0,6.047244,1.196850,392.0,3.086614,37.81,-122.29
10391,7.8061,2.0,11.110599,1.631336,703.0,3.239631,33.58,-117.65
15168,1.9176,24.0,4.234694,0.969388,279.0,1.423469,33.02,-117.06
...,...,...,...,...,...,...,...,...
17510,3.0139,35.0,3.606061,1.212121,377.0,3.808081,37.34,-121.92
8688,4.6406,34.0,5.042453,1.023585,601.0,2.834906,33.87,-118.34
2039,4.0027,13.0,5.785185,1.009877,1347.0,3.325926,36.75,-119.69
3240,1.8500,14.0,4.541219,1.039427,1077.0,3.860215,36.09,-119.56


In [18]:
for col1 in X_train.columns:
    for col2 in X_train.columns:
        extended_X_train[col1+'x'+col2] = X_train[col1]*X_train[col2]

In [19]:
extended_X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedIncxMedInc,MedIncxHouseAge,...,LatitudexLatitude,LatitudexLongitude,LongitudexMedInc,LongitudexHouseAge,LongitudexAveRooms,LongitudexAveBedrms,LongitudexPopulation,LongitudexAveOccup,LongitudexLatitude,LongitudexLongitude
15206,4.4219,9.0,5.784543,0.976581,904.0,2.117096,33.01,-117.06,19.553200,39.7971,...,1089.6601,-3864.1506,-517.627614,-1053.54,-677.138642,-114.318548,-105822.24,-247.827260,-3864.1506,13703.0436
20524,4.1060,10.0,5.781801,1.080780,2909.0,2.701021,38.55,-121.55,16.859236,41.0600,...,1486.1025,-4685.7525,-499.084300,-1215.50,-702.777948,-131.368802,-353588.95,-328.309146,-4685.7525,14774.4025
70,1.7719,26.0,6.047244,1.196850,392.0,3.086614,37.81,-122.29,3.139630,46.0694,...,1429.5961,-4623.7849,-216.685651,-3179.54,-739.517480,-146.362835,-47937.68,-377.462047,-4623.7849,14954.8441
10391,7.8061,2.0,11.110599,1.631336,703.0,3.239631,33.58,-117.65,60.935197,15.6122,...,1127.6164,-3950.6870,-918.387665,-235.30,-1307.161982,-191.926728,-82707.95,-381.142627,-3950.6870,13841.5225
15168,1.9176,24.0,4.234694,0.969388,279.0,1.423469,33.02,-117.06,3.677190,46.0224,...,1090.3204,-3865.3212,-224.474256,-2809.44,-495.713265,-113.476531,-32659.74,-166.631327,-3865.3212,13703.0436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17510,3.0139,35.0,3.606061,1.212121,377.0,3.808081,37.34,-121.92,9.083593,105.4865,...,1394.2756,-4552.4928,-367.454688,-4267.20,-439.650909,-147.781818,-45963.84,-464.281212,-4552.4928,14864.4864
8688,4.6406,34.0,5.042453,1.023585,601.0,2.834906,33.87,-118.34,21.535168,157.7804,...,1147.1769,-4008.1758,-549.168604,-4023.56,-596.723868,-121.131038,-71122.34,-335.482736,-4008.1758,14004.3556
2039,4.0027,13.0,5.785185,1.009877,1347.0,3.325926,36.75,-119.69,16.021607,52.0351,...,1350.5625,-4398.6075,-479.083163,-1555.97,-692.428815,-120.872123,-161222.43,-398.080074,-4398.6075,14325.6961
3240,1.8500,14.0,4.541219,1.039427,1077.0,3.860215,36.09,-119.56,3.422500,25.9000,...,1302.4881,-4314.9204,-221.186000,-1673.84,-542.948100,-124.273835,-128766.12,-461.527312,-4314.9204,14294.5936


In [20]:
extended_X_train.shape
# 특성 확장으로 기존 8 + 새로운 8*8(64) -> 72개 특성

(20637, 72)

In [21]:
extended_X_test = X_test.copy()

In [22]:
for col1 in X_test.columns:
    for col2 in X_test.columns:
        extended_X_test[col1+'x'+col2] = X_test[col1]*X_test[col2]

In [23]:
extended_X_test

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedIncxMedInc,MedIncxHouseAge,...,LatitudexLatitude,LatitudexLongitude,LongitudexMedInc,LongitudexHouseAge,LongitudexAveRooms,LongitudexAveBedrms,LongitudexPopulation,LongitudexAveOccup,LongitudexLatitude,LongitudexLongitude
2305,4.7804,19.0,6.347059,0.994118,1378.0,2.701961,36.83,-119.77,22.852224,90.8276,...,1356.4489,-4411.1291,-572.548508,-2275.63,-760.187235,-119.065471,-165043.06,-323.613843,-4411.1291,14344.8529
9978,3.6641,32.0,5.614887,1.071197,785.0,2.540453,38.53,-122.46,13.425629,117.2512,...,1484.5609,-4718.3838,-448.705686,-3918.72,-687.599029,-131.178835,-96131.1,-311.103883,-4718.3838,14996.4516
5042,1.7067,49.0,4.304878,1.015244,901.0,2.746951,33.98,-118.32,2.912825,83.6283,...,1154.6404,-4020.5136,-201.936744,-5797.68,-509.353171,-120.123659,-106606.32,-325.019268,-4020.5136,13999.6224


In [24]:
# 특성 확장시킨 데이터로 새로운 모델 학습하기!
linear_model2 = LinearRegression()
linear_model2.fit(extended_X_train,y_train)

In [25]:
linear_model2.score(extended_X_test,y_test)

0.2254506923000683

In [26]:
result2 = cross_val_score(linear_model2, extended_X_train,y_train,cv=5)

In [27]:
result2

array([-285.87935455,    0.68160533,    0.65934209,    0.50060114,
          0.67289027])

In [28]:
result2.mean()

-56.67298314386873

In [29]:
X_train.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20637.0,20637.0,20637.0,20637.0,20637.0,20637.0,20637.0,20637.0
mean,3.870742,28.638804,5.429001,1.096685,1425.535495,3.070714,35.631743,-119.569615
std,1.899889,12.585474,2.474332,0.473944,1132.529725,10.386803,2.135965,2.003557
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5637,18.0,4.440748,1.006085,787.0,2.429603,33.93,-121.8
50%,3.5347,29.0,5.229091,1.04878,1166.0,2.818408,34.26,-118.49
75%,4.7431,37.0,6.052381,1.099526,1725.0,3.28227,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


## 4. Scaler 적용
- 특성(Feature)들의 범위를 정규화 해주는 작업
- 훈련데이터와 테스트데이터에 같은 변환을 적용해야 함!

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()

In [32]:
# 기준은 훈련 데이터!! 테스트X
scaler.fit(X_train)

In [33]:
# 데이터 스케일링은 transform 함수에서 진행된다

trainsform_X_train=scaler.transform(X_train)
trainsform_X_train

array([[ 2.90107247e-01, -1.56047205e+00,  1.43695841e-01, ...,
        -9.18127999e-02, -1.22745754e+00,  1.25261029e+00],
       [ 1.23830346e-01, -1.48101344e+00,  1.42587626e-01, ...,
        -3.55934348e-02,  1.36628060e+00, -9.88458412e-01],
       [-1.10474488e+00, -2.09675701e-01,  2.49868793e-01, ...,
         1.53080361e-03,  1.01982460e+00, -1.35781049e+00],
       ...,
       [ 6.94574309e-02, -1.24263761e+00,  1.43955255e-01, ...,
         2.45713431e-02,  5.23549794e-01, -6.00869680e-02],
       [-1.06363622e+00, -1.16317900e+00, -3.58805362e-01, ...,
         7.60118162e-02,  2.14548500e-01,  4.79920827e-03],
       [ 4.39144624e+00,  1.85624812e+00,  1.52821778e+00, ...,
        -2.18517571e-02,  1.02450644e+00, -1.32786303e+00]])

In [34]:
trainsform_X_test=scaler.transform(X_test)
trainsform_X_test

array([[ 0.47880705, -0.76588596,  0.3710417 , -0.21641808, -0.04197386,
        -0.03550299,  0.5610045 , -0.10001692],
       [-0.10876783,  0.26707595,  0.07512756, -0.05377953, -0.56559311,
        -0.05105268,  1.35691692, -1.44266165],
       [-1.13906351,  1.6178723 , -0.4543246 , -0.17184162, -0.46316506,
        -0.03117138, -0.77331927,  0.6237135 ]])

In [35]:
# 이번에는 스케일링 된 데이터로 학습을 시켜보자!
linear_model3 = LinearRegression()

In [36]:
linear_model3.fit(trainsform_X_train,y_train)
linear_model3.score(trainsform_X_test,y_test)

0.3012213575143775

## 5. KNN에 스케일링 데이터 적용

In [37]:
from sklearn.neighbors import KNeighborsRegressor

In [38]:
# 1. 원본 데이터

In [39]:
knn_model = KNeighborsRegressor()
knn_result = cross_val_score(knn_model, X_train, y_train, cv=5)

In [40]:
knn_result.mean()

0.1500409785165306

In [41]:
# 2. 스케일링한 데이터

In [42]:
knn_model2 = KNeighborsRegressor()
knn_result2 = cross_val_score(knn_model2, trainsform_X_train, y_train, cv=5)
knn_result2

array([0.67116614, 0.68769774, 0.68218734, 0.69489486, 0.69467609])

In [43]:
knn_result2.mean()

0.68612443341632