In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [3]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [4]:
# 문제 데이터
housing_df=pd.DataFrame(housing.data,columns=housing.feature_names)
# 주택에 대한 일반적인 정보

In [5]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [6]:
# 정답 데이터
housing_price=housing.target

## 1. train test로 분리

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train,X_test,y_train,y_test=train_test_split(housing_df,housing_price, test_size=3)

## 2. LinearRegression

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
linear_model = LinearRegression()

In [11]:
linear_model.fit(X_train,y_train)

In [12]:
# 회귀 모델의 Score -> R square
linear_model.score(X_test, y_test) 

-3.5715289582901413

In [13]:
from sklearn.model_selection import cross_val_score

In [14]:
# 교차 검증
result = cross_val_score(linear_model,X_train,y_train,cv=5)

In [15]:
result

array([0.58990937, 0.614     , 0.62753829, 0.58943942, 0.59377792])

In [16]:
result.mean()

0.6029330000600496

## 3. 특성 확장

In [17]:
# 복사!
extended_X_train = X_train.copy()
extended_X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
16810,3.5039,32.0,4.205630,1.062424,2583.0,3.161567,37.65,-122.41
18670,4.8953,21.0,5.395806,0.985583,2001.0,2.622543,36.98,-121.88
20348,2.8942,39.0,4.333333,1.090909,113.0,3.424242,34.19,-119.05
20059,1.6300,12.0,3.398601,0.940559,1134.0,3.965035,35.87,-119.27
2470,2.4814,20.0,5.389972,1.011142,1319.0,3.674095,36.58,-119.90
...,...,...,...,...,...,...,...,...
16817,5.3922,26.0,7.085859,0.946970,1617.0,4.083333,37.64,-122.46
20346,3.1446,13.0,3.284815,1.039339,2257.0,1.775767,34.22,-119.06
48,0.9506,40.0,3.900000,1.218750,423.0,2.643750,37.82,-122.26
3334,4.1250,30.0,5.384615,2.923077,20.0,1.538462,38.92,-122.65


In [18]:
for col1 in X_train.columns:
    for col2 in X_train.columns:
        extended_X_train[col1+'x'+col2] = X_train[col1]*X_train[col2]

In [19]:
extended_X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedIncxMedInc,MedIncxHouseAge,...,LatitudexLatitude,LatitudexLongitude,LongitudexMedInc,LongitudexHouseAge,LongitudexAveRooms,LongitudexAveBedrms,LongitudexPopulation,LongitudexAveOccup,LongitudexLatitude,LongitudexLongitude
16810,3.5039,32.0,4.205630,1.062424,2583.0,3.161567,37.65,-122.41,12.277315,112.1248,...,1417.5225,-4608.7365,-428.912399,-3917.12,-514.811212,-130.051261,-316185.03,-387.007381,-4608.7365,14984.2081
18670,4.8953,21.0,5.395806,0.985583,2001.0,2.622543,36.98,-121.88,23.963962,102.8013,...,1367.5204,-4507.1224,-596.639164,-2559.48,-657.640839,-120.122883,-243881.88,-319.635491,-4507.1224,14854.7344
20348,2.8942,39.0,4.333333,1.090909,113.0,3.424242,34.19,-119.05,8.376394,112.8738,...,1168.9561,-4070.3195,-344.554510,-4642.95,-515.883333,-129.872727,-13452.65,-407.656061,-4070.3195,14172.9025
20059,1.6300,12.0,3.398601,0.940559,1134.0,3.965035,35.87,-119.27,2.656900,19.5600,...,1286.6569,-4278.2149,-194.410100,-1431.24,-405.351189,-112.180524,-135252.18,-472.909720,-4278.2149,14225.3329
2470,2.4814,20.0,5.389972,1.011142,1319.0,3.674095,36.58,-119.90,6.157346,49.6280,...,1338.0964,-4385.9420,-297.519860,-2398.00,-646.257660,-121.235933,-158148.10,-440.523955,-4385.9420,14376.0100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16817,5.3922,26.0,7.085859,0.946970,1617.0,4.083333,37.64,-122.46,29.075821,140.1972,...,1416.7696,-4609.3944,-660.328812,-3183.96,-867.734242,-115.965909,-198017.82,-500.045000,-4609.3944,14996.4516
20346,3.1446,13.0,3.284815,1.039339,2257.0,1.775767,34.22,-119.06,9.888509,40.8798,...,1171.0084,-4074.2332,-374.396076,-1547.78,-391.090087,-123.743714,-268718.42,-211.422832,-4074.2332,14175.2836
48,0.9506,40.0,3.900000,1.218750,423.0,2.643750,37.82,-122.26,0.903640,38.0240,...,1430.3524,-4623.8732,-116.220356,-4890.40,-476.814000,-149.004375,-51715.98,-323.224875,-4623.8732,14947.5076
3334,4.1250,30.0,5.384615,2.923077,20.0,1.538462,38.92,-122.65,17.015625,123.7500,...,1514.7664,-4773.5380,-505.931250,-3679.50,-660.423077,-358.515385,-2453.00,-188.692308,-4773.5380,15043.0225


In [20]:
extended_X_train.shape
# 특성 확장으로 기존 8 + 새로운 8*8(64) -> 72개 특성

(20637, 72)

In [21]:
extended_X_test = X_test.copy()

In [22]:
for col1 in X_test.columns:
    for col2 in X_test.columns:
        extended_X_test[col1+'x'+col2] = X_test[col1]*X_test[col2]

In [23]:
extended_X_test

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedIncxMedInc,MedIncxHouseAge,...,LatitudexLatitude,LatitudexLongitude,LongitudexMedInc,LongitudexHouseAge,LongitudexAveRooms,LongitudexAveBedrms,LongitudexPopulation,LongitudexAveOccup,LongitudexLatitude,LongitudexLongitude
11692,3.59,34.0,4.257732,1.054983,869.0,2.986254,33.87,-117.99,12.8881,122.06,...,1147.1769,-3996.3213,-423.5841,-4011.66,-502.369794,-124.477423,-102533.31,-352.348144,-3996.3213,13921.6401
19211,3.0444,36.0,4.559748,1.081761,1087.0,2.278826,38.46,-122.71,9.268371,109.5984,...,1479.1716,-4719.4266,-373.578324,-4417.56,-559.52673,-132.742893,-133385.77,-279.634738,-4719.4266,15057.7441
5193,2.5,42.0,4.977273,1.113636,526.0,3.984848,33.93,-118.25,6.25,105.0,...,1151.2449,-4012.2225,-295.625,-4966.5,-588.5625,-131.6875,-62199.5,-471.208333,-4012.2225,13983.0625


In [24]:
# 특성 확장시킨 데이터로 새로운 모델 학습하기!
linear_model2 = LinearRegression()
linear_model2.fit(extended_X_train,y_train)

In [25]:
linear_model2.score(extended_X_test,y_test)

-1.8822275157038644

In [26]:
result2 = cross_val_score(linear_model2, extended_X_train,y_train,cv=5)

In [27]:
result2

array([ 0.66678005, -4.4328628 ,  0.03119361,  0.64919748,  0.63672927])

In [28]:
result2.mean()

-0.48979247861006153

In [31]:
X_train.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20637.0,20637.0,20637.0,20637.0,20637.0,20637.0,20637.0,20637.0
mean,3.870791,28.638223,5.429121,1.096677,1425.563696,3.070653,35.631892,-119.569693
std,1.899926,12.585969,2.47433,0.473945,1132.518047,10.386801,2.135949,2.003507
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5637,18.0,4.440748,1.006061,787.0,2.429787,33.93,-121.8
50%,3.5349,29.0,5.229213,1.04878,1166.0,2.818049,34.26,-118.49
75%,4.7437,37.0,6.052381,1.099526,1725.0,3.282258,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


## 4. Scaler 적용
- 특성(Feature)들의 범위를 정규화 해주는 작업
- 훈련데이터와 테스트데이터에 같은 변환을 적용해야 함!

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
scaler = StandardScaler()

In [34]:
# 기준은 훈련 데이터!! 테스트X
scaler.fit(X_train)

In [35]:
# 데이터 스케일링은 transform 함수에서 진행된다

trainsform_X_train=scaler.transform(X_train)
trainsform_X_train

array([[-0.19311274,  0.26711164, -0.49448527, ...,  0.00875299,
         0.94485252, -1.41770218],
       [ 0.53924926, -0.60689865, -0.01346437, ..., -0.04314337,
         0.631167  , -1.15315961],
       [-0.51402773,  0.82330001, -0.44287288, ...,  0.03404298,
        -0.6750757 ,  0.25939785],
       ...,
       [-1.53703966,  0.90275549, -0.6180087 , ..., -0.04110155,
         1.02444437, -1.34283164],
       [ 0.13380262,  0.10820068, -0.01798717, ..., -0.14751692,
         1.53945045, -1.53749504],
       [-0.82883707,  0.50547808, -0.52921268, ...,  0.02245347,
        -0.7453038 ,  0.59381958]])

In [42]:
trainsform_X_test=scaler.transform(X_test)
trainsform_X_test

array([[-0.14779409,  0.4260226 , -0.4734279 , -0.08797486, -0.49145105,
        -0.0081258 , -0.82489565,  0.78848298],
       [-0.43497011,  0.58493356, -0.35136505, -0.03147289, -0.29895497,
        -0.07623584,  1.32408427, -1.56744325],
       [-0.72151451,  1.06166645, -0.18261861,  0.0357841 , -0.79432332,
         0.08801722, -0.79680441,  0.65870738]])

In [37]:
# 이번에는 스케일링 된 데이터로 학습을 시켜보자!
linear_model3 = LinearRegression()

In [43]:
linear_model3.fit(trainsform_X_train,y_train)
linear_model3.score(trainsform_X_test,y_test)

-3.571528958289968

## 5. KNN에 스케일링 데이터 적용

In [45]:
from sklearn.neighbors import KNeighborsRegressor

In [46]:
# 1. 원본 데이터

In [48]:
knn_model = KNeighborsRegressor()
knn_result = cross_val_score(knn_model, X_train, y_train, cv=5)

In [49]:
knn_result.mean()

0.15746508255837538

In [50]:
# 2. 스케일링한 데이터

In [51]:
knn_model2 = KNeighborsRegressor()
knn_result2 = cross_val_score(knn_model2, trainsform_X_train, y_train, cv=5)
knn_result2

array([0.67663178, 0.68895521, 0.70841872, 0.68245354, 0.67355424])

In [52]:
knn_result2.mean()

0.6860026975838562