In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

In [2]:
housing = fetch_california_housing()

In [3]:
housing.data #문제

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [4]:
housing.target #답

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [5]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [6]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [7]:
# 데이터 프레임 생성
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names)
#주택에 대한 일반적인 정보!

In [8]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [9]:
# 주택의 가격을 예측하는 실습 : 회귀
housing_price = housing.target

In [10]:
### train-test-split
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(housing_df,housing_price, test_size=0.3)

In [12]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()

In [13]:
linear_model.fit(X_train, y_train)

In [14]:
linear_model.score(X_test,y_test)

0.603942538184429

In [15]:
linear_model.coef_

array([ 4.34843479e-01,  8.87459669e-03, -1.06003805e-01,  6.23218532e-01,
       -5.61134994e-06, -3.81108734e-03, -4.21264829e-01, -4.36429611e-01])

In [16]:
linear_model.intercept_

-37.13264583878999

# 2 Linear Regession

In [17]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()

In [18]:
# 모델 학습
linear_model.fit(X_train,y_train)

In [19]:
linear_model.score(X_test,y_test) #R2 스코어 :  R square

0.603942538184429

In [20]:
from sklearn.model_selection import cross_val_score

In [22]:
result = cross_val_score(linear_model, X_train, y_train, cv = 5)
result

array([0.62837941, 0.58353804, 0.57810986, 0.59999392, 0.61974528])

In [23]:
result.mean()

0.601953303191592

# 3 특성확장

In [24]:
# copy 함수를 활용하여 복사하기
extended_X_train = X_train.copy()

In [25]:
extended_X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
18487,5.1519,15.0,5.067019,0.964727,1582.0,2.790123,37.01,-121.58
6081,3.7750,25.0,5.207547,1.125000,1084.0,2.556604,34.10,-117.87
7828,3.1204,32.0,4.596288,1.095128,1371.0,3.180974,33.91,-118.14
1175,0.8252,15.0,5.050000,1.350000,145.0,3.625000,39.48,-121.57
7608,3.0972,40.0,5.463158,1.073684,330.0,3.473684,33.88,-118.26
...,...,...,...,...,...,...,...,...
6310,5.0650,27.0,5.766871,1.023517,3538.0,3.617587,34.04,-118.02
9979,3.1563,18.0,6.235294,1.193277,343.0,2.882353,38.68,-122.27
5337,3.2870,31.0,3.894502,1.050520,1632.0,2.424963,34.04,-118.46
4898,1.7813,30.0,5.955882,1.264706,376.0,5.529412,34.01,-118.24


In [26]:
for col1 in X_train.columns :   # 8번 반복
    for col2 in X_train.columns :  # 8번 반복
        extended_X_train[ col1 +'x'+ col2 ] = X_train[col1] * X_train[col2]

In [28]:
extended_X_train.shape
# 특성 확장으로 기존 8 + 새로운 8*8(64) => 72개 특성

(14448, 72)

In [30]:
extended_X_test= X_test.copy()

In [31]:
for col1 in X_test.columns :   # 8번 반복
    for col2 in X_test.columns :  # 8번 반복
        extended_X_test[ col1 +'x'+ col2 ] = X_test[col1] * X_test[col2]

In [33]:
extended_X_test.shape

(6192, 72)

In [34]:
# 특성 확장시킨 데이터로 새로운 모델 학습하기!
linear_model2 = LinearRegression()
linear_model2.fit(extended_X_train, y_train)

In [35]:
linear_model2.score(extended_X_test,y_test)

0.6594835099300057

In [37]:
result2 = cross_val_score(linear_model2,extended_X_train, y_train, cv = 5)
result2

array([  0.55572339,   0.57745926,   0.61783839, -39.21991146,
        -0.52682796])

In [41]:
result2.mean()

-7.599143679668738

In [44]:
# 기술 통계 확인
X_train.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,14448.0,14448.0,14448.0,14448.0,14448.0,14448.0,14448.0,14448.0
mean,3.863902,28.74121,5.430307,1.096836,1418.599045,3.093843,35.648153,-119.587878
std,1.90146,12.619075,2.614271,0.508163,1117.236549,11.676935,2.131294,2.005142
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.568575,18.0,4.436763,1.006094,785.0,2.434627,33.93,-121.81
50%,3.529,29.0,5.225568,1.04898,1166.0,2.81771,34.27,-118.52
75%,4.733875,37.0,6.051641,1.098924,1722.0,3.279898,37.72,-118.02
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


# 데이터 스케일링
- 특성들의 범위를 정규화 해주는 과정

## Scaler 적용
- 특성(Feature)들의 범위를 정규화 해주는 작업
- 훈련 데이터와 테스트데이터에 같은 변환을 적용해야 함!

In [65]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

In [66]:
# StandardScaler 생성
scaler = StandardScaler()
scaler1 = MinMaxScaler()
scaler2 = RobustScaler()
scaler3 = Normalizer()


In [68]:
# 기준은 훈련 데이터! 
# 스케일러 학습되는 작업
scaler.fit(X_train) 

In [69]:
scaler1.fit(X_train) 

In [70]:
scaler2.fit(X_train) 

In [71]:
scaler3.fit(X_train) 

In [85]:
# .transform : 데이터 스케일링
# 데이터 스케일링은 transform 함수에서 진행된다!
transfrom_X_train= scaler.transform(X_train)
transfrom_X_train

array([[ 0.67739664, -1.08896142, -0.13896787, ..., -0.02601114,
         0.63899858, -0.99354101],
       [-0.04675618, -0.29648286, -0.08521192, ..., -0.04601021,
        -0.72641647,  0.85676584],
       [-0.39102986,  0.25825213, -0.31903641, ...,  0.00746207,
        -0.81556728,  0.72210739],
       ...,
       [-0.30340995,  0.17900428, -0.58748973, ..., -0.05728418,
        -0.75456935,  0.56251219],
       [-1.09530253,  0.09975642,  0.20104802, ...,  0.20858664,
        -0.7686458 ,  0.67223389],
       [ 1.62896152, -0.37573071,  1.10043941, ...,  0.02643304,
        -0.82495157,  0.86175319]])

In [87]:
transfrom_X_train1= scaler1.transform(X_train)
transfrom_X_train1

array([[0.32082316, 0.2745098 , 0.02992186, ..., 0.00168819, 0.47502657,
        0.27589641],
       [0.22586585, 0.47058824, 0.03091807, ..., 0.00150027, 0.16578108,
        0.64541833],
       [0.18072165, 0.60784314, 0.02658483, ..., 0.00200272, 0.1455898 ,
        0.6185259 ],
       ...,
       [0.19221114, 0.58823529, 0.02160985, ..., 0.00139433, 0.15940489,
        0.58665339],
       [0.08837119, 0.56862745, 0.03622304, ..., 0.0038926 , 0.15621679,
        0.60856574],
       [0.44560075, 0.45098039, 0.05289058, ..., 0.00218099, 0.1434644 ,
        0.64641434]])

In [88]:
transfrom_X_train2= scaler2.transform(X_train)
transfrom_X_train2

array([[ 0.74950353, -0.73684211, -0.09818024, ..., -0.03263608,
         0.72295515, -0.80738786],
       [ 0.11361012, -0.21052632, -0.01115953, ..., -0.30890231,
        -0.04485488,  0.17150396],
       [-0.18870364,  0.15789474, -0.38967713, ...,  0.42976152,
        -0.09498681,  0.10026385],
       ...,
       [-0.1117628 ,  0.10526316, -0.82425224, ..., -0.46464054,
        -0.06068602,  0.01583113],
       [-0.80713989,  0.05263158,  0.45224112, ...,  3.20808806,
        -0.06860158,  0.07387863],
       [ 1.58509214, -0.26315789,  1.90818554, ...,  0.69182576,
        -0.10026385,  0.17414248]])

In [89]:
transfrom_X_train3= scaler3.transform(X_train)
transfrom_X_train3

array([[ 0.00324593,  0.00945068,  0.00319245, ...,  0.00175791,
         0.02331799, -0.07660095],
       [ 0.00345939,  0.02290989,  0.00477217, ...,  0.00234286,
         0.0312491 , -0.10801557],
       [ 0.00226627,  0.02324085,  0.00333818, ...,  0.00231027,
         0.02462804, -0.08580233],
       ...,
       [ 0.002008  ,  0.01893764,  0.00237912, ...,  0.00148139,
         0.02079475, -0.07236623],
       [ 0.00448868,  0.07559669,  0.01500817, ...,  0.01393351,
         0.08570144, -0.29795174],
       [ 0.00839158,  0.02893151,  0.01001398, ...,  0.00410163,
         0.0408537 , -0.14207781]])

In [90]:
transfrom_X_test= scaler.transform(X_test)
transfrom_X_test

array([[-0.50294773,  1.84320925, -0.39924458, ..., -0.04466131,
        -0.83433587,  0.83681644],
       [-0.51409745,  0.41674784, -0.27313623, ...,  0.11425171,
         0.78445517, -1.14814887],
       [ 0.09624415,  0.33749999,  0.0637876 , ...,  0.03402835,
        -0.72641647,  0.68220859],
       ...,
       [ 0.36646691,  0.25825213, -0.27135703, ...,  0.20623374,
        -0.90471809,  0.83681644],
       [-0.66219929, -0.9304657 ,  0.26789062, ..., -0.13384062,
        -1.23316844,  1.25575384],
       [-1.0167812 , -0.53422643, -0.47962865, ...,  0.10671199,
        -0.93287098,  1.70461534]])

In [91]:
transfrom_X_test1= scaler1.transform(X_test)
transfrom_X_test1

array([[1.66045986e-01, 1.00000000e+00, 2.50984085e-02, ...,
        1.51294419e-03, 1.41339001e-01, 6.41434263e-01],
       [1.64583937e-01, 6.47058824e-01, 2.74354505e-02, ...,
        3.00617733e-03, 5.07970244e-01, 2.45019920e-01],
       [2.44617316e-01, 6.27450980e-01, 3.36793284e-02, ...,
        2.25235504e-03, 1.65781084e-01, 6.10557769e-01],
       ...,
       [2.80051310e-01, 6.07843137e-01, 2.74684227e-02, ...,
        3.87049051e-03, 1.25398512e-01, 6.41434263e-01],
       [1.45163515e-01, 3.13725490e-01, 3.74617688e-02, ...,
        6.74966898e-04, 5.10095643e-02, 7.25099602e-01],
       [9.86676046e-02, 4.11764706e-01, 2.36087296e-02, ...,
        2.93533009e-03, 1.19022317e-01, 8.14741036e-01]])

In [92]:
transfrom_X_test2= scaler2.transform(X_test)
transfrom_X_test2

array([[-0.28698102,  1.21052632, -0.51951896, ..., -0.29026871,
        -0.1055409 ,  0.16094987],
       [-0.29677181,  0.26315789, -0.31537342, ...,  1.90494916,
         0.80474934, -0.88918206],
       [ 0.23918164,  0.21052632,  0.23004248, ...,  0.79674702,
        -0.04485488,  0.07915567],
       ...,
       [ 0.47646977,  0.15789474, -0.31249322, ...,  3.17558512,
        -0.14511873,  0.16094987],
       [-0.42682307, -0.63157895,  0.56044663, ..., -1.522188  ,
        -0.3298153 ,  0.38258575],
       [-0.7381887 , -0.36842105, -0.64964556, ...,  1.80079587,
        -0.16094987,  0.62005277]])

In [93]:
transfrom_X_test3= scaler3.transform(X_test)
transfrom_X_test3

array([[ 0.00242614,  0.04338956,  0.00366025, ...,  0.00214641,
         0.02826162, -0.09838582],
       [ 0.00300315,  0.03537521,  0.00490704, ...,  0.004607  ,
         0.03882949, -0.12682012],
       [ 0.00338981,  0.02764186,  0.00468828, ...,  0.00292432,
         0.02856325, -0.09902486],
       ...,
       [ 0.00160484,  0.01126031,  0.00166122, ...,  0.00193605,
         0.01186555, -0.04149071],
       [ 0.00359032,  0.02343193,  0.00845014, ...,  0.00211032,
         0.04551308, -0.16136332],
       [ 0.00285881,  0.03257739,  0.00618448, ...,  0.00642644,
         0.04984341, -0.17202345]])

In [55]:
linear_model3 = LinearRegression()
linear_model3.fit(transfrom_X_train,y_train)
linear_model3.score(transfrom_X_test, y_test)

0.6039425381844292

In [56]:
result3 =cross_val_score(linear_model3,transfrom_X_train,y_train, cv = 5)
result3

array([0.62837941, 0.58353804, 0.57810986, 0.59999392, 0.61974528])

In [57]:
result3.mean()

0.6019533031915917

In [58]:
from sklearn.neighbors import KNeighborsRegressor

In [62]:
# 원본데이터 (스케일링 X) - X_train
knn_model = KNeighborsRegressor()

knn_result = cross_val_score(knn_model, X_train, y_train, cv =5)
knn_result.mean()

0.11446423844724204

In [63]:
# 스케일링 시킨 데이터 - transform_X_train
knn_model2 = KNeighborsRegressor()

knn_result2 = cross_val_score(knn_model2, transfrom_X_train, y_train, cv = 5)
knn_result2.mean()

0.6738426978949508

In [96]:
knn1 = KNeighborsRegressor()
knn1_result = cross_val_score(knn1, transfrom_X_train, y_train, cv =5)
knn1_result.mean()

0.6738426978949508

In [97]:
knn2 = KNeighborsRegressor()
knn2_result = cross_val_score(knn2, transfrom_X_train1, y_train, cv =5)
knn2_result.mean()

0.6936262631151164

In [98]:
knn3 = KNeighborsRegressor()
knn3_result = cross_val_score(knn3, transfrom_X_train2, y_train, cv =5)
knn3_result.mean()

0.6749102678965475

In [99]:
knn4 = KNeighborsRegressor()
knn4_result = cross_val_score(knn4, transfrom_X_train3, y_train, cv =5)
knn4_result.mean()

0.3914368791288002