In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

In [2]:
housing = fetch_california_housing()

In [3]:
housing.data #문제

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]])

In [4]:
housing.target #답

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [5]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [6]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [7]:
# 데이터 프레임 생성
housing_df = pd.DataFrame(housing.data, columns=housing.feature_names)
#주택에 대한 일반적인 정보!

In [8]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [9]:
# 주택의 가격을 예측하는 실습 : 회귀
housing_price = housing.target

In [10]:
### train-test-split
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(housing_df,housing_price, test_size=0.3)

In [12]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()

In [13]:
linear_model.fit(X_train, y_train)

In [14]:
linear_model.score(X_test,y_test)

0.5919358323249566

In [15]:
linear_model.coef_

array([ 4.36725366e-01,  1.00249541e-02, -9.93579762e-02,  5.98172063e-01,
       -7.33474982e-06, -3.29478410e-03, -4.19491568e-01, -4.32200632e-01])

In [16]:
linear_model.intercept_

-36.73741254025754

# 2 Linear Regession

In [17]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()

In [18]:
# 모델 학습
linear_model.fit(X_train,y_train)

In [19]:
linear_model.score(X_test,y_test) #R2 스코어 :  R square

0.5919358323249566

In [20]:
from sklearn.model_selection import cross_val_score

In [21]:
result = cross_val_score(linear_model, X_train, y_train, cv = 5)
result

array([0.61091997, 0.60497901, 0.62457688, 0.59726883, 0.58702749])

In [22]:
result.mean()

0.6049544354552733

# 3 특성확장

In [23]:
# copy 함수를 활용하여 복사하기
extended_X_train = X_train.copy()

In [24]:
extended_X_train

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
3190,5.1728,7.0,4.650485,0.815534,327.0,3.174757,36.29,-119.80
14138,2.6629,37.0,5.009615,0.985577,589.0,2.831731,32.74,-117.07
15300,2.5509,26.0,5.245747,1.089792,2372.0,2.241966,33.18,-117.36
8532,3.6486,29.0,4.298246,1.035088,2175.0,3.179825,33.89,-118.35
10814,5.8777,27.0,5.618510,1.085779,1082.0,2.442438,33.63,-117.95
...,...,...,...,...,...,...,...,...
13338,4.4042,9.0,5.416452,1.056555,1138.0,2.925450,34.02,-117.65
4346,8.5136,26.0,6.174051,0.965190,1468.0,2.322785,34.12,-118.36
13185,5.1165,40.0,5.747368,1.121053,457.0,2.405263,33.94,-117.76
3192,3.2019,33.0,6.128571,1.019048,580.0,2.761905,36.34,-119.72


In [25]:
for col1 in X_train.columns :   # 8번 반복
    for col2 in X_train.columns :  # 8번 반복
        extended_X_train[ col1 +'x'+ col2 ] = X_train[col1] * X_train[col2]

In [26]:
extended_X_train.shape
# 특성 확장으로 기존 8 + 새로운 8*8(64) => 72개 특성

(14448, 72)

In [27]:
extended_X_test= X_test.copy()

In [28]:
for col1 in X_test.columns :   # 8번 반복
    for col2 in X_test.columns :  # 8번 반복
        extended_X_test[ col1 +'x'+ col2 ] = X_test[col1] * X_test[col2]

In [29]:
extended_X_test.shape

(6192, 72)

In [30]:
# 특성 확장시킨 데이터로 새로운 모델 학습하기!
linear_model2 = LinearRegression()
linear_model2.fit(extended_X_train, y_train)

In [31]:
linear_model2.score(extended_X_test,y_test)

0.6632226926409608

In [32]:
result2 = cross_val_score(linear_model2,extended_X_train, y_train, cv = 5)
result2

array([   0.57222799,   -1.14083452,    0.54244819, -391.87068375,
          0.67168851])

In [33]:
result2.mean()

-78.24503071624558

In [34]:
# 기술 통계 확인
X_train.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,14448.0,14448.0,14448.0,14448.0,14448.0,14448.0,14448.0,14448.0
mean,3.870507,28.639466,5.431579,1.096701,1421.902547,3.120111,35.6135,-119.55593
std,1.898959,12.545775,2.621596,0.512042,1117.136489,12.364264,2.13293,2.001
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.55,-124.35
25%,2.559875,18.0,4.438304,1.00533,791.0,2.432146,33.93,-121.78
50%,3.5446,29.0,5.227321,1.048797,1165.0,2.825823,34.25,-118.48
75%,4.7387,37.0,6.048293,1.099455,1721.0,3.290166,37.7,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


# 데이터 스케일링
- 특성들의 범위를 정규화 해주는 과정

## Scaler 적용
- 특성(Feature)들의 범위를 정규화 해주는 작업
- 훈련 데이터와 테스트데이터에 같은 변환을 적용해야 함!

In [35]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

In [36]:
# StandardScaler 생성
scaler = StandardScaler()
scaler1 = MinMaxScaler()
scaler2 = RobustScaler()
scaler3 = Normalizer()


In [37]:
# 기준은 훈련 데이터! 
# 스케일러 학습되는 작업
scaler.fit(X_train) 

In [38]:
scaler1.fit(X_train) 

In [39]:
scaler2.fit(X_train) 

In [40]:
scaler3.fit(X_train) 

In [41]:
# .transform : 데이터 스케일링
# 데이터 스케일링은 transform 함수에서 진행된다!
transfrom_X_train= scaler.transform(X_train)
transfrom_X_train

array([[ 0.68581669, -1.72490052, -0.29795608, ...,  0.00441988,
         0.31718028, -0.1219781 ],
       [-0.6359534 ,  0.66642542, -0.1609623 , ..., -0.02332447,
        -1.34725463,  1.24238672],
       [-0.69493514, -0.21039409, -0.0708876 , ..., -0.07102526,
        -1.14095847,  1.09745419],
       ...,
       [ 0.65616784,  0.90555801,  0.12046112, ..., -0.05781761,
        -0.78462874,  0.89754726],
       [-0.35210382,  0.34758196,  0.2658749 , ..., -0.02897207,
         0.34062302, -0.08199671],
       [ 0.5948163 ,  1.14469061, -0.04518305, ..., -0.03361135,
        -0.82213714,  0.70763567]])

In [42]:
transfrom_X_train1= scaler1.transform(X_train)
transfrom_X_train1

array([[0.32226452, 0.11764706, 0.02696904, ..., 0.00199772, 0.39787234,
        0.45318725],
       [0.14917036, 0.70588235, 0.02951492, ..., 0.00172167, 0.02021277,
        0.7250996 ],
       [0.14144632, 0.49019608, 0.03118886, ..., 0.00124707, 0.06702128,
        0.69621514],
       ...,
       [0.31838182, 0.76470588, 0.03474488, ..., 0.00137848, 0.14787234,
        0.6563745 ],
       [0.18634226, 0.62745098, 0.03744724, ..., 0.00166548, 0.40319149,
        0.46115538],
       [0.31034744, 0.82352941, 0.03166656, ..., 0.00161932, 0.1393617 ,
        0.6185259 ]])

In [43]:
transfrom_X_train2= scaler2.transform(X_train)
transfrom_X_train2

array([[ 0.74728351, -1.15789474, -0.35828583, ...,  0.40667369,
         0.54111406, -0.35013263],
       [-0.40466765,  0.42105263, -0.13522213, ...,  0.0068852 ,
        -0.4005305 ,  0.37400531],
       [-0.45607151, -0.15789474,  0.01144434, ..., -0.68047034,
        -0.28381963,  0.29708223],
       ...,
       [ 0.7214439 ,  0.57894737,  0.32301288, ..., -0.49015171,
        -0.08222812,  0.19098143],
       [-0.15728661,  0.21052632,  0.55978665, ..., -0.07449519,
         0.55437666, -0.32891247],
       [ 0.66797471,  0.73684211,  0.05329844, ..., -0.14134597,
        -0.10344828,  0.09018568]])

In [44]:
transfrom_X_train3= scaler3.transform(X_train)
transfrom_X_train3

array([[ 0.01476701,  0.01998319,  0.01327594, ...,  0.00906311,
         0.10359859, -0.34199809],
       [ 0.00441913,  0.06140214,  0.00831354, ...,  0.00469931,
         0.0543326 , -0.1942797 ],
       [ 0.00107393,  0.01094606,  0.00220847, ...,  0.00094387,
         0.01396886, -0.04940883],
       ...,
       [ 0.01077374,  0.0842274 ,  0.01210215, ...,  0.00506473,
         0.07146695, -0.24796547],
       [ 0.00538763,  0.055527  ,  0.01031216, ...,  0.00464728,
         0.06114701, -0.20144524],
       [ 0.00463429,  0.0398549 ,  0.00492452, ...,  0.00250673,
         0.03138341, -0.10949901]])

In [45]:
transfrom_X_test= scaler.transform(X_test)
transfrom_X_test

array([[-1.00148216,  1.14469061, -0.26355658, ..., -0.05619611,
         2.42702734, -2.30596133],
       [-0.22529309,  0.10844937, -0.28379489, ...,  0.09727979,
        -0.71430051,  0.79759379],
       [-0.22803153,  0.42729283, -0.05267991, ..., -0.03579588,
         0.80947792, -1.20147553],
       ...,
       [-1.10354162,  0.0287385 , -0.61176301, ...,  0.11649531,
         0.42032835, -0.94659419],
       [-1.16278666,  1.14469061, -0.60330941, ...,  0.02882276,
        -0.78462874,  0.64266592],
       [-0.30728823, -0.76837014, -0.67266581, ..., -0.04084162,
        -0.80338294,  0.6276729 ]])

In [46]:
transfrom_X_test1= scaler1.transform(X_test)
transfrom_X_test1

array([[0.10130205, 0.82352941, 0.02760832, ..., 0.00139461, 0.87659574,
        0.01792829],
       [0.20294892, 0.56862745, 0.02723221, ..., 0.00292164, 0.16382979,
        0.63645418],
       [0.20259031, 0.64705882, 0.03152724, ..., 0.00159759, 0.50957447,
        0.23804781],
       ...,
       [0.08793672, 0.54901961, 0.02113727, ..., 0.00311283, 0.4212766 ,
        0.28884462],
       [0.0801782 , 0.82352941, 0.02129437, ..., 0.00224052, 0.14787234,
        0.60557769],
       [0.19221114, 0.35294118, 0.02000545, ..., 0.00154738, 0.14361702,
        0.60258964]])

In [47]:
transfrom_X_test2= scaler2.transform(X_test)
transfrom_X_test2

array([[-0.72323385,  0.73684211, -0.30227395, ..., -0.46678624,
         1.73474801, -1.50928382],
       [-0.04676833,  0.05263158, -0.3352275 , ...,  1.74476006,
        -0.04244032,  0.13793103],
       [-0.04915493,  0.26315789,  0.04109149, ..., -0.17282452,
         0.81962865, -0.92307692],
       ...,
       [-0.81218088,  0.        , -0.86925015, ...,  2.02165038,
         0.5994695 , -0.78779841],
       [-0.86381421,  0.73684211, -0.85548535, ...,  0.75831271,
        -0.08222812,  0.05570292],
       [-0.11822886, -0.52631579, -0.96841671, ..., -0.24553229,
        -0.0928382 ,  0.04774536]])

In [48]:
transfrom_X_test3= scaler3.transform(X_test)
transfrom_X_test3

array([[ 0.00167261,  0.03653097,  0.00402747, ...,  0.00206044,
         0.03465345, -0.10548956],
       [ 0.00138805,  0.01209563,  0.00188999, ...,  0.00174293,
         0.01374467, -0.04756003],
       [ 0.00457888,  0.04528933,  0.00705112, ...,  0.00356658,
         0.04973834, -0.16245547],
       ...,
       [ 0.00141796,  0.02316675,  0.00305788, ...,  0.00364312,
         0.02916613, -0.09702073],
       [ 0.00139802,  0.03615944,  0.00323753, ...,  0.00292342,
         0.02854073, -0.09945529],
       [ 0.00189946,  0.01097952,  0.00211973, ...,  0.00151122,
         0.01958977, -0.06836194]])

In [49]:
linear_model3 = LinearRegression()
linear_model3.fit(transfrom_X_train,y_train)
linear_model3.score(transfrom_X_test, y_test)

0.591935832324956

In [50]:
result3 =cross_val_score(linear_model3,transfrom_X_train,y_train, cv = 5)
result3

array([0.61091997, 0.60497901, 0.62457688, 0.59726883, 0.58702749])

In [51]:
result3.mean()

0.6049544354552742

In [52]:
from sklearn.neighbors import KNeighborsRegressor

In [53]:
# 원본데이터 (스케일링 X) - X_train
knn_model = KNeighborsRegressor()

knn_result = cross_val_score(knn_model, X_train, y_train, cv =5)
knn_result.mean()
# 데이터의 숫자 범위가 다양해서 예측을 거의 못 함
# 스케일링을 통해서 

0.11718567298756108

In [54]:
# 스케일링 시킨 데이터 - transform_X_train
knn_model2 = KNeighborsRegressor()

knn_result2 = cross_val_score(knn_model2, transfrom_X_train, y_train, cv = 5)
knn_result2.mean()

0.6781672614726432

In [55]:
knn1 = KNeighborsRegressor()
knn1_result = cross_val_score(knn1, transfrom_X_train, y_train, cv =5)
knn1_result.mean()

0.6781672614726432

In [56]:
knn2 = KNeighborsRegressor()
knn2_result = cross_val_score(knn2, transfrom_X_train1, y_train, cv =5)
knn2_result.mean()

0.6972257626691475

In [57]:
knn3 = KNeighborsRegressor()
knn3_result = cross_val_score(knn3, transfrom_X_train2, y_train, cv =5)
knn3_result.mean()

0.6878391780362103

In [58]:
knn4 = KNeighborsRegressor()
knn4_result = cross_val_score(knn4, transfrom_X_train3, y_train, cv =5)
knn4_result.mean()

0.4051084713360261