# 캘리포니아 주택 가격 데이터를 이용한 `ElasticNet()` 모델 성능 높이기

## ElasticNet() 모델 성능 평가

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split

# Baseline: fit an ElasticNet on the raw (unscaled) California housing
# features and report its score on a held-out 20% split.
california = fetch_california_housing()

# A fixed seed pins the shuffle, so the scores printed below are reproducible
# instead of changing every time the cell is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    california.data, california.target, test_size=0.2, random_state=42
)

model = ElasticNet(alpha=0.1, l1_ratio=0.2)
model.fit(X_train, y_train)

# For a regressor, score() is the coefficient of determination (R^2).
print("학습 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

학습 데이터 점수: 0.5855823837201637
평가 데이터 점수: 0.5913182684639027


## GridSearchCV()를 통한 최적의 하이퍼 파라미터 찾기

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the ElasticNet regularisation settings, scored with
# 5-fold cross-validation on the full, unscaled dataset.
alpha_candidates = [0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
l1_ratio_candidates = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
param_grid = [{'alpha': alpha_candidates, 'l1_ratio': l1_ratio_candidates}]

gs = GridSearchCV(ElasticNet(), param_grid, cv=5)
# fit() hands back the fitted search object, so `result` and `gs` refer to
# the same instance.
result = gs.fit(california.data, california.target)

print("최적 점수: {}".format(result.best_score_))
print("최적 파라미터: {}".format(result.best_params_))
print(gs.best_estimator_)

# Kept as the final expression so the notebook renders the per-candidate table.
pd.DataFrame(result.cv_results_)

최적 점수: 0.5531766524937997
최적 파라미터: {'alpha': 0.001, 'l1_ratio': 0.1}
ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.1,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.029199,0.007706,0.001256,7.5e-05,0.001,0.1,"{'alpha': 0.001, 'l1_ratio': 0.1}",0.550659,0.467596,0.550788,0.536042,0.660798,0.553177,0.061987,1
1,0.0219,0.002376,0.001138,8e-06,0.001,0.2,"{'alpha': 0.001, 'l1_ratio': 0.2}",0.550743,0.467529,0.550794,0.535931,0.660827,0.553165,0.062021,2
2,0.021148,0.002339,0.001145,1.9e-05,0.001,0.3,"{'alpha': 0.001, 'l1_ratio': 0.3}",0.550828,0.467461,0.550799,0.53582,0.660855,0.553153,0.062055,3
3,0.021474,0.001727,0.001155,8e-06,0.001,0.4,"{'alpha': 0.001, 'l1_ratio': 0.4}",0.550913,0.467393,0.550805,0.535708,0.660884,0.55314,0.06209,4
4,0.02704,0.014495,0.001161,2.1e-05,0.001,0.5,"{'alpha': 0.001, 'l1_ratio': 0.5}",0.550998,0.467324,0.55081,0.535595,0.660912,0.553128,0.062124,5
5,0.020678,0.002369,0.001142,1.1e-05,0.001,0.6,"{'alpha': 0.001, 'l1_ratio': 0.6}",0.551084,0.467255,0.550815,0.535481,0.660941,0.553115,0.062159,6
6,0.017311,0.001369,0.001171,7.7e-05,0.01,0.1,"{'alpha': 0.01, 'l1_ratio': 0.1}",0.561303,0.462766,0.550248,0.528318,0.661939,0.552915,0.064336,7
7,0.016826,0.001808,0.00113,1e-05,0.01,0.2,"{'alpha': 0.01, 'l1_ratio': 0.2}",0.561839,0.461978,0.550146,0.527041,0.662078,0.552616,0.064718,8
8,0.01621,0.001448,0.00115,2.3e-05,0.01,0.3,"{'alpha': 0.01, 'l1_ratio': 0.3}",0.562366,0.461136,0.550013,0.525691,0.662203,0.552282,0.065121,9
9,0.016415,0.00189,0.001146,1.3e-05,0.01,0.4,"{'alpha': 0.01, 'l1_ratio': 0.4}",0.562877,0.460234,0.549847,0.524262,0.662313,0.551907,0.065547,10


## Preprocessing - `StandardScaler`: 표준화

In [None]:
from sklearn.preprocessing import StandardScaler

# Reload the dataset and wrap the feature matrix in a DataFrame so the
# per-column summary statistics can be inspected before any scaling.
california = fetch_california_housing()
california_df = pd.DataFrame(california.data, columns=california.feature_names)

# Final expression: the notebook displays the summary table.
california_df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [None]:
# Standardise every feature column (zero mean, unit variance) using
# statistics computed over the whole dataset.
scaler = StandardScaler()
scaler.fit(california_df)
california_scaled = scaler.transform(california_df)

# transform() yields a plain array, so the column labels are re-attached here.
california_df_scaled = pd.DataFrame(california_scaled, columns=california.feature_names)

# Final expression: the notebook displays the post-scaling summary table.
california_df_scaled.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.734255e-16,8.557001e-16,2.704111e-16,-1.531384e-16,-6.465442e-17,6.064808e-19,1.256263e-15,-6.52781e-15
std,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024,1.000024
min,-1.774299,-2.19618,-1.852319,-1.610768,-1.256123,-0.229,-1.447568,-2.385992
25%,-0.6881186,-0.8453931,-0.3994496,-0.1911716,-0.5638089,-0.06171062,-0.7967887,-1.113209
50%,-0.1767951,0.02864572,-0.08078489,-0.101065,-0.2291318,-0.02431585,-0.6422871,0.5389137
75%,0.4593063,0.6643103,0.2519615,0.006015869,0.2644949,0.02037453,0.9729566,0.7784964
max,5.858286,1.856182,55.16324,69.57171,30.25033,119.4191,2.958068,2.62528


In [None]:
# Train and evaluate ElasticNet on standardised features.
#
# The raw frame is split FIRST and the scaler is fitted on the training rows
# only; fitting it on every row before splitting would let the evaluation
# rows influence the preprocessing statistics (data leakage).
# A fixed seed keeps the split, and therefore the printed scores, reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    california_df, california.target, test_size=0.2, random_state=42
)

# A separate scaler instance is used so the globally fitted `scaler` object
# is left untouched for any other cell that relies on it.
split_scaler = StandardScaler()
X_train = pd.DataFrame(
    split_scaler.fit_transform(X_train_raw),
    columns=california.feature_names,
    index=X_train_raw.index,
)
# The evaluation rows are transformed with the training-set mean and variance.
X_test = pd.DataFrame(
    split_scaler.transform(X_test_raw),
    columns=california.feature_names,
    index=X_test_raw.index,
)

model = ElasticNet(alpha=0.001, l1_ratio=0.1)
model.fit(X_train, y_train)

# For a regressor, score() is the coefficient of determination (R^2).
print("훈련 데이터 점수: {}".format(model.score(X_train, y_train)))
print("평가 데이터 점수: {}".format(model.score(X_test, y_test)))

훈련 데이터 점수: 0.603019978699828
평가 데이터 점수: 0.6188119380297581


## cross_val_score(): 교차 검증

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, cross_validate

# 5-fold cross-validation of `model`; cross_val_score refits a clone of the
# estimator on each fold and returns one score per fold.
# NOTE(review): this evaluates on the unscaled `california.data`, not on the
# standardised features - confirm that is intended.
# NOTE(review): the printed label says "accuracy", but the default score of a
# regressor is R^2.
scores = cross_val_score(model, california.data, california.target, cv=5)

fold_mean = scores.mean()
fold_std = scores.std()

print("교차 검증 정확도: {}".format(scores))
print("교차 검증 정확도: {} +/- {}".format(fold_mean, fold_std))

교차 검증 정확도: [0.55065917 0.46759593 0.5507877  0.53604231 0.66079815]
교차 검증 정확도: 0.5531766524937997 +/- 0.06198721845014342
