In [1]:
import pandas as pd
import numpy as np
import mglearn
import warnings
warnings.filterwarnings("ignore")
X,y = mglearn.datasets.load_extended_boston()

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.3, random_state = 0)

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_val)
mse = mean_squared_error(y_val, pred)
print("MSE : {}".format(mse))
print("RMSE : {}".format(np.sqrt(mse)))
print("r2_score : {}".format(lr.score(X_val, y_val)))
print("r2_score2 : {}".format(r2_score(y_val, pred)))
# print("기울기 : {}".format(lr.coef_))
# print("절편 : {}".format(lr.intercept_))

MSE : 29.252507139200187
RMSE : 5.4085586933304315
r2_score : 0.6486839499987662
r2_score2 : 0.6486839499987662


In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [18]:
vlf_list = []

for i in range(X_train.shape[1]):
    vlf_list.append([variance_inflation_factor(X_train, i)])

In [12]:
vlf_list

[[54851.95197767613],
 [7130.307358208186],
 [22374.003679026846],
 [inf],
 [6771.725195571671],
 [6800.801502806481],
 [10940.386138579546],
 [7326.440153114255],
 [30805.61103699408],
 [81206.29682014103],
 [10708.631478254902],
 [7673.09915025527],
 [2412.5310782263655],
 [37.29568591935402],
 [19.417800952621043],
 [325173.70523822005],
 [14.822542635807283],
 [515.0122621388858],
 [177.6698962626534],
 [844.5913175230344],
 [30.361263840820758],
 [992617.8325887014],
 [2331396.8718411014],
 [29863.383191795456],
 [44.55382393756138],
 [159.09454435609885],
 [102.57528989122348],
 [13.50994311575819],
 [6.940197261396829],
 [50.240661897464726],
 [418.7975220112337],
 [44.137292535255966],
 [171.74257241834985],
 [28.918759686656628],
 [38.43267370061432],
 [63.36677850173572],
 [5720.689018922408],
 [28.167527476576897],
 [976.1444457223474],
 [159.60310031467287],
 [4324.919106210621],
 [1405.2563175684206],
 [1312.8887738668022],
 [236.1472746520232],
 [15955.742728721658],
 [20

In [23]:
vlf_df = pd.DataFrame(vlf_list, columns = ["vlf"])
vlf_df[vlf_df["vlf"] < 10] #한개 빼고 모두다 다중공선성 존재

Unnamed: 0,vlf
28,6.940197


### Ridge, Lasso, ElasticNet 실습

In [33]:
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)

print("훈련 데이터 성능 : {}".format(ridge.score(X_train, y_train)))
print("테스트 데이터 성능 : {}".format(ridge.score(X_val, y_val)))   

훈련 데이터 성능 : 0.8820517790991202
테스트 데이터 성능 : 0.7834459976736695


In [43]:
# GridSearch CV로 최적의 파라미터 도출
from sklearn.model_selection import GridSearchCV
# alpha = 
ridge = Ridge()
param_grid = {'alpha':(0.0001,0.0005,0.001,0.005, 0.01,0.05,0.1, 0.15, 0.2, 0.25, 0.3,0.5,1,2,3)}
grid = GridSearchCV(ridge, param_grid = param_grid)
grid.fit(X_train, y_train)
pred = grid.predict(X_val)
print(grid.best_estimator_)
print("grid 학습 데이터 성능 : {}".format(grid.score(X_train, y_train)))
print("grid 테스트 데이터 성능 : {}".format(grid.score(X_val, y_val)))


Ridge(alpha=0.05)
grid 학습 데이터 성능 : 0.9333679643206895
grid 테스트 데이터 성능 : 0.7877067511712756


### Lasso 

In [44]:
from sklearn.linear_model import Lasso

lasso = Lasso()
lasso.fit(X_train, y_train)

print("학습 데이터 성능 : {}".format(lasso.score(X_train, y_train)))
print("테스트 데이터 성능 : {}".format(lasso.score(X_val, y_val)))

학습 데이터 성능 : 0.26489481124592407
테스트 데이터 성능 : 0.2136447822416102


In [49]:
alpha = [0.0001,0.0005,0.001,0.005, 0.01,0.05,0.1, 0.15, 0.2, 0.25, 0.3,0.5,1,2,3]

max_val = -197000
for i in alpha:
    lasso = Lasso(i)
    lasso.fit(X_train, y_train)
    score = lasso.score(X_val, y_val)
    
    if score > max_val:
        max_val = score

print(max_val)

0.8082914411591207


### ElasticNet

In [51]:
from sklearn.linear_model import ElasticNet

elastic = ElasticNet()
elastic.fit(X_train, y_train)

print("훈련 데이터 성능 : {}".format(elastic.score(X_train, y_train)))
print("테스트 데이터 성능 : {}".format(elastic.score(X_train, y_train)))

훈련 데이터 성능 : 0.3096253527541134
테스트 데이터 성능 : 0.3096253527541134


In [53]:
# gridSearchCV로 최적의 파라미터 찾기
elastic = ElasticNet()
param_grid = {"alpha" : [0.0001,0.0005,0.001,0.005, 0.01,0.05,0.1, 0.15, 0.2, 0.25, 0.3,0.5,1,2,3],\
             "l1_ratio": [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8, 0.9]}

grid = GridSearchCV(elastic, param_grid = param_grid)
grid.fit(X_train,y_train)

print("best params : {}".format(grid.best_params_))
print("훈련 데이터 성능 : {}".format(grid.score(X_train, y_train)))
print("테스트 데이터 성능 : {}".format(grid.score(X_train, y_train)))

best params : {'alpha': 0.0005, 'l1_ratio': 0.2}
훈련 데이터 성능 : 0.9231617946389865
테스트 데이터 성능 : 0.9231617946389865
