In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston #보스턴 주택값 데이터

In [2]:
# Load the Boston housing dataset into a dict-like object (its keys are listed in the next cell).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the environment pins an older scikit-learn, or switch to another dataset.
boston = load_boston()

In [3]:
boston.keys() # inspect the keys of the loaded dataset
# 'data' -> features (the inputs), 'target' -> labels (the answers),
# 'feature_names' -> feature (column) names, 'DESCR', 'filename'

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [4]:
# Peek at the raw feature matrix (displayed as a NumPy array in the output).
boston['data']

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [5]:
# Wrap the feature matrix in a DataFrame, labelling each column with its feature name.
# b_df holds only the inputs (features); the target values are kept in a separate variable.
b_df = pd.DataFrame(data=boston['data'], columns=boston['feature_names'])
b_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [6]:
y = boston.target  # already a one-dimensional array, so it can be assigned to y directly

## train, test 분리
- Overfitting(과대적합)을 확인하고 방지하기 위한 것
- Overfitting : 머신러닝 모델이 train 데이터에 지나치게 맞춰 학습되어, test 데이터에 모델을 적용했을 때 성능이 기대보다 안 나오는 경우
- train, test 분리하는 이유 : 학습에 사용하지 않은 데이터로 모델을 검증하기 위한 것

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    b_df,  # features (inputs)
    y,  # target (answers)
    test_size=0.3,
    random_state=1,
)

- random_state : random_state=1 이라고 하면 바로 이 random 함수의 seed 값을 고정시키기 때문에 여러번 수행하더라도 같은 레코드를 추출합니다.

In [9]:
# Show the shape of every split, one per line, in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(354, 13)
(152, 13)
(354,)
(152,)


## 규제
- L1(Lasso) : w의 모든 원소에 똑같은 힘으로 규제를 적용하는 방법, 규제가 강해지면 일부 계수는 정확히 0이 됨 (특성 선택 효과)
- L2(Ridge) : w의 모든 원소에 골고루 규제를 적용하여 0에 가깝게 만드는 방법

In [10]:
# linear_model : 선형모델
from sklearn.linear_model import Lasso, Ridge

### L1규제 Lasso

In [11]:
lasso = Lasso(alpha = 1) # alpha: hyperparameter that sets the regularization strength
lasso.fit(X_train, y_train)

Lasso(alpha=1)

In [12]:
# Score of the Lasso model on the training split (R² for scikit-learn regressors).
lasso.score(X_train, y_train)

0.6426270747993064

In [13]:
# Score of the Lasso model on the held-out test split (R² for scikit-learn regressors).
lasso.score(X_test, y_test)

0.6694782854622285

- 회귀문제에서의 score는 결정계수(R²)로, 예측값이 실제값에 얼마나 가까운지를 나타냄 (1에 가까울수록 좋음)
- 예측 값의 미묘한 차이가 크게 중요하지 않다.

In [14]:
# .coef_ holds the fitted weights; count how many of them are non-zero.
# Lasso zeroed out 4 of the 13 features, so only the remaining 9 are used,
# which helps guard against overfitting.
print((lasso.coef_ != 0).sum())

9


### L2 규제 Ridge

In [15]:
ridge = Ridge(alpha=1)  # alpha: hyperparameter that sets the regularization strength
ridge.fit(X_train, y_train)

Ridge(alpha=1)

In [16]:
# Ridge score on the training split, then on the test split (one value per line).
print(ridge.score(X_train, y_train), ridge.score(X_test, y_test), sep="\n")

0.7063721813825194
0.7890510666829774


In [17]:
# Count the non-zero Ridge coefficients.
print((ridge.coef_ != 0).sum())

13


## LinearRegression

In [18]:
from sklearn.linear_model import LinearRegression

In [19]:
# Plain linear regression without a regularization term, used as the baseline model.
l_model = LinearRegression()

In [20]:
# Fit the baseline on the original 13 training features.
l_model.fit(X_train,y_train)

LinearRegression()

In [21]:
l_model.score(X_test, y_test) # R-squared on the test split

0.7836295385076297

## 특성확장

In [22]:
# Work on a copy so X_train itself stays unchanged when the product columns are added.
e_X_train = X_train.copy()

In [23]:
# List the feature names, one per line; every one of these columns holds numeric data.
print(*X_train.columns, sep="\n")

CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT


In [24]:
# Feature expansion: add the product of every ordered pair of columns (13 x 13 = 169 new
# columns, named '<left>x<right>'), giving 182 columns in total.
# The products are collected first and attached with a single concat, because inserting
# 169 columns one at a time fragments the DataFrame (pandas emits a PerformanceWarning)
# and made this cell depend on an earlier cell having created e_X_train.
# Note: 'AxB' and 'BxA' hold identical values; both are kept so the column layout stays
# the same as the one built for the test split.
train_products = {
    left + 'x' + right: X_train[left] * X_train[right]
    for left in X_train.columns
    for right in X_train.columns
}
e_X_train = pd.concat([X_train, pd.DataFrame(train_products)], axis=1)

In [25]:
e_X_train # 182 columns in total (13 original + 169 products)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,...,LSTATxCHAS,LSTATxNOX,LSTATxRM,LSTATxAGE,LSTATxDIS,LSTATxRAD,LSTATxTAX,LSTATxPTRATIO,LSTATxB,LSTATxLSTAT
13,0.62976,0.0,8.14,0.0,0.538,5.949,61.8,4.7075,4.0,307.0,...,0.0,4.44388,49.13874,510.468,38.883950,33.04,2535.82,173.460,3278.3940,68.2276
61,0.17171,25.0,5.13,0.0,0.453,5.966,93.4,6.8185,8.0,284.0,...,0.0,6.54132,86.14904,1348.696,98.459140,115.52,4100.96,284.468,5459.4752,208.5136
377,9.82349,0.0,18.10,0.0,0.671,6.794,98.8,1.3580,24.0,666.0,...,0.0,14.25204,144.30456,2098.512,28.843920,509.76,14145.84,429.048,8430.1560,451.1376
39,0.02763,75.0,2.95,0.0,0.428,6.595,21.8,5.4011,3.0,252.0,...,0.0,1.84896,28.49040,94.176,23.332752,12.96,1088.64,79.056,1709.1216,18.6624
365,4.55587,0.0,18.10,0.0,0.718,3.561,87.9,1.6132,24.0,666.0,...,0.0,5.11216,25.35432,625.848,11.485984,170.88,4741.92,143.824,2525.4640,50.6944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,0.03548,80.0,3.64,0.0,0.392,5.876,19.1,9.2203,1.0,315.0,...,0.0,3.62600,54.35300,176.675,85.287775,9.25,2913.75,151.700,3655.4150,85.5625
72,0.09164,0.0,10.81,0.0,0.413,6.065,7.8,5.2873,4.0,305.0,...,0.0,2.27976,33.47880,43.056,29.185896,22.08,1683.60,105.984,2157.8232,30.4704
396,5.87205,0.0,18.10,0.0,0.693,6.405,96.0,1.6768,24.0,666.0,...,0.0,13.42341,124.06485,1859.520,32.479616,464.88,12900.42,391.274,7687.9530,375.1969
235,0.33045,0.0,6.20,0.0,0.507,6.086,61.5,3.6519,8.0,307.0,...,0.0,5.51616,66.21568,669.120,39.732672,87.04,3340.16,189.312,4099.0400,118.3744


In [26]:
# Apply the same pairwise-product expansion to the test split so its columns line up
# with the expanded training frame (13 original + 169 '<left>x<right>' products).
# The products are built first and attached with one concat instead of 169 separate
# column insertions, which would fragment the DataFrame and trigger a PerformanceWarning.
test_products = {
    left + 'x' + right: X_test[left] * X_test[right]
    for left in X_test.columns
    for right in X_test.columns
}
e_X_test = pd.concat([X_test, pd.DataFrame(test_products)], axis=1)

In [27]:
e_X_test # 182 columns in total (13 original + 169 products)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,...,LSTATxCHAS,LSTATxNOX,LSTATxRM,LSTATxAGE,LSTATxDIS,LSTATxRAD,LSTATxTAX,LSTATxPTRATIO,LSTATxB,LSTATxLSTAT
307,0.04932,33.0,2.18,0.0,0.472,6.849,70.3,3.1827,7.0,222.0,...,0.0,3.55416,51.57297,529.359,23.965731,52.71,1671.66,138.552,2988.6570,56.7009
343,0.02543,55.0,3.78,0.0,0.484,6.696,56.4,5.7321,5.0,370.0,...,0.0,3.47512,48.07728,404.952,41.156478,35.90,2656.60,126.368,2849.7420,51.5524
47,0.22927,0.0,6.91,0.0,0.448,6.030,85.5,5.6894,3.0,233.0,...,0.0,8.42240,113.36400,1607.400,106.960720,56.40,4380.40,336.520,7383.5120,353.4400
67,0.05789,12.5,6.07,0.0,0.409,5.878,21.4,6.4980,4.0,345.0,...,0.0,3.31290,47.61180,173.340,52.633800,32.40,2794.50,153.090,3209.3010,65.6100
362,3.67822,0.0,18.10,0.0,0.770,5.362,96.2,2.1036,24.0,666.0,...,0.0,7.84630,54.63878,980.278,21.435684,244.56,6786.54,205.838,3880.2501,103.8361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
467,4.42228,0.0,18.10,0.0,0.584,6.003,94.5,2.5403,24.0,666.0,...,0.0,12.45088,127.98396,2014.740,54.159196,511.68,14199.12,430.664,7063.1028,454.5424
95,0.12204,0.0,2.89,0.0,0.445,6.625,57.8,3.4952,2.0,276.0,...,0.0,2.95925,44.05625,384.370,23.243080,13.30,1835.40,119.700,2380.5670,44.2225
122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,...,0.0,10.41733,106.88073,1665.697,37.418117,35.86,3370.84,342.463,6779.1537,321.4849
260,0.54011,20.0,3.97,0.0,0.647,7.203,81.8,2.1121,5.0,264.0,...,0.0,6.20473,69.07677,784.462,20.255039,47.95,2531.76,124.670,3766.9520,91.9681


In [28]:
# Refit the same model object on the expanded features and score it on the expanded test set.
l_model.fit(e_X_train, y_train)
l_model.score(e_X_test, y_test) 
# previous score 0.7836295385076297 -> 0.8043803930615876 with the expanded features

0.8043803930615876

In [29]:
# Try several alpha values to see how the regularization strength changes the coefficients.
# (alpha = 1 is fitted on its own in a later cell.)
alpha_list = [0.001, 0.01, 0.1, 10, 100, 1000]
# Fitted coefficient vectors for Ridge and Lasso, one entry per alpha in alpha_list.
r_coef_list = []
l_coef_list = []

# NOTE(review): the recorded run of this cell printed solver warnings (a linalg.solve
# warning from Ridge and several coordinate-descent warnings from Lasso) — presumably
# because the expanded features are unscaled; consider standardizing the features or
# raising Lasso's max_iter, and confirm the coefficients below are trustworthy.
for alpha in alpha_list:
    # Sweep-local names: binding these to `l_model` would silently replace the fitted
    # LinearRegression from the earlier cells with a Lasso instance.
    ridge_sweep = Ridge(alpha=alpha)
    lasso_sweep = Lasso(alpha=alpha)
    # Fit both models on the expanded training features.
    ridge_sweep.fit(e_X_train, y_train)
    lasso_sweep.fit(e_X_train, y_train)

    r_coef_list.append(ridge_sweep.coef_)
    l_coef_list.append(lasso_sweep.coef_)

  return linalg.solve(A, Xy, sym_pos=True,
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [30]:
# Raw Ridge coefficient arrays, one per alpha in alpha_list (the output is long).
r_coef_list

[array([-2.96794630e+00,  4.40672170e-01, -4.97153974e+00,  3.77907262e+01,
         4.27237399e+01,  2.25141743e+01,  1.11012131e+00, -9.64573443e+00,
        -5.11285203e-01,  1.16692308e-01,  3.30379822e+00,  7.53175537e-02,
         9.39215885e-01,  5.63109559e-03,  5.88188925e-02,  7.24858192e-02,
         1.17493416e+00, -9.79294028e-01,  1.56969040e-01, -4.50406064e-03,
        -3.86814259e-02,  3.63648992e-02, -5.89096669e-03,  1.44194806e-01,
        -2.47064966e-05,  1.40072951e-02,  5.88188911e-02, -4.30502053e-04,
        -2.49358731e-03, -7.27163646e-02, -8.04039205e-01,  8.45357548e-03,
         3.48515985e-04, -1.01194607e-02,  6.68472778e-04,  2.98684306e-04,
         3.59400302e-03,  5.93291672e-05, -3.02806113e-03,  7.24849216e-02,
        -2.49380405e-03,  3.05929929e-02, -1.46580283e-01,  1.22890099e+00,
         1.05272464e-01,  1.96108487e-03,  8.38414732e-02, -1.71588288e-02,
         7.02228060e-04,  1.40663606e-02,  4.95166201e-04, -8.43067481e-03,
         1.1

In [31]:
# Tabulate the Ridge coefficients: build one row per alpha, then transpose so that each
# alpha becomes a column and each row is one coefficient.
r_df = pd.DataFrame(np.array(r_coef_list), index=alpha_list).T
r_df
# The stronger the regularization, the closer the coefficients get to zero — but not exactly zero.

Unnamed: 0,0.001,0.010,0.100,10.000,100.000,1000.000
0,-2.967946,-3.084029,-2.661836,-0.075861,-0.004695,0.000504
1,0.440672,0.414180,0.314357,-0.194214,-0.158361,-0.035165
2,-4.971540,-4.576505,-3.893822,-0.230469,0.007065,0.004058
3,37.790726,27.202270,7.193797,0.101728,0.005391,0.000041
4,42.723740,6.367370,0.419346,0.029559,0.005779,0.000777
...,...,...,...,...,...,...
177,-0.009413,-0.009255,-0.010853,-0.018205,-0.018827,-0.015527
178,-0.000865,-0.000895,-0.000819,0.000002,0.000127,-0.000010
179,0.019950,0.020062,0.021877,0.016719,0.012587,0.005452
180,-0.000303,-0.000308,-0.000322,-0.000315,-0.000292,-0.000085


In [32]:
# Same table for Lasso: one column per alpha, one row per coefficient.
l_df = pd.DataFrame(np.array(l_coef_list), index=alpha_list).T
l_df
# With stronger regularization the Lasso coefficients become zero.

Unnamed: 0,0.001,0.010,0.100,10.000,100.000,1000.000
0,-1.049528,-0.868957,0.000000,0.000000e+00,-0.000000e+00,-0.000000
1,-0.188179,-0.173675,-0.042225,-0.000000e+00,0.000000e+00,0.000000
2,0.458929,0.226905,0.000000,-0.000000e+00,-0.000000e+00,-0.000000
3,26.852555,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000
4,-0.000000,0.000000,0.000000,-0.000000e+00,-0.000000e+00,-0.000000
...,...,...,...,...,...,...
177,0.010220,0.009429,0.004203,0.000000e+00,-0.000000e+00,0.000000
178,-0.000079,0.000029,0.000560,-1.929844e-04,-2.316784e-07,-0.000027
179,0.035672,0.030918,0.012882,-0.000000e+00,0.000000e+00,-0.000000
180,-0.000833,-0.000755,-0.000407,-2.015039e-07,-1.539374e-05,-0.000000


In [33]:
# Fit Ridge and Lasso with alpha = 1 on the expanded training features.
# fit() returns the estimator itself, so r_model is the fitted Ridge.
r_model = Ridge(alpha=1).fit(e_X_train, y_train)

l_model = Lasso(alpha=1)
l_model.fit(e_X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


Lasso(alpha=1)

In [36]:
# Test-split score of Ridge (alpha = 1) on the expanded features.
r_model.score(e_X_test, y_test)

0.8239994029582296

In [37]:
# Test-split score of Lasso (alpha = 1) on the expanded features.
l_model.score(e_X_test, y_test)

0.8698528946654371