## 데이터 불러오기

In [1]:
import pandas as pd
df=pd.read_csv('data/auto.csv')

In [3]:
df

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,8,307.0,130.0,3504.0,12.0,70,1,18.0
1,8,350.0,165.0,3693.0,11.5,70,1,15.0
2,8,318.0,150.0,3436.0,11.0,70,1,18.0
3,8,304.0,150.0,3433.0,12.0,70,1,16.0
4,8,302.0,140.0,3449.0,10.5,70,1,17.0
...,...,...,...,...,...,...,...,...
387,4,140.0,86.0,2790.0,15.6,82,1,27.0
388,4,97.0,52.0,2130.0,24.6,82,2,44.0
389,4,135.0,84.0,2295.0,11.6,82,1,32.0
390,4,120.0,79.0,2625.0,18.6,82,1,28.0


## 데이터 나누기

In [4]:
df=pd.get_dummies(df,columns=['origin'])
y=df['mpg']
X=df.drop(columns=['mpg'])

## 교차검증

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate

reg = LinearRegression()
kfold = KFold(5,shuffle=True)
result = cross_validate(estimator=reg, X=X, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
result

{'fit_time': array([0.00390172, 0.00178385, 0.00203681, 0.00137019, 0.00159001]),
 'score_time': array([0.00216818, 0.00116491, 0.00117517, 0.00153184, 0.00112677]),
 'test_neg_mean_squared_error': array([-15.49677019, -12.03513887, -10.13647384, -11.01268351,
         -8.17289032]),
 'train_neg_mean_squared_error': array([ -9.6869278 , -10.44217924, -10.85405129, -10.70507139,
        -11.34520025]),
 'test_r2': array([0.76461179, 0.77967876, 0.84205114, 0.83873602, 0.83272818]),
 'train_r2': array([0.83699487, 0.83113356, 0.8186228 , 0.81812422, 0.82195084])}

## 하이퍼패러미터 변경(차수 K)

In [8]:
from sklearn.preprocessing import PolynomialFeatures

kfold = KFold(5,shuffle=True)
for i in range(1,5):
    transformer = PolynomialFeatures(degree=i, interaction_only=False)
    X_=transformer.fit_transform(X) #데이터에 차수 추가
    reg= LinearRegression()    
    result=cross_validate(estimator=reg, X=X_, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
    print(i,'train',result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
    print(i,'test',result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

1 train -10.567100540610168 0.825860553198404
1 test -11.728503993657247 0.8022773532669403
2 train -13.501050995734664 0.7765241514911924
2 test -18.13654791100012 0.6997741954888419
3 train -4.514746147867788 0.924839699625007
3 test -295.0927374900547 -4.102110860386073
4 train -0.24020288898421013 0.9960602848414817
4 test -105376.52130892195 -1970.6891437773606


## Regularization

In [9]:
from sklearn.linear_model import Ridge, Lasso
reg = Lasso(alpha=0.01) # alpha : 작을수록 유연성 높음, Ridge에서의 lambda에 해당 (모델 규제)
reg.fit(X,y)
reg.coef_

array([-0.44629638,  0.02286369, -0.01762971, -0.00671945,  0.07651421,
        0.77615517, -2.56966676,  0.        ,  0.16691861])

## Lambda의 크기에 따른 계수의 변화를 살펴보자

In [10]:
import matplotlib.pyplot as plt
plt.figure(figsize=(12,8))
lambdas = [0,0.0001,0.001,0.01,0.1,1]
for i in lambdas:
    reg = Lasso(alpha=i)
    reg.fit(X,y)
    print(i, reg.coef_)

0 [-0.48970942  0.02397864 -0.01818346 -0.00671038  0.07910304  0.77702694
 -2.76337047 -0.13336811  0.08985776]
0.0001 [-0.48927812  0.02396754 -0.01817795 -0.00671047  0.07907707  0.77701821
 -2.6293992   0.          0.22266311]
0.001 [-0.4853459   0.02386686 -0.0181282  -0.0067113   0.07884366  0.77693967
 -2.62396889  0.          0.21759332]
0.01 [-0.44629638  0.02286369 -0.01762971 -0.00671945  0.07651421  0.77615517
 -2.56966676  0.          0.16691861]
0.1 [-0.06660805  0.0126583  -0.01331772 -0.00673194  0.05328912  0.76251936
 -1.8486989   0.          0.        ]
1 [-0.          0.         -0.00734394 -0.00646937  0.          0.66308442
 -0.          0.          0.        ]


  reg.fit(X,y)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


<Figure size 864x576 with 0 Axes>

In [11]:
import warnings
warnings.filterwarnings('ignore')

## 하이퍼패러미터 변경(다항식 추가)
kfold = KFold(5,shuffle=True)
lambdas = [0,0.001,0.01,0.1,0.5,1,10,100]
for l in lambdas:    
    reg= Lasso(alpha=l)
    result=cross_validate(estimator=reg, X=X, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
    print(l,'train',result['train_neg_mean_squared_error'].mean(), result['train_r2'].mean())
    print(l,'test',result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

0 train -10.558452686009906 0.8261420994999801
0 test -11.856780499912162 0.8031548110245279
0.001 train -10.607646710173785 0.8252710637646519
0.001 test -11.374743296398872 0.8102318100165791
0.01 train -10.606438008551859 0.8250580622136148
0.01 test -11.379367424306293 0.8039254954837919
0.1 train -10.773209394002151 0.8225100963946466
0.1 test -11.245359980389578 0.8124377937823523
0.5 train -11.633655237898125 0.8082341504326577
0.5 test -11.940429028101576 0.7980907101326838
1 train -11.71005498768512 0.8072053874426883
1 test -11.901109362492514 0.8029725220292789
10 train -18.006045898156355 0.7035768639730259
10 test -18.578544506285233 0.6930580462076547
100 train -18.67924515654883 0.6925168955494636
100 test -18.816720155246323 0.6895189930865994


## 다항식 변경 + hyperparameter tuning

In [12]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Lasso
kfold = KFold(5,shuffle=True)
for i in range(1,5):
    for j in [0,0.001,0.01,0.05,0.1,0.15,0.2]:
        transformer = PolynomialFeatures(degree=i, interaction_only=False)
        X_=transformer.fit_transform(X)
        reg= Lasso(alpha=j)    
        result=cross_validate(estimator=reg, X=X_, y=y, cv=kfold, scoring=['neg_mean_squared_error','r2'], return_train_score=True)
        print(i,j,'test',result['test_neg_mean_squared_error'].mean(), result['test_r2'].mean())

# negative_mean_squared_error, r2 score 둘다 클수록 좋음

1 0 test -11.653909288849508 0.8043890774125245
1 0.001 test -11.40721311559199 0.8093615493577537
1 0.01 test -11.523136222179001 0.8063696167960869
1 0.05 test -11.225274041861319 0.8120639279310764
1 0.1 test -11.429950260719195 0.8111777616240534
1 0.15 test -11.511943412698162 0.8066654735349135
1 0.2 test -11.865030138320156 0.7969079776278772
2 0 test -8.122385486411602 0.8625489128929568
2 0.001 test -8.340605401394555 0.8619961845814185
2 0.01 test -8.607471586540061 0.8487787606257899
2 0.05 test -8.434233972302248 0.8597598945580438
2 0.1 test -8.398626903501015 0.8594731186315478
2 0.15 test -7.859321886496227 0.8689890537776435
2 0.2 test -8.104246301277033 0.8645741781680429
3 0 test -7.67179828516431 0.8748812717712522
3 0.001 test -7.989648993957229 0.8688542259822007
3 0.01 test -8.186831406835669 0.8627195465580787
3 0.05 test -7.744451000006182 0.8707687138044419
3 0.1 test -8.252882640586924 0.8652310331777068
3 0.15 test -7.668875128232169 0.8661329008208026
3 0.2 