## Ridge

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
%matplotlib inline

boston = load_boston()

bostonDF = pd.DataFrame(boston.data , columns = boston.feature_names)
bostonDF['PRICE'] = boston.target

y_target_boston = bostonDF['PRICE']
X_data_boston = bostonDF.drop(['PRICE'],axis=1,inplace=False)

X_train_boston , X_test_boston , y_train_boston , y_test_boston = train_test_split(X_data_boston , y_target_boston ,
                                                                                   test_size=0.3, random_state=156)

In [2]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

ridge = Ridge(alpha = 10)
neg_mse_scores = cross_val_score(ridge, X_data_boston, y_target_boston, scoring="neg_mean_squared_error", cv = 5)
rmse_scores  = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)
print(' 5 folds 의 개별 RMSE scores : ', np.round(rmse_scores,3))
print(' 5 folds 의 평균 RMSE : {0:.3f} '.format(avg_rmse))

 5 folds 의 개별 RMSE scores :  [3.38  4.929 5.305 8.637 5.34 ]
 5 folds 의 평균 RMSE : 5.518 


In [3]:
# alpha값 하이퍼파라미터튜닝
alphas = [0 , 0.1 , 1 , 10 , 100]

# alphas list 값을 iteration하면서 alpha에 따른 평균 rmse 구함.
for alpha in alphas :
    ridge = Ridge(alpha = alpha)
    
    #cross_val_score를 이용하여 5 fold의 평균 RMSE 계산
    neg_mse_scores = cross_val_score(ridge, X_data_boston, y_target_boston, scoring="neg_mean_squared_error", cv = 5)
    avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
    print('alpha {0} 일 때 5 folds 의 평균 RMSE : {1:.3f} '.format(alpha,avg_rmse))

alpha 0 일 때 5 folds 의 평균 RMSE : 5.829 
alpha 0.1 일 때 5 folds 의 평균 RMSE : 5.788 
alpha 1 일 때 5 folds 의 평균 RMSE : 5.653 
alpha 10 일 때 5 folds 의 평균 RMSE : 5.518 
alpha 100 일 때 5 folds 의 평균 RMSE : 5.330 


In [4]:
for alpha in alphas :
    ridge = Ridge(alpha = alpha)
    ridge.fit(X_data_boston , y_target_boston)
    
    coeff = pd.Series(data=ridge.coef_, index=X_data_boston.columns)    
    print('alpha {0} 일 때 회귀계수 :'.format(alpha))
    print(coeff.sort_values(ascending=False))
    print()

alpha 0 일 때 회귀계수 :
RM          3.809865
CHAS        2.686734
RAD         0.306049
ZN          0.046420
INDUS       0.020559
B           0.009312
AGE         0.000692
TAX        -0.012335
CRIM       -0.108011
LSTAT      -0.524758
PTRATIO    -0.952747
DIS        -1.475567
NOX       -17.766611
dtype: float64

alpha 0.1 일 때 회귀계수 :
RM          3.818233
CHAS        2.670019
RAD         0.303515
ZN          0.046572
INDUS       0.015999
B           0.009368
AGE        -0.000269
TAX        -0.012421
CRIM       -0.107474
LSTAT      -0.525966
PTRATIO    -0.940759
DIS        -1.459626
NOX       -16.684645
dtype: float64

alpha 1 일 때 회귀계수 :
RM          3.854000
CHAS        2.552393
RAD         0.290142
ZN          0.047443
B           0.009673
AGE        -0.005415
INDUS      -0.008805
TAX        -0.012912
CRIM       -0.104595
LSTAT      -0.533343
PTRATIO    -0.876074
DIS        -1.372654
NOX       -10.777015
dtype: float64

alpha 10 일 때 회귀계수 :
RM         3.702272
CHAS       1.952021
RAD        0.2

## Lasso

In [5]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha = 10)
neg_mse_scores = cross_val_score(lasso, X_data_boston, y_target_boston, scoring="neg_mean_squared_error", cv = 5)
rmse_scores  = np.sqrt(-1 * neg_mse_scores)
avg_rmse = np.mean(rmse_scores)
print(' 5 folds 의 개별 RMSE scores : ', np.round(rmse_scores,3))
print(' 5 folds 의 평균 RMSE : {0:.3f} '.format(avg_rmse))

 5 folds 의 개별 RMSE scores :  [5.271 6.812 8.962 7.846 4.038]
 5 folds 의 평균 RMSE : 6.586 


In [6]:
# alpha값 하이퍼파라미터튜닝
alphas = [0.07, 0.1, 0.5, 1, 3]

# alphas list 값을 iteration하면서 alpha에 따른 평균 rmse 구함.
for alpha in alphas :
    lasso = Lasso(alpha = alpha)
    
    #cross_val_score를 이용하여 5 fold의 평균 RMSE 계산
    neg_mse_scores = cross_val_score(lasso, X_data_boston, y_target_boston, scoring="neg_mean_squared_error", cv = 5)
    avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
    print('alpha {0} 일 때 5 folds 의 평균 RMSE : {1:.3f} '.format(alpha,avg_rmse))

alpha 0.07 일 때 5 folds 의 평균 RMSE : 5.612 
alpha 0.1 일 때 5 folds 의 평균 RMSE : 5.615 
alpha 0.5 일 때 5 folds 의 평균 RMSE : 5.669 
alpha 1 일 때 5 folds 의 평균 RMSE : 5.776 
alpha 3 일 때 5 folds 의 평균 RMSE : 6.189 


In [7]:
coeff_df = pd.DataFrame()
for alpha in alphas :
    lasso = Lasso(alpha = alpha)
    lasso.fit(X_data_boston , y_target_boston)
    
    coeff = pd.Series(data=lasso.coef_, index=X_data_boston.columns)
    colname='alpha:'+str(alpha)
    coeff_df[colname] = coeff
    
sort_column = 'alpha:'+str(alphas[0])
coeff_df.sort_values(by=sort_column, ascending=False)

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
RM,3.789725,3.703202,2.498212,0.949811,0.0
CHAS,1.434343,0.95519,0.0,0.0,0.0
RAD,0.270936,0.274707,0.277451,0.264206,0.061864
ZN,0.049059,0.049211,0.049544,0.049165,0.037231
B,0.010248,0.010249,0.009469,0.008247,0.00651
NOX,-0.0,-0.0,-0.0,-0.0,0.0
AGE,-0.011706,-0.010037,0.003604,0.02091,0.042495
TAX,-0.01429,-0.01457,-0.015442,-0.015212,-0.008602
INDUS,-0.04212,-0.036619,-0.005253,-0.0,-0.0
CRIM,-0.098193,-0.097894,-0.083289,-0.063437,-0.0


## ElasticNet

In [8]:
from sklearn.linear_model import ElasticNet

alphas = [0.07, 0.1, 0.5, 1, 3]

# alphas list 값을 iteration하면서 alpha에 따른 평균 rmse 구함.
for alpha in alphas :
    elsnet = ElasticNet(alpha = alpha, l1_ratio=0.7)
    
    #cross_val_score를 이용하여 5 fold의 평균 RMSE 계산
    neg_mse_scores = cross_val_score(elsnet, X_data_boston, y_target_boston, scoring="neg_mean_squared_error", cv = 5)
    avg_rmse = np.mean(np.sqrt(-1 * neg_mse_scores))
    print('alpha {0} 일 때 5 folds 의 평균 RMSE : {1:.3f} '.format(alpha,avg_rmse))

alpha 0.07 일 때 5 folds 의 평균 RMSE : 5.542 
alpha 0.1 일 때 5 folds 의 평균 RMSE : 5.526 
alpha 0.5 일 때 5 folds 의 평균 RMSE : 5.467 
alpha 1 일 때 5 folds 의 평균 RMSE : 5.597 
alpha 3 일 때 5 folds 의 평균 RMSE : 6.068 


In [9]:
coeff_df = pd.DataFrame()
for alpha in alphas :
    elsnet = ElasticNet(alpha = alpha, l1_ratio=0.7)
    elsnet.fit(X_data_boston , y_target_boston)
    
    coeff = pd.Series(data=elsnet.coef_, index=X_data_boston.columns)
    colname='alpha:'+str(alpha)
    coeff_df[colname] = coeff
    
sort_column = 'alpha:'+str(alphas[0])
coeff_df.sort_values(by=sort_column, ascending=False)

Unnamed: 0,alpha:0.07,alpha:0.1,alpha:0.5,alpha:1,alpha:3
RM,3.574162,3.414154,1.918419,0.938789,0.0
CHAS,1.330724,0.979706,0.0,0.0,0.0
RAD,0.27888,0.283443,0.300761,0.289299,0.146846
ZN,0.050107,0.050617,0.052878,0.052136,0.038268
B,0.010122,0.010067,0.009114,0.00832,0.00702
AGE,-0.010116,-0.008276,0.00776,0.020348,0.043446
TAX,-0.014522,-0.014814,-0.016046,-0.016218,-0.011417
INDUS,-0.044855,-0.042719,-0.023252,-0.0,-0.0
CRIM,-0.099468,-0.099213,-0.08907,-0.073577,-0.019058
NOX,-0.175072,-0.0,-0.0,-0.0,-0.0


## 로지스틱 회귀

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train_cancer , X_test_cancer, y_train_cancer , y_test_cancer = train_test_split(data_scaled, cancer.target, 
                                                                                  test_size=0.3, random_state=0)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# 로지스틱 회귀를 이용하여 학습 및 예측 수행. 
lr_clf = LogisticRegression()
lr_clf.fit(X_train_cancer, y_train_cancer)
lr_preds = lr_clf.predict(X_test_cancer)

# accuracy와 roc_auc 측정
print('accuracy: {:0.3f}'.format(accuracy_score(y_test_cancer, lr_preds)))
print('roc_auc: {:0.3f}'.format(roc_auc_score(y_test_cancer , lr_preds)))

accuracy: 0.977
roc_auc: 0.972


In [13]:

from sklearn.model_selection import GridSearchCV

params={'penalty':['l2', 'l1'],
        'C':[0.01, 0.1, 1, 1, 5, 10]}

grid_clf = GridSearchCV(lr_clf, param_grid=params, scoring='accuracy', cv=3 )
grid_clf.fit(data_scaled, cancer.target)
print('최적 하이퍼 파라미터:{0}, 최적 평균 정확도:{1:.3f}'.format(grid_clf.best_params_, 
                                                  grid_clf.best_score_))

Traceback (most recent call last):
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.

최적 하이퍼 파라미터:{'C': 1, 'penalty': 'l2'}, 최적 평균 정확도:0.975


Traceback (most recent call last):
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\eunai\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.

## 트리기반 회귀

In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

dt_reg = DecisionTreeRegressor(random_state=0, max_depth=4)
rf_reg = RandomForestRegressor(random_state=0, n_estimators=1000)
gb_reg = GradientBoostingRegressor(random_state=0, n_estimators=1000)
xgb_reg = XGBRegressor(n_estimators=1000)
lgb_reg = LGBMRegressor(n_estimators=1000)

In [25]:
from sklearn.metrics import mean_squared_error

models = [dt_reg, rf_reg, gb_reg, xgb_reg, lgb_reg]

for model in models:
    model.fit(X_train_boston, y_train_boston)
    y_pred = model.predict(X_test_boston)
    mse = mean_squared_error(y_pred, y_test_boston)
    rmse = mse**(1/2)
    
    print(model.__class__.__name__,'회귀 모델의 rmse 값:', rmse)

DecisionTreeRegressor 회귀 모델의 rmse 값: 3.1666749686184286
RandomForestRegressor 회귀 모델의 rmse 값: 2.699553665788792
GradientBoostingRegressor 회귀 모델의 rmse 값: 2.601457572082426
XGBRegressor 회귀 모델의 rmse 값: 2.6291810544326335
LGBMRegressor 회귀 모델의 rmse 값: 2.829125194085535
