# Naive Bayes (분류)

## data/library 불러오기

In [1]:
from sklearn.naive_bayes import GaussianNB

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
data=pd.read_csv("breast-cancer-wisconsin.csv")
x=data[data.columns[1:10]]
y=data[['Class']]

## test/train set 분리하기

In [3]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(x,y,stratify=y,random_state=42)
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(x_train)
x_scaled_train=scaler.transform(x_train)
x_scaled_test=scaler.transform(x_test)

## model 학습하기

In [4]:
model=GaussianNB()
model.fit(x_scaled_train,y_train)

GaussianNB()

In [5]:
pred_train=model.predict(x_scaled_train)
model.score(x_scaled_train,y_train)

0.966796875

In [6]:
from sklearn.metrics import confusion_matrix
confusion_train=confusion_matrix(y_train,pred_train)
print(confusion_train)

[[319  14]
 [  3 176]]


## 학습결과 확인하기

In [8]:
from sklearn.metrics import classification_report
cfreport_train=classification_report(y_train,pred_train)
print(cfreport_train)

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       333
           1       0.93      0.98      0.95       179

    accuracy                           0.97       512
   macro avg       0.96      0.97      0.96       512
weighted avg       0.97      0.97      0.97       512



## Hyperparameter 최적화

### GridSerach

In [11]:
from sklearn.model_selection import GridSearchCV
param_grid={"var_smoothing":[0,1,2,3,4,5,6,7,8,9,10]}
grid_search=GridSearchCV(GaussianNB(),param_grid,cv=5,return_train_score=True)
# 파라미터 찾기
grid_search.fit(x_scaled_train,y_train)

GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             return_train_score=True)

In [12]:
print("Best Parameter : {}".format(grid_search.best_params_))
print("Best Cross-validity Score : {:.4f}".format(grid_search.best_score_))
print('Test set Score : {:.4f}'.format(grid_search.score(x_scaled_test,y_test)))

Best Parameter : {'var_smoothing': 0}
Best Cross-validity Score : 0.9649
Test set Score : 0.9591


### RandomSearch

In [13]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
param_distribs={"var_smoothing":randint(low=0,high=30)} # 무작위 난수 생성
random_search=RandomizedSearchCV(GaussianNB(),param_distributions=param_distribs,n_iter=100,cv=5,return_train_score=True)
# 파라미터 찾기
random_search.fit(x_scaled_train,y_train)

RandomizedSearchCV(cv=5, estimator=GaussianNB(), n_iter=100,
                   param_distributions={'var_smoothing': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000246BB9B5520>},
                   return_train_score=True)

In [14]:
print("Best Parameter : {}".format(random_search.best_params_))
print("Best Cross-validity Score : {:.4f}".format(random_search.best_score_))
print('Test set Score : {:.4f}'.format(random_search.score(x_scaled_test,y_test)))

Best Parameter : {'var_smoothing': 0}
Best Cross-validity Score : 0.9649
Test set Score : 0.9591


# Naive Bayes (회귀)

## data set 나누기

In [33]:
from sklearn.linear_model import BayesianRidge

data=pd.read_csv("house_price.csv")
x=data[data.columns[1:5]]
y=data[['house_value']]
x_train,x_test,y_train,y_test= train_test_split(x,y,random_state=42)

## model 학습하기

In [34]:
model=BayesianRidge()
scaler.fit(x_train)

MinMaxScaler()

In [35]:
x_scaled_train=scaler.transform(x_train)
x_scaled_test=scaler.transform(x_test)

## 학습결과 확인하기

In [38]:
model.fit(x_scaled_train,y_train)
pred_train=model.predict(x_scaled_train)
model.score(x_scaled_train,y_train)

0.5455724466331764

In [39]:
pred_test=model.predict(x_scaled_test)
model.score(x_scaled_test,y_test)

0.5626859871488648

## 오차 (RMSE)확인하기

In [42]:
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train=mean_squared_error(y_train,pred_train)
MSE_test=mean_squared_error(y_test,pred_test)
print(np.sqrt(MSE_train))
print(np.sqrt(MSE_test))

64340.34302948542
63220.68115643447


## Hyperparameter 최적화

### GridSearch

In [43]:
from sklearn.model_selection import GridSearchCV
param_grid={"alpha_1":[1e-06,1e-05,1e-04,1e-03,1e-02,1e-01,1,2,3,4],"lambda_1":[1e-06,1e-05,1e-04,1e-03,1e-02,1e-01,1,2,3,4]}
grid_search=GridSearchCV( BayesianRidge(),param_grid,cv=5,return_train_score=True)
# 파라미터 찾기
grid_search.fit(x_scaled_train,y_train)

GridSearchCV(cv=5, estimator=BayesianRidge(),
             param_grid={'alpha_1': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1,
                                     2, 3, 4],
                         'lambda_1': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1,
                                      2, 3, 4]},
             return_train_score=True)

In [44]:
print("Best Parameter : {}".format(grid_search.best_params_))
print("Best Cross-validity Score : {:.4f}".format(grid_search.best_score_))
print('Test set Score : {:.4f}'.format(grid_search.score(x_scaled_test,y_test)))

Best Parameter : {'alpha_1': 4, 'lambda_1': 1e-06}
Best Cross-validity Score : 0.5452
Test set Score : 0.5627


### RandomSearch

In [52]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
param_distribs={"alpha_1":randint(low=1e-06,high=10),"lambda_1":randint(low=1e-06,high=10)} # 무작위 난수 생성
random_search=RandomizedSearchCV(BayesianRidge(),param_distributions=param_distribs,n_iter=50,cv=5,return_train_score=True)
# 파라미터 찾기

random_search.fit(x_scaled_train,y_train)

RandomizedSearchCV(cv=5, estimator=BayesianRidge(), n_iter=50,
                   param_distributions={'alpha_1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000246BC11D9D0>,
                                        'lambda_1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000246BC11DA90>},
                   return_train_score=True)

In [53]:
print("Best Parameter : {}".format(random_search.best_params_))
print("Best Cross-validity Score : {:.4f}".format(random_search.best_score_))
print('Test set Score : {:.4f}'.format(random_search.score(x_scaled_test,y_test)))

Best Parameter : {'alpha_1': 6, 'lambda_1': 0}
Best Cross-validity Score : 0.5452
Test set Score : 0.5627
