# 3.1 핵심 개념
- P(A|B) 조건부 확률과 베이즈 정리를 이용한 알고리즘
- 나이브 : 예측에 사용되는 특성치가 상호 독립적이라는 가정하에 롹률 계산을 단순화하기 나이브(단순/ 순진한 가정)
- 모든 특성치가 분류 혹은 예측하는 데에 동등한 역할을 한다

# 3.3 분석 코드

## Part1. 분류

In [21]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
data=pd.read_csv("../data/breast-cancer-wisconsin.csv", encoding='utf-8')
X=data[data.columns[1:10]]
y=data[['Class']]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(X, y, stratify=y, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(X_train)
X_scaled_train=scaler.transform(X_train)
X_scaled_test=scaler.transform(X_test)

from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model.fit(X_scaled_train, y_train)

GaussianNB()

In [22]:
pred_train=model.predict(X_train)
model.score(X_scaled_train, y_train)

0.966796875

In [5]:
from sklearn.metrics import confusion_matrix
confusion_train=confusion_matrix(y_train, pred_train)
print("훈련데이터 오차행렬 : \n", confusion_train)

훈련데이터 오차행렬 : 
 [[319  14]
 [  3 176]]


In [6]:
from sklearn.metrics import classification_report
cfreport_train=classification_report(y_train, pred_train)
print("분류예측 레포트 : \n", cfreport_train)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.99      0.96      0.97       333
           1       0.93      0.98      0.95       179

    accuracy                           0.97       512
   macro avg       0.96      0.97      0.96       512
weighted avg       0.97      0.97      0.97       512



In [23]:
pred_test=model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.9590643274853801

In [9]:
confusion_test=confusion_matrix(y_test, pred_test)
print("테스트데이터 오차행렬 : \n",confusion_test)

테스트데이터 오차행렬 : 
 [[106   5]
 [  2  58]]


In [10]:
cfreport_test=classification_report(y_test, pred_test)
print("분류예측 레포트 : \n", cfreport_test)

분류예측 레포트 : 
               precision    recall  f1-score   support

           0       0.98      0.95      0.97       111
           1       0.92      0.97      0.94        60

    accuracy                           0.96       171
   macro avg       0.95      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [24]:
# Grid Search
from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(GaussianNB(),
                        param_grid={'var_smoothing':[0,1,2,3,4,5,6,7,8,9,10]},
                        cv=5,
                        return_train_score=True)
grid_search.fit(X_scaled_train, y_train)

GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             return_train_score=True)

In [25]:
print(f"Best Parameter : {grid_search.best_params_}")
print(f"Best Score : {grid_search.best_score_:.4f}")
print(f"TestSet Score : {grid_search.score(X_scaled_test, y_test):.4f}")

Best Parameter : {'var_smoothing': 0}
Best Score : 0.9649
TestSet Score : 0.9591


In [26]:
# Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
random_search=RandomizedSearchCV(GaussianNB(),
                                param_distributions={'var_smoothing' : randint(low=0, high=20)},
                                cv=5,
                                return_train_score=True)
random_search.fit(X_scaled_train, y_train)

RandomizedSearchCV(cv=5, estimator=GaussianNB(),
                   param_distributions={'var_smoothing': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000024D776BC4E0>},
                   return_train_score=True)

In [28]:
print(f"Best Parameter : {random_search.best_params_}")
print(f"Best Score : {random_search.best_score_:.4f}")
print(f"TestSet Score : {random_search.score(X_scaled_test, y_test):.4f}")

Best Parameter : {'var_smoothing': 0}
Best Score : 0.9649
TestSet Score : 0.9591


## Part2. 회귀

In [45]:
data2=pd.read_csv("../data/house_price.csv", encoding='utf-8')

X=data2[data2.columns[:-1]]
y=data2[['house_value']]

X_train, X_test, y_train, y_test=train_test_split(X, y, random_state=42)

scaler=MinMaxScaler()
scaler.fit(X_train)
X_scaled_train=scaler.transform(X_train)
X_scaled_test=scaler.transform(X_test)

from sklearn.linear_model import BayesianRidge
model=BayesianRidge()
model.fit(X_scaled_train,y_train)

BayesianRidge()

In [46]:
pred_train=model.predict(X_scaled_train)
model.score(X_scaled_train, y_train)

0.5706920449333217

In [47]:
pred_test=model.predict(X_scaled_test)
model.score(X_scaled_test, y_test)

0.5826111218474421

In [52]:
# RMSE
import numpy as np
from sklearn.metrics import mean_squared_error
MSE_train=mean_squared_error(y_train, pred_train)
MSE_test=mean_squared_error(y_test, pred_test)
print(f"훈련 데이터 RMSE : {np.sqrt(MSE_train):.4f}")
print(f"테스트 데이터 RMSE : {np.sqrt(MSE_test):.4f}")

훈련 데이터 RMSE : 62536.7794
테스트 데이터 RMSE : 61763.6439


In [56]:
# Grid Search
grid_search=GridSearchCV(BayesianRidge(),
                        param_grid={'alpha_1':[1e-06, 1e-05,1e-04,1e-03,1e-02,1e-01,1,2,3,4],
                                   'lambda_1':[1e-06, 1e-05,1e-04,1e-03,1e-02,1e-01,1,2,3,4]},
                        cv=5)
grid_search.fit(X_scaled_train,y_train)

print(f"Best Parameter : {grid_search.best_params_}")
print(f"Best Score : {grid_search.best_score_:.4f}")
print(f"TestSet Score : {grid_search.score(X_scaled_test, y_test):.4f}")

Best Parameter : {'alpha_1': 4, 'lambda_1': 1e-06}
Best Score : 0.5703
TestSet Score : 0.5826


In [57]:
# Random Search
random_search=RandomizedSearchCV(BayesianRidge(),
                                 param_distributions={'alpha_1':randint(low=1e-06,high=10),
                                                      'lambda_1':randint(low=1e-06,high=10)},
                        cv=5,n_iter=20)
random_search.fit(X_scaled_train,y_train)

print(f"Best Parameter : {random_search.best_params_}")
print(f"Best Score : {random_search.best_score_:.4f}")
print(f"TestSet Score : {random_search.score(X_scaled_test, y_test):.4f}")

Best Parameter : {'alpha_1': 6, 'lambda_1': 0}
Best Score : 0.5703
TestSet Score : 0.5826
