# 와인 데이터 살펴보기

In [None]:
from sklearn.datasets import load_wine

# Load the wine dataset, then show its available keys, the class label names
# and the full dataset description, one item per line.
wine = load_wine()
print(wine.keys(), wine.target_names, wine.DESCR, sep='\n')

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
['class_0' 'class_1' 'class_2']
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    1

In [None]:
import pandas as pd

# One column per feature name, with the class label appended as a trailing
# `Target` column; the bare expression on the last line displays the table.
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(Target=wine.target)
wine_df

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


## 학습, 평가 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

X, y = wine.data, wine.target

# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split -- and every statistic and score derived from it in later cells --
# repeatable; without it each execution shuffles the rows differently.
# NOTE: outputs recorded in this notebook came from an unseeded run, so the
# exact numbers will differ after re-executing with this seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shapes of the full, training and evaluation arrays.
print('전체 데이터 크기: {}'.format(X.shape))
print('학습 데이터 크기: {}'.format(X_train.shape))
print('평가 데이터 크기: {}'.format(X_test.shape))

전체 데이터 크기: (178, 13)
학습 데이터 크기: (142, 13)
평가 데이터 크기: (36, 13)


## 전처리

### 전처리 전 데이터

In [None]:
# Summary statistics of every feature over the whole dataset, before scaling.
# Rebinds `wine_df` to a features-only frame (no `Target` column).
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0


In [None]:
# Summary statistics of the unscaled training split.
wine_train_df = pd.DataFrame(X_train, columns=wine.feature_names)
wine_train_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,12.976056,2.341127,2.37331,19.490845,99.464789,2.320282,2.047676,0.363169,1.590423,4.982746,0.956451,2.640141,752.542254
std,0.839702,1.156191,0.281316,3.399748,13.765339,0.606335,1.004491,0.125785,0.564312,2.237932,0.23098,0.698788,323.009598
min,11.03,0.74,1.7,11.2,70.0,0.98,0.34,0.13,0.41,1.74,0.48,1.27,278.0
25%,12.33,1.6025,2.2025,17.2,89.0,1.8,1.2125,0.27,1.25,3.22,0.7825,2.0525,504.0
50%,13.01,1.87,2.36,19.45,97.0,2.405,2.17,0.335,1.535,4.6,0.98,2.825,666.0
75%,13.7,3.03,2.58,21.5,107.0,2.8,2.79,0.4475,1.9675,6.075,1.12,3.1775,1007.5
max,14.83,5.8,3.23,30.0,151.0,3.88,5.08,0.66,3.58,13.0,1.71,3.92,1680.0


In [None]:
# Summary statistics of the unscaled evaluation split.
wine_test_df = pd.DataFrame(X_test, columns=wine.feature_names)
wine_test_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,13.0975,2.3175,2.339722,19.511111,100.833333,2.195833,1.956667,0.356667,1.592778,5.355278,0.961389,2.499444,724.611111
std,0.693057,0.962097,0.246744,3.136402,16.33139,0.697888,0.9869,0.120641,0.61139,2.624696,0.221941,0.752212,283.792709
min,11.65,0.94,1.36,10.6,80.0,1.3,0.56,0.17,0.42,1.28,0.57,1.33,355.0
25%,12.5975,1.6375,2.26,17.15,87.0,1.5975,1.145,0.2675,1.1125,3.2375,0.8175,1.7875,493.25
50%,13.22,1.845,2.36,19.75,101.0,1.99,1.755,0.355,1.625,5.1,0.96,2.6,677.5
75%,13.65,3.14,2.5075,21.125,111.5,2.7125,2.89,0.43,1.87,6.3375,1.0875,3.0775,924.25
max,14.37,4.31,2.69,26.0,162.0,3.85,3.56,0.61,3.28,10.68,1.45,4.0,1480.0


### 전처리 과정

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits, so the evaluation data never influences scaling.
scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

### 전처리 후 데이터

In [None]:
# Summary statistics of the training split after scaling.
wine_train_df = pd.DataFrame(X_train_scale, columns=wine.feature_names)
wine_train_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0,142.0
mean,1.622333e-16,-1.229064e-15,4.097906e-15,-1.677062e-15,-1.219682e-16,-5.574571e-16,-7.497915e-16,-6.989714e-16,-2.595733e-16,-1.420616e-15,-2.618015e-15,1.042984e-15,-5.774675000000001e-17
std,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354,1.00354
min,-2.32576,-1.389731,-2.401902,-2.447297,-2.148083,-2.218288,-1.706059,-1.860276,-2.099195,-1.454122,-2.070034,-1.967679,-1.474328
25%,-0.7721113,-0.6411064,-0.6093311,-0.6762132,-0.7629185,-0.8611133,-0.8343851,-0.743324,-0.6053877,-0.7904558,-0.7557629,-0.8439195,-0.7721815
50%,0.04056656,-0.4089242,-0.04748034,-0.01205668,-0.1796914,0.1402165,0.1222081,-0.2247389,-0.09856023,-0.1716323,0.1023145,0.2654789,-0.2688731
75%,0.8651955,0.5979218,0.737327,0.5930637,0.5493424,0.7939773,0.741621,0.6728122,0.6705727,0.4897915,0.710572,0.7717093,0.7921135
max,2.215675,3.002201,3.056076,3.102099,3.757091,2.581475,3.029452,2.368186,3.538149,3.59512,3.273943,1.838024,2.881465


In [None]:
# Summary statistics of the evaluation split after applying the scaler that
# was fitted on the training split.
wine_test_df = pd.DataFrame(X_test_scale, columns=wine.feature_names)
wine_test_df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
count,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0,36.0
mean,0.145139,-0.020507,-0.119817,0.005982,0.099772,-0.205973,-0.090923,-0.051877,0.004188,0.167052,0.021455,-0.202056,-0.086778
std,0.828282,0.835072,0.880213,0.925805,1.190614,1.155068,0.985965,0.962503,1.087261,1.176974,0.964268,1.080262,0.881699
min,-1.584789,-1.216137,-3.614787,-2.624406,-1.419049,-1.688659,-1.486267,-1.541147,-2.081412,-1.660396,-1.679011,-1.881512,-1.235101
25%,-0.452418,-0.610727,-0.404211,-0.690972,-0.908725,-1.196269,-0.901821,-0.76327,-0.84991,-0.782608,-0.603699,-1.22449,-0.80558
50%,0.291541,-0.430623,-0.04748,0.076498,0.111922,-0.546646,-0.292399,-0.065174,0.061491,0.052579,0.015421,-0.057647,-0.233144
75%,0.80544,0.693399,0.478697,0.482371,0.877408,0.649157,0.841526,0.533193,0.497184,0.607502,0.569369,0.628098,0.533469
max,1.665922,1.708924,1.129731,1.921377,4.559029,2.531822,1.510892,1.969275,3.004647,2.554779,2.144322,1.952913,2.260096


# 나이브 베이즈 분류기(Naive Bayes Classifier)

## 가우시안 나이브 베이즈

In [None]:
# No visible earlier cell imports GaussianNB, so a fresh top-to-bottom run
# would raise NameError here; importing in this cell keeps it self-contained.
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the scaled training split.
# `fit` returns the estimator, so the cell displays its repr.
model = GaussianNB()
model.fit(X_train_scale, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# No visible earlier cell imports `metrics`, so a fresh top-to-bottom run
# would raise NameError here; importing in this cell keeps it self-contained.
from sklearn import metrics

# Score the fitted model on the same data it was trained on.
predict = model.predict(X_train_scale)
acc = metrics.accuracy_score(y_train, predict)
# average=None yields one F1 value per class instead of a single aggregate.
f1 = metrics.f1_score(y_train, predict, average=None)

print('Train Accuracy: {}'.format(acc))
print('Train F1 Score: {}'.format(f1))

Train Accuracy: 0.9859154929577465
Train F1 Score: [0.98901099 0.98181818 0.98795181]


In [None]:
# Evaluate on the held-out split: overall accuracy plus one F1 per class.
predict = model.predict(X_test_scale)
acc = metrics.accuracy_score(y_true=y_test, y_pred=predict)
f1 = metrics.f1_score(y_true=y_test, y_pred=predict, average=None)

print('Test Accuracy: {}'.format(acc), 'Test F1 Score: {}'.format(f1), sep='\n')

Test Accuracy: 0.9722222222222222
Test F1 Score: [0.96       0.96969697 1.        ]


### `cross_validate()`: 교차 검증

In [None]:
import multiprocessing

from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

# Scaling is bundled into the pipeline and the raw X, y are passed in, so the
# scaler is fitted as part of the estimator rather than ahead of time.
estimator = make_pipeline(StandardScaler(), GaussianNB())

# 5-fold cross-validation using one worker per CPU reported by the OS; the
# returned dict of timings and per-fold scores is displayed as cell output.
cross_validate(estimator, X, y, cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished


{'fit_time': array([0.00254655, 0.00249124, 0.00229311, 0.00220704, 0.00187302]),
 'score_time': array([0.00099492, 0.00083995, 0.00082517, 0.0008049 , 0.00051641]),
 'test_score': array([0.94444444, 0.97222222, 0.97222222, 0.94285714, 1.        ])}

## 베르누이 나이브 베이즈

In [None]:
# No visible earlier cell imports BernoulliNB, so a fresh top-to-bottom run
# would raise NameError here; importing in this cell keeps it self-contained.
from sklearn.naive_bayes import BernoulliNB

# Fit a Bernoulli naive Bayes classifier on the scaled training split.
# With the default binarize=0.0, every feature is thresholded at zero before
# fitting. `fit` returns the estimator, so the cell displays its repr.
model = BernoulliNB()
model.fit(X_train_scale, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [None]:
# Score the current model on the training split: accuracy and per-class F1.
predict = model.predict(X_train_scale)
acc = metrics.accuracy_score(y_true=y_train, y_pred=predict)
f1 = metrics.f1_score(y_true=y_train, y_pred=predict, average=None)

print('Train Accuracy: {}'.format(acc), 'Train F1 Score: {}'.format(f1), sep='\n')

Train Accuracy: 0.9436619718309859
Train F1 Score: [0.96842105 0.92307692 0.94117647]


In [None]:
# Score the current model on the held-out split: accuracy and per-class F1.
predict = model.predict(X_test_scale)
acc = metrics.accuracy_score(y_true=y_test, y_pred=predict)
f1 = metrics.f1_score(y_true=y_test, y_pred=predict, average=None)

print('Test Accuracy: {}'.format(acc), 'Test F1 Score: {}'.format(f1), sep='\n')

Test Accuracy: 0.9444444444444444
Test F1 Score: [0.92307692 0.9375     1.        ]


### `cross_validate()`: 교차 검증

In [None]:
# Standardization followed by Bernoulli naive Bayes, evaluated as one unit on
# the raw X, y.
estimator = make_pipeline(StandardScaler(), BernoulliNB())

# 5-fold cross-validation using one worker per CPU reported by the OS; the
# returned dict of timings and per-fold scores is displayed as cell output.
cross_validate(estimator, X, y, cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished


{'fit_time': array([0.01035213, 0.00310516, 0.00413418, 0.00218463, 0.00258732]),
 'score_time': array([0.00085449, 0.00087261, 0.00075054, 0.00074768, 0.00066853]),
 'test_score': array([0.86111111, 0.91666667, 0.97222222, 0.94285714, 0.94285714])}

### `GridSearchCV()`: 최적의 하이퍼 파라미터 찾기

In [None]:
import multiprocessing

from sklearn.model_selection import GridSearchCV

# Candidate values for BernoulliNB's `alpha` parameter.
param_grid = [{'alpha': [0.001, 0.01, 0.05, 0.1, 0.2, 1.0, 10.0]}]

# Search the grid on the already-scaled training split, one worker per CPU.
# `fit` returns the search object, so the cell displays its repr.
gs = GridSearchCV(
    BernoulliNB(),
    param_grid,
    n_jobs=multiprocessing.cpu_count(),
)
gs.fit(X_train_scale, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='deprecated', n_jobs=2,
             param_grid=[{'alpha': [0.001, 0.01, 0.05, 0.1, 0.2, 1.0, 10.0]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Display the estimator selected by the grid search above.
gs.best_estimator_

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [None]:
# Print the best score recorded by the grid search (`best_score_`).
print("GridSearchCV Score: {}".format(gs.best_score_))

GridSearchCV Score: 0.9298029556650247


## 다항 나이브 베이즈

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Re-scale the raw splits with a MinMaxScaler fitted on the training split.
# NOTE: this rebinds `scaler`, `X_train_scale` and `X_test_scale`, replacing
# the standardized arrays used by the earlier cells.
# Presumably chosen because MultinomialNB needs non-negative inputs, which
# standardization does not provide -- confirm against the scikit-learn docs.
scaler = MinMaxScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [None]:
# No visible earlier cell imports MultinomialNB, so a fresh top-to-bottom run
# would raise NameError here; importing in this cell keeps it self-contained.
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the scaled training split.
# `fit` returns the estimator, so the cell displays its repr.
model = MultinomialNB()
model.fit(X_train_scale, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Score the current model on the training split: accuracy and per-class F1.
predict = model.predict(X_train_scale)
acc = metrics.accuracy_score(y_true=y_train, y_pred=predict)
f1 = metrics.f1_score(y_true=y_train, y_pred=predict, average=None)

print('Train Accuracy: {}'.format(acc), 'Train F1 Score: {}'.format(f1), sep='\n')

Train Accuracy: 0.9647887323943662
Train F1 Score: [0.94382022 0.95575221 1.        ]


In [None]:
# Score the current model on the held-out split: accuracy and per-class F1.
predict = model.predict(X_test_scale)
acc = metrics.accuracy_score(y_true=y_test, y_pred=predict)
f1 = metrics.f1_score(y_true=y_test, y_pred=predict, average=None)

print('Test Accuracy: {}'.format(acc), 'Test F1 Score: {}'.format(f1), sep='\n')

Test Accuracy: 0.9166666666666666
Test F1 Score: [0.86956522 0.91428571 1.        ]


### `cross_validate()`: 교차 검증

In [None]:
# Min-max scaling followed by multinomial naive Bayes, evaluated as one unit
# on the raw X, y.
estimator = make_pipeline(MinMaxScaler(), MultinomialNB())

# 5-fold cross-validation using one worker per CPU reported by the OS; the
# returned dict of timings and per-fold scores is displayed as cell output.
cross_validate(estimator, X, y, cv=5, n_jobs=multiprocessing.cpu_count(), verbose=True)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.8s finished


{'fit_time': array([0.00309801, 0.0026319 , 0.00179219, 0.0032115 , 0.00175691]),
 'score_time': array([0.00073791, 0.00065994, 0.00059366, 0.00059438, 0.0006361 ]),
 'test_score': array([0.91666667, 0.94444444, 0.88888889, 0.97142857, 1.        ])}

### `GridSearchCV()`: 최적의 하이퍼 파라미터 찾기

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for MultinomialNB's `alpha` parameter.
param_grid = [{'alpha': [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]}]

# Search the grid on the already-scaled training split.
# `fit` returns the search object, so the cell displays its repr.
gs = GridSearchCV(MultinomialNB(), param_grid)
gs.fit(X_train_scale, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='deprecated', n_jobs=None,
             param_grid=[{'alpha': [1.0, 1.2, 1.4, 1.6, 1.8, 2.0]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [None]:
# Display the estimator selected by the grid search above.
gs.best_estimator_

MultinomialNB(alpha=1.6, class_prior=None, fit_prior=True)

In [None]:
# Print the best score recorded by the grid search (`best_score_`).
print("GridSearchCV Score: {}".format(gs.best_score_))

GridSearchCV Score: 0.9650246305418719
