### 라이브러리 및 데이터 확인

In [1]:
import pandas as pd
import numpy as np

In [91]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# Load the scikit-learn wine dataset: features as a frame with named columns,
# class labels as a single-column frame.
wine = load_wine()
x = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.DataFrame(wine.target)

# Stratified 80/20 split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    stratify=y,
    random_state=2023,
)

x_test = pd.DataFrame(x_test)
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)

# reset_index() returns a new frame; the original call discarded the result
# and therefore did nothing. drop=True discards the old row labels instead of
# inserting them as an extra feature column.
x_test = x_test.reset_index(drop=True)
y_train.columns = ['target']

### 와인의 종류를 분류
- 데이터의 결측치, 이상치 처리
- 정확도, F1스코어, AUC값
- 양식에 맞게 제출

### 1. 데이터탐색

In [86]:
# Row/column counts of each frame.
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" after each report.
x_train.info()
x_test.info()
y_train.info()

(142, 13)
(36, 13)
(142, 1)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 52 to 115
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   alcohol                       142 non-null    float64
 1   malic_acid                    142 non-null    float64
 2   ash                           142 non-null    float64
 3   alcalinity_of_ash             142 non-null    float64
 4   magnesium                     142 non-null    float64
 5   total_phenols                 142 non-null    float64
 6   flavanoids                    142 non-null    float64
 7   nonflavanoid_phenols          142 non-null    float64
 8   proanthocyanins               142 non-null    float64
 9   color_intensity               142 non-null    float64
 10  hue                           142 non-null    float64
 11  od280/od315_of_diluted_wines  142 non-null    float64
 12  proline                       142 n

In [24]:
#-- Basic summary statistics for the features (train, test) and the target
print(
    x_train.describe(),
    x_test.describe(),
    y_train.describe(),
    sep='\n',
)

          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  \
count  142.000000  142.000000  142.000000         142.000000  142.000000   
mean    13.025915    2.354296    2.340211          19.354225   98.732394   
std      0.812423    1.142722    0.279910           3.476825   13.581859   
min     11.030000    0.740000    1.360000          10.600000   70.000000   
25%     12.370000    1.610000    2.190000          16.800000   88.000000   
50%     13.050000    1.820000    2.320000          19.300000   97.000000   
75%     13.685000    3.115000    2.510000          21.500000  106.750000   
max     14.830000    5.800000    3.230000          30.000000  151.000000   

       total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
count     142.000000  142.000000            142.000000       142.000000   
mean        2.303592    2.043592              0.361479         1.575070   
std         0.633955    1.033597              0.124627         0.576798   
min         0.9

In [66]:
# Number of training rows for each distinct label value.
label_counts = y_train.value_counts()
print(label_counts)

target
1         57
0         47
2         38
dtype: int64


### 2. 데이터 전처리(결측치, 이상치, 변수 제거)

In [87]:
#-- Missing-value check: one grand total of null cells per frame
for frame in (x_train, y_train, x_test):
    print(frame.isna().sum().sum())

0
0
0


In [74]:
#-- 결측치 채우기
# df= df.dropna() #-- 행 기준으로 삭제
#-- 결측치 대체
#-- 연속형 : 중앙값 median()
#-- 범주형 : 최빈값
#-- df.fillna(채울값)

#-- 이상치 대체
# df['변수명'] = np.where(df['변수명'] >= 5, 대체할_값, df['변수명'])

#-- 변수처리
#-- 불필요한 변수 처리
# df = df.drop(['변수1', '변수2'], axis=1)

#-- 원핫인코딩
# x_train = pd.get_dummies(x_train)
# x_test = pd.get_dummies(x_test)

### 3. 데이터 분리

In [88]:
# Row count of the features, then of the target column.
print(x_train.shape[0])
print(y_train['target'].size)

142
142


In [92]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set, stratified on the
# target column and seeded for reproducibility. Note that x_train / y_train
# are overwritten by the smaller training part.
labels = y_train['target']
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=2023,
)

# Feature shapes (train, validation) followed by label shapes.
for part in (x_train, x_val, y_train, y_val):
    print(part.shape)

(113, 13)
(29, 13)
(113,)
(29,)


### 4. 모델링 및 성능평가

In [93]:
#-- Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Seeded with the same value as the data splits so repeated runs build the
# same forest and report the same scores.
model = RandomForestClassifier(random_state = 2023)
model.fit(x_train, y_train)

In [96]:
#-- Predict on the validation split
y_pred = model.predict(x_val)

#-- Evaluate
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average = 'macro')
# roc_auc_score is not limited to binary targets: for multiclass it takes the
# per-class probabilities plus an explicit scheme ('ovr' = one-vs-rest).
roc_auc = roc_auc_score(y_val, model.predict_proba(x_val), multi_class = 'ovr')

In [99]:
#-- Accuracy on the first line, F1 score on the second
print(acc, f1, sep='\n')

1.0
1.0


### 5. 예측값 제출


In [106]:
#-- 1. Predicted class for each test row
y_result = model.predict(x_test)
print(y_result[:5])

#-- 2. Probability of each class for each test row
y_result_prob = model.predict_proba(x_test)
print(y_result_prob[:5])

#-- Collect both into one frame
# Probability columns are named after model.classes_ (same order as the
# predict_proba columns) instead of hard-coding three classes, so the frame
# stays correct if the number or order of classes changes.
result_prob = pd.DataFrame(
    y_result_prob,
    columns=[f'prob_{label}' for label in model.classes_],
)
result_prob.insert(0, 'result', y_result)

print(result_prob[:5])

[2 2 2 0 1]
[[0.03 0.02 0.95]
 [0.08 0.14 0.78]
 [0.04 0.09 0.87]
 [0.96 0.03 0.01]
 [0.06 0.89 0.05]]
   result  prob_0  prob_1  prob_2
0       2    0.03    0.02    0.95
1       2    0.08    0.14    0.78
2       2    0.04    0.09    0.87
3       0    0.96    0.03    0.01
4       1    0.06    0.89    0.05
