### Cross Validation Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 C, 약물 X 및 약물 Y 중 하나에 반응했다.
- 미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

##### feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨 대 칼륨 비율

##### target
- Drug: 의약품, 환자에게 효과가 있었던 약

In [2]:
import pandas as pd

# Load the drug-response dataset from the local CSV into a DataFrame.
path = './datasets/drugs.csv'
d_df = pd.read_csv(filepath_or_buffer=path)

# Trailing expression so the notebook renders a preview of the table.
d_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [None]:
# Preprocessing: missing values, duplicate rows, outliers
# Cross-validation
# Train/test split

In [3]:
# Work on an independent (deep) copy so the raw frame `d_df` stays untouched.
d_need_df = d_df.copy(deep=True)

In [9]:
# Missing values: NaN count per column.
# Printed explicitly because a notebook cell only renders its last expression,
# so a bare `isna().sum()` in the middle of the cell would be silently dropped.
print(d_need_df.isna().sum())

# Duplicate rows (this is a duplicate check, not an outlier check):
# number of rows that exactly repeat an earlier row.
d_need_df.duplicated().sum()

0

In [15]:
# Summary statistics for Age, laid out as a single row (one column per statistic).
d_need_df[['Age']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,200.0,44.315,16.544315,15.0,31.0,45.0,58.0,74.0


In [31]:
# Outlier removal on Na_to_K via standardization (z-scores).
from sklearn.preprocessing import StandardScaler

# Positional copy of the column: reset_index() gives a 0..n-1 index that lines
# up with the row positions of d_need_df.
n_k_df = d_need_df['Na_to_K'].reset_index()

# Replace the raw values with their standardized values.
scaler = StandardScaler()
std_n_k = scaler.fit_transform(n_k_df[['Na_to_K']])
n_k_df['Na_to_K'] = std_n_k

# Keep only rows whose z-score lies in [-1.96, 1.96] (both bounds inclusive).
con = n_k_df['Na_to_K'].between(-1.96, 1.96)

# Apply the mask by position and renumber the surviving rows from 0.
d_need_df = d_need_df.iloc[con.to_numpy()].reset_index(drop=True)
d_need_df


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
173,56,F,LOW,HIGH,11.567,drugC
174,16,M,LOW,HIGH,12.006,drugC
175,52,M,NORMAL,HIGH,9.894,drugX
176,23,M,NORMAL,NORMAL,14.020,drugX


In [41]:
# Encode the categorical columns as integer codes.
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept under its own name so the fitted classes_
# (and inverse_transform) remain available afterwards.
drug_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
bp_encoder = LabelEncoder()
cho_encoder = LabelEncoder()

column_encoders = {
    'Drug': drug_encoder,
    'Sex': gender_encoder,
    'BP': bp_encoder,
    'Cholesterol': cho_encoder,
}

# Fit each encoder on its column's values and overwrite the column in place.
for column, encoder in column_encoders.items():
    d_need_df[column] = encoder.fit_transform(d_need_df[column].tolist())

d_need_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4
...,...,...,...,...,...,...
173,56,0,1,0,11.567,2
174,16,1,1,0,12.006,2
175,52,1,2,0,9.894,3
176,23,1,2,1,14.020,3


array([4, 2, 2, 3, 4, 3, 4, 2, 4, 4, 2, 4, 4, 4, 3, 4, 3, 0, 2, 4, 4, 4,
       4, 4, 3, 4, 4, 3, 1, 3, 3, 3, 0, 3, 3, 3, 4, 1, 4, 3, 3, 3, 0, 2,
       4, 4, 3, 4, 1, 2, 1, 3, 4, 0, 4, 3, 1, 0, 3, 4, 4, 1, 4, 3, 4, 4,
       4, 0, 4, 0, 3, 1, 3, 2, 0, 2, 1, 3, 4, 4, 4, 4, 4, 4, 3, 4, 4, 0,
       0, 2, 3, 3, 3, 4, 1, 4, 0, 3, 3, 3, 3, 4, 3, 3, 0, 4, 4, 4, 4, 1,
       4, 4, 3, 3, 4, 3, 4, 4, 3, 1, 0, 1, 3, 0, 4, 1, 4, 0, 3, 3, 0, 3,
       2, 0, 1, 3, 3, 4, 2, 0, 4, 2, 3, 3, 1, 3, 4, 4, 3, 4, 0, 3, 3, 4,
       4, 0, 4, 0, 4, 4, 4, 4, 3, 3, 4, 4, 1, 0, 4, 4, 0, 4, 2, 2, 2, 3,
       3, 3], dtype=int64)

In [None]:
# (Scratch cell intentionally left empty: it previously held the undefined name
# `dd`, which raised NameError when the notebook was run top to bottom.)

In [62]:
# Cross-validation with GridSearchCV over a decision-tree classifier.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Every column except the last is a feature; the last column (Drug) is the target.
features = d_need_df.iloc[:, :-1].values
targets = d_need_df.iloc[:, -1].values

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=124
)

# Hyper-parameter grid. The tree parameter is `min_samples_split`;
# `min_depth_split` does not exist and makes GridSearchCV fail on fit.
params = {'max_depth': [2, 3], 'min_samples_split': [2, 3]}

# Fixed seed so repeated runs give the same tree.
dtc = DecisionTreeClassifier(random_state=124)

# 5-fold cross-validation on the training split only; refit=True retrains the
# best parameter combination on the whole training split afterwards.
grid_dtc = GridSearchCV(
    dtc, param_grid=params, cv=5, refit=True, return_train_score=True
)
grid_dtc.fit(X_train, y_train)

print(f'best params: {grid_dtc.best_params_}')
print(f'best CV accuracy: {grid_dtc.best_score_:.4f}')

# Evaluate the refitted best estimator on the held-out test split.
prediction = grid_dtc.predict(X_test)
print(f'test accuracy: {accuracy_score(y_test, prediction):.4f}')
