### Cross Validation Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 C, 약물 X 및 약물 Y 중 하나에 반응했다.
- 미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

##### feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨 대 칼륨 비율

##### target
- Drug: 의약품, 환자에게 효과가 있었던 약

In [1]:
import pandas as pd

# Load the patient records; the rendered table shows the columns
# Age, Sex, BP, Cholesterol, Na_to_K and the target column Drug.
drug_df = pd.read_csv('./datasets/drugs.csv')
# Bare trailing expression: the notebook renders the DataFrame as the cell output.
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [2]:
# Pull out the Na_to_K column on its own. reset_index() turns the Series into a
# two-column frame: 'index' (the original row labels) and 'Na_to_K', with a
# fresh 0..n-1 index on the new frame.
na_to_k_df = drug_df['Na_to_K'].reset_index()

In [3]:
# Display the extracted frame ('index' and 'Na_to_K' columns) as the cell output.
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,25.355
1,1,13.093
2,2,10.114
3,3,7.798
4,4,18.043
...,...,...
195,195,11.567
196,196,12.006
197,197,9.894
198,198,14.020


In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize Na_to_K to z-scores (zero mean, unit variance).
# The double brackets keep the input 2-D, which is the shape the scaler expects.
std = StandardScaler()
std.fit(na_to_k_df[['Na_to_K']])
std_na_to_k = std.transform(na_to_k_df[['Na_to_K']])

In [5]:
# Overwrite the raw values with their z-scores. The scaler returns an (n, 1)
# array, so take its only column to get a plain 1-D vector for the assignment.
na_to_k_df['Na_to_K'] = std_na_to_k[:, 0]

In [7]:
# Drop outliers: keep only rows whose z-score lies in [-1.96, 1.96], bounds included.
# NOTE(review): 1.96 is presumably the two-sided 95% normal cutoff - confirm intent.
within_bounds = (na_to_k_df['Na_to_K'] >= -1.96) & (na_to_k_df['Na_to_K'] <= 1.96)
na_to_k_df = na_to_k_df[within_bounds]

In [9]:
# Apply the same outlier filter to the full dataset. na_to_k_df was built with
# reset_index(), so its surviving index values are row *positions* in drug_df,
# which is why positional .iloc is used here. The kept rows are then renumbered from 0.
kept_positions = na_to_k_df.index
drug_df = drug_df.iloc[kept_positions].reset_index(drop=True)

In [10]:
# Display the outlier-filtered dataset (the index has been renumbered from 0).
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
183,56,F,LOW,HIGH,11.567,drugC
184,16,M,LOW,HIGH,12.006,drugC
185,52,M,NORMAL,HIGH,9.894,drugX
186,23,M,NORMAL,NORMAL,14.020,drugX


In [13]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, each kept under its own name so the
# integer codes can be mapped back to the original labels afterwards.
drugs_encoder, genders_encoder, blood_pressures_encoder, cholesterols_encoder = (
    LabelEncoder() for _ in range(4)
)

# Fit every encoder on the raw string values and get the integer codes.
targets = drugs_encoder.fit_transform(drug_df['Drug'].tolist())
genders = genders_encoder.fit_transform(drug_df['Sex'].tolist())
blood_pressures = blood_pressures_encoder.fit_transform(drug_df['BP'].tolist())
cholesterols = cholesterols_encoder.fit_transform(drug_df['Cholesterol'].tolist())

# Replace the string columns in place with their integer codes.
drug_df['Drug'] = targets
drug_df['Sex'] = genders
drug_df['BP'] = blood_pressures
drug_df['Cholesterol'] = cholesterols

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Base estimator for the grid search. It is seeded like the split below:
# scikit-learn randomly permutes candidate features at each split, so an
# unseeded tree can give different results from run to run.
dtc = DecisionTreeClassifier(random_state=124)

# Select the target by name rather than by position, so the split stays
# correct even if the column order of drug_df changes.
features = drug_df.drop(columns='Drug')
targets = drug_df['Drug']

# Hold out 20% of the rows for the final evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=124
)

# Hyper-parameter grid: 3 depths x 3 split thresholds = 9 candidates.
# The variable name (sic) is kept because the GridSearchCV cell reads `prameters`.
prameters = {'max_depth': [3, 4, 5], 'min_samples_split': [15, 16, 17]}

In [16]:
import pandas as pd

# Exhaustive search over the `prameters` grid with 5-fold cross-validation.
# refit=True retrains the best candidate once the search finishes;
# return_train_score=True adds the per-fold training scores to cv_results_;
# n_jobs=-1 runs the fits in parallel on all available cores.
g_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=prameters,
    cv=5,
    n_jobs=-1,
    refit=True,
    return_train_score=True,
)

In [17]:
# Run the grid search on the training split only; X_test / y_test are not used here.
g_dtc.fit(X_train, y_train)

In [19]:
# Tabulate the search results, one row per parameter combination:
# fit/score timings, per-fold and mean test/train scores, and the rank by mean test score.
pd.DataFrame(g_dtc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.014759,0.006031,0.003195,0.0009842451,3,15,"{'max_depth': 3, 'min_samples_split': 15}",0.8,0.9,0.9,...,0.873333,0.04899,7,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
1,0.010772,0.001716,0.002793,0.0007460858,3,16,"{'max_depth': 3, 'min_samples_split': 16}",0.8,0.9,0.9,...,0.873333,0.04899,7,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
2,0.012164,0.007002,0.004389,0.002793074,3,17,"{'max_depth': 3, 'min_samples_split': 17}",0.8,0.9,0.9,...,0.873333,0.04899,7,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
3,0.00818,0.005967,0.00339,0.001018211,4,15,"{'max_depth': 4, 'min_samples_split': 15}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.00399,0.000631,0.001995,6.289914e-07,4,16,"{'max_depth': 4, 'min_samples_split': 16}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.006184,0.001596,0.003594,0.002241267,4,17,"{'max_depth': 4, 'min_samples_split': 17}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.004188,0.000399,0.002195,0.0004000192,5,15,"{'max_depth': 5, 'min_samples_split': 15}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,0.011169,0.008773,0.002393,0.0004879365,5,16,"{'max_depth': 5, 'min_samples_split': 16}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.003987,0.00089,0.001795,0.0003992084,5,17,"{'max_depth': 5, 'min_samples_split': 17}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [21]:
# Rebind `dtc` to the winning model; it is available because the search was built with refit=True.
dtc = g_dtc.best_estimator_

In [22]:
# Score the tuned tree on the held-out split, which the grid search never saw.
prediction = dtc.predict(X=X_test)
# Fraction of test rows whose predicted drug matches the true one (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=prediction)

0.9736842105263158