# drug data 연습

The target feature is  
Drug type  
The feature sets are:  
Age  
Sex  
Blood Pressure Levels (BP)  
Cholesterol Levels  
Na to Potassium Ration  

In [29]:
import numpy as np
import pandas as pd

In [30]:
# csv 파일 읽어오기
df = pd.read_csv('drug200.csv')
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [31]:
df.isnull().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [32]:
df['BP'].value_counts()

HIGH      77
LOW       64
NORMAL    59
Name: BP, dtype: int64

In [33]:
df['Cholesterol'].value_counts()

HIGH      103
NORMAL     97
Name: Cholesterol, dtype: int64

In [34]:
x = df.drop('Drug', axis=1)
x

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,F,HIGH,HIGH,25.355
1,47,M,LOW,HIGH,13.093
2,47,M,LOW,HIGH,10.114
3,28,F,NORMAL,HIGH,7.798
4,61,F,LOW,HIGH,18.043
...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567
196,16,M,LOW,HIGH,12.006
197,52,M,NORMAL,HIGH,9.894
198,23,M,NORMAL,NORMAL,14.020


In [35]:
x = pd.get_dummies(x, columns=['Sex', 'BP', 'Cholesterol'])
x

Unnamed: 0,Age,Na_to_K,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL
0,23,25.355,1,0,1,0,0,1,0
1,47,13.093,0,1,0,1,0,1,0
2,47,10.114,0,1,0,1,0,1,0
3,28,7.798,1,0,0,0,1,1,0
4,61,18.043,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...
195,56,11.567,1,0,0,1,0,1,0
196,16,12.006,0,1,0,1,0,1,0
197,52,9.894,0,1,0,0,1,1,0
198,23,14.020,0,1,0,0,1,0,1


In [36]:
y = df['Drug']
y

0      DrugY
1      drugC
2      drugC
3      drugX
4      DrugY
       ...  
195    drugC
196    drugC
197    drugX
198    drugX
199    drugX
Name: Drug, Length: 200, dtype: object

In [37]:
# 표준화
from sklearn.preprocessing import StandardScaler

# StandardScaler객체 생성
scaler = StandardScaler()
# StandardScaler 로 데이터 셋 변환. fit( ) 과 transform( ) 호출.  
scaler.fit(x)
x = scaler.transform(x)

In [38]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
import warnings

warnings.simplefilter(action='ignore')

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 156, stratify=y)

#SVM 
clf =  svm.SVC()

# 학습/예측/평가
clf.fit(X_train , y_train)
pred = clf.predict(X_test)

print('정확도: {0:.4f}'.format(accuracy_score(y_test, pred)))

정확도: 0.8500


In [39]:
clf.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [40]:
from sklearn.model_selection import GridSearchCV
# 하이퍼 파라미터
param_grid = {
    'C' : [1, 2, 3, 4, 5],
    'gamma':['scale'],     
}

grid_search = GridSearchCV(clf, param_grid, cv=5, refit=True)
grid_search.fit(X_train, y_train)

print('GridSearchCV 최적 하이퍼 파라미터 :',grid_search.best_params_)
print('GridSearchCV 최고 정확도: {0:.4f}'.format(grid_search.best_score_))

pred = grid_search.predict(X_test)
acc = accuracy_score(y_test, pred)
print('테스트 세트에서의 정확도 : {0:.4f}'.format(acc))

GridSearchCV 최적 하이퍼 파라미터 : {'C': 5, 'gamma': 'scale'}
GridSearchCV 최고 정확도: 0.9000
테스트 세트에서의 정확도 : 0.9250


In [41]:
# 교차검증
from sklearn.model_selection import cross_val_score

scores = cross_val_score(grid_search, x , y , cv=5)
for iter_count,accuracy in enumerate(scores):
    print("교차 검증 {0} 정확도: {1:.4f}".format(iter_count, accuracy))

print("평균 정확도: {0:.4f}".format(np.mean(scores)))

교차 검증 0 정확도: 0.9250
교차 검증 1 정확도: 0.9750
교차 검증 2 정확도: 0.9750
교차 검증 3 정확도: 0.9250
교차 검증 4 정확도: 1.0000
평균 정확도: 0.9600


In [42]:
pred = grid_search.predict(X_test)
acc = accuracy_score(y_test, pred)
print('acc:', acc)

acc: 0.925
