In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# 실습

[UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)) 를 이용한 종양의 양성, 악성 여부 check  

[cell_samples.csv](https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/cell_samples.csv)

1. Sample code number: id number 
2. Clump Thickness: 1 - 10 
3. Uniformity of Cell Size: 1 - 10 
4. Uniformity of Cell Shape: 1 - 10 
5. Marginal Adhesion: 1 - 10 
6. Single Epithelial Cell Size: 1 - 10 
7. Bare Nuclei: 1 - 10 
8. Bland Chromatin: 1 - 10 
9. Normal Nucleoli: 1 - 10 
10. Mitoses: 1 - 10 
11. Class: (2 for benign, 4 for malignant)

In [4]:
# Relative location of the cell-sample CSV file.
CELL_SAMPLES_CSV = "datasets/cell_samples.csv"

# Load the raw samples into a DataFrame.
cell_df = pd.read_csv(CELL_SAMPLES_CSV)

In [5]:
# List the dtype of every column to spot any that were not parsed as numbers.
cell_df.dtypes

ID              int64
Clump           int64
UnifSize        int64
UnifShape       int64
MargAdh         int64
SingEpiSize     int64
BareNuc        object
BlandChrom      int64
NormNucl        int64
Mit             int64
Class           int64
dtype: object

### ID column 은 종양의 양성, 악성 판별과 무관한 것이 확실하므로 feature 에서 제외

### BareNuc 역시 numerical data 가 아니므로 drop 

In [6]:
# Leave out the sample identifier and the non-numeric BareNuc column.
# NOTE(review): BareNuc is discarded only because its dtype is object; if its
# values can be coerced to numbers it may be worth keeping as a feature — confirm.
excluded_columns = ['ID', 'BareNuc']
feature_df = cell_df.drop(columns=excluded_columns)

In [7]:
# Preview the first rows of the remaining columns (the last column, Class, is the label).
feature_df.head()

Unnamed: 0,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BlandChrom,NormNucl,Mit,Class
0,5,1,1,1,2,3,1,1,2
1,5,4,4,5,7,3,2,1,2
2,3,1,1,1,2,3,1,1,2
3,6,8,8,1,3,3,7,1,2
4,4,1,1,3,2,3,1,1,2


### dataframe 을 numpy array 로 변환

In [8]:
# Every column except the last is a feature; cast the features to float32.
X = feature_df.iloc[:, :-1].to_numpy(dtype=np.float32)
# The last column (Class) holds the target labels.
y = feature_df.iloc[:, -1].to_numpy()

In [9]:
# Show the first five feature rows to confirm the float32 conversion.
X[:5]

array([[5., 1., 1., 1., 2., 3., 1., 1.],
       [5., 4., 4., 5., 7., 3., 2., 1.],
       [3., 1., 1., 1., 2., 3., 1., 1.],
       [6., 8., 8., 1., 3., 3., 7., 1.],
       [4., 1., 1., 3., 2., 3., 1., 1.]], dtype=float32)

In [10]:
# Show the first five target labels (2 = benign, 4 = malignant).
y[:5]

array([2, 2, 2, 2, 2], dtype=int64)

### Train/Test dataset 분리

In [11]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

# Report the shapes of both partitions.
for label, features, target in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(label, features.shape, target.shape)

Train set: (559, 8) (559,)
Test set: (140, 8) (140,)


### model 생성

In [12]:
# NOTE(review): SVC is already imported at the top of the notebook; this import is
# kept so that the `svm` name remains bound for any cell that relies on it.
from sklearn import svm

# Support vector classifier with an RBF kernel; every other hyperparameter is left
# at its default. fit() returns the estimator itself, so the call can be chained.
clf = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Display the fitted estimator and its parameters.
clf

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### 예측

In [13]:
# Predict the class of every held-out sample.
y_pred = clf.predict(X_test)

# Peek at the first five predictions.
y_pred[:5]

array([4, 2, 4, 4, 2], dtype=int64)

In [14]:
# Class labels known to the fitted classifier (2 = benign, 4 = malignant).
clf.classes_

array([2, 4], dtype=int64)

### 평가
- accuracy, precision, recall  

- confusion matrix

In [15]:
# roc_curve and roc_auc_score are not used in this cell; they stay imported so that
# any cell relying on them keeps working.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)

# Class codes from the data dictionary: 2 = benign, 4 = malignant.
# Malignant is treated as the positive class for precision / recall / F1.
BENIGN, MALIGNANT = 2, 4

print("Accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))
print("Precision: {:.2f}".format(precision_score(y_test, y_pred, pos_label=MALIGNANT)))
print("Recall: {:.2f}".format(recall_score(y_test, y_pred, pos_label=MALIGNANT)))
print("f1 score\n", f1_score(y_test, y_pred, pos_label=MALIGNANT))
print()
# labels=[MALIGNANT, BENIGN] puts the malignant class in the first row and column;
# rows are the true classes and columns are the predicted classes.
print("confusion matrix\n", confusion_matrix(y_test, y_pred, labels=[MALIGNANT, BENIGN]))

Accuracy: 0.96
Precision: 0.96
Recall: 0.92
f1 score
 0.9411764705882353

confution matrix
 [[48  4]
 [ 2 86]]
