## 폐암환자 생존여부 머신러닝(SVM)으로 예측하기  - 4주차 온라인 4강 소스코드 참조

### 1. 사용 패키지와 모듈 임포트

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split #학습데이터와 테스트데이터 분할
from sklearn import svm   #SVM 모델
from sklearn import metrics  #정확도 비교

### 2. 데이터 가져오기

In [2]:
df = pd.read_csv('./thoracic_surgery.csv')
#일부 데이터만 가져오기
#data_df = df[0:행개수] 
data_df = df
data_df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,A17,Class
0,293,1,3.8,2.8,0,0,0,0,0,0,12,0,0,0,1,0,62,0
1,1,2,2.88,2.16,1,0,0,0,1,1,14,0,0,0,1,0,60,0
2,8,2,3.19,2.5,1,0,0,0,1,0,11,0,0,1,1,0,66,1
3,14,2,3.98,3.06,2,0,0,0,1,1,14,0,0,0,1,0,80,1
4,17,2,2.21,1.88,0,0,1,0,0,0,12,0,0,0,1,0,56,0


In [3]:
#각 행은 환자 한명 한명의 기록
#A1~A17 : 종양유형, 폐활량, 호흡곤란여부, 고통정도, 기침, 흡연, 천식여부 등의 수술 전 환자 상태
#         1은 해당사항 있음, 0은 해당 사항없음
#Class : 수술 후 생존 결과 (생존은 1, 사망은 0)

In [4]:
#데이터의 행,열 개수 확인
data_df.shape  

(470, 18)

### 3. 데이터셋 나누기

In [5]:
#일반적으로 80%/20%, 70%/30% 비율로 분할 (data_df, test_size= 0.2)
train, test = train_test_split(data_df, test_size=0.2,random_state=42)

In [6]:
print("train data", train.shape)
print("test data",test.shape)

train data (376, 18)
test data (94, 18)


### 4. 학습에 사용할 변수(특징, Feature) 선택하기

In [7]:
data_df.info()    #데이터 타입, 각 아이템 개수, 누락데이터(NaN(Not a Number or Not Available) 등 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      470 non-null    int64  
 1   A2      470 non-null    int64  
 2   A3      470 non-null    float64
 3   A4      470 non-null    float64
 4   A5      470 non-null    int64  
 5   A6      470 non-null    int64  
 6   A7      470 non-null    int64  
 7   A8      470 non-null    int64  
 8   A9      470 non-null    int64  
 9   A10     470 non-null    int64  
 10  A11     470 non-null    int64  
 11  A12     470 non-null    int64  
 12  A13     470 non-null    int64  
 13  A14     470 non-null    int64  
 14  A15     470 non-null    int64  
 15  A16     470 non-null    int64  
 16  A17     470 non-null    int64  
 17  Class   470 non-null    int64  
dtypes: float64(2), int64(16)
memory usage: 66.2 KB


In [8]:
data_df.corr()   # 상관계수 보기

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,A17,Class
A1,1.0,-0.007878,-0.034985,0.029563,-0.021449,0.052724,0.027238,-0.061885,0.008845,0.027271,0.047079,-0.009229,-0.016382,-0.015761,-0.038681,-0.0424,-0.005826,-0.074924
A2,-0.007878,1.0,0.121263,0.05878,-0.0559,0.0479,-0.062958,-0.047142,-0.07585,-0.019562,-0.13505,0.029753,-0.008675,-0.017461,-0.107427,-0.008675,0.076271,0.060444
A3,-0.034985,0.121263,1.0,0.032975,-0.091094,0.019786,-0.095827,0.055829,-0.05277,-0.100242,0.034088,-0.115145,-0.009135,-0.035584,-0.012009,-0.060578,-0.290178,-0.046374
A4,0.029563,0.05878,0.032975,1.0,-0.143155,0.161615,0.102979,0.260073,-0.099914,-0.086103,0.015504,-0.022251,-0.013617,-0.025088,-0.100853,-0.016509,-0.1159,-0.042841
A5,-0.021449,-0.0559,-0.091094,-0.143155,1.0,0.092863,0.123296,0.092863,0.684647,0.418042,0.089751,0.02531,0.026788,0.023166,0.172289,-0.03433,0.214528,0.0932
A6,0.052724,0.0479,0.019786,0.161615,0.092863,1.0,0.256225,0.067529,-0.024115,-0.072455,0.099942,0.022578,-0.017372,-0.034968,-0.077406,-0.017372,0.044789,0.057375
A7,0.027238,-0.062958,-0.095827,0.102979,0.123296,0.256225,1.0,0.134386,0.081772,0.060393,0.05984,-0.001471,-0.026886,0.086156,-0.044942,-0.026886,0.086705,0.065785
A8,-0.061885,-0.047142,0.055829,0.260073,0.092863,0.067529,0.134386,1.0,0.049843,-0.072455,0.075502,-0.042725,-0.017372,0.097572,-0.077406,-0.017372,-0.015331,0.10553
A9,0.008845,-0.07585,-0.05277,-0.099914,0.684647,-0.024115,0.081772,0.049843,1.0,0.202245,0.145345,0.016551,0.044101,0.017815,0.200373,-0.026401,0.149589,0.08886
A10,0.027271,-0.019562,-0.100242,-0.086103,0.418042,-0.072455,0.060393,-0.072455,0.202245,1.0,-0.036044,0.069522,0.058695,0.029726,0.118527,-0.029161,0.208003,0.086467


### 5. 학습용 데이터셋 : 학습데이터와 레이블(정답) 나누기

In [9]:
train_data_df = train[['A8','A1','A12','A15']]
train_data_df 

Unnamed: 0,A8,A1,A12,A15
414,0,124,0,1
305,0,336,0,1
22,1,173,0,1
307,0,339,0,1
46,0,414,0,1
...,...,...,...,...
106,0,73,0,1
270,0,288,0,1
348,0,394,0,1
435,0,350,0,0


In [10]:
train_label_df = train[['Class']]
train_label_df

Unnamed: 0,Class
414,0
305,0
22,0
307,0
46,0
...,...
106,0
270,0
348,0
435,0


In [11]:
train_data = train_data_df.values           # 값부분만 가져오기(2차원 형태)
train_data

array([[  0, 124,   0,   1],
       [  0, 336,   0,   1],
       [  1, 173,   0,   1],
       ...,
       [  0, 394,   0,   1],
       [  0, 350,   0,   0],
       [  0,  69,   0,   1]])

In [12]:
train_label = train_label_df.values.ravel() # 값부분을 1차원으로 펴주기
train_label

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,

### 6. 테스트 데이터셋 : 학습데이터와 레이블(정답) 나누기

In [13]:
test_data_df = test[['A8','A1','A12','A15']]
test_data_df

Unnamed: 0,A8,A1,A12,A15
55,0,4,0,0
73,0,29,0,1
33,0,316,0,1
446,0,456,0,1
425,0,274,1,1
...,...,...,...,...
444,0,425,0,1
137,0,112,1,1
131,0,104,0,0
398,0,467,0,1


In [14]:
test_label_df = test[['Class']]
test_label_df

Unnamed: 0,Class
55,0
73,0
33,0
446,0
425,0
...,...
444,0
137,0
131,0
398,0


In [15]:
# 값부분만 가져오기(2차원 형태)
test_data = test_data_df.values
test_data

array([[  0,   4,   0,   0],
       [  0,  29,   0,   1],
       [  0, 316,   0,   1],
       [  0, 456,   0,   1],
       [  0, 274,   1,   1],
       [  0, 237,   0,   1],
       [  0, 212,   0,   1],
       [  0, 111,   0,   1],
       [  0, 401,   0,   0],
       [  0,  24,   0,   1],
       [  0,  37,   0,   1],
       [  0, 396,   1,   1],
       [  0,  91,   0,   1],
       [  0, 325,   0,   1],
       [  0, 232,   0,   1],
       [  0, 190,   0,   1],
       [  0, 146,   0,   1],
       [  1,  57,   0,   0],
       [  0, 287,   0,   1],
       [  0,  36,   0,   1],
       [  0, 293,   0,   1],
       [  0,  98,   0,   1],
       [  1, 123,   0,   1],
       [  0, 259,   0,   1],
       [  0,  34,   1,   0],
       [  0, 209,   0,   1],
       [  0, 393,   0,   1],
       [  0,  32,   0,   0],
       [  0, 275,   0,   1],
       [  0, 174,   0,   1],
       [  0,  81,   0,   1],
       [  0, 250,   0,   1],
       [  0,  67,   0,   0],
       [  0,  68,   0,   1],
       [  0, 1

In [16]:
# 값부분을 1차원으로 펴주기
test_label = test_label_df.values.ravel()
test_label

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1])

### 7. SVM 모델 학습하기

In [17]:
#svm 객체 생성
clf = svm.SVC(C=1, gamma=0.1)
#svm 모델로 학습데이터로 학습하기 fit()
clf.fit(train_data,train_label)

### 8. 테스트 데이터로 예측하기

In [18]:
#위에서 만든 clf(svm) 모델로 예측하기
pred_svm = clf.predict(test_data)
pred_svm  #svm이 예측한 생존 여부 값

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

### 9. 모델 예측 정확도 확인

In [19]:
test_label #테스트 데이터에서 정답(생존 여부)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1])

In [20]:
ac_score = metrics.accuracy_score(test_label, pred_svm)
print('accuracy : ', ac_score)

accuracy :  0.8297872340425532


In [21]:
comparison = pd.DataFrame({'prediction_svm':pred_svm, 'ground_truth':test_label})
comparison

Unnamed: 0,prediction_svm,ground_truth
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
89,0,0
90,0,0
91,0,0
92,0,0


### * 주의  !!!! : 소스코드 중간 중간에 변경사항이 있다면 
### 맨 위에서부터 차례 차례 Shift + Enter를 모두 해주거나, 
### 상단 Kernel -> Restart & Run All로 전체적으로 다시 한번 실행해야 적용됨
### 고친 부분만 Shift + Enter 한다고 전체 소스코드에 반영되지 않음

## 1. 데이터를 증가시켰을 때 정확도는 얼마나 차이가 나는가?

## A. 정확도 약간 증가한다.
데이터 200개 였을때 : accuracy :  0.825 <br>
데이터 전부 사용했을때 : accuracy :  0.8297

## 2. 학습 피쳐(특징데이터, 변수)를 변경시켜 수행할 경우 정확도는 어떤 차이가 나는가?

## A. [['A1','A2','A3','A4']] 사용했을때 1% 정도 상승했습니다.
[['A8','A1','A12','A15']] 사용했을때: accuracy: 0.8297 <br>
[['A1','A2','A3','A4']] 사용했을때: accuracy: 0.840

## 3. 비용(C)와 감마(gamma) 값을 변경하여 수행해보고 어떤 값이 가장 좋은 정확도를 갖는지 알아보자. 
### 비용이 0.1 , 1, 10 인 경우, 감마 값이 0.1, 1, 10 인 경우의 여러 조합으로 수행해 보자

## A.

## 4. 예측 정확도가 가장 높은 학습 모델의 데이터 수, 피쳐(변수) 및 파라메터의 값은 무엇인가?

## A.