# 서포트 벡터 머신으로 타이타닉 분석하기

In [2]:
import pandas as pd
import seaborn as sns

## 1. 데이터 준비

In [9]:
# Load seaborn's 'titanic' example dataset into a DataFrame.
df = sns.load_dataset(name='titanic')

# Preview the first five rows.
df.head(5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## 2. 데이터 전처리

In [10]:
# Remove 'deck' and 'embark_town'; neither is used in this analysis.
rdf = df.drop(columns=['deck', 'embark_town'])

# Discard every row whose 'age' is missing.
rdf = rdf.dropna(subset=['age'], how='any', axis=0)

# Most frequent embarkation port among the remaining rows (NaN excluded).
most_freq = rdf['embarked'].value_counts(dropna=True).idxmax()

# Fill missing ports with the most frequent one. Assign the result back
# instead of calling fillna(inplace=True) on the column selection: the
# chained in-place form operates on a temporary object, triggers a
# FutureWarning on pandas 2.x, and does not modify `rdf` at all once
# Copy-on-Write is active.
rdf['embarked'] = rdf['embarked'].fillna(most_freq)

## 3. 분석할 데이터 지정

In [13]:
# Keep the target column ('survived') plus the columns used as model inputs.
ndf = rdf.loc[:, ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'embarked']]

# One-hot encode the two categorical columns. 'sex' yields one column per
# category value; 'embarked' yields columns prefixed with 'town_'.
onehot_sex = pd.get_dummies(ndf['sex'])
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='town')

# Append the indicator columns (sex first, then embarked) and remove the
# original categorical columns they replace.
ndf = pd.concat([ndf, onehot_sex, onehot_embarked], axis=1)
ndf = ndf.drop(columns=['sex', 'embarked'])

## 4. 훈련용 / 테스트용 데이터 구분

In [14]:
# Feature matrix (independent variables).
X = ndf[['pclass', 'age', 'sibsp', 'parch', 'female', 'male', 'town_C', 'town_Q', 'town_S']]

# Target vector (dependent variable): the 'survived' label.
y = ndf['survived']

# Split into 70% training and 30% test data. The split is done BEFORE
# scaling so that no statistic of the test rows influences preprocessing.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

# Standardize the features (zero mean, unit variance per column; this is
# NOT a 0~1 min-max normalization). The scaler is fitted on the training
# rows only and then applied to both splits, which avoids leaking test-set
# information into training. `X` itself is left unscaled.
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print('train_data 개수 : ', X_train.shape)
print('test_data 개수 : ', X_test.shape)

train_data 개수 :  (499, 9)
test_data 개수 :  (215, 9)


## 5. SVM 학습

In [16]:
from sklearn import svm

# Build a support vector classifier with an RBF kernel and train it on the
# training split (fit returns the fitted estimator itself).
svm_model = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Predict labels for the held-out test split.
y_hat = svm_model.predict(X_test)

# Show the first ten predictions next to the first ten true labels.
print(y_hat[:10])
print(y_test.values[:10])

[0 0 1 0 0 0 1 0 0 0]
[0 0 1 0 0 1 1 1 0 0]


## 6. 모델 성능 평가

In [18]:
from sklearn import metrics

# Confusion matrix of the test predictions
# (rows: true class, columns: predicted class).
svm_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_hat)
print(svm_matrix)

[[120   5]
 [ 35  55]]


## 7. 평가 지표 

In [19]:
# Text summary of per-class precision, recall, f1-score and support,
# plus overall accuracy and averaged scores.
svm_report = metrics.classification_report(y_true=y_test, y_pred=y_hat)
print(svm_report)

              precision    recall  f1-score   support

           0       0.77      0.96      0.86       125
           1       0.92      0.61      0.73        90

    accuracy                           0.81       215
   macro avg       0.85      0.79      0.80       215
weighted avg       0.83      0.81      0.81       215



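- precision (정밀도) : 모델이 해당 클래스로 예측한 것 중 실제로 그 클래스인 비율 — TP / (TP + FP)
- recall (재현율) : 실제로 해당 클래스인 것 중 모델이 올바르게 예측한 비율 — TP / (TP + FN)
- f1-score : precision과 recall의 조화평균 — 2 × precision × recall / (precision + recall)
- accuracy (정확도) : 전체 샘플 중 올바르게 예측한 샘플의 비율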