In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

In [2]:
df = pd.read_csv('titanic.csv')
data_df = df[:500] 
data_df.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,2,"Mellinger, Miss. Madeleine Violet",0,13.0,0,1,250644,19.5,,S,1
1,2,"Wells, Miss. Joan",0,4.0,1,1,29103,23.0,,S,1
2,2,"Duran y More, Miss. Florentina",0,30.0,1,0,SC/PARIS 2148,13.8583,,C,1
3,3,"Bradley, Miss. Bridget Delia",0,22.0,0,0,334914,7.725,,Q,1
4,1,"Francatelli, Miss. Laura Mabel",0,30.0,0,0,PC 17485,56.9292,E36,C,1


In [3]:
# 데이터 쪼개기
train, test = train_test_split(data_df, test_size=0.2, random_state=42)
print('train data:',train.shape)
print('test data:',test.shape)

train data: (400, 11)
test data: (100, 11)


In [4]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    500 non-null    int64  
 1   name      500 non-null    object 
 2   sex       500 non-null    int64  
 3   age       500 non-null    float64
 4   sibsp     500 non-null    int64  
 5   parch     500 non-null    int64  
 6   ticket    500 non-null    object 
 7   fare      500 non-null    float64
 8   cabin     136 non-null    object 
 9   embarked  499 non-null    object 
 10  survived  500 non-null    int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 43.1+ KB


In [5]:
data_df.corr(numeric_only=True)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,survived
pclass,1.0,0.14046,-0.442098,0.013955,0.002495,-0.570232,-0.303391
sex,0.14046,1.0,0.040042,-0.067684,-0.176914,-0.197603,-0.533145
age,-0.442098,0.040042,1.0,-0.179723,-0.107105,0.199014,0.022189
sibsp,0.013955,-0.067684,-0.179723,1.0,0.389178,0.165873,-0.009784
parch,0.002495,-0.176914,-0.107105,0.389178,1.0,0.269492,0.095833
fare,-0.570232,-0.197603,0.199014,0.165873,0.269492,1.0,0.220088
survived,-0.303391,-0.533145,0.022189,-0.009784,0.095833,0.220088,1.0


In [6]:
train_data_df = train[['pclass','sex','age','parch','fare']]
train_data_df.head()

Unnamed: 0,pclass,sex,age,parch,fare
249,3,1,38.0,0,8.6625
433,3,1,32.0,0,7.775
19,3,1,4.0,1,29.125
322,2,1,19.0,1,36.75
332,3,1,16.0,1,20.25


In [7]:
train_label_df = train[['survived']]
train_label_df.head()

Unnamed: 0,survived
249,0
433,1
19,0
322,0
332,0


In [8]:
train_data = train_data_df.values  
train_data

array([[ 3.    ,  1.    , 38.    ,  0.    ,  8.6625],
       [ 3.    ,  1.    , 32.    ,  0.    ,  7.775 ],
       [ 3.    ,  1.    ,  4.    ,  1.    , 29.125 ],
       ...,
       [ 3.    ,  1.    , 18.    ,  0.    ,  7.75  ],
       [ 2.    ,  0.    , 31.    ,  0.    , 21.    ],
       [ 2.    ,  1.    ,  1.    ,  2.    , 37.0042]])

In [9]:
train_label = train_label_df.values.ravel()
train_label

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [10]:
test_data_df = test[['pclass','sex','age','parch','fare']]
test_label_df = test[['survived']]
test_data = test_data_df.values
test_label = test_label_df.values.ravel()

In [11]:
clf = svm.SVC(C=1, gamma= 0.1)
clf.fit(train_data, train_label)

In [12]:
pred_svm = clf.predict(test_data)
pred_svm

array([0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1])

In [13]:
ac_score= metrics.accuracy_score(test_label, pred_svm)
print('정확도:', ac_score)

정확도: 0.68


In [16]:
comparasion_df = pd.DataFrame({'prediction_df': pred_svm , 'ground_truth': test_label})
comparasion_df

Unnamed: 0,prediction_df,ground_truth
0,0,0
1,0,0
2,1,0
3,0,0
4,0,0
...,...,...
95,0,0
96,0,1
97,0,0
98,1,1


# 데이터 500 개 사용: 68% 의 정확도