In [1]:
import pandas as pd
import seaborn as sns

In [2]:
# Load the Titanic sample dataset that ships with seaborn.
df = sns.load_dataset('titanic')
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it must not be wrapped in print() (that would emit an extra "None" line).
df.info()
# Last expression of the cell: rich display of the first five rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB
None


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Drop the deck column (mostly NaN) and the embark_town column, which carries
# the same meaning as the embarked column.
ndf = df.drop(columns=['deck', 'embark_town'])
ndf.info()
ndf.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
survived      891 non-null int64
pclass        891 non-null int64
sex           891 non-null object
age           714 non-null float64
sibsp         891 non-null int64
parch         891 non-null int64
fare          891 non-null float64
embarked      889 non-null object
class         891 non-null category
who           891 non-null object
adult_male    891 non-null bool
alive         891 non-null object
alone         891 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 72.4+ KB


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,no,True


In [4]:
# Remove every row whose age value is NaN (row-wise drop, restricted to the
# age column; how='any' and axis=0 are the dropna defaults).
ndf = ndf.dropna(subset=['age'])
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 13 columns):
survived      714 non-null int64
pclass        714 non-null int64
sex           714 non-null object
age           714 non-null float64
sibsp         714 non-null int64
parch         714 non-null int64
fare          714 non-null float64
embarked      712 non-null object
class         714 non-null category
who           714 non-null object
adult_male    714 non-null bool
alive         714 non-null object
alone         714 non-null bool
dtypes: bool(2), category(1), float64(2), int64(4), object(4)
memory usage: 63.6+ KB


In [5]:
# Find the embarked value that occurs most often (NaN excluded); it is the
# replacement value for the missing embarked entries.
most_freq = (
    ndf['embarked']
    .value_counts(dropna=True)
    .idxmax()
)
print(most_freq)

S


In [6]:
# Fill the missing embarked values with the most frequent value.
# The result is assigned back explicitly instead of calling
# fillna(..., inplace=True) on the selected column: the chained in-place form
# operates on an intermediate object, is deprecated, and leaves ndf unchanged
# when pandas Copy-on-Write is active.
ndf = ndf.assign(embarked=ndf['embarked'].fillna(most_freq))
print(ndf.describe(include='all'))

          survived      pclass   sex         age       sibsp       parch  \
count   714.000000  714.000000   714  714.000000  714.000000  714.000000   
unique         NaN         NaN     2         NaN         NaN         NaN   
top            NaN         NaN  male         NaN         NaN         NaN   
freq           NaN         NaN   453         NaN         NaN         NaN   
mean      0.406162    2.236695   NaN   29.699118    0.512605    0.431373   
std       0.491460    0.838250   NaN   14.526497    0.929783    0.853289   
min       0.000000    1.000000   NaN    0.420000    0.000000    0.000000   
25%       0.000000    1.000000   NaN   20.125000    0.000000    0.000000   
50%       0.000000    2.000000   NaN   28.000000    0.000000    0.000000   
75%       1.000000    3.000000   NaN   38.000000    1.000000    1.000000   
max       1.000000    3.000000   NaN   80.000000    5.000000    6.000000   

              fare embarked  class  who adult_male alive alone  
count   714.000000    

In [7]:
# Choose the variables for the classification analysis:
#   target   -> survived
#   features -> pclass, sex, age, sibsp, parch, embarked
# Note: X and Y are assigned again further down, once sex and embarked have
# been one-hot encoded.
Y = ndf['survived']
X = ndf[[
    'pclass',
    'sex',
    'age',
    'sibsp',
    'parch',
    'embarked',
]]

In [8]:
# Convert categorical data into numeric columns the model can consume:
# one-hot encoding of the sex column.
onehot_sex = pd.get_dummies(ndf['sex'])
# Drop any dummy columns left over from a previous execution of this cell
# before concatenating, so re-running the cell does not append duplicate
# columns (errors='ignore' makes the drop a no-op on the first run).
ndf = pd.concat(
    [ndf.drop(columns=onehot_sex.columns, errors='ignore'), onehot_sex],
    axis=1,
)

In [9]:
# One-hot encode the embarked column; the dummy columns get a "town" prefix.
onehot_embarked = pd.get_dummies(ndf['embarked'], prefix='town')
# Drop any dummy columns left over from a previous execution of this cell
# before concatenating, so re-running the cell does not append duplicate
# columns (errors='ignore' makes the drop a no-op on the first run).
ndf = pd.concat(
    [ndf.drop(columns=onehot_embarked.columns, errors='ignore'), onehot_embarked],
    axis=1,
)

In [10]:
# Print the column summary of ndf after the one-hot columns were appended.
ndf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 0 to 890
Data columns (total 18 columns):
survived      714 non-null int64
pclass        714 non-null int64
sex           714 non-null object
age           714 non-null float64
sibsp         714 non-null int64
parch         714 non-null int64
fare          714 non-null float64
embarked      714 non-null object
class         714 non-null category
who           714 non-null object
adult_male    714 non-null bool
alive         714 non-null object
alone         714 non-null bool
female        714 non-null uint8
male          714 non-null uint8
town_C        714 non-null uint8
town_Q        714 non-null uint8
town_S        714 non-null uint8
dtypes: bool(2), category(1), float64(2), int64(4), object(4), uint8(5)
memory usage: 67.0+ KB


In [11]:
# Target vector: the survived column.
Y = ndf['survived']
# Feature matrix: numeric columns plus the one-hot encoded sex and embarked
# columns.
X = ndf[[
    'pclass',
    'female', 'male',
    'age',
    'sibsp',
    'parch',
    'town_C', 'town_Q', 'town_S',
]]

In [12]:
from sklearn import preprocessing

# Standardize the explanatory variables (mean 0, standard deviation 1) before
# fitting the classifier. The result replaces X with the transformed array.
# NOTE(review): the scaler is fit on the full data set, before the train/test
# split performed later in the notebook - confirm this is intended.
X = preprocessing.StandardScaler().fit_transform(X)

In [13]:
from sklearn.model_selection import train_test_split

# Split the data 7:3 into training and test sets; random_state is fixed so
# the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=10
)
# Show the shape of each feature matrix, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(499, 9)
(215, 9)


In [14]:
from sklearn import svm

# Build a support vector classifier with an RBF kernel and train it on the
# training split. The fit call is the last expression so the fitted
# estimator is displayed as the cell output.
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(X=X_train, y=Y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
# Predict labels for the test split, then print the first ten predictions
# followed by the first ten true labels for a quick visual comparison.
y_predict = svm_model.predict(X_test)
print(y_predict[:10])
print(Y_test.values[:10])

[0 0 1 0 0 0 1 0 0 0]
[0 0 1 0 0 1 1 1 0 0]


In [16]:
from sklearn import metrics

# Confusion matrix of true test labels against predicted labels.
svm_matrix = metrics.confusion_matrix(y_true=Y_test, y_pred=y_predict)
print(svm_matrix)

[[120   5]
 [ 35  55]]


In [17]:
# Text report with per-class precision, recall, f1-score and support.
svm_report = metrics.classification_report(y_true=Y_test, y_pred=y_predict)
print(svm_report)

              precision    recall  f1-score   support

           0       0.77      0.96      0.86       125
           1       0.92      0.61      0.73        90

    accuracy                           0.81       215
   macro avg       0.85      0.79      0.80       215
weighted avg       0.83      0.81      0.81       215



- SVM은 데이터의 특성 개수가 적어도 복잡한 결정 경계를 만들 수 있는 분류 방식입니다.
- 데이터 전처리(더미 변수 생성, 정규화)가 필요하며, 매개변수(hyperparameter) 설정이 분류 분석의 성능에 큰 영향을 줍니다.