In [1]:
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Cells further down rely on this to show two results at once
# (e.g. the training and the test accuracy).
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the data set to be analysed
df = pd.read_csv(filepath_or_buffer='Titanic.csv')

In [3]:
# Predictor matrix: every column except the last one
X = df.iloc[:, :df.shape[1] - 1]

In [4]:
# Response variable: the last column of the data set
y = df.iloc[:, df.shape[1] - 1]

In [5]:
# Continuous predictors.
# Selected by column name rather than by position ([2, 3, 4, 5]) so that a
# change in the CSV column order raises a KeyError instead of silently picking
# the wrong variables. The names are the ones shown for these columns in the
# assembled design matrix displayed further down.
X_num = df[['Age', 'SibSp', 'Parch', 'Fare']]

In [6]:
# Categorical (discrete) predictors.
# Selected by column name rather than by position ([0, 1, 6]) so that a change
# in the CSV column order raises a KeyError instead of silently picking the
# wrong variables. The names are the prefixes of the dummy columns shown in the
# assembled design matrix displayed further down.
X_cat = df[['Pclass', 'Sex', 'Embarked']]

In [7]:
# tmp_df  : the categorical predictors, held aside until they are dummy-encoded
# final_X : starts as the continuous predictors, which need no dummy encoding
tmp_df, final_X = X_cat, X_num

In [8]:
# Dummy-encode all categorical predictors and append them to the continuous ones.
#
# final_X is rebuilt from X_num here instead of being extended in place, so the
# cell can be re-run safely: joining onto an already-extended final_X would fail
# because the dummy column names would overlap.
#
# - columns=... forces encoding of every column in X_cat, including numeric
#   codes such as the passenger class.
# - drop_first=True drops the first level of each variable (reference category).
# - dtype=int keeps the indicators as 0/1 integers regardless of the pandas
#   version (the default dtype differs between pandas releases).
final_X = X_num.join(
    pd.get_dummies(X_cat, columns=list(X_cat.columns), drop_first=True, dtype=int)
)

In [9]:
# Inspect the assembled design matrix, then its (rows, columns) dimensions
final_X
final_X.shape

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,22.0,1,0,7.2500,0,1,1,0,1
1,38.0,1,0,71.2833,0,0,0,0,0
2,26.0,0,0,7.9250,0,1,0,0,1
3,35.0,1,0,53.1000,0,0,0,0,1
4,35.0,0,0,8.0500,0,1,1,0,1
5,30.0,0,0,8.4583,0,1,1,1,0
6,54.0,0,0,51.8625,0,0,1,0,1
7,2.0,3,1,21.0750,0,1,1,0,1
8,27.0,0,2,11.1333,0,1,0,0,1
9,14.0,1,0,30.0708,1,0,0,0,0


(889, 9)

In [36]:
# Import the discriminant analysis classes (LDA and QDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [16]:
# Import the k-fold cross-validation splitter
from sklearn.model_selection import KFold

In [17]:
# 5-fold cross-validation splitter. Rows are shuffled before being divided into
# folds; random_state is fixed so that the same folds (and therefore the same
# scores below) are produced on every run of the notebook.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [58]:
# Instantiate the two classifiers that are compared below.
# QDA is asked to keep the covariance matrices it estimates (store_covariance=True).
qda = QuadraticDiscriminantAnalysis(store_covariance=True)
lda = LinearDiscriminantAnalysis()

In [38]:
# Materialise all folds and keep the third one (index 2):
# its training-row indices and its test-row indices
sp_tr_idx, sp_te_idx = tuple(cv.split(final_X))[2]

In [39]:
# Split predictors and response into the training and test parts of the fold.
# The indices produced by the splitter are row positions, so both the DataFrame
# and the Series are indexed with .iloc. Plain y[idx] would look the values up
# by index label, which only matches positions for a default 0..n-1 index.
sp_X_train = final_X.iloc[sp_tr_idx]  # predictors, training rows
sp_X_test = final_X.iloc[sp_te_idx]   # predictors, test rows
sp_y_train = y.iloc[sp_tr_idx]        # response, training rows
sp_y_test = y.iloc[sp_te_idx]         # response, test rows

In [40]:
# Import the standardisation transformer
from sklearn.preprocessing import StandardScaler
# Create the scaler and fit it on the continuous columns of the training fold
# only. Those are the leading X_num.shape[1] columns of the design matrix,
# because the dummy columns were appended after them.
scaler = StandardScaler().fit(sp_X_train.iloc[:, :X_num.shape[1]])

In [41]:
# Training fold: standardise the continuous columns, then put the untouched
# dummy columns back next to them
zs_X_train = scaler.transform(sp_X_train.iloc[:, :X_num.shape[1]])
zs_X_train = np.hstack((zs_X_train, sp_X_train.iloc[:, X_num.shape[1]:].values))
# Test fold: same treatment, re-using the statistics learned on the training
# fold (the scaler is not re-fitted)
zs_X_test = scaler.transform(sp_X_test.iloc[:, :X_num.shape[1]])
zs_X_test = np.hstack((zs_X_test, sp_X_test.iloc[:, X_num.shape[1]:].values))

In [42]:
# Statistics the scaler learned from the training fold:
# per-column mean, and standard deviation (square root of the variance)
scaler.mean_
np.sqrt(scaler.var_)

array([29.51113924,  0.51476793,  0.3628692 , 31.67304782])

array([12.89742727,  1.10972871,  0.75415375, 49.8103779 ])

In [45]:
# Fit both classifiers on the standardised training fold.
# Each call is a separate top-level expression, so each fitted estimator's
# repr is displayed.
lda.fit(zs_X_train, sp_y_train) # train LDA
qda.fit(zs_X_train, sp_y_train) # train QDA

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
                              store_covariance=False, tol=0.0001)

In [28]:
# LDA accuracy on the fold it was trained on versus the held-out fold
lda.score(zs_X_train, sp_y_train) # accuracy on the training set
lda.score(zs_X_test, sp_y_test) # accuracy on the test set

0.810126582278481

0.7471910112359551

In [46]:
# QDA accuracy on the fold it was trained on versus the held-out fold
qda.score(zs_X_train, sp_y_train) # accuracy on the training set
qda.score(zs_X_test, sp_y_test) # accuracy on the test set

0.829817158931083

0.7359550561797753

In [32]:
# Inspect the coefficients of the LDA model learned from the training set
# (one value per column of the design matrix)
lda.coef_

array([[-0.51322086, -0.31384035, -0.20355077,  0.21516597, -0.5724323 ,
        -2.02278802, -3.83772127,  0.09249747, -0.54891729]])

In [50]:
# Class-wise feature means estimated by QDA on the training fold
# (one row per class, one column per feature)
qda.means_

array([[ 0.06522125,  0.03279618, -0.07719651, -0.22037193,  0.17906977,
         0.68837209,  0.86046512,  0.08139535,  0.78139535],
       [-0.09980476, -0.05018632,  0.11812989,  0.33722395,  0.2633452 ,
         0.33807829,  0.31316726,  0.09252669,  0.62989324]])

In [33]:
# Response values predicted by the LDA model for the training fold
lda.predict(zs_X_train)

array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,

In [55]:
# LDA class-membership probabilities for the training fold (one column per class)
lda.predict_proba(zs_X_train) # probability of belonging to each class

array([[0.95145664, 0.04854336],
       [0.03630497, 0.96369503],
       [0.31698398, 0.68301602],
       ...,
       [0.47790341, 0.52209659],
       [0.41978495, 0.58021505],
       [0.91498207, 0.08501793]])

In [53]:
# Response values predicted by the QDA model for the training fold
qda.predict(zs_X_train)

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,

In [57]:
# QDA class-membership probabilities for the training fold (one column per class)
qda.predict_proba(zs_X_train) # probability of belonging to each class

array([[0.98125037, 0.01874963],
       [0.00170634, 0.99829366],
       [0.5071604 , 0.4928396 ],
       ...,
       [0.91801569, 0.08198431],
       [0.13941617, 0.86058383],
       [0.95085517, 0.04914483]])