In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import accuracy_score

## 1. Heart Attack Data

**1) 주어진 데이터를 이용하여 heart attack 가능성을 예측하는 모델을 만들 것입니다. target을 예측하는 최적의 모델을 만드세요.**

In [2]:
heart_train = pd.read_csv('heart attack train.csv', index_col = 0)
heart_test = pd.read_csv('heart attack test.csv', index_col = 0)

In [3]:
heart_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
62,52,1,3,118,186,0,0,190,0,0.0,1,0,1,1
127,67,0,2,152,277,0,1,172,0,0.0,2,1,2,1
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3,1
287,57,1,1,154,232,0,0,164,0,0.0,2,1,2,0
108,50,0,1,120,244,0,1,162,0,1.1,2,0,2,1


In [4]:
# train test x, y, split

X_train = heart_train[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 
                       'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 
                       'ca', 'thal']]
y_train = heart_train['target']

X_test = heart_test[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 
                       'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 
                       'ca', 'thal']]
y_test = heart_test[['target']]

In [7]:
# scaling : StandardScaler 평균을 0, 분산을 1

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train

array([[-0.27090572,  0.6636838 ,  1.9766492 , ..., -0.66896473,
        -0.72428597, -2.11701865],
       [ 1.3708101 , -1.50674161,  0.99843017, ...,  0.96628239,
         0.27160724, -0.47497213],
       [ 0.27633288,  0.6636838 ,  0.99843017, ...,  0.96628239,
         0.27160724,  1.16707438],
       ...,
       [-2.78820331,  0.6636838 ,  0.02021114, ...,  0.96628239,
        -0.72428597, -0.47497213],
       [-0.38035344,  0.6636838 , -0.95800789, ...,  0.96628239,
        -0.72428597,  1.16707438],
       [-0.05201028,  0.6636838 ,  0.99843017, ...,  0.96628239,
        -0.72428597,  1.16707438]])

In [8]:
## get train acc using Logistic Regression, LDA and QDA

def get_acc(X, y, n_iter = 1000):
    logit_model = LogisticRegression(random_state = 22, max_iter = n_iter)
    lda_model = LinearDiscriminantAnalysis()
    qda_model = QuadraticDiscriminantAnalysis(store_covariance = True)

    models = [logit_model, lda_model, qda_model]
    model_name = ['Logistic Regression', 'LDA', 'QDA']
    
    fitted_models = []
    for i in range(3):
        model = models[i]
        fitted_model = model.fit(X, y)
        model_pred = fitted_model.predict(X)
        print(model_name[i], "accuracy : ", accuracy_score(y, model_pred))
        
        fitted_models.append(fitted_model)
    
    return fitted_models

In [9]:
fitted_models1 = get_acc(X_train, y_train)

Logistic Regression accuracy :  0.8677685950413223
LDA accuracy :  0.8677685950413223
QDA accuracy :  0.8884297520661157


**2) 1)에서 만든 모델을 이용하여 target과 각각의 변수간의 관계를 설명하세요.**

QDA는 y의 class 사이에 평균과 공분산 구조가 다른 경우를 반영한다는 점에서 LDA와 차이점을 갖는데, heart attack 데이터에서는 LDA보다 QDA의 성능이 좋은 것으로 보아, target 변수가 0이냐 1이냐에 따라 다른 평균과 공분산 구조를 갖는다고 추론할 수 있다.

In [30]:
pd.DataFrame(fitted_models1[2].covariance_[1])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.118539,-0.228991,0.104228,0.272482,0.25662,0.164914,-0.066698,-0.486929,0.038606,0.106246,-0.09973,0.155724,0.091815
1,-0.228991,1.165131,0.051703,0.02051,-0.215953,0.121834,0.064282,0.209406,0.102495,-0.010778,0.031863,0.073255,0.247168
2,0.104228,0.051703,0.838896,0.258703,0.005715,0.199284,-0.11043,0.026044,0.020741,0.172068,-0.096398,0.075216,0.024876
3,0.272482,0.02051,0.258703,0.853598,0.087197,0.131037,-0.122889,0.019997,-0.075636,0.133699,-0.036483,0.027127,-0.005537
4,0.25662,-0.215953,0.005715,0.087197,1.139892,0.02254,-0.256526,0.012972,-0.064295,0.077048,0.015715,0.019137,0.057211
5,0.164914,0.121834,0.199284,0.131037,0.02254,1.010275,-0.044027,-0.046328,-0.017604,-0.047505,0.076322,0.110236,0.109781
6,-0.066698,0.064282,-0.11043,-0.122889,-0.256526,-0.044027,0.936028,0.001109,-0.051607,-0.063828,0.099846,0.094044,0.106605
7,-0.486929,0.209406,0.026044,0.019997,0.012972,-0.046328,0.001109,0.662157,-0.104802,-0.0924,0.159375,-0.073198,0.035406
8,0.038606,0.102495,0.020741,-0.075636,-0.064295,-0.017604,-0.051607,-0.104802,0.488317,0.045198,-0.06422,-0.019436,0.0724
9,0.106246,-0.010778,0.172068,0.133699,0.077048,-0.047505,-0.063828,-0.0924,0.045198,0.530929,-0.332604,-0.035027,0.07935


In [31]:
pd.DataFrame(data = fitted_models1[2].means_,
             columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 
                       'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 
                       'ca', 'thal']).T

Unnamed: 0,0,1
age,0.249984,-0.20148
sex,0.342139,-0.275754
cp,-0.532301,0.429019
trestbps,0.136938,-0.110368
chol,0.121019,-0.097537
fbs,-0.00172,0.001387
restecg,-0.159664,0.128685
thalach,-0.498839,0.402049
exang,0.529516,-0.426774
oldpeak,0.469732,-0.37859


target이 0인지 1인지에 따라 각 변수는 다른 평균 값을 보이는데, scaling을 고려해 클래스와 각 변수간의 관계를 파악할 수 있다. age, sex, trestbps, chol, exang, oldpeak, ca, thal이 각 변수의 평균치보다 높을 때 heart attack이 일어날 가능성이 낮고, 낮을 때 heart attack이 일어날 가능성이 높다.

**3) 1)에서 만든 모델의 test misclassification rate를 구하세요.**

In [36]:
test_pred = fitted_models1[2].predict(X_test)
print(1-accuracy_score(y_test, test_pred))

0.360655737704918


## 2. Satisfaction Data

**1) 주어진 데이터를 이용하여 satisfaction을 예측하는 모델을 만들 것입니다. satisfaction을 예측하는 최적의 모델을 만드세요.**

In [104]:
satisfaction_train = pd.read_csv('satisfaction_train.csv', index_col = 0)
satisfaction_test = pd.read_csv('satisfaction_test.csv', index_col = 0)
satisfaction_train.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
105527,Female,Loyal Customer,39,Business travel,Business,2725,5,5,2,5,...,4,4,4,4,4,4,4,73,65.0,satisfied
82617,Female,Loyal Customer,27,Business travel,Business,1634,3,3,3,3,...,3,3,4,4,2,3,3,39,37.0,dissatisfied
34069,Female,Loyal Customer,21,Personal Travel,Eco Plus,1341,4,4,0,1,...,1,4,1,3,5,4,1,0,0.0,satisfied
18636,Male,Loyal Customer,64,Personal Travel,Eco Plus,3794,2,5,2,4,...,5,3,3,5,1,4,1,570,567.0,dissatisfied
28693,Female,Loyal Customer,69,Personal Travel,Eco,1237,5,5,5,5,...,5,5,5,5,3,5,4,69,72.0,satisfied


In [105]:
## onehot encoding using Label Encoder

from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()

satisfaction_train_encoded = satisfaction_train.apply(encoder.fit_transform)
satisfaction_test_encoded = satisfaction_test.apply(encoder.fit_transform)

In [106]:
## train test split

X_train = satisfaction_train_encoded[['Age', 'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
                                      'Food and drink', 'Gate location', 'Inflight wifi service',
                                      'Inflight entertainment', 'Online support', 'Ease of Online booking',
                                      'On-board service', 'Leg room service', 'Baggage handling',
                                      'Checkin service', 'Cleanliness', 'Online boarding',
                                      'Departure Delay in Minutes', 'Arrival Delay in Minutes']]
y_train = satisfaction_train_encoded['satisfaction']

X_test = satisfaction_test_encoded[['Age', 'Flight Distance', 'Seat comfort', 'Departure/Arrival time convenient',
                                      'Food and drink', 'Gate location', 'Inflight wifi service',
                                      'Inflight entertainment', 'Online support', 'Ease of Online booking',
                                      'On-board service', 'Leg room service', 'Baggage handling',
                                      'Checkin service', 'Cleanliness', 'Online boarding',
                                      'Departure Delay in Minutes', 'Arrival Delay in Minutes']]
y_test = satisfaction_test_encoded['satisfaction']

In [107]:
fitted_models2 = get_acc(X_train, y_train, 10000)

Logistic Regression accuracy :  0.807506661003205
LDA accuracy :  0.8072170521682048
QDA accuracy :  0.8151137197358768


**2) 1)에서 만든 모델의 test misclassification rate를 구하세요.**

In [108]:
test_pred = fitted_models2[2].predict(X_test)
print(1-accuracy_score(y_test, test_pred))

0.18541972353077463
