In [200]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

#### **과제-1: heart attack**

In [201]:
htrain = pd.read_csv('./data/heart attack train.csv', index_col=0)
htest = pd.read_csv('./data/heart attack test.csv', index_col=0)

**EDA**

In [202]:
colname = []
types = []
unique_count = []
unique_value = []
for col in htrain.columns[:-1]:
    colname.append(col)
    types.append(htrain[col].dtypes)
    unique_count.append(len(htrain[col].unique()))
    unique_value.append(htrain[col].unique()[:5])
EDA = pd.DataFrame({'feature': colname, 'feature_types': types, 'unique_count':unique_count, 'unique_value':unique_value})
EDA

Unnamed: 0,feature,feature_types,unique_count,unique_value
0,age,int64,41,"[52, 67, 57, 50, 62]"
1,sex,int64,2,"[1, 0]"
2,cp,int64,4,"[3, 2, 1, 0]"
3,trestbps,int64,47,"[118, 152, 150, 154, 120]"
4,chol,int64,137,"[186, 277, 126, 232, 244]"
5,fbs,int64,2,"[0, 1]"
6,restecg,int64,3,"[0, 1, 2]"
7,thalach,int64,86,"[190, 172, 173, 164, 162]"
8,exang,int64,2,"[0, 1]"
9,oldpeak,float64,36,"[0.0, 0.2, 1.1, 1.6, 3.6]"


**One Hot Encoding for categorical value**

: (참고) Tree모델의 경우, 가지로 나눠지기 때문에 One Hot Encoding의 전과 후의 결과가 같다.

: One Hot Encoding의 경우, train과 test가 동일하게 One Hot Encoding이 진행되어야 하기 때문에 합쳐서 작업을 해주어야 한다.

In [203]:
transformation_columns = dict(zip(EDA.feature, EDA.unique_count))
for col in htrain.columns[:-1]:
    if transformation_columns[col] < 6:
        htrain[col]=htrain[col].apply(lambda x: str(x))
for col in htest.columns[:-1]:
    if transformation_columns[col] < 6:
        htest[col]=htest[col].apply(lambda x: str(x))

In [205]:
total = pd.concat([htrain, htest],axis=0)

In [210]:
total = pd.get_dummies(total)

In [211]:
htrain = total.iloc[htrain.index]
htest = total.iloc[htest.index]

**Train_Test_Split**

In [212]:
X_train = htrain.drop('target',axis=1)
y_train = htrain.target
X_test = htest.drop('target', axis=1)
y_test = htest.target

In [213]:
X_train.columns

Index(['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'sex_0', 'sex_1',
       'cp_0', 'cp_1', 'cp_2', 'cp_3', 'fbs_0', 'fbs_1', 'restecg_0',
       'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'slope_0', 'slope_1',
       'slope_2', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'ca_4', 'thal_0', 'thal_1',
       'thal_2', 'thal_3'],
      dtype='object')

In [214]:
X_test.columns

Index(['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'sex_0', 'sex_1',
       'cp_0', 'cp_1', 'cp_2', 'cp_3', 'fbs_0', 'fbs_1', 'restecg_0',
       'restecg_1', 'restecg_2', 'exang_0', 'exang_1', 'slope_0', 'slope_1',
       'slope_2', 'ca_0', 'ca_1', 'ca_2', 'ca_3', 'ca_4', 'thal_0', 'thal_1',
       'thal_2', 'thal_3'],
      dtype='object')

**Model; Logistic Regression**

In [215]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [216]:
clf = LogisticRegression()
clf = clf.fit(X_train,y_train)

**Model Result**

In [223]:
coefficient = dict(zip(htrain.columns.values, clf.coef_[0]))
coefficient = sorted(coefficient.items(), key=lambda x: abs(x[1]), reverse=True)

In [224]:
coefficient[:5]

[('sex_1', -1.370823799698216),
 ('slope_2', 1.2743399761168215),
 ('cp_1', 0.7872623725343973),
 ('thal_2', -0.7420564443559714),
 ('ca_1', -0.6809209278331985)]

: 해당 coef는 log odds를 나타낸다.

**Test classification rate**

In [225]:
y_pred = clf.predict(X_test)

In [226]:
metrics.confusion_matrix(y_test, y_pred)

array([[25,  5],
       [ 2, 29]], dtype=int64)

In [227]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8852459016393442


#### **과제-2: satisfaction**

In [269]:
strain = pd.read_csv('./data/satisfaction_train.csv', index_col=0)
stest = pd.read_csv('./data/satisfaction_test.csv', index_col=0)

**EDA**

In [270]:
colname = []
types = []
unique_count = []
unique_value = []
for col in strain.columns[:-1]:
    colname.append(col)
    types.append(strain[col].dtypes)
    unique_count.append(len(strain[col].unique()))
    unique_value.append(strain[col].unique()[:5])
EDA = pd.DataFrame({'feature': colname, 'feature_types': types, 'unique_count':unique_count, 'unique_value':unique_value})
EDA

Unnamed: 0,feature,feature_types,unique_count,unique_value
0,Gender,object,2,"[Female, Male]"
1,Customer Type,object,2,"[Loyal Customer, disloyal Customer]"
2,Age,int64,75,"[39, 27, 21, 64, 69]"
3,Type of Travel,object,2,"[Business travel, Personal Travel]"
4,Class,object,3,"[Business, Eco Plus, Eco]"
5,Flight Distance,int64,4854,"[2725, 1634, 1341, 3794, 1237]"
6,Seat comfort,int64,6,"[5, 3, 4, 2, 0]"
7,Departure/Arrival time convenient,int64,6,"[5, 3, 4, 2, 1]"
8,Food and drink,int64,6,"[2, 3, 0, 5, 4]"
9,Gate location,int64,5,"[5, 3, 1, 4, 2]"


**One Hot Encoding for categorical value & Transformation of target**

In [271]:
transformation_columns = dict(zip(EDA.feature, EDA.unique_count))
for col in strain.columns[:-1]:
    if transformation_columns[col] < 7:
        strain[col]=strain[col].apply(lambda x: str(x))
for col in X_test.columns[:-1]:
    if transformation_columns[col] < 7:
        stest[col]=stest[col].apply(lambda x: str(x))

In [272]:
strain[strain.columns[-1]] = [1 if i=='satisfied' else 0 for i in strain[strain.columns[-1]]]
stest[stest.columns[-1]] = [1 if i=='satisfied' else 0 for i in stest[stest.columns[-1]]]

In [273]:
total = pd.concat([strain, stest],axis=0)

In [274]:
total = pd.get_dummies(total)

In [275]:
stest.shape

(12949, 23)

In [280]:
strain = total.iloc[:len(strain)]
stest = total.iloc[len(strain):]

**Train_Test_Split**

In [282]:
X_train = strain.drop('satisfaction',axis=1)
y_train = strain.satisfaction
X_test = stest.drop('satisfaction', axis=1)
y_test = stest.satisfaction

In [283]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

**Model; Logistic Regression**

In [284]:
clf = LogisticRegression()
clf = clf.fit(X_train,y_train)

In [285]:
coefficient = dict(zip(htrain.columns.values, clf.coef_[0]))
coefficient = sorted(coefficient.items(), key=lambda x: x[1], reverse=True)

In [286]:
coefficient[:5]

[('exang_1', 1.394239498453339),
 ('sex_0', 1.0866249479223449),
 ('cp_2', 0.7802299311978403),
 ('fbs_1', 0.7323918765079476),
 ('thal_3', 0.6751208851040738)]

**Test classification Rate**

In [287]:
y_pred = clf.predict(X_test)

In [288]:
metrics.confusion_matrix(y_test, y_pred)

array([[5117,  761],
       [ 762, 6309]], dtype=int64)

In [289]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8823847401343733
