In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
# plt.rc('font', family='NanumGothicOTF') # For MacOS
plt.rc('font', family='NanumGothic') # For Windows
mpl.rcParams['axes.unicode_minus'] = False
pd.set_option('display.max_columns', None)

In [3]:
df1 = pd.read_csv('./NHIS_OPEN_GJ_2017_100.csv',encoding='EUC-KR')
df2 = pd.read_csv('./NHIS_OPEN_GJ_2018_100.csv')

frames = [df1, df2]
df = pd.concat(frames)
df.to_csv('./data_1718.csv')

In [4]:
df = df.rename(columns={"연령대코드(5세단위)":"연령대 코드", 
                        "신장(5Cm단위)":"신장", 
                        "체중(5Kg단위)":"체중",
                        "식전혈당(공복혈당)":"식전혈당",
                        "(혈청지오티)AST":"AST",
                        "(혈청지오티)ALT":"ALT",})

In [5]:
df = df.drop(['기준년도', '가입자일련번호','시도코드', '시력(좌)', '시력(우)', '청력(좌)', '청력(우)','구강검진수검여부',
       '치아우식증유무', '결손치유무', '치아마모증유무', '제3대구치(사랑니)이상', '치석', '데이터공개일자'], axis = 1)

In [6]:
df.isnull().sum()


성별코드             0
연령대 코드           0
신장               0
체중               0
허리둘레           680
수축기혈압         5749
이완기혈압         5748
식전혈당          5957
총콜레스테롤      667287
트리글리세라이드    667297
HDL콜레스테롤    667298
LDL콜레스테롤    677034
혈색소           5966
요단백          15162
혈청크레아티닌       5949
AST           5947
ALT           5948
감마지티피         5951
흡연상태           378
음주여부        355479
dtype: int64

In [7]:
df.drop(df[df['AST'] == 9999].index, inplace=True)
df.drop(df[df['감마지티피'] == 999].index, inplace=True)
df.loc[df['음주여부'].isnull(), '음주여부'] = 0

In [8]:
#식전혈당이 없을 시 드랍
df = df.loc[df['식전혈당'].notnull()]

#당뇨병: 공복 혈당치 126 mg/dL 이상 당뇨병의 전단계: 공복 혈당치 100 mg/dL 이상
#저혈당: 혈당치 126 mg/dL 미만
df.loc[df['식전혈당'] < 100, '식전혈당'] = 0
df.loc[(df['식전혈당'] >= 100) & (df['식전혈당'] < 126), '식전혈당'] = 1
df.loc[df['식전혈당'] >= 126, '식전혈당'] = 2

In [9]:
#비정상적인 값 배제
df = df.loc[df['허리둘레']<140]
df = df.loc[df['허리둘레']>40]

#국내의 복부비만 기준은 남자는 90cm 이상, 여자의 경우 85cm 이상
df_obesity = df[['허리둘레','성별코드']].copy()

df_obesity.loc[(df_obesity['허리둘레']>= 90) & (df_obesity['성별코드']==1), '복부비만'] = 1
df_obesity.loc[(df_obesity['허리둘레']>= 85) & (df_obesity['성별코드']==2), '복부비만'] = 1
df_obesity['복부비만'] = df_obesity['복부비만'].fillna(0)

df['복부비만'] = df_obesity['복부비만']

In [10]:
df_weight = df[['신장', '체중']].copy()

#체중(kg)을 신장(m)의 제곱으로 나눈 값(체중(kg)/신장(m2))
temp = df_weight['신장']/100
df_bmi = round(df_weight['체중'] / (temp * temp), 1)

#세계보건기구 아시아태평양지역과 대한비만학회
#BMI < 25 kg/m2 : 저체중+정상 0
#BMI ≥ 25 kg/m2 : 과체중 1
#BMI ≥ 30 kg/m2 : 비만 2
df_bmi.loc[df_bmi < 25] = 0
df_bmi.loc[(df_bmi >= 25) & (df_bmi < 30)] = 1
df_bmi.loc[df_bmi >= 30] = 2

df['비만여부'] = df_bmi

df = df.drop(['체중', '신장'], axis = 1)

In [11]:
df_pressure = df[['이완기혈압', '수축기혈압']].copy()

# 정상: 0 / 120~139/80~89 전단계: 1/  140~159/90~99 1기:2 / 160/100 2기:3
df_pressure.loc[(df_pressure['수축기혈압'] < 120) & (df_pressure['이완기혈압'] < 80), '고혈압 여부'] = 0
df_pressure.loc[((df_pressure['수축기혈압'] >= 120) & (df_pressure['수축기혈압'] < 140))|
                ((df_pressure['이완기혈압'] >= 80) & (df_pressure['이완기혈압'] < 90)), '고혈압 여부'] = 1
df_pressure.loc[((df_pressure['수축기혈압'] >= 140) & (df_pressure['수축기혈압'] < 160))|
                ((df_pressure['이완기혈압'] >= 90) & (df_pressure['이완기혈압'] < 99)), '고혈압 여부'] = 2
df_pressure.loc[(df_pressure['수축기혈압'] >=160) | (df_pressure['이완기혈압'] >= 100), '고혈압 여부'] = 3

df['고혈압'] = df_pressure['고혈압 여부']
df = df.drop(['수축기혈압', '이완기혈압'], axis = 1)

In [12]:
df

Unnamed: 0,성별코드,연령대 코드,허리둘레,식전혈당,총콜레스테롤,트리글리세라이드,HDL콜레스테롤,LDL콜레스테롤,혈색소,요단백,혈청크레아티닌,AST,ALT,감마지티피,흡연상태,음주여부,복부비만,비만여부,고혈압
0,1,8,90.0,0.0,193.0,92.0,48.0,126.0,17.1,1.0,1.0,21.0,35.0,40.0,1.0,1.0,1.0,1.0,1.0
1,1,7,89.0,1.0,228.0,121.0,55.0,148.0,15.8,1.0,0.9,20.0,36.0,27.0,3.0,0.0,0.0,0.0,1.0
2,1,9,91.0,0.0,136.0,104.0,41.0,74.0,15.8,1.0,0.9,47.0,32.0,68.0,1.0,0.0,1.0,1.0,1.0
3,1,11,91.0,0.0,201.0,106.0,76.0,104.0,17.6,1.0,1.1,29.0,34.0,18.0,1.0,0.0,1.0,1.0,2.0
4,1,11,80.0,1.0,199.0,104.0,61.0,117.0,13.8,1.0,0.8,19.0,12.0,25.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2,11,84.0,1.0,,,,,12.2,1.0,0.7,18.0,15.0,19.0,1.0,0.0,0.0,1.0,0.0
999996,1,7,88.0,1.0,,,,,17.0,1.0,1.0,25.0,36.0,90.0,1.0,1.0,0.0,1.0,1.0
999997,1,8,87.0,0.0,,,,,16.4,1.0,0.9,21.0,15.0,36.0,2.0,1.0,0.0,1.0,3.0
999998,1,11,80.2,0.0,,,,,15.7,1.0,1.2,20.0,18.0,14.0,2.0,0.0,0.0,1.0,2.0


In [13]:
df = df.dropna(subset=['허리둘레','식전혈당','요단백','음주여부','흡연상태','총콜레스테롤','트리글리세라이드','HDL콜레스테롤','LDL콜레스테롤','혈색소','혈청크레아티닌','AST','ALT','감마지티피',"고혈압"])
df.isnull().sum()

성별코드        0
연령대 코드      0
허리둘레        0
식전혈당        0
총콜레스테롤      0
트리글리세라이드    0
HDL콜레스테롤    0
LDL콜레스테롤    0
혈색소         0
요단백         0
혈청크레아티닌     0
AST         0
ALT         0
감마지티피       0
흡연상태        0
음주여부        0
복부비만        0
비만여부        0
고혈압         0
dtype: int64

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
# 모델 평가를 위한 함수 설정
def get_clf_eval(y_test, y_pred):
    confusion = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    #precision = precision_score(y_test, y_pred) 이진분류에서만 사용가능
    #recall = recall_score(y_test, y_pred)
    #F1 = f1_score(y_test, y_pred)
    #AUC = roc_auc_score(y_test, y_pred)
    # 평가지표 출력
    print('오차행렬:\n', confusion)
    print('\n정확도: {:.4f}'.format(accuracy))
    #print('정밀도: {:.4f}'.format(precision))
    #print('재현율: {:.4f}'.format(recall))
    #print('F1: {:.4f}'.format(F1))
    #print('AUC: {:.4f}'.format(AUC))

In [15]:
#로지스틱회귀
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

target = df['식전혈당']
data = df.drop(['식전혈당'], axis=1)


#선형 회귀 계열의 로지스틱 회귀는 데이터의 정규 분포도에 따라 예측 성능 영향을 받을 수 있으므로 
#데이터에 먼저 정규 분포 형태의 표준 스케일링을 적용한 뒤에 train_test_split()을 이용해 데이터셋을 분리 한다.

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

X_train , X_test, y_train , y_test = train_test_split(data_scaled, target, test_size=0.25, random_state=0, stratify=target)

model = LogisticRegression(C = 0.1, penalty = 'l2')
model.fit(X_train, y_train)
predicts = model.predict(X_test)

# accuracy와 roc_auc 측정
print('accuracy: {:0.3f}'.format(accuracy_score(y_test, predicts)))


accuracy: 0.627


In [16]:
get_clf_eval(y_test, predicts)

오차행렬:
 [[184878  16900    776]
 [ 78932  20132   1117]
 [ 16179   8681   1300]]

정확도: 0.6273


In [17]:
from sklearn.model_selection import GridSearchCV

params={'penalty':['l2'],
        'C':[0.01, 0.1, 1, 1, 5, 10]}

grid_clf = GridSearchCV(model, param_grid=params, scoring='accuracy', cv=3 )
grid_clf.fit(data_scaled, target)
print('최적 하이퍼 파라미터:{0}, 최적 평균 정확도:{1:.3f}'.format(grid_clf.best_params_, 
                                                  grid_clf.best_score_))

최적 하이퍼 파라미터:{'C': 0.1, 'penalty': 'l2'}, 최적 평균 정확도:0.627


In [17]:
from sklearn.preprocessing import RobustScaler #outlier의 영향을 최소화 할수 있는 robust scaling 진행

scaler = RobustScaler()
data_scaled = scaler.fit_transform(data)

X_train , X_test, y_train , y_test = train_test_split(data, target, test_size=0.3, random_state=0)

In [19]:
from sklearn.tree import DecisionTreeClassifier #의사결정트리
from sklearn.model_selection import GridSearchCV

estimator = DecisionTreeClassifier()
params = {'criterion':['entropy'], 'max_depth':[None,2,3,4,5,6], 'max_leaf_nodes':[None,2,3,4,5,6,7], 'min_samples_split':[2,3,4,5,6], 'min_samples_leaf':[1,2,3]}
grid_clf = GridSearchCV(estimator, param_grid=params, cv=3, scoring='accuracy', n_jobs = -1)
grid_clf.fit(X_train, y_train)
print('최적 하이퍼 파라미터:{0}, 최적 평균 정확도:{1:.3f}'.format(grid_clf.best_params_, 
                                                  grid_clf.best_score_))

최적 하이퍼 파라미터:{'criterion': 'entropy', 'max_depth': 6, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}, 최적 평균 정확도:0.625


In [20]:
model = DecisionTreeClassifier(criterion = 'entropy', max_leaf_nodes = None, min_samples_leaf = 1, min_samples_split = 2, max_depth = 6)
model.fit(X_train, y_train)
predicts = model.predict(X_test)
print('예측 정확도: {:.4f}'.format(accuracy_score(y_test,predicts)))

예측 정확도: 0.6254


In [21]:
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.ensemble import RandomForestClassifier #랜덤포레스트
from sklearn.model_selection import GridSearchCV

params = { 'n_estimators' : [10, 100],
           'max_depth' : [6, 8, 10, 12],
           'min_samples_leaf' : [8, 12, 18],
           'min_samples_split' : [8, 16, 20]
            }

# RandomForestClassifier 객체 생성 후 GridSearchCV 수행
rf_clf = RandomForestClassifier(random_state = 0, n_jobs = -1)
grid_cv = GridSearchCV(rf_clf, param_grid = params, cv = 3, n_jobs = -1)
grid_cv.fit(X_train, y_train)

print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

최적 하이퍼 파라미터:  {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}
최고 예측 정확도: 0.6314


In [22]:
model = RandomForestClassifier(n_estimators = 100, 
                                max_depth = 12,
                                min_samples_leaf = 8,
                                min_samples_split = 8,
                                random_state = 0,
                                n_jobs = -1)
model.fit(X_train, y_train)
predicts = model.predict(X_test)
print('예측 정확도: {:.4f}'.format(accuracy_score(y_test,predicts)))

예측 정확도: 0.6321


In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [17]:
predicts = model.predict(X_test)
print('예측 정확도: {:.4f}'.format(accuracy_score(y_test,predicts)))

예측 정확도: 0.6273


In [29]:
X_train, X_test, y_train, y_test = train_test_split(data.to_numpy(), target.to_numpy(), train_size=0.8, random_state=1)

from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
# 훈련 데이터와 테스트 데이터에 대해서 원-핫 인코딩
print(y_train[:5])
print(y_test[:5])

[[0. 1. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]
[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]]


In [30]:
from tensorflow.keras.models import Sequential # 케라스의 Sequential()을 임포트
from tensorflow.keras.layers import Dense # 케라스의 Dense()를 임포트
from tensorflow.keras import optimizers # 케라스의 옵티마이저를 임포트

model=Sequential()
model.add(Dense(3, input_dim=18, activation='softmax'))
sgd=optimizers.SGD(lr=0.01)
# 학습률(learning rate, lr)은 0.01로 합니다.
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
# 옵티마이저는 경사하강법의 일종인 adam을 사용합니다.
# 손실 함수(Loss function)는 크로스 엔트로피 함수를 사용합니다.
model.fit(X_train,y_train, batch_size=1, epochs=200, validation_data=(X_test, y_test))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200

KeyboardInterrupt: 