<a href="https://colab.research.google.com/github/jaehyun0220/Colab/blob/master/SDS_PublicData_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 3조. 건강검진 데이터를 활용한 치아우식증 발생 예측
#### # Ver 3. 외부변수 추가

In [1]:
# Auth 인증 및 Google Drive 활용 Data load
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


#### #2. 작업환경 세팅

In [2]:
# 기본 라이브러리 로드
import pandas as pd
import numpy as np
import os, sys

from tqdm import tqdm_notebook

import re
import tensorflow as tf

#데이터 전처리 관련 라이브러리 로드
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ShuffleSplit

#모델 알고리즘 로드
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier

# Deep Learning Model 로드
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation 
from keras.wrappers.scikit_learn import KerasClassifier

#HyperParameter Tuning을 위한 라이브러리 로드
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#모델 평가를 위한 라이브러리 로드
from sklearn import metrics, model_selection
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, auc

#수학 & 통계 관련 라이브러리 로드
import scipy.stats as st
from collections import Counter
import math

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Visualization
import seaborn as sns
import matplotlib as mpl  # 기본 설정 만지는 용도
import matplotlib.pyplot as plt  # 그래프 그리는 용도
import matplotlib.font_manager as fm  # 폰트 관련 용도


#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')

Using TensorFlow backend.


#### #3.사용할 사용자 함수 정의

In [0]:
def auc_graph(roc_auc, fpr, tpr):
  plt.title('Receiver Operating Characteristic')
  plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
  plt.legend(loc = 'lower right')
  plt.plot([0, 1], [0, 1],'r--')
  plt.xlim([0, 1])
  plt.ylim([0, 1])
  plt.ylabel('True Positive Rate')
  plt.xlabel('False Positive Rate')
  plt.show()

#### #4.원천 데이터 load 및 seed 설정

In [4]:
set_random_seed = 2580 # seed 지정
target_nm = 'dental_carries' # 타겟 변수 지정

#### 2013년 기준 #########################################################################################################################################
df_raw_2013 = pd.read_csv('../gdrive/My Drive/sds/data/NHIS_OPEN_GJ_2013_eng.csv', encoding = 'euc-kr')

# 컬럼명 내 불필요한 공백 및 특수문자 제거
df_raw_2013.rename(columns=lambda x: re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》 ]', '', x), inplace=True)

# 구강검진 결과가 있는 데이터셋만 load
df_data = df_raw_2013[df_raw_2013['examine_mouth']==1]

# 분석과 관련 없는 변수 및 기타 점검이 어려운 치아 관련 변수 삭제 
del_cols = ['baseyear', 'id', 'data_open_date','examine_mouth']
df_data.drop(columns=del_cols, axis=1, inplace=True)

# 치아우식증 결과가 있는 데이터만 끌고 옴
df_data = df_data[~(df_data['dental_carries'].isnull()) & (df_data['dental_carries'] != 2)]

# null 포함 데이터 모두 삭제 
df_data.dropna(how='any', inplace=True)
display(df_data.info())
print(len(df_data))

print(df_data[target_nm].value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380657 entries, 3 to 999998
Data columns (total 30 columns):
sex                      380657 non-null int64
ageband                  380657 non-null int64
province                 380657 non-null int64
height                   380657 non-null int64
weight                   380657 non-null int64
waist                    380657 non-null int64
sight_l                  380657 non-null float64
sight_r                  380657 non-null float64
hearing_l                380657 non-null float64
hearing_r                380657 non-null float64
bp_systolic              380657 non-null int64
bp_diastolic             380657 non-null int64
bs_before                380657 non-null int64
tot_cholesterol          380657 non-null int64
triglycerides            380657 non-null int64
HDL_cholesterol          380657 non-null int64
LDL_cholesterol          380657 non-null float64
hemoglobin               380657 non-null float64
piu                      380657

None

380657
0.0    288017
1.0     92640
Name: dental_carries, dtype: int64


#### #5. 데이터샘플링 및 Wrangling

In [5]:
df_sample = df_data.copy()
print(df_sample[target_nm].value_counts())

0.0    288017
1.0     92640
Name: dental_carries, dtype: int64


In [0]:
# 명목형 변수 변경
category_features = ['sex', 'ageband', 'height', 'weight','province', 'hearing_l', 'hearing_r', 'smoking','drinking', 'piu', 'missing_tooth', 'dental_abrasion', 'wisdom_teeth_abnormal', 'plaque']

for col in df_data.columns:
  if col in category_features:
    df_sample[col] = df_sample[col].astype(object)

##### #5-1.내부 명목형 변수 묶기

In [0]:
# 명목형 변수의 범주화
# 성별코드 변환
df_sample["C_sex"] = df_sample["sex"].apply(lambda x:  'Male' if x == 1 else 'Female')

# 연령대코드 5세 단위 변환
min_age_code = df_sample["ageband"].min()

df_sample["C_ageband"] = df_sample["ageband"].apply(lambda x:  (x-1)*5 + 20 if min_age_code == 1 else (x-1)*5).astype(object)

# 시도코드 변환
df_sample["C_province"] = np.where(df_sample['province'] == 11, 'Seoul', 
                            np.where(df_sample['province']== 26, 'Busan', 
                              np.where(df_sample['province'] == 27, 'Daegu', 
                                np.where(df_sample['province'] == 28, 'Incheon', 
                                  np.where(df_sample['province'] == 29, 'Kwangju', 
                                    np.where(df_sample['province'] == 30, 'Daejeon', 
                                      np.where(df_sample['province'] == 31, 'Ulsan', 
                                        np.where(df_sample['province'] == 36, 'Sejong', 
                                          np.where(df_sample['province'] == 41, 'Gyeonggi', 
                                            np.where(df_sample['province'] == 42, 'Gangwon', 
                                              np.where(df_sample['province'] == 43, 'Chungbuk', 
                                                np.where(df_sample['province'] == 44, 'Chungnam', 
                                                  np.where(df_sample['province'] == 45, 'Jeonbuk', 
                                                    np.where(df_sample['province'] == 46, 'Jeonnam', 
                                                      np.where(df_sample['province'] == 47, 'Gyungbuk', 
                                                        np.where(df_sample['province'] == 48, 'Gyungnam', 
                                                          np.where(df_sample['province'] == 49, 'Jeju', 'Err')
                                      ))))))))))))))))

# 청력좌  변환
df_sample["C_hearing_l"] = df_sample["hearing_l"].apply(lambda x:  'Normal' if x == 1 else 'Abnormal')

# 청력우  변환
df_sample["C_hearing_r"] = df_sample["hearing_r"].apply(lambda x:  'Normal' if x == 1 else 'Abnormal')

# 요단백  변환
df_sample["C_piu"] = df_sample["piu"].apply(lambda x:  'Negative' if x == 1 else 'Positive')

# 흡연상태  변환
df_sample["C_smoking"] = df_sample["smoking"].apply(lambda x:  'NonSmoking' if x == 1 else ('StopSmoking' if x==2 else 'Smoking'))

# 음주여부  변환
df_sample["C_drinking"] = df_sample["drinking"].apply(lambda x:  'NonDrinking' if x == 0 else 'Drinking')

# 결손치 여부  변환
df_sample["C_missing_tooth"] = df_sample["missing_tooth"].apply(lambda x:  'Normal' if x == 0 else 'Abnormal')

# 치석 여부  변환
df_sample["C_dental_abrasion"] = df_sample["dental_abrasion"].apply(lambda x:  'Normal' if x == 0 else 'Abnormal')

# 치석 여부  변환
df_sample["C_wisdom_teeth_abnormal"] = df_sample["wisdom_teeth_abnormal"].apply(lambda x:  'Normal' if x == 0 else 'Abnormal')

# 치석 여부  변환
df_sample["C_plaque"] = df_sample["plaque"].apply(lambda x:  'Normal' if x == 0 else 'Abnormal')

# 변환 명목형 원 컬럼 삭제
del_obj_trans_cols = ['sex', 'ageband', 'province', 'hearing_l', 'hearing_r', 'piu', 'smoking', 'drinking', 'missing_tooth', 'dental_abrasion', 'wisdom_teeth_abnormal', 'plaque']
df_sample.drop(columns=del_obj_trans_cols, axis=1, inplace=True)

##### #5-2. 내부 수치형 변수 구간화

##### #5-3. 내부 수치형 변수 정규화

In [0]:
# 수치형 변수들 중 분포 이슈로 log 값을 취할 필요 있는 변수들 추출
num_ln_target_features = ['sight_l', 'sight_r', 'AST','ALT', 'bs_before','serum_creatinine','GammaGTP','tot_cholesterol', 'triglycerides']

# Feature Engineering의 일환으로 Log 값 취한 뒤 Normalization을 하여 변수명 + LN (lognorm)으로 열 추가 후 그래프 다시 그림
num_ln_cols = list(map(lambda x: "LN_"+str(x), num_ln_target_features))

num_pipeline = Pipeline([
        ('log_scaler', FunctionTransformer(np.log1p, validate=True)), # pipeline 내 log transformation을 위해 Function Transfomer 사용
        ('normalizer', MinMaxScaler()),
    ])

piped_np = num_pipeline.fit_transform(df_sample[num_ln_target_features])
piped_df = pd.DataFrame(piped_np, columns=num_ln_cols)

piped_df.head()

df_sample.drop(columns=num_ln_target_features, axis=1, inplace=True)

In [9]:
print("Before Re-indexing: Data count is ", len(df_sample))
df_sample.drop_duplicates()
df_sample.reset_index(inplace=True, drop=True)
print("After Re-indexing: Data count is ", len(df_sample))

df_fe = pd.concat([df_sample, piped_df], axis=1)
df_fe.head()

Before Re-indexing: Data count is  380657
After Re-indexing: Data count is  380657


Unnamed: 0,height,weight,waist,bp_systolic,bp_diastolic,HDL_cholesterol,LDL_cholesterol,hemoglobin,dental_carries,C_sex,C_ageband,C_province,C_hearing_l,C_hearing_r,C_piu,C_smoking,C_drinking,C_missing_tooth,C_dental_abrasion,C_wisdom_teeth_abnormal,C_plaque,LN_sight_l,LN_sight_r,LN_AST,LN_ALT,LN_bs_before,LN_serum_creatinine,LN_GammaGTP,LN_tot_cholesterol,LN_triglycerides
0,145,40,62,110,70,87,110.0,12.4,1.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.260671,0.260671,0.324587,0.242257,0.273586,0.097408,0.393001,0.562271,0.524264
1,145,45,72,100,53,46,140.0,13.1,1.0,Female,20,Seoul,Normal,Normal,Negative,NonSmoking,Drinking,Normal,Normal,Normal,Abnormal,0.214731,0.037939,0.313473,0.301486,0.300466,0.110197,0.313119,0.56037,0.632675
2,145,60,81,119,79,55,1494.0,12.7,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.105152,0.214731,0.430787,0.469957,0.389152,0.083842,0.412729,0.578979,0.544586
3,150,40,61,132,88,83,33.0,13.7,0.0,Female,20,Gyungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Abnormal,0.214731,0.214731,0.353958,0.301486,0.32854,0.122296,0.258977,0.364558,0.511384
4,150,40,61,95,65,56,83.0,12.4,0.0,Female,20,Gyeonggi,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Abnormal,0.135235,0.135235,0.34475,0.362609,0.289235,0.083842,0.258977,0.438554,0.539037


#### #6. 외부변수 추가

In [10]:
df_add1 = pd.read_csv('../gdrive/My Drive/sds/data/DentalExamineResult_2014_PortionbyTotInspector.csv', encoding = 'euc-kr')
df_add2 = pd.read_csv('../gdrive/My Drive/sds/data/KOSIS_AgeSex_AverageDentalVisitCnt_2012.csv', encoding = 'euc-kr')
df_add3 = pd.read_csv('../gdrive/My Drive/sds/data/KOSIS_DentalPrevalenceTrend_2012.csv', encoding = 'euc-kr')
df_add4 = pd.read_csv('../gdrive/My Drive/sds/data/chs_12_final_from_python.csv', encoding = 'euc-kr')

# Multiple Key를 사용하여 Join
# left_on=['column_name1','column_name2'], right_on = ['column_name3','column_name4']
print(len(df_fe))

df_new = pd.merge(left=df_fe, right=df_add1, how='outer', left_on=['C_province','C_sex'], right_on = ['Province','Sex'], sort=False)
df_new.drop(columns=['Province','Sex'], axis=1, inplace=True)
display(df_new.head())
print(len(df_new))

df_new = pd.merge(left=df_new, right=df_add2, how='outer', left_on=['C_ageband','C_sex'], right_on = ['Ageband','Sex'], sort=False)
df_new.drop(columns=['Ageband','Sex'], axis=1, inplace=True)
display(df_new.head())
print(len(df_new))

df_new = pd.merge(left=df_new, right=df_add3, how='outer', left_on=['C_ageband','C_sex'], right_on = ['Ageband','Sex'], sort=False)
df_new.drop(columns=['Ageband','Sex'], axis=1, inplace=True)
display(df_new.head())
print(len(df_new))

df_new = pd.merge(left=df_new, right=df_add4, how='left', left_on=['C_ageband','C_sex','C_province', 'weight', 'height'], right_on = ['R_ageband','R_sex','R_province','R_weight','R_height'], sort=False)
df_new.drop(columns=['R_ageband','R_sex','R_province','R_weight','R_height'], axis=1, inplace=True)
display(df_new.head())
print(len(df_new))

display(df_new.info())

380657


Unnamed: 0,height,weight,waist,bp_systolic,bp_diastolic,HDL_cholesterol,LDL_cholesterol,hemoglobin,dental_carries,C_sex,C_ageband,C_province,C_hearing_l,C_hearing_r,C_piu,C_smoking,C_drinking,C_missing_tooth,C_dental_abrasion,C_wisdom_teeth_abnormal,C_plaque,LN_sight_l,LN_sight_r,LN_AST,LN_ALT,LN_bs_before,LN_serum_creatinine,LN_GammaGTP,LN_tot_cholesterol,LN_triglycerides,A_NormalA_Result,A_NormalB_Result,A_Caution_Result,A_NeedCare_Result,A_Nutrition_Edu,A_Hygine_Edu,A_Fluoride_Edu,A_Examine_Rec,A_Care_Rec,A_Carries_Rec,A_Cure_Rec
0,145,40,62,110,70,87,110.0,12.4,1.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.260671,0.260671,0.324587,0.242257,0.273586,0.097408,0.393001,0.562271,0.524264,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09
1,145,60,81,119,79,55,1494.0,12.7,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.105152,0.214731,0.430787,0.469957,0.389152,0.083842,0.412729,0.578979,0.544586,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09
2,150,40,74,100,60,62,75.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Negative,Smoking,Drinking,Normal,Normal,Normal,Abnormal,0.214731,0.214731,0.362668,0.288594,0.311299,0.069401,0.385848,0.428035,0.52114,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09
3,150,45,78,125,70,65,74.0,14.5,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,Drinking,Normal,Normal,Normal,Normal,0.238306,0.189809,0.370931,0.274579,0.325169,0.083842,0.242023,0.44371,0.572095,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09
4,150,50,81,118,70,60,135.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Positive,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.214731,0.260671,0.386285,0.274579,0.32176,0.110197,0.334605,0.552673,0.504536,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09


380657


Unnamed: 0,height,weight,waist,bp_systolic,bp_diastolic,HDL_cholesterol,LDL_cholesterol,hemoglobin,dental_carries,C_sex,C_ageband,C_province,C_hearing_l,C_hearing_r,C_piu,C_smoking,C_drinking,C_missing_tooth,C_dental_abrasion,C_wisdom_teeth_abnormal,C_plaque,LN_sight_l,LN_sight_r,LN_AST,LN_ALT,LN_bs_before,LN_serum_creatinine,LN_GammaGTP,LN_tot_cholesterol,LN_triglycerides,A_NormalA_Result,A_NormalB_Result,A_Caution_Result,A_NeedCare_Result,A_Nutrition_Edu,A_Hygine_Edu,A_Fluoride_Edu,A_Examine_Rec,A_Care_Rec,A_Carries_Rec,A_Cure_Rec,A_AverageDentalHospitalVisitCnt,A_AverageDentalClinicVisitCnt
0,145,40,62,110,70,87,110.0,12.4,1.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.260671,0.260671,0.324587,0.242257,0.273586,0.097408,0.393001,0.562271,0.524264,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57
1,145,60,81,119,79,55,1494.0,12.7,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.105152,0.214731,0.430787,0.469957,0.389152,0.083842,0.412729,0.578979,0.544586,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57
2,150,40,74,100,60,62,75.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Negative,Smoking,Drinking,Normal,Normal,Normal,Abnormal,0.214731,0.214731,0.362668,0.288594,0.311299,0.069401,0.385848,0.428035,0.52114,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57
3,150,45,78,125,70,65,74.0,14.5,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,Drinking,Normal,Normal,Normal,Normal,0.238306,0.189809,0.370931,0.274579,0.325169,0.083842,0.242023,0.44371,0.572095,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57
4,150,50,81,118,70,60,135.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Positive,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.214731,0.260671,0.386285,0.274579,0.32176,0.110197,0.334605,0.552673,0.504536,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57


380657


Unnamed: 0,height,weight,waist,bp_systolic,bp_diastolic,HDL_cholesterol,LDL_cholesterol,hemoglobin,dental_carries,C_sex,C_ageband,C_province,C_hearing_l,C_hearing_r,C_piu,C_smoking,C_drinking,C_missing_tooth,C_dental_abrasion,C_wisdom_teeth_abnormal,C_plaque,LN_sight_l,LN_sight_r,LN_AST,LN_ALT,LN_bs_before,LN_serum_creatinine,LN_GammaGTP,LN_tot_cholesterol,LN_triglycerides,A_NormalA_Result,A_NormalB_Result,A_Caution_Result,A_NeedCare_Result,A_Nutrition_Edu,A_Hygine_Edu,A_Fluoride_Edu,A_Examine_Rec,A_Care_Rec,A_Carries_Rec,A_Cure_Rec,A_AverageDentalHospitalVisitCnt,A_AverageDentalClinicVisitCnt,A_DentalPrevalenceTrend
0,145,40,62,110,70,87,110.0,12.4,1.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.260671,0.260671,0.324587,0.242257,0.273586,0.097408,0.393001,0.562271,0.524264,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57,382
1,145,60,81,119,79,55,1494.0,12.7,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.105152,0.214731,0.430787,0.469957,0.389152,0.083842,0.412729,0.578979,0.544586,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57,382
2,150,40,74,100,60,62,75.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Negative,Smoking,Drinking,Normal,Normal,Normal,Abnormal,0.214731,0.214731,0.362668,0.288594,0.311299,0.069401,0.385848,0.428035,0.52114,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57,382
3,150,45,78,125,70,65,74.0,14.5,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,Drinking,Normal,Normal,Normal,Normal,0.238306,0.189809,0.370931,0.274579,0.325169,0.083842,0.242023,0.44371,0.572095,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57,382
4,150,50,81,118,70,60,135.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Positive,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.214731,0.260671,0.386285,0.274579,0.32176,0.110197,0.334605,0.552673,0.504536,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,0.09,102.84,1036.57,382


380657


Unnamed: 0,height,weight,waist,bp_systolic,bp_diastolic,HDL_cholesterol,LDL_cholesterol,hemoglobin,dental_carries,C_sex,C_ageband,C_province,C_hearing_l,C_hearing_r,C_piu,C_smoking,C_drinking,C_missing_tooth,C_dental_abrasion,C_wisdom_teeth_abnormal,C_plaque,LN_sight_l,LN_sight_r,LN_AST,LN_ALT,LN_bs_before,LN_serum_creatinine,LN_GammaGTP,LN_tot_cholesterol,LN_triglycerides,A_NormalA_Result,A_NormalB_Result,A_Caution_Result,A_NeedCare_Result,A_Nutrition_Edu,A_Hygine_Edu,A_Fluoride_Edu,A_Examine_Rec,A_Care_Rec,A_Carries_Rec,...,R_income,R_AnemiaDiag,R_AnginaPectorisDiag,R_ArthritisDiag,R_Asthma_Diag,R_MasticationLesion,R_BHepatitisDiag,R_CHepatitisDiag,R_HemorrhoidsDiag,R_HealthInstExp,R_HBP_Diag,R_PronounceLesion,R_DentureUse,R_SubjHealthLevel,R_EQVAS,R_FinEduGrade,R_DentDidNotExp,R_EQ5DNormLife,R_CPRRecognition,R_DrinkStartAge,R_FamilyCnt,R_AveSleepTime,R_AIDSRecognition,R_BPCheckinYear,R_WalkingDay,R_WalkingMinutes,R_SmokingStartAge,R_NutriChk,R_DrinkFreq,R_NearGYM,R_EQ5DPain,R_EQ5DAthleticAbility,R_BPCheckinYear.1,R_StressIndex,R_ExerciseMidHour,R_DrinkPerOnce,R_BreakfastperWeek,R_EQ5DSelfManage,R_ExerciseHighHour,target
0,145,40,62,110,70,87,110.0,12.4,1.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.260671,0.260671,0.324587,0.242257,0.273586,0.097408,0.393001,0.562271,0.524264,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,...,3600.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,65.0,6.0,0.0,0.0,1.0,19.0,5.0,9.0,1.0,0.0,3.0,15.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,7.0,0.0,0.0,0.0
1,145,60,81,119,79,55,1494.0,12.7,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.105152,0.214731,0.430787,0.469957,0.389152,0.083842,0.412729,0.578979,0.544586,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,150,40,74,100,60,62,75.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Negative,Smoking,Drinking,Normal,Normal,Normal,Abnormal,0.214731,0.214731,0.362668,0.288594,0.311299,0.069401,0.385848,0.428035,0.52114,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,...,6320.0,0.4,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.6,0.0,0.0,0.2,0.2,75.0,5.4,0.4,0.0,0.6,19.0,3.2,6.8,0.8,3.0,5.0,10.0,0.0,0.2,0.4,0.8,0.2,0.0,0.6,1.0,0.4,1.0,4.6,0.0,2.4,0.4
3,150,45,78,125,70,65,74.0,14.5,0.0,Female,20,Chungbuk,Normal,Normal,Negative,NonSmoking,Drinking,Normal,Normal,Normal,Normal,0.238306,0.189809,0.370931,0.274579,0.325169,0.083842,0.242023,0.44371,0.572095,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,...,3155.555556,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.222222,76.666667,6.444444,0.444444,0.111111,1.0,16.555556,3.111111,7.333333,0.888889,0.444444,3.333333,13.888889,0.0,0.444444,0.333333,0.777778,0.444444,0.0,0.333333,1.0,0.0,0.777778,3.555556,0.0,0.222222,0.222222
4,150,50,81,118,70,60,135.0,13.3,0.0,Female,20,Chungbuk,Normal,Normal,Positive,NonSmoking,NonDrinking,Normal,Normal,Normal,Normal,0.214731,0.260671,0.386285,0.274579,0.32176,0.110197,0.334605,0.552673,0.504536,0.14,0.39,0.43,0.63,0.34,1.41,0.93,0.07,1.36,0.59,...,4700.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.5,68.333333,6.5,0.0,0.0,1.0,19.333333,4.5,7.333333,1.0,0.833333,5.5,28.333333,0.0,0.666667,0.0,1.0,0.0,0.0,0.166667,0.833333,0.333333,0.833333,3.5,0.0,1.0,0.0


380657
<class 'pandas.core.frame.DataFrame'>
Int64Index: 380657 entries, 0 to 380656
Data columns (total 84 columns):
height                             380657 non-null object
weight                             380657 non-null object
waist                              380657 non-null int64
bp_systolic                        380657 non-null int64
bp_diastolic                       380657 non-null int64
HDL_cholesterol                    380657 non-null int64
LDL_cholesterol                    380657 non-null float64
hemoglobin                         380657 non-null float64
dental_carries                     380657 non-null float64
C_sex                              380657 non-null object
C_ageband                          380657 non-null object
C_province                         380657 non-null object
C_hearing_l                        380657 non-null object
C_hearing_r                        380657 non-null object
C_piu                              380657 non-null object
C_smoking    

None

In [11]:
# 외부변수와 매칭이 되지 않는 항목은 버림
df_new.dropna(how='any', inplace=True)
print(df_new[target_nm].value_counts())

0.0    279141
1.0     89200
Name: dental_carries, dtype: int64


In [12]:
# 치아우식 0과 1의 값을 5만건씩 총 10만건 추출
df_new = df_new.groupby(target_nm).apply(lambda x: x.sample(n=50000, random_state=set_random_seed))
df_new.reset_index(drop=True, inplace=True)
print(df_new[target_nm].value_counts())

1.0    50000
0.0    50000
Name: dental_carries, dtype: int64


In [13]:
# 독립변수와 종속변수를 나눠 줌
data_x = df_new[df_new.columns.difference([target_nm])]
data_y = df_new[target_nm].astype('float64')

# 데이터를 나눈 뒤 속성별로 컬럼을 분류
num_attribs = [col for col in data_x.columns if data_x[col].dtype in ['int64','float64']]
cat_attribs = [col for col in data_x.columns if data_x[col].dtype not in ['int64','float64']]

num_attribs = list(set(num_attribs) - set([target_nm]))

print("num_attribs: ", num_attribs)
print("cat_attribs: ", cat_attribs)

num_attribs:  ['R_CPRRecognition', 'R_EQVAS', 'R_EQ5DNormLife', 'R_SmokingStartAge', 'R_BPCheckinYear', 'R_Asthma_Diag', 'R_WalkingDay', 'A_NormalB_Result', 'R_NearGYM', 'A_DentalPrevalenceTrend', 'R_MasticationLesion', 'R_AIDSRecognition', 'R_HealthInstExp', 'R_SubjHealthLevel', 'waist', 'LN_ALT', 'R_FinEduGrade', 'A_AverageDentalClinicVisitCnt', 'A_NormalA_Result', 'R_PronounceLesion', 'R_DrinkStartAge', 'R_income', 'R_DrinkPerOnce', 'R_BHepatitisDiag', 'R_ExerciseMidHour', 'R_DentDidNotExp', 'R_NutriChk', 'R_BPCheckinYear.1', 'LN_AST', 'R_DentureUse', 'A_NeedCare_Result', 'LDL_cholesterol', 'R_EQ5DAthleticAbility', 'R_EQ5DPain', 'A_Carries_Rec', 'LN_serum_creatinine', 'R_DrinkFreq', 'LN_GammaGTP', 'A_Caution_Result', 'A_Nutrition_Edu', 'A_Cure_Rec', 'LN_bs_before', 'R_BreakfastperWeek', 'A_AverageDentalHospitalVisitCnt', 'R_WalkingMinutes', 'LN_sight_l', 'R_ExerciseHighHour', 'A_Care_Rec', 'R_AnginaPectorisDiag', 'R_HBP_Diag', 'A_Fluoride_Edu', 'LN_triglycerides', 'R_CHepatitisDiag'

In [0]:
label = LabelEncoder()
for col in data_x[cat_attribs].columns:   
  data_x[col] = label.fit_transform(data_x[col])
  data_x = pd.get_dummies(data_x, columns =[col], prefix=col+"_lb")

In [0]:
# 데이터를 나눈 뒤 속성별로 컬럼을 분류
num_attribs = [col for col in data_x.columns if data_x[col].dtype in ['int64','float64']]
cat_attribs = [col for col in data_x.columns if data_x[col].dtype not in ['int64','float64']]

num_attribs = list(set(num_attribs) - set([target_nm]))

In [0]:
# 수치형 변수 정규화 
num_pipeline = Pipeline([
        ('min_max_scaler', MinMaxScaler()),
    ])

# numpy 형식으로 전체 변경
full_pipeline = ColumnTransformer([
        ("num_pipeline", num_pipeline, num_attribs),
        ("cat_encoder", OneHotEncoder(sparse=False), cat_attribs),
    ])

data_x_piped = full_pipeline.fit_transform(data_x)

data_y_piped = data_y.values

In [0]:
from sklearn.model_selection import train_test_split
# train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size = 0.2, random_state = set_random_seed)
train_x, test_x, train_y, test_y = train_test_split(data_x_piped, data_y_piped, test_size = 0.2, random_state = set_random_seed)

In [18]:
model_lr = linear_model.LogisticRegression().fit(train_x, train_y)
pred_y_lr = model_lr.predict(test_x)
pred_y_lr = (pred_y_lr>0.5)
print("Accuracy: {:.5f}".format(accuracy_score(test_y, pred_y_lr)))
print("Confusion Matrix: \n", confusion_matrix(test_y, pred_y_lr))
print("Classification Report Matrix: \n", classification_report(test_y, pred_y_lr, digits=3))

Accuracy: 0.64320
Confusion Matrix: 
 [[6480 3600]
 [3536 6384]]
Classification Report Matrix: 
               precision    recall  f1-score   support

         0.0      0.647     0.643     0.645     10080
         1.0      0.639     0.644     0.641      9920

    accuracy                          0.643     20000
   macro avg      0.643     0.643     0.643     20000
weighted avg      0.643     0.643     0.643     20000



In [19]:
model_dt = tree.DecisionTreeClassifier().fit(train_x, train_y)
pred_y_dt = model_dt.predict(test_x)
pred_y_dt = (pred_y_dt>0.5)
print("Accuracy: {:.5f}".format(accuracy_score(test_y, pred_y_dt)))
print("Confusion Matrix: \n", confusion_matrix(test_y, pred_y_dt))
print("Classification Report Matrix: \n", classification_report(test_y, pred_y_dt, digits=3))

Accuracy: 0.55320
Confusion Matrix: 
 [[5531 4549]
 [4387 5533]]
Classification Report Matrix: 
               precision    recall  f1-score   support

         0.0      0.558     0.549     0.553     10080
         1.0      0.549     0.558     0.553      9920

    accuracy                          0.553     20000
   macro avg      0.553     0.553     0.553     20000
weighted avg      0.553     0.553     0.553     20000



In [20]:
model_rf = ensemble.RandomForestClassifier().fit(train_x, train_y)
pred_y_rf = model_rf.predict(test_x)
pred_y_rf = (pred_y_rf>0.5)
print("Accuracy: {:.5f}".format(accuracy_score(test_y, pred_y_rf)))
print("Confusion Matrix: \n", confusion_matrix(test_y, pred_y_rf))
print("Classification Report Matrix: \n", classification_report(test_y, pred_y_rf, digits=3))

Accuracy: 0.59390
Confusion Matrix: 
 [[6721 3359]
 [4763 5157]]
Classification Report Matrix: 
               precision    recall  f1-score   support

         0.0      0.585     0.667     0.623     10080
         1.0      0.606     0.520     0.559      9920

    accuracy                          0.594     20000
   macro avg      0.595     0.593     0.591     20000
weighted avg      0.595     0.594     0.592     20000

