# 5.2.1 데이터 추가 처리


In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored there (Colab-only; requires interactive authorization).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

# Lift pandas' display truncation: show every column and every row.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

# Read the input CSV from the mounted Drive and report its (rows, columns).
csv_path = '/content/drive/MyDrive/Colab Notebooks/Book2/Ch5/healthcare-dataset-2.csv'
df = pd.read_csv(csv_path)
df.shape

(3915, 12)

In [None]:
# Show the dtype of every column to tell numeric columns from object (string) ones.
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [None]:
df['gender'].value_counts(dropna=False)   # distribution by count (dropna=False also counts NaN, if any)

Female    2402
Male      1512
Other        1
Name: gender, dtype: int64

In [None]:
# Reference code
df['gender'].value_counts(dropna=False, normalize=True)   # distribution as proportions (relative frequencies)

Female    0.613538
Male      0.386207
Other     0.000255
Name: gender, dtype: float64

In [None]:
# Names of the columns treated as categorical variables.
cols1 = ['gender', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'residence_type', 'smoking_status']
df1 = df[cols1]   # sub-frame containing only the categorical columns

# Print the relative-frequency distribution of each categorical column.
# The loop reads from df1 -- the same frame it iterates over -- instead of
# reaching back into df, so the selection above is the single source of truth.
for col in df1.columns:
    print("")
    print("---- %s ---" % col)
    # dropna=False keeps NaN as its own category; normalize=True yields proportions.
    print(df1[col].value_counts(dropna=False, normalize=True))


---- gender ---
Female    0.613538
Male      0.386207
Other     0.000255
Name: gender, dtype: float64

---- hypertension ---
0    0.891699
1    0.108301
Name: hypertension, dtype: float64

---- heart_disease ---
0    0.941507
1    0.058493
Name: heart_disease, dtype: float64

---- ever_married ---
Yes    0.794636
No     0.205364
Name: ever_married, dtype: float64

---- work_type ---
Private          0.655428
Self-employed    0.187995
Govt_job         0.156066
Never_worked     0.000511
Name: work_type, dtype: float64

---- residence_type ---
Urban    0.5106
Rural    0.4894
Name: residence_type, dtype: float64

---- smoking_status ---
never smoked       0.419668
formerly smoked    0.201277
Unknown            0.199234
smokes             0.179821
Name: smoking_status, dtype: float64


In [None]:
# Reference code
# Column names as a plain Python list.
list(df.columns)

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [None]:
# Reference code
# Count of each ever_married value (NaN excluded by default).
df['ever_married'].value_counts()

Yes    3111
No      804
Name: ever_married, dtype: int64

In [None]:
# Reference code
# One-hot encode ever_married into a separate frame (df itself is not modified).
cols2 = ['ever_married']
df100 = pd.get_dummies(df, columns=cols2) 
# NOTE: get_dummies replaces 'ever_married' with indicator columns, so
# df100['ever_married'].value_counts() would raise a KeyError here.

In [None]:
# Reference code
# Column names of the one-hot encoded frame, as a plain Python list.
list(df100.columns)

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'work_type',
 'residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke',
 'ever_married_No',
 'ever_married_Yes']

In [None]:
from sklearn.preprocessing import OrdinalEncoder   # maps each category to a numeric code

# Encode ever_married: pass the column as a single-column 2-D array and store
# the resulting codes in a new column next to the original values.
ever_married_2d = df['ever_married'].to_numpy().reshape(-1, 1)
df['ever_married_encoded'] = OrdinalEncoder().fit_transform(ever_married_2d)

# Before/after comparison table: row count per (original value, code) pair.
df.groupby(by=['ever_married', 'ever_married_encoded']).size()

ever_married  ever_married_encoded
No            0.0                      804
Yes           1.0                     3111
dtype: int64

In [None]:
# Encode gender: pass the column as a single-column 2-D array and store the
# resulting codes in a new column next to the original values.
gender_2d = df['gender'].to_numpy().reshape(-1, 1)
df['gender_encoded'] = OrdinalEncoder().fit_transform(gender_2d)

# Before/after comparison table: row count per (original value, code) pair.
df.groupby(by=['gender', 'gender_encoded']).size()

gender  gender_encoded
Female  0.0               2402
Male    1.0               1512
Other   2.0                  1
dtype: int64

In [None]:
# Encode work_type: pass the column as a single-column 2-D array and store the
# resulting codes in a new column next to the original values.
work_type_2d = df['work_type'].to_numpy().reshape(-1, 1)
df['work_type_encoded'] = OrdinalEncoder().fit_transform(work_type_2d)

# Before/after comparison table: row count per (original value, code) pair.
df.groupby(by=['work_type', 'work_type_encoded']).size()

work_type      work_type_encoded
Govt_job       0.0                   611
Never_worked   1.0                     2
Private        2.0                  2566
Self-employed  3.0                   736
dtype: int64

In [None]:
# Encode residence_type: pass the column as a single-column 2-D array and
# store the resulting codes in a new column next to the original values.
residence_type_2d = df['residence_type'].to_numpy().reshape(-1, 1)
df['residence_type_encoded'] = OrdinalEncoder().fit_transform(residence_type_2d)

# Before/after comparison table: row count per (original value, code) pair.
df.groupby(by=['residence_type', 'residence_type_encoded']).size()

residence_type  residence_type_encoded
Rural           0.0                       1916
Urban           1.0                       1999
dtype: int64

In [None]:
# Encode smoking_status: pass the column as a single-column 2-D array and
# store the resulting codes in a new column next to the original values.
smoking_status_2d = df['smoking_status'].to_numpy().reshape(-1, 1)
df['smoking_status_encoded'] = OrdinalEncoder().fit_transform(smoking_status_2d)

# Before/after comparison table: row count per (original value, code) pair.
df.groupby(by=['smoking_status', 'smoking_status_encoded']).size()

smoking_status   smoking_status_encoded
Unknown          0.0                        780
formerly smoked  1.0                        788
never smoked     2.0                       1643
smokes           3.0                        704
dtype: int64

In [None]:
# Column index of df, now including the newly added *_encoded columns.
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke', 'ever_married_encoded', 'gender_encoded',
       'work_type_encoded', 'residence_type_encoded',
       'smoking_status_encoded'],
      dtype='object')

In [None]:
# Drop the `id` column and the original string-valued categorical columns,
# leaving their *_encoded counterparts in place.
# errors='ignore' skips labels that have already been removed, so this cell
# can be re-run safely (the previous inplace=True form raised a KeyError on a
# second run). Reassigning the result replaces the in-place mutation.
df = df.drop(columns=['id', 'residence_type', 'ever_married', 'gender', 'work_type', 'smoking_status'],
             errors='ignore')

In [None]:
# The book uses the simpler `df.columns` command here.
list(df.columns)

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'stroke',
 'ever_married_encoded',
 'gender_encoded',
 'work_type_encoded',
 'residence_type_encoded',
 'smoking_status_encoded']

In [None]:
# (rows, columns) of df at this point in the notebook.
df.shape

(3915, 11)

In [None]:
df.isna().any()[lambda x: x]   # keep only the True entries: the names of columns that contain a null value

Series([], dtype: bool)

In [None]:
df['bmi'].isnull().sum()       # number of missing values in bmi

0

In [None]:
# Write the processed frame to Drive; index=False omits the row index from the CSV.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/Book2/Ch5/healthcare-dataset-3.csv', index=False)