# 3.7.1 데이터 추가 처리

In [3]:
import pandas as pd
import numpy as np
# Lift pandas' display truncation so cell output shows every column and every row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read the CSV from the working directory and report (rows, columns).
df = pd.read_csv('healthcare-dataset-2.csv')
df.shape

(3915, 12)

In [4]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [5]:
df['gender'].value_counts(dropna=False)   # distribution of gender as raw counts (NaN included)

Female    2402
Male      1512
Other        1
Name: gender, dtype: int64

In [6]:
# Reference code
df['gender'].value_counts(dropna=False, normalize=True)   # distribution of gender as proportions

Female    0.613538
Male      0.386207
Other     0.000255
Name: gender, dtype: float64

In [7]:
# Names of the categorical variables, kept in cols1
cols1 = ['gender', 'hypertension', 'heart_disease', 'ever_married',
         'work_type', 'residence_type', 'smoking_status']
df1 = df[cols1]   # sub-frame that holds only the categorical columns

# Print the relative-frequency table (NaN included) of each categorical column in turn
for col in cols1:
    header = "---- %s ---" % col
    print("")
    print(header)
    print(df1[col].value_counts(dropna=False, normalize=True))


---- gender ---
Female    0.613538
Male      0.386207
Other     0.000255
Name: gender, dtype: float64

---- hypertension ---
0    0.891699
1    0.108301
Name: hypertension, dtype: float64

---- heart_disease ---
0    0.941507
1    0.058493
Name: heart_disease, dtype: float64

---- ever_married ---
Yes    0.794636
No     0.205364
Name: ever_married, dtype: float64

---- work_type ---
Private          0.655428
Self-employed    0.187995
Govt_job         0.156066
Never_worked     0.000511
Name: work_type, dtype: float64

---- residence_type ---
Urban    0.5106
Rural    0.4894
Name: residence_type, dtype: float64

---- smoking_status ---
never smoked       0.419668
formerly smoked    0.201277
Unknown            0.199234
smokes             0.179821
Name: smoking_status, dtype: float64


In [8]:
# Reference code: list every column name currently in df
list(df.columns)

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'ever_married',
 'work_type',
 'residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke']

In [6]:
# Reference code: raw counts of each ever_married value
df['ever_married'].value_counts()

Yes    3111
No      804
Name: ever_married, dtype: int64

In [9]:
# Reference code: one-hot encode ever_married into a separate frame, df100 (df itself is untouched).
cols2 = ['ever_married']
df100 = pd.get_dummies(df, columns=cols2) 
# get_dummies replaces the source column with ever_married_No / ever_married_Yes,
# so df100 has no 'ever_married' column and the check below is left disabled:
# df100['ever_married'].value_counts()

In [10]:
# Reference code: list the column names of the one-hot encoded frame
list(df100.columns)

['id',
 'gender',
 'age',
 'hypertension',
 'heart_disease',
 'work_type',
 'residence_type',
 'avg_glucose_level',
 'bmi',
 'smoking_status',
 'stroke',
 'ever_married_No',
 'ever_married_Yes']

In [11]:
from sklearn.preprocessing import OrdinalEncoder   # encoder that maps each category to a float code

# Fit on the single column (kept 2-D via double brackets) and store the codes as a flat column
ever_married_codes = OrdinalEncoder().fit_transform(df[['ever_married']].values)
df['ever_married_encoded'] = ever_married_codes.ravel()

# Before/after table: each original value next to the code it received
df.groupby(['ever_married', 'ever_married_encoded']).size()

ever_married  ever_married_encoded
No            0.0                      804
Yes           1.0                     3111
dtype: int64

In [12]:
# Ordinal-encode gender (2-D input via double brackets) and store the codes as a flat column
gender_codes = OrdinalEncoder().fit_transform(df[['gender']].values)
df['gender_encoded'] = gender_codes.ravel()

# Before/after table: each original value next to the code it received
df.groupby(['gender', 'gender_encoded']).size()

gender  gender_encoded
Female  0.0               2402
Male    1.0               1512
Other   2.0                  1
dtype: int64

In [13]:
# Ordinal-encode work_type (2-D input via double brackets) and store the codes as a flat column
work_type_codes = OrdinalEncoder().fit_transform(df[['work_type']].values)
df['work_type_encoded'] = work_type_codes.ravel()

# Before/after table: each original value next to the code it received
df.groupby(['work_type', 'work_type_encoded']).size()

work_type      work_type_encoded
Govt_job       0.0                   611
Never_worked   1.0                     2
Private        2.0                  2566
Self-employed  3.0                   736
dtype: int64

In [14]:
# Ordinal-encode residence_type (2-D input via double brackets) and store the codes as a flat column
residence_type_codes = OrdinalEncoder().fit_transform(df[['residence_type']].values)
df['residence_type_encoded'] = residence_type_codes.ravel()

# Before/after table: each original value next to the code it received
df.groupby(['residence_type', 'residence_type_encoded']).size()

residence_type  residence_type_encoded
Rural           0.0                       1916
Urban           1.0                       1999
dtype: int64

In [15]:
# Ordinal-encode smoking_status (2-D input via double brackets) and store the codes as a flat column
smoking_status_codes = OrdinalEncoder().fit_transform(df[['smoking_status']].values)
df['smoking_status_encoded'] = smoking_status_codes.ravel()

# Before/after table: each original value next to the code it received
df.groupby(['smoking_status', 'smoking_status_encoded']).size()

smoking_status   smoking_status_encoded
Unknown          0.0                        780
formerly smoked  1.0                        788
never smoked     2.0                       1643
smokes           3.0                        704
dtype: int64

In [16]:
# Column index after the five *_encoded columns have been appended to df.
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke', 'ever_married_encoded', 'gender_encoded',
       'work_type_encoded', 'residence_type_encoded',
       'smoking_status_encoded'],
      dtype='object')

In [12]:
# Remove the id column and the five string columns whose *_encoded counterparts were added.
# drop() returns a new frame that is bound back to df rather than mutating in place, and
# errors='ignore' skips labels that are already gone, so this cell can be re-run safely
# (the earlier inplace=True form raised an error on a second run because the columns
# had already been dropped).
df = df.drop(
    columns=['id', 'residence_type', 'ever_married', 'gender', 'work_type', 'smoking_status'],
    errors='ignore',
)

In [13]:
# The book uses the shorter command df.columns; list() is used here to get a plain Python list.
list(df.columns)

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'stroke',
 'ever_married_encoded',
 'gender_encoded',
 'work_type_encoded',
 'residence_type_encoded',
 'smoking_status_encoded']

In [14]:
# (rows, columns) of df after the original columns were dropped.
df.shape

(3915, 11)

In [15]:
# Find the variables (columns) that hold at least one null value;
# an empty Series means no column has missing data.
has_missing = df.isna().any()
has_missing[has_missing]

Series([], dtype: bool)

In [16]:
# Count the missing values in bmi
df['bmi'].isna().sum()

0

In [17]:
# Write the processed frame to CSV; index=False keeps the row index out of the file.
df.to_csv('healthcare-dataset-3.csv', index=False)

# 3.7.2 데이터 분할 및 대체

In [19]:
import pandas as pd
import numpy as np
# Load healthcare-dataset-3.csv into df (this rebinds df) and report (rows, columns).
df = pd.read_csv('healthcare-dataset-3.csv')
df.shape

(3915, 11)

In [19]:
# Reference code: dtypes as inferred by read_csv for the reloaded frame
df.dtypes

age                       float64
hypertension                int64
heart_disease               int64
avg_glucose_level         float64
bmi                       float64
stroke                      int64
ever_married_encoded      float64
gender_encoded            float64
work_type_encoded         float64
residence_type_encoded    float64
smoking_status_encoded    float64
dtype: object

In [20]:
# Reference code: column names of the reloaded frame
list(df.columns)

['age',
 'hypertension',
 'heart_disease',
 'avg_glucose_level',
 'bmi',
 'stroke',
 'ever_married_encoded',
 'gender_encoded',
 'work_type_encoded',
 'residence_type_encoded',
 'smoking_status_encoded']

In [20]:
# Variables whose dtype should be 'category'; the list is kept temporarily in cols2
cols2 = ['hypertension', 'heart_disease', 'stroke', 'ever_married_encoded', 'gender_encoded',
         'work_type_encoded', 'residence_type_encoded', 'smoking_status_encoded']

# Cast every column named in cols2 to category in one astype call (other columns are unchanged)
df = df.astype({name: 'category' for name in cols2})

In [4]:
# Verify the dtypes after the category cast.
df.dtypes

age                        float64
hypertension              category
heart_disease             category
avg_glucose_level          float64
bmi                        float64
stroke                    category
ever_married_encoded      category
gender_encoded            category
work_type_encoded         category
residence_type_encoded    category
smoking_status_encoded    category
dtype: object

In [21]:
target = df['stroke']                 # the target variable only
data = df.drop(columns=['stroke'])    # every variable except the target

In [6]:
# (rows, columns) of the feature frame.
data.shape

(3915, 10)

In [7]:
# Shape of the target Series (one entry per row).
target.shape

(3915,)

In [5]:
### Reference code. Baseline Decision Tree fitted as-is, before any under-sampling of the data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Split the data 50:50, stratified on the target so both halves keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    data, target, stratify=target, test_size=0.5, random_state=42)

# Decision Tree with default settings (Gini criterion, tree grown to maximal depth)
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)   # train the classifier on the training set

# Mean accuracy on each half; the format strings are printed exactly as before
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print("Accuracy(GINI) on training set:{:.5f}".format(train_accuracy))
print("Accuracy(GINI) on test set:{:.5f}".format(test_accuracy))

Accuracy(GINI) on training set:1.00000
Accuracy(GINI) on test set:0.91982


In [7]:
# Class counts of the stroke column (NaN included).
df['stroke'].value_counts(dropna=False)

0    3722
1     193
Name: stroke, dtype: int64

In [19]:
# Class proportions of the stroke column (NaN included).
df['stroke'].value_counts(dropna=False, normalize=True)

0    0.950702
1    0.049298
Name: stroke, dtype: float64

In [6]:
# Class counts of target (NaN included).
target.value_counts(dropna=False)

0    3722
1     193
Name: stroke, dtype: int64

In [7]:
# Class proportions of target (NaN included).
target.value_counts(dropna=False, normalize=True)

0    0.950702
1    0.049298
Name: stroke, dtype: float64

In [47]:
# Check the Python type of df.
type(df)

pandas.core.frame.DataFrame

In [48]:
# Check the Python type of data (the feature set).
type(data)

pandas.core.frame.DataFrame

In [50]:
# Check the Python type of target (a single column selected from df).
type(target)

pandas.core.series.Series

In [None]:
# 3.7.2 데이터 분할 및 대체는 Decision Tree Stroke.ipynb 파일에서 이어집니다.