# **뇌졸중 예측 ML 모델**



### Variables:
* id: 환자 고유번호
* gender: 성별
* age: 나이
* hypertension: 고혈압 유무; [0, 1]
* heart_disease: 심장병 유무; [0, 1]
* ever_married: 결혼 경험 유무; [No, Yes]
* work_type: 직업 유형; [아동, 민간 부문, 무직, 자영업, 공무직]
* Residence_type: 거주지 [도시, 시골]
* avg_glucose_level: 평균 혈당 수치
* bmi: 체질량지수 (몸무게(kg) / 키(m)^2)
* smoking_status: 흡연 상태 [비흡연자, 금연, 흡연자]
* stroke: 뇌졸중 경험 유무 [0, 1]

## **EDA 및 데이터 전처리**

In [None]:
import numpy as np
import pandas as pd

# Read the raw stroke dataset from the mounted Drive path, then work on a
# copy so the untouched original stays available in `data`.
CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/CodeStates/Sec.3/train_strokes.csv'
data = pd.read_csv(CSV_PATH)
df = data.copy()
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,30669,Male,3.0,0,0,No,children,Rural,95.12,18.0,,0
1,30468,Male,58.0,1,0,Yes,Private,Urban,87.96,39.2,never smoked,0
2,16523,Female,8.0,0,0,No,Private,Urban,110.89,17.6,,0
3,56543,Female,70.0,0,0,Yes,Private,Rural,69.04,35.9,formerly smoked,0
4,46136,Male,14.0,0,0,No,Never_worked,Rural,161.28,19.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
43395,56196,Female,10.0,0,0,No,children,Urban,58.64,20.4,never smoked,0
43396,5450,Female,56.0,0,0,Yes,Govt_job,Urban,213.61,55.4,formerly smoked,0
43397,28375,Female,82.0,1,0,Yes,Private,Urban,91.94,28.9,formerly smoked,0
43398,27973,Male,40.0,0,0,Yes,Private,Urban,99.16,33.2,never smoked,0


In [None]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


In [None]:
# Normalise every column name to lower case (e.g. Residence_type -> residence_type).
df.columns = [name.lower() for name in df.columns]

In [None]:
# Count rows whose patient id repeats an earlier row.
df['id'].duplicated().sum()

0

In [None]:
# Bin the average glucose level into three stages using the diabetes
# diagnostic ranges (Korean Society for Laboratory Medicine):
#   normal < 100 mg/dL, prediabetic 100-125 mg/dL, diabetic >= 126 mg/dL.
# right=False makes each interval left-closed, so the boundary values 100 and
# 126 fall into the higher stage. pd.cut's default (right-closed) would label
# exactly 126 mg/dL as 'prediabetic'.
df['avg_glucose_level'] = pd.cut(
    x=df['avg_glucose_level'],
    bins=[0, 100, 126, np.inf],
    labels=['normal', 'prediabetic', 'diabetic'],
    right=False,
)

# Bin BMI into four stages:
#   underweight < 18.5, normal 18.5-<25, overweight 25-<30, obese >= 30.
# Left-closed intervals again, so BMI 25 is 'overweight' and BMI 30 is 'obese'.
# Rows with a missing BMI stay NaN after binning.
df['bmi'] = pd.cut(
    x=df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['underweight', 'normal', 'overweight', 'obese'],
    right=False,
)


In [None]:
# Drop columns that will not be used as features:
#   - id: patient identifier
#   - ever_married: judged to have no direct relationship with stroke
df = df.drop(columns=['id', 'ever_married'])

In [None]:
# Number of missing values per column.
df.isna().sum()

gender                   0
age                      0
hypertension             0
heart_disease            0
work_type                0
residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [None]:
# About a third of smoking_status is missing, so the gaps are replaced with an
# explicit 'unknown' category rather than dropping those rows.
# The result is assigned back to the column: calling fillna(inplace=True) on
# an attribute-accessed column is a chained in-place update, which newer
# pandas versions warn about and which does nothing under Copy-on-Write.
df['smoking_status'] = df['smoking_status'].fillna('unknown')

In [None]:
# Print the distinct values found in each column.
for column_name, column_values in df.items():
    print(column_name, ':', column_values.unique())

gender : ['Male' 'Female']
age : [3.00e+00 5.80e+01 8.00e+00 7.00e+01 1.40e+01 4.70e+01 5.20e+01 7.50e+01
 3.20e+01 7.40e+01 7.90e+01 3.70e+01 4.00e+01 3.50e+01 2.00e+01 4.20e+01
 4.40e+01 6.50e+01 5.70e+01 4.90e+01 7.10e+01 5.90e+01 2.50e+01 6.70e+01
 3.80e+01 5.40e+01 2.70e+01 2.30e+01 5.50e+01 1.70e+01 1.30e+01 4.00e+00
 1.60e+01 2.20e+01 4.50e+01 6.60e+01 6.90e+01 5.30e+01 7.80e+01 4.30e+01
 5.10e+01 3.00e+01 4.60e+01 6.10e+01 4.80e+01 2.90e+01 1.10e+01 7.60e+01
 2.10e+01 1.80e+01 3.30e+01 8.20e+01 2.40e+01 3.40e+01 6.40e+01 6.80e+01
 6.00e+01 3.60e+01 3.90e+01 6.40e-01 7.20e+01 4.10e+01 5.60e+01 8.80e-01
 5.00e+00 8.00e+01 2.60e+01 3.10e+01 7.00e+00 1.20e+01 6.30e+01 6.20e+01
 2.00e+00 8.10e+01 9.00e+00 1.50e+01 2.80e+01 1.00e+01 7.70e+01 1.80e+00
 3.20e-01 1.08e+00 7.30e+01 5.00e+01 1.90e+01 6.00e+00 1.16e+00 1.00e+00
 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00 1.24e+00
 8.00e-01 4.00e-01 8.00e-02 1.48e+00 5.60e-01 4.80e-01 1.32e+00 1.60e-01]
hypertension : [0

In [None]:
# Check how the records are distributed across genders.
gender_counts = df['gender'].value_counts()
print(gender_counts)

Female    25665
Male      17724
Other        11
Name: gender, dtype: int64


In [None]:
# 'Other' covers too few rows to support any meaningful conclusion about that
# group, so those rows are removed.
is_not_other = df['gender'] != 'Other'
df = df[is_not_other]
df.shape

(43389, 10)

In [None]:
# Preview the first rows of the preprocessed frame.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,3.0,0,0,children,Rural,normal,underweight,unknown,0
1,Male,58.0,1,0,Private,Urban,normal,obese,never smoked,0
2,Female,8.0,0,0,Private,Urban,prediabetic,underweight,unknown,0
3,Female,70.0,0,0,Private,Rural,normal,obese,formerly smoked,0
4,Male,14.0,0,0,Never_worked,Rural,diabetic,normal,unknown,0


In [None]:
# Relative frequency of each target class.
# Finding: the target distribution is heavily imbalanced.
df['stroke'].value_counts(normalize=True)

0    0.981954
1    0.018046
Name: stroke, dtype: float64

## **데이터 시각화**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

NameError: ignored

## **ML 모델링**

In [None]:
!pip3 install imbalanced-learn
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 2.7 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the final test set.
# The target is heavily imbalanced (under 2% positives), so both splits are
# stratified on 'stroke' to keep the class ratio the same in every subset.
# An unstratified split lets the positive rate drift between subsets.
train, test = train_test_split(
    df, train_size=0.8, random_state=2, stratify=df['stroke']
)

# Carve a validation set out of the remaining training data for model selection.
train, val = train_test_split(
    train, train_size=0.8, random_state=2, stratify=train['stroke']
)
print('Train set:', train.shape, '\nValidation set:', val.shape, '\nTest set:', test.shape)

Train set: (27768, 10) 
Validation set: (6943, 10) 
Test set: (8678, 10)


In [None]:
# Check whether the target class ratio is similar across the three splits.
train_ratio = train['stroke'].value_counts(normalize=True)
val_ratio = val['stroke'].value_counts(normalize=True)
test_ratio = test['stroke'].value_counts(normalize=True)

print("Train target distribution:\n", train_ratio, '\n')
print("Val target distribution:\n", val_ratio, '\n')
print("Test target distribution:\n", test_ratio)

Train target distribution:
 0    0.981129
1    0.018871
Name: stroke, dtype: float64 

Val target distribution:
 0    0.985597
1    0.014403
Name: stroke, dtype: float64 

Test target distribution:
 0    0.981678
1    0.018322
Name: stroke, dtype: float64


In [None]:
# Separate the feature matrix from the target column for every split.
y = 'stroke'  # name of the target column

X_train, y_train = train.drop(columns=[y]), train[y]
X_val, y_val = val.drop(columns=[y]), val[y]
# The test labels must come from the test split. Taking them from `val` would
# pair X_test with labels of a different subset (and a different length).
X_test, y_test = test.drop(columns=[y]), test[y]

### Baseline Model

In [None]:
from sklearn.metrics import classification_report

# This is a binary classification problem (stroke / no stroke), so the
# baseline always predicts the most frequent class. The class is derived from
# the training target instead of being hard-coded.
mode = [y_train.mode()[0]]  # most frequent target value, wrapped in a list
y_pred_base = mode * len(y_test)

# The baseline never predicts the minority class, so that class's precision is
# undefined; zero_division=0 reports it as 0 without emitting warnings.
print(classification_report(y_test, y_pred_base, zero_division=0))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      6843
           1       0.00      0.00      0.00       100

    accuracy                           0.99      6943
   macro avg       0.49      0.50      0.50      6943
weighted avg       0.97      0.99      0.98      6943



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

### Random Forest Classifier

In [None]:
from category_encoders import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

### Gradient Boosting