# 빅데이터 분석기사 실기 준비 놀이터
- https://www.kaggle.com/datasets/agileteam/bigdatacertificationkr

## T2-2. Pima Indians Diabetes(Classification) / 당뇨병

### 2. 당뇨병 여부 판단
- 데이터셋 : diabetes.csv
- 이상치 처리 : Glucose, BloodPressure, SkinThickness, Insulin, BMI가 0인 값
- https://www.kaggle.com/code/agileteam/t2-2-pima-indians-diabetes/notebook

#### 0. 시험 환경 세팅

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("./data/diabetes.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Outcome')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 9), (154, 9), (614, 2), (154, 2))

#### 1. 라이브러리 및 데이터 호출

In [2]:
import numpy as np
import pandas as pd

X_train.shape, X_test.shape, y_train.shape

((614, 9), (154, 9), (614, 2))

#### 2. EDA

In [3]:
display(X_train.info(), '='*100, X_train.head(), '='*100)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 614 entries, 147 to 116
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        614 non-null    int64  
 1   Pregnancies               614 non-null    int64  
 2   Glucose                   614 non-null    int64  
 3   BloodPressure             614 non-null    int64  
 4   SkinThickness             614 non-null    int64  
 5   Insulin                   614 non-null    int64  
 6   BMI                       614 non-null    float64
 7   DiabetesPedigreeFunction  614 non-null    float64
 8   Age                       614 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 48.0 KB


None



Unnamed: 0,id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
147,147,2,106,64,35,119,30.5,1.4,34
344,344,8,95,72,0,0,36.8,0.485,57
390,390,1,100,66,29,196,32.0,0.444,42
150,150,1,136,74,50,204,37.4,0.399,24
132,132,3,170,64,37,225,34.5,0.356,30




In [4]:
display(X_test.info(), '='*100, X_test.head(), '='*100)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 154 entries, 258 to 174
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        154 non-null    int64  
 1   Pregnancies               154 non-null    int64  
 2   Glucose                   154 non-null    int64  
 3   BloodPressure             154 non-null    int64  
 4   SkinThickness             154 non-null    int64  
 5   Insulin                   154 non-null    int64  
 6   BMI                       154 non-null    float64
 7   DiabetesPedigreeFunction  154 non-null    float64
 8   Age                       154 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 12.0 KB


None



Unnamed: 0,id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
258,258,1,193,50,16,375,25.9,0.655,24
220,220,0,177,60,29,478,34.6,1.072,21
438,438,1,97,70,15,0,18.2,0.147,21
130,130,4,173,70,14,168,29.7,0.361,33
730,730,3,130,78,23,79,28.4,0.323,34




#### 3. Preprocessing
- 불필요 컬럼 제거 : id
- 이상치 처리 : Glucose, BloodPressure, SkinThickness, Insulin, BMI가 0인 값
    - Glucose : 행 제거
    - BloodPressure : 평균 대체(보류)
    - SkinThickness : 평균 대체(보류)
    - Insulin : 평균대체(보류)
    - BMI : 평균 대체(보류)
- 스케일링 : 정규화(MinMaxScaler)

In [5]:
x_train = X_train.copy()
x_test = X_test.copy()

In [6]:
check_col = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for c in check_col:
    print('train', c, len(x_train[x_train[c] == 0]))

train Glucose 5
train BloodPressure 31
train SkinThickness 175
train Insulin 287
train BMI 9


In [7]:
for c in check_col:
    print('test', c, len(x_test[x_test[c] == 0]))

test Glucose 0
test BloodPressure 4
test SkinThickness 52
test Insulin 87
test BMI 2


In [8]:
for c in check_col:
    print(c, 'x_train median :', x_train[c].median())
    print(c, 'x_train mean :', x_train[c].mean())
    print(c, 'x_train mode :', x_train[c].mode()[0])
    print('='*10)
    print(c, 'test median :', x_test[c].median())
    print(c, 'test mean :', x_test[c].mean())
    print(c, 'test mode :', x_test[c].mode()[0])
    print('='*40)

Glucose x_train median : 117.0
Glucose x_train mean : 120.70521172638436
Glucose x_train mode : 100
Glucose test median : 117.5
Glucose test mean : 121.64935064935065
Glucose test mode : 84
BloodPressure x_train median : 72.0
BloodPressure x_train mean : 68.53094462540717
BloodPressure x_train mode : 70
BloodPressure test median : 72.0
BloodPressure test mean : 71.3961038961039
BloodPressure test mode : 82
SkinThickness x_train median : 23.0
SkinThickness x_train mean : 20.982084690553744
SkinThickness x_train mode : 0
SkinThickness test median : 19.5
SkinThickness test mean : 18.75974025974026
SkinThickness test mode : 0
Insulin x_train median : 44.0
Insulin x_train mean : 82.22964169381108
Insulin x_train mode : 0
Insulin test median : 0.0
Insulin test mean : 70.1103896103896
Insulin test mode : 0
BMI x_train median : 32.25
BMI x_train mean : 31.9485342019544
BMI x_train mode : 31.6
BMI test median : 31.95
BMI test mean : 32.168181818181814
BMI test mode : 29.7


> 불필요 컬럼 id 제거

In [9]:
print(x_train.shape, x_test.shape)

x_train.drop('id', axis=1, inplace=True)
x_test.drop('id', axis=1, inplace=True)

print(x_train.shape, x_test.shape)

(614, 9) (154, 9)
(614, 8) (154, 8)


> Glucose값이 0인 행 제거

In [10]:
del_idx = x_train[x_train['Glucose'] == 0].index

print(x_train.shape, y_train.shape)

x_train.drop(index=del_idx, axis=0, inplace=True)
y_train.drop(index=del_idx, axis=0, inplace=True)

print(x_train.shape, y_train.shape)

(614, 8) (614, 2)
(609, 8) (609, 2)


> 컬럼 4개 값이 0인 데이터 평균 대체(보류)

In [11]:
repl_col = ['BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

train_col_mean = x_train[repl_col].mean()
test_col_mean = x_test[repl_col].mean()

# x_train[repl_col] = x_train[repl_col].replace(0, train_col_mean)
# x_test[repl_col] = x_test[repl_col].replace(0, test_col_mean)

for c in repl_col:
    print('train', c, len(x_train[x_train[c] == 0]))
    print('test', c, len(x_test[x_test[c] == 0]))

train BloodPressure 31
test BloodPressure 4
train SkinThickness 175
test SkinThickness 52
train Insulin 283
test Insulin 87
train BMI 9
test BMI 2


> 정규화

In [12]:
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# scaler = StandardScaler()
scaler = MinMaxScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

#### 4. Modeling

import sklearn  
help(sklearn)

import sklearn.model_selection  
help(sklearn.model_selection)

import sklearn.linear_model  
help(sklearn.linear_model)

import sklearn.ensemble  
help(sklearn.ensemble)

import xgboost  
help(xgboost)

import sklearn.metrics  
help(sklearn.metrics)

In [13]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier

from sklearn.metrics import accuracy_score, roc_auc_score

In [14]:
x_train.shape, y_train.shape, x_test.shape

((609, 8), (609, 2), (154, 8))

In [15]:
X_tr, X_val, y_tr, y_val = train_test_split(x_train, y_train['Outcome'], test_size=0.2, random_state=156)

lr = LogisticRegression(random_state=156)
rf = RandomForestClassifier(random_state=156)
gb = GradientBoostingClassifier(random_state=156)
xgb = XGBClassifier(random_state=156)
xgbrf = XGBRFClassifier(random_state=156)

models = [lr, rf, gb, xgb, xgbrf]

In [16]:
for model in models:
    model.fit(X_tr, y_tr)
    pred = model.predict(X_val)
    pred_proba = model.predict_proba(X_val)[:,1]
    
    name = str(model).split('(')[0]
    
    print(name)
    print('정확도 :', accuracy_score(y_val, pred))
    print('auc : ', roc_auc_score(y_val, pred))
    print('='*100)

LogisticRegression
정확도 : 0.7868852459016393
auc :  0.7045138888888889
RandomForestClassifier
정확도 : 0.7868852459016393
auc :  0.7548611111111111
GradientBoostingClassifier
정확도 : 0.7950819672131147
auc :  0.7604166666666666
XGBClassifier
정확도 : 0.8032786885245902
auc :  0.7760416666666667
XGBRFClassifier
정확도 : 0.8032786885245902
auc :  0.7559027777777778


In [17]:
gb = GradientBoostingClassifier(random_state=156)

gb.fit(x_train, y_train['Outcome'])
pred = gb.predict(x_test)
pred_proba = gb.predict_proba(x_test)[:,1]

In [18]:
# 가채점
print(accuracy_score(y_test['Outcome'], pred))
print(roc_auc_score(y_test['Outcome'], pred_proba))

0.7597402597402597
0.8415626695604992


In [19]:
outdata = pd.DataFrame({'idx' : X_test['id'], 'Outcome':pred})
outdata

Unnamed: 0,idx,Outcome
258,258,1
220,220,1
438,438,0
130,130,1
730,730,0
...,...,...
678,678,0
500,500,0
256,256,0
80,80,0


In [20]:
# out_data.to_csv('123456.csv', index=False)

#### 5. 다른 모델 평가

In [21]:
lr = LogisticRegression(random_state=156)
rf = RandomForestClassifier(random_state=156)
gb = GradientBoostingClassifier(random_state=156)
xgb = XGBClassifier(random_state=156)
xgbrf = XGBRFClassifier(random_state=156)

models = [lr, rf, gb, xgb, xgbrf]

In [22]:
for model in models:
    model.fit(x_train, y_train['Outcome'])
    pred = model.predict(x_test)
    pred_proba = model.predict_proba(x_test)[:,1]
    
    name = str(model).split('(')[0]
    
    print(name)
    print('정확도 :', accuracy_score(y_test['Outcome'], pred))
    print('auc : ', roc_auc_score(y_test['Outcome'], pred_proba))
    print('='*100)

LogisticRegression
정확도 : 0.7727272727272727
auc :  0.8484355217941761
RandomForestClassifier
정확도 : 0.7792207792207793
auc :  0.829625610417797
GradientBoostingClassifier
정확도 : 0.7597402597402597
auc :  0.8415626695604992
XGBClassifier
정확도 : 0.7337662337662337
auc :  0.7815156447820581
XGBRFClassifier
정확도 : 0.7597402597402597
auc :  0.8348706818592874
