In [42]:
# !pip install pandas
# !pip install numpy
# !pip install imblearn
# !pip install pycaret

In [43]:
# !pip install pycaret
# !pip install scikit-learn==1.2.2 

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from collections import Counter
from imblearn.datasets import make_imbalance
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
# import warnings
# warnings.filterwarnings(action='ignore')
from sklearn.metrics import roc_auc_score,classification_report, recall_score, f1_score,precision_recall_curve, auc,precision_score
import multiprocessing
from pycaret.classification import *

# target = DIBEV1

In [6]:
# Load the preprocessed dataset from CSV; DIBEV1 is the target column (split on 0/1 below).
df = pd.read_csv('Data_preprocessing(dibev1수정).csv')

In [7]:
df.head()

Unnamed: 0,id,AGE,ALCSTAT,ARTH1,BMI,CHLEV,CIGAREV2,CPLROU,EPHEV,FSBALANC,...,INTIL2W,MRACBPI2,MRACRPI2,REGION,SMKSTAT2,TIRED_1,PREG,HEIGHT(cm),WEIGHT(kg),DIBEV1
0,100011,79.0,0.0,1.0,23.594147,1.0,0.0,1.0,0.0,0,...,0.0,1.0,1.0,3.0,0.0,1.0,1,157.48,58.513368,1
1,100061,37.0,0.0,0.0,32.775484,1.0,1.0,0.0,0.0,0,...,0.0,1.0,1.0,3.0,0.0,0.0,1,180.34,106.59412,0
2,100091,75.0,0.0,1.0,22.273536,1.0,1.0,1.0,0.0,1,...,0.0,7.0,11.0,3.0,0.0,0.0,1,167.64,62.595696,0
3,1000101,39.0,0.0,0.0,23.709925,0.0,0.0,1.0,0.0,0,...,0.0,1.0,1.0,1.0,0.0,1.0,1,180.34,77.11064,0
4,1000131,54.0,1.0,0.0,38.273579,1.0,0.0,1.0,0.0,0,...,0.0,2.0,2.0,3.0,0.0,1.0,1,165.1,104.32616,0


In [8]:
df.shape

(23524, 25)

In [9]:
# Separate the rows by target value into two independent copies:
# df_0 holds the DIBEV1 == 0 rows, df_1 holds the DIBEV1 == 1 rows.
df_0 = df.loc[df['DIBEV1'].eq(0)].copy()
df_1 = df.loc[df['DIBEV1'].eq(1)].copy()

In [10]:
def data_sampling(X, n_set, seed=123):
    '''Split ``X`` into ``n_set`` disjoint row subsets, sampling without replacement.

    The first ``n_set - 1`` subsets each receive ``round(len(X) / n_set)`` rows
    drawn at random from the rows not yet assigned; the last subset receives
    whatever rows remain, so its size may differ from the others.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to partition. Rows are selected by index label.
        NOTE(review): np.setdiff1d de-duplicates labels, so the index is
        presumably expected to be unique - confirm for new callers.
    n_set : int
        Number of subsets to produce (must be >= 1).
    seed : int or None, default 123
        Seed of the private random generator. The default reproduces the
        draws the function made when it called ``np.random.seed(123)``.

    Returns
    -------
    list of pandas.DataFrame
        ``n_set`` frames whose indexes are pairwise disjoint and together
        cover every row of ``X``.

    Raises
    ------
    ValueError
        If ``n_set`` is smaller than 1.
    '''
    if n_set < 1:
        raise ValueError(f'n_set must be a positive integer, got {n_set!r}')

    # Private legacy generator: yields the same stream as np.random.seed(seed)
    # followed by np.random.choice, but leaves the global NumPy RNG untouched.
    rng = np.random.RandomState(seed)

    set_size = round(X.shape[0] / n_set)  # nominal size of every subset but the last
    remaining = X.index  # labels that have not been assigned to a subset yet

    data_set = []
    # The last subset is built from the leftovers, so only n_set - 1 draws are needed.
    for _ in range(n_set - 1):
        # Rounding can make set_size * (n_set - 1) exceed len(X); never ask for
        # more labels than are left, otherwise choice(replace=False) raises.
        take = min(set_size, len(remaining))
        sampled_index = rng.choice(remaining, size=take, replace=False)
        remaining = np.setdiff1d(remaining, sampled_index)  # drop the drawn labels from the pool
        data_set.append(X.loc[sampled_index])

    data_set.append(X.loc[remaining])  # leftover rows form the final subset
    return data_set

In [11]:
# Partition the DIBEV1 == 0 rows into 9 disjoint subsets.
data_set = data_sampling(df_0,9)

In [12]:
# Compare the row counts of the first and the last subset
# (the last subset receives the leftover rows, so its size can differ).
data_set[0].shape , data_set[-1].shape

((2253, 25), (2257, 25))

In [13]:
# Append the full DIBEV1 == 1 frame (df_1) to each negative subset and publish every
# combined frame as a notebook-level variable named udf_1, udf_2, ...
# NOTE(review): the same df_1 rows end up in every udf_i, including the subset that is
# later held out for validation - confirm that this overlap is intended.
globals().update({
    f'udf_{k}': pd.concat([negatives, df_1], axis=0)
    for k, negatives in enumerate(data_set, start=1)
})

In [14]:
# Undersampled subsets 1-8 are used for training; the last one (udf_9) is kept for validation.
udf_set = [udf_1,udf_2,udf_3,udf_4,udf_5,udf_6,udf_7,udf_8]
# udf_9 <- used for validation

In [15]:
# Print the class balance of every subset (training subsets first, hold-out last).
# The target is read by column name instead of "last column": once one-hot columns are
# appended to the frames, the last column is no longer DIBEV1 and a positional lookup
# would silently report the wrong column when this cell is re-run.
for i, data in enumerate(udf_set + [udf_9], start=1):
    print(f'{i}번째 데이터셋 타겟 비율 : {Counter(data["DIBEV1"])}')

1번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
2번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
3번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
4번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
5번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
6번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
7번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
8번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2253})
9번째 데이터셋 타겟 비율 : Counter({1: 3243, 0: 2257})


## 2-1. 수치형 데이터 스케일링

In [16]:
# Stack the training subsets into one frame so preprocessing can be fitted on all of them.
# Rows keep the order of udf_set, which the later positional re-split relies on.
total_set = pd.concat(udf_set,axis=0)

In [17]:
# Columns that will be standardised.
num_col = ['AGE','BMI','HEIGHT(cm)','WEIGHT(kg)']

In [18]:
# Standardise each numeric column: the scaler is fitted on the stacked training frame
# and the same fitted statistics are then applied to the validation frame (udf_9).
# The individual training subsets are not transformed here; they are re-created from
# total_set once preprocessing is finished.
for col in num_col:
    ss = StandardScaler().fit(total_set[[col]])
    total_set[col] = ss.transform(total_set[[col]])
    udf_9[col] = ss.transform(udf_9[[col]])

## 2-2. unique 개수가 3개 이상인 컬럼들 onehotencoding

In [19]:
# Every column of the raw frame except the numeric ones (np.setdiff1d returns them sorted).
cols = np.setdiff1d(df.columns,num_col)

# Keep only the columns that take three or more distinct values in the training frame;
# binary 0/1 columns are left as they are and are not one-hot encoded.
distinct_counts = total_set[cols].nunique()
nom_col = distinct_counts[distinct_counts >= 3].index.tolist()

In [20]:
nom_col

['HISPAN_I', 'MRACBPI2', 'MRACRPI2', 'REGION', 'id']

In [21]:
# Exclude the id column from the nominal columns to be one-hot encoded.
# A filtered copy is used instead of list.remove so the cell can be re-run safely:
# remove() raises ValueError on the second run because 'id' is already gone.
nom_col = [col for col in nom_col if col != 'id']
nom_col

['HISPAN_I', 'MRACBPI2', 'MRACRPI2', 'REGION']

In [22]:
# One-hot encode every nominal column, separately for the training frame and the
# validation frame. Each list holds one dummy frame per column; a NaN indicator column
# is included (dummy_na=True) and the values are stored as float.
train_dummies = [
    pd.get_dummies(total_set[col], prefix=col, dummy_na=True, dtype='float')
    for col in nom_col
]
test_dummies = [
    pd.get_dummies(udf_9[col], prefix=col, dummy_na=True, dtype='float')
    for col in nom_col
]

In [23]:
train_dummies = pd.concat(train_dummies,axis=1) # merge the per-column training dummy frames side by side
test_dummies = pd.concat(test_dummies,axis=1) # merge the per-column validation dummy frames side by side

In [24]:
train_dummies.shape, test_dummies.shape

((43968, 35), (5500, 35))

In [25]:
# Make the validation dummy matrix follow the training dummy matrix exactly: same
# columns, same order. Categories that occur only in the validation data are dropped,
# and categories missing from it are added as all-zero columns.
# Comparing only the number of columns is not sufficient: two frames with equal counts
# can still contain different categories, and subsetting the wider frame by the
# narrower frame's columns raises KeyError when the narrower one has a column the
# wider one lacks. Aligning on the training columns also keeps every training feature.
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0.0)

In [26]:
train_dummies.shape, test_dummies.shape

((43968, 35), (5500, 35))

In [27]:
# Swap the raw nominal columns for their one-hot encoded versions in both the training
# frame and the validation frame (remaining original columns first, dummy columns last).
total_set = pd.concat([total_set.drop(columns=nom_col), train_dummies], axis=1)
udf_9 = pd.concat([udf_9.drop(columns=nom_col), test_dummies], axis=1)

In [28]:
# Shape of one (still unprocessed) training subset next to the processed, stacked training frame.
udf_1.shape, total_set.shape

((5496, 25), (43968, 56))

In [29]:
# Cut the preprocessed training frame back into the original subsets.
# total_set was built by stacking udf_set in order, so the slice boundaries are the
# running totals of the subset lengths - no hard-coded row count is needed.
subset_bounds = np.cumsum([0] + [len(part) for part in udf_set])
# Guard against stale notebook state: the subsets must add up to the stacked frame.
assert subset_bounds[-1] == len(total_set), 'udf_set and total_set are out of sync'
for i, (start, stop) in enumerate(zip(subset_bounds[:-1], subset_bounds[1:]), start=1):
    globals()[f'udf_{i}'] = total_set.iloc[start:stop]

In [30]:
# Rebuild the list of training subsets so it points at the preprocessed frames.
udf_set = [udf_1,udf_2,udf_3,udf_4,udf_5,udf_6,udf_7,udf_8]

# pycaret 모델링

In [31]:
# Train one tuned soft-voting ensemble per training subset.
best_models = []
for data in udf_set:
    # Initialise a PyCaret experiment on this subset: features without id/target,
    # target passed separately, PyCaret's own preprocessing switched off.
    clf = setup(data=data.drop(['id','DIBEV1'],axis=1),target=data['DIBEV1'],preprocess=False,verbose=False,n_jobs=-1,session_id=123)
    # Compare the candidate models with 10-fold CV and keep the five best by F1 (n_select=5).
    best_5 = compare_models(fold=10, sort='F1',verbose=False,n_select=5)
    # Soft voting combines predicted probabilities, so only estimators that expose
    # predict_proba can take part. A new list is built instead of calling
    # best_5.remove(...) inside a loop over best_5: removing during iteration shifts
    # the remaining items and makes the loop skip the next candidate. Checking the
    # attribute also avoids a full trial blend per model and a bare `except:`.
    best_5 = [model for model in best_5 if hasattr(model, 'predict_proba')]
    if not best_5:
        raise RuntimeError('None of the selected models supports predict_proba; cannot build a soft-voting ensemble.')

    best = blend_models(estimator_list=best_5, method='soft',optimize='F1',verbose=False) # soft-voting ensemble of the remaining models
    tuned_best = tune_model(best,optimize='F1',verbose=False) # tune the ensemble for F1
    best_models.append(tuned_best) # collect the tuned ensemble of this subset

In [32]:
# Blend the per-subset tuned ensembles into one final soft-voting ensemble.
# NOTE(review): this call runs inside whichever setup() was executed last - presumably
# the final loop iteration (udf_8) - so the fold scores would come from that subset only; confirm.
ensemble_model = blend_models(estimator_list=best_models, method='soft',optimize='F1') # final ensemble of the ensembles built on each subset

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7584,0.8105,0.8326,0.7746,0.8025,0.4926,0.4947
1,0.7143,0.7864,0.7841,0.7448,0.7639,0.4027,0.4036
2,0.7532,0.8029,0.837,0.7661,0.8,0.4796,0.4828
3,0.7818,0.8322,0.8326,0.8043,0.8182,0.5457,0.5462
4,0.7481,0.8108,0.859,0.75,0.8008,0.4623,0.4702
5,0.7714,0.8434,0.8326,0.7908,0.8112,0.5222,0.5233
6,0.7455,0.7972,0.837,0.757,0.795,0.4616,0.4657
7,0.7656,0.8031,0.8546,0.7729,0.8117,0.5034,0.5079
8,0.7526,0.8346,0.8326,0.7683,0.7992,0.4785,0.4811
9,0.7474,0.8461,0.8194,0.7686,0.7932,0.4696,0.4712


In [33]:
ensemble_model

In [34]:
# Refit the blended ensemble one last time to obtain the single final model.
# NOTE(review): the original comment mentions cross-validation; finalize_model is
# documented as training on the entire dataset of the active setup - confirm the intent.
final_model = finalize_model(ensemble_model)

In [35]:
final_model

In [36]:
# Final prediction on the validation subset (id and target columns removed from the input).
pred = predict_model(final_model,data=udf_9.drop(['id','DIBEV1'],axis=1),verbose=True)

In [37]:
pred

Unnamed: 0,AGE,ALCSTAT,ARTH1,BMI,CHLEV,CIGAREV2,CPLROU,EPHEV,FSBALANC,GENDER,...,MRACRPI2_16.0,MRACRPI2_17.0,MRACRPI2_nan,REGION_1.0,REGION_2.0,REGION_3.0,REGION_4.0,REGION_nan,prediction_label,prediction_score
9,0.555916,0.0,1.0,-1.930710,1.0,1.0,1.0,0.0,0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.6782
10,0.442308,0.0,0.0,-1.167195,0.0,1.0,1.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.6610
12,-0.977795,0.0,0.0,-1.119075,0.0,1.0,0.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.8146
13,-0.750579,0.0,1.0,-0.944147,0.0,0.0,1.0,0.0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0.5715
17,-0.523362,0.0,0.0,-1.340909,0.0,0.0,1.0,0.0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.5940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23506,0.442308,0.0,0.0,0.460561,1.0,0.0,1.0,0.0,0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.8465
23507,1.010350,1.0,0.0,-0.204676,1.0,0.0,1.0,0.0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,0.8046
23509,-0.012125,0.0,0.0,-1.309706,1.0,0.0,1.0,0.0,0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,0.8155
23513,1.010350,0.0,1.0,3.090395,1.0,0.0,1.0,0.0,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.8803


In [38]:
# Compare the actual target values with the predicted labels.
print(classification_report(udf_9['DIBEV1'],pred['prediction_label']))

              precision    recall  f1-score   support

           0       0.77      0.64      0.70      2257
           1       0.78      0.87      0.82      3243

    accuracy                           0.78      5500
   macro avg       0.78      0.76      0.76      5500
weighted avg       0.78      0.78      0.77      5500



In [39]:
precision_score(udf_9['DIBEV1'],pred['prediction_label'])

0.7772263578715192

In [40]:
recall_score(udf_9['DIBEV1'],pred['prediction_label'])

0.8692568609312366

In [41]:
# Evaluate precision / recall / F1 on the validation subset at several decision thresholds.
# The inputs do not change between thresholds, so they are prepared once up front.
eval_features = udf_9.drop(['id','DIBEV1'],axis=1)
eval_target = udf_9['DIBEV1']
for threshold in [0.57,0.575,0.58]:
    print(f'임계값 : {threshold}')
    # Stored under its own name so the default-threshold predictions in `pred` are not
    # overwritten; otherwise re-running the metric cells above would silently report
    # the results of the last threshold tried here.
    pred_at_threshold = predict_model(final_model,data=eval_features,verbose=False,probability_threshold=threshold)
    predicted_label = pred_at_threshold['prediction_label']
    precision = precision_score(eval_target,predicted_label)
    recall = recall_score(eval_target,predicted_label)
    f1 = f1_score(eval_target,predicted_label)
    print(f'''
임계값 : {threshold}
정밀도 : {precision}
재현율 : {recall}
f1_score : {f1}
''')
    print('--'*10)

임계값 : 0.57

임계값 : 0.57
정밀도 : 0.7976226760134105
재현율 : 0.8069688559975331
f1_score : 0.80226854690374

--------------------
임계값 : 0.575

임계값 : 0.575
정밀도 : 0.7996927803379417
재현율 : 0.8026518655565834
f1_score : 0.8011695906432749

--------------------
임계값 : 0.58

임계값 : 0.58
정밀도 : 0.8016759776536313
재현율 : 0.7964847363552267
f1_score : 0.7990719257540604

--------------------
