--- 

# **코드 설명**

---

- 파 일 명 : 심장 질환 예측 경진대회 <br>
- 시작날짜 : 2021.11.25 <br>
- 수정날짜 : 2021.11.25 <br>
- 작 성 자 : 김혁진 <br>
- 작성주제 : Dacon / 심장 질환 예측 <br>

--- 

- **순서** <br>
  0. 기본설정 <br>
    0.0. Google Drive Mount <br>
    0.1. GPU 사용 <br>
    0.2. Import Modules <br>
    0.3. Initial Values <br>
    0.4. Set Off the Warning <br>
    0.5. User Defined Function <br>

  1. Data Load <br>

  2. EDA
  
---

- **참조**

  (1) 대회 홈페이지 : [Dacon](https://dacon.io/competitions/official/235848/overview/description) <br>
  (2) 하이퍼 파라미터 설명 : [Naver Blog](https://blog.naver.com/wideeyed/221333529176) <br>
  (3) Class문 설명 : [Github](https://zzsza.github.io/development/2020/07/05/python-class/) <br>
  (4) GPU 설정 : [Medium](https://medium.com/@am.sharma/lgbm-on-colab-with-gpu-c1c09e83f2af) <br>
  (5) RAM 모두사용으로 세션다운 : [Tistory](https://somjang.tistory.com/entry/Google-Colab-%EC%9E%90%EC%A3%BC%EB%81%8A%EA%B8%B0%EB%8A%94-%EB%9F%B0%ED%83%80%EC%9E%84-%EB%B0%A9%EC%A7%80%ED%95%98%EA%B8%B0)

---

- **고려사항** <br>
  (1) AutoEncoder로 파생변수 생성해보기 <br>
  (2) 하이퍼파라미터 탐색 : grid-search, bayesian-optimization, [optuna](https://dacon.io/competitions/official/235713/codeshare/2704?page=1&dtype=recent)

---

># **0. 기본설정**

## 0.1. Markdown : Tabular Left Align

In [None]:
%%html
<style>
    table {float:left}
</style>

## 0.2. Jupyter Notebook Style : Theme, Display

In [None]:
# # Install the theme package
# !pip install jupyterthemes

# # Upgrade Jupyter Notebook to the latest version
# !pip install --upgrade notebook

# # Upgrade jupyterthemes to the latest version
# !pip install --upgrade jupyterthemes

# 0.2.1. Change the theme (customizing)
# !jt -t onedork -fs 115 -nfs 125 -tfs 115 -dfs 115 -ofs 115 -cursc r -cellw 80% -lineh 115 -altmd  -kl -T -N

# 0.2.2. Use a wider notebook layout
# Source: https://taehooh.tistory.com/entry/Jupyter-Notebook-주피터노트북-화면-넓게-쓰는방법
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
# # 0.2.1 Google Drive Mount
# # (only needed when working from Google Drive)
# from google.colab import drive
# drive.mount('/content/drive', force_remount = True) # an authorization key has to be obtained in a new window and entered

# # 0.2.1. Memory error
# https://growingsaja.tistory.com/477

In [None]:
# # 0.2.2. Use the GPU (takes about 6 minutes)
# !git clone --recursive https://github.com/Microsoft/LightGBM
# !mkdir build
# %cd /content/LightGBM
# !cmake -DUSE_GPU=1 #avoid ..
# !make -j$(nproc)
# !sudo apt-get -y install python-pip
# !sudo -H pip install setuptools pandas numpy scipy scikit-learn -U
# %cd /content/LightGBM/python-package

### Install Modules

In [None]:
# !pip uninstall pandas -y
# !pip uninstall numpy  -y
# !pip uninstall lightgbm -y

# !pip install pandas==1.1.0
# !pip install numpy==1.21.2
# !pip install -U scikit-learn
# !pip install lightgbm --install-option=--gpu

# !pip install pandasql
# !pip install seaborn
# !pip install plotnine
# !pip install pandasql

# lightgbm raised an error here; installing it through conda resolved it
# conda install -c conda-forge lightgbm 

# Install bayesian-optimization
# !pip install bayesian-optimization

## 0.3. Import Modules

In [None]:
# jupyter notebook 전용
from tqdm.notebook import tqdm
# from tqdm import tqdm

# basic modules
import pandas as pd
import numpy as np
import math
import warnings
import random
import os
import time

# value_counts() 범용적인 버전
from collections import Counter as cnt


# plotting
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7, 8.27)})
sns.set_style('whitegrid')

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [11.7, 8.27] # [15, 10] # [11.7,8.27] - A4 size

from plotnine import *


# sqldf
from pandasql import sqldf
sql = lambda q: sqldf(q, globals())


# modeling
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler


# import lightgbm
# !pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# Hyperparameter Optimization
from bayes_opt import BayesianOptimization

# color when print
class color:
    """ANSI escape sequences used to style text passed to ``print``.

    Concatenate one or more attributes in front of a string and append
    ``color.END`` afterwards, e.g. ``color.BOLD + color.BLUE + text + color.END``.
    """

    # text attributes
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'

    # foreground colours
    DARKCYAN = '\x1b[36m'
    RED = '\x1b[91m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    BLUE = '\x1b[94m'
    PURPLE = '\x1b[95m'
    CYAN = '\x1b[96m'

    # terminator appended after the styled text
    END = '\x1b[0m'

## 0.4. Initial Values

In [None]:
# 0.4.1. Data Path
# NOTE(review): the two example paths below refer to a different project
# (Black Friday sales prediction) and look stale.
# jupyter.notebook : 'os.getcwd() + '/DAT/블랙 프라이데이 판매 예측/''
# google.colab     : '/content/drive/MyDrive/Python/4. 블랙프라이데이 판매예측/DAT/'
DATA_PATH = os.getcwd() + '/DAT/2. 심장 질환 예측 경진대회 (Dacon)/'
OUT_PATH  = os.getcwd() + '/OUT/2. 심장 질환 예측 경진대회 (Dacon)/'

# 0.4.2. set seed
SEED = 777

# 0.4.3. lightgbm parameter
# Bayesian-optimization budget: INIT_POINTS random evaluations first, then
# N_ITER optimization steps.
# NOTE(review): the original comment said "5 random rounds, then 45
# optimization rounds", which does not match the values below.
INIT_POINTS = 15
N_ITER = 15
N_CV = 4
EARLY_STOPPING_ROUNDS = 30

# 0.4.4. scaling
SCALE = False

## 0.5. Set Off the Warning

In [None]:
# Disable the pandas chained-assignment check (no warning is emitted for it)
pd.set_option('mode.chained_assignment', None)
# Ignore every Python warning from here on
warnings.filterwarnings(action='ignore')

## 0.6. User Defined Function

In [None]:
#-------------------------------------------------------------------------------------------------------#
# 0.6.1. Seed Fix
#-------------------------------------------------------------------------------------------------------#
def seed_everything(seed: int = 1):
    """Seed the random number generators used in this notebook.

    Sets ``PYTHONHASHSEED`` in the environment and seeds both NumPy's global
    generator and the standard-library ``random`` module with ``seed``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
    # torch seeding is kept for reference but is not active:
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)  # type: ignore
    # torch.backends.cudnn.deterministic = True  # type: ignore
    # torch.backends.cudnn.benchmark = True  # type: ignore
    
seed_everything(SEED)

#-------------------------------------------------------------------------------------------------------#
# 0.6.2. View all columns
#-------------------------------------------------------------------------------------------------------#
def View(data):
    """Print ``data`` with widened pandas display limits.

    The limits (500 rows, 500 columns, width 1000) apply only while printing.
    ``pd.option_context`` restores whatever display options were active
    before the call, instead of overwriting them with 0 afterwards.

    Parameters
    ----------
    data : object
        Anything printable; intended for a pandas DataFrame or Series.
    """
    with pd.option_context('display.max_rows', 500,
                           'display.max_columns', 500,
                           'display.width', 1000):
        print(data)

#-------------------------------------------------------------------------------------------------------#
# 0.6.3. minmax function
#-------------------------------------------------------------------------------------------------------#
def minmax(x):
    """Return the smallest and the largest element of ``x`` as ``(min, max)``."""
    lowest = min(x)
    highest = max(x)
    return lowest, highest

#-------------------------------------------------------------------------------------------------------#
# 0.6.4. Remove one key (e.g. the target) from a column dict
#-------------------------------------------------------------------------------------------------------#
def rmkey(dict, key):
    """Return a copy of ``dict`` without ``key``; the input is left untouched.

    Parameters
    ----------
    dict : mapping with a ``copy`` method
        Source dictionary.
    key : hashable
        Key to drop from the copy.

    Raises
    ------
    KeyError
        If ``key`` is not present.
    """
    remaining = dict.copy()
    remaining.pop(key)  # no default given, so a missing key raises KeyError
    return remaining

#-------------------------------------------------------------------------------------------------------#
# 0.6.5. Count the missing values of every column
#-------------------------------------------------------------------------------------------------------#
def missing_column_check(data, col_type):
    """Return the number of missing values in each column of ``data``.

    Missing values are counted on the data as given, before any type
    conversion, so NaN is not hidden by a cast to ``str``. ``data`` is not
    modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to inspect.
    col_type : dict
        ``{column name: type}``. For columns typed ``str`` (or the string
        ``'str'``), values that are empty after stripping whitespace are
        counted as missing in addition to null values.

    Returns
    -------
    list of int
        One count per column, in ``data.columns`` order.

    Raises
    ------
    KeyError
        If a column of ``data`` has no entry in ``col_type``.
    """
    num_na = []
    for col_nm in data.columns:
        series = data[col_nm]
        n_missing = int(series.isnull().sum())

        # string columns: blank values also count as missing
        if col_type[col_nm] in (str, 'str'):
            non_null = series.dropna().astype(str)
            n_missing += int((non_null.str.strip() == '').sum())

        num_na.append(n_missing)

    return num_na

#-------------------------------------------------------------------------------------------------------#
# 0.6.6. Add interaction terms
#-------------------------------------------------------------------------------------------------------#
def interaction_term(data,num_vari):
    """Add the pairwise products (squares included) of numeric columns.

    For every unordered pair ``(a, b)`` of the given variables, ``a`` paired
    with itself included, a column named ``'a*b'`` holding ``data[a] * data[b]``
    is added. Pairs follow the order of ``num_vari``, so the generated column
    names are the same on every run.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place and also returned.
    num_vari : iterable of str
        Numeric column names; ``'id'`` and repeated names are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the product columns appended.
    """
    from itertools import combinations_with_replacement

    # dict.fromkeys removes duplicates while keeping first-seen order
    num_var = [name for name in dict.fromkeys(num_vari) if name != 'id']

    for left, right in combinations_with_replacement(num_var, 2):
        data[f'{left}*{right}'] = data[left] * data[right]

    return data

#-------------------------------------------------------------------------------------------------------#
# 0.6.7. density plot : histogram + density plot
#-------------------------------------------------------------------------------------------------------#
def density_plot(data, vars, 
                 binwidths = None, hue = None,
                 binwidth_adj_ratio = None):
    """Draw a histogram with an overlaid density curve for each variable.

    One subplot is created per variable on a square grid and the figure is
    shown with ``plt.show()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Data to plot.
    vars : str or sequence of str
        Numeric column name(s) to plot.
    binwidths : sequence of float, optional
        Bin width per variable. When omitted, each width is
        ``(max - min) / n_bin`` with ``n_bin = ceil(1 + 3.32*log10(len(data)))``,
        computed from ``data`` itself.
    hue : str, optional
        Grouping column forwarded to seaborn.
    binwidth_adj_ratio : float, optional
        Factor applied to every bin width.
    """
    # only needed if the PercentFormatter line further down is re-enabled
    from matplotlib.ticker import PercentFormatter

    # 1) a single variable arrives as a str; wrap it so the code below can
    #    always iterate over a sequence of names
    if isinstance(vars, str):
        vars = [vars]
    
    # 2) square subplot grid large enough for all variables
    nrow = math.ceil(len(vars)**(1/2))
    ncol = nrow

    # 3) default bin widths when none are given
    # Source : http://www.aistudy.co.kr/paper/pdf/histogram_jeon.pdf
    if binwidths is None:
        binwidths = []
        for col in data[vars].columns:
            n_bin = math.ceil(1 + 3.32*math.log10(len(data)))
            # both ends of the range come from `data`, not from a global frame
            binwidths.append((data[col].max() - data[col].min()) / n_bin)
    
    # 4) optional rescaling of the bin widths
    if binwidth_adj_ratio is not None:
        binwidths = [binwidth * binwidth_adj_ratio for binwidth in binwidths]
    
    fig = plt.figure()
    
    # 5) one subplot per variable
    for idx, var in enumerate(vars):
        
        binwidth = binwidths[idx]
        
        # (1) histogram
        ax1 = fig.add_subplot(nrow, ncol, idx+1)
        g1 = sns.histplot(data = data, x = var, hue = hue,
                          kde = True, stat = 'probability', 
                          color = 'lightskyblue',
                          binwidth = binwidth, ax = ax1)
        ax2 = ax1.twinx()
        
        # (2) density plot on a twin axis
        g2 = sns.kdeplot(data = data, x = var, hue = hue,
                         color = 'red', lw = 2, ax = ax2)
        ax2.set_ylim(0, ax1.get_ylim()[1] / binwidth)                  # similar limits on the y-axis to align the plots
        #ax2.yaxis.set_major_formatter(PercentFormatter(1 / binwidth))  # show axis such that 1/binwidth corresponds to 100%
        ax2.grid(False)
        
        # (3) hide the y axis of the density plot
        g2.set(yticklabels=[]) 
        g2.set(ylabel=None)
        g2.tick_params(right=False)
        
        # keep the y label only on the first subplot of each grid row
        if idx % ncol != 0:
            g1.set(ylabel=None)
        
    # avoid overlapping subplots
    fig.tight_layout()
    plt.show()

# example : density_plot(train, vars=num_vari)

## 0.7. 버전 확인

In [None]:
import sys
print(sys.version)

># **1. Data**

## 1.1. 변수정보 (변수명 참조 : [Dacon](https://dacon.io/competitions/official/235848/data))

|변수명 | 변수정보 | 기준 | 변수상세 |
|:---:|:---|:---|:---|
| id | 데이터 고유 id | | |
| age | 나이 | | |
| sex | 성별  | 여자 = 0, 남자 = 1 | |
| cp | 가슴 통증 종류 | 무증상 = 0, 일반적이지 않은 협심증 = 1, 협심증이 아닌 통증 = 2, 일반적인 협심증 = 3 | |
| trestbps | 휴식 중 혈압(mmHg) | | resting blood pressure |
| chol | 혈중 콜레스테롤(mg/dl) | | serum cholestoral |
| fbs | 공복 중 혈당 | 120 mg/dl 이하일 시 = 0, 초과일 시 = 1 | fasting blood sugar |
| restecg | 휴식 중 심전도 결과 | 좌심실비대증이 의심되거나 확실한 경우 = 0, 정상 = 1, having ST-T wave abnormality = 2 | resting electrocardiographic |
| thalach | 최대 심박수 | | maximum heart rate achieved |
| exang | 활동으로 인한  협심증 여부 | 없음 = 0, 있음 = 1 | exercise induced angina |
| oldpeak | 휴식 대비 운동으로 인한 ST 하강 | | ST depression induced by exercise relative to rest |
| slope | 활동 ST 분절 피크의 기울기 | 하강 = 0, 평탄 = 1, 상승 = 2 | the slope of the peak exercise ST segment |
| ca | 형광 투시로 확인된 주요 혈관 수 | 0~3 개, <strong style="color:red">Null값은 4로 인코딩됨</strong> | number of major vessels colored by flouroscopy |
| thal | 지중해빈혈 여부 | 정상 = 1, 고정 결함 = 2, 가역 결함 = 3, <strong style="color:red">Null값은 0으로 인코딩됨</strong> | thalassemia |
| target | 심장 질환 진단 여부 | 혈관 지름 축소 50% 미만 = 0, 혈관 지름 축소 50% 이상 = 1 | |

## 1.2. Data Load

In [None]:
# Column name -> dtype used when reading the CSV files.
# Coded (categorical) variables are read as str.
COL_TYPE = {
    'id' : int,
    'age' : int,
    'sex' : str,        # 0,1
    'cp' : str,         # 0,1,2,3
    'trestbps' : int,
    'chol' : int,
    'fbs' : str,        # 0,1
    'restecg' : str,    # 0,1,2
    'thalach' : int,
    'exang' : str,      # 0,1
    'oldpeak' : float,
    'slope' : str,      # 0,1,2
    'ca' : str,         # 0-3, with Null encoded as 4
    'thal' : str,       # 1,2,3, with Null encoded as 0
    'target' : str,     # 0,1
}

# Load the train, test and sample-submission files.
# NOTE(review): the original comment claimed "550,068 rows, 12 columns", which
# looks copied from another project (COL_TYPE lists 15 columns) - confirm.
train = pd.read_csv(DATA_PATH + 'train.csv', dtype = COL_TYPE)
test  = pd.read_csv(DATA_PATH + 'test.csv', dtype = COL_TYPE)
sub   = pd.read_csv(DATA_PATH + 'sample_submission.csv', dtype = COL_TYPE)

train

## 1.3. Missing Check

In [None]:
# Sum the counts returned by missing_column_check instead of reading only its
# first element, so the printed figure covers everything the function reports.
print('> # of train Missing :', sum(missing_column_check(train, COL_TYPE)))
print('> # of test  Missing :', sum(missing_column_check(test , COL_TYPE)))

># **2. EDA**

## 2.1. Characteristic Variable

## 2.2. Numeric Variable

## 2.3. Numeric Variable * Characteristic Variable

># **3. Segment**

### segment를 구분하여 따로 모델 적합

># **4. Preprocessing**

In [None]:
def preprocessing(_df):
    """Placeholder preprocessing step: return an independent copy of ``_df``."""
    return _df.copy()

# Run the preprocessing step on copies of the raw train / test frames
train2 = preprocessing(train.copy())
test2  = preprocessing(test .copy())

# Register newly derived string columns in COL_TYPE
# (the list is currently empty, so nothing is added)
for str_var in []:
    COL_TYPE[str_var] = str

#### 교호작용항

In [None]:
# Switch: add interaction terms (pairwise products of numeric columns) or not
INTERACTION = False

if INTERACTION is True:
    # NOTE(review): `num_vari` is not defined anywhere in the visible part of
    # this notebook; this branch needs it before it can run - confirm.
    train3 = interaction_term(train2,num_vari)
    test3  = interaction_term(test2 ,num_vari)

    # Register every generated column (name contains '*' after its first
    # character) as int in COL_TYPE
    for int_var in train3.columns[[col.find('*')>0 for col in train3.columns]]:
        COL_TYPE[int_var] = int
else:
    # No interaction terms: carry the preprocessed frames forward as copies
    train3 = train2.copy()
    test3  = test2 .copy()

In [None]:
# (1) One-hot encoding
def onehot_encoding(data, col_types):
    """Replace every ``str``-typed column by integer 0/1 dummy columns.

    Columns are processed in ``data.columns`` order, so the output column
    order is the same on every run. Each encoded column is dropped and its
    dummies (named ``<column>_<level>``) are appended on the right. The
    ``'target'`` column is never encoded. ``data`` itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode.
    col_types : dict
        ``{column name: type}`` for every non-target column of ``data``;
        columns mapped to ``str`` are encoded.

    Returns
    -------
    pandas.DataFrame
        Encoded frame. Its index is reset whenever at least one column was
        encoded; with nothing to encode, ``data`` is returned as is.

    Raises
    ------
    KeyError
        If a non-target column has no entry in ``col_types``.
    """
    str_cols = [col for col in data.columns
                if col != 'target' and col_types[col] == str]

    encoded = data
    for col in str_cols:
        dummies = pd.get_dummies(encoded[col], prefix = col).reset_index(drop=True).astype(int)
        encoded = pd.concat(
            [encoded.drop([col], axis=1).reset_index(drop=True), dummies],
            axis=1)

    return encoded

# (2) Convert every str-typed column to int or category
def str_convert(data, col_types, convert = 'category'):
    """Cast every ``str``-typed column of ``data`` to ``convert``.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place and also returned. The ``'target'`` column is
        left untouched.
    col_types : dict
        ``{column name: type}`` for every non-target column of ``data``;
        columns mapped to ``str`` are converted.
    convert : type or str, default ``'category'``
        Target dtype passed to ``Series.astype``, e.g. ``int`` or
        ``'category'``. The previous default was the list
        ``[int, 'category']``, which is not a dtype and failed when used.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.

    Raises
    ------
    KeyError
        If a non-target column has no entry in ``col_types``.
    """
    for col in list(data.columns):
        if col == 'target':
            continue
        if col_types[col] == str:
            data[col] = data[col].astype(convert)

    return data


# Switch: one-hot encode the str columns (True) or cast them to the pandas
# 'category' dtype (False)
ONEHOT = False

if ONEHOT:
    train4 = onehot_encoding(data = train3, col_types = rmkey(COL_TYPE,'target'))
    test4  = onehot_encoding(data = test3 , col_types = rmkey(COL_TYPE,'target'))
else:
    train4 = str_convert(train3, col_types = rmkey(COL_TYPE,'target'), convert = 'category')
    test4  = str_convert(test3 , col_types = rmkey(COL_TYPE,'target'), convert = 'category')

### 다른 level 있는 변수들 search

In [None]:
def check_category(data,col_types, ret='dict'):
    """Count the levels of every ``str``-typed column of ``data``.

    The level count of a column is the number of entries returned by
    ``value_counts()``. Columns are visited in ``data.columns`` order, so the
    ``'list'`` result lines up with the column order. ``'target'`` is skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.
    col_types : dict
        ``{column name: type}`` for every non-target column of ``data``.
    ret : {'dict', 'list'}, default ``'dict'``
        ``'dict'`` returns ``{column: n_levels}``; ``'list'`` returns the
        counts only.

    Returns
    -------
    dict or list

    Raises
    ------
    ValueError
        If ``ret`` is neither ``'dict'`` nor ``'list'``.
    KeyError
        If a non-target column has no entry in ``col_types``.
    """
    if ret not in ('dict', 'list'):
        raise ValueError(f"ret must be 'dict' or 'list', got {ret!r}")

    len_cate = {
        col: len(data[col].value_counts().index)
        for col in data.columns
        if col != 'target' and col_types[col] == str
    }

    if ret == 'list':
        return list(len_cate.values())
    return len_cate

check_category(train4,COL_TYPE,ret='dict')

In [None]:
def check_category2(data1,data2,col_types):
    """Print, per ``str``-typed column, how many levels differ between two frames.

    For each column the observed non-null levels of both frames are sorted
    (as strings). When both frames have the same number of levels, the number
    of positions at which the sorted levels differ is printed; otherwise
    ``differ length`` is printed. Levels are compared as plain strings, so
    the check works for object and categorical columns alike.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames to compare; the columns of ``data1`` (except ``'target'``)
        are checked and must exist in ``data2``.
    col_types : dict
        ``{column name: type}``; also determines the label padding width.
    """
    def _levels(series):
        # observed, non-null levels as a sorted list of strings
        return sorted(str(value) for value in series.dropna().unique())

    # widest str-typed column name, used to align the printed labels
    max_char_len = max(
        (len(name) for name, kind in col_types.items() if kind == str),
        default=0)

    for col in data1.columns:
        if col == 'target' or col_types[col] != str:
            continue

        data1_cate = _levels(data1[col])
        data2_cate = _levels(data2[col])
        n_blank = max_char_len - len(col)

        if len(data1_cate) == len(data2_cate):
            n_diff = sum(left != right for left, right in zip(data1_cate, data2_cate))
            print(col, ' '*n_blank, ':', n_diff)
        else:
            print(col, ' '*n_blank, ': differ length')
            
print(color.BOLD + color.BLUE + '> 다른 카테고리의 개수' + color.END)
check_category2(train4,test4,COL_TYPE)

># **5. Modelling**

### LGBM setting with Bayesian optimization (bayes_opt)

In [None]:
from sklearn import metrics

# (lower, upper) search boundaries of the hyperparameters tuned by Bayesian optimization
bounds_LGB = {
    'num_leaves': (100, 800), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.9),
    'feature_fraction' : (0.3, 0.9),
    'min_child_weight': (0.01, 1.),   
    'reg_alpha': (0.01, 1.), 
    'reg_lambda': (0.01, 1),
    'max_depth':(6, 23),
}

def _lgb_params(raw, seed, metric):
    """Build the LGBMClassifier keyword arguments from one set of tuned values."""
    return {
        # LightGBM expects these three to be integers
        'num_leaves': int(raw['num_leaves']),
        'min_data_in_leaf': int(raw['min_data_in_leaf']),
        'max_depth': int(raw['max_depth']),
        'min_child_weight': raw['min_child_weight'],
        'bagging_fraction': raw['bagging_fraction'],
        'feature_fraction': raw['feature_fraction'],
        'reg_alpha': raw['reg_alpha'],
        'reg_lambda': raw['reg_lambda'],
        'learning_rate': 0.05,
        'objective': 'binary',
        'save_binary': True,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'boosting': 'gbdt',
        'verbose': -1,
        'boost_from_average': True,
        'metric': metric,
        'n_estimators': 1000,
        'n_jobs': -1,
    }


# LightGBM model whose hyperparameters are selected by Bayesian optimization
def build_lgb(x, y, val_x, val_y,
              init_points=INIT_POINTS, n_iter=N_ITER, cv=N_CV, 
              ret_param=True, verbose=-1, is_test=False, 
              SEED=SEED):
    """Tune a LightGBM classifier with Bayesian optimization and refit it.

    Each candidate is fitted on ``(x, y)`` with early stopping on
    ``(val_x, val_y)`` and scored by its F1 on the validation data. The best
    candidate is then refitted the same way and returned.

    Both the search fits and the final fit use ``EARLY_STOPPING_ROUNDS`` as
    the early-stopping patience. The earlier version hard-coded 30 in the
    search and passed two different patiences (``EARLY_STOPPING_ROUNDS`` and
    a callback with 10) to the final fit.

    Parameters
    ----------
    x, y : training features and labels.
    val_x, val_y : validation features and labels (early stopping and F1).
    init_points : int
        Number of initial random evaluations of the optimizer.
    n_iter : int
        Number of optimization steps after the initial points.
    cv, verbose, is_test :
        Accepted for backward compatibility; not used by this function.
    ret_param : bool
        When true, return ``(model, params)``; otherwise only ``model``.
    SEED : int
        Seed for the optimizer and for every LightGBM seed parameter.

    Returns
    -------
    model or (model, dict)
        The fitted ``LGBMClassifier`` and, optionally, its parameters.
    """

    def _fit(params):
        # Early stopping and silent logging are both configured via callbacks
        model = lgb.LGBMClassifier(**params)
        model.fit(x, y, eval_set=[(val_x, val_y)],
                  callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=0),
                             lgb.log_evaluation(period=0)])
        return model

    # (1) objective: validation F1 of the model built from one candidate
    def LGB_bayesian(
        num_leaves, 
        bagging_fraction,
        feature_fraction,
        min_child_weight, 
        min_data_in_leaf,
        max_depth,
        reg_alpha,
        reg_lambda,
         ):
        candidate = {
            'num_leaves': num_leaves,
            'bagging_fraction': bagging_fraction,
            'feature_fraction': feature_fraction,
            'min_child_weight': min_child_weight,
            'min_data_in_leaf': min_data_in_leaf,
            'max_depth': max_depth,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
        }
        # the search monitors AUC on the validation data for early stopping
        model = _fit(_lgb_params(candidate, SEED, metric='auc'))
        pred = model.predict(val_x)
        return metrics.f1_score(val_y, pred)
    
    # (2) run the optimizer: init_points random evaluations, then n_iter steps
    optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=SEED, verbose=-1)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)
    
    # (3) final parameters from the best candidate
    # NOTE(review): the final model monitors 'binary_logloss' while the
    # search monitored 'auc'; kept as in the original - confirm it is intended.
    params = _lgb_params(optimizer.max['params'], SEED, metric='binary_logloss')
    
    # final model
    model = _fit(params)
    
    if ret_param:
        return model, params
    else:
        return model

### segment and dataset setting

In [None]:
# ca==cp > exang > slope > sex
# > ca has too few rows for the values 2 and 3
# > cp has too few rows for the values 1 and 3
# NOTE(review): 'is_ca', 'is_cp' and 'slope2' are not created anywhere in the
# visible part of this notebook - confirm where they are derived.
seg_var = ['is_ca','is_cp','sex','exang','slope2']

# Smallest row count among the values of each segment variable
[train4[seg_var_x].value_counts().min() for seg_var_x in seg_var]

In [None]:
start_time = time.time()

from sklearn.model_selection import StratifiedKFold

train_df = train4.copy()
test_df  = test4 .copy()

# Columns that must never be used as features: identifiers, the label, the
# segment variables and the helper columns created in this cell.
drop_var = set(['id','target']
               + list(seg_var)
               + [name+'_pred'   for name in seg_var]
               + [name+'_tr_idx' for name in seg_var])

n_fold = 5

for seg_var_x in seg_var:

    pred_col = seg_var_x+'_pred'
    idx_col  = seg_var_x+'_tr_idx'

    train_df[pred_col] = np.nan
    test_df [pred_col] = np.nan

    # object column, filled with '1' (train) / '0' (validation) below
    train_df[idx_col] = None

    # fit one set of CV models per value of the segment variable
    for seg_var_value in train_df[seg_var_x].value_counts().index:

        # rows belonging to this segment
        tr_mask = train_df[seg_var_x] == seg_var_value
        te_mask = test_df [seg_var_x] == seg_var_value
        tr_seg_df = train_df[tr_mask]
        te_seg_df = test_df [te_mask]

        # One ordered feature list shared by train and test, so both frames
        # present their columns to the model in the same order.
        feature_cols = [col for col in tr_seg_df.columns if col not in drop_var]
        X_train = tr_seg_df[feature_cols]
        X_test  = te_seg_df[feature_cols]

        y_train = tr_seg_df['target'].astype(int).values

        # modelling
        sf = StratifiedKFold(n_fold, shuffle=True, random_state=SEED)

        y_tr = []
        y_te = []

        for c, (tr_idx, val_idx) in enumerate(sf.split(X_train, y_train), start=1):
            print(len(tr_idx), len(val_idx))
            print('#'*25, f'CV {c}')

            model, _ = build_lgb(X_train.iloc[tr_idx ], y_train[tr_idx ], 
                                 X_train.iloc[val_idx], y_train[val_idx],
                                 init_points=INIT_POINTS, n_iter=N_ITER, cv=N_CV, 
                                 ret_param=True, is_test=False, 
                                 SEED=SEED)

            y_tr.append(model.predict(X_train))
            # a segment value may be absent from the test data
            if len(X_test) > 0:
                y_te.append(model.predict(X_test))

        # Train / validation flag of the LAST fold of this segment value.
        # Fold indices are positions inside the segment subset, so they are
        # mapped back to row labels first (assumes train_df has a unique index).
        train_df.loc[tr_seg_df.index[tr_idx ], idx_col] = '1'
        train_df.loc[tr_seg_df.index[val_idx], idx_col] = '0'

        # majority vote of the fold models, written with .loc rather than
        # chained indexing
        train_df.loc[tr_mask, pred_col] = np.where(np.mean(y_tr, 0)>0.5, 1, 0)
        if y_te:
            test_df.loc[te_mask, pred_col] = np.where(np.mean(y_te, 0)>0.5, 1, 0)
        
end_time = time.time()

In [None]:
runtime = (end_time-start_time)/60
f'{runtime:.2f} Mins'

In [None]:
# F1 of each segment model on all rows, on the rows flagged as train ('1')
# and on the rows flagged as validation ('0'), plus the validation crosstab.
# f1_score is called as (y_true, y_pred), matching the scikit-learn signature.
for seg_var_x in seg_var:
    
    pred_col = f'{seg_var_x}_pred'
    in_train = train_df[f'{seg_var_x}_tr_idx']=='1'
    in_valid = train_df[f'{seg_var_x}_tr_idx']=='0'
    
    all_pred = train_df[pred_col]
    all_true = train_df.target.astype(int).values
    all_f1   = metrics.f1_score(all_true,all_pred)
    
    tr_pred = train_df[pred_col][in_train]
    tr_true = train_df.target[in_train].astype(int).values
    tr_f1   = metrics.f1_score(tr_true,tr_pred)
    
    va_pred = train_df[pred_col][in_valid]
    va_true = train_df.target[in_valid].astype(int).values
    va_f1   = metrics.f1_score(va_true,va_pred)
    
    print('-'*50)
    print(f'{seg_var_x} - all : {all_f1:.2f}, train : {tr_f1:.2f}, valid : {va_f1:.2f}')
#     print(pd.crosstab(all_pred,all_true))
#     print(pd.crosstab(tr_pred,tr_true))
    print(pd.crosstab(va_pred,va_true))

#### train_df 저장

In [None]:
# train_df.to_csv(OUT_PATH + 'train_df(2).csv', index=False)

In [None]:
train_df

#### segment 별 f1_score

In [None]:
from sklearn import metrics

# width of the longest segment name, used to align the printed table
max_char_len = max([len(seg_var_x) for seg_var_x in seg_var])


# Train / validation F1 per segment variable, printed as an aligned table.
# f1_score is called as (y_true, y_pred), matching the scikit-learn signature.
for seg_var_x in seg_var:

    var_pred = seg_var_x+'_pred'
    tr_idx   = train_df[seg_var_x+'_tr_idx']=='1'
    va_idx   = train_df[seg_var_x+'_tr_idx']=='0'
    
    # anything that is not exactly 1 (including NaN) counts as class 0
    tr_pred = (train_df[var_pred][tr_idx] == 1).astype(int).values
    tr_true = train_df.target[tr_idx].astype(int).values

    va_pred = (train_df[var_pred][va_idx] == 1).astype(int).values
    va_true = train_df.target[va_idx].astype(int).values

    tr_f1_score = metrics.f1_score(tr_true,tr_pred)
    va_f1_score = metrics.f1_score(va_true,va_pred)
    
    n_blank = ' '*(max_char_len-len(seg_var_x))
    
    if seg_var_x==seg_var[0]: print(' '*max_char_len, '  train   valid')
    print(f'{seg_var_x} {n_blank} {tr_f1_score: .3f}  {va_f1_score: .3f}')

#### 각 seg별 f1_score

In [None]:
# Overall F1 of every segment model, collected in f1_score_list.
# The per-segment value is stored in `seg_f1` so that the `f1_score` function
# imported from sklearn.metrics is not overwritten by a float.
f1_score_list = []
for seg_var_x in seg_var:
    # 1-D prediction column, passed as (y_true, y_pred)
    seg_f1 = metrics.f1_score(train_df.target.astype(int).values, train_df[seg_var_x + '_pred'])
    n_blank = ' '*(max_char_len-len(seg_var_x))
    
    if seg_var_x==seg_var[0]: print(' '*max_char_len, 'f1_score')
    print(f'{seg_var_x} {n_blank} {seg_f1: .3f}')
    
    f1_score_list.append(seg_f1)

#### 각 segment의 weight

In [None]:
# Weight of each segment model = its F1 divided by the sum of all F1 scores
total_f1 = sum(f1_score_list)
weight = [score/total_f1 for score in f1_score_list]

# aligned table of the weights as percentages
for seg_name, seg_weight in zip(seg_var, weight):
    if seg_name==seg_var[0]:
        print(' '*max_char_len, '  weight')
    n_blank = ' '*(max_char_len-len(seg_name))
    print(f'{seg_name} {n_blank} {seg_weight*100: .1f}%')

print('\n',weight)

#### weighted predicted value

In [None]:
# label vector shared by both scores; f1_score is called as (y_true, y_pred)
y_true_all = train_df.target.astype(int).values

# (1) mixed: unweighted mean of the segment predictions
tr_pred_mix = train_df[[seg + '_pred' for seg in seg_var]].sum(axis=1)/len(seg_var)
tr_f1_score_mix = metrics.f1_score(y_true_all, np.where(tr_pred_mix>0.5,1,0))

# (2) weighted mixed: segment predictions weighted by `weight`.
# The segments come from seg_var (not a hard-coded list), so they always line
# up with the weights, which were computed in seg_var order.
tr_pred_weighted_mix = np.array([train_df[seg+'_pred'].astype(int).values*seg_weight 
                                 for seg,seg_weight in zip(seg_var,weight)]).sum(axis=0)
tr_f1_score_weighted_mix = metrics.f1_score(y_true_all, np.where(tr_pred_weighted_mix>0.5,1,0))

print(f'f1_score of          mixed segment predicted value:{tr_f1_score_mix         : .3f}')
print(f'f1_score of weighted mixed segment predicted value:{tr_f1_score_weighted_mix: .3f}')

# a=pd.DataFrame({'x' : np.where(tr_pred_mix>0.5,1,0),
#                 'y' : np.where(tr_pred_weighted_mix>0.5,1,0)})
# pd.crosstab(a.x,a.y)

In [None]:
# Unweighted vote over the segment models: '1' when the mean of the segment
# predictions exceeds 0.5, otherwise '0'
pred_cols = [seg + '_pred' for seg in seg_var]
n_seg = len(seg_var)

tr_vote = train_df[pred_cols].sum(axis=1)/n_seg
te_vote = test_df [pred_cols].sum(axis=1)/n_seg

train_df['tr_pred_mix'] = np.where(tr_vote>0.5, '1', '0')
test_df ['te_pred_mix'] = np.where(te_vote>0.5, '1', '0')

In [None]:
test_df['te_pred_mix'].value_counts()

In [None]:
# Copy the blended test predictions into the submission frame as '1'/'0' strings.
# Values are assigned by position - assumes `sub` rows are in the same order
# as `test_df`; TODO confirm.
sub['target'] = ['1' if pred=='1' else '0' for pred in test_df['te_pred_mix']]
# distribution of the submitted labels
sub.target.value_counts()

In [None]:
# Score : 0.92537 (presumably the competition score of this submission - not verifiable from the notebook)
# Write the submission file without the index column
sub.to_csv(OUT_PATH + 'sample_submission.csv', index=False)