In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import textwrap
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from lightgbm import LGBMClassifier
from lightgbm import plot_importance as lgbm_plot_importance
from xgboost import XGBClassifier
from xgboost import plot_importance as xgb_plot_importance

# feature engineering 및 모델별 성능 확인

### 평가 지표 함수

In [2]:
def eval_metrics(y_test, title='Confusion Matrix', pred=None, pred_proba=None):
    """Plot an annotated binary confusion matrix and return the usual scores.

    Parameters
    ----------
    y_test : array-like of shape (n_samples,)
        Ground-truth binary labels. The TN/FP/FN/TP annotations and the
        precision/recall/F1 scores treat label 1 as the positive class.
    title : str, default 'Confusion Matrix'
        Title placed on the heatmap.
    pred : array-like of shape (n_samples,), optional
        Hard class predictions. When omitted, they are derived from
        ``pred_proba`` with a 0.5 threshold.
    pred_proba : array-like of shape (n_samples,), optional
        Positive-class scores used for ROC AUC. When omitted, ROC AUC is
        reported as NaN instead of failing.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'roc_auc'`` mapped to the corresponding float scores.

    Raises
    ------
    ValueError
        If neither ``pred`` nor ``pred_proba`` is supplied.
    """
    if pred is None:
        if pred_proba is None:
            raise ValueError("eval_metrics needs `pred`, `pred_proba`, or both")
        # No hard labels supplied: threshold the positive-class scores at 0.5.
        pred = (np.asarray(pred_proba) > 0.5).astype(int)

    cm = confusion_matrix(y_test, pred)
    if cm.shape != (2, 2):
        # Only one class showed up in y_test/pred, so sklearn returned a
        # smaller matrix; force the 0/1 layout so the 2x2 annotations line up.
        cm = confusion_matrix(y_test, pred, labels=[0, 1])

    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    # ROC AUC needs continuous scores; without them there is nothing to rank.
    roc_auc = roc_auc_score(y_test, pred_proba) if pred_proba is not None else float('nan')

    # Build one "name / count / share of all samples" annotation per cell,
    # in the row-major order sklearn uses: TN, FP, FN, TP.
    label_names = ['TN', 'FP', 'FN', 'TP']
    flat_counts = cm.flatten()
    counts = ["{0:0.0f}".format(value) for value in flat_counts]
    percentages = ["{0:.2%}".format(value) for value in flat_counts / np.sum(cm)]
    labels = [f"{ln}\n{c}\n{p}" for ln, c, p in zip(label_names, counts, percentages)]
    labels = np.asarray(labels).reshape(2, 2)

    # fmt='' because the annotations are pre-formatted strings, not numbers.
    ax = sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
    ax.set(title=title)
    plt.show()

    print('accuracy: {0:.4f}, precision: {1:.4f}, recall: {2:.4f}, f1: {3:.4f}, roc_auc: {4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
    }

# 0. 별다른 feature engineering 없이 진행

In [3]:
# Read the dataset-A training CSV and preview its first five rows.
train_a = pd.read_csv('../input/sleep-research/osa_train_a.csv')
train_a.head(n=5)

Unnamed: 0,ID,sex,age,height,weight,BMI,Time in Bed(min),Total sleep time(min),Sleep latency(min),N2 sleep latency(min),...,ODI,90%ODI,EtCO2 (>50mmHg),Total LMI =PLMI(/h),Total LM Arousal#,Total LM AI(/h),PLM Arousal#,PLM AI(/h),MAI(/h),OSA
0,2170,0,38,173.0,88.0,29.4,448.9,434.5,5.5,4.0,...,62.6,47.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,3539,1,54,156.0,49.0,20.1,476.0,426.0,3.0,2.5,...,2.4,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,108,0,62,164.0,67.0,24.9,435.5,351.0,3.5,1.0,...,7.4,0.3,0.0,16.8,9.0,1.5,0.0,0.0,1.5,1
3,3149,0,33,175.0,106.0,34.6,404.5,374.5,7.0,3.5,...,19.2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,4893,0,56,164.0,72.0,26.8,359.5,236.5,12.5,11.0,...,70.5,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [4]:
# Read the dataset-A test CSV and preview its first five rows.
test_a = pd.read_csv('../input/sleep-research/osa_test_a.csv')
test_a.head(n=5)

Unnamed: 0,ID,sex,age,height,weight,BMI,Time in Bed(min),Total sleep time(min),Sleep latency(min),N2 sleep latency(min),...,ODI,90%ODI,EtCO2 (>50mmHg),Total LMI =PLMI(/h),Total LM Arousal#,Total LM AI(/h),PLM Arousal#,PLM AI(/h),MAI(/h),OSA
0,2021,1,47,164.0,55.0,20.4,362.0,343.0,5.5,2.0,...,7.5,1.0,0.0,11.4,5.0,0.9,5.0,0.9,1.8,1
1,1506,0,64,175.0,75.0,24.5,395.0,342.0,5.0,4.5,...,9.5,0.4,0.0,0.4,2.0,0.4,0.0,0.0,0.4,0
2,3160,0,34,170.0,79.0,27.3,415.0,363.5,7.5,20.5,...,5.3,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1092,0,66,167.0,68.0,24.4,318.0,104.0,21.0,6.5,...,67.5,31.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,5043,0,60,169.0,67.0,23.5,383.0,310.0,6.5,4.0,...,31.5,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [5]:
# Split dataset-A training rows into target and predictors: 'OSA' is the
# label, and both 'ID' and 'OSA' are removed from the feature matrix.
y_train_a = train_a.loc[:, 'OSA']
X_train_a = train_a.drop(columns=['ID', 'OSA'])

In [6]:
# Split dataset-A test rows into target and predictors: 'OSA' is the
# label, and both 'ID' and 'OSA' are removed from the feature matrix.
y_test_a = test_a.loc[:, 'OSA']
X_test_a = test_a.drop(columns=['ID', 'OSA'])

In [7]:
# Keep the column names: the scaler returns plain arrays, so the names are
# needed later to label feature importances.
features_a = X_train_a.columns.tolist()
train = X_train_a.copy()
test = X_test_a.copy()

# Learn the per-feature min/max on the training rows only, then apply the
# same [0, 1] mapping to both splits.
scaler = MinMaxScaler(feature_range=(0,1)).fit(train)

# NOTE: this rebinds train_a/test_a from the raw DataFrames to scaled arrays,
# so the earlier cells that call .drop on them need the CSVs reloaded before
# they can be re-run.
train_a, test_a = scaler.transform(train), scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (4121, 77)
Testing data shape:  (1031, 77)


In [8]:
# Read the dataset-B training CSV and preview its first five rows.
train_b = pd.read_csv('../input/sleep-research/osa_train_b.csv')
train_b.head(n=5)

Unnamed: 0,ID,sex,age,height,weight,BMI,Time in Bed(min),Total sleep time(min),Sleep latency(min),REM sleep latency(min),...,Lateral A+H+RERA_ Index(h),REM A+H+RERA Index(/h),NREM A+H+RERA Index(/h),Apnea_Max_ Length(sec),Hypopnea_Max_ Length(sec),Resp. Arousal /(A+H)(%),ODI,90%ODI,EtCO2 (>50mmHg),OSA
0,4238,0,39,170.0,65.0,22.5,480.5,435.0,2.5,63.0,...,4.2,11.6,8.2,0.0,69.2,87.9,2.3,0.0,0.0,1
1,1630,0,41,172.0,77.0,26.0,338.5,187.0,3.0,86.5,...,0.0,41.7,19.8,83.4,101.6,38.2,23.7,5.5,0.0,1
2,2711,0,64,166.0,69.0,25.0,450.0,376.0,8.0,62.0,...,20.7,54.1,42.5,57.4,112.9,79.9,31.9,8.3,0.0,1
3,662,1,58,157.0,56.0,22.7,394.0,371.0,1.0,188.5,...,0.0,51.9,32.1,47.0,62.7,19.2,33.0,8.1,0.0,1
4,4666,1,29,154.0,50.0,21.1,278.0,252.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [9]:
# Read the dataset-B test CSV and preview its first five rows.
test_b = pd.read_csv('../input/sleep-research/osa_test_b.csv')
test_b.head(n=5)

Unnamed: 0,ID,sex,age,height,weight,BMI,Time in Bed(min),Total sleep time(min),Sleep latency(min),REM sleep latency(min),...,Lateral A+H+RERA_ Index(h),REM A+H+RERA Index(/h),NREM A+H+RERA Index(/h),Apnea_Max_ Length(sec),Hypopnea_Max_ Length(sec),Resp. Arousal /(A+H)(%),ODI,90%ODI,EtCO2 (>50mmHg),OSA
0,3402,1,35,164.0,56.0,20.8,486.5,452.0,11.0,41.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,4990,0,41,175.0,92.0,30.0,510.0,492.5,5.0,91.0,...,6.3,16.2,6.6,0.0,79.2,63.6,7.1,0.0,0.0,1
2,2171,0,54,169.0,82.0,28.7,416.0,312.5,7.0,109.5,...,45.3,50.7,73.2,49.7,65.6,83.8,58.0,30.3,0.0,1
3,508,0,45,180.0,76.0,23.5,375.0,342.5,4.0,132.5,...,0.0,34.0,5.1,24.5,67.6,62.3,4.4,0.7,0.0,1
4,3448,0,47,174.0,110.0,36.3,222.0,171.0,4.5,62.0,...,108.5,78.7,119.1,52.2,48.5,69.6,112.6,101.1,0.0,1


In [10]:
# Split dataset-B training rows into target and predictors: 'OSA' is the
# label, and both 'ID' and 'OSA' are removed from the feature matrix.
y_train_b = train_b.loc[:, 'OSA']
X_train_b = train_b.drop(columns=['ID', 'OSA'])

In [11]:
# Split dataset-B test rows into target and predictors: 'OSA' is the
# label, and both 'ID' and 'OSA' are removed from the feature matrix.
y_test_b = test_b.loc[:, 'OSA']
X_test_b = test_b.drop(columns=['ID', 'OSA'])

In [12]:
# Keep the column names: the scaler returns plain arrays, so the names are
# needed later to label feature importances.
features_b = X_train_b.columns.tolist()
train = X_train_b.copy()
test = X_test_b.copy()

# Learn the per-feature min/max on the training rows only, then apply the
# same [0, 1] mapping to both splits.
scaler = MinMaxScaler(feature_range=(0,1)).fit(train)

# NOTE: this rebinds train_b/test_b from the raw DataFrames to scaled arrays,
# so the earlier cells that call .drop on them need the CSVs reloaded before
# they can be re-run.
train_b, test_b = scaler.transform(train), scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (4347, 61)
Testing data shape:  (1087, 61)


## 0-4. LightGBM

### A. with osa_data_a

In [13]:
# Hyperparameters for the LightGBM classifier trained on dataset A.
lgbm_a_params = {
    'random_state': 42,
    'device': "gpu",
    'importance_type': 'gain',
    'learning_rate': 0.1,
    'max_depth': 2,
    'min_data_in_leaf': 50,
    'n_estimators': 100,
    'num_leaves': 10,
}
lgbm_a = LGBMClassifier(**lgbm_a_params)

# The inputs are scaled arrays, so the column names are passed explicitly.
lgbm_a.fit(train_a, y_train_a, feature_name=features_a)



LGBMClassifier(device='gpu', importance_type='gain', max_depth=2,
               min_data_in_leaf=50, num_leaves=10, random_state=42)

In [14]:
# Take column 1 of predict_proba as the score, then label a sample 1 when
# that score is above 0.5.
y_pred_proba = lgbm_a.predict_proba(test_a)[:, 1]
y_pred = (0.5 < y_pred_proba).astype(int)
y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [15]:
# Pair every dataset-A feature name with the importance reported by lgbm_a
# (configured with importance_type='gain') and show the ten largest.
feature_importances = pd.DataFrame(
    {'Value': lgbm_a.feature_importances_, 'Feature': features_a}
)
sorted_feature_importances = feature_importances.sort_values('Value', ascending=False)
sorted_feature_importances.head(n=10)

Unnamed: 0,Value,Feature
56,5905.355681,Total- A+H+RERA#
47,5422.894258,Total A+H Index(/h)
46,1919.815612,Total A+H #
48,1057.476872,SUPINE A+H Index(/h)
57,279.880291,Total_A+H+RERA_ Index(/h)
40,239.82797,Supine_Apnea_ Index(/h)
3,192.65391,weight
27,183.2454,Movement Arousal #
37,158.447058,Obstructive apnea #
67,144.95555,Lowest SaO2 (%)


### B. with osa_data_b

In [16]:
# Hyperparameters for the LightGBM classifier trained on dataset B.
lgbm_b_params = {
    'random_state': 42,
    'device': "gpu",
    'importance_type': 'gain',
    'learning_rate': 0.1,
    'max_depth': 4,
    'min_data_in_leaf': 100,
    'n_estimators': 100,
    'num_leaves': 10,
}
lgbm_b = LGBMClassifier(**lgbm_b_params)

# The inputs are scaled arrays, so the column names are passed explicitly.
lgbm_b.fit(train_b, y_train_b, feature_name=features_b)



LGBMClassifier(device='gpu', importance_type='gain', max_depth=4,
               min_data_in_leaf=100, num_leaves=10, random_state=42)

In [17]:
# Take column 1 of predict_proba as the score, then label a sample 1 when
# that score is above 0.5.
y_pred_proba = lgbm_b.predict_proba(test_b)[:, 1]
y_pred = (0.5 < y_pred_proba).astype(int)
y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [18]:
# Pair every dataset-B feature name with the importance reported by lgbm_b
# (configured with importance_type='gain') and show the ten largest.
feature_importances = pd.DataFrame(
    {'Value': lgbm_b.feature_importances_, 'Feature': features_b}
)
sorted_feature_importances = feature_importances.sort_values('Value', ascending=False)
sorted_feature_importances.head(n=10)

Unnamed: 0,Value,Feature
39,6913.541553,Total A+H #
40,3801.726788,Total A+H Index(/h)
49,3777.790609,Total- A+H+RERA#
50,936.140573,Total_A+H+RERA_ Index(/h)
51,416.606499,Supine0 A+H+RERA_Index(h)
23,369.187231,Movement Aroual index(/h)
34,262.741019,Supine_Apnea_ Index(/h)
2,248.165261,height
3,209.560669,weight
55,200.99498,Apnea_Max_ Length(sec)


## 0-5. XGBoost

### A. with osa_data_a

In [19]:
# Hyperparameters for the XGBoost classifier trained on dataset A.
xgb_a_params = {
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'learning_rate': 0.01,
    'n_estimators': 100,
    'max_depth': 8,
    'min_child_weight': 10,
    'gamma': 0.2,
    'reg_alpha': 0.1,
}
xgb_a = XGBClassifier(**xgb_a_params)
xgb_a.fit(train_a, y_train_a)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.2, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=10,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0.1, reg_lambda=1, ...)

In [20]:
# Take column 1 of predict_proba as the score, then label a sample 1 when
# that score is above 0.5.
y_pred_proba = xgb_a.predict_proba(test_a)[:, 1]
y_pred = (0.5 < y_pred_proba).astype(int)
y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [21]:
# Pair every dataset-A feature name with the importance reported by xgb_a
# and show the ten largest.
feature_importances = pd.DataFrame(
    {'Value': xgb_a.feature_importances_, 'Feature': features_a}
)
sorted_feature_importances = feature_importances.sort_values('Value', ascending=False)
sorted_feature_importances.head(n=10)

Unnamed: 0,Value,Feature
46,0.508838,Total A+H #
47,0.153062,Total A+H Index(/h)
56,0.034062,Total- A+H+RERA#
2,0.014488,height
57,0.013736,Total_A+H+RERA_ Index(/h)
74,0.012003,PLM Arousal#
76,0.009962,MAI(/h)
27,0.00947,Movement Arousal #
34,0.009013,Lateral Arousal index(h)
28,0.008134,Movement Aroual index(/h)


### B. with osa_data_b

In [22]:
# Hyperparameters for the XGBoost classifier trained on dataset B.
xgb_b_params = {
    'random_state': 42,
    'tree_method': 'gpu_hist',
    'learning_rate': 0.1,
    'n_estimators': 100,
    'max_depth': 4,
    'min_child_weight': 1,
    'gamma': 0.5,
    'reg_alpha': 0.001,
}
xgb_b = XGBClassifier(**xgb_b_params)
xgb_b.fit(train_b, y_train_b)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0.5, gpu_id=0, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0.001, reg_lambda=1, ...)

In [23]:
# Take column 1 of predict_proba as the score, then label a sample 1 when
# that score is above 0.5.
y_pred_proba = xgb_b.predict_proba(test_b)[:, 1]
y_pred = (0.5 < y_pred_proba).astype(int)
y_pred

array([0, 1, 1, ..., 1, 1, 1])

In [24]:
# Pair every dataset-B feature name with the importance reported by xgb_b
# and show the ten largest.
feature_importances = pd.DataFrame(
    {'Value': xgb_b.feature_importances_, 'Feature': features_b}
)
sorted_feature_importances = feature_importances.sort_values('Value', ascending=False)
sorted_feature_importances.head(n=10)

Unnamed: 0,Value,Feature
39,0.413808,Total A+H #
40,0.13204,Total A+H Index(/h)
49,0.070884,Total- A+H+RERA#
50,0.053507,Total_A+H+RERA_ Index(/h)
34,0.012899,Supine_Apnea_ Index(/h)
2,0.011193,height
1,0.008469,age
15,0.008367,Total Aroual index(/h)
32,0.007824,Obstructive apnea #
23,0.007719,Movement Aroual index(/h)
