> title : 200_etri_lifelog_model_vF (모델 학습 코드) <br>
 -  코드 실행 전 PATH 변경하세요.
  - PATH  =  '/content/drive/MyDrive/data/ch2025_data_items/share/submissions/input'

### 📌 set path

In [1]:
# Directory that the pipeline's read_* methods load their input files from
# (they build paths as f'{PATH}/<file name>'). Change this before running.
PATH  =  '/content/drive/MyDrive/data/ch2025_data_items/share/submissions/input'

In [2]:
# Mount Google Drive at /content/drive (google.colab is only available inside Colab).
from google.colab import drive, files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 📌 libraries

In [3]:
! pip install haversine >/dev/null
! pip install category_encoders >/dev/null
! pip install tabpfn  >/dev/null
! pip install torchmetrics >/dev/null

In [40]:
# 기본 모듈
import os
import sys
import re
import ast
import glob
import random
import warnings
from collections import Counter
from math import radians, cos, sin, asin, sqrt
from functools import reduce
from datetime import datetime, timedelta, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 머신러닝
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier
import lightgbm as lgb
from tabpfn import TabPFNClassifier
from sklearn.inspection import permutation_importance
import shap
from sklearn.impute import KNNImputer

# PyTorch
import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Hugging Face
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    LlamaTokenizer,
    LlamaForCausalLM,
    LlamaForSequenceClassification
)

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    TaskType
)

# Evaluation & Utilities
from torchmetrics import Accuracy

# 기타
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm  # 필요 시 구분
from scipy.stats import entropy
from haversine import haversine
from io import StringIO
import gc

# Options: expose only GPU 0 to CUDA, disable tokenizers parallelism,
# and widen the pandas display limits / float formatting.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

# Misc: silence all warnings
warnings.filterwarnings('ignore')

# Base seed setup for Python, NumPy and PyTorch (CPU and all CUDA devices)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Options that improve reproducibility (may reduce performance)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### 📌 class : etri_lifelog_pipeline

In [47]:
class etri_lifelog_pipeline:

  def __init__(self):

    self.valid_id1 = """
    subject_id,sleep_date
    id01,2024-07-24
    id01,2024-08-26
    id01,2024-08-28
    id01,2024-08-29
    id02,2024-08-23
    id02,2024-09-24
    id02,2024-09-26
    id02,2024-09-27
    id03,2024-08-30
    id03,2024-09-01
    id03,2024-09-02
    id03,2024-09-06
    id04,2024-09-03
    id04,2024-10-10
    id04,2024-10-12
    id04,2024-10-13
    id05,2024-10-19
    id05,2024-10-23
    id05,2024-10-24
    id05,2024-10-27
    id06,2024-07-25
    id06,2024-07-26
    id06,2024-07-27
    id06,2024-07-30
    id07,2024-07-07
    id07,2024-08-02
    id07,2024-08-04
    id07,2024-08-05
    id08,2024-08-28
    id08,2024-08-29
    id08,2024-08-30
    id08,2024-09-02
    id09,2024-08-02
    id09,2024-08-31
    id09,2024-09-02
    id09,2024-09-03
    id10,2024-08-28
    id10,2024-08-30
    id10,2024-08-31
    id10,2024-09-03
    """

    self.valid_id2 = """
    subject_id,sleep_date
    id01,2024-07-24
    id01,2024-07-27
    id01,2024-08-18
    id01,2024-08-19
    id01,2024-08-20
    id01,2024-08-21
    id01,2024-08-22
    id01,2024-08-24
    id01,2024-08-25
    id01,2024-08-26
    id01,2024-08-27
    id01,2024-08-28
    id01,2024-08-29
    id01,2024-08-30
    id02,2024-08-23
    id02,2024-08-24
    id02,2024-09-16
    id02,2024-09-17
    id02,2024-09-19
    id02,2024-09-20
    id02,2024-09-21
    id02,2024-09-22
    id02,2024-09-23
    id02,2024-09-24
    id02,2024-09-25
    id02,2024-09-26
    id02,2024-09-27
    id02,2024-09-28
    id03,2024-08-30
    id03,2024-09-01
    id03,2024-09-02
    id03,2024-09-03
    id03,2024-09-05
    id03,2024-09-06
    id03,2024-09-07
    id04,2024-09-03
    id04,2024-09-04
    id04,2024-09-05
    id04,2024-09-06
    id04,2024-09-07
    id04,2024-09-08
    id04,2024-09-09
    id04,2024-10-08
    id04,2024-10-09
    id04,2024-10-10
    id04,2024-10-11
    id04,2024-10-12
    id04,2024-10-13
    id04,2024-10-14
    id05,2024-10-19
    id05,2024-10-23
    id05,2024-10-24
    id05,2024-10-25
    id05,2024-10-26
    id05,2024-10-27
    id05,2024-10-28
    id06,2024-07-25
    id06,2024-07-26
    id06,2024-07-27
    id06,2024-07-28
    id06,2024-07-29
    id06,2024-07-30
    id06,2024-07-31
    id07,2024-07-07
    id07,2024-07-08
    id07,2024-07-09
    id07,2024-07-10
    id07,2024-07-11
    id07,2024-07-12
    id07,2024-07-13
    id07,2024-07-30
    id07,2024-08-01
    id07,2024-08-02
    id07,2024-08-03
    id07,2024-08-04
    id07,2024-08-05
    id07,2024-08-06
    id08,2024-08-28
    id08,2024-08-29
    id08,2024-08-30
    id08,2024-08-31
    id08,2024-09-01
    id08,2024-09-02
    id08,2024-09-04
    id09,2024-08-02
    id09,2024-08-22
    id09,2024-08-23
    id09,2024-08-24
    id09,2024-08-25
    id09,2024-08-27
    id09,2024-08-28
    id09,2024-08-29
    id09,2024-08-30
    id09,2024-08-31
    id09,2024-09-01
    id09,2024-09-02
    id09,2024-09-03
    id09,2024-09-04
    id10,2024-08-28
    id10,2024-08-30
    id10,2024-08-31
    id10,2024-09-01
    id10,2024-09-02
    id10,2024-09-03
    id10,2024-09-06
    """

    self.common_params = {
      'n_estimators': 5000, # 5000
      "learning_rate": 0.01, # 0.01
      'lambda_l1': 5, # 5
      'lambda_l2': 1, # 1
      'bagging_fraction': 0.8, # 0.8
      'feature_fraction': 1,
      'n_jobs': -1,
      'verbosity': -1
    }

    self.lgb_params = {
      'n_estimators': 5000, # 5000
      "learning_rate": 0.01, # 0.01
      'lambda_l1': 5, # 5
      'lambda_l2': 1, # 1
      'bagging_fraction': 0.8, # 0.8
      'feature_fraction': 1,
      'n_jobs': -1,
      'verbosity': -1
    }

    self.xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1, # 1
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': seed
    }

    self.xgb_params_S1 = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 5, # 1
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': seed
    }

    self.tabpfn_params = {
        'device': 'cuda',  # GPU 사용
    }


  def read_originaldata(self):
    """Load both feature-set versions and merge them without imputation.

    Reads the version-1 and version-2 train/test parquet files from the
    module-level ``PATH`` and left-joins onto version 1 the columns that
    exist only in version 2, keyed by subject_id / sleep_date / lifelog_date.

    Returns:
      tuple: ``(train, test)`` merged DataFrames.
    """
    keys = ['subject_id','sleep_date','lifelog_date']

    # [1] dataset version 1
    train1 = pd.read_parquet(f'{PATH}/train_63775_vF.parquet')
    test1 = pd.read_parquet(f'{PATH}/test_63775_vF.parquet')

    # [2] dataset version 2
    train2 = pd.read_parquet(f'{PATH}/train_hjy_0603_v1.parquet')
    test2 = pd.read_parquet(f'{PATH}/test_hjy_0603_v1.parquet')

    # [1]+[2]: keep only the columns version 2 adds. Walk train2's columns in
    # their stored order rather than taking a set difference: set iteration
    # order for strings changes between interpreter sessions, which made the
    # merged column order (and anything sensitive to it downstream)
    # non-reproducible.
    v1_columns = set(train1.columns)
    extra = [c for c in train2.columns if c not in v1_columns]
    train = train1.merge(train2[keys + extra], on=keys, how='left')
    test = test1.merge(test2[keys + extra], on=keys, how='left')

    return train, test

  def read_meanimputedata(self):
    """Load and merge both feature-set versions, then mean-impute sleep features.

    The merge is the same as in ``read_originaldata``. Afterwards, missing
    values in the listed sleep features are replaced by the per-subject mean
    computed on the training set; the same training means are applied to test.

    Returns:
      tuple: ``(train, test)`` DataFrames with the listed features imputed.
    """
    keys = ['subject_id','sleep_date','lifelog_date']

    # [1] dataset version 1
    train1 = pd.read_parquet(f'{PATH}/train_63775_vF.parquet')
    test1 = pd.read_parquet(f'{PATH}/test_63775_vF.parquet')

    # [2] dataset version 2
    train2 = pd.read_parquet(f'{PATH}/train_hjy_0603_v1.parquet')
    test2 = pd.read_parquet(f'{PATH}/test_hjy_0603_v1.parquet')

    # [1]+[2]: keep only the columns version 2 adds, in train2's stored order.
    # A set difference was used before; its iteration order changes between
    # interpreter sessions, which made the merged column order
    # non-reproducible.
    v1_columns = set(train1.columns)
    extra = [c for c in train2.columns if c not in v1_columns]
    train = train1.merge(train2[keys + extra], on=keys, how='left')
    test = test1.merge(test2[keys + extra], on=keys, how='left')

    # Features to impute. The *_mean5d/_std5d and *_mean7d/_std7d rolling
    # features are not part of this list (they were commented out).
    feats = [
        'sleep_time', 'wake_time', 'sleep_duration_min',
        'avg_sleep_time', 'avg_wake_time', 'avg_sleep_duration',
        'sleep_time_diff', 'wake_time_diff', 'sleep_duration_diff',
        'sleep_time_ratio', 'wake_time_ratio', 'sleep_duration_ratio',
        'sleep_time_lag1', 'wake_time_lag1', 'sleep_duration_lag1',
        'sleep_time_diff_lag1', 'wake_time_diff_lag1', 'sleep_duration_diff_lag1',
        'sleep_time_ratio_lag1', 'wake_time_ratio_lag1', 'sleep_duration_ratio_lag1',
        'sleep_time_lag2', 'wake_time_lag2', 'sleep_duration_lag2',
        'sleep_time_diff_lag2', 'wake_time_diff_lag2', 'sleep_duration_diff_lag2',
        'sleep_time_ratio_lag2', 'wake_time_ratio_lag2', 'sleep_duration_ratio_lag2',
        'sleep_time_mean2d', 'wake_time_mean2d', 'sleep_duration_min_mean2d',
        'sleep_time_diff_mean2d', 'wake_time_diff_mean2d', 'sleep_duration_diff_mean2d',
        'sleep_time_ratio_mean2d', 'wake_time_ratio_mean2d', 'sleep_duration_ratio_mean2d',
        'sleep_time_std2d', 'wake_time_std2d', 'sleep_duration_min_std2d',
        'sleep_time_diff_std2d', 'wake_time_diff_std2d', 'sleep_duration_diff_std2d',
        'sleep_time_ratio_std2d', 'wake_time_ratio_std2d', 'sleep_duration_ratio_std2d',
        'sleep_time_mean3d', 'wake_time_mean3d', 'sleep_duration_min_mean3d',
        'sleep_time_diff_mean3d', 'wake_time_diff_mean3d', 'sleep_duration_diff_mean3d',
        'sleep_time_ratio_mean3d', 'wake_time_ratio_mean3d', 'sleep_duration_ratio_mean3d',
        'sleep_time_std3d', 'wake_time_std3d', 'sleep_duration_min_std3d',
        'sleep_time_diff_std3d', 'wake_time_diff_std3d', 'sleep_duration_diff_std3d',
        'sleep_time_ratio_std3d', 'wake_time_ratio_std3d', 'sleep_duration_ratio_std3d',
        'weekday_avg_sleep', 'sleep_duration_weekday_avg_diff', 'sleep_duration_weekday_avg_div',
    ]

    # Per-subject training means for every feature, computed once up front.
    # Filling one column never changes another column's mean, so this matches
    # a per-feature groupby inside the loop.
    subject_means = train.groupby('subject_id')[feats].mean()
    for feat in feats:
      fill_values = subject_means[feat]
      train[feat] = train[feat].fillna(train['subject_id'].map(fill_values))
      test[feat] = test[feat].fillna(test['subject_id'].map(fill_values))

    return train, test

  def read_knnimputedata(self):
    """Load and merge both feature-set versions, then KNN-impute sleep features.

    The merge is the same as in ``read_originaldata``. Afterwards a
    ``KNNImputer(n_neighbors=5)`` is fitted on the listed training columns
    and used to transform those columns in both train and test.

    Returns:
      tuple: ``(train, test)`` DataFrames with the listed features imputed.
    """
    keys = ['subject_id','sleep_date','lifelog_date']

    # [1] dataset version 1
    train1 = pd.read_parquet(f'{PATH}/train_63775_vF.parquet')
    test1 = pd.read_parquet(f'{PATH}/test_63775_vF.parquet')

    # [2] dataset version 2
    train2 = pd.read_parquet(f'{PATH}/train_hjy_0603_v1.parquet')
    test2 = pd.read_parquet(f'{PATH}/test_hjy_0603_v1.parquet')

    # [1]+[2]: keep only the columns version 2 adds, in train2's stored order.
    # A set difference was used before; its iteration order changes between
    # interpreter sessions, which made the merged column order
    # non-reproducible.
    v1_columns = set(train1.columns)
    extra = [c for c in train2.columns if c not in v1_columns]
    train = train1.merge(train2[keys + extra], on=keys, how='left')
    test = test1.merge(test2[keys + extra], on=keys, how='left')

    # Features to impute. The *_mean5d/_std5d and *_mean7d/_std7d rolling
    # features are not part of this list (they were commented out).
    feats = [
        'sleep_time', 'wake_time', 'sleep_duration_min',
        'avg_sleep_time', 'avg_wake_time', 'avg_sleep_duration',
        'sleep_time_diff', 'wake_time_diff', 'sleep_duration_diff',
        'sleep_time_ratio', 'wake_time_ratio', 'sleep_duration_ratio',
        'sleep_time_lag1', 'wake_time_lag1', 'sleep_duration_lag1',
        'sleep_time_diff_lag1', 'wake_time_diff_lag1', 'sleep_duration_diff_lag1',
        'sleep_time_ratio_lag1', 'wake_time_ratio_lag1', 'sleep_duration_ratio_lag1',
        'sleep_time_lag2', 'wake_time_lag2', 'sleep_duration_lag2',
        'sleep_time_diff_lag2', 'wake_time_diff_lag2', 'sleep_duration_diff_lag2',
        'sleep_time_ratio_lag2', 'wake_time_ratio_lag2', 'sleep_duration_ratio_lag2',
        'sleep_time_mean2d', 'wake_time_mean2d', 'sleep_duration_min_mean2d',
        'sleep_time_diff_mean2d', 'wake_time_diff_mean2d', 'sleep_duration_diff_mean2d',
        'sleep_time_ratio_mean2d', 'wake_time_ratio_mean2d', 'sleep_duration_ratio_mean2d',
        'sleep_time_std2d', 'wake_time_std2d', 'sleep_duration_min_std2d',
        'sleep_time_diff_std2d', 'wake_time_diff_std2d', 'sleep_duration_diff_std2d',
        'sleep_time_ratio_std2d', 'wake_time_ratio_std2d', 'sleep_duration_ratio_std2d',
        'sleep_time_mean3d', 'wake_time_mean3d', 'sleep_duration_min_mean3d',
        'sleep_time_diff_mean3d', 'wake_time_diff_mean3d', 'sleep_duration_diff_mean3d',
        'sleep_time_ratio_mean3d', 'wake_time_ratio_mean3d', 'sleep_duration_ratio_mean3d',
        'sleep_time_std3d', 'wake_time_std3d', 'sleep_duration_min_std3d',
        'sleep_time_diff_std3d', 'wake_time_diff_std3d', 'sleep_duration_diff_std3d',
        'sleep_time_ratio_std3d', 'wake_time_ratio_std3d', 'sleep_duration_ratio_std3d',
        'weekday_avg_sleep', 'sleep_duration_weekday_avg_diff', 'sleep_duration_weekday_avg_div',
    ]

    # Fit on the training rows only, then apply the same imputer to both sets.
    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(train[feats])
    train[feats] = imputer.transform(train[feats])
    test[feats] = imputer.transform(test[feats])

    return train, test

  def read_llmimputedata(self):
    """Load and merge both feature-set versions, then swap in LLM-imputed features.

    The merge is the same as in ``read_originaldata``. Afterwards, the listed
    sleep features that exist in train are dropped from both train and test,
    and the contents of the Excel file are left-joined on
    subject_id / lifelog_date in their place.

    Returns:
      tuple: ``(train, test)`` DataFrames carrying the Excel file's columns.
    """
    keys = ['subject_id','sleep_date','lifelog_date']

    # [1] dataset version 1
    train1 = pd.read_parquet(f'{PATH}/train_63775_vF.parquet')
    test1 = pd.read_parquet(f'{PATH}/test_63775_vF.parquet')

    # [2] dataset version 2
    train2 = pd.read_parquet(f'{PATH}/train_hjy_0603_v1.parquet')
    test2 = pd.read_parquet(f'{PATH}/test_hjy_0603_v1.parquet')

    # [1]+[2]: keep only the columns version 2 adds, in train2's stored order.
    # A set difference was used before; its iteration order changes between
    # interpreter sessions, which made the merged column order
    # non-reproducible.
    v1_columns = set(train1.columns)
    extra = [c for c in train2.columns if c not in v1_columns]
    train = train1.merge(train2[keys + extra], on=keys, how='left')
    test = test1.merge(test2[keys + extra], on=keys, how='left')

    # [3] Missing-value handling using QWEN3 8B (target: mScreenStatus).
    # NOTE(review): presumably the Excel file holds re-derived versions of
    # the features dropped below -- confirm against the file's columns.
    mScreenStatus_llm = pd.read_excel(f'{PATH}/mScreenStatus_llm결측값생성후파생변수생성_20250609_v1.xlsx')

    # Features replaced by the Excel file. The *_mean5d/_std5d and
    # *_mean7d/_std7d rolling features are not part of this list (they were
    # commented out).
    feats = [
        'sleep_time', 'wake_time', 'sleep_duration_min',
        'avg_sleep_time', 'avg_wake_time', 'avg_sleep_duration',
        'sleep_time_diff', 'wake_time_diff', 'sleep_duration_diff',
        'sleep_time_ratio', 'wake_time_ratio', 'sleep_duration_ratio',
        'sleep_time_lag1', 'wake_time_lag1', 'sleep_duration_lag1',
        'sleep_time_diff_lag1', 'wake_time_diff_lag1', 'sleep_duration_diff_lag1',
        'sleep_time_ratio_lag1', 'wake_time_ratio_lag1', 'sleep_duration_ratio_lag1',
        'sleep_time_lag2', 'wake_time_lag2', 'sleep_duration_lag2',
        'sleep_time_diff_lag2', 'wake_time_diff_lag2', 'sleep_duration_diff_lag2',
        'sleep_time_ratio_lag2', 'wake_time_ratio_lag2', 'sleep_duration_ratio_lag2',
        'sleep_time_mean2d', 'wake_time_mean2d', 'sleep_duration_min_mean2d',
        'sleep_time_diff_mean2d', 'wake_time_diff_mean2d', 'sleep_duration_diff_mean2d',
        'sleep_time_ratio_mean2d', 'wake_time_ratio_mean2d', 'sleep_duration_ratio_mean2d',
        'sleep_time_std2d', 'wake_time_std2d', 'sleep_duration_min_std2d',
        'sleep_time_diff_std2d', 'wake_time_diff_std2d', 'sleep_duration_diff_std2d',
        'sleep_time_ratio_std2d', 'wake_time_ratio_std2d', 'sleep_duration_ratio_std2d',
        'sleep_time_mean3d', 'wake_time_mean3d', 'sleep_duration_min_mean3d',
        'sleep_time_diff_mean3d', 'wake_time_diff_mean3d', 'sleep_duration_diff_mean3d',
        'sleep_time_ratio_mean3d', 'wake_time_ratio_mean3d', 'sleep_duration_ratio_mean3d',
        'sleep_time_std3d', 'wake_time_std3d', 'sleep_duration_min_std3d',
        'sleep_time_diff_std3d', 'wake_time_diff_std3d', 'sleep_duration_diff_std3d',
        'sleep_time_ratio_std3d', 'wake_time_ratio_std3d', 'sleep_duration_ratio_std3d',
        'weekday_avg_sleep', 'sleep_duration_weekday_avg_diff', 'sleep_duration_weekday_avg_div',
    ]

    # Only drop what is actually present in train; the same list is applied
    # to test.
    drop_features = [i for i in feats if i in train.columns]
    train = train.drop(columns=drop_features)
    train = train.merge(mScreenStatus_llm,on=['subject_id','lifelog_date'],how='left')
    test = test.drop(columns=drop_features)
    test = test.merge(mScreenStatus_llm,on=['subject_id','lifelog_date'],how='left')

    return train, test

  def preprocess(self, train, test):
    """Apply the shared cleaning steps to the train and test frames.

    Steps, applied to both frames:
      1. Drop a fixed set of unused columns (only those present in train).
      2. Add a ``weekend`` flag: 1 when ``weekday`` is '토요일' (Saturday)
         or '금요일' (Friday), else 0.
         NOTE(review): Sunday is not flagged -- confirm this is intended.
      3. Replace missing values in every numeric column with -1.
      4. Drop ``light_month`` when train has it.

    Args:
      train: training DataFrame; must contain a ``weekday`` column.
      test: test DataFrame with the same columns to be dropped.

    Returns:
      tuple: ``(train, test)`` as new DataFrames; the inputs are not modified.
    """
    unused = (
        'Unnamed: 0',
        'light_week_type_lag1',
        'week_type',
        'week_type_lag1',
        'activehour_top_bssid',
        'beforebed_top_bssid',
    )
    # The presence check is made against train only, then applied to both.
    present = [name for name in unused if name in train.columns]
    weekend_days = ['토요일','금요일']

    cleaned = []
    for frame in (train.drop(columns=present), test.drop(columns=present)):
      # weekend flag
      frame['weekend'] = np.where(frame['weekday'].isin(weekend_days), 1, 0)

      # replace missing values in all numeric columns
      numeric_cols = frame.select_dtypes(include='number').columns
      frame[numeric_cols] = frame[numeric_cols].fillna(-1)

      cleaned.append(frame)
    train, test = cleaned

    # second drop pass, again decided by what train contains
    late_drop = [name for name in ['light_month'] if name in train.columns]
    return train.drop(columns=late_drop), test.drop(columns=late_drop)

  def run_basemodel(self, train, test, valid_ids, common_params, n_splits, random_state, S1_balance):

      lgb_A = 0.4
      xgb_B = 0.3
      tab_C = 0.3 ###
      print(f'# lgb_A:{lgb_A} xgb_B:{xgb_B} tab_C:{tab_C}')

      lgb_params = common_params['Q1'].copy()
      lgb_params['random_state'] = random_state

      train_df = train.copy()
      test_df = test.copy()

      submission_final = test_df[['subject_id', 'sleep_date', 'lifelog_date']].copy()
      submission_final['lifelog_date'] = pd.to_datetime(submission_final['lifelog_date']).dt.date

      # TARGET
      targets_binary = ['Q1', 'Q2', 'Q3', 'S2', 'S3']
      targets_binary_name = ['기상직후수면질','취침전신체적피로','취침전스트레스','수면효율','수면잠들기시간']
      target_multiclass = 'S1'
      all_targets = targets_binary + [target_multiclass]

      # add_noise
      def add_noise(series, noise_level, seed=3):
          rng = np.random.default_rng(seed)
          return series * (1 + noise_level * rng.standard_normal(len(series)))

      noise_level = 0.015

      # ----------------------------------------------------------------------------------------------------------

      # predweekday 생성: subject_id별로 sleep_duration_min이 긴 요일 2개를 선정
      top2_days = (
          train_df.groupby(['subject_id', 'weekday'])['sleep_duration_min']
          .mean()
          .reset_index()
          .sort_values(['subject_id', 'sleep_duration_min'], ascending=[True, False])
          .groupby('subject_id')
          .head(1)
      )

      # subject_id별 top2 weekday 집합 만들기
      top2_day_dict = top2_days.groupby('subject_id')['weekday'].apply(set).to_dict()

      # train_df, test_df에 predweekday 컬럼 추가
      def mark_predweekday(row):
          return int(row['weekday'] in top2_day_dict.get(row['subject_id'], set()))

      train_df['predweekday'] = train_df.apply(mark_predweekday, axis=1)
      test_df['predweekday'] = test_df.apply(mark_predweekday, axis=1)

      # ----------------------------------------------------------------------------------------------------------

      # [1]
      train_df['new1'] = np.where(train_df['img0']>0,1,0)
      train_df['new2'] = np.where(train_df['light_wake_time_diff']>0,1,0)
      train_df['new3'] = np.where(train_df['light_sleep_time_ratio']>0,1,0)
      train_df['new4'] = np.where(train_df['light_wake_time_ratio']>0,1,0)
      train_df['new5'] = np.where(train_df['light_sleep_duration_ratio']>0,1,0)
      train_df['new6'] = np.where(train_df['sleep_duration_vs_weekday_avg']>0,1,0)
      train_df['new7'] = np.where(train_df['wake_time_ratio']>1,1,0)
      train_df['new8'] = np.where(train_df['sleep_duration_ratio']>1,1,0)

      # [2]
      test_df['new1'] = np.where(test_df['img0']>0,1,0)
      test_df['new2'] = np.where(test_df['light_wake_time_diff']>0,1,0)
      test_df['new3'] = np.where(test_df['light_sleep_time_ratio']>0,1,0)
      test_df['new4'] = np.where(test_df['light_wake_time_ratio']>0,1,0)
      test_df['new5'] = np.where(test_df['light_sleep_duration_ratio']>0,1,0)
      test_df['new6'] = np.where(test_df['sleep_duration_vs_weekday_avg']>0,1,0)
      test_df['new7'] = np.where(test_df['wake_time_ratio']>1,1,0)
      test_df['new8'] = np.where(test_df['sleep_duration_ratio']>1,1,0)

      # ----------------------------------------------------------------------------------------------------------

      # TARGET ENCODER
      for new in ['new1','new2','new6','new7','new8','weekend']:

        for tgt in all_targets:

          encoder_feats = ['subject_id','month',new] # 'weekday', 'subject_id','month','weekend'

          #### ENCODER1

          subject_mean = train_df.groupby(encoder_feats)[tgt].mean().rename(f'{tgt}_{encoder_feats[2]}_te')
          train_df = train_df.merge(subject_mean, on=encoder_feats, how='left')
          test_df = test_df.merge(subject_mean, on=encoder_feats, how='left')
          global_mean = train_df[tgt].mean()
          test_df[f'{tgt}_{encoder_feats[2]}_te'] = test_df[f'{tgt}_{encoder_feats[2]}_te'].fillna(global_mean)

          # 노이즈 추가
          train_df[f'{tgt}_{encoder_feats[2]}_te'] = add_noise(train_df[f'{tgt}_{encoder_feats[2]}_te'], noise_level)
          test_df[f'{tgt}_{encoder_feats[2]}_te'] = add_noise(test_df[f'{tgt}_{encoder_feats[2]}_te'], noise_level)

          #### ENCODER2

          # 새로운 범주형 열 생성
          train_df['TMP'] = train_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)
          test_df['TMP'] = test_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)

          # ENCODER
          encoder = TargetEncoder(cols=['TMP'], smoothing=300) # 40
          encoder.fit(train_df[['TMP']], train_df[tgt])

          # 인코딩 결과를 새로운 열에 저장
          train_df[f'{tgt}_{encoder_feats[2]}_te2'] = encoder.transform(train_df[['TMP']])
          test_df[f'{tgt}_{encoder_feats[2]}_te2'] = encoder.transform(test_df[['TMP']])

          # ADD NOISE
          train_df[f'{tgt}_{encoder_feats[2]}_te2'] = add_noise(train_df[f'{tgt}_{encoder_feats[2]}_te2'], noise_level)
          test_df[f'{tgt}_{encoder_feats[2]}_te2'] = add_noise(test_df[f'{tgt}_{encoder_feats[2]}_te2'], noise_level)

          # DROP TMP COLUMNS
          train_df = train_df.drop(columns=['TMP'])
          test_df = test_df.drop(columns=['TMP'])


      # ENCODER
      PK = ['sleep_date', 'lifelog_date', 'subject_id']
      encoder = LabelEncoder()
      categorical_features = [i for i in train_df.select_dtypes(include=['object', 'category']).columns if i not in PK+['pk']]
      for col in categorical_features:
          # print(col)
          train_df[col] = encoder.fit_transform(train_df[col])
          test_df[col] = encoder.fit_transform(test_df[col])

      # X
      X = train_df.drop(columns=PK + all_targets)
      test_X = test_df.drop(columns=PK + all_targets)

      total_avg_f1s = []
      val_f1 = []
      binary_val_preds = {}
      multiclass_val_preds = {}
      binary_test_preds = {}
      multiclass_test_preds = {}
      test_preds = {}
      xfeatures_dict = {}

      # Find optimal weights
      best_weights = []
      best_scores = []

      # ------
      # binary
      # ------

      for col in targets_binary:

          y = train_df[col]

          valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']
          valid_ids['pk'] = valid_ids['pk'].str.strip().tolist()
          train_df['pk'] = train_df['subject_id']+train_df['sleep_date']

          X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
          X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
          y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
          y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

          # ----------------------------------------------------------------------------------------------------------

          feature_names = X.columns.tolist()

          if col in ['Q3']:
            # xfeatures1
            model = XGBClassifier(**self.xgb_params)
            model.fit(X_train, y_train)
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(X_train)
            shap_importance = np.abs(shap_values).mean(axis=0)
            shap_df = pd.DataFrame({
                'feature': X_train.columns,
                'shap_importance': shap_importance
            }).sort_values(by='shap_importance', ascending=False)
            xfeatures1 = shap_df.head(15)['feature'].tolist()

          else:
            xfeatures1 = []

          # xfeatures2
          correlations = X.select_dtypes(include=['number']).corrwith(y)
          sorted_correlations = correlations.abs().sort_values(ascending=False)
          xfeatures2 = sorted_correlations[sorted_correlations>0.1].index.tolist()
          xfeatures_dict[col] = [i for i in X.columns.tolist() if i in set(xfeatures1+xfeatures2)]

          # ----------------------------------------------------------------------------------------------------------

          X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),xfeatures_dict[col]].reset_index(drop=True).copy()
          X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),xfeatures_dict[col]].reset_index(drop=True).copy()
          y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
          y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

          # Train LightGBM
          lgb_model = LGBMClassifier(**self.lgb_params)
          lgb_model.fit(X_train, y_train)

          # Train XGBoost
          xgb_model = XGBClassifier(**self.xgb_params)
          xgb_model.fit(X_train, y_train)

          # Train TabPFN
          tabpfn_model = TabPFNClassifier(**self.tabpfn_params)
          tabpfn_model.fit(X_train, y_train)

          tab_pred_valid = tabpfn_model.predict_proba(X_valid.values)[:, 1]
          lgb_pred_valid = lgb_model.predict_proba(X_valid)[:, 1]
          xgb_pred_valid = xgb_model.predict_proba(X_valid)[:, 1]

          pred_valid = (lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + tab_C * tab_pred_valid > 0.5).astype(int)

          f1 = f1_score(y_valid, pred_valid, average='macro')
          val_f1.append(f1)

          # Store predictions
          binary_val_preds[col] = {
              'lgb': lgb_pred_valid,
              'xgb': xgb_pred_valid,
              'tab': tab_pred_valid,
              'true': y_valid
          }

      # ----------
      # multiclass
      # ----------

      y = train_df[target_multiclass]
      X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
      X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
      y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
      y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

      # ----------------------------------------------------------------------------------------------------------

      feature_names = X.columns.tolist()
      xfeatures1 = []
      correlations = X.select_dtypes(include=['number']).corrwith(y)
      sorted_correlations = correlations.abs().sort_values(ascending=False)
      xfeatures2 = sorted_correlations[sorted_correlations>0.1].index.tolist()
      xfeatures_dict['S1'] = [i for i in X.columns.tolist() if i in set(xfeatures1+xfeatures2)]

      # ----------------------------------------------------------------------------------------------------------

      y = train_df[target_multiclass]
      X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),xfeatures_dict['S1']].reset_index(drop=True).copy()
      X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),xfeatures_dict['S1']].reset_index(drop=True).copy()
      y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
      y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

      # 클래스 weight 계산
      classes = np.unique(y_train)
      weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
      class_weights = dict(zip(classes, weights))

      # 각 샘플에 대해 weight 매핑
      w_train = pd.Series(y_train).map(class_weights)
      w_train = compute_sample_weight(class_weight='balanced', y=y_train)

      if S1_balance==True:
        # Train LightGBM
        lgb_model = LGBMClassifier(**self.lgb_params, objective='multiclass', num_class=3)
        lgb_model.fit(X_train, y_train, sample_weight=w_train)

        # Train XGBoost
        xgb_model = XGBClassifier(**self.xgb_params_S1, objective='multi:softmax', num_class=3)
        xgb_model.fit(X_train, y_train,sample_weight=w_train)
      else:
        # Train LightGBM
        lgb_model = LGBMClassifier(**self.lgb_params, objective='multiclass', num_class=3)
        lgb_model.fit(X_train, y_train)

        # Train XGBoost
        xgb_model = XGBClassifier(**self.xgb_params_S1, objective='multi:softmax', num_class=3)
        xgb_model.fit(X_train, y_train)

      # Train TabPFN
      tabpfn_model = TabPFNClassifier(**self.tabpfn_params)
      tabpfn_model.fit(X_train, y_train)

      # Get predictions and ensemble
      lgb_pred_valid = lgb_model.predict_proba(X_valid)
      xgb_pred_valid = xgb_model.predict_proba(X_valid)
      tab_pred_valid = tabpfn_model.predict_proba(X_valid.values)

      pred_valid = np.argmax(lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + tab_C * tab_pred_valid, axis=1)

      f1 = f1_score(y_valid, pred_valid, average='macro')
      val_f1.append(f1)

      multiclass_val_preds = {
          'lgb': lgb_pred_valid,
          'xgb': xgb_pred_valid,
          'tab': tab_pred_valid,
          'true': y_valid
      }

      # Generate all possible weight combinations that sum to 1
      from itertools import product

      step = 0.1
      candidates = np.arange(0, 1.1, step)

      for lgb_A, xgb_B, tab_C in product(candidates, repeat=3):
          total = lgb_A + xgb_B + tab_C
          if np.isclose(total, 1.0):
              weights = (lgb_A, xgb_B, tab_C)
              val_scores = []

              # Binary targets
              for col in targets_binary:
                  preds = binary_val_preds[col]
                  blended = lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab']
                  val_scores.append(f1_score(preds['true'], (blended > 0.5).astype(int), average='macro'))

              # Multiclass target
              preds = multiclass_val_preds
              blended = lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab']
              val_scores.append(f1_score(preds['true'], np.argmax(blended, axis=1), average='macro'))

              best_weights.append(weights)
              best_scores.append(np.mean(val_scores))

      # ------------------------------------------------------------------------------------------------------------------------------------------

      # Sort results and get top
      sorted_indices = np.argsort(best_scores)[::-1]
      top_weights = [best_weights[i] for i in sorted_indices]
      top_scores = [best_scores[i] for i in sorted_indices]

      # eval
      avg_f1 = np.mean(val_f1)
      total_avg_f1s.append(avg_f1)
      detail = " ".join([f"{name}({tname}):{score:.4f}" for name, tname, score in zip(targets_binary + [target_multiclass], targets_binary_name + ['S1'], val_f1)])
      print(f"# 6 Targets F1 avg: {avg_f1:.4f} / [Details] {detail}")

      # binary
      binary_preds = {}
      binary_preds_proba = {}
      for col in targets_binary:

          y = train_df[col]
          is_multiclass = False

          # Train LightGBM
          lgb_model = LGBMClassifier(**self.lgb_params)
          lgb_model.fit(X[xfeatures_dict[col]], y)

          # Train XGBoost
          xgb_model = XGBClassifier(**self.xgb_params)
          xgb_model.fit(X[xfeatures_dict[col]], y)

          # Train TabPFN
          tabpfn_model = TabPFNClassifier(**self.tabpfn_params)
          tabpfn_model.fit(X[xfeatures_dict[col]], y)

          tab_pred = tabpfn_model.predict_proba(test_X[xfeatures_dict[col]])[:, 1]
          lgb_pred = lgb_model.predict_proba(test_X[xfeatures_dict[col]])[:, 1]
          xgb_pred = xgb_model.predict_proba(test_X[xfeatures_dict[col]])[:, 1]

          binary_preds[col] = (lgb_A * lgb_pred + xgb_B * xgb_pred + tab_C * tab_pred > 0.5).astype(int)

          # Store predictions
          binary_test_preds[col] = {
              'lgb': lgb_pred,
              'xgb': xgb_pred,
              'tab': tab_pred
          }

          # Feature importance (using LightGBM's importance)
          fi_df = pd.DataFrame({'feature': X[xfeatures_dict[col]].columns, 'importance': lgb_model.feature_importances_})
          fi_df = fi_df[~fi_df['feature'].str.contains('_te')]
          top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
          feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
          # print(f"[{col}] {feat_str}")


      # ----------
      # multiclass
      # ----------

      y = train_df['S1']

      # CLASS WEIGHTS
      classes = np.unique(y)
      weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
      class_weights = dict(zip(classes, weights))

      # 각 샘플에 대해 weight 매핑
      w_train = pd.Series(y).map(class_weights)
      w_train = compute_sample_weight(class_weight='balanced', y=y)

      is_multiclass = True

      if S1_balance==True:
        # Train LightGBM
        lgb_model = LGBMClassifier(**self.lgb_params, objective='multiclass', num_class=3)
        lgb_model.fit(X[xfeatures_dict['S1']], y, sample_weight=w_train)

        # Train XGBoost
        xgb_model = XGBClassifier(**self.xgb_params_S1, objective='multi:softmax', num_class=3)
        xgb_model.fit(X[xfeatures_dict['S1']], y,sample_weight=w_train)
      else:
        # Train LightGBM
        lgb_model = LGBMClassifier(**self.lgb_params, objective='multiclass', num_class=3)
        lgb_model.fit(X[xfeatures_dict['S1']], y)

        # Train XGBoost
        xgb_model = XGBClassifier(**self.xgb_params_S1, objective='multi:softmax', num_class=3)
        xgb_model.fit(X[xfeatures_dict['S1']], y)

      # Train TabPFN
      tabpfn_model = TabPFNClassifier(**self.tabpfn_params)
      tabpfn_model.fit(X[xfeatures_dict['S1']], y)

      # Get predictions and ensemble
      lgb_pred = lgb_model.predict_proba(test_X[xfeatures_dict['S1']])
      xgb_pred = xgb_model.predict_proba(test_X[xfeatures_dict['S1']])
      tab_pred = tabpfn_model.predict_proba(test_X[xfeatures_dict['S1']])

      multiclass_test_preds = {
          'lgb': lgb_pred,
          'xgb': xgb_pred,
          'tab': tab_pred
      }

      multiclass_pred = np.argmax(lgb_A * lgb_pred + xgb_B * xgb_pred + tab_C * tab_pred, axis=1)
      multiclass_pred_proba = lgb_A * lgb_pred + xgb_B * xgb_pred + tab_C * tab_pred

      # Feature importance
      fi_df = pd.DataFrame({'feature': X[xfeatures_dict['S1']].columns, 'importance': lgb_model.feature_importances_})
      fi_df = fi_df[~fi_df['feature'].str.contains('_te')]
      top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
      feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
      # print(f"[S1] {feat_str}")

      # SAVE PRED
      submission_final['S1'] = multiclass_pred
      for col in targets_binary:
        submission_final[col] = binary_preds[col]
      submission_final = submission_final[['subject_id', 'sleep_date', 'lifelog_date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']]
      fname = f"submission_{np.mean(total_avg_f1s)}.csv"
      submission_final.to_csv(fname, index=False)

      # Top 1 Weight Combinations
      submission_final_dict = {}
      print("\nTop 1 Weight Combinations:")
      for i, (weights, score) in enumerate(zip(top_weights[:1], top_scores[:1])):

          print(f"Rank {i+1}: lgb_A={weights[0]:.1f}, xgb_B={weights[1]:.1f}, tab_C={weights[2]:.1f} - Score: {score:.4f}")
          lgb_A, xgb_B, tab_C = weights

          # Binary predictions
          for col in targets_binary:
              preds = binary_test_preds[col]
              ensemble_pred = (lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab'] > 0.5).astype(int)
              submission_final[col] = ensemble_pred

          # Multiclass prediction
          preds = multiclass_test_preds
          ensemble_pred = np.argmax(lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab'], axis=1)
          submission_final['S1'] = ensemble_pred

          # SAVE SUBMISSIONS
          submission_final_dict[i] = submission_final.copy()
          fname = f"submission_top{i+1}_{score:.4f}.csv"
          submission_final_dict[i].to_csv(fname, index=False)
          print(f"Saved submission to {fname} \n")

      # 모델별 예측결과 비율 비교
      a11 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
      a13 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
      a12 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
      a21 = submission_final_dict[0][['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
      a23 = submission_final_dict[0][['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
      a22 = submission_final_dict[0][['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
      result = pd.concat([a11, a13, a12, a21, a23, a22], axis=1)
      result.columns = ['학습sum','학습len','학습mean','테스트sum','테스트len','테스트mean']
      # print('\n STEP3: 예측결과 비교표')
      # display(result)

      # S1분포
      a1 = train['S1'].value_counts(normalize=True)
      a2 = submission_final['S1'].value_counts(normalize=True)
      S1분포 = pd.concat([a1,a2],axis=1)
      # display(S1분포)

      oof_result = []

      return submission_final_dict[0], oof_result

### 📌 do experiments
1. original
2. mean impute
3. KNN impute (K=5)
4. LLM impute

#### 1. eval : original

In [51]:
%%time

"""
# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6841 / [Details] Q1(기상직후수면질):0.7163 Q2(취침전신체적피로):0.7802 Q3(취침전스트레스):0.7151 S2(수면효율):0.6698 S3(수면잠들기시간):0.7475 S1(S1):0.4759

Top 1 Weight Combinations:
Rank 1: lgb_A=0.1, xgb_B=0.2, tab_C=0.7 - Score: 0.6998
Saved submission to submission_top1_0.6998.csv

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6956 / [Details] Q1(기상직후수면질):0.7043 Q2(취침전신체적피로):0.8081 Q3(취침전스트레스):0.6907 S2(수면효율):0.6870 S3(수면잠들기시간):0.7041 S1(S1):0.5793

Top 1 Weight Combinations:
Rank 1: lgb_A=0.2, xgb_B=0.0, tab_C=0.8 - Score: 0.7060
Saved submission to submission_top1_0.7060.csv

CPU times: user 22min 44s, sys: 2.81 s, total: 22min 47s
Wall time: 3min 20s
"""

# Experiment 1: run the base model on the data returned by
# ETRI.read_originaldata(), once per validation-ID set.
ETRI = etri_lifelog_pipeline()

# The validation IDs are stored on the pipeline object as CSV text.
valid_ids1 = pd.read_csv(StringIO(ETRI.valid_id1), sep=',')
valid_ids2 = pd.read_csv(StringIO(ETRI.valid_id2), sep=',')

for valid_id_num in ['1', '2']:

    # Work on a private copy of the validation-ID table for this round.
    valid_ids = {'1': valid_ids1, '2': valid_ids2}[valid_id_num].copy()

    # Strip whitespace from the header names, then build the 'pk' key as
    # subject_id + sleep_date with leading/trailing whitespace removed.
    valid_ids.columns = valid_ids.columns.str.strip()
    valid_ids['pk'] = (
        valid_ids['subject_id'] + valid_ids['sleep_date']
    ).str.strip().tolist()
    print(f"# valid size:{len(valid_ids)}")

    # Load and preprocess the data on every round.
    train, test = ETRI.read_originaldata()
    train, test = ETRI.preprocess(train, test)

    # Every target is given the same shared parameter set.
    best_param_dict = {
        target: ETRI.common_params
        for target in ['Q3', 'S1', 'S2', 'S3', 'Q1', 'Q2']
    }

    # Train, evaluate on the validation IDs and produce the submission.
    submission_final, oof_result = ETRI.run_basemodel(
        train,
        test,
        valid_ids,
        best_param_dict,
        n_splits=5,
        random_state=42,
        S1_balance=True,
    )

# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6841 / [Details] Q1(기상직후수면질):0.7163 Q2(취침전신체적피로):0.7802 Q3(취침전스트레스):0.7151 S2(수면효율):0.6698 S3(수면잠들기시간):0.7475 S1(S1):0.4759

Top 1 Weight Combinations:
Rank 1: lgb_A=0.1, xgb_B=0.2, tab_C=0.7 - Score: 0.6998
Saved submission to submission_top1_0.6998.csv 

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6956 / [Details] Q1(기상직후수면질):0.7043 Q2(취침전신체적피로):0.8081 Q3(취침전스트레스):0.6907 S2(수면효율):0.6870 S3(수면잠들기시간):0.7041 S1(S1):0.5793

Top 1 Weight Combinations:
Rank 1: lgb_A=0.2, xgb_B=0.0, tab_C=0.8 - Score: 0.7060
Saved submission to submission_top1_0.7060.csv 

CPU times: user 22min 44s, sys: 2.81 s, total: 22min 47s
Wall time: 3min 20s


#### 2. eval : mean impute

In [50]:
%%time

"""
# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6684 / [Details] Q1(기상직후수면질):0.6931 Q2(취침전신체적피로):0.8107 Q3(취침전스트레스):0.6800 S2(수면효율):0.6190 S3(수면잠들기시간):0.7234 S1(S1):0.4843

Top 1 Weight Combinations:
Rank 1: lgb_A=0.2, xgb_B=0.0, tab_C=0.8 - Score: 0.7152
Saved submission to submission_top1_0.7152.csv

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6912 / [Details] Q1(기상직후수면질):0.6945 Q2(취침전신체적피로):0.7797 Q3(취침전스트레스):0.6701 S2(수면효율):0.6955 S3(수면잠들기시간):0.7165 S1(S1):0.5907

Top 1 Weight Combinations:
Rank 1: lgb_A=0.0, xgb_B=0.1, tab_C=0.9 - Score: 0.7028
Saved submission to submission_top1_0.7028.csv

CPU times: user 23min 21s, sys: 2.73 s, total: 23min 24s
Wall time: 3min 25s
"""

# Experiment 2: run the base model on the data returned by
# ETRI.read_meanimputedata(), once per validation-ID set.
ETRI = etri_lifelog_pipeline()

# The validation IDs are stored on the pipeline object as CSV text.
valid_ids1 = pd.read_csv(StringIO(ETRI.valid_id1), sep=',')
valid_ids2 = pd.read_csv(StringIO(ETRI.valid_id2), sep=',')

for valid_id_num in ['1', '2']:

    # Work on a private copy of the validation-ID table for this round.
    valid_ids = {'1': valid_ids1, '2': valid_ids2}[valid_id_num].copy()

    # Strip whitespace from the header names, then build the 'pk' key as
    # subject_id + sleep_date with leading/trailing whitespace removed.
    valid_ids.columns = valid_ids.columns.str.strip()
    valid_ids['pk'] = (
        valid_ids['subject_id'] + valid_ids['sleep_date']
    ).str.strip().tolist()
    print(f"# valid size:{len(valid_ids)}")

    # Load and preprocess the data on every round.
    train, test = ETRI.read_meanimputedata()
    train, test = ETRI.preprocess(train, test)

    # Every target is given the same shared parameter set.
    best_param_dict = {
        target: ETRI.common_params
        for target in ['Q3', 'S1', 'S2', 'S3', 'Q1', 'Q2']
    }

    # Train, evaluate on the validation IDs and produce the submission.
    submission_final, oof_result = ETRI.run_basemodel(
        train,
        test,
        valid_ids,
        best_param_dict,
        n_splits=5,
        random_state=42,
        S1_balance=True,
    )

# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6684 / [Details] Q1(기상직후수면질):0.6931 Q2(취침전신체적피로):0.8107 Q3(취침전스트레스):0.6800 S2(수면효율):0.6190 S3(수면잠들기시간):0.7234 S1(S1):0.4843

Top 1 Weight Combinations:
Rank 1: lgb_A=0.2, xgb_B=0.0, tab_C=0.8 - Score: 0.7152
Saved submission to submission_top1_0.7152.csv 

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6912 / [Details] Q1(기상직후수면질):0.6945 Q2(취침전신체적피로):0.7797 Q3(취침전스트레스):0.6701 S2(수면효율):0.6955 S3(수면잠들기시간):0.7165 S1(S1):0.5907

Top 1 Weight Combinations:
Rank 1: lgb_A=0.0, xgb_B=0.1, tab_C=0.9 - Score: 0.7028
Saved submission to submission_top1_0.7028.csv 

CPU times: user 23min 21s, sys: 2.73 s, total: 23min 24s
Wall time: 3min 25s


#### 3. eval: KNN impute

In [49]:
%%time

"""
# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6989 / [Details] Q1(기상직후수면질):0.6970 Q2(취침전신체적피로):0.8400 Q3(취침전스트레스):0.7151 S2(수면효율):0.6992 S3(수면잠들기시간):0.7234 S1(S1):0.5185

Top 1 Weight Combinations:
Rank 1: lgb_A=0.2, xgb_B=0.1, tab_C=0.7 - Score: 0.7116
Saved submission to submission_top1_0.7116.csv

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6867 / [Details] Q1(기상직후수면질):0.7043 Q2(취침전신체적피로):0.8074 Q3(취침전스트레스):0.6244 S2(수면효율):0.6870 S3(수면잠들기시간):0.7165 S1(S1):0.5805

Top 1 Weight Combinations:
Rank 1: lgb_A=0.4, xgb_B=0.0, tab_C=0.6 - Score: 0.7034
Saved submission to submission_top1_0.7034.csv

CPU times: user 21min 29s, sys: 2.66 s, total: 21min 32s
Wall time: 3min 9s
"""

# Experiment 3: run the base model on the data returned by
# ETRI.read_knnimputedata(), once per validation-ID set.
ETRI = etri_lifelog_pipeline()

# The validation IDs are stored on the pipeline object as CSV text.
valid_ids1 = pd.read_csv(StringIO(ETRI.valid_id1), sep=',')
valid_ids2 = pd.read_csv(StringIO(ETRI.valid_id2), sep=',')

for valid_id_num in ['1', '2']:

    # Work on a private copy of the validation-ID table for this round.
    valid_ids = {'1': valid_ids1, '2': valid_ids2}[valid_id_num].copy()

    # Strip whitespace from the header names, then build the 'pk' key as
    # subject_id + sleep_date with leading/trailing whitespace removed.
    valid_ids.columns = valid_ids.columns.str.strip()
    valid_ids['pk'] = (
        valid_ids['subject_id'] + valid_ids['sleep_date']
    ).str.strip().tolist()
    print(f"# valid size:{len(valid_ids)}")

    # Load and preprocess the data on every round.
    train, test = ETRI.read_knnimputedata()
    train, test = ETRI.preprocess(train, test)

    # Every target is given the same shared parameter set.
    best_param_dict = {
        target: ETRI.common_params
        for target in ['Q3', 'S1', 'S2', 'S3', 'Q1', 'Q2']
    }

    # Train, evaluate on the validation IDs and produce the submission.
    submission_final, oof_result = ETRI.run_basemodel(
        train,
        test,
        valid_ids,
        best_param_dict,
        n_splits=5,
        random_state=42,
        S1_balance=True,
    )

# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6964 / [Details] Q1(기상직후수면질):0.6748 Q2(취침전신체적피로):0.7867 Q3(취침전스트레스):0.7484 S2(수면효율):0.7234 S3(수면잠들기시간):0.7234 S1(S1):0.5218

Top 1 Weight Combinations:
Rank 1: lgb_A=0.4, xgb_B=0.2, tab_C=0.4 - Score: 0.7164
Saved submission to submission_top1_0.7164.csv 

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6988 / [Details] Q1(기상직후수면질):0.7141 Q2(취침전신체적피로):0.8273 Q3(취침전스트레스):0.6702 S2(수면효율):0.6955 S3(수면잠들기시간):0.7165 S1(S1):0.5693

Top 1 Weight Combinations:
Rank 1: lgb_A=0.3, xgb_B=0.1, tab_C=0.6 - Score: 0.7186
Saved submission to submission_top1_0.7186.csv 

CPU times: user 22min 25s, sys: 2.72 s, total: 22min 27s
Wall time: 3min 16s


#### 4. eval : LLM impute


In [48]:
%%time

"""
# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.7218 / [Details] Q1(기상직후수면질):0.7206 Q2(취침전신체적피로):0.8400 Q3(취침전스트레스):0.7484 S2(수면효율):0.7749 S3(수면잠들기시간):0.7234 S1(S1):0.5235

Top 1 Weight Combinations:
Rank 1: lgb_A=0.3, xgb_B=0.0, tab_C=0.7 - Score: 0.7238
Saved submission to submission_top1_0.7238.csv

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6902 / [Details] Q1(기상직후수면질):0.7141 Q2(취침전신체적피로):0.7781 Q3(취침전스트레스):0.6785 S2(수면효율):0.6654 S3(수면잠들기시간):0.7041 S1(S1):0.6008

Top 1 Weight Combinations:
Rank 1: lgb_A=0.1, xgb_B=0.1, tab_C=0.8 - Score: 0.7081
Saved submission to submission_top1_0.7081.csv

CPU times: user 23min 37s, sys: 2.83 s, total: 23min 40s
Wall time: 3min 27s
"""

# Experiment 4: run the base model on the data returned by
# ETRI.read_llmimputedata(), once per validation-ID set.
ETRI = etri_lifelog_pipeline()

# The validation IDs are stored on the pipeline object as CSV text.
valid_ids1 = pd.read_csv(StringIO(ETRI.valid_id1), sep=',')
valid_ids2 = pd.read_csv(StringIO(ETRI.valid_id2), sep=',')

for valid_id_num in ['1', '2']:

    # Work on a private copy of the validation-ID table for this round.
    valid_ids = {'1': valid_ids1, '2': valid_ids2}[valid_id_num].copy()

    # Strip whitespace from the header names, then build the 'pk' key as
    # subject_id + sleep_date with leading/trailing whitespace removed.
    valid_ids.columns = valid_ids.columns.str.strip()
    valid_ids['pk'] = (
        valid_ids['subject_id'] + valid_ids['sleep_date']
    ).str.strip().tolist()
    print(f"# valid size:{len(valid_ids)}")

    # Load and preprocess the data on every round.
    train, test = ETRI.read_llmimputedata()
    train, test = ETRI.preprocess(train, test)

    # Every target is given the same shared parameter set.
    best_param_dict = {
        target: ETRI.common_params
        for target in ['Q3', 'S1', 'S2', 'S3', 'Q1', 'Q2']
    }

    # Train, evaluate on the validation IDs and produce the submission.
    # NOTE(review): this cell passes the notebook-level name `seed`, whereas
    # the other experiment cells pass the literal 42 -- confirm `seed` is
    # defined before this cell runs and holds the intended value.
    submission_final, oof_result = ETRI.run_basemodel(
        train,
        test,
        valid_ids,
        best_param_dict,
        n_splits=5,
        random_state=seed,
        S1_balance=True,
    )

# valid size:40
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.7218 / [Details] Q1(기상직후수면질):0.7206 Q2(취침전신체적피로):0.8400 Q3(취침전스트레스):0.7484 S2(수면효율):0.7749 S3(수면잠들기시간):0.7234 S1(S1):0.5235

Top 1 Weight Combinations:
Rank 1: lgb_A=0.3, xgb_B=0.0, tab_C=0.7 - Score: 0.7238
Saved submission to submission_top1_0.7238.csv 

# valid size:105
# lgb_A:0.4 xgb_B:0.3 tab_C:0.3
# 6 Targets F1 avg: 0.6902 / [Details] Q1(기상직후수면질):0.7141 Q2(취침전신체적피로):0.7781 Q3(취침전스트레스):0.6785 S2(수면효율):0.6654 S3(수면잠들기시간):0.7041 S1(S1):0.6008

Top 1 Weight Combinations:
Rank 1: lgb_A=0.1, xgb_B=0.1, tab_C=0.8 - Score: 0.7081
Saved submission to submission_top1_0.7081.csv 

CPU times: user 23min 37s, sys: 2.83 s, total: 23min 40s
Wall time: 3min 27s
