<a href="https://colab.research.google.com/github/jacobgreen4477/The-4th-ETRI-AI-Human-Understanding-Competition/blob/main/etri_template_vF1_0_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> title : 제 4회 ETRI 휴먼이해 인공지능 논문경진대회 <br>
> author : hjy,byc <br>

### 📦 라이브러리

In [None]:
! pip install haversine >/dev/null
! pip install optuna  >/dev/null
! pip install category_encoders >/dev/null
! pip install tabpfn  >/dev/null
! pip install catboost >/dev/null
! pip install torchmetrics >/dev/null

In [None]:
# 기본 모듈
import os
import sys
import re
import ast
import glob
import random
import warnings
from collections import Counter
from math import radians, cos, sin, asin, sqrt
from functools import reduce
from datetime import datetime, timedelta, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 머신러닝
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from tabpfn import TabPFNClassifier

# PyTorch
import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Hugging Face
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    LlamaTokenizer,
    LlamaForCausalLM,
    LlamaForSequenceClassification
)

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    TaskType
)

# Evaluation & Utilities
from torchmetrics import Accuracy

# 기타
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm  # 필요 시 구분
from scipy.stats import entropy
from haversine import haversine
from io import StringIO
import gc

# wandb
import wandb
wandb.login(key="5fa8dfb2c5be3c888bfe0101437a8fa22fbdf0e0")
wandb.init(project="etri_lifelog", entity="byc3230")

# 옵션
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

# 기타
warnings.filterwarnings('ignore')

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbyc3230[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(1)

In [None]:
string = """
subject_id,sleep_date
id01,2024-07-24
id01,2024-08-26
id01,2024-08-28
id01,2024-08-29
id02,2024-08-23
id02,2024-09-24
id02,2024-09-26
id02,2024-09-27
id03,2024-08-30
id03,2024-09-01
id03,2024-09-02
id03,2024-09-06
id04,2024-09-03
id04,2024-10-10
id04,2024-10-12
id04,2024-10-13
id05,2024-10-19
id05,2024-10-23
id05,2024-10-24
id05,2024-10-27
id06,2024-07-25
id06,2024-07-26
id06,2024-07-27
id06,2024-07-30
id07,2024-07-07
id07,2024-08-02
id07,2024-08-04
id07,2024-08-05
id08,2024-08-28
id08,2024-08-29
id08,2024-08-30
id08,2024-09-02
id09,2024-08-02
id09,2024-08-31
id09,2024-09-02
id09,2024-09-03
id10,2024-08-28
id10,2024-08-30
id10,2024-08-31
id10,2024-09-03
"""

# DataFrame 생성
valid_ids = pd.read_csv(StringIO(string), sep=',')
valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']

### 📦 데이터 읽기

In [None]:
from google.colab import drive, files
drive.mount('/content/drive')

path = '/content/drive/MyDrive/data/ch2025_data_items/share/'

train = pd.read_parquet(f'{path}train_63775.parquet')
test = pd.read_parquet(f'{path}test_63775.parquet')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# drop_features = ['afterwork_max_label','sleeptime_max_label','worktime_max_label']
drop_features = ['top_bssid'] # ,'week_type','week_type_lag1'
drop_features = [i for i in drop_features if i in train.columns.tolist()]
print('# drop_features:',drop_features)
train = train.drop(columns=drop_features)
test = test.drop(columns=drop_features)

# drop_features: []


In [None]:
# ---
# 추정수면효율
# ---

def calculate_sleep_duration_min(sleep_time, wake_time):
    """
    취침 시각(sleep_time)과 기상 시각(wake_time)을 입력받아 수면 시간(분) 반환
    단위는 float 시간 (예: 23.5, 6.25)
    """
    if pd.isna(sleep_time) or pd.isna(wake_time):
        return None
    if wake_time < sleep_time:
        wake_time += 24  # 자정 넘긴 경우 보정
    duration = (wake_time - sleep_time) * 60
    return round(duration)

train['불끈시간부터기상시간'] = train.apply(lambda x: calculate_sleep_duration_min(x['lights_off_time'],x['wake_time']),axis=1)
test['불끈시간부터기상시간'] = test.apply(lambda x: calculate_sleep_duration_min(x['lights_off_time'],x['wake_time']),axis=1)

train['추정수면효율'] = train['불끈시간부터기상시간']/train['sleep_duration_min']
test['추정수면효율'] = test['불끈시간부터기상시간']/test['sleep_duration_min']

# 이상값 제거
train['추정수면효율'] = np.where(train['추정수면효율']<-5,np.nan,train['추정수면효율'])
test['추정수면효율'] = np.where(test['추정수면효율']<-5,np.nan,test['추정수면효율'])
train['추정수면효율'] = np.where(train['추정수면효율']>5,np.nan,train['추정수면효율'])
test['추정수면효율'] = np.where(test['추정수면효율']>55,np.nan,test['추정수면효율'])

In [None]:
# 요일 컬럼 추가 (예: 월요일, 화요일, ...)
train['lifelog_date'] = pd.to_datetime(train['lifelog_date'])
test['lifelog_date'] = pd.to_datetime(test['lifelog_date'])

# 요일
weekday_map = {
    0: '월요일', 1: '화요일', 2: '수요일', 3: '목요일',
    4: '금요일', 5: '토요일', 6: '일요일'
}
train['weekday'] = train['lifelog_date'].dt.dayofweek.map(weekday_map)
test['weekday'] = test['lifelog_date'].dt.dayofweek.map(weekday_map)

# 월
train['month'] = train['lifelog_date'].dt.month
test['month'] = test['lifelog_date'].dt.month

# weekend
train['weekend'] = np.where(train['weekday'].isin(['토요일','일요일']),1,0)
test['weekend'] = np.where(test['weekday'].isin(['토요일','일요일']),1,0)

# 공휴일
공휴일 = [
     '2024-08-15'
    ,'2024-09-16'
    ,'2024-09-17'
    ,'2024-09-18'
    ,'2024-10-03'
    ,'2024-10-09'
]
train['공휴일'] = np.where(train['lifelog_date'].isin(공휴일),1,0)
test['공휴일'] = np.where(test['lifelog_date'].isin(공휴일),1,0)

# 주말 + 공휴일 묶어주기
train['weekend_holilday'] = np.where( ((train['weekend']==0) & (train['공휴일']==1)), 1, train['weekend'])
test['weekend_holilday'] = np.where( ((test['weekend']==0) & (test['공휴일']==1)), 1, test['weekend'])

In [None]:
def add_prev_day_flag(df):
    df = df.copy()
    df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])

    # 각 subject_id별로 전날 날짜 만들기
    df['prev_day'] = df['lifelog_date'] - pd.Timedelta(days=1)

    # subject_id + 날짜 기준으로 원본 키 구성
    key_set = set(zip(df['subject_id'], df['lifelog_date']))

    # 전날 데이터가 존재하면 1, 없으면 0
    df['has_prev_day_data'] = df[['subject_id', 'prev_day']].apply(
        lambda row: 1 if (row['subject_id'], row['prev_day']) in key_set else 0, axis=1
    )

    return df.drop(columns=['prev_day'])

train = add_prev_day_flag(train)
test = add_prev_day_flag(test)

In [None]:
# ---
# 추정휴가
# ---

def rule_based_sum(x):
    rules = (
        # (x['sleep_duration_min'] > (x['avg_sleep_duration']+30))
          (x['sleep_duration_min'] > (x['avg_sleep_duration']+60))
        & (x['week_type'] == 'weekday')
        # & (x['month'].isin([7,8]))
    )
    return rules

train['vacation'] = train.groupby('subject_id').apply(rule_based_sum).reset_index(level=0, drop=True).astype(int)
test['vacation'] = test.groupby('subject_id').apply(rule_based_sum).reset_index(level=0, drop=True).astype(int)

# check
test.groupby(['subject_id'])['vacation'].sum().head()

Unnamed: 0_level_0,vacation
subject_id,Unnamed: 1_level_1
id01,2
id02,3
id03,4
id04,9
id05,4


In [None]:
# 숫자형 컬럼만 선택해서 결측값 -1로 채우기
train[train.select_dtypes(include='number').columns] = train.select_dtypes(include='number').fillna(-1)
test[test.select_dtypes(include='number').columns] = test.select_dtypes(include='number').fillna(-1)

### ============================

### CustomDataset

In [None]:
class CustomDataset(Dataset):
    def __init__(self, twitterDF):
        self.twitterDF = twitterDF

    def __len__(self):
        return len(self.twitterDF.index)

    def __getitem__(self, idx):
        return np.array([idx])

### train_iter

In [None]:
def train_iter(model, loader, optimizer, criterion, twitterDFTrain_clean):
    model.train()

    total_loss = 0
    correct = 0
    pred = []
    label = []
    for batchIdx, sampledIdx in enumerate(tqdm(loader, position=0, leave=True)):

        optimizer.zero_grad()

        #text
        sampledIdx = sampledIdx.cpu().data.numpy()
        sampledRowText = list(twitterDFTrain_clean["text"].iloc[list(sampledIdx.flatten())])

        #label
        sampledRowLabels = torch.tensor(list(twitterDFTrain_clean["label"].iloc[list(sampledIdx.flatten())])).to(device)

        #encoded
        encoded_input = tokenizer(sampledRowText, truncation=True, padding=True, return_tensors='pt').to(device) # Output shape: [bs, num_Labels]
        encoded_inputIds = encoded_input["input_ids"].to(device)
        encoded_attnMask = encoded_input["attention_mask"].to(device)

        #model
        outputs = model(input_ids=encoded_inputIds, attention_mask=encoded_attnMask)

        # label type change
        sampledRowLabels = sampledRowLabels.to(outputs.logits.device).long()  # shape: [1]

        # loss
        loss = criterion(outputs.logits, sampledRowLabels)
        total_loss += loss.item()

        #acurracy
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(outputs.logits, dim=-1)
        pred.extend(predicted_class.flatten().cpu().data.numpy())
        label.extend(sampledRowLabels.cpu().data.numpy())

        #back propagation
        loss.backward()
        optimizer.step()

    train_accuracy_score = metrics.f1_score(label,pred, average='macro')
    return total_loss / len(loader), train_accuracy_score

### valid_iter

In [None]:
def valid_iter(model, valid_loader, criterion, twitterDFVal_clean):
    model.eval()
    with torch.no_grad():

        total_loss = 0
        correct = 0
        pred = []
        label = []
        for batchIdx, sampledIdx in enumerate(tqdm(valid_loader, position=0, leave=True)):

            sampledRowText = list(twitterDFVal_clean["text"].iloc[list(sampledIdx.flatten())])
            sampledRowLabels = torch.tensor(list(twitterDFVal_clean["label"].iloc[list(sampledIdx.flatten())]))
            encoded_input = tokenizer(sampledRowText, truncation=True, padding=True, return_tensors='pt').to(device) # Output shape: [bs, num_Labels]
            encoded_inputIds = encoded_input["input_ids"].to(device)
            encoded_attnMask = encoded_input["attention_mask"].to(device)

            #model
            outputs = model(input_ids=encoded_inputIds, attention_mask=encoded_attnMask)

            # label type change
            sampledRowLabels = sampledRowLabels.to(outputs.logits.device).long()  # shape: [1]

            # loss
            loss = criterion(outputs.logits, sampledRowLabels)
            total_loss += loss.item()

            #acurracy
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class = torch.argmax(outputs.logits, dim=-1)
            pred.extend(predicted_class.flatten().cpu().data.numpy())
            label.extend(sampledRowLabels.cpu().data.numpy())

        valid_accuracy_score = metrics.f1_score(label,pred, average='macro')

    return total_loss/len(valid_loader), valid_accuracy_score

### FocalLoss

In [None]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha  # optional: list or tensor of class weights
        self.reduction = reduction

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', weight=self.alpha)
        pt = torch.exp(-ce_loss)  # prevents nans when probability is 0
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

### get_oof_predictions

In [None]:
def get_oof_predictions(X, y, lgb_params, xgb_params, n_splits=5, is_multiclass=False, num_class=None, early_stop=False):
    oof_preds_lgb = np.zeros(len(X))
    oof_preds_xgb = np.zeros(len(X))
    oof_preds_cat = np.zeros(len(X))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        if is_multiclass:
            llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path, num_labels=3, torch_dtype=torch.float16, device_map='auto')
            lora_model = get_peft_model(llm_model, config)
        else:
            llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path, num_labels=2, torch_dtype=torch.float16, device_map='auto')
            lora_model = get_peft_model(llm_model, config)
        # LightGBM
        if is_multiclass:
            lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=num_class)
        else:
            lgb_model = LGBMClassifier(**lgb_params)

        # XGBoost
        if is_multiclass:
            xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=num_class)
        else:
            xgb_model = XGBClassifier(**xgb_params)

        # CatBoost
        if is_multiclass:
            cat_model = CatBoostClassifier(**common_params_cat2, objective='MultiClass', classes_count=num_class)
        else:
            cat_model = CatBoostClassifier(**common_params_cat)

        if early_stop:
            lgb_model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                callbacks=[early_stopping(stopping_rounds=100, verbose=False)]
            )
            xgb_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
            cat_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
        else:
            X_train_llm = X_train
            X_valid_llm = X_valid

            def prepare_input_text(row):
                target_cols = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']
                feature_cols = [col for col in X_train.columns if col not in target_cols]

                instruct_txt = ""
                if is_multiclass:
                    instruct_txt = "Classify 0 or 1 or 2. ###DATA### "
                else:
                    instruct_txt = "Classify 0 or 1. ###DATA### "

                for clm in feature_cols:
                    instruct_txt += f"{clm} : {row[clm]}, "

                return instruct_txt.strip()[:-1]+'.'

            X_train_llm['input'] = X_train_llm.apply(lambda x: prepare_input_text(x),axis=1)
            X_valid_llm['input'] = X_valid_llm.apply(lambda x: prepare_input_text(x),axis=1)

            X_train_llm["label"] = y_train
            X_valid_llm['label'] = y_valid
            X_train_llm["text"] = X_train_llm["input"]
            X_valid_llm["text"] = X_valid_llm["input"]

            training_data = CustomDataset(X_train_llm)
            validation_data = CustomDataset(X_valid_llm)

            train_dataloader = DataLoader(training_data, batch_size=1, shuffle=True)
            val_dataloader = DataLoader(validation_data, batch_size=1, shuffle=False)

            twitterDFTrain_clean = X_train_llm
            twitterDFVal_clean = X_valid_llm

            if is_multiclass:
                total_loss = 0
                correct = 0
                bad_counter = 0
                #best model 를 찾기 위한
                best = np.inf

                for epoch in range(epochs):
                    avg_train_loss, avg_train_acc  = train_iter(llm_model, train_dataloader, optimizer, criterion, twitterDFTrain_clean)

                    avg_vaild_loss, avg_vaild_acc = valid_iter(llm_model, val_dataloader, criterion, twitterDFVal_clean)

                    if avg_vaild_loss < best:
                        best = avg_vaild_loss
                        torch.save(llm_model.state_dict(), best_model_path)
                        bad_counter = 0
                    else:
                        bad_counter += 1

                    if bad_counter == patience:
                        break

                    print(f'Epoch: {str(epoch+1)}: t_loss:{avg_train_loss:.3f} t_acc:{avg_train_acc:.3f} v_loss:{avg_vaild_loss:.3f} v_acc:{avg_vaild_acc:.3f}')

                    wandb.log({"train": {'epoch': epoch, "acc": avg_train_acc, "loss": avg_train_loss}, "val": {'epoch': epoch, "acc": avg_vaild_acc, "loss": avg_vaild_loss}})
            else:
                total_loss = 0
                correct = 0
                bad_counter = 0
                #best model 를 찾기 위한
                best = np.inf
                for epoch in range(epochs):
                    avg_train_loss, avg_train_acc  = train_iter(llm_model, train_dataloader, optimizer, criterion, twitterDFTrain_clean)

                    avg_vaild_loss, avg_vaild_acc = valid_iter(llm_model, val_dataloader, criterion, twitterDFVal_clean)

                    if avg_vaild_loss < best:
                        best = avg_vaild_loss
                        torch.save(llm_model.state_dict(), best_model_path)
                        bad_counter = 0
                    else:
                        bad_counter += 1

                    if bad_counter == patience:
                        break

                    print(f'Epoch: {str(epoch+1)}: t_loss:{avg_train_loss:.3f} t_acc:{avg_train_acc:.3f} v_loss:{avg_vaild_loss:.3f} v_acc:{avg_vaild_acc:.3f}')

                    wandb.log({"train": {'epoch': epoch, "acc": avg_train_acc, "loss": avg_train_loss}, "val": {'epoch': epoch, "acc": avg_vaild_acc, "loss": avg_vaild_loss}})

            if is_multiclass:

                # 클래스 weight 계산
                classes = np.unique(y_train)
                weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
                class_weights = dict(zip(classes, weights))

                # 각 샘플에 대해 weight 매핑
                w_train = pd.Series(y_train).map(class_weights)
                #print(w_train)

                w_train = compute_sample_weight(class_weight='balanced', y=y_train)
                print(w_train)

                lgb_model.fit(X_train, y_train, sample_weight=w_train)
                xgb_model.fit(X_train, y_train, sample_weight=w_train)
                cat_model.fit(X_train, y_train)
            else:
                lgb_model.fit(X_train, y_train)
                xgb_model.fit(X_train, y_train)
                cat_model.fit(X_train, y_train)


        # Get predictions
        lgb_preds = lgb_model.predict(X_valid)
        xgb_preds = xgb_model.predict(X_valid)
        cat_preds = cat_model.predict(X_valid).ravel()  # ✅ 2차원 → 1차원

        # Store predictions
        oof_preds_lgb[valid_idx] = lgb_preds
        oof_preds_xgb[valid_idx] = xgb_preds
        oof_preds_cat[valid_idx] = cat_preds

    # Ensemble predictions (7:3 ratio)
    oof_preds = lgb_A * oof_preds_lgb + xgb_B * oof_preds_xgb + cat_C * oof_preds_cat

    if not is_multiclass:
        oof_preds = (oof_preds > 0.5).astype(int)
    else:
        oof_preds = np.round(oof_preds).astype(int)

    return oof_preds

### inference

In [None]:
def inference(model, loader, test_data, col):
    #best model 불러와서 call 하기
    model.load_state_dict(torch.load(col+"_"+best_model_path))
    model.eval()

    preds = []
    preds_prob = []
    with torch.no_grad():
        total_loss = 0
        correct = 0

        pred = []
        label = []

        for batchIdx, sampledIdx in enumerate(tqdm(loader, position=0, leave=True)):

            sampledRowText = list(test_data["text"].iloc[list(sampledIdx.flatten())])

            encoded_input = tokenizer(sampledRowText, truncation=True, padding=True, return_tensors='pt').to("cuda") # Output shape: [bs, num_Labels]
            encoded_inputIds = encoded_input["input_ids"].to("cuda")
            encoded_attnMask = encoded_input["attention_mask"].to("cuda")

            outputs = model(input_ids=encoded_inputIds, attention_mask=encoded_attnMask)
            logits = outputs.logits


            #acurracy
            probs = torch.nn.functional.softmax(outputs.logits.cpu(), dim=-1)

            #확률구하기
            preds_prob.extend(probs.cpu().data.numpy())

            #acurracy
            pred.extend(torch.argmax(logits, dim=1).flatten().cpu().data.numpy())

    return pred, preds_prob

### run_basemodel

In [28]:
def run_basemodel(train, test, valid_ids, common_params, n_splits, random_state=42, early_stop=False):

    #version 33로 진행완료 Best
    lgb_A = 0.3
    xgb_B = 0.3
    cat_C = 0.3
    llm_D = 0.1

    print("=========valid_ids==========")
    # print(valid_ids)
    train_df = train.copy()
    test_df = test.copy()

    submission_final = test_df[['subject_id', 'sleep_date', 'lifelog_date']].copy()
    submission_final['lifelog_date'] = pd.to_datetime(submission_final['lifelog_date']).dt.date

    # 타겟
    targets_binary = ['Q1', 'Q2', 'Q3', 'S2', 'S3']
    targets_binary_name = ['기상직후수면질','취침전신체적피로','취침전스트레스','수면효율','수면잠들기시간']
    target_multiclass = 'S1'
    all_targets = targets_binary + [target_multiclass]

    # 노이즈 수준 설정
    def add_noise(series, noise_level, seed=3):
        rng = np.random.default_rng(seed)
        return series * (1 + noise_level * rng.standard_normal(len(series)))

    noise_level = 0.015  # 필요에 따라 조정

    # 타겟인코딩
    for tgt in all_targets:

      encoder_feats = ['subject_id','month','weekend'] # 'weekday', 'subject_id','month','weekend'

      #### 타겟인코딩1

      subject_mean = train_df.groupby(encoder_feats)[tgt].mean().rename(f'{tgt}_te')
      train_df = train_df.merge(subject_mean, on=encoder_feats, how='left')
      test_df = test_df.merge(subject_mean, on=encoder_feats, how='left')
      global_mean = train_df[tgt].mean()
      test_df[f'{tgt}_te'] = test_df[f'{tgt}_te'].fillna(global_mean)

      # 노이즈 추가
      train_df[f'{tgt}_te'] = add_noise(train_df[f'{tgt}_te'], noise_level)
      test_df[f'{tgt}_te'] = add_noise(test_df[f'{tgt}_te'], noise_level)

      #### 타겟인코딩2

      # 새로운 범주형 열 생성
      train_df['TMP'] = train_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)
      test_df['TMP'] = test_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)

      # 인코더
      encoder = TargetEncoder(cols=['TMP'], smoothing=300) # 40
      encoder.fit(train_df[['TMP']], train_df[tgt])

      # 인코딩 결과를 새로운 열에 저장
      train_df[f'{tgt}_te2'] = encoder.transform(train_df[['TMP']])
      test_df[f'{tgt}_te2'] = encoder.transform(test_df[['TMP']])

      # 노이즈 추가
      train_df[f'{tgt}_te2'] = add_noise(train_df[f'{tgt}_te2'], noise_level)
      test_df[f'{tgt}_te2'] = add_noise(test_df[f'{tgt}_te2'], noise_level)

      # 불필요한 변수 제거
      train_df = train_df.drop(columns=['TMP'])
      test_df = test_df.drop(columns=['TMP'])


    # 인코딩
    PK = ['sleep_date', 'lifelog_date', 'subject_id']
    encoder = LabelEncoder()
    categorical_features = [i for i in train_df.select_dtypes(include=['object', 'category']).columns if i not in PK+['pk']]
    for col in categorical_features:
        print(col)
        train_df[col] = encoder.fit_transform(train_df[col])
        test_df[col] = encoder.fit_transform(test_df[col])


    # X
    X = train_df.drop(columns=PK + all_targets)
    test_X = test_df.drop(columns=PK + all_targets)
    print(f'# X shape: {X.shape}')
    print(f'# test_X shape: {test_X.shape}')

    print('\n STEP1: 실험 결과 확인')
    print("=============== Validation Results ==============")
    total_avg_f1s = []
    best_iteration_temp = {k: [] for k in all_targets}

    val_f1 = []

    binary_val_preds = {}
    multiclass_val_preds = {}

    binary_test_preds = {}
    multiclass_test_preds = {}

    test_preds = {}

    # Find optimal weights
    best_weights = []
    best_scores = []

    for col in targets_binary:
        # binary
        y = train_df[col]

        valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']
        train_df['pk'] = train_df['subject_id']+train_df['sleep_date']

        X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
        X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
        y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
        y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

        # Get parameters for both models
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
            'n_estimators': 1000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state
        }

        # Train LLM
        is_multiclass = False
        llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path, num_labels=2, torch_dtype=torch.float16, device_map='auto')
        lora_model = get_peft_model(llm_model, config)
        # print(llm_model)
        # optimizer = torch.optim.AdamW(llm_model.parameters(), lr=1e-6, weight_decay=1e-4)
        optimizer = torch.optim.AdamW(llm_model.parameters(), lr=3e-4, weight_decay=1e-4)
        criterion = FocalLoss(gamma=2.0)

        # Add learning rate scheduler
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=2,
            verbose=True,
            min_lr=1e-6
        )

        total_loss = 0
        correct = 0
        bad_counter = 0
        #best model 를 찾기 위한
        best = np.inf

        X_train_llm = X_train.copy()
        X_valid_llm = X_valid.copy()

        X_train_llm = X_train_llm[X_Feature[col]]
        X_valid_llm = X_valid_llm[X_Feature[col]]

        def prepare_input_text(row):
            target_cols = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']
            feature_cols = [col for col in X_Feature[col] if col not in target_cols]


            # 프롬프트 시작 문장에 결측치 설명 추가
            base_prompt = Prompt["S1"] if is_multiclass else Prompt[col]
            instruct_txt = base_prompt.strip() + "\n(Note: -1.0 indicates missing value in the data)\n###DATA###\n"

            if is_multiclass:
                # feature 값을 alias 이름과 함께 텍스트로 구성
                for orig_name, alias_name in zip(X_Feature["S1"], X_Feature_alias["S1"]):
                    if orig_name not in target_cols:
                        instruct_txt += f"{alias_name} : {row[orig_name]}, "
            else:
                # feature 값을 alias 이름과 함께 텍스트로 구성
                for orig_name, alias_name in zip(X_Feature[col], X_Feature_alias[col]):
                    if orig_name not in target_cols:
                        instruct_txt += f"{alias_name} : {row[orig_name]}, "

            #print(instruct_txt)
            return instruct_txt.strip()[:-1]+'.'

        # ---------------------------------- llm 학습 시작 ----------------------------------
        is_multiclass = False
        X_train_llm['input'] = X_train_llm.apply(lambda x: prepare_input_text(x),axis=1)
        #print(X_train_llm.head(1))
        X_valid_llm['input'] = X_valid_llm.apply(lambda x: prepare_input_text(x),axis=1)

        # X_train_llm["label"] = y_train[:15]
        # X_valid_llm["label"] = y_valid[:15]
        X_train_llm["label"] = y_train
        X_valid_llm["label"] = y_valid
        X_train_llm["text"] = X_train_llm["input"]
        X_valid_llm["text"] = X_valid_llm["input"]

        training_data = CustomDataset(X_train_llm)
        validation_data = CustomDataset(X_valid_llm)

        train_dataloader = DataLoader(training_data, batch_size=1, shuffle=True,worker_init_fn=seed_worker, generator=g)
        val_dataloader = DataLoader(validation_data, batch_size=1, shuffle=False)

        # twitterDFTrain_clean = X_train_llm[:15]
        # twitterDFVal_clean = X_valid_llm[:15]

        twitterDFTrain_clean = X_train_llm
        twitterDFVal_clean = X_valid_llm

        for epoch in range(epochs):
            avg_train_loss, avg_train_acc  = train_iter(llm_model, train_dataloader, optimizer, criterion, twitterDFTrain_clean)

            avg_vaild_loss, avg_vaild_acc = valid_iter(llm_model, val_dataloader, criterion, twitterDFVal_clean)

            # Update learning rate based on validation loss
            scheduler.step(avg_vaild_loss)

            if avg_vaild_loss < best:
                best = avg_vaild_loss
                # torch.save(llm_model.state_dict(), col+"_"+best_model_path)
                # torch.save(llm_model.cpu().state_dict(), col+"_"+ best_model_path)

                # 디렉토리 자동 생성
                filename = f"{col}_{best_model_path}"
                os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)  # 디렉토리가 없으면 생성

                torch.save(llm_model.cpu().state_dict(), filename)
                llm_model.cuda()

                bad_counter = 0
            else:
                bad_counter += 1

            if bad_counter == patience:
                break

            print(f'{col} Epoch: {str(epoch+1)}: t_loss:{avg_train_loss:.3f} t_acc:{avg_train_acc:.3f} v_loss:{avg_vaild_loss:.3f} v_acc:{avg_vaild_acc:.3f}')

            wandb.log({"train": {'epoch': epoch, "acc": avg_train_acc, "loss": avg_train_loss}, "val": {'epoch': epoch, "acc": avg_vaild_acc, "loss": avg_vaild_loss}})

            # 메모리 정리
            del avg_train_loss, avg_train_acc, avg_vaild_loss, avg_vaild_acc
            gc.collect()
            torch.cuda.empty_cache()
        # ---------------------------------- llm 학습 끝 ----------------------------------

        # Train LightGBM
        lgb_model = LGBMClassifier(**lgb_params)
        if early_stop:
            lgb_model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                callbacks=[early_stopping(stopping_rounds=100,verbose=False)]
            )
            best_iteration_temp[col].append(lgb_model.best_iteration_)
        else:
            lgb_model.fit(X_train, y_train)
            best_iteration_temp[col].append(1000)

        # Train XGBoost
        xgb_model = XGBClassifier(**xgb_params)
        if early_stop:
            xgb_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
        else:
            xgb_model.fit(X_train, y_train)


        tabpfn_params = {
            'device': 'cuda'
        }

        # Train TabPFN
        tabpfn_model = TabPFNClassifier(**tabpfn_params)
        tabpfn_model.fit(X_train, y_train)

        # Get predictions and ensemble
        _, llm_pred_valid = inference(llm_model, val_dataloader, X_valid_llm, col)

        lgb_pred_valid = lgb_model.predict_proba(X_valid)[:, 1]
        xgb_pred_valid = xgb_model.predict_proba(X_valid)[:, 1]
        # cat_pred_valid = cat_model.predict_proba(X_valid)[:, 1]
        # tabnet_pred_valid = tabpfn_model.predict_proba(X_valid.values)[:, 1]
        cat_pred_valid = tabpfn_model.predict_proba(X_valid.values)[:, 1]

        llm_pred_valid = np.array([arr[1] for arr in llm_pred_valid], dtype=np.float32)
        # print(llm_pred_valid)
        pred_valid = (lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + cat_C * cat_pred_valid + llm_D * llm_pred_valid  > 0.5).astype(int)

        f1 = f1_score(y_valid, pred_valid, average='macro')
        val_f1.append(f1)

        # Store predictions
        binary_val_preds[col] = {
            'llm': llm_pred_valid,
            'lgb': lgb_pred_valid,
            'xgb': xgb_pred_valid,
            'cat': cat_pred_valid,
            'true': y_valid
        }

    # multiclass
    y = train_df[target_multiclass]

    X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
    X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
    y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
    y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

    # Get parameters for both models
    lgb_params = common_params['S1'].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': random_state
    }

    # 클래스 weight 계산
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    # 각 샘플에 대해 weight 매핑
    w_train = pd.Series(y_train).map(class_weights)
    # print("----compute_class_weight:")
    # print(w_train)

    w_train = compute_sample_weight(class_weight='balanced', y=y_train)
    # print("----compute_sample_weight:")
    # print(w_train)

    # ---------------------------------- llm 학습 시작 ----------------------------------
    is_multiclass = True
    llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path, num_labels=3, torch_dtype=torch.float16, device_map='auto')
    lora_model = get_peft_model(llm_model, config)

    optimizer = torch.optim.AdamW(llm_model.parameters(), lr=3e-4, weight_decay=1e-4)
    #optimizer = torch.optim.AdamW(llm_model.parameters(), lr=1e-5, weight_decay=1e-4)
    # alpha = torch.tensor([1.048, 0.670, 1.807], dtype=torch.float16, device=device)
    # criterion = FocalLoss(gamma=2.0,alpha=alpha)
    criterion = FocalLoss(gamma=2.0)

    # Add learning rate scheduler
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=2,
        verbose=True,
        min_lr=1e-6
    )

    total_loss = 0
    correct = 0
    bad_counter = 0

    #best model 를 찾기 위한
    best = np.inf

    X_train_llm = X_train.copy()
    X_valid_llm = X_valid.copy()

    X_train_llm = X_train_llm[X_Feature["S1"]]
    X_valid_llm = X_valid_llm[X_Feature["S1"]]

    X_train_llm['input'] = X_train_llm.apply(lambda x: prepare_input_text(x),axis=1)
    #print(X_train_llm.head(1))
    X_valid_llm['input'] = X_valid_llm.apply(lambda x: prepare_input_text(x),axis=1)

    # X_train_llm["label"] = y_train[:15]
    # X_valid_llm["label"] = y_valid[:15]
    X_train_llm["label"] = y_train
    X_valid_llm["label"] = y_valid
    X_train_llm["text"] = X_train_llm["input"]
    X_valid_llm["text"] = X_valid_llm["input"]

    training_data = CustomDataset(X_train_llm)
    validation_data = CustomDataset(X_valid_llm)

    #train_dataloader = DataLoader(training_data, batch_size=1, shuffle=True,worker_init_fn=seed_worker, generator=g)
    train_dataloader = DataLoader(training_data, batch_size=1, shuffle=True)
    val_dataloader = DataLoader(validation_data, batch_size=1, shuffle=False)

    # twitterDFTrain_clean = X_train_llm[:15]
    # twitterDFVal_clean = X_valid_llm[:15]

    twitterDFTrain_clean = X_train_llm
    twitterDFVal_clean = X_valid_llm

    for epoch in range(epochs):
        avg_train_loss, avg_train_acc  = train_iter(llm_model, train_dataloader, optimizer, criterion, twitterDFTrain_clean)

        avg_vaild_loss, avg_vaild_acc = valid_iter(llm_model, val_dataloader, criterion, twitterDFVal_clean)

        # Update learning rate based on validation loss
        scheduler.step(avg_vaild_loss)

        if avg_vaild_loss < best:
            best = avg_vaild_loss
            # torch.save(llm_model.state_dict(), "S1_"+best_model_path)
            # torch.save(llm_model.cpu().state_dict(), "S1_" + best_model_path)

            # 디렉토리 자동 생성
            filename = f"S1_{best_model_path}"
            os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)  # 디렉토리가 없으면 생성

            torch.save(llm_model.cpu().state_dict(), filename)
            llm_model.cuda()
            bad_counter = 0
        else:
            bad_counter += 1

        if bad_counter == patience:
            break

        print(f'S1 Epoch: {str(epoch+1)}: t_loss:{avg_train_loss:.3f} t_acc:{avg_train_acc:.3f} v_loss:{avg_vaild_loss:.3f} v_acc:{avg_vaild_acc:.3f}')

        wandb.log({"train": {'epoch': epoch, "acc": avg_train_acc, "loss": avg_train_loss}, "val": {'epoch': epoch, "acc": avg_vaild_acc, "loss": avg_vaild_loss}})

        # ✅ 메모리 정리
        del avg_train_loss, avg_train_acc, avg_vaild_loss, avg_vaild_acc
        gc.collect()
        torch.cuda.empty_cache()

    # ---------------------------------- llm 학습 끝 ----------------------------------

    # Train LightGBM
    lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=3)
    if early_stop:
        lgb_model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            callbacks=[early_stopping(stopping_rounds=100,verbose=False)], sample_weight=w_train
        )
        best_iteration_temp[target_multiclass].append(lgb_model.best_iteration_)
    else:
        lgb_model.fit(X_train, y_train, sample_weight=w_train)
        best_iteration_temp[target_multiclass].append(1000)

    # Train XGBoost
    xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=3)
    if early_stop:
        xgb_model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=False, sample_weight=w_train
        )
    else:
        xgb_model.fit(X_train, y_train,sample_weight=w_train)


    tabpfn_params = {
        'device': 'cuda'
    }

    # Train TabPFN
    tabpfn_model = TabPFNClassifier(**tabpfn_params)
    tabpfn_model.fit(X_train, y_train)

    # Get predictions and ensemble
    _, llm_pred_valid = inference(llm_model, val_dataloader, X_valid_llm, "S1")
    lgb_pred_valid = lgb_model.predict_proba(X_valid)
    xgb_pred_valid = xgb_model.predict_proba(X_valid)
    #cat_pred_valid = cat_model.predict_proba(X_valid)
    #tabnet_pred_valid = tabpfn_model.predict_proba(X_valid.values)
    cat_pred_valid = tabpfn_model.predict_proba(X_valid.values)
    # print(llm_pred_valid)
    # print(lgb_pred_valid)
    llm_pred_valid = np.array(llm_pred_valid, dtype=np.float32)
    #print(llm_pred_valid)
    pred_valid = np.argmax(lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + cat_C * cat_pred_valid + llm_D * llm_pred_valid, axis=1)

    f1 = f1_score(y_valid, pred_valid, average='macro')
    val_f1.append(f1)

    multiclass_val_preds = {
        'llm': llm_pred_valid,
        'lgb': lgb_pred_valid,
        'xgb': xgb_pred_valid,
        'cat': cat_pred_valid,
        'true': y_valid
    }

    # Generate all possible weight combinations that sum to 1
    step = 0.1
    for lgb_A in np.arange(0, 1.1, step):
        for xgb_B in np.arange(0, 1.1 - lgb_A, step):
            for cat_C in np.arange(0, 1.1 - lgb_A - xgb_B, step):
                llm_D = 1 - (lgb_A + xgb_B + cat_C)
                if llm_D >= 0:
                    weights = (lgb_A, xgb_B, cat_C, llm_D)
                    print("========================================")
                    print(f"\nTrying weights: lgb_A={lgb_A:.1f}, xgb_B={xgb_B:.1f}, cat_C={cat_C:.1f}, llm_D={llm_D:.1f}")

                    # Calculate validation score with current weights
                    val_scores = []

                    # Binary targets
                    for col in targets_binary:
                        preds = binary_val_preds[col]

                        ensemble_pred = (lgb_A * preds['lgb'] + xgb_B * preds['xgb'] +
                                      cat_C * preds['cat'] + llm_D * preds['llm'] > 0.5).astype(int)
                        f1 = f1_score(preds['true'], ensemble_pred, average='macro')
                        val_scores.append(f1)
                        print(f" Validation Score {col}:{f1:.4f}")

                    # Multiclass target
                    preds = multiclass_val_preds
                    ensemble_pred = np.argmax(lgb_A * preds['lgb'] + xgb_B * preds['xgb'] +
                                           cat_C * preds['cat'] + llm_D * preds['llm'], axis=1)
                    f1 = f1_score(preds['true'], ensemble_pred, average='macro')
                    print(f" Validation Score S1:{f1:.4f}")
                    val_scores.append(f1)

                    avg_score = np.mean(val_scores)
                    best_weights.append(weights)
                    best_scores.append(avg_score)

                    print(f"Average Validation Score: {avg_score:.4f}")

    # Sort results and get top 3
    #sorted_indices = np.argsort(best_scores)[::-1][:3]
    sorted_indices = np.argsort(best_scores)[::-1]
    top_3_weights = [best_weights[i] for i in sorted_indices]
    top_3_scores = [best_scores[i] for i in sorted_indices]

    print("\nTop All Weight Combinations:")
    for i, (weights, score) in enumerate(zip(top_3_weights, top_3_scores)):
        print(f"Rank {i+1}: lgb_A={weights[0]:.1f}, xgb_B={weights[1]:.1f}, cat_C={weights[2]:.1f}, llm_D={weights[3]:.1f} - Score: {score:.4f}")

    avg_f1 = np.mean(val_f1)
    total_avg_f1s.append(avg_f1)
    detail = " ".join([f"{name}({tname}):{score:.4f}" for name, tname, score in zip(targets_binary + [target_multiclass], targets_binary_name + ['S1'], val_f1)])
    print(f" 평균 F1: {avg_f1:.4f} / [상세] {detail}")

    best_iteration_dict = {k: max(best_iteration_temp[k]) for k in all_targets}

    if early_stop==True:
      print("\n[best_iteration_dict]")
      for k, v in best_iteration_dict.items():
          print(f"{k}: {v}")

    print(f"# 전체 평균 F1: {np.mean(total_avg_f1s):.4f}")
    print("================================================")

    # modoling with 100% train & no valid
    print('\n STEP2: 전체 데이터로 모델 재학습')
    print("====== modeling with 100% train & no valid =====")

    # binary
    binary_preds = {}
    binary_preds_proba = {}
    for col in targets_binary:
        # Get parameters for both models
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
            'n_estimators': 1000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state
        }

        y = train_df[col]

        if early_stop:
            lgb_params['n_estimators'] = best_iteration_dict[col]
            xgb_params['n_estimators'] = best_iteration_dict[col]

        # LLM Valid Model load and Inference

        is_multiclass = False
        test_X_llm = test_X.copy()
        test_X_llm = test_X_llm[X_Feature[col]]
        test_X_llm['input'] = test_X_llm.apply(lambda x: prepare_input_text(x),axis=1)
        test_X_llm["text"] = test_X_llm["input"]
        test_data = CustomDataset(test_X_llm)
        test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False)

        llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path, num_labels=2, torch_dtype=torch.float16, device_map='auto')
        lora_model = get_peft_model(llm_model, config)
        _, llm_pred = inference(llm_model, test_dataloader, test_X_llm, col)
        llm_pred = np.array([arr[1] for arr in llm_pred], dtype=np.float32)

        # Train LightGBM
        lgb_model = LGBMClassifier(**lgb_params)
        lgb_model.fit(X, y)

        # Train XGBoost
        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X, y)

        # Train CatBoost
        # cat_model = CatBoostClassifier(**common_params_cat)
        # cat_model.fit(X, y)

        tabpfn_params = {
            'device': 'cuda'
        }

        # Train TabPFN
        tabpfn_model = TabPFNClassifier(**tabpfn_params)
        tabpfn_model.fit(X, y)

        # Get predictions and ensemble
        #_, llm_pred_vaild = inference(llm_model, val_dataloader, X_valid_llm, "S1")
        lgb_pred = lgb_model.predict_proba(test_X)[:, 1]
        xgb_pred = xgb_model.predict_proba(test_X)[:, 1]
        #cat_pred = cat_model.predict_proba(test_X)[:, 1]
        cat_pred = tabpfn_model.predict_proba(test_X)[:, 1]

        binary_preds[col] = (lgb_A * lgb_pred + xgb_B * xgb_pred + cat_C * cat_pred + llm_D * llm_pred  > 0.5).astype(int)
        #binary_preds_proba[col] = lgb_A * lgb_model.predict_proba(test_X) + xgb_B * xgb_model.predict_proba(test_X) + cat_C * cat_model.predict_proba(test_X)

        # Store predictions
        binary_test_preds[col] = {
            'llm': llm_pred,
            'lgb': lgb_pred,
            'xgb': xgb_pred,
            'cat': cat_pred
        }

        # Feature importance (using LightGBM's importance)
        fi_df = pd.DataFrame({'feature': X.columns, 'importance': lgb_model.feature_importances_})
        top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
        feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
        print(f"[{col}] {feat_str}")

    # multiclass
    y = train_df['S1']

    # Get parameters for both models
    lgb_params = common_params['S1'].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': random_state
    }

    if early_stop:
        lgb_params['n_estimators'] = best_iteration_dict['S1']
        xgb_params['n_estimators'] = best_iteration_dict['S1']

    # 클래스 weight 계산
    classes = np.unique(y)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
    class_weights = dict(zip(classes, weights))

    # 각 샘플에 대해 weight 매핑
    w_train = pd.Series(y).map(class_weights)
    # print("----compute_class_weight:")
    # print(w_train)

    w_train = compute_sample_weight(class_weight='balanced', y=y)
    # print("----compute_sample_weight:")
    # print(w_train)

    # LLM Valid Model load and Inference

    is_multiclass = True
    test_X_llm = test_X.copy()
    test_X_llm = test_X_llm[X_Feature["S1"]]
    test_X_llm['input'] = test_X_llm.apply(lambda x: prepare_input_text(x),axis=1)
    test_X_llm["text"] = test_X_llm["input"]
    test_data = CustomDataset(test_X_llm)
    test_dataloader = DataLoader(test_data, batch_size=1, shuffle=False)

    llm_model = AutoModelForSequenceClassification.from_pretrained(llm_model_path, num_labels=3, torch_dtype=torch.float16, device_map='auto')
    lora_model = get_peft_model(llm_model, config)
    _, llm_pred = inference(llm_model, test_dataloader, test_X_llm, "S1")
    llm_pred = np.array(llm_pred, dtype=np.float32)
    #print(llm_pred)

    # Train LightGBM
    lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=3)
    lgb_model.fit(X, y, sample_weight=w_train)

    # Train XGBoost
    xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=3)
    xgb_model.fit(X, y, sample_weight=w_train)

    # Train CatBoost
    # cat_model = CatBoostClassifier(**common_params_cat2, objective='MultiClass', classes_count=3)
    # cat_model.fit(X, y)

    tabpfn_params = {
        'device': 'cuda'
    }

     # Train TabPFN
    tabpfn_model = TabPFNClassifier(**tabpfn_params)
    tabpfn_model.fit(X, y)

    # Get predictions and ensemble
    lgb_pred = lgb_model.predict_proba(test_X)
    xgb_pred = xgb_model.predict_proba(test_X)
    #cat_pred = cat_model.predict_proba(test_X)
    cat_pred = tabpfn_model.predict_proba(test_X)

    multiclass_test_preds = {
        'llm': llm_pred,
        'lgb': lgb_pred,
        'xgb': xgb_pred,
        'cat': cat_pred
    }

    multiclass_pred = np.argmax(lgb_A * lgb_pred + xgb_B * xgb_pred + cat_C * cat_pred +llm_D * llm_pred, axis=1)
    multiclass_pred_proba = lgb_A * lgb_pred + xgb_B * xgb_pred + cat_C * cat_pred + llm_D * llm_pred

    # Feature importance
    fi_df = pd.DataFrame({'feature': X.columns, 'importance': lgb_model.feature_importances_})
    top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
    feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
    print(f"[S1] {feat_str}")

    # 예측 저장
    submission_final['S1'] = multiclass_pred
    for col in targets_binary:
      submission_final[col] = binary_preds[col]
    submission_final = submission_final[['subject_id', 'sleep_date', 'lifelog_date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']]
    fname = f"submission_{np.mean(total_avg_f1s)}.csv"
    submission_final.to_csv(fname, index=False)
    print(f"# {fname} 저장 완료")
    print(f"# submission shape:{submission_final.shape}")
    print("================================================")
    print("\nTop 3 Weight Combinations:")
    for i, (weights, score) in enumerate(zip(top_3_weights, top_3_scores)):
        print(f"Rank {i+1}: lgb_A={weights[0]:.1f}, xgb_B={weights[1]:.1f}, cat_C={weights[2]:.1f}, llm_D={weights[3]:.1f} - Score: {score:.4f}")

        # Generate submission with these weights
        lgb_A, xgb_B, cat_C, llm_D = weights

        # Binary predictions
        for col in targets_binary:
            preds = binary_test_preds[col]
            ensemble_pred = (lgb_A * preds['lgb'] + xgb_B * preds['xgb'] +
                          cat_C * preds['cat'] + llm_D * preds['llm'] > 0.5).astype(int)
            submission_final[col] = ensemble_pred

        # Multiclass prediction
        preds = multiclass_test_preds
        ensemble_pred = np.argmax(lgb_A * preds['lgb'] + xgb_B * preds['xgb'] +
                               cat_C * preds['cat'] + llm_D * preds['llm'], axis=1)
        submission_final['S1'] = ensemble_pred

        fname = f"submission_top{i+1}_{score:.4f}.csv"
        submission_final.to_csv(fname, index=False)
        print(f"Saved submission to {fname}")

    # Use the best weights for final submission
    best_weights = top_3_weights[0]
    lgb_A, xgb_B, cat_C, llm_D = best_weights

    # 모델별 예측결과 비율 비교
    a11 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
    a13 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
    a12 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
    a21 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
    a23 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
    a22 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
    result = pd.concat([a11, a13, a12, a21, a23, a22], axis=1)
    result.columns = ['학습sum','학습len','학습mean','테스트sum','테스트len','테스트mean']
    print('\n STEP3: 예측결과 비교표')
    display(result)
    oof_result = []
    return submission_final, oof_result

### ============================

### 📦 모델 학습

In [29]:
X_Feature = {
    "Q1": [
        "Q1_te2",
        "wake_time_ratio",
        "mlight_first_wakeup_minutes",
        "wake_time_diff",
        "Q1_te",
        "lights_off_time",
        "sleep_duration_ratio",
        "active_hour_mean_speed",
        "beforebed_통화_time",
        "rolling_sleep_time_2d",
        "activehour_NAVER_time"
    ],
    "Q2": [
        "Q2_te",
        "Q2_te2",
        "activehour_total_screen_time",
        "beforebed_unique_bssid_count",
        "wake_time_lag1",
        "light_rolling_wake_time_2d",
        "beforebed_max_rssi",
        "active_hour_std_hr",
        "beforebed_top_bssid_count",
        "activehour_screen_time_vs_avg_pct",
        "activehour_메신저_time"
    ],
    "Q3": [
        "Q3_te2",
        "light_sleep_time_lag2",
        "mlight_first_wakeup_minutes",
        "rolling_sleep_time_3d",
        "light_rolling_sleep_duration_3d",
        "Q3_te",
        "beforebed_scan_count",
        "active_hour_distance_x",
        "activehour_통화_time",
        "walking_minutes"
    ],
    "S1": [
        "S1_te",
        "wake_time_diff",
        "S1_te2",
        "sleep_duration_ratio",
        "m_activity_met@240min@sum@04h00m",
        "beforebed_screen_time_vs_avg_pct",
        "wake_time_ratio",
        "rolling_wake_time_3d",
        "m_activity_0@240min@std@20h00m",
        "m_activity@240min@std@12h00m",
        "beforebed_메신저_time",
        "sleep_duration_diff",
        "light_sleep_time_diff",
        "active_hour_min_hr"
    ],
    "S2": [
        "S2_te2",
        "S2_te",
        "light_sleep_time_lag1",
        "work_hour_unknown_ratio",
        "m_activity@240min@std@12h00m",
        "beforebed_strong_signal_ratio",
        "light_rolling_wake_time_2d",
        "free_hour_rssi_mean",
        "activehour_전화_time",
        "sleep_hour_mean_speed",
        "activehour_screen_time_vs_avg_pct",
        "light_wake_time_diff_lag2",
        "beforebed_max_rssi",
        "avg_charging_duration",
        "m_activity_met@240min@std@12h00m"
    ],
    "S3": [
        "S3_te",
        "S3_te2",
        "beforebed_메신저_time",
        "light_wake_time_diff",
        "sleep_time_diff_lag1",
        "light_sleep_time_lag2",
        "m_activity_met@240min@sum@16h00m",
        "free_hour_rssi_max",
        "light_weekday_avg_sleep",
        "불끈시간부터기상시간",
        "sleep_hour_distance_x",
        "activehour_scan_count"
    ]
}

X_Feature_alias = {
    "Q1": [
        "Q1_encoded_time_2",
        "wake_time_to_baseline_ratio",
        "minutes_to_first_wake_after_light",
        "wake_time_difference",
        "Q1_encoded_time",
        "lights_off_clock_time",
        "sleep_duration_ratio_to_guideline",
        "mean_speed_during_active_hours",
        "call_duration_before_bed",
        "sleep_time_rolling_avg_2d",
        "NAVER_time_active_hours"
    ],
    "Q2": [
        "Q2_encoded_time",
        "Q2_encoded_time_2",
        "total_screen_time_active_hours",
        "unique_wifi_count_before_bed",
        "previous_day_wake_time",
        "light_based_wake_time_rolling_avg_2d",
        "max_wifi_signal_before_bed",
        "std_heart_rate_active_hours",
        "frequent_wifi_count_before_bed",
        "screen_time_vs_avg_pct_active_hours",
        "messenger_usage_time_active_hours"
    ],
    "Q3": [
        "Q3_encoded_time_2",
        "light_sleep_duration_lag2",
        "minutes_to_first_wake_after_light",
        "sleep_time_rolling_avg_3d",
        "light_based_sleep_duration_rolling_avg_3d",
        "Q3_encoded_time",
        "wifi_scan_count_before_bed",
        "distance_traveled_active_hours",
        "call_duration_active_hours",
        "total_walking_minutes"
    ],
    "S1": [
        "S1_encoded_time",
        "wake_time_difference",
        "S1_encoded_time_2",
        "sleep_duration_ratio_to_guideline",
        "met_sum_0to4am",
        "screen_time_vs_avg_pct_before_bed",
        "wake_time_to_baseline_ratio",
        "wake_time_rolling_avg_3d",
        "activity_std_8pm_to_midnight",
        "activity_std_12pm_to_4pm"
        "messenger_usage_time_before_bed",
        "sleep_time_difference",
        "light_based_sleep_time_difference",
        "activity_time_mininum_hours"

    ],
    "S2": [
        "S2_encoded_time_2",
        "S2_encoded_time",
        "light_sleep_duration_lag1",
        "unknown_activity_ratio_work_hours",
        "activity_std_12pm_to_4pm",
        "strong_wifi_signal_ratio_before_bed",
        "light_based_wake_time_rolling_avg_2d",
        "avg_wifi_signal_strength_free_hours",
        "phone_call_time_active_hours",
        "mean_movement_speed_sleep_hours",
        "activity_time_screen_hour_vs_avg_pct","light_wake_time_diff_lag2","beforebed_max_rssi","avg_charging_duration","m_activity_met@240min@std@12h00m"
    ],
    "S3": [
        "S3_encoded_time",
        "S3_encoded_time_2",
        "messenger_usage_time_before_bed",
        "light_based_wake_time_difference",
        "sleep_time_difference_lag1",
        "light_sleep_duration_lag2",
        "met_sum_4pm_to_8pm",
        "max_wifi_signal_strength_free_hours",
        "light_based_weekday_avg_sleep_duration",
        "time_from_first_movement_to_final_wake",
        "sleep_hour_distance_x","active_hour_scan_count"
    ]
}

In [30]:
Prompt = {
    "Q1": "Based on the following sleep-related information, classify the overall perceived sleep quality upon waking as either below or above the individual’s average. (0: Below average, 1: Above average) ",
    "Q2": "Using the following data, determine the level of physical fatigue the individual felt before sleep. (0: High fatigue, 1: Low fatigue) ",
    "Q3": "Based on the information below, classify the stress level the individual experienced before going to bed. (0: High stress, 1: Low stress) ",
    "S1": "Based on the provided information, classify how well the individual's total sleep time (TST) aligns with recommended sleep guidelines. (0: Not met, 1: Partially met, 2: Fully met) ",
    "S2": "Using the following indicators, determine whether the individual met the guideline for sleep efficiency (SE). (0: Not met, 1: Met) ",
    "S3": "From the data provided, assess if the sleep onset latency (SOL) guideline was met. (0: Not met, 1: Met) "
}

In [31]:
def seed_worker(worker_id):
    worker_seed = 42 + worker_id
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(42)

<torch._C.Generator at 0x7e7b6c5dc830>

In [32]:
# ---
# Define llm_model
# --

#llm_model_path = 'Qwen/Qwen2.5-0.5B'
#llm_model_path = "Qwen/Qwen3-0.6B-Base" #똑같음
#llm_model_path = "Qwen/Qwen3-1.7B" # 성공 좋음
#llm_model_path = "EleutherAI/pythia-70m"
#llm_model_path = "facebook/opt-125m" # 괜찮음
#llm_model_path = "openai-community/gpt2"
#llm_model_path = 'Qwen/Qwen2.5-0.5B-Instruct'
#llm_model_path = "distilbert/distilgpt2"
llm_model_path = "Qwen/Qwen3-0.6B"

tokenizer = AutoTokenizer.from_pretrained(llm_model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [33]:
# ---
# Define LoraConfig
# ---

config = LoraConfig(
    # r=16,
    # lora_alpha=32,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],#Best
    #target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    # task_type="SEQ_CLS",
    #task_type=TaskType.SEQ_CLS,
    modules_to_save=["classifier"],
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [34]:
# ---
# Define your training loop
# ---

epochs = 16 # 5
#optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
#criterion = nn.BCEWithLogitsLoss()
total_loss = 0
correct = 0
best = np.inf # best model 를 찾기 위한
patience = 3  # 3번까지 validation loss 터지면 stop 시킴

# best model save
result_path = "./"
best_model_path = os.path.join(result_path, 'best_model.pt')
# best_model_path = 'best_model.pt'
bad_counter = 0

In [None]:
# 공통 하이퍼파라미터
common_params = {
  'n_estimators': 5000,
  "learning_rate": 0.01,
  # 'min_data_in_leaf':2,
  # 'bagging_fraction':0.9,
  # 'feature_fraction':0.6,
  'lambda_l1': 5,
  'lambda_l2': 1,
  # 'max_depth': 4,
  'n_jobs': -1,
  'verbosity': -1
}

# 모델별 세부 하이퍼파라미터
best_param_dict = {}
best_param_dict['Q3'] = common_params
best_param_dict['S1'] = common_params
best_param_dict['S2'] = common_params
best_param_dict['S3'] = common_params
best_param_dict['Q1'] = common_params
best_param_dict['Q2'] = common_params

"""
// submission_top1_0.6492.csv

# submission_0.6261336267831857.csv 저장 완료
# submission shape:(250, 9)

Top 3 Weight Combinations:
Rank 1: lgb_A=0.2, xgb_B=0.4, cat_C=0.0, llm_D=0.4 - Score: 0.6492
Saved submission to submission_top1_0.6492.csv
Rank 2: lgb_A=0.1, xgb_B=0.5, cat_C=0.2, llm_D=0.2 - Score: 0.6487
Saved submission to submission_top2_0.6487.csv
Rank 3: lgb_A=0.1, xgb_B=0.5, cat_C=0.1, llm_D=0.3 - Score: 0.6478
Saved submission to submission_top282_0.5889.csv
"""

submission_final, oof_result = run_basemodel(train, test, valid_ids, best_param_dict, n_splits=5, random_state=41, early_stop=False)

light_week_type_lag1
weekday
week_type
week_type_lag1
activehour_top_bssid
beforebed_top_bssid
# X shape: (450, 247)
# test_X shape: (250, 247)

 STEP1: 실험 결과 확인


Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 410/410 [01:13<00:00,  5.59it/s]
100%|██████████| 40/40 [00:02<00:00, 15.91it/s]


Q1 Epoch: 1: t_loss:0.265 t_acc:0.552 v_loss:0.211 v_acc:0.286


100%|██████████| 410/410 [01:12<00:00,  5.65it/s]
100%|██████████| 40/40 [00:02<00:00, 16.40it/s]


Q1 Epoch: 2: t_loss:0.189 t_acc:0.555 v_loss:0.191 v_acc:0.440


100%|██████████| 410/410 [01:12<00:00,  5.66it/s]
100%|██████████| 40/40 [00:02<00:00, 16.00it/s]


Q1 Epoch: 3: t_loss:0.181 t_acc:0.583 v_loss:0.168 v_acc:0.365


100%|██████████| 410/410 [01:13<00:00,  5.57it/s]
100%|██████████| 40/40 [00:02<00:00, 16.04it/s]


Q1 Epoch: 4: t_loss:0.186 t_acc:0.552 v_loss:0.236 v_acc:0.286


100%|██████████| 410/410 [01:12<00:00,  5.64it/s]
100%|██████████| 40/40 [00:02<00:00, 16.09it/s]


Q1 Epoch: 5: t_loss:0.179 t_acc:0.572 v_loss:0.169 v_acc:0.469


100%|██████████| 410/410 [01:13<00:00,  5.60it/s]
100%|██████████| 40/40 [00:02<00:00, 16.33it/s]


tabpfn-v2-classifier.ckpt:   0%|          | 0.00/29.0M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/37.0 [00:00<?, ?B/s]

100%|██████████| 40/40 [00:02<00:00, 16.48it/s]
Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 66%|██████▌   | 270/410 [00:47<00:24,  5.67it/s]

### 📦 이전제출과 비교

In [None]:
from pathlib import Path

# Reference file
reference_file = '/content/drive/MyDrive/data/ch2025_data_items/share/submissions/submission_top1_0.6492.csv'
ref_df = pd.read_csv(reference_file)

# Get all CSV files in data directory
data_dir = Path('./')
csv_files = list(data_dir.glob('*.csv'))

# Store differences for each file
differences = []

for csv_file in csv_files:
    if csv_file.name == os.path.basename(reference_file):
        continue

    # Read current file
    current_df = pd.read_csv(csv_file)

    # Calculate differences in specified columns
    diff_count = 0
    for col in ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']:
        diff_count += (ref_df[col] != current_df[col]).sum()

    differences.append((csv_file.name, diff_count))
    print(f"File: {csv_file.name}, Differences: {diff_count}")

# Sort by difference count and get top 20
differences.sort(key=lambda x: x[1])
print("\nTop 20 files with smallest differences:")
for i, (file_name, diff_count) in enumerate(differences[:20], 1):
    print(f"{i}. {file_name}: {diff_count} differences")