<a href="https://colab.research.google.com/github/jacobgreen4477/The-4th-ETRI-AI-Human-Understanding-Competition/blob/main/etri_template_vF1_0_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> title : 제 4회 ETRI 휴먼이해 인공지능 논문경진대회 <br>
> author : hjy,byc <br>

### 📦 라이브러리

In [1]:
! pip install haversine >/dev/null
! pip install optuna  >/dev/null
! pip install category_encoders >/dev/null
! pip install tabpfn  >/dev/null
! pip install catboost >/dev/null
! pip install torchmetrics >/dev/null

In [2]:
# 기본 모듈
import os
import sys
import re
import ast
import glob
import random
import warnings
from collections import Counter
from math import radians, cos, sin, asin, sqrt
from functools import reduce
from datetime import datetime, timedelta, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 머신러닝
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from category_encoders import TargetEncoder
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from tabpfn import TabPFNClassifier

# PyTorch
import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Hugging Face
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    LlamaTokenizer,
    LlamaForCausalLM,
    LlamaForSequenceClassification
)

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    TaskType
)

# Evaluation & Utilities
from torchmetrics import Accuracy

# 기타
from tqdm import tqdm
from tqdm.auto import tqdm as auto_tqdm  # 필요 시 구분
from scipy.stats import entropy
from haversine import haversine
from io import StringIO
import gc

# wandb
import wandb
wandb.login(key="5fa8dfb2c5be3c888bfe0101437a8fa22fbdf0e0")
wandb.init(project="etri_lifelog", entity="byc3230")

# 옵션
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

# 기타
warnings.filterwarnings('ignore')

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbyc3230[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
string = """
subject_id,sleep_date
id01,2024-07-24
id01,2024-08-26
id01,2024-08-28
id01,2024-08-29
id02,2024-08-23
id02,2024-09-24
id02,2024-09-26
id02,2024-09-27
id03,2024-08-30
id03,2024-09-01
id03,2024-09-02
id03,2024-09-06
id04,2024-09-03
id04,2024-10-10
id04,2024-10-12
id04,2024-10-13
id05,2024-10-19
id05,2024-10-23
id05,2024-10-24
id05,2024-10-27
id06,2024-07-25
id06,2024-07-26
id06,2024-07-27
id06,2024-07-30
id07,2024-07-07
id07,2024-08-02
id07,2024-08-04
id07,2024-08-05
id08,2024-08-28
id08,2024-08-29
id08,2024-08-30
id08,2024-09-02
id09,2024-08-02
id09,2024-08-31
id09,2024-09-02
id09,2024-09-03
id10,2024-08-28
id10,2024-08-30
id10,2024-08-31
id10,2024-09-03
"""

# DataFrame 생성
valid_ids = pd.read_csv(StringIO(string), sep=',')
valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']

### 📦 데이터 읽기

In [4]:
from google.colab import drive, files
drive.mount('/content/drive')

path = '/content/drive/MyDrive/data/ch2025_data_items/share/'

train = pd.read_parquet(f'{path}train_63775.parquet')
test = pd.read_parquet(f'{path}test_63775.parquet')

print('# train  shape:',train.shape)
print('# test   shape:',test.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
# train  shape: (450, 237)
# test   shape: (250, 237)


In [5]:
# drop_features = ['afterwork_max_label','sleeptime_max_label','worktime_max_label']
drop_features = ['top_bssid'] # ,'week_type','week_type_lag1'
drop_features = [i for i in drop_features if i in train.columns.tolist()]
print('# drop_features:',drop_features)
train = train.drop(columns=drop_features)
test = test.drop(columns=drop_features)

# drop_features: []


In [6]:
# ---
# 추정수면효율
# ---

def calculate_sleep_duration_min(sleep_time, wake_time):
    """
    취침 시각(sleep_time)과 기상 시각(wake_time)을 입력받아 수면 시간(분) 반환
    단위는 float 시간 (예: 23.5, 6.25)
    """
    if pd.isna(sleep_time) or pd.isna(wake_time):
        return None
    if wake_time < sleep_time:
        wake_time += 24  # 자정 넘긴 경우 보정
    duration = (wake_time - sleep_time) * 60
    return round(duration)

train['불끈시간부터기상시간'] = train.apply(lambda x: calculate_sleep_duration_min(x['lights_off_time'],x['wake_time']),axis=1)
test['불끈시간부터기상시간'] = test.apply(lambda x: calculate_sleep_duration_min(x['lights_off_time'],x['wake_time']),axis=1)

train['추정수면효율'] = train['불끈시간부터기상시간']/train['sleep_duration_min']
test['추정수면효율'] = test['불끈시간부터기상시간']/test['sleep_duration_min']

# 이상값 제거
train['추정수면효율'] = np.where(train['추정수면효율']<-5,np.nan,train['추정수면효율'])
test['추정수면효율'] = np.where(test['추정수면효율']<-5,np.nan,test['추정수면효율'])
train['추정수면효율'] = np.where(train['추정수면효율']>5,np.nan,train['추정수면효율'])
test['추정수면효율'] = np.where(test['추정수면효율']>55,np.nan,test['추정수면효율'])

In [7]:
# 요일 컬럼 추가 (예: 월요일, 화요일, ...)
train['lifelog_date'] = pd.to_datetime(train['lifelog_date'])
test['lifelog_date'] = pd.to_datetime(test['lifelog_date'])

# 요일
weekday_map = {
    0: '월요일', 1: '화요일', 2: '수요일', 3: '목요일',
    4: '금요일', 5: '토요일', 6: '일요일'
}
train['weekday'] = train['lifelog_date'].dt.dayofweek.map(weekday_map)
test['weekday'] = test['lifelog_date'].dt.dayofweek.map(weekday_map)

# 월
train['month'] = train['lifelog_date'].dt.month
test['month'] = test['lifelog_date'].dt.month

# weekend
train['weekend'] = np.where(train['weekday'].isin(['토요일','일요일']),1,0)
test['weekend'] = np.where(test['weekday'].isin(['토요일','일요일']),1,0)

# 공휴일
공휴일 = [
     '2024-08-15'
    ,'2024-09-16'
    ,'2024-09-17'
    ,'2024-09-18'
    ,'2024-10-03'
    ,'2024-10-09'
]
train['공휴일'] = np.where(train['lifelog_date'].isin(공휴일),1,0)
test['공휴일'] = np.where(test['lifelog_date'].isin(공휴일),1,0)

# 주말 + 공휴일 묶어주기
train['weekend_holilday'] = np.where( ((train['weekend']==0) & (train['공휴일']==1)), 1, train['weekend'])
test['weekend_holilday'] = np.where( ((test['weekend']==0) & (test['공휴일']==1)), 1, test['weekend'])

In [8]:
def add_prev_day_flag(df):
    df = df.copy()
    df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])

    # 각 subject_id별로 전날 날짜 만들기
    df['prev_day'] = df['lifelog_date'] - pd.Timedelta(days=1)

    # subject_id + 날짜 기준으로 원본 키 구성
    key_set = set(zip(df['subject_id'], df['lifelog_date']))

    # 전날 데이터가 존재하면 1, 없으면 0
    df['has_prev_day_data'] = df[['subject_id', 'prev_day']].apply(
        lambda row: 1 if (row['subject_id'], row['prev_day']) in key_set else 0, axis=1
    )

    return df.drop(columns=['prev_day'])

train = add_prev_day_flag(train)
test = add_prev_day_flag(test)

In [9]:
# ---
# 추정휴가
# ---

def rule_based_sum(x):
    rules = (
        # (x['sleep_duration_min'] > (x['avg_sleep_duration']+30))
          (x['sleep_duration_min'] > (x['avg_sleep_duration']+60))
        & (x['week_type'] == 'weekday')
        # & (x['month'].isin([7,8]))
    )
    return rules

train['vacation'] = train.groupby('subject_id').apply(rule_based_sum).reset_index(level=0, drop=True).astype(int)
test['vacation'] = test.groupby('subject_id').apply(rule_based_sum).reset_index(level=0, drop=True).astype(int)

In [10]:
# 숫자형 컬럼만 선택해서 결측값 -1로 채우기
train[train.select_dtypes(include='number').columns] = train.select_dtypes(include='number').fillna(-1)
test[test.select_dtypes(include='number').columns] = test.select_dtypes(include='number').fillna(-1)

### ============================

### run_basemodel

In [11]:
def run_basemodel(train, test, valid_ids, common_params, n_splits, random_state=42):

    lgb_A = 0.3
    xgb_B = 0.3
    tab_C = 0.3

    train_df = train.copy()
    test_df = test.copy()

    submission_final = test_df[['subject_id', 'sleep_date', 'lifelog_date']].copy()
    submission_final['lifelog_date'] = pd.to_datetime(submission_final['lifelog_date']).dt.date

    # 타겟
    targets_binary = ['Q1', 'Q2', 'Q3', 'S2', 'S3']
    targets_binary_name = ['기상직후수면질','취침전신체적피로','취침전스트레스','수면효율','수면잠들기시간']
    target_multiclass = 'S1'
    all_targets = targets_binary + [target_multiclass]

    # 노이즈 수준 설정
    def add_noise(series, noise_level, seed=3):
        rng = np.random.default_rng(seed)
        return series * (1 + noise_level * rng.standard_normal(len(series)))

    noise_level = 0.015  # 필요에 따라 조정

    # 타겟인코딩
    for tgt in all_targets:

      encoder_feats = ['subject_id','month','weekend'] # 'weekday', 'subject_id','month','weekend'

      #### 타겟인코딩1

      subject_mean = train_df.groupby(encoder_feats)[tgt].mean().rename(f'{tgt}_te')
      train_df = train_df.merge(subject_mean, on=encoder_feats, how='left')
      test_df = test_df.merge(subject_mean, on=encoder_feats, how='left')
      global_mean = train_df[tgt].mean()
      test_df[f'{tgt}_te'] = test_df[f'{tgt}_te'].fillna(global_mean)

      # 노이즈 추가
      train_df[f'{tgt}_te'] = add_noise(train_df[f'{tgt}_te'], noise_level)
      test_df[f'{tgt}_te'] = add_noise(test_df[f'{tgt}_te'], noise_level)

      #### 타겟인코딩2

      # 새로운 범주형 열 생성
      train_df['TMP'] = train_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)
      test_df['TMP'] = test_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)

      # 인코더
      encoder = TargetEncoder(cols=['TMP'], smoothing=300) # 40
      encoder.fit(train_df[['TMP']], train_df[tgt])

      # 인코딩 결과를 새로운 열에 저장
      train_df[f'{tgt}_te2'] = encoder.transform(train_df[['TMP']])
      test_df[f'{tgt}_te2'] = encoder.transform(test_df[['TMP']])

      # 노이즈 추가
      train_df[f'{tgt}_te2'] = add_noise(train_df[f'{tgt}_te2'], noise_level)
      test_df[f'{tgt}_te2'] = add_noise(test_df[f'{tgt}_te2'], noise_level)

      # 불필요한 변수 제거
      train_df = train_df.drop(columns=['TMP'])
      test_df = test_df.drop(columns=['TMP'])


    # 인코딩
    PK = ['sleep_date', 'lifelog_date', 'subject_id']
    encoder = LabelEncoder()
    categorical_features = [i for i in train_df.select_dtypes(include=['object', 'category']).columns if i not in PK+['pk']]
    for col in categorical_features:
        print(col)
        train_df[col] = encoder.fit_transform(train_df[col])
        test_df[col] = encoder.fit_transform(test_df[col])

    # X
    X = train_df.drop(columns=PK + all_targets)
    test_X = test_df.drop(columns=PK + all_targets)
    print(f'# X shape: {X.shape}')
    print(f'# test_X shape: {test_X.shape}')

    print('\n STEP1: 실험 결과 확인')
    print("=============== Validation Results ==============")
    total_avg_f1s = []
    val_f1 = []
    binary_val_preds = {}
    multiclass_val_preds = {}
    binary_test_preds = {}
    multiclass_test_preds = {}
    test_preds = {}

    # Find optimal weights
    best_weights = []
    best_scores = []

    for col in targets_binary:
        # binary
        y = train_df[col]

        valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']
        train_df['pk'] = train_df['subject_id']+train_df['sleep_date']

        X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
        X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
        y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
        y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

        # Get parameters for both models
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
            'n_estimators': 1000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state
        }

        # Train LightGBM
        lgb_model = LGBMClassifier(**lgb_params)
        lgb_model.fit(X_train, y_train)

        # Train XGBoost
        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X_train, y_train)

        tabpfn_params = {
            'device': 'cuda'
        }

        # Train TabPFN
        tabpfn_model = TabPFNClassifier(**tabpfn_params)
        tabpfn_model.fit(X_train, y_train)
        tab_pred_valid = tabpfn_model.predict_proba(X_valid.values)[:, 1]

        lgb_pred_valid = lgb_model.predict_proba(X_valid)[:, 1]
        xgb_pred_valid = xgb_model.predict_proba(X_valid)[:, 1]

        pred_valid = (lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + tab_C * tab_pred_valid > 0.5).astype(int)

        f1 = f1_score(y_valid, pred_valid, average='macro')
        val_f1.append(f1)

        # Store predictions
        binary_val_preds[col] = {
            'lgb': lgb_pred_valid,
            'xgb': xgb_pred_valid,
            'tab': tab_pred_valid,
            'true': y_valid
        }

    # multiclass
    y = train_df[target_multiclass]
    X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
    X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
    y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
    y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

    # Get parameters for both models
    lgb_params = common_params['S1'].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': random_state
    }

    # 클래스 weight 계산
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    # 각 샘플에 대해 weight 매핑
    w_train = pd.Series(y_train).map(class_weights)
    w_train = compute_sample_weight(class_weight='balanced', y=y_train)

    # Train LightGBM
    lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=3)
    lgb_model.fit(X_train, y_train, sample_weight=w_train)

    # Train XGBoost
    xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=3)
    xgb_model.fit(X_train, y_train,sample_weight=w_train)

    tabpfn_params = {
        'device': 'cuda'
    }

    # Train TabPFN
    tabpfn_model = TabPFNClassifier(**tabpfn_params)
    tabpfn_model.fit(X_train, y_train)

    # Get predictions and ensemble
    lgb_pred_valid = lgb_model.predict_proba(X_valid)
    xgb_pred_valid = xgb_model.predict_proba(X_valid)
    tab_pred_valid = tabpfn_model.predict_proba(X_valid.values)

    pred_valid = np.argmax(lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + tab_C * tab_pred_valid, axis=1)

    f1 = f1_score(y_valid, pred_valid, average='macro')
    val_f1.append(f1)

    multiclass_val_preds = {
        'lgb': lgb_pred_valid,
        'xgb': xgb_pred_valid,
        'tab': tab_pred_valid,
        'true': y_valid
    }

    # Generate all possible weight combinations that sum to 1
    step = 0.1
    for lgb_A in np.arange(0, 1.1, step):
        for xgb_B in np.arange(0, 1.1 - lgb_A, step):
            for tab_C in np.arange(0, 1.1 - lgb_A - xgb_B, step):
                TOT = 1 - (lgb_A + xgb_B + tab_C)
                if TOT >= 0:
                    weights = (lgb_A, xgb_B, tab_C)
                    val_scores = []

                    # Binary targets
                    for col in targets_binary:
                        preds = binary_val_preds[col]
                        ensemble_pred = (lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab'] > 0.5).astype(int)
                        f1 = f1_score(preds['true'], ensemble_pred, average='macro')
                        val_scores.append(f1)
                        # print(f" Validation Score {col}:{f1:.4f}")

                    # Multiclass target
                    preds = multiclass_val_preds
                    ensemble_pred = np.argmax(lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab'] , axis=1)
                    f1 = f1_score(preds['true'], ensemble_pred, average='macro')
                    # print(f" Validation Score S1:{f1:.4f}")
                    val_scores.append(f1)

                    avg_score = np.mean(val_scores)
                    best_weights.append(weights)
                    best_scores.append(avg_score)

                    # print(f"Average Validation Score: {avg_score:.4f}")

    # Sort results and get top 3
    sorted_indices = np.argsort(best_scores)[::-1]
    top_3_weights = [best_weights[i] for i in sorted_indices]
    top_3_scores = [best_scores[i] for i in sorted_indices]

    # print("\nTop All Weight Combinations:")
    # for i, (weights, score) in enumerate(zip(top_3_weights, top_3_scores)):
    #     print(f"Rank {i+1}: lgb_A={weights[0]:.1f}, xgb_B={weights[1]:.1f}, tab_C={weights[2]:.1f} - Score: {score:.4f}")

    avg_f1 = np.mean(val_f1)
    total_avg_f1s.append(avg_f1)
    detail = " ".join([f"{name}({tname}):{score:.4f}" for name, tname, score in zip(targets_binary + [target_multiclass], targets_binary_name + ['S1'], val_f1)])
    print(f" 평균 F1: {avg_f1:.4f} / [상세] {detail}")
    print(f"# 전체 평균 F1: {np.mean(total_avg_f1s):.4f}")
    print("================================================")

    # modoling with 100% train & no valid
    print('\n STEP2: 전체 데이터로 모델 재학습')
    print("====== modeling with 100% train & no valid =====")

    # binary
    binary_preds = {}
    binary_preds_proba = {}
    for col in targets_binary:
        # Get parameters for both models
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
            'n_estimators': 1000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state
        }

        y = train_df[col]

        is_multiclass = False

        # Train LightGBM
        lgb_model = LGBMClassifier(**lgb_params)
        lgb_model.fit(X, y)

        # Train XGBoost
        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X, y)

        tabpfn_params = {
            'device': 'cuda'
        }

        # Train TabPFN
        tabpfn_model = TabPFNClassifier(**tabpfn_params)
        tabpfn_model.fit(X, y)

        tab_pred = tabpfn_model.predict_proba(test_X)[:, 1]
        lgb_pred = lgb_model.predict_proba(test_X)[:, 1]
        xgb_pred = xgb_model.predict_proba(test_X)[:, 1]

        binary_preds[col] = (lgb_A * lgb_pred + xgb_B * xgb_pred + tab_C * tab_pred > 0.5).astype(int)

        # Store predictions
        binary_test_preds[col] = {
            'lgb': lgb_pred,
            'xgb': xgb_pred,
            'tab': tab_pred
        }

        # Feature importance (using LightGBM's importance)
        fi_df = pd.DataFrame({'feature': X.columns, 'importance': lgb_model.feature_importances_})
        top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
        feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
        print(f"[{col}] {feat_str}")

    # multiclass
    y = train_df['S1']

    # Get parameters for both models
    lgb_params = common_params['S1'].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': random_state
    }

    # 클래스 weight 계산
    classes = np.unique(y)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
    class_weights = dict(zip(classes, weights))

    # 각 샘플에 대해 weight 매핑
    w_train = pd.Series(y).map(class_weights)
    w_train = compute_sample_weight(class_weight='balanced', y=y)

    is_multiclass = True

    # Train LightGBM
    lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=3)
    lgb_model.fit(X, y, sample_weight=w_train)

    # Train XGBoost
    xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=3)
    xgb_model.fit(X, y, sample_weight=w_train)

    tabpfn_params = {
        'device': 'cuda'
    }

     # Train TabPFN
    tabpfn_model = TabPFNClassifier(**tabpfn_params)
    tabpfn_model.fit(X, y)

    # Get predictions and ensemble
    lgb_pred = lgb_model.predict_proba(test_X)
    xgb_pred = xgb_model.predict_proba(test_X)
    tab_pred = tabpfn_model.predict_proba(test_X)

    multiclass_test_preds = {
        'lgb': lgb_pred,
        'xgb': xgb_pred,
        'tab': tab_pred
    }

    multiclass_pred = np.argmax(lgb_A * lgb_pred + xgb_B * xgb_pred + tab_C * tab_pred, axis=1)
    multiclass_pred_proba = lgb_A * lgb_pred + xgb_B * xgb_pred + tab_C * tab_pred

    # Feature importance
    fi_df = pd.DataFrame({'feature': X.columns, 'importance': lgb_model.feature_importances_})
    top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
    feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
    print(f"[S1] {feat_str}")

    # 예측 저장
    submission_final['S1'] = multiclass_pred
    for col in targets_binary:
      submission_final[col] = binary_preds[col]
    submission_final = submission_final[['subject_id', 'sleep_date', 'lifelog_date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']]
    fname = f"submission_{np.mean(total_avg_f1s)}.csv"
    submission_final.to_csv(fname, index=False)
    print(f"# {fname} 저장 완료")
    print(f"# submission shape:{submission_final.shape}")
    print("================================================")

    # Top 10 Weight Combinations
    print("\nTop 10 Weight Combinations:")
    for i, (weights, score) in enumerate(zip(top_3_weights[:10], top_3_scores[:10])):
        print(f"Rank {i+1}: lgb_A={weights[0]:.1f}, xgb_B={weights[1]:.1f}, tab_C={weights[2]:.1f} - Score: {score:.4f}")

        # Generate submission with these weights
        lgb_A, xgb_B, tab_C = weights

        # Binary predictions
        for col in targets_binary:
            preds = binary_test_preds[col]
            ensemble_pred = (lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab'] > 0.5).astype(int)
            submission_final[col] = ensemble_pred

        # Multiclass prediction
        preds = multiclass_test_preds
        ensemble_pred = np.argmax(lgb_A * preds['lgb'] + xgb_B * preds['xgb'] + tab_C * preds['tab'], axis=1)
        submission_final['S1'] = ensemble_pred

        fname = f"submission_top{i+1}_{score:.4f}.csv"
        submission_final.to_csv(fname, index=False)
        print(f"Saved submission to {fname}")

    # Use the best weights for final submission
    best_weights = top_3_weights[0]
    lgb_A, xgb_B, tab_C = best_weights

    # 모델별 예측결과 비율 비교
    a11 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
    a13 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
    a12 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
    a21 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
    a23 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
    a22 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
    result = pd.concat([a11, a13, a12, a21, a23, a22], axis=1)
    result.columns = ['학습sum','학습len','학습mean','테스트sum','테스트len','테스트mean']
    print('\n STEP3: 예측결과 비교표')
    display(result)
    oof_result = []
    return submission_final, oof_result

### seed

In [12]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(1)

### ============================

### 📦 모델 학습

In [13]:
%%time

# 공통 하이퍼파라미터
common_params = {
  'n_estimators': 5000,
  "learning_rate": 0.01,
  # 'min_data_in_leaf':2,
  # 'bagging_fraction':0.9,
  # 'feature_fraction':0.6,
  'lambda_l1': 5,
  'lambda_l2': 1,
  # 'max_depth': 4,
  'n_jobs': -1,
  'verbosity': -1
}

# 모델별 세부 하이퍼파라미터
best_param_dict = {}
best_param_dict['Q3'] = common_params
best_param_dict['S1'] = common_params
best_param_dict['S2'] = common_params
best_param_dict['S3'] = common_params
best_param_dict['Q1'] = common_params
best_param_dict['Q2'] = common_params

"""
// submission_top1_0.6492.csv

light_week_type_lag1
weekday
week_type
week_type_lag1
activehour_top_bssid
beforebed_top_bssid
# X shape: (450, 247)
# test_X shape: (250, 247)

 STEP1: 실험 결과 확인
=============== Validation Results ==============
tabpfn-v2-classifier.ckpt: 100%
 29.0M/29.0M [00:00<00:00, 78.9MB/s]
config.json: 100%
 37.0/37.0 [00:00<00:00, 4.81kB/s]
 평균 F1: 0.6441 / [상세] Q1(기상직후수면질):0.7234 Q2(취침전신체적피로):0.8157 Q3(취침전스트레스):0.6366 S2(수면효율):0.5489 S3(수면잠들기시간):0.7234 S1(S1):0.4163
# 전체 평균 F1: 0.6441
================================================

 STEP2: 전체 데이터로 모델 재학습
====== modeling with 100% train & no valid =====
[Q1] beforebed_통화_time(1144), Q1_te2(477), wake_time_ratio(453), wake_time_diff(322), mlight_first_wakeup_minutes(310), Q1_te(290), lights_off_time(282), sleep_duration_ratio(214), active_hour_mean_speed(199), activehour_total_screen_time(181)
[Q2] Q2_te(1990), Q2_te2(1827), activehour_total_screen_time(293), beforebed_unique_bssid_count(176), wake_time_lag1(163), light_rolling_wake_time_2d(161), activehour_screen_time_vs_avg_pct(144), beforebed_max_rssi(140), beforebed_top_bssid_count(139), active_hour_std_hr(134)
[Q3] Q3_te2(2299), light_sleep_time_lag2(330), mlight_first_wakeup_minutes(288), rolling_sleep_time_3d(275), light_rolling_sleep_duration_3d(218), Q3_te(214), beforebed_scan_count(211), active_hour_distance_x(196), activehour_통화_time(181), walking_minutes(169)
[S2] S2_te2(433), S2_te(422), light_sleep_time_lag1(198), work_hour_unknown_ratio(190), m_activity@240min@std@12h00m(176), beforebed_strong_signal_ratio(154), light_rolling_wake_time_2d(151), free_hour_rssi_mean(150), activehour_전화_time(147), sleep_hour_mean_speed(136)
[S3] S3_te(2612), S3_te2(336), beforebed_메신저_time(252), light_wake_time_diff(216), sleep_time_diff_lag1(209), light_sleep_time_lag2(199), m_activity_met@240min@sum@16h00m(156), free_hour_rssi_max(150), light_weekday_avg_sleep(137), 불끈시간부터기상시간(126)
[S1] S1_te(693), wake_time_diff(628), S1_te2(606), sleep_duration_ratio(495), m_activity_met@240min@sum@04h00m(413), beforebed_screen_time_vs_avg_pct(400), wake_time_ratio(382), rolling_wake_time_3d(340), m_activity_0@240min@std@20h00m(318), m_activity@240min@std@12h00m(316)
# submission_0.6440575238969299.csv 저장 완료
# submission shape:(250, 9)
================================================

Top 3 Weight Combinations:
Rank 1: lgb_A=0.0, xgb_B=0.6, tab_C=0.1 - Score: 0.6696
Saved submission to submission_top1_0.6696.csv
Rank 2: lgb_A=0.0, xgb_B=0.5, tab_C=0.2 - Score: 0.6695
Saved submission to submission_top2_0.6695.csv
Rank 3: lgb_A=0.0, xgb_B=0.3, tab_C=0.4 - Score: 0.6681
Saved submission to submission_top3_0.6681.csv
Rank 4: lgb_A=0.0, xgb_B=0.3, tab_C=0.5 - Score: 0.6662
Saved submission to submission_top4_0.6662.csv
Rank 5: lgb_A=0.0, xgb_B=0.4, tab_C=0.4 - Score: 0.6656
Saved submission to submission_top5_0.6656.csv
Rank 6: lgb_A=0.1, xgb_B=0.2, tab_C=0.5 - Score: 0.6654
Saved submission to submission_top6_0.6654.csv
Rank 7: lgb_A=0.0, xgb_B=0.0, tab_C=0.7 - Score: 0.6647
Saved submission to submission_top7_0.6647.csv
Rank 8: lgb_A=0.1, xgb_B=0.3, tab_C=0.3 - Score: 0.6645
Saved submission to submission_top8_0.6645.csv
Rank 9: lgb_A=0.0, xgb_B=0.2, tab_C=0.6 - Score: 0.6625
Saved submission to submission_top9_0.6625.csv
Rank 10: lgb_A=0.1, xgb_B=0.3, tab_C=0.4 - Score: 0.6624
Saved submission to submission_top10_0.6624.csv

 STEP3: 예측결과 비교표
학습sum	학습len	학습mean	테스트sum	테스트len	테스트mean
Q1	223	450	0.4956	106	250	0.4240
Q2	253	450	0.5622	126	250	0.5040
Q3	270	450	0.6000	143	250	0.5720
S1	390	450	0.8667	198	250	0.7920
S2	293	450	0.6511	124	250	0.4960
S3	298	450	0.6622	148	250	0.5920

CPU times: user 17min 32s, sys: 2.28 s, total: 17min 35s
Wall time: 2min 40s
"""

submission_final, oof_result = run_basemodel(train, test, valid_ids, best_param_dict, n_splits=5, random_state=41)

light_week_type_lag1
weekday
week_type
week_type_lag1
activehour_top_bssid
beforebed_top_bssid
# X shape: (450, 247)
# test_X shape: (250, 247)

 STEP1: 실험 결과 확인
 평균 F1: 0.6441 / [상세] Q1(기상직후수면질):0.7234 Q2(취침전신체적피로):0.8157 Q3(취침전스트레스):0.6366 S2(수면효율):0.5489 S3(수면잠들기시간):0.7234 S1(S1):0.4163
# 전체 평균 F1: 0.6441

 STEP2: 전체 데이터로 모델 재학습
[Q1] beforebed_통화_time(1144), Q1_te2(477), wake_time_ratio(453), wake_time_diff(322), mlight_first_wakeup_minutes(310), Q1_te(290), lights_off_time(282), sleep_duration_ratio(214), active_hour_mean_speed(199), activehour_total_screen_time(181)
[Q2] Q2_te(1990), Q2_te2(1827), activehour_total_screen_time(293), beforebed_unique_bssid_count(176), wake_time_lag1(163), light_rolling_wake_time_2d(161), activehour_screen_time_vs_avg_pct(144), beforebed_max_rssi(140), beforebed_top_bssid_count(139), active_hour_std_hr(134)
[Q3] Q3_te2(2299), light_sleep_time_lag2(330), mlight_first_wakeup_minutes(288), rolling_sleep_time_3d(275), light_rolling_sleep_duration_3d(218)

Unnamed: 0,학습sum,학습len,학습mean,테스트sum,테스트len,테스트mean
Q1,223,450,0.4956,106,250,0.424
Q2,253,450,0.5622,126,250,0.504
Q3,270,450,0.6,143,250,0.572
S1,390,450,0.8667,198,250,0.792
S2,293,450,0.6511,124,250,0.496
S3,298,450,0.6622,148,250,0.592


CPU times: user 20min 56s, sys: 2.44 s, total: 20min 58s
Wall time: 3min 10s


### 📦 이전제출과 비교

In [16]:
from pathlib import Path

# Reference file
reference_file = '/content/drive/MyDrive/data/ch2025_data_items/share/submissions/submission_top1_0.6492.csv'
ref_df = pd.read_csv(reference_file)

# Get all CSV files in data directory
data_dir = Path('./')
csv_files = list(data_dir.glob('*.csv'))

# Store differences for each file
differences = []

for csv_file in csv_files:
    if csv_file.name == os.path.basename(reference_file):
        continue

    # Read current file
    current_df = pd.read_csv(csv_file)

    # Calculate differences in specified columns
    diff_count = 0
    for col in ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']:
        diff_count += (ref_df[col] != current_df[col]).sum()

    differences.append((csv_file.name, diff_count))
    # print(f"File: {csv_file.name}, Differences: {diff_count}")

# Sort by difference count and get top 20
differences.sort(key=lambda x: x[1])
print("\nTop 10 files with smallest differences:")
for i, (file_name, diff_count) in enumerate(differences[:20], 1):
    print(f"{str(i).zfill(2)}. {file_name}: {diff_count} differences")


Top 10 files with smallest differences:
01. submission_0.6440575238969299.csv: 103 differences
02. submission_top5_0.6656.csv: 137 differences
03. submission_top10_0.6624.csv: 146 differences
04. submission_top4_0.6662.csv: 159 differences
05. submission_top6_0.6654.csv: 168 differences
06. submission_top9_0.6625.csv: 182 differences
07. submission_top1_0.6696.csv: 188 differences
08. submission_top2_0.6695.csv: 213 differences
09. submission_top8_0.6645.csv: 263 differences
10. submission_top3_0.6681.csv: 273 differences
11. submission_top7_0.6647.csv: 410 differences
