In [1]:
import numpy as np
import pandas as pd

In [2]:
merged_df = pd.read_csv('dataset/merged_df.csv')
metrics_train = pd.read_csv('dataset/ch2025_data_items/ch2025_metrics_train.csv')
sample_submission = pd.read_csv('dataset/ch2025_submission_sample.csv')

In [3]:
users = merged_df.subject_id.unique()

In [4]:
def prepare_train_test_data(metrics_train, merged_df):
    metrics_train['lifelog_date'] = pd.to_datetime(metrics_train['lifelog_date']).dt.date
    merged_df['date'] = pd.to_datetime(merged_df['date']).dt.date

    metrics_train = metrics_train.rename(columns={'lifelog_date': 'date'})

    train_df = pd.merge(metrics_train, merged_df, on=['subject_id', 'date'], how='inner')

    merged_keys = merged_df[['subject_id', 'date']]
    train_keys = metrics_train[['subject_id', 'date']]
    test_keys = pd.merge(merged_keys, train_keys, on=['subject_id', 'date'], how='left', indicator=True)
    test_keys = test_keys[test_keys['_merge'] == 'left_only'].drop(columns=['_merge'])

    test_df = pd.merge(test_keys, merged_df, on=['subject_id', 'date'], how='left')
    return train_df, test_df


def generate_submission(sample_submission, binary_preds, multiclass_pred, filename):
    sample_submission['lifelog_date'] = pd.to_datetime(sample_submission['lifelog_date']).dt.date
    submission_final = sample_submission[['subject_id', 'sleep_date', 'lifelog_date']].copy()
    submission_final['ID'] = submission_final['subject_id'] + '_' + submission_final['lifelog_date'].astype(str)

    submission_final['S1'] = multiclass_pred
    for col in binary_preds:
        submission_final[col] = binary_preds[col].astype(int)

    submission_final = submission_final[['subject_id', 'sleep_date', 'lifelog_date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']]
    submission_final.to_csv(filename, index=False)
    print(f"✅ 제출 파일 생성 완료: {filename}")

In [5]:
train_df, test_df = prepare_train_test_data(metrics_train, merged_df)

In [6]:
targets_binary = ['Q1', 'Q2', 'Q3', 'S2', 'S3']
target_multiclass = 'S1'

In [7]:
X = train_df.drop(columns=['sleep_date', 'date'] + targets_binary + [target_multiclass]).fillna(0)

Y = train_df[['subject_id'] + targets_binary + [target_multiclass]].fillna(0)

In [8]:
seq_len = 14 # Best : 14

X_seq = []
Y_seq = []

for user in users:
    for i in range(X[X.subject_id==user].shape[0]-seq_len):
        X_seq.append(X[X.subject_id==user].iloc[i:i+seq_len, 1:].to_numpy())
        Y_seq.append(Y[Y.subject_id==user].iloc[i+seq_len, 1:])
    
X_seq = np.array(X_seq)
#Y_seq = np.array(Y_seq)

print(X_seq.shape, np.array(Y_seq).shape)

X_seq_len = X_seq.shape[0]

(310, 14, 126) (310, 6)


In [9]:
import datetime as dt

test_X_seq = []

for user in users:
    for date in sample_submission[sample_submission.subject_id == user].lifelog_date:
        c_index = merged_df[(merged_df['subject_id'] == user) & (merged_df['date'] == dt.datetime.strptime(date, "%Y-%m-%d").date())].index
        test_X_seq.append(merged_df.iloc[c_index[0]-seq_len:c_index[0], :].drop(columns=['subject_id', 'date']).fillna(0).to_numpy())
        
test_X_seq = np.array(test_X_seq)
test_X_seq_len = test_X_seq.shape[0]

In [10]:
from sklearn.preprocessing import StandardScaler
import torch

# Example: Check the shape of X_seq
print(X_seq.shape)

# If X_seq is 3D, reshape it to 2D
if len(X_seq.shape) == 3:
    X_seq = X_seq.reshape(-1, X_seq.shape[-1])
    test_X_seq = test_X_seq.reshape(-1, test_X_seq.shape[-1])

# Apply StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_seq)
X_train = X_train_scaled.reshape(X_seq_len, seq_len, 126)
X_train = torch.tensor(X_train, dtype=torch.float32)

X_test_scaled = scaler.transform(test_X_seq)
X_test = X_test_scaled.reshape(test_X_seq_len, seq_len, 126)

(310, 14, 126)


In [11]:
from models.iftransformer import *
from train import *
from prediction import *

import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch

seed_value = 42

torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


binary_preds = {}
binary_f1 = {}

f1_scores = []

hidden_dim =512 # Best : 256
num_heads =64 # Best : 64
n_layers = 6 # Best : 6

epoch = 100 # Best : 10

X_train = torch.tensor(X_train, dtype=torch.float32)

for col in targets_binary:
    y_train = torch.tensor(np.array([[y[col]] for y in Y_seq]), dtype=torch.long).view(-1)
    
    print(X_train.shape, y_train.shape)
    dataset = TensorDataset(X_train, y_train)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    
    model_bin = IFTransformer(input_dim=X_train.shape[-1], num_heads = num_heads, n_layers = n_layers, num_classes=2, hidden_dim=int(hidden_dim), dropout=0.1)
    binary_f1[col] = train_model(model_bin, dataloader, nn.CrossEntropyLoss(), optim.Adam(model_bin.parameters(), lr=0.0001), col = col, epochs=epoch) # Best lr : 0.0001
    binary_preds[col] = predict(model_bin, torch.tensor(X_test, dtype=torch.float32), col)
    f1_scores.append(binary_f1[col])

y_multi_train = torch.tensor(np.array([[y[target_multiclass]] for y in Y_seq]), dtype=torch.long).view(-1)
dataset_multi = TensorDataset(X_train, y_multi_train)
dataloader_multi = DataLoader(dataset_multi, batch_size=32, shuffle=True)

model_multi = IFTransformer(input_dim=X_train.shape[-1], num_heads = num_heads, n_layers = n_layers, num_classes=3, hidden_dim=int(hidden_dim), dropout=0.1)
multiclass_f1 = train_model(model_multi, dataloader_multi, nn.CrossEntropyLoss(), optim.Adam(model_multi.parameters(), lr=0.0001), col = 'S1', epochs=epoch) # Best lr : 0.0001
f1_scores.append(multiclass_f1)

multiclass_pred = predict(model_multi, torch.tensor(X_test, dtype=torch.float32), 'S1')
avg_f1 = sum(f1_scores) / len(f1_scores)    


torch.Size([310, 14, 126]) torch.Size([310])


  X_train = torch.tensor(X_train, dtype=torch.float32)


Epoch 1: Warm-up LR = 0.000010
Epoch: 1, Avg Val Loss: 0.8167, Avg Val F1 Score: 0.5581
✅ Best model saved for Q1
Epoch 2: Warm-up LR = 0.000020
Epoch: 2, Avg Val Loss: 1.0645, Avg Val F1 Score: 0.5854
✅ Best model saved for Q1
Epoch 3: Warm-up LR = 0.000030
Epoch: 3, Avg Val Loss: 1.0478, Avg Val F1 Score: 0.5811
Epoch 4: Warm-up LR = 0.000040
Epoch: 4, Avg Val Loss: 1.0291, Avg Val F1 Score: 0.6205
✅ Best model saved for Q1
Epoch 5: Warm-up LR = 0.000050
Epoch: 5, Avg Val Loss: 1.1112, Avg Val F1 Score: 0.5631
Epoch 6: Warm-up LR = 0.000060
Epoch: 6, Avg Val Loss: 1.5918, Avg Val F1 Score: 0.4425
Epoch 7: Warm-up LR = 0.000070
Epoch: 7, Avg Val Loss: 1.057, Avg Val F1 Score: 0.576
Epoch 8: Warm-up LR = 0.000080
Epoch: 8, Avg Val Loss: 1.1392, Avg Val F1 Score: 0.5785
Epoch 9: Warm-up LR = 0.000090
Epoch: 9, Avg Val Loss: 0.993, Avg Val F1 Score: 0.548
Epoch 10: Warm-up LR = 0.000100
Epoch: 10, Avg Val Loss: 1.2872, Avg Val F1 Score: 0.5494
Epoch: 11, Avg Val Loss: 1.3433, Avg Val F1 

In [12]:
import datetime

current_time = datetime.datetime.now()

generate_submission(sample_submission, binary_preds, multiclass_pred, f'submission_IFTransformer_{hidden_dim}_head_{num_heads}_num_layer_{n_layers}_time_{current_time}_seq_{seq_len}_epoch_{epoch}_val_0.2.csv')

✅ 제출 파일 생성 완료: submission_IFTransformer_1024_head_256_num_layer_12_time_2025-05-19 23:06:41.023351_seq_14_epoch_100_val_0.2.csv


In [13]:
avg_f1

0.6634791677409732