# Data Preprocessing of Time Series Transformer

Q1-Q3 Column Prediction model "TST" 전용 전처리 코드입니다.

In [1]:
import datetime
import glob
import os
import gc

import pandas as pd
import numpy as np

from tqdm import tqdm

코드 실행 전에 다음 파일들이 `../data` 경로(아래 `data_dir` 설정값)에 포함되어야 합니다.

```
.
├── [4.0K]  answer_sample.csv
├── [9.5K]  README_2020.txt
├── [4.0K]  test
│   ├── [627M]  ch2024_test__m_acc_part_5.parquet.gzip
│   ├── [1.1G]  ch2024_test__m_acc_part_6.parquet.gzip
│   ├── [1.0G]  ch2024_test__m_acc_part_7.parquet.gzip
│   ├── [935M]  ch2024_test__m_acc_part_8.parquet.gzip
│   ├── [912K]  ch2024_test__m_activity.parquet.gzip
│   ├── [3.2M]  ch2024_test__m_ambience.parquet.gzip
│   ├── [ 12M]  ch2024_test__m_gps.parquet.gzip
│   ├── [ 93K]  ch2024_test__m_light.parquet.gzip
│   ├── [203K]  ch2024_test__m_usage_stats.parquet.gzip
│   ├── [935K]  ch2024_test__w_heart_rate.parquet.gzip
│   ├── [106K]  ch2024_test__w_light.parquet.gzip
│   └── [909K]  ch2024_test__w_pedo.parquet.gzip
├── [4.0K]  train
│   ├── [4.0K]  user01
│   ├── [4.0K]  user02
│   ├── [4.0K]  user03
│   ├── [4.0K]  user04
│   ├── [4.0K]  user05
│   ├── [4.0K]  user06
│   ├── [4.0K]  user07
│   ├── [4.0K]  user08
│   ├── [4.0K]  user09
│   ├── [4.0K]  user10
│   ├── [4.0K]  user11
│   ├── [4.0K]  user12
│   ├── [4.0K]  user21
│   ├── [4.0K]  user22
│   ├── [4.0K]  user23
│   ├── [4.0K]  user24
│   ├── [4.0K]  user25
│   ├── [4.0K]  user26
│   ├── [4.0K]  user27
│   ├── [4.0K]  user28
│   ├── [4.0K]  user29
│   ├── [4.0K]  user30
│   ├── [1.1K]  user_info_2020.csv
│   ├── [ 73K]  user_sleep_2020.csv
│   └── [ 89K]  user_survey_2020.csv
├── [ 18K]  train_label.csv
├── [4.0K]  val
│   ├── [1.3G]  ch2024_val__m_acc_part_1.parquet.gzip
│   ├── [563M]  ch2024_val__m_acc_part_2.parquet.gzip
│   ├── [662M]  ch2024_val__m_acc_part_3.parquet.gzip
│   ├── [838M]  ch2024_val__m_acc_part_4.parquet.gzip
│   ├── [853K]  ch2024_val__m_activity.parquet.gzip
│   ├── [3.7M]  ch2024_val__m_ambience.parquet.gzip
│   ├── [ 15M]  ch2024_val__m_gps.parquet.gzip
│   ├── [ 88K]  ch2024_val__m_light.parquet.gzip
│   ├── [192K]  ch2024_val__m_usage_stats.parquet.gzip
│   ├── [925K]  ch2024_val__w_heart_rate.parquet.gzip
│   ├── [101K]  ch2024_val__w_light.parquet.gzip
│   └── [901K]  ch2024_val__w_pedo.parquet.gzip
└── [2.9K]  val_label.csv
```

In [2]:
# Path configuration
data_dir = "../data"  # directory holding the raw input data
preprocessed_dir_ts = "../data_preprocessed_ts"  # directory where preprocessed data is written

# Sub-directory of the preprocessed output used for the label CSV files
train_label_dir = os.path.join(preprocessed_dir_ts, "train_label")

## 0. Regression Label 전처리

### user_survey_2020.csv

In [None]:
# Build one record per (user, day): the evening ("pm") survey row enriched with
# the sleep-related answers given in the following morning ("am") survey.
survey_df = pd.read_csv(os.path.join(data_dir, "train", "user_survey_2020.csv"))
survey_df.sort_values(by=['userId', 'date'], inplace=True)
survey_df['date'] = pd.to_datetime(survey_df['date'])
survey_df.reset_index(drop=True, inplace=True)

# The pairing below is only applied when row 1 has no 'sleep' answer. A file
# failing this check used to skip the pairing and then crash with a NameError
# on the undefined result; fail with an explicit message instead.
# NOTE(review): presumably this detects the raw am/pm layout -- confirm intent.
if not pd.isna(survey_df.at[1, 'sleep']):
    raise ValueError(
        "user_survey_2020.csv: row 1 already has a 'sleep' value, "
        "so the pm/am survey rows cannot be paired"
    )

# Answers copied from the morning row onto the preceding evening row.
morning_columns = ['sleep', 'sleepProblem', 'dream', 'amCondition', 'amEmotion']

results = []
for index, row in tqdm(survey_df.iterrows(), total=survey_df.shape[0]):
    if row['amPm'] != 'pm' or index + 1 >= len(survey_df):
        continue
    next_row = survey_df.iloc[index + 1]
    # The frame is sorted by user, so the row after a user's last "pm" entry
    # belongs to the next user; never copy answers across users.
    # NOTE(review): the gap between the two dates is not checked -- a missing
    # survey day still pairs with the next available morning.
    if next_row['amPm'] != 'am' or next_row['userId'] != row['userId']:
        continue
    new_row = row.copy()
    for column in morning_columns:
        new_row[column] = next_row[column]
    results.append(new_row)

# Passing the column list keeps the frame well-formed even when no pair exists.
survey_combined_df = pd.DataFrame(results, columns=survey_df.columns)
survey_combined_df.drop(columns=['amPm', 'startInput', 'endInput'], inplace=True)
survey_combined_df.reset_index(drop=True, inplace=True)

os.makedirs(train_label_dir, exist_ok=True)
survey_combined_df.to_csv(os.path.join(train_label_dir, "user_survey_2020.csv"), index=False)

### user_sleep_2020.csv

In [7]:
def count_duplicate_rows(df):
    """Return how many rows share their ('userId', 'date') pair with another row.

    Every member of a repeated pair is counted (``keep=False``), not only the
    occurrences after the first one.
    """
    is_repeated = df.duplicated(subset=['userId', 'date'], keep=False)
    return int(is_repeated.sum())

In [8]:
# Load the sleep records ordered by user and sleep start.
sleep_df = (
    pd.read_csv(os.path.join(data_dir, "train", "user_sleep_2020.csv"))
    .sort_values(by=['userId', 'startDt'])
    .reset_index(drop=True)
)
sleep_df['date'] = pd.to_datetime(sleep_df['date'])

# Epoch seconds -> timezone-aware datetimes at a fixed UTC+9 offset.
utc_plus_9 = datetime.timezone(datetime.timedelta(hours=9))
for epoch_column in ('lastUpdate', 'startDt', 'endDt'):
    sleep_df[epoch_column] = sleep_df[epoch_column].apply(
        lambda x: datetime.datetime.fromtimestamp(x, utc_plus_9)
    )
sleep_df = sleep_df.drop(columns=['timezone'])

# Seconds elapsed between the start and the end of each record.
sleep_df["time_in_bed"] = (sleep_df["endDt"] - sleep_df["startDt"]).dt.total_seconds()

# How records sharing the same (userId, date) are folded into one row;
# the pair order defines the output column order.
aggregations = dict([
    ('startDt', 'first'),
    ('endDt', 'last'),
    ('lastUpdate', 'last'),
    ('wakeupduration', 'sum'),
    ('lightsleepduration', 'sum'),
    ('deepsleepduration', 'sum'),
    ('wakeupcount', 'sum'),
    ('durationtosleep', 'sum'),
    ('remsleepduration', 'sum'),
    ('durationtowakeup', 'sum'),
    ('hr_average', 'mean'),
    ('hr_min', 'min'),
    ('hr_max', 'max'),
    ('rr_average', 'mean'),
    ('rr_min', 'min'),
    ('rr_max', 'max'),
    ('breathing_disturbances_intensity', 'sum'),
    ('snoring', 'sum'),
    ('snoringepisodecount', 'sum'),
    ('sleep_score', 'mean'),
    ('time_in_bed', 'sum'),
])

per_day = sleep_df.groupby(['userId', 'date'])
sleep_df_agg = per_day.agg(aggregations).reset_index()
# Move every record to the previous calendar day before it is joined on 'date'.
sleep_df_agg['date'] = sleep_df_agg['date'] - pd.Timedelta(days=1)
sleep_df_agg.to_csv(os.path.join(train_label_dir, "user_sleep_2020.csv"), index=False)

### train_label.csv 와 결합

In [9]:
# Join survey answers, aggregated sleep metrics and the provided training
# labels into one table keyed by (userId, date), then derive targets y1..y7.
train_label_df = pd.read_csv(os.path.join(data_dir, "train_label.csv"), index_col=0)

train_label_df['date'] = pd.to_datetime(train_label_df['date'])
train_label_df.rename(columns={"subject_id": "userId"}, inplace=True)

# Normalise the merge key BEFORE merging. This conversion previously ran after
# the merges, where it could no longer influence the join.
survey_combined_df["date"] = pd.to_datetime(survey_combined_df["date"])

result_df = pd.merge(survey_combined_df, sleep_df_agg, on=['userId', 'date'], how='inner')
result_df = pd.merge(result_df, train_label_df, on=['userId', 'date'], how='inner')

# Survey-based targets.
result_df["y1"] = result_df["sleep"]
result_df["y2"] = result_df["pmEmotion"]
result_df["y3"] = result_df["pmStress"]

# Sleep-record-based targets; y4 is the sum of the deep, light and REM durations.
result_df["y4"] = result_df["deepsleepduration"] + result_df["lightsleepduration"] + result_df["remsleepduration"]
result_df["y5"] = result_df["wakeupduration"]
result_df["y6"] = result_df["durationtosleep"]
result_df["y7"] = result_df["durationtowakeup"]

result_df.to_csv(os.path.join(train_label_dir, "y_concated_regression.csv"), index=False)

## 1. Train

In [4]:
from data_preprocessing_tst import preprocess_train_first, preprocess_train_second_add_activity

In [None]:
# NOTE: this step takes a very long time (roughly 10 hours).
preprocess_train_first(os.path.join(data_dir, "train"), os.path.join(preprocessed_dir_ts, "train"))

In [None]:
# Second training pass, given the preprocessed root and the "train_after" directory.
# NOTE(review): presumably adds the activity features and writes the per-user
# pickles read from "train_after" further below -- confirm in data_preprocessing_tst.
preprocess_train_second_add_activity(preprocessed_dir_ts, os.path.join(preprocessed_dir_ts, "train_after"))

In [None]:
# Assemble the final training set: for every user keep only the days that have
# a label row, give each (user, day) a globally unique sample id, and write the
# stacked features and labels to "train_final".
label_csv = pd.read_csv(os.path.join(train_label_dir, "y_concated_regression.csv"))
label_csv['date'] = pd.to_datetime(label_csv['date'])

label_columns = ['date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4', 'y1', 'y2', 'y3', 'y4', 'y5', 'y6', 'y7']

# Per-user pieces are collected and concatenated once after the loop instead of
# re-concatenating the growing frame on every iteration.
train_frames = []
y_frames = []

# Largest sample id handed out so far (-1 means "none yet").
train_max_idx = -1
y_max_idx = -1

train_check_all, y_check_all = 0, 0

for user in tqdm(range(1, 31)):
    try:
        user_df = pd.read_pickle(os.path.join(preprocessed_dir_ts, "train_after", f"user{user:02d}.pkl"))
    except Exception as e:
        # Best effort: the documented data tree has no user13-user20, so an
        # unreadable pickle is reported and the user is skipped.
        print(e)
        continue

    user_df['date'] = user_df.index.date
    user_df.reset_index(inplace=True)

    label_csv_user = label_csv[label_csv['userId'] == f'user{user:02d}']
    label_csv_user = label_csv_user[label_csv_user['date'].isin(user_df['date'])]
    label_csv_user = label_csv_user[label_columns].reset_index(drop=True)

    unique_dates = label_csv_user['date'].dt.date.unique()

    # Explicit copy: the filtered frame is modified below, which would
    # otherwise be a write to a slice of the loaded frame.
    user_df = user_df[user_df['date'].isin(unique_dates)].copy()
    day_codes, days = user_df['date'].factorize()
    user_df["id"] = day_codes + train_max_idx + 1
    user_df = user_df.set_index('id').drop(columns=['date'])
    label_csv_user.index += y_max_idx + 1

    train_check_all += len(days)
    y_check_all += len(label_csv_user.index)

    train_frames.append(user_df)
    y_frames.append(label_csv_user)
    # Advance by the number of ids actually used, so a user without any
    # matching day leaves the counters untouched instead of turning them NaN.
    train_max_idx += len(days)
    y_max_idx += len(label_csv_user.index)

if not train_frames:
    raise RuntimeError("no user pickle could be loaded from the 'train_after' directory")

train_all_df = pd.concat(train_frames)
y_all_df = pd.concat(y_frames)

train_all_df.drop(columns=['time'], inplace=True)
y_all_df.drop(columns=['date'], inplace=True)

print(f"train_check_all: {train_check_all}, y_check_all: {y_check_all}")
print("train_all_df.shape", len(train_all_df.index.unique()), "y_all_df.shape", y_all_df.shape[0])

os.makedirs(os.path.join(preprocessed_dir_ts, "train_final"), exist_ok=True)
train_all_df.to_csv(os.path.join(preprocessed_dir_ts, "train_final", "train_all.csv"))
y_all_df.to_csv(os.path.join(preprocessed_dir_ts, "train_final", "train_y_all.csv"))

## 2. Validation

In [8]:
def _expand_activity_one_hot(group):
    """One-hot encode 'm_activity' and upsample the frame to 1-second steps.

    Returns a frame with 'timestamp' back as a regular column and the int8
    columns 'm_activity_0' .. 'm_activity_8'; codes that never occur in the
    group are added as all-zero columns.
    """
    activity_codes = group["m_activity"].apply(int)
    one_hot = pd.get_dummies(activity_codes, prefix='m_activity')
    expanded = pd.concat([group.drop(columns=["m_activity"]), one_hot], axis=1)
    # Forward-fill between observations; back-fill covers leading gaps.
    expanded = expanded.set_index('timestamp').resample('1s').ffill().bfill()
    for i in range(9):
        column = f'm_activity_{i}'
        if column in expanded.columns:
            expanded[column] = expanded[column].astype(np.int8)
        else:
            expanded[column] = np.int8(0)
    return expanded.reset_index()


def split_sensor_by_user(dic, sensor_type):
    """Split ``dic[sensor_type]`` per subject and resample each part to 1 second.

    For every ``subject_id`` value a frame indexed by ``time`` is stored in
    ``dic`` under ``f"{renamed_sensor}_{subject_id}"``, where a leading ``w_``
    in the sensor name is replaced by ``s_``. The combined
    ``dic[sensor_type]`` entry is removed afterwards.

    * ``m_activity`` becomes nine int8 one-hot columns, forward/back-filled.
    * Any other sensor gets its columns prefixed with the sensor name (a single
      column is named after the sensor itself), duplicate timestamps dropped
      (first kept), the 1-second mean taken and gaps interpolated over time.

    Args:
        dic: dict of sensor name -> DataFrame with ``subject_id`` and
            ``timestamp`` columns; modified in place.
        sensor_type: key of the sensor frame to split.

    Returns:
        None.
    """
    sensor_type_rename = "s_" + sensor_type[2:] if sensor_type.startswith("w_") else sensor_type
    is_activity = sensor_type_rename == "m_activity"

    for name, group in tqdm(dic[sensor_type].groupby('subject_id')):
        if is_activity:
            group = _expand_activity_one_hot(group)

        # Non in-place calls: the frames yielded by groupby are never mutated.
        group = group.drop(columns=['subject_id']).set_index('timestamp')
        group.index.names = ['time']

        if is_activity:
            resampled = group
        else:
            if len(group.columns) > 1:
                group.columns = [sensor_type_rename + "_" + col for col in group.columns]
            else:
                group.columns = [sensor_type_rename]
            group = group[~group.index.duplicated(keep='first')]
            # Lower-case 's' is the non-deprecated alias for seconds.
            resampled = group.resample('1s').mean().interpolate(method='time')

        dic[f'{sensor_type_rename}_{name}'] = resampled

    print(f"Deleting {sensor_type}")
    # Only the dict entry needs releasing; deleting the loop locals used to
    # raise UnboundLocalError when the frame contained no subject at all.
    del dic[sensor_type]

In [9]:
def preprocess_val_test(is_val=True):
    """Load one split's parquet files and split the kept sensors per subject.

    Reads every file under ``data_dir/val`` (``is_val=True``, subjects 1-4) or
    ``data_dir/test`` (``is_val=False``, subjects 5-8), merges the four
    ``m_acc_part_*`` frames into ``m_acc``, discards the sensors that are not
    used, and runs ``split_sensor_by_user`` on ``m_acc``, ``m_gps``,
    ``m_activity`` and ``w_heart_rate``.

    Returns:
        The dict populated by ``split_sensor_by_user``.
    """
    if is_val:
        user_list, split_name = list(range(1, 5)), "val"
    else:
        user_list, split_name = list(range(5, 9)), "test"
    file_path = glob.glob(os.path.join(data_dir, split_name, "*"))

    dic = {}
    for parquet_path in tqdm(file_path, desc="Reading files to dict"):
        # "<prefix>__<sensor>.parquet.gzip" -> "<sensor>"
        sensor_key = os.path.basename(parquet_path).split('__')[1].split('.')[0]
        dic[sensor_key] = pd.read_parquet(parquet_path)

    # The accelerometer data is stored as one part file per subject.
    acc_part_keys = [f"m_acc_part_{user_list[k]}" for k in range(4)]
    dic["m_acc"] = pd.concat(
        [dic[key].reset_index(drop=True) for key in acc_part_keys],
        ignore_index=True,
    )

    for key in acc_part_keys:
        del dic[key]
    for key in ("m_light", "m_usage_stats", "w_pedo", "w_light", "m_ambience"):
        del dic[key]

    gc.collect()

    for sensor_type in ("m_acc", "m_gps", "m_activity", "w_heart_rate"):
        split_sensor_by_user(dic, sensor_type)

    return dic

In [10]:
# Load the validation parquet files and split the kept sensors per subject (ids 1-4).
val_dict = preprocess_val_test(is_val=True)

Reading files to dict: 100%|██████████| 12/12 [00:17<00:00,  1.42s/it]
100%|██████████| 4/4 [03:31<00:00, 52.92s/it]


Deleting m_acc


100%|██████████| 4/4 [00:04<00:00,  1.21s/it]


Deleting m_gps


100%|██████████| 4/4 [00:03<00:00,  1.02it/s]


Deleting m_activity


100%|██████████| 4/4 [00:01<00:00,  2.79it/s]

Deleting w_heart_rate





In [11]:
# Merge all per-sensor frames of each validation subject into one wide frame,
# fill the gaps left by the outer join and pickle the result.
val_pickle_dir = os.path.join(preprocessed_dir_ts, "val")
os.makedirs(val_pickle_dir, exist_ok=True)

for i in tqdm(range(1, 5)):
    key_suffix = f"_{i}"
    user_df = None
    for key, value in val_dict.items():
        if not key.endswith(key_suffix):
            continue
        if user_df is None:
            user_df = value
        else:
            user_df = pd.concat([user_df, value], join='outer', axis=1)

    for col in user_df.columns:
        if col.startswith("m_activity"):
            # Activity flags: carry the neighbouring value forward, then backward.
            user_df[col] = user_df[col].ffill().bfill()
        else:
            # Other signals: interpolate along the time index.
            user_df[col] = user_df[col].interpolate(method='time')

    user_df.to_pickle(os.path.join(val_pickle_dir, f"user{i:02d}.pkl"))

100%|██████████| 4/4 [00:15<00:00,  3.98s/it]


In [None]:
# Downsample each validation subject's merged frame to 10-minute windows.
continuous_columns = ['m_acc_x', 'm_acc_y', 'm_acc_z',
                      'm_gps_latitude', 'm_gps_longitude', 's_heart_rate']
activity_columns = [f'm_activity_{i}' for i in range(9)]

# Continuous signals get summary statistics; one-hot activity flags keep their
# maximum within the window. Built once, not on every iteration.
resample_dict_10_min = {col: ['mean', 'std', 'min', 'max'] for col in continuous_columns}
resample_dict_10_min.update({col: ['max'] for col in activity_columns})

val_after_dir = os.path.join(preprocessed_dir_ts, "val_after")
os.makedirs(val_after_dir, exist_ok=True)

for user in tqdm(range(1, 5)):
    user_df = pd.read_pickle(os.path.join(preprocessed_dir_ts, "val", f"user{user:02d}.pkl"))

    # Add any missing activity column as an all-zero int8 flag. The previous
    # None placeholder created an object column for the 'max' aggregation.
    for col in activity_columns:
        if col not in user_df.columns:
            user_df[col] = np.int8(0)

    # Selecting the feature columns also drops m_gps_altitude / m_gps_speed.
    user_df = user_df[continuous_columns + activity_columns]

    # resample
    user_df = user_df.resample('10min').agg(resample_dict_10_min)
    # Flatten the (column, statistic) pairs into "column_statistic" names.
    user_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in user_df.columns]
    user_df = user_df.ffill().bfill()

    user_df.to_pickle(os.path.join(val_after_dir, f"user{user:02d}.pkl"))

In [None]:
# Assemble the final validation set: for every subject keep only the days that
# have a label row, give each (subject, day) a globally unique sample id, and
# write the stacked features and labels to "val_final".
label_csv = pd.read_csv(os.path.join(data_dir, "val_label.csv"))
label_csv['date'] = pd.to_datetime(label_csv['date'])

label_columns = ['date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']

# Per-subject pieces are collected and concatenated once after the loop instead
# of re-concatenating the growing frame on every iteration.
val_test_frames = []
y_frames = []

# Largest sample id handed out so far (-1 means "none yet").
val_test_max_idx = -1
y_max_idx = -1

val_test_check_all, y_check_all = 0, 0

for user in tqdm(range(1, 5)):
    try:
        user_df = pd.read_pickle(os.path.join(preprocessed_dir_ts, "val_after", f"user{user:02d}.pkl"))
    except Exception as e:
        # Best effort: report the unreadable pickle and skip the subject.
        print(e)
        continue

    user_df['date'] = user_df.index.date
    user_df.reset_index(inplace=True)

    label_csv_user = label_csv[label_csv['subject_id'] == user]
    label_csv_user = label_csv_user[label_csv_user['date'].isin(user_df['date'])]
    label_csv_user = label_csv_user[label_columns].reset_index(drop=True)

    unique_dates = label_csv_user['date'].dt.date.unique()

    # Explicit copy: the filtered frame is modified below, which would
    # otherwise be a write to a slice of the loaded frame.
    user_df = user_df[user_df['date'].isin(unique_dates)].copy()
    day_codes, days = user_df['date'].factorize()
    user_df["id"] = day_codes + val_test_max_idx + 1
    user_df = user_df.set_index('id').drop(columns=['date'])
    label_csv_user.index += y_max_idx + 1

    val_test_check_all += len(days)
    y_check_all += len(label_csv_user.index)

    val_test_frames.append(user_df)
    y_frames.append(label_csv_user)
    # Advance by the number of ids actually used, so a subject without any
    # matching day leaves the counters untouched instead of turning them NaN.
    val_test_max_idx += len(days)
    y_max_idx += len(label_csv_user.index)

if not val_test_frames:
    raise RuntimeError("no user pickle could be loaded from the 'val_after' directory")

val_test_all_df = pd.concat(val_test_frames)
y_all_df = pd.concat(y_frames)

val_test_all_df.drop(columns=['time'], inplace=True)
y_all_df.drop(columns=['date'], inplace=True)

print(val_test_all_df.columns)
print(y_all_df.columns)
print(f"val_test_check_all: {val_test_check_all}, y_check_all: {y_check_all}")
print("val_test_all_df.shape", len(val_test_all_df.index.unique()), "y_all_df.shape", y_all_df.shape[0])

os.makedirs(os.path.join(preprocessed_dir_ts, "val_final"), exist_ok=True)
val_test_all_df.to_csv(os.path.join(preprocessed_dir_ts, "val_final", "val_all.csv"))
y_all_df.to_csv(os.path.join(preprocessed_dir_ts, "val_final", "val_y_all.csv"))

TST Pretraining을 위해서, Train과 Validation 합하기

In [16]:
# Reload the final train / validation tables and renumber the validation sample
# ids so that they continue directly after the last training id.
train_final_dir = os.path.join(preprocessed_dir_ts, "train_final")
val_final_dir = os.path.join(preprocessed_dir_ts, "val_final")

train_df = pd.read_csv(os.path.join(train_final_dir, "train_all.csv"), index_col=0)
val_df = pd.read_csv(os.path.join(val_final_dir, "val_all.csv"), index_col=0)
val_df.index = val_df.index + (int(train_df.index.max()) + 1)

train_y_df = pd.read_csv(os.path.join(train_final_dir, "train_y_all.csv"), index_col=0)
val_y_df = pd.read_csv(os.path.join(val_final_dir, "val_y_all.csv"), index_col=0)
val_y_df.index = val_y_df.index + (int(train_y_df.index.max()) + 1)

In [17]:
# Stack training and validation rows (features first, then labels) into the combined set.
train_val_concat = pd.concat([train_df, val_df])
train_val_y_concat = pd.concat([train_y_df, val_y_df])

In [18]:
# Write the combined train+validation features and labels to CSV.
train_val_concat.to_csv(os.path.join(preprocessed_dir_ts, "train_val_concat.csv"))
train_val_y_concat.to_csv(os.path.join(preprocessed_dir_ts, "train_val_y_concat.csv"))

## 3. Test

In [19]:
# Load the test parquet files and split the kept sensors per subject (ids 5-8).
test_dict = preprocess_val_test(is_val=False)

Reading files to dict: 100%|██████████| 12/12 [00:49<00:00,  4.12s/it]
100%|██████████| 4/4 [03:44<00:00, 56.13s/it]


Deleting m_acc


100%|██████████| 4/4 [00:05<00:00,  1.30s/it]


Deleting m_gps


100%|██████████| 4/4 [00:03<00:00,  1.01it/s]


Deleting m_activity


100%|██████████| 4/4 [00:01<00:00,  2.70it/s]

Deleting w_heart_rate





In [20]:
# Merge all per-sensor frames of each test subject into one wide frame and
# pickle it.
# NOTE(review): unlike the validation cell, no interpolation / ffill-bfill is
# applied to the merged frame before pickling -- confirm this is intentional.
test_pickle_dir = os.path.join(preprocessed_dir_ts, "test")
os.makedirs(test_pickle_dir, exist_ok=True)

for i in tqdm(range(5, 9)):
    key_suffix = f"_{i}"
    user_df = None
    for key, value in test_dict.items():
        if not key.endswith(key_suffix):
            continue
        if user_df is None:
            user_df = value
        else:
            user_df = pd.concat([user_df, value], join='outer', axis=1)
    user_df.to_pickle(os.path.join(test_pickle_dir, f"user{i:02d}.pkl"))

100%|██████████| 4/4 [00:08<00:00,  2.22s/it]


In [None]:
# Downsample each test subject's merged frame to 10-minute windows.
continuous_columns = ['m_acc_x', 'm_acc_y', 'm_acc_z',
                      'm_gps_latitude', 'm_gps_longitude', 's_heart_rate']
activity_columns = [f'm_activity_{i}' for i in range(9)]

# Continuous signals get summary statistics; one-hot activity flags keep their
# maximum within the window. Built once, not on every iteration.
resample_dict_10_min = {col: ['mean', 'std', 'min', 'max'] for col in continuous_columns}
resample_dict_10_min.update({col: ['max'] for col in activity_columns})

test_after_dir = os.path.join(preprocessed_dir_ts, "test_after")
os.makedirs(test_after_dir, exist_ok=True)

for user in range(5, 9):
    user_df = pd.read_pickle(os.path.join(preprocessed_dir_ts, "test", f"user{user:02d}.pkl"))

    # Add any missing activity column as an all-zero int8 flag. The previous
    # None placeholder created an object column for the 'max' aggregation.
    for col in activity_columns:
        if col not in user_df.columns:
            user_df[col] = np.int8(0)

    # Selecting the feature columns also drops m_gps_altitude / m_gps_speed.
    user_df = user_df[continuous_columns + activity_columns]

    # resample
    user_df = user_df.resample('10min').agg(resample_dict_10_min)
    # Flatten the (column, statistic) pairs into "column_statistic" names.
    user_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in user_df.columns]
    user_df = user_df.ffill().bfill()

    user_df.to_pickle(os.path.join(test_after_dir, f"user{user:02d}.pkl"))

In [None]:
# test
# Assemble the final test set: for every subject keep only the days listed in
# answer_sample.csv, give each (subject, day) a globally unique sample id, and
# write the stacked features and the answer template to "test_final".
label_csv = pd.read_csv(os.path.join(data_dir, "answer_sample.csv"))
label_csv['date'] = pd.to_datetime(label_csv['date'])

# Per-subject pieces are collected and concatenated once after the loop instead
# of re-concatenating the growing frame on every iteration.
val_test_frames = []
y_frames = []

# Largest sample id handed out so far (-1 means "none yet").
val_test_max_idx = -1
y_max_idx = -1

val_test_check_all, y_check_all = 0, 0

for user in tqdm(range(5, 9)):
    try:
        user_df = pd.read_pickle(os.path.join(preprocessed_dir_ts, "test_after", f"user{user:02d}.pkl"))
    except Exception as e:
        # Best effort: report the unreadable pickle and skip the subject.
        print(e)
        continue

    user_df['date'] = user_df.index.date
    user_df.reset_index(inplace=True)

    label_csv_user = label_csv[label_csv['subject_id'] == user]
    label_csv_user = label_csv_user[label_csv_user['date'].isin(user_df['date'])].reset_index(drop=True)

    unique_dates = label_csv_user['date'].dt.date.unique()

    # Explicit copy: the filtered frame is modified below, which would
    # otherwise be a write to a slice of the loaded frame.
    user_df = user_df[user_df['date'].isin(unique_dates)].copy()
    day_codes, days = user_df['date'].factorize()
    user_df["id"] = day_codes + val_test_max_idx + 1
    user_df = user_df.set_index('id').drop(columns=['date'])
    label_csv_user.index += y_max_idx + 1

    val_test_check_all += len(days)
    y_check_all += len(label_csv_user.index)

    val_test_frames.append(user_df)
    y_frames.append(label_csv_user)
    # Advance by the number of ids actually used, so a subject without any
    # matching day leaves the counters untouched instead of turning them NaN.
    val_test_max_idx += len(days)
    y_max_idx += len(label_csv_user.index)

if not val_test_frames:
    raise RuntimeError("no user pickle could be loaded from the 'test_after' directory")

val_test_all_df = pd.concat(val_test_frames)
y_all_df = pd.concat(y_frames)

val_test_all_df.drop(columns=['time'], inplace=True)
y_all_df.drop(columns=['subject_id', 'date'], inplace=True)

print(f"val_test_check_all: {val_test_check_all}, y_check_all: {y_check_all}")
print("val_test_all_df.shape", len(val_test_all_df.index.unique()), "y_all_df.shape", y_all_df.shape[0])

os.makedirs(os.path.join(preprocessed_dir_ts, "test_final"), exist_ok=True)
val_test_all_df.to_csv(os.path.join(preprocessed_dir_ts, "test_final", "test_all.csv"))
y_all_df.to_csv(os.path.join(preprocessed_dir_ts, "test_final", "test_y_all.csv"))

최종 결과물에서 다음 파일을 활용하였습니다.

`../data_preprocessed_ts/train_val_concat.csv`  
`../data_preprocessed_ts/train_val_y_concat.csv`

`../data_preprocessed_ts/train_final/train_all.csv`  
`../data_preprocessed_ts/train_final/train_y_all.csv`

`../data_preprocessed_ts/test_final/test_all.csv`  
`../data_preprocessed_ts/test_final/test_y_all.csv`