<a href="https://colab.research.google.com/github/jacobgreen4477/The-4th-ETRI-AI-Human-Understanding-Competition/blob/main/dacon_etri_lifelog_best_score_v3_3_2_model_pv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> title : 제 4회 ETRI 휴먼이해 인공지능 논문경진대회 <br>
> author : hjy,byc <br>

### 📦 라이브러리

In [None]:
! pip install haversine >/dev/null
! pip install optuna >/dev/null
! pip install imbalanced-learn >/dev/null
! pip install category_encoders >/dev/null
! pip install catboost >/dev/null

In [None]:
# Core Libraries
import os
import sys
import re
import ast
import glob
import random
from functools import reduce
from io import StringIO
from collections import Counter
from datetime import datetime, timedelta, time

# Numerical Operations
import numpy as np
import pandas as pd

# Math & Geospatial
from math import radians, cos, sin, asin, sqrt
from scipy.stats import entropy
from haversine import haversine

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import (
    train_test_split, KFold, StratifiedKFold, cross_val_score
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, log_loss
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

# Deep Learning (PyTorch)
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.nn import functional as F

# Progress Tracking
from tqdm import tqdm
from tqdm.auto import tqdm
from category_encoders import TargetEncoder

# Warnings
import warnings
warnings.filterwarnings('ignore')

# pandas 옵션
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(1)

In [None]:
from google.colab import drive, files
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from io import StringIO

string = """
subject_id	sleep_date
id01	2024-07-24
id01	2024-07-27
id01	2024-08-18
id01	2024-08-19
id01	2024-08-20
id01	2024-08-21
id01	2024-08-22
id01	2024-08-24
id01	2024-08-25
id01	2024-08-26
id01	2024-08-27
id01	2024-08-28
id01	2024-08-29
id01	2024-08-30
id02	2024-08-23
id02	2024-08-24
id02	2024-09-16
id02	2024-09-17
id02	2024-09-19
id02	2024-09-20
id02	2024-09-21
id02	2024-09-22
id02	2024-09-23
id02	2024-09-24
id02	2024-09-25
id02	2024-09-26
id02	2024-09-27
id02	2024-09-28
id03	2024-08-30
id03	2024-09-01
id03	2024-09-02
id03	2024-09-03
id03	2024-09-05
id03	2024-09-06
id03	2024-09-07
id04	2024-09-03
id04	2024-09-04
id04	2024-09-05
id04	2024-09-06
id04	2024-09-07
id04	2024-09-08
id04	2024-09-09
id04	2024-10-08
id04	2024-10-09
id04	2024-10-10
id04	2024-10-11
id04	2024-10-12
id04	2024-10-13
id04	2024-10-14
id05	2024-10-19
id05	2024-10-23
id05	2024-10-24
id05	2024-10-25
id05	2024-10-26
id05	2024-10-27
id05	2024-10-28
id06	2024-07-25
id06	2024-07-26
id06	2024-07-27
id06	2024-07-28
id06	2024-07-29
id06	2024-07-30
id06	2024-07-31
id07	2024-07-07
id07	2024-07-08
id07	2024-07-09
id07	2024-07-10
id07	2024-07-11
id07	2024-07-12
id07	2024-07-13
id07	2024-07-30
id07	2024-08-01
id07	2024-08-02
id07	2024-08-03
id07	2024-08-04
id07	2024-08-05
id07	2024-08-06
id08	2024-08-28
id08	2024-08-29
id08	2024-08-30
id08	2024-08-31
id08	2024-09-01
id08	2024-09-02
id08	2024-09-04
id09	2024-08-02
id09	2024-08-22
id09	2024-08-23
id09	2024-08-24
id09	2024-08-25
id09	2024-08-27
id09	2024-08-28
id09	2024-08-29
id09	2024-08-30
id09	2024-08-31
id09	2024-09-01
id09	2024-09-02
id09	2024-09-03
id09	2024-09-04
id10	2024-08-28
id10	2024-08-30
id10	2024-08-31
id10	2024-09-01
id10	2024-09-02
id10	2024-09-03
id10	2024-09-06
"""

# DataFrame 생성
valid_ids = pd.read_csv(StringIO(string), sep='\t')
valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']

## 📦 데이터 전처리

In [None]:
path = '/content/drive/MyDrive/data/ch2025_data_items/'

# 1
mACStatus = pd.read_parquet(path+'ch2025_mACStatus.parquet')
mActivity = pd.read_parquet(path+'ch2025_mActivity.parquet')
mAmbience = pd.read_parquet(path+'ch2025_mAmbience.parquet')
mBle = pd.read_parquet(path+'ch2025_mBle.parquet')
mGps = pd.read_parquet(path+'ch2025_mGps.parquet')
mLight = pd.read_parquet(path+'ch2025_mLight.parquet')
mScreenStatus = pd.read_parquet(path+'ch2025_mScreenStatus.parquet')
mUsageStats = pd.read_parquet(path+'ch2025_mUsageStats.parquet')
mWifi = pd.read_parquet(path+'ch2025_mWifi.parquet')
wHr = pd.read_parquet(path+'ch2025_wHr.parquet')
wLight = pd.read_parquet(path+'ch2025_wLight.parquet')
wPedo = pd.read_parquet(path+'ch2025_wPedo.parquet')

# 2
train = pd.read_csv('/content/drive/MyDrive/data/ch2025_metrics_train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/ch2025_submission_sample.csv')

### ✔️ mACStatus 핸드폰 충전상태
- Indicates whether the smartphone is currently being charged.
- m_charging : 0/1 상태
- 핸드폰이 오랫 동안 충전했다는 의미?
 - 한 자리에 장시간 머물러 있었다.
 - 핸드폰을 장시간 사용하지 않았다.  

In [None]:
mACStatus['lifelog_date'] = mACStatus['timestamp'].astype(str).str[:10]
mACStatus.head(1)

Unnamed: 0,subject_id,timestamp,m_charging,lifelog_date
0,id01,2024-06-26 12:03:00,0,2024-06-26


In [None]:
def process_mACStatus(df):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['subject_id', 'timestamp'])

    results = []

    for (subj, lifelog_date), group in df.groupby(['subject_id', 'lifelog_date']):
        status = group['m_charging'].values  # 0/1 상태
        times = group['timestamp'].values

        ratio_charging = status.mean()
        sum_charging = status.sum()

        # 상태 전이 횟수
        transitions = (status[1:] != status[:-1]).sum()

        # 연속된 1 상태 길이들
        lengths = []
        current_len = 0
        for val in status:
            if val == 1:
                current_len += 1
            elif current_len > 0:
                lengths.append(current_len)
                current_len = 0
        if current_len > 0:
            lengths.append(current_len)

        avg_charging_duration = np.mean(lengths) if lengths else 0
        max_charging_duration = np.max(lengths) if lengths else 0

        results.append({
            'subject_id': subj,
            'lifelog_date': lifelog_date,
            'charging_ratio': ratio_charging,
            'charging_sum': sum_charging,
            'charging_transitions': transitions,
            'avg_charging_duration': avg_charging_duration,
            'max_charging_duration': max_charging_duration,
        })

    return pd.DataFrame(results)

mACStatus2 = process_mACStatus(mACStatus)

# check
print(f'# mACStatus2 shape: {mACStatus2.shape}')
mACStatus2.head(1)

# mACStatus2 shape: (700, 7)


Unnamed: 0,subject_id,lifelog_date,charging_ratio,charging_sum,charging_transitions,avg_charging_duration,max_charging_duration
0,id01,2024-06-26,0.2159,147,22,13.3636,41


### ✔️ mActivity 추정행동
- Value calculated by the Google Activity Recognition API.
 - 0 : IN_VEHICLE
 - 1 : ON_BICYCLE
 - 2 : ON_FOOT
 - 3 : STILL (not moving)
 - 4 : UNKNOWN
 - 5 : TILTING (This often occurs when a device is picked up from a desk or a user who is sitting stands up.)
 - 7 : WALKING
 - 8 : RUNNING
- 근무시간   : 오전 7시부터 오후 6시까지
- 근무외시간 : 오후6시부터 12시까지

In [None]:
mActivity['lifelog_date'] = mActivity['timestamp'].astype(str).str[:10]
mActivity.head()

Unnamed: 0,subject_id,timestamp,m_activity,lifelog_date
0,id01,2024-06-26 12:03:00,4,2024-06-26
1,id01,2024-06-26 12:04:00,0,2024-06-26
2,id01,2024-06-26 12:05:00,0,2024-06-26
3,id01,2024-06-26 12:06:00,0,2024-06-26
4,id01,2024-06-26 12:07:00,0,2024-06-26


In [None]:
def process_mActivity(df):
    """
    # 포함
    - 0 : IN_VEHICLE
    - 1 : ON_BICYCLE
    - 2 : ON_FOOT
    - 5 : TILTING (This often occurs when a device is picked up from a desk or a user who is sitting stands up.)
    - 7 : WALKING
    - 8 : RUNNING

    # 제외
    - 3 : STILL (not moving)
    - 4 : UNKNOWN
    """
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour

    results = []

    for (subj, date), group in df.groupby(['subject_id', 'lifelog_date']):
        row = {'subject_id': subj, 'lifelog_date': date}

        # 전체 시간에서 1,2,7,8
        a1 = group[group['m_activity'].isin([1,2,7,8])]
        row['all_WALKING_n_ETC_minutes'] = len(a1)

        # 전체 시간에서 0 (IN_VEHICLE)
        a2 = group[group['m_activity'].isin([0])]
        row['all_VEHICLE_minutes'] = len(a2)

        # 전체 시간에서 유효한 활동
        all_valid = group[group['m_activity'].isin([0, 1, 2, 5, 7, 8])]
        row['all_ACTIVITY_minutes'] = len(all_valid)

        # sleeptime 0~5시 에서 유효한 활동
        dawn_valid = all_valid[(all_valid['hour'] >= 0) & (all_valid['hour'] <5)]
        row['dawn_ACTIVITY_minutes'] = len(dawn_valid)

        results.append(row)

    return pd.DataFrame(results)

In [None]:
mActivity2 = process_mActivity(mActivity)

# check
print(f'# mActivity2 shape: {mActivity2.shape}')
mActivity2.head(1)

# mActivity2 shape: (700, 6)


Unnamed: 0,subject_id,lifelog_date,all_WALKING_n_ETC_minutes,all_VEHICLE_minutes,all_ACTIVITY_minutes,dawn_ACTIVITY_minutes
0,id01,2024-06-26,32,89,121,0


### ✔️ mAmbience 추정주변소리
- Ambient sound identification labels and their respective probabilities.
- 무슨 소리가 난게 중요할까?
- 새벽에 무슨 소리던지 소리가 난게 중요한 걸까?
- 여러 가지 소리 중에 노이즈도 포함되어 있을까?

In [None]:
def extract_labels_and_probs(row):
    items = row['m_ambience']
    labels = [item[0] for item in items]
    probs = [item[1] for item in items]
    return pd.Series({'labels': labels, 'prob': probs})

mAmbience[['labels', 'prob']]  = mAmbience.apply(extract_labels_and_probs, axis=1)
mAmbience['lifelog_date'] = mAmbience['timestamp'].astype(str).str[:10]
mAmbience = mAmbience.drop(columns=['m_ambience'])
mAmbience.head(1)

Unnamed: 0,subject_id,timestamp,labels,prob,lifelog_date
0,id01,2024-06-26 13:00:10,"[Music, Vehicle, Motor vehicle (road), Outside, urban or manmade, Outside, rural or natural, Car, Speech, Inside, large room or hall, Truck, Sound effect]","[0.30902618, 0.081680894, 0.04035286, 0.037144363, 0.032663062, 0.03199804, 0.029806137, 0.01684492, 0.016206821, 0.01591479]",2024-06-26


In [None]:
def process_mAmbience(df):

    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour

    # 시간대 분류
    df['time_period'] = df['hour'].apply(lambda h: 'sleeptime' if 0 <= h < 5 else 'activehour')

    # explode labels
    exploded = df.explode('labels')

    # unique label count
    unique_labels = (
        exploded.groupby(['subject_id', 'lifelog_date', 'time_period'])['labels']
        .nunique()
        .reset_index(name='unique_label_count')
    )

    # snor 포함 라벨 count
    snor_labels = (
        exploded[exploded['labels'].astype(str).str.contains('snor', case=False, na=False)]
        .groupby(['subject_id', 'lifelog_date', 'time_period'])['labels']
        .count()
        .reset_index(name='snor_count')
    )

    # 병합
    result = pd.merge(unique_labels, snor_labels, on=['subject_id', 'lifelog_date', 'time_period'], how='outer').fillna(0)
    result['snor_count'] = result['snor_count'].astype(int)
    result = result.pivot(index=['subject_id', 'lifelog_date'], columns='time_period')
    result.columns = [f"{tp}_{metric}" for metric, tp in result.columns]
    result = result.reset_index()

    return result

In [None]:
mAmbience2 = process_mAmbience(mAmbience)

# check
print(f'# mAmbience2 shape: {mAmbience2.shape}')
mAmbience2.head(1)

# mAmbience2 shape: (700, 6)


Unnamed: 0,subject_id,lifelog_date,activehour_unique_label_count,sleeptime_unique_label_count,activehour_snor_count,sleeptime_snor_count
0,id01,2024-06-26,265.0,,4.0,


### ✔️ mBle 블루투스
- Bluetooth devices around individual subject.
 - 7936 : Wearable, Headset, AV Device
 - 1796 : Peripheral (입력장치) 계열
 - 0 : 정보 없음 또는 알 수 없음(Unknown)
 - 1084 : Audio/Video (스피커, 헤드셋, 이어폰, TV 등)
 - 524 : Phone (휴대폰, 스마트폰)
 - 1060 : Headphones
 - 284 : commputer (PC, 노트북, PDA)

In [None]:
def extract_mble_info(row):
    m_data = row['m_ble']
    address = [item['address'] for item in m_data]
    device_class = [item['device_class'] for item in m_data]
    rssi = [item['rssi'] for item in m_data]
    return pd.Series({'address': address, 'device_class': device_class, 'rssi': rssi})

mBle[['address','device_class','rssi']] = mBle.apply(extract_mble_info, axis=1)
mBle['lifelog_date'] = mBle['timestamp'].astype(str).str[:10]
mBle.head(1)

Unnamed: 0,subject_id,timestamp,m_ble,address,device_class,rssi,lifelog_date
0,id01,2024-06-26 12:13:00,"[{'address': '00:15:7C:11:80:8D', 'device_class': '0', 'rssi': -82}, {'address': '01:B1:D2:20:9E:3A', 'device_class': '0', 'rssi': -61}, {'address': '04:33:1F:D9:C1:50', 'device_class': '0', 'rssi': -86}, {'address': '06:5C:2D:BC:39:BE', 'device_class': '0', 'rssi': -75}, {'address': '09:42:21:0D:AD:DF', 'device_class': '0', 'rssi': -70}, {'address': '0B:66:0D:D5:9C:4A', 'device_class': '0', 'rssi': -89}, {'address': '10:B5:88:E7:85:69', 'device_class': '0', 'rssi': -89}, {'address': '13:F0:CA:3B:DB:EF', 'device_class': '0', 'rssi': -77}, {'address': '1A:23:C0:8F:43:4D', 'device_class': '0', 'rssi': -66}, {'address': '24:11:53:BB:62:89', 'device_class': '1796', 'rssi': -37}, {'address': '24:2D:F0:EE:1E:D0', 'device_class': '0', 'rssi': -85}, {'address': '26:0C:48:28:15:77', 'device_class': '0', 'rssi': -63}, {'address': '27:C1:C0:8B:82:C9', 'device_class': '0', 'rssi': -88}, {'address': '28:9C:11:73:39:05', 'device_class': '0', 'rssi': -30}, {'address': '34:40:DE:35:F8:65', 'device_class': '0', 'rssi': -93}, {'address': '35:0A:59:BF:75:F5', 'device_class': '0', 'rssi': -72}, {'address': '41:A6:C4:20:E3:2C', 'device_class': '7936', 'rssi': -83}, {'address': '42:6B:51:95:1B:D4', 'device_class': '0', 'rssi': -77}, {'address': '44:B2:0B:78:04:0F', 'device_class': '0', 'rssi': -69}, {'address': '45:37:48:E2:7F:CC', 'device_class': '0', 'rssi': -87}, {'address': '4E:1B:C2:DF:C5:87', 'device_class': '0', 'rssi': -76}, {'address': '4E:9F:1B:A9:56:5D', 'device_class': '0', 'rssi': -66}, {'address': '50:63:B0:82:07:00', 'device_class': '0', 'rssi': -86}, {'address': '53:13:6C:4F:04:D2', 'device_class': '0', 'rssi': -69}, {'address': '54:15:89:95:27:44', 'device_class': '7936', 'rssi': -71}, {'address': '56:0E:2E:B0:D4:11', 'device_class': '0', 'rssi': -61}, {'address': '5A:7A:2E:42:03:B1', 'device_class': '0', 'rssi': -82}, {'address': '5A:9D:3E:AB:38:C6', 'device_class': '0', 'rssi': -83}, {'address': '5E:A6:8E:B8:74:74', 'device_class': '0', 'rssi': -84}, {'address': '5F:BC:08:0F:C1:6A', 'device_class': '0', 'rssi': -87}, {'address': '62:E1:9D:41:F4:AE', 'device_class': '0', 'rssi': -73}, {'address': '67:23:FE:88:69:A8', 'device_class': '0', 'rssi': -88}, {'address': '68:EC:C5:0C:D1:C1', 'device_class': '0', 'rssi': -78}, {'address': '6B:28:DA:C0:1B:29', 'device_class': '0', 'rssi': -75}, {'address': '6F:0B:91:00:33:19', 'device_class': '0', 'rssi': -80}, {'address': '70:7A:4B:82:44:90', 'device_class': '0', 'rssi': -88}, {'address': '7B:62:D4:5B:59:D3', 'device_class': '0', 'rssi': -74}, {'address': '7B:BE:A4:9D:FD:11', 'device_class': '0', 'rssi': -72}, {'address': '7F:FD:C4:00:77:7D', 'device_class': '0', 'rssi': -52}, {'address': 'C4:F0:92:C8:F1:8D', 'device_class': '7936', 'rssi': -87}, {'address': 'C7:3F:2C:7B:86:66', 'device_class': '7936', 'rssi': -89}]","[00:15:7C:11:80:8D, 01:B1:D2:20:9E:3A, 04:33:1F:D9:C1:50, 06:5C:2D:BC:39:BE, 09:42:21:0D:AD:DF, 0B:66:0D:D5:9C:4A, 10:B5:88:E7:85:69, 13:F0:CA:3B:DB:EF, 1A:23:C0:8F:43:4D, 24:11:53:BB:62:89, 24:2D:F0:EE:1E:D0, 26:0C:48:28:15:77, 27:C1:C0:8B:82:C9, 28:9C:11:73:39:05, 34:40:DE:35:F8:65, 35:0A:59:BF:75:F5, 41:A6:C4:20:E3:2C, 42:6B:51:95:1B:D4, 44:B2:0B:78:04:0F, 45:37:48:E2:7F:CC, 4E:1B:C2:DF:C5:87, 4E:9F:1B:A9:56:5D, 50:63:B0:82:07:00, 53:13:6C:4F:04:D2, 54:15:89:95:27:44, 56:0E:2E:B0:D4:11, 5A:7A:2E:42:03:B1, 5A:9D:3E:AB:38:C6, 5E:A6:8E:B8:74:74, 5F:BC:08:0F:C1:6A, 62:E1:9D:41:F4:AE, 67:23:FE:88:69:A8, 68:EC:C5:0C:D1:C1, 6B:28:DA:C0:1B:29, 6F:0B:91:00:33:19, 70:7A:4B:82:44:90, 7B:62:D4:5B:59:D3, 7B:BE:A4:9D:FD:11, 7F:FD:C4:00:77:7D, C4:F0:92:C8:F1:8D, C7:3F:2C:7B:86:66]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1796, 0, 0, 0, 0, 0, 0, 7936, 0, 0, 0, 0, 0, 0, 0, 7936, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7936, 7936]","[-82, -61, -86, -75, -70, -89, -89, -77, -66, -37, -85, -63, -88, -30, -93, -72, -83, -77, -69, -87, -76, -66, -86, -69, -71, -61, -82, -83, -84, -87, -73, -88, -78, -75, -80, -88, -74, -72, -52, -87, -89]",2024-06-26


In [None]:
def process_mBle(df):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour

    # 시간대 분류
    def map_time_period(row):
        if 0 <= row['hour'] < 7:
            return 'sleeptime'
        elif 7 <= row['hour'] < 18:
            return 'worktime'
        else:
            return 'afterwork'

    df['time_period'] = df.apply(map_time_period, axis=1)

    features = []

    for idx, row in df.iterrows():
        entry = ast.literal_eval(row['m_ble']) if isinstance(row['m_ble'], str) else row['m_ble']

        rssi_list = []
        class_0_cnt = 0
        class_other_cnt = 0

        for device in entry:
            try:
                rssi = int(device['rssi'])
                rssi_list.append(rssi)

                device_class = str(device['device_class'])
                if device_class == '0':
                    class_0_cnt += 1
                else:
                    class_other_cnt += 1
            except:
                continue  # malformed record

        feature = {
            'subject_id': row['subject_id'],
            'lifelog_date': row['lifelog_date'],
            'time_period': row['time_period'],
            'ble_class_unknwn_cnt': class_0_cnt,
            'ble_class_others_cnt': class_other_cnt,
            'ble_count': len(rssi_list),
            'ble_rssi_mean': np.mean(rssi_list) if rssi_list else np.nan,
            'ble_rssi_min': np.min(rssi_list) if rssi_list else np.nan,
            'ble_rssi_max': np.max(rssi_list) if rssi_list else np.nan,
        }
        features.append(feature)

    return pd.DataFrame(features)

def summarize_mBle_daily(df):

    # row 단위 BLE feature 추출
    df = process_mBle(df)

    # 하루 + 시간대별로 groupby
    grouped = df.groupby(['subject_id', 'lifelog_date', 'time_period']).agg({
        'ble_class_unknwn_cnt': 'sum',
        'ble_class_others_cnt': 'sum',
        'ble_rssi_mean': 'mean',
        'ble_rssi_min': 'min',
        'ble_rssi_max': 'max',
    }).reset_index()

    # 총합 구해서 비율 계산
    total_cnt = grouped['ble_class_unknwn_cnt'] + grouped['ble_class_others_cnt']
    grouped['ble_class_unknwn_ratio'] = grouped['ble_class_unknwn_cnt'] / total_cnt.replace(0, np.nan)
    grouped['ble_class_others_ratio'] = grouped['ble_class_others_cnt'] / total_cnt.replace(0, np.nan)

    # 필요 없는 cnt 컬럼 제거
    grouped.drop(columns=[
        'ble_class_unknwn_cnt',
        'ble_class_others_cnt'
    ], inplace=True)

    # pivot해서 time_period별로 펼치기
    final = grouped.pivot(index=['subject_id', 'lifelog_date'], columns='time_period')
    final.columns = ['_'.join(col).strip() for col in final.columns.values]
    final = final.reset_index()

    return final

In [None]:
mBle2 = summarize_mBle_daily(mBle)

# check
print(f'\n # mBle2 shape: {mBle2.shape}')
mBle2.head(1)


 # mBle2 shape: (651, 17)


Unnamed: 0,subject_id,lifelog_date,ble_rssi_mean_afterwork,ble_rssi_mean_sleeptime,ble_rssi_mean_worktime,ble_rssi_min_afterwork,ble_rssi_min_sleeptime,ble_rssi_min_worktime,ble_rssi_max_afterwork,ble_rssi_max_sleeptime,ble_rssi_max_worktime,ble_class_unknwn_ratio_afterwork,ble_class_unknwn_ratio_sleeptime,ble_class_unknwn_ratio_worktime,ble_class_others_ratio_afterwork,ble_class_others_ratio_sleeptime,ble_class_others_ratio_worktime
0,id01,2024-06-26,-76.2155,,-75.0522,-92.0,,-94.0,-43.0,,-27.0,0.9237,,0.9421,0.0763,,0.0579


### ✔️ mGps, GPS 기반 핸드폰 위치
- Multiple GPS coordinates measured within a single minute using the smartphone.
- speed가 1보다 큰경우 정지 상태가 아니고 움직이고 있다고 판단
 - 0.5-2 : 걸어서 이동하는 경우  
 - 2-5 : 조깅
 - 5 이상 : 차를 타고 이동하는 경우

- speed가 0.5-2사이를 하루에 몇분동안 지속했는지?
- speed가 2-5사이를 하루에 몇분동안 지속했는지? (유산소 운동 시간)
- speed가 5이상을 하루에 몇분동안 지속했는지?  

In [None]:
def extract_gps_info(row):
    m_data = row['m_gps']
    altitude = [item['altitude'] for item in m_data]
    latitude = [item['latitude'] for item in m_data]
    longitude = [item['longitude'] for item in m_data]
    speed = [item['speed'] for item in m_data]
    return pd.Series({'altitude': altitude, 'latitude': latitude, 'longitude': longitude, 'speed': speed})

mGps[['altitude','latitude','longitude','speed']] = mGps.apply(extract_gps_info, axis=1)
mGps['lifelog_date'] = mGps['timestamp'].astype(str).str[:10]
mGps = mGps.drop(columns=['m_gps'])
mGps.head(1)

Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed,lifelog_date
0,id01,2024-06-26 12:03:00,"[110.6, 110.8, 110.8, 110.7, 110.7, 110.8, 110.8, 110.8, 110.8, 110.8, 110.8]","[0.2077385, 0.2077759, 0.2077728, 0.20779, 0.2077914, 0.2077972, 0.2078002, 0.2077985, 0.207801, 0.207802, 0.2078011]","[0.170027, 0.1699851, 0.1699834, 0.1699686, 0.1699708, 0.1699657, 0.1699627, 0.1699631, 0.1699642, 0.1699639, 0.169963]","[0.0, 0.721, 0.0505, 0.6587, 0.0568, 0.1768, 0.0907, 0.0337, 0.0411, 0.0296, 0.0194]",2024-06-26


In [None]:
# 거리 계산 함수
def haversine(coord1, coord2, unit='m'):
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371000  # 지구 반지름(m)
    return c * r if unit == 'm' else c * r / 1000

def process_mGps(df):
    df = df.copy()

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = df['timestamp'].dt.date
    df['week'] = df['timestamp'].dt.isocalendar().week

    expanded_rows = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing GPS data"):
        speeds = ast.literal_eval(row['speed']) if isinstance(row['speed'], str) else row['speed']
        lats = ast.literal_eval(row['latitude']) if isinstance(row['latitude'], str) else row['latitude']
        lons = ast.literal_eval(row['longitude']) if isinstance(row['longitude'], str) else row['longitude']
        alts = ast.literal_eval(row['altitude']) if isinstance(row['altitude'], str) else row['altitude']
        n = len(speeds)
        if n > 0:
            expanded_rows.append(pd.DataFrame({
                'subject_id': [row['subject_id']] * n,
                'lifelog_date': [row['lifelog_date']] * n,
                'timestamp': pd.date_range(start=row['timestamp'], periods=n, freq='1S'),
                'speed': speeds,
                'latitude': lats,
                'longitude': lons,
                'altitude': alts
            }))

    expanded_df = pd.concat(expanded_rows, ignore_index=True)

    # 벡터화
    speeds = expanded_df['speed'].values

    walk_mask = (0.5 <= speeds) & (speeds < 2)
    jog_mask = (2 <= speeds) & (speeds < 5)
    vehicle_mask = (speeds >= 5)
    le5_mask = (speeds <= 5)

    expanded_df['walk'] = walk_mask.astype(int)
    expanded_df['jog'] = jog_mask.astype(int)
    expanded_df['vehicle'] = vehicle_mask.astype(int)
    expanded_df['le5_speed'] = expanded_df['speed'].where(le5_mask)

    # 아침/저녁 구간 조건
    expanded_df['hour'] = expanded_df['timestamp'].dt.hour
    morning_condition = (expanded_df['hour'] >= 6) & (expanded_df['hour'] < 9) & (expanded_df['speed'] >= 1)
    evening_condition = (expanded_df['hour'] >= 21) & (expanded_df['hour'] <= 23) & (expanded_df['speed'] <= 1)

    # 이동 특성 계산
    movement_features = []
    for (subject_id, lifelog_date), group in expanded_df.groupby(['subject_id', 'lifelog_date']):
        all_speeds = group['speed'].values
        all_alts = group['altitude'].values
        all_lats = group['latitude'].values
        all_lons = group['longitude'].values

        active_mins = group.shape[0] / 60  # 1초 단위 → 분
        movement_ratio = (all_speeds > 1.0).mean() if len(all_speeds) > 0 else 0
        alt_change = all_alts[-1] - all_alts[0] if len(all_alts) > 0 else 0
        lat_change = all_lats[-1] - all_lats[0] if len(all_lats) > 0 else 0
        lon_change = all_lons[-1] - all_lons[0] if len(all_lons) > 0 else 0

        total_dist = 0.0
        if len(all_lats) > 1:
            for i in range(len(all_lats)-1):
                coord1 = (all_lats[i], all_lons[i])
                coord2 = (all_lats[i+1], all_lons[i+1])
                total_dist += haversine(coord1, coord2, unit='m')

        movement_features.append({
            'subject_id': subject_id,
            'lifelog_date': lifelog_date,
            'active_minutes': active_mins,
            'movement_ratio': movement_ratio,
            'alt_change': alt_change,
            'lat_change': lat_change,
            'lon_change': lon_change,
            'total_distance_m': total_dist
        })

    movement_df = pd.DataFrame(movement_features)

    # Groupby + Aggregation
    agg_funcs = {
        'walk_minutes': ('walk', lambda x: x.sum() / 60),
        'jog_minutes': ('jog', lambda x: x.sum() / 60),
        'vehicle_minutes': ('vehicle', lambda x: x.sum() / 60),
        'speed_le5_max': ('le5_speed', 'max'),
        'speed_le5_mean': ('le5_speed', 'mean'),
        'speed_le5_std': ('le5_speed', 'std')
    }

    grouped = expanded_df.groupby(['subject_id', 'lifelog_date']).agg(**agg_funcs).reset_index()
    grouped['exercise_flag'] = np.where(grouped['jog_minutes'] >= 5,1,0)

    # 아침 wakeup time
    morning_first_movement = (
        expanded_df[morning_condition]
        .groupby(['subject_id', 'lifelog_date'])['timestamp']
        .min()
        .reset_index()
        .rename(columns={'timestamp': 'morning_wakeup_time'})
    )


    # 최종 merge
    final = pd.merge(grouped, movement_df, on=['subject_id', 'lifelog_date'], how='left')
    final = pd.merge(final, morning_first_movement, on=['subject_id', 'lifelog_date'], how='left')

    # 아침 wakeup_time 처리
    valid_wakeup = final['morning_wakeup_time'].dropna()
    if not valid_wakeup.empty:
        total_seconds = valid_wakeup.dt.hour * 3600 + valid_wakeup.dt.minute * 60 + valid_wakeup.dt.second
        mean_seconds = total_seconds.mean()
        mean_hour = int(mean_seconds // 3600)
        mean_minute = int((mean_seconds % 3600) // 60)
        mean_second = int(mean_seconds % 60)
        mean_wakeup_time = time(mean_hour, mean_minute, mean_second)
    else:
        mean_wakeup_time = time(7, 0, 0)

    final['morning_wakeup_time'] = final['morning_wakeup_time'].fillna(
        pd.Timestamp.combine(pd.to_datetime('today').date(), mean_wakeup_time)
    )
    final['morning_wakeup_time'] = final['morning_wakeup_time'].dt.hour * 100 + final['morning_wakeup_time'].dt.minute

    mean_wakeup_hhmm = mean_wakeup_time.hour * 100 + mean_wakeup_time.minute

    # wake_up_early_minutes
    def compute_minutes_diff(actual_hhmm, mean_hhmm):
        actual_hour = actual_hhmm // 100
        actual_minute = actual_hhmm % 100
        mean_hour = mean_hhmm // 100
        mean_minute = mean_hhmm % 100
        actual_sec = actual_hour * 3600 + actual_minute * 60
        mean_sec = mean_hour * 3600 + mean_minute * 60
        return (mean_sec - actual_sec) / 60

    final['wake_up_early_minutes'] = final['morning_wakeup_time'].apply(lambda x: compute_minutes_diff(x, mean_wakeup_hhmm))

    return final

In [None]:
%%time

mGps2 = process_mGps(mGps)

# check
print(f'\n # mGps2 shape: {mGps2.shape}')
mGps2.head(1)

Processing GPS data:   0%|          | 0/800611 [00:00<?, ?it/s]


 # mGps2 shape: (660, 17)
CPU times: user 7min 54s, sys: 15 s, total: 8min 9s
Wall time: 7min 59s


Unnamed: 0,subject_id,lifelog_date,walk_minutes,jog_minutes,vehicle_minutes,speed_le5_max,speed_le5_mean,speed_le5_std,exercise_flag,active_minutes,movement_ratio,alt_change,lat_change,lon_change,total_distance_m,morning_wakeup_time,wake_up_early_minutes
0,id01,2024-06-26,11.1667,1.3,3.8667,4.9907,0.2503,0.5089,0,100.2833,0.1034,-6.7,0.0229,-0.0757,29113.576,655,0.0


### ✔️ mLight 주변 밝기
- Ambient light measured by the smartphone.
 - 어두운 밤	0.1 ~ 1 lux	캄캄한 방, 달빛 없는 밤
 - 가로등 켜진 거리	10 ~ 20 lux	흐릿한 외부 조명
 - 실내 조명	100 ~ 500 lux	사무실, 일반 거실
 - 밝은 실외	10,000 ~ 25,000 lux	맑은 날 햇빛
 - 직사광선 아래	30,000 ~ 100,000 lux	여름 한낮, 매우 강한 햇빛

- 밝기에 따라서 언제 불을 끄고 잠든 시간 추정
- 직사광선 잠에 좋은 영향을 주는지? (논문)
- 결측치 처리 x

In [None]:
mLight['lifelog_date'] = mLight['timestamp'].astype(str).str[:10]
mLight.head(1)

Unnamed: 0,subject_id,timestamp,m_light,lifelog_date
0,id01,2024-06-26 12:03:00,534.0,2024-06-26


In [None]:
def process_mLight(df):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour
    df['is_night'] = df['hour'].apply(lambda h: h >= 22 or h < 6)

    # 하루 요약 통계
    daily_light = df.groupby(['subject_id', 'lifelog_date']).agg(
        light_mean=('m_light', 'mean'),
        light_std=('m_light', 'std'),
        light_max=('m_light', 'max'),
        light_min=('m_light', 'min'),
        light_night_mean=('m_light', lambda x: x[df.loc[x.index, 'is_night']].mean()),
        light_day_mean=('m_light', lambda x: x[~df.loc[x.index, 'is_night']].mean()),
        light_night_ratio=('is_night', 'mean')
    ).reset_index()

    results = []

    for subject_id, group in tqdm(df.groupby('subject_id'), desc="Processing light-based sleep detection"):
        group = group.sort_values('timestamp').reset_index(drop=True)

        recorded_dates = set()
        sleeping = False
        zero_count = 0
        first_zero_time = None

        for i in range(len(group)):
            light = group.loc[i, 'm_light']
            hour = group.loc[i, 'hour']

            if light == 0:
                zero_count += 1
                if zero_count == 1:
                    first_zero_time = group.loc[i, 'timestamp']
                if zero_count >= 6 and not sleeping:
                    sleep_hour = first_zero_time.hour
                    if (sleep_hour >= 21 or sleep_hour <= 2):
                        sleeping = True
            else:
                if sleeping:
                    candidate_wakeup = group.loc[i, 'timestamp']
                    wake_hour = candidate_wakeup.hour

                    if 5 <= wake_hour <= 9 and first_zero_time is not None:
                        wake_time = candidate_wakeup
                        sleep_time = first_zero_time
                        duration_min = (wake_time - sleep_time).total_seconds() / 60

                        if 0 < duration_min <= 840:
                            sleep_duration = duration_min
                        else:
                            sleep_duration = np.nan

                        lifelog_date = wake_time.date() + pd.Timedelta(days=-1)

                        if lifelog_date not in recorded_dates:
                            results.append({
                                'subject_id': subject_id,
                                'lifelog_date': lifelog_date,
                                'sleep_duration_min_mLight': sleep_duration,
                                'sleep_time_min_mLight': sleep_time.hour * 60 + sleep_time.minute,
                                'wake_time_min_mLight': wake_time.hour * 60 + wake_time.minute,
                                'hour_slept_mLight': sleep_time.hour + sleep_time.minute / 60,
                                'hour_woke_up_mLight': wake_time.hour + wake_time.minute / 60
                            })
                            recorded_dates.add(lifelog_date)

                        sleeping = False
                        zero_count = 0
                        first_zero_time = None

            if light > 0:
                zero_count = 0
                first_zero_time = None

    sleep_df = pd.DataFrame(results)

    # 정렬 + 보간
    sleep_df = sleep_df.sort_values(['subject_id', 'lifelog_date'])
    sleep_df['sleep_duration_interp_mLight'] = sleep_df.groupby('subject_id')['sleep_duration_min_mLight'].transform(lambda x: x.interpolate())

    # 시간 단위 파생 컬럼
    sleep_df['sleep_duration_hour_mLight'] = sleep_df['sleep_duration_min_mLight'] / 60
    sleep_df['sleep_duration_interp_hour_mLight'] = sleep_df['sleep_duration_interp_mLight'] / 60

    # 병합
    final = pd.merge(daily_light, sleep_df, on=['subject_id', 'lifelog_date'], how='left')

    return final

In [None]:
mLight2 = process_mLight(mLight)

# check
print(f'\n # mLight2 shape: {mLight2.shape}')
mLight2.head(1)

Processing light-based sleep detection:   0%|          | 0/10 [00:00<?, ?it/s]


 # mLight2 shape: (700, 17)


Unnamed: 0,subject_id,lifelog_date,light_mean,light_std,light_max,light_min,light_night_mean,light_day_mean,light_night_ratio,sleep_duration_min_mLight,sleep_time_min_mLight,wake_time_min_mLight,hour_slept_mLight,hour_woke_up_mLight,sleep_duration_interp_mLight,sleep_duration_hour_mLight,sleep_duration_interp_hour_mLight
0,id01,2024-06-26,364.5068,395.6594,1886.0,0.0,184.9231,403.4167,0.1781,340.0,1409.0,309.0,23.4833,5.15,340.0,5.6667,5.6667


### 🔥 mScreenStatus 화면 사용여부

- Indicates whether the smartphone screen is in use.
 - 기상시간, 취침시간, 수면시간
 - 휴대폰 이용횟수, 이용시간
 - 00 - 05 사이에 휴대폰 이용한 건수
 - 결측치 처리 x

In [None]:
mScreenStatus['lifelog_date'] = mScreenStatus['timestamp'].astype(str).str[:10]
mScreenStatus.head(1)

Unnamed: 0,subject_id,timestamp,m_screen_use,lifelog_date
0,id01,2024-06-26 12:03:00,0,2024-06-26


In [None]:
def preprocess_mScreenStatus(df):
    from datetime import datetime, time as dtime, timedelta

    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])

    # base key 확보
    base_keys = df[['subject_id', 'lifelog_date']].drop_duplicates()
    base_keys['lifelog_date'] = base_keys['lifelog_date'].dt.date

    # 밤 9시부터 다음날 오전 11시 필터링
    df['hour'] = df['timestamp'].dt.hour
    df = df[(df['hour'] >= 21) | (df['hour'] < 11)].copy()
    df.loc[df['hour'] < 11, 'lifelog_date'] -= pd.Timedelta(days=1)

    df.sort_values(['subject_id', 'timestamp'], inplace=True)

    results = []

    for (subject_id, lifelog_date), group in df.groupby(['subject_id', 'lifelog_date']):
        group = group.sort_values('timestamp').reset_index(drop=True)

        # 1. 중간 각성(앞뒤 0, 본인 1) 제거
        prev = group['m_screen_use'].shift(1)
        next_ = group['m_screen_use'].shift(-1)
        mask = (group['m_screen_use'] == 1) & (prev == 0) & (next_ == 0)
        group.loc[mask, 'm_screen_use'] = 0

        # 2. 블록 단위로 짧은 각성 블록 제거
        group['is_sleep'] = group['m_screen_use'] == 0
        group['block'] = (group['is_sleep'] != group['is_sleep'].shift()).cumsum()
        block_info = group.groupby('block').agg(
            is_sleep=('is_sleep', 'first'),
            size=('is_sleep', 'size')
        )

        for i in range(1, len(block_info) - 1):
            if (
                block_info.iloc[i]['is_sleep'] == False and
                block_info.iloc[i]['size'] <= 2 and
                block_info.iloc[i - 1]['is_sleep'] and
                block_info.iloc[i + 1]['is_sleep']
            ):
                group.loc[group['block'] == block_info.index[i], 'm_screen_use'] = 0

        # 다시 블록 재계산 후 수면 추정
        group['is_sleep'] = group['m_screen_use'] == 0
        group['block'] = (group['is_sleep'] != group['is_sleep'].shift()).cumsum()
        sleep_blocks = group[group['is_sleep']].groupby('block').agg(
            sleep_start=('timestamp', 'first'),
            sleep_end=('timestamp', 'last'),
            duration_min=('timestamp', lambda x: (x.max() - x.min()).total_seconds() / 60)
        )

        sleep_time = wake_time = duration_min = None
        if not sleep_blocks.empty:
            longest_sleep = sleep_blocks.loc[sleep_blocks['duration_min'].idxmax()]
            sleep_time = longest_sleep['sleep_start'].time()
            wake_time = longest_sleep['sleep_end'].time()
            duration_min = (
                datetime.combine(datetime.today(), wake_time) - datetime.combine(datetime.today(), sleep_time)
            ).total_seconds() / 60
            if duration_min < 0:
                duration_min += 1440

            if not (4 <= wake_time.hour < 11):
                wake_time = None
            if not (sleep_time.hour >= 21 or sleep_time.hour < 3):
                sleep_time = None
            if duration_min < 100:
                sleep_time = None
                wake_time = None
                duration_min = None

        results.append({
            'subject_id': subject_id,
            'lifelog_date': lifelog_date.date(),
            'sleep_time': sleep_time,
            'wake_time': wake_time,
            'sleep_duration_min': round(duration_min, 1) if duration_min is not None else None
        })


    sleep_df = pd.DataFrame(results)
    result_df = base_keys.merge(sleep_df, on=['subject_id', 'lifelog_date'], how='left')

    # 시간 → 실수형 숫자 (예: 23:30 → 23.5)
    def time_to_float(t):
        if pd.isna(t):
            return None
        return round(t.hour + t.minute / 60 + t.second / 3600, 4)

    result_df['sleep_time'] = result_df['sleep_time'].apply(time_to_float)
    result_df['wake_time'] = result_df['wake_time'].apply(time_to_float)

    return result_df

In [None]:
def preprocess_mScreenStatus(df):
    from datetime import datetime, timedelta

    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])

    base_keys = df[['subject_id', 'lifelog_date']].drop_duplicates()
    base_keys['lifelog_date'] = base_keys['lifelog_date'].dt.date

    # 밤 9시 ~ 다음날 오전 11시 필터링
    df['hour'] = df['timestamp'].dt.hour
    df = df[(df['hour'] >= 21) | (df['hour'] < 11)].copy()
    df.loc[df['hour'] < 11, 'lifelog_date'] -= pd.Timedelta(days=1)
    df.sort_values(['subject_id', 'timestamp'], inplace=True)

    results = []

    for (subject_id, lifelog_date), group in df.groupby(['subject_id', 'lifelog_date']):
        group = group.sort_values('timestamp').reset_index(drop=True)

        # 중간 각성 제거
        prev = group['m_screen_use'].shift(1)
        next_ = group['m_screen_use'].shift(-1)
        mask = (group['m_screen_use'] == 1) & (prev == 0) & (next_ == 0)
        group.loc[mask, 'm_screen_use'] = 0

        # 짧은 각성 블록 제거
        group['is_sleep'] = group['m_screen_use'] == 0
        group['block'] = (group['is_sleep'] != group['is_sleep'].shift()).cumsum()
        block_info = group.groupby('block').agg(
            is_sleep=('is_sleep', 'first'),
            size=('is_sleep', 'size')
        )

        for i in range(1, len(block_info) - 1):
            if (
                block_info.iloc[i]['is_sleep'] == False and
                block_info.iloc[i]['size'] <= 2 and
                block_info.iloc[i - 1]['is_sleep'] and
                block_info.iloc[i + 1]['is_sleep']
            ):
                group.loc[group['block'] == block_info.index[i], 'm_screen_use'] = 0

        # 블록 재계산
        group['is_sleep'] = group['m_screen_use'] == 0
        group['block'] = (group['is_sleep'] != group['is_sleep'].shift()).cumsum()

        sleep_blocks = group[group['is_sleep']].groupby('block').agg(
            sleep_start=('timestamp', 'first'),
            sleep_end=('timestamp', 'last'),
            duration_min=('timestamp', lambda x: (x.max() - x.min()).total_seconds() / 60)
        )

        sleep_time = wake_time = duration_min = None
        if not sleep_blocks.empty:
            longest_sleep = sleep_blocks.loc[sleep_blocks['duration_min'].idxmax()]
            sleep_time = longest_sleep['sleep_start'].time()
            wake_time = longest_sleep['sleep_end'].time()
            duration_min = longest_sleep['duration_min']  # ✅ 정확하게 자정 넘는 경우도 반영됨

            # 유효 시간 범위 조건
            if not (4 <= wake_time.hour < 11):
                wake_time = None
            if not (sleep_time.hour >= 21 or sleep_time.hour < 3):
                sleep_time = None
            if duration_min < 100:
                sleep_time = None
                wake_time = None
                duration_min = None

        results.append({
            'subject_id': subject_id,
            'lifelog_date': lifelog_date.date(),
            'sleep_time': sleep_time,
            'wake_time': wake_time,
            'sleep_duration_min': round(duration_min, 1) if duration_min is not None else None
        })

    sleep_df = pd.DataFrame(results)
    result_df = base_keys.merge(sleep_df, on=['subject_id', 'lifelog_date'], how='left')

    # 시간 → 실수형 숫자 변환
    def time_to_float(t):
        if pd.isna(t):
            return None
        return round(t.hour + t.minute / 60 + t.second / 3600, 4)

    result_df['sleep_time'] = result_df['sleep_time'].apply(time_to_float)
    result_df['wake_time'] = result_df['wake_time'].apply(time_to_float)

    # 자정 넘어가는 경우 고려한 sleep_duration_min 재계산
    def compute_duration(row):
        sleep = row['sleep_time']
        wake = row['wake_time']
        if pd.isna(sleep) or pd.isna(wake):
            return None
        duration = (wake - sleep + 24) % 24
        return round(duration * 60, 1)  # 시간 단위 → 분 단위

    result_df['sleep_duration_min'] = result_df.apply(compute_duration, axis=1)

    return result_df

In [None]:
def calculate_circular_mean_sleep_time(sleep_times):
    sleep_times = pd.Series(sleep_times).dropna()
    if len(sleep_times) == 0:
        return np.nan  # 혹은 return 0.0 등 기본값 설정 가능

    def hour_to_radian(hour):
        return (hour % 24) / 24 * 2 * np.pi

    radians = np.array([hour_to_radian(t) for t in sleep_times])
    mean_radian = np.arctan2(np.mean(np.sin(radians)), np.mean(np.cos(radians)))
    mean_hour = (mean_radian / (2 * np.pi)) * 24 % 24

    return mean_hour

In [None]:
def circular_mean_sleep_time(times):

    # 결측치 제거
    valid_times = [t for t in times if pd.notna(t)]

    # 유효 데이터 개수 확인
    if len(valid_times) == 0:
        return None  # 결측치만 있는 경우

    # 시간 → 라디안 변환
    radians = [(t % 24) / 24 * 2 * np.pi for t in valid_times]

    # 사인/코사인 평균 계산
    sin_sum = np.mean(np.sin(radians))
    cos_sum = np.mean(np.cos(radians))

    # 평균 각도 계산
    if sin_sum == 0 and cos_sum == 0:
        return np.nan  # 불가능한 경우

    mean_radian = np.arctan2(sin_sum, cos_sum)

    # 평균 시간으로 변환
    mean_hour = (mean_radian / (2 * np.pi)) * 24
    if mean_hour < 0:
        mean_hour += 24

    return f'{int(mean_hour):02d}:{int((mean_hour % 1) * 60):02d}'

In [None]:
def add_ratios(df):
    df = df.copy()
    df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])
    df['weekday'] = df['lifelog_date'].dt.weekday  # 0=월 ~ 6=일
    df['week_type'] = df['weekday'].apply(lambda x: 'weekend' if x >= 5 else 'weekday')
    df['month'] = df['lifelog_date'].dt.month

    # 일반 평균 계산
    avg_duration = (
        df.groupby(['subject_id', 'month', 'week_type'])['sleep_duration_min']
        .mean()
        .reset_index(name='avg_sleep_duration')
    )

    # sleep_time, wake_time은 원형 평균 적용
    sleep_time_avg = (
        df.groupby(['subject_id', 'month', 'week_type'])['sleep_time']
        .apply(calculate_circular_mean_sleep_time)
        .reset_index(name='avg_sleep_time')
    )

    wake_time_avg = (
        df.groupby(['subject_id', 'month', 'week_type'])['wake_time']
        .apply(calculate_circular_mean_sleep_time)
        .reset_index(name='avg_wake_time')
    )

    # 평균값 합치기
    avg_df = sleep_time_avg.merge(wake_time_avg, on=['subject_id', 'month', 'week_type'])
    avg_df = avg_df.merge(avg_duration, on=['subject_id', 'month', 'week_type'])

    # 원본에 매칭
    df = df.merge(avg_df, on=['subject_id', 'month', 'week_type'], how='left')

    # 비율 변수 계산 (음수면 더 일찍 취침/기상/수면시간 짧음)
    df['sleep_time_diff'] = df['avg_sleep_time'] - df['sleep_time']
    df['wake_time_diff'] = df['avg_wake_time'] - df['wake_time']
    df['sleep_duration_diff'] = df['avg_sleep_duration'] - df['sleep_duration_min']
    df['sleep_time_ratio'] = df['sleep_time'] / df['avg_sleep_time']
    df['wake_time_ratio'] = df['wake_time'] / df['avg_wake_time']
    df['sleep_duration_ratio'] = df['sleep_duration_min'] / df['avg_sleep_duration']

    # lag feature
    df = df.sort_values(['subject_id', 'lifelog_date'])
    df['sleep_time_lag1'] = df.groupby('subject_id')['sleep_time'].shift(1)
    df['wake_time_lag1'] = df.groupby('subject_id')['wake_time'].shift(1)
    df['sleep_duration_lag1'] = df.groupby('subject_id')['sleep_duration_min'].shift(1)
    df['week_type_lag1'] = df.groupby('subject_id')['week_type'].shift(1)

    # 변화량
    df['sleep_time_diff_lag1'] = df.groupby('subject_id')['sleep_time'].diff()
    df['wake_time_diff_lag1'] = df.groupby('subject_id')['wake_time'].diff()
    df['sleep_duration_diff_lag1'] = df.groupby('subject_id')['sleep_duration_min'].diff()

    # 이동 평균 (3일)
    df['rolling_sleep_time_3d'] = (
        df.groupby('subject_id')['sleep_time']
        .rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)
    )
    df['rolling_wake_time_3d'] = (
        df.groupby('subject_id')['wake_time']
        .rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)
    )
    df['rolling_sleep_duration_3d'] = (
        df.groupby('subject_id')['sleep_duration_min']
        .rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)
    )

    # 존재하는 컬럼만 채우기
    existing_columns = df.columns.tolist()
    columns_to_fill_filtered = [col for col in [
        'sleep_time', 'wake_time', 'sleep_duration_min',
        'sleep_time_ratio', 'wake_time_ratio', 'sleep_duration_ratio',
        'sleep_time_diff', 'wake_time_diff', 'sleep_duration_diff',
        'sleep_time_lag1', 'wake_time_lag1', 'sleep_duration_lag1',
        'sleep_time_diff_lag1', 'wake_time_diff_lag1', 'sleep_duration_diff_lag1',
        'rolling_sleep_time_3d', 'rolling_wake_time_3d', 'rolling_sleep_duration_3d'
    ] if col in existing_columns]

    df[columns_to_fill_filtered] = df.groupby('subject_id')[columns_to_fill_filtered].ffill()

    result = df[[
        'subject_id', 'lifelog_date', 'week_type',
        'sleep_time', 'wake_time', 'sleep_duration_min',
        'avg_sleep_time', 'avg_wake_time', 'avg_sleep_duration',
        'sleep_time_ratio', 'wake_time_ratio', 'sleep_duration_ratio',
        'sleep_time_diff', 'wake_time_diff', 'sleep_duration_diff',
        'sleep_time_lag1','wake_time_lag1', 'sleep_duration_lag1','week_type_lag1',
        'sleep_time_diff_lag1','wake_time_diff_lag1','sleep_duration_diff_lag1',
        'rolling_sleep_time_3d','rolling_wake_time_3d','rolling_sleep_duration_3d'
    ]]

    return result

In [None]:
mScreenStatus2 = preprocess_mScreenStatus(mScreenStatus)
mScreenStatus2 = add_ratios(mScreenStatus2)

# check
print(f'\n # mScreenStatus2 shape: {mScreenStatus2.shape}')
mScreenStatus2.head(1)


 # mScreenStatus2 shape: (700, 25)


Unnamed: 0,subject_id,lifelog_date,week_type,sleep_time,wake_time,sleep_duration_min,avg_sleep_time,avg_wake_time,avg_sleep_duration,sleep_time_ratio,wake_time_ratio,sleep_duration_ratio,sleep_time_diff,wake_time_diff,sleep_duration_diff,sleep_time_lag1,wake_time_lag1,sleep_duration_lag1,week_type_lag1,sleep_time_diff_lag1,wake_time_diff_lag1,sleep_duration_diff_lag1,rolling_sleep_time_3d,rolling_wake_time_3d,rolling_sleep_duration_3d
0,id01,2024-06-26,weekday,23.45,5.25,348.0,23.1944,5.4887,377.6667,1.011,0.9565,0.9214,-0.2556,0.2387,29.6667,,,,,,,,23.45,5.25,348.0


In [None]:
mScreenStatus2평균수면시간 = mScreenStatus2.groupby(['subject_id','week_type']).apply(lambda x: pd.Series({
     '평균 취침시간':circular_mean_sleep_time(x['sleep_time'])
    ,'평균 기상시간':circular_mean_sleep_time(x['wake_time'])
    ,'평균 수면시간':x['sleep_duration_min'].mean()
})).reset_index()

# 저장
fname = f'mScreenStatus2평균수면시간.xlsx'
print(fname)
mScreenStatus2평균수면시간.to_excel(fname, index=False)

# # check
mScreenStatus2평균수면시간

mScreenStatus2평균수면시간.xlsx


Unnamed: 0,subject_id,week_type,평균 취침시간,평균 기상시간,평균 수면시간
0,id01,weekday,22:42,05:55,429.2292
1,id01,weekend,22:21,06:09,467.45
2,id02,weekday,22:54,07:13,496.0
3,id02,weekend,23:13,07:27,494.9583
4,id03,weekday,00:21,09:03,457.4359
5,id03,weekend,00:18,08:54,450.8667
6,id04,weekday,00:03,06:50,396.6721
7,id04,weekend,00:09,06:59,401.1739
8,id05,weekday,22:52,07:25,500.1064
9,id05,weekend,22:39,07:42,518.2778


### ✔️ mUsageStats 앱사용통계
- mUsageStats: Indicates which apps were used on the smartphone and for how long.

 - 몇시까지 핸드폰 보다가 잠잤는지
 - 통화, 전화 얼마나 했는지
 - YouTube 얼마나 봤는지
 - 메시지, 카카오톡 얼마나 했는지
 - NAVER 얼마나 했는지
 - 평소보다 얼마나 많은 앱을 이용했는지
 - 제외? -> 시스템 UI,One UI 홈

In [None]:
def extract_mUsageStats_info(row):
    m_data = row['m_usage_stats']
    app_name = [item['app_name'] for item in m_data]
    total_time = [item['total_time'] for item in m_data]
    return pd.Series({'app_name': app_name, 'total_time': total_time})

mUsageStats[['app_name', 'total_time']] = mUsageStats.apply(extract_mUsageStats_info, axis=1)
mUsageStats['lifelog_date'] = mUsageStats['timestamp'].astype(str).str[:10]
mUsageStats.head(1)

Unnamed: 0,subject_id,timestamp,m_usage_stats,app_name,total_time,lifelog_date
0,id01,2024-06-26 13:00:00,"[{'app_name': ' 캐시워크', 'total_time': 69}, {'app_name': 'NAVER', 'total_time': 549}, {'app_name': ' ✝️성경일독Q', 'total_time': 7337}]","[ 캐시워크, NAVER, ✝️성경일독Q]","[69, 549, 7337]",2024-06-26


In [None]:
# def process_mUsageStats(df):
#     df = df.copy()
#     df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])
#     df['timestamp'] = pd.to_datetime(df['timestamp'])
#     df['요일'] = df['lifelog_date'].dt.day_name()

#     # 리스트 평탄화
#     exploded_df = df.explode(['app_name', 'total_time'])
#     exploded_df['total_time'] = exploded_df['total_time'].astype(float)
#     exploded_df['total_time'] = exploded_df['total_time'] * 0.001 / 60  # 밀리초 → 초 → 분 변환

#     # app_name 특수문자 제거
#     exploded_df['app_name'] = exploded_df['app_name'].astype(str).apply(
#         lambda x: re.sub(r'[^가-힣a-zA-Z0-9]', '', x)
#     )

#     # 시스템 앱 제거
#     filtered_df = exploded_df[~exploded_df['app_name'].isin(['시스템UI'])]  # '시스템UI'만 제거 (OneUI홈은 포함)

#     # 주요 파생변수 생성
#     def calculate_daily_metrics(group):
#         last_use = group['timestamp'].max()

#         app_times = {
#             '통화_time': group[group['app_name'] == '통화']['total_time'].sum(),
#             '전화_time': group[group['app_name'] == '전화']['total_time'].sum(),
#             'YouTube_time': group[group['app_name'] == 'YouTube']['total_time'].sum(),
#             '메신저_time': group[group['app_name'].isin(['메시지', '카카오톡'])]['total_time'].sum(),
#             'NAVER_time': group[group['app_name'] == 'NAVER']['total_time'].sum(),
#             '캐시워크_time': group[group['app_name'] == '캐시워크']['total_time'].sum(),
#             '성경일독Q_time': group[group['app_name'] == '성경일독Q']['total_time'].sum(),
#             'OneUI홈_time': group[group['app_name'] == 'OneUI홈']['total_time'].sum(),
#         }

#         return pd.Series({
#             **app_times,
#             'unique_app_count': group['app_name'].nunique(),
#             'total_screen_time': group['total_time'].sum()
#         })

#     # daily metrics 생성
#     daily_stats = filtered_df.groupby(['subject_id', 'lifelog_date']).apply(calculate_daily_metrics).reset_index()

#     # subject_id별 평균 총화면시간 구하기
#     avg_screen_time = daily_stats.groupby('subject_id')['total_screen_time'].mean().to_dict()

#     # 평균대비 화면사용량(%) 생성
#     def compute_screen_usage(row):
#         avg_time = avg_screen_time.get(row['subject_id'], np.nan)
#         if pd.isna(avg_time) or avg_time == 0:
#             return np.nan
#         return round((row['total_screen_time'] / avg_time - 1) * 100, 1)

#     daily_stats['screen_time_vs_avg_pct'] = daily_stats.apply(compute_screen_usage, axis=1)

#     return daily_stats

In [None]:
def process_mUsageStats(df):
    df = df.copy()
    df['lifelog_date'] = pd.to_datetime(df['lifelog_date'])
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['요일'] = df['lifelog_date'].dt.day_name()
    df['hour'] = df['timestamp'].dt.hour

    # 시간대 분류
    def map_time_period(row):
        if 20 <= row['hour'] <= 23:
            return 'beforebed'
        else:
            return 'activehour'

    df['time_period'] = df.apply(map_time_period, axis=1)

    # 리스트 평탄화
    exploded_df = df.explode(['app_name', 'total_time'])
    exploded_df['total_time'] = exploded_df['total_time'].astype(float)
    exploded_df['total_time'] = exploded_df['total_time'] * 0.001 / 60  # 밀리초 → 초 → 분 변환

    # app_name 특수문자 제거
    exploded_df['app_name'] = exploded_df['app_name'].astype(str).apply(
        lambda x: re.sub(r'[^가-힣a-zA-Z0-9]', '', x)
    )

    # 시스템 앱 제거
    filtered_df = exploded_df[~exploded_df['app_name'].isin(['시스템UI'])]

    # 주요 파생변수 생성
    def calculate_daily_metrics(group):
        app_times = {
            '통화_time': group[group['app_name'] == '통화']['total_time'].sum(),
            '전화_time': group[group['app_name'] == '전화']['total_time'].sum(),
            'YouTube_time': group[group['app_name'] == 'YouTube']['total_time'].sum(),
            '메신저_time': group[group['app_name'].isin(['메시지', '카카오톡'])]['total_time'].sum(),
            'NAVER_time': group[group['app_name'] == 'NAVER']['total_time'].sum(),
            '캐시워크_time': group[group['app_name'] == '캐시워크']['total_time'].sum(),
            '성경일독Q_time': group[group['app_name'] == '성경일독Q']['total_time'].sum(),
            'OneUI홈_time': group[group['app_name'] == 'OneUI홈']['total_time'].sum(),
        }

        return pd.Series({
            **app_times,
            'unique_app_count': group['app_name'].nunique(),
            'total_screen_time': group['total_time'].sum()
        })

    # 일자/시간대별 요약
    daily_stats = filtered_df.groupby(['subject_id', 'lifelog_date', 'time_period']).apply(calculate_daily_metrics).reset_index()

    # subject_id별 평균 총화면시간
    avg_screen_time = daily_stats.groupby('subject_id')['total_screen_time'].mean().to_dict()

    # 평균 대비 비율
    def compute_screen_usage(row):
        avg_time = avg_screen_time.get(row['subject_id'], np.nan)
        if pd.isna(avg_time) or avg_time == 0:
            return np.nan
        return round((row['total_screen_time'] / avg_time - 1) * 100, 1)

    daily_stats['screen_time_vs_avg_pct'] = daily_stats.apply(compute_screen_usage, axis=1)

    # 피벗
    daily_stats = daily_stats.pivot(index=['subject_id', 'lifelog_date'], columns='time_period')
    daily_stats.columns = [f"{tp}_{metric}" for metric, tp in daily_stats.columns]
    daily_stats = daily_stats.reset_index()

    return daily_stats

In [None]:
mUsageStats2 = process_mUsageStats(mUsageStats)

# check
print(f'\n # mUsageStats2 shape: {mUsageStats2.shape}')
mUsageStats2.head(1)


 # mUsageStats2 shape: (689, 24)


Unnamed: 0,subject_id,lifelog_date,activehour_통화_time,beforebed_통화_time,activehour_전화_time,beforebed_전화_time,activehour_YouTube_time,beforebed_YouTube_time,activehour_메신저_time,beforebed_메신저_time,activehour_NAVER_time,beforebed_NAVER_time,activehour_캐시워크_time,beforebed_캐시워크_time,activehour_성경일독Q_time,beforebed_성경일독Q_time,activehour_OneUI홈_time,beforebed_OneUI홈_time,activehour_unique_app_count,beforebed_unique_app_count,activehour_total_screen_time,beforebed_total_screen_time,activehour_screen_time_vs_avg_pct,beforebed_screen_time_vs_avg_pct
0,id01,2024-06-26,9.001,0.2079,11.3007,0.7731,0.1061,0.0,43.6359,14.5713,8.4852,0.1351,18.6694,5.4722,88.3836,27.6892,61.116,27.9861,25.0,20.0,266.7672,156.8681,-29.0,-58.3


### ✔️ mWifi 주변wifi 정보
- Wifi devices around individual subject.
 - -30 ~ -50 dBm	매우 강한 신호 (최적)
 - -51 ~ -60 dBm	강한 신호 (문제 없음)
 - -61 ~ -70 dBm	괜찮은 신호 (약간 느릴 수 있음)
 - -71 ~ -80 dBm	약한 신호 (끊김 주의)
 - -81 dBm 이하	매우 약한 신호 (거의 끊김)

In [None]:
def extract_wifi_info(row):
    wifi_data = row['m_wifi']
    bssids = [item['bssid'] for item in wifi_data]
    rssis = [item['rssi'] for item in wifi_data]
    return pd.Series({'bssid': bssids, 'rssi': rssis})

mWifi[['bssid', 'rssi']] = mWifi.apply(extract_wifi_info, axis=1)
mWifi['lifelog_date'] = mWifi['timestamp'].astype(str).str[:10]
mWifi.head(1)

Unnamed: 0,subject_id,timestamp,m_wifi,bssid,rssi,lifelog_date
0,id01,2024-06-26 12:03:00,"[{'bssid': 'a0:0f:37:9a:5d:8b', 'rssi': -78}, {'bssid': 'a0:0f:37:9a:5d:8c', 'rssi': -78}, {'bssid': 'a0:0f:37:9a:5d:8d', 'rssi': -78}, {'bssid': 'a0:0f:37:9a:5d:8e', 'rssi': -78}, {'bssid': 'a0:0f:37:9a:5d:8f', 'rssi': -78}, {'bssid': 'a0:0f:37:96:56:ef', 'rssi': -58}, {'bssid': '88:36:6c:86:75:84', 'rssi': -72}, {'bssid': 'a0:0f:37:96:56:ee', 'rssi': -58}, {'bssid': 'a0:0f:37:96:56:ed', 'rssi': -58}, {'bssid': '86:25:19:b5:b2:a5', 'rssi': -61}, {'bssid': 'a0:0f:37:96:56:ec', 'rssi': -58}, {'bssid': '1e:39:29:8e:fb:e9', 'rssi': -71}, {'bssid': '52:c2:e8:c7:9b:e4', 'rssi': -82}, {'bssid': 'a0:0f:37:96:56:eb', 'rssi': -58}, {'bssid': '12:e3:c7:09:20:34', 'rssi': -88}, {'bssid': '58:86:94:4a:08:b8', 'rssi': -82}, {'bssid': '90:9f:33:28:d0:2e', 'rssi': -78}, {'bssid': '00:26:66:bc:4e:18', 'rssi': -85}, {'bssid': 'f6:0a:f4:43:4b:ba', 'rssi': -45}, {'bssid': '10:e3:c7:09:20:35', 'rssi': -63}, {'bssid': '10:e3:c7:09:20:34', 'rssi': -89}, {'bssid': '1c:39:29:48:04:92', 'rssi': -82}, {'bssid': '12:e3:c7:07:9d:df', 'rssi': -83}, {'bssid': '86:25:19:c3:44:07', 'rssi': -84}, {'bssid': 'a0:0f:37:9a:37:2f', 'rssi': -76}, {'bssid': 'a0:0f:37:9a:37:2e', 'rssi': -76}, {'bssid': 'a0:0f:37:9a:37:2d', 'rssi': -76}, {'bssid': '0a:09:b4:74:05:ec', 'rssi': -72}, {'bssid': 'a0:0f:37:9a:37:2c', 'rssi': -76}, {'bssid': 'a0:0f:37:9a:37:2b', 'rssi': -76}, {'bssid': '0a:09:b4:74:05:eb', 'rssi': -59}, {'bssid': 'c0:25:2f:d8:c1:a6', 'rssi': -82}, {'bssid': '16:7f:67:bb:fa:f8', 'rssi': -79}, {'bssid': '3c:f3:92:ff:00:01', 'rssi': -82}, {'bssid': '06:09:b4:74:05:ec', 'rssi': -72}, {'bssid': '06:09:b4:74:05:eb', 'rssi': -59}, {'bssid': '12:e3:c7:0a:74:d1', 'rssi': -78}, {'bssid': '88:36:6c:a9:6f:8e', 'rssi': -63}, {'bssid': '02:e3:c7:09:20:34', 'rssi': -88}, {'bssid': '00:09:b4:74:05:eb', 'rssi': -60}, {'bssid': '00:09:b4:74:05:ec', 'rssi': -72}, {'bssid': '00:1d:93:93:cf:fe', 'rssi': -19}, {'bssid': '8e:e2:ac:a5:9d:15', 'rssi': -72}]","[a0:0f:37:9a:5d:8b, a0:0f:37:9a:5d:8c, a0:0f:37:9a:5d:8d, a0:0f:37:9a:5d:8e, a0:0f:37:9a:5d:8f, a0:0f:37:96:56:ef, 88:36:6c:86:75:84, a0:0f:37:96:56:ee, a0:0f:37:96:56:ed, 86:25:19:b5:b2:a5, a0:0f:37:96:56:ec, 1e:39:29:8e:fb:e9, 52:c2:e8:c7:9b:e4, a0:0f:37:96:56:eb, 12:e3:c7:09:20:34, 58:86:94:4a:08:b8, 90:9f:33:28:d0:2e, 00:26:66:bc:4e:18, f6:0a:f4:43:4b:ba, 10:e3:c7:09:20:35, 10:e3:c7:09:20:34, 1c:39:29:48:04:92, 12:e3:c7:07:9d:df, 86:25:19:c3:44:07, a0:0f:37:9a:37:2f, a0:0f:37:9a:37:2e, a0:0f:37:9a:37:2d, 0a:09:b4:74:05:ec, a0:0f:37:9a:37:2c, a0:0f:37:9a:37:2b, 0a:09:b4:74:05:eb, c0:25:2f:d8:c1:a6, 16:7f:67:bb:fa:f8, 3c:f3:92:ff:00:01, 06:09:b4:74:05:ec, 06:09:b4:74:05:eb, 12:e3:c7:0a:74:d1, 88:36:6c:a9:6f:8e, 02:e3:c7:09:20:34, 00:09:b4:74:05:eb, 00:09:b4:74:05:ec, 00:1d:93:93:cf:fe, 8e:e2:ac:a5:9d:15]","[-78, -78, -78, -78, -78, -58, -72, -58, -58, -61, -58, -71, -82, -58, -88, -82, -78, -85, -45, -63, -89, -82, -83, -84, -76, -76, -76, -72, -76, -76, -59, -82, -79, -82, -72, -59, -78, -63, -88, -60, -72, -19, -72]",2024-06-26


In [None]:
def process_mWifi(df,threshold):

    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    def filter_strong_rssi(df,threshold):
        filtered_df = df.copy()
        def filter_row(row):
            bssids = row['bssid']
            rssis = row['rssi']
            # RSSI > threshold 조건 만족하는 항목만 추출
            filtered = [(b, r) for b, r in zip(bssids, rssis) if r > threshold]
            if filtered:
                new_bssids, new_rssis = zip(*filtered)
                return pd.Series({'bssid': list(new_bssids), 'rssi': list(new_rssis)})
            else:
                return pd.Series({'bssid': [], 'rssi': []})
        filtered_df[['bssid', 'rssi']] = filtered_df.apply(filter_row, axis=1)
        return filtered_df

    # === wifi 약신호 제거 ===
    df = filter_strong_rssi(df, threshold=threshold) ####

    features = []
    grouped = df.groupby(['subject_id', 'lifelog_date'])

    for (subject_id, date), group in grouped:
        scan_count = len(group)
        bssid_flat = sum(group['bssid'], [])  # flatten
        rssi_flat = sum(group['rssi'], [])    # flatten

        unique_bssid_count = len(set(bssid_flat))
        avg_rssi = sum(rssi_flat) / len(rssi_flat) if rssi_flat else None
        max_rssi = max(rssi_flat) if rssi_flat else None
        min_rssi = min(rssi_flat) if rssi_flat else None
        strong_rssi_ratio = sum(1 for r in rssi_flat if r > -60) / len(rssi_flat) if rssi_flat else 0
        empty_scan_count = sum(1 for b in group['bssid'] if len(b) == 0)

        # 가장 많이 탐지된 BSSID
        bssid_counter = Counter(bssid_flat)
        top_bssid, top_bssid_count = bssid_counter.most_common(1)[0] if bssid_counter else (None, 0)

        first_time = group['timestamp'].min()
        last_time = group['timestamp'].max()
        hour_span = (last_time - first_time).total_seconds() / 60  # 분 단위

        features.append({
            'subject_id': subject_id,
            'lifelog_date': date,
            'scan_count': scan_count,
            'unique_bssid_count': unique_bssid_count,
            'avg_rssi': avg_rssi,
            'max_rssi': max_rssi,
            # 'min_rssi': min_rssi,
            # 'strong_signal_ratio': strong_rssi_ratio,
            'empty_scan_count': empty_scan_count,
            'top_bssid': top_bssid,
            'top_bssid_count': top_bssid_count,
            'hour_span_minutes': hour_span
        })

    return pd.DataFrame(features)

In [None]:
mWifi2 = process_mWifi(mWifi,threshold=-60)

# check
print(f'\n # mWifi2 shape: {mWifi2.shape}')
mWifi2.head(1)


 # mWifi2 shape: (685, 10)


Unnamed: 0,subject_id,lifelog_date,scan_count,unique_bssid_count,avg_rssi,max_rssi,empty_scan_count,top_bssid,top_bssid_count,hour_span_minutes
0,id01,2024-06-26,69,48,-49.6109,-19.0,11,86:25:19:9f:9b:be,19,716.0


### ✔️ wHr 심박동수
- Heart rate readings recorded by the smartwatch.


In [None]:
wHr['lifelog_date'] = wHr['timestamp'].astype(str).str[:10]
wHr.head(1)

Unnamed: 0,subject_id,timestamp,heart_rate,lifelog_date
0,id01,2024-06-26 12:23:00,"[134, 134, 135, 133, 134, 135, 134, 135, 134, 133, 133, 133, 132, 132, 131, 131, 131, 132, 132, 134, 134, 134, 132, 130, 128, 126, 126, 126, 127, 129, 130, 129, 130, 130, 127, 127, 126, 125, 123]",2024-06-26


In [None]:
def get_time_block(hour):
    if 0 <= hour < 6:
        return 'early_morning'
    elif 6 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'

def process_wHr_by_timeblock(df):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = df['timestamp'].dt.date
    df['block'] = df['timestamp'].dt.hour.map(get_time_block)

    results = []

    for (subj, date), group in df.groupby(['subject_id', 'lifelog_date']):
        block_stats = {'subject_id': subj, 'lifelog_date': date}

        for block, block_group in group.groupby('block'):
            hr_all = []
            for row in block_group['heart_rate']:
                parsed = ast.literal_eval(row) if isinstance(row, str) else row
                hr_all.extend([int(h) for h in parsed if h is not None])

            if not hr_all:
                continue

            above_100 = [hr for hr in hr_all if hr > 100]
            block_stats[f'hr_{block}_mean'] = np.mean(hr_all)
            block_stats[f'hr_{block}_std'] = np.std(hr_all)
            block_stats[f'hr_{block}_max'] = np.max(hr_all)
            block_stats[f'hr_{block}_min'] = np.min(hr_all)
            block_stats[f'hr_{block}_above_100_ratio'] = len(above_100) / len(hr_all)

        results.append(block_stats)

    return pd.DataFrame(results)

In [None]:
wHr2 = process_wHr_by_timeblock(wHr)

# check
print(f'\n # wHr2 shape: {wHr2.shape}')
wHr2.head(1)


 # wHr2 shape: (636, 22)


Unnamed: 0,subject_id,lifelog_date,hr_afternoon_mean,hr_afternoon_std,hr_afternoon_max,hr_afternoon_min,hr_afternoon_above_100_ratio,hr_evening_mean,hr_evening_std,hr_evening_max,hr_evening_min,hr_evening_above_100_ratio,hr_early_morning_mean,hr_early_morning_std,hr_early_morning_max,hr_early_morning_min,hr_early_morning_above_100_ratio,hr_morning_mean,hr_morning_std,hr_morning_max,hr_morning_min,hr_morning_above_100_ratio
0,id01,2024-06-26,80.5333,12.6366,142.0,59.0,0.0773,82.4768,10.2932,124.0,59.0,0.0555,,,,,,,,,,


### ✔️ wLight 앰비언트 라이트
- Ambient light measured by the smartwatch.  
  - 어두운 밤 0.1 ~ 1 lux 캄캄한 방, 달빛 없는 밤
  - 가로등 켜진 거리 10 ~ 20 lux 흐릿한 외부 조명
  - 실내 조명 100 ~ 500 lux 사무실, 일반 거실
  - 밝은 실외 10,000 ~ 25,000 lux 맑은 날 햇빛
  - 직사광선 아래 30,000 ~ 100,000 lux 여름 한낮, 매우 강한 햇빛

In [None]:
wLight['lifelog_date'] = wLight['timestamp'].astype(str).str[:10]
wLight.head(1)

Unnamed: 0,subject_id,timestamp,w_light,lifelog_date
0,id01,2024-06-26 12:17:00,633.0,2024-06-26


In [None]:
def get_time_block(hour):
    if 0 <= hour < 6:
        return 'early_morning'
    elif 6 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    else:
        return 'evening'

def process_wLight_by_timeblock(df):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = df['timestamp'].dt.date
    df['block'] = df['timestamp'].dt.hour.map(get_time_block)

    results = []

    for (subj, date), group in df.groupby(['subject_id', 'lifelog_date']):
        block_stats = {'subject_id': subj, 'lifelog_date': date}

        for block, block_group in group.groupby('block'):
            lux = block_group['w_light'].dropna().values
            if len(lux) == 0:
                continue

            block_stats[f'wlight_{block}_mean'] = np.mean(lux)
            block_stats[f'wlight_{block}_std'] = np.std(lux)
            block_stats[f'wlight_{block}_max'] = np.max(lux)
            block_stats[f'wlight_{block}_min'] = np.min(lux)

        results.append(block_stats)

    return pd.DataFrame(results)

In [None]:
wLight2 = process_wLight_by_timeblock(wLight)

# check
print(f'\n # wLight2 shape: {wLight2.shape}')
wLight2.head(1)


 # wLight2 shape: (664, 18)


Unnamed: 0,subject_id,lifelog_date,wlight_afternoon_mean,wlight_afternoon_std,wlight_afternoon_max,wlight_afternoon_min,wlight_evening_mean,wlight_evening_std,wlight_evening_max,wlight_evening_min,wlight_early_morning_mean,wlight_early_morning_std,wlight_early_morning_max,wlight_early_morning_min,wlight_morning_mean,wlight_morning_std,wlight_morning_max,wlight_morning_min
0,id01,2024-06-26,394.5251,1458.7346,20874.0,0.0,89.0202,101.6844,264.0,0.0,,,,,,,,


### ✔️ wPedo 걸음수
- Step data recorded by the smartwatch.

In [None]:
wPedo['lifelog_date'] = wPedo['timestamp'].astype(str).str[:10]
wPedo.head(1)

Unnamed: 0,subject_id,timestamp,step,step_frequency,running_step,walking_step,distance,speed,burned_calories,lifelog_date
0,id01,2024-06-26 12:09:00,10,0.1667,0,0,8.33,0.1388,0.0,2024-06-26


In [None]:
def process_wPedo(df):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['lifelog_date'] = df['timestamp'].dt.date

    summary = df.groupby(['subject_id', 'lifelog_date']).agg({
        'step': 'sum',
        'step_frequency': 'mean',
        'distance': 'sum',
        'speed': ['mean', 'max'],
        'burned_calories': 'sum'
    }).reset_index()

    # 컬럼 이름 정리
    summary.columns = ['subject_id', 'lifelog_date',
                       'step_sum', 'step_frequency_mean',
                       'distance_sum', 'speed_mean', 'speed_max',
                       'burned_calories_sum']

    return summary

In [None]:
wPedo2 = process_wPedo(wPedo)

# check
print(f'\n # wPedo2 shape: {wPedo2.shape}')
wPedo2.head(1)


 # wPedo2 shape: (653, 8)


Unnamed: 0,subject_id,lifelog_date,step_sum,step_frequency_mean,distance_sum,speed_mean,speed_max,burned_calories_sum
0,id01,2024-06-26,3578,0.0927,2782.1901,0.0721,1.5882,189.3191


### 🔥 운동 추정 파생변수

- mActivity 추정행동
- mGps, 핸드폰 위치
- wHr 심박동수
- wPedo 걸음수

In [None]:
def average_list_columns(df, list_columns, pk_cols=['subject_id', 'lifelog_date']):

    for col in list_columns:

        def safe_mean(x):
            if isinstance(x, list):
                return np.mean(x) if len(x) > 0 else np.nan
            elif isinstance(x, (int, float, np.integer, np.floating, type(None))):
                return x
            elif isinstance(x, (np.ndarray, pd.Series)):
                return np.mean(x)
            elif pd.api.types.is_scalar(x) and pd.isna(x):
                return np.nan
            else:
                return np.nan

        df[col] = df[col].apply(safe_mean)

    return df

def compute_estimated_exercise(mActivity, mGps, wHr, wPedo, minutes):

    # 리스트 평균 처리
    mGps = mGps.copy()
    wHr = wHr.copy()
    mGps = average_list_columns(mGps, ['speed'])
    Hr = average_list_columns(wHr, ['heart_rate'])

    for df in [mActivity, mGps, wHr, wPedo]:
        df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 5분 지속 조건 판단 함수
    def sustained_condition(df, cond_col,minutes):
        df = df[df[cond_col]].sort_values('timestamp')
        times = df['timestamp']
        start = prev = None
        for t in times:
            if start is None:
                start = prev = t
            elif t <= prev + timedelta(minutes=1):
                prev = t
            else:
                if prev - start >= timedelta(minutes=minutes):
                    return True
                start = prev = t
        return (prev - start) >= timedelta(minutes=minutes) if start else False

    # mActivity: m_activity == 7 지속
    mActivity['m_cond'] = mActivity['m_activity'] == 7
    act_flag = mActivity.groupby(['subject_id', 'lifelog_date']) \
                        .apply(lambda df: sustained_condition(df, 'm_cond',40)) \
                        .reset_index(name='act_exe_flag')

    # mGps: speed ∈ [2.5, 5.5] 지속
    mGps['gps_cond'] = mGps['speed'].between(2.5, 5.5)
    gps_flag = mGps.groupby(['subject_id', 'lifelog_date']) \
                   .apply(lambda df: sustained_condition(df, 'gps_cond',minutes)) \
                   .reset_index(name='gps_exe_flag')

    # wHr: hr ≥ 133 상태가 5분 이상 유지
    wHr['whr_cond'] = wHr['heart_rate'] >= 133
    hr_flag = wHr.groupby(['subject_id', 'lifelog_date']) \
                   .apply(lambda df: sustained_condition(df, 'whr_cond',minutes)) \
                   .reset_index(name='hr_exe_flag')

    # wPedo: step ≥ 10000 또는 running_step ≥ 1이 5분 이상
    pedo_daily = wPedo.groupby(['subject_id', 'lifelog_date'])['step'].sum().reset_index(name='total_steps')
    pedo_daily['step_flag'] = pedo_daily['total_steps'] >= 10000

    wPedo['r_cond'] = wPedo['running_step'] >= 1
    run_flag = wPedo.groupby(['subject_id', 'lifelog_date']) \
                    .apply(lambda df: sustained_condition(df, 'r_cond', minutes)) \
                    .reset_index(name='run_flag')

    pedo_flag = pedo_daily.merge(run_flag, on=['subject_id', 'lifelog_date'], how='outer')
    pedo_flag['step_flag'] = pedo_flag['step_flag'].fillna(False)
    pedo_flag['run_flag'] = pedo_flag['run_flag'].fillna(False)
    pedo_flag['pedo_exe_flag'] = pedo_flag[['step_flag', 'run_flag']].any(axis=1)

    # 병합 및 최종 판단
    result = act_flag.merge(gps_flag, on=['subject_id', 'lifelog_date'], how='outer') \
                     .merge(hr_flag, on=['subject_id', 'lifelog_date'], how='outer') \
                     .merge(pedo_flag[['subject_id', 'lifelog_date', 'pedo_exe_flag']], on=['subject_id', 'lifelog_date'], how='outer')

    # NaN 처리 및 1/0 변환
    for col in ['act_exe_flag', 'gps_exe_flag', 'hr_exe_flag', 'pedo_exe_flag']:
        result[col] = result[col].fillna(False)

    result['pred_exe_flag'] = result[['act_exe_flag', 'gps_exe_flag', 'hr_exe_flag', 'pedo_exe_flag']].any(axis=1)

    # 👉 1/0 변환
    for col in ['act_exe_flag', 'gps_exe_flag', 'hr_exe_flag', 'pedo_exe_flag', 'pred_exe_flag']:
        result[col] = result[col].astype(int)

    display(result[['pred_exe_flag', 'act_exe_flag', 'gps_exe_flag', 'hr_exe_flag', 'pedo_exe_flag']].sum())

    return result[['subject_id', 'lifelog_date', 'pred_exe_flag', 'act_exe_flag', 'gps_exe_flag', 'hr_exe_flag', 'pedo_exe_flag']]

In [None]:
# 추정운동여부
exeFlag = compute_estimated_exercise(mActivity, mGps, wHr, wPedo,10)

# check
print(f'\n # exeFlag shape: {exeFlag.shape}')
exeFlag.head(1)

Unnamed: 0,0
pred_exe_flag,66
act_exe_flag,31
gps_exe_flag,5
hr_exe_flag,18
pedo_exe_flag,19



 # exeFlag shape: (700, 7)


Unnamed: 0,subject_id,lifelog_date,pred_exe_flag,act_exe_flag,gps_exe_flag,hr_exe_flag,pedo_exe_flag
0,id01,2024-06-26,0,0,0,0,0


### 🔥 Sleeptime 일어난 건수

- Sleeptime에 (mLight 주변 밝기), (wLight 앰비언트 라이트) 변화 건수

In [None]:
def compute_night_awake_features(df, prefix):

    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # 00시~06시 필터
    df['hour'] = df['timestamp'].dt.hour
    df_night = df[(df['hour'] >= 0) & (df['hour'] < 6)].copy()

    # 깨어있는 분 계산
    df_night['awake_minute'] = (df_night[prefix] > 0).astype(int)

    # 깨어난 횟수 계산 (0 → 양수 전환)
    def count_awake_blocks(x):
        return ((x > 0) & (x.shift(fill_value=0) == 0)).sum()

    # 그룹별 집계
    result = df_night.groupby(['subject_id', 'lifelog_date']).agg(
        awake_minutes=('awake_minute', 'sum'),
        awake_blocks=(prefix, count_awake_blocks)
    ).reset_index()

    # 컬럼명 변경
    result = result.rename(columns={
        'awake_minutes': f'{prefix}_awake_minutes',
        'awake_blocks': f'{prefix}_awake_blocks'
    })

    # train에 결과 합치기 위해서 -1 day 하기
    result['lifelog_date'] = pd.to_datetime(result['lifelog_date'])
    result['lifelog_date'] = result['lifelog_date'] + pd.Timedelta(days=-1)

    result['lifelog_date'] = result['lifelog_date'].astype(str)

    return result

In [None]:
a1 = compute_night_awake_features(mLight,'m_light')
a2 = compute_night_awake_features(wLight,'w_light')
sleepWakeCnt = train[['subject_id','lifelog_date']].copy()

sleepWakeCnt = sleepWakeCnt.merge(a1, on=['subject_id','lifelog_date'], how='left')
sleepWakeCnt = sleepWakeCnt.merge(a2, on=['subject_id','lifelog_date'], how='left')

sleepWakeCnt['awake_minutes'] = sleepWakeCnt[['m_light_awake_minutes','w_light_awake_minutes']].max(axis=1)
sleepWakeCnt['awake_blocks'] = sleepWakeCnt[['m_light_awake_blocks','w_light_awake_blocks']].max(axis=1)

# check
sleepWakeCnt.head()

Unnamed: 0,subject_id,lifelog_date,m_light_awake_minutes,m_light_awake_blocks,w_light_awake_minutes,w_light_awake_blocks,awake_minutes,awake_blocks
0,id01,2024-06-26,4.0,1.0,17.0,3.0,17.0,3.0
1,id01,2024-06-27,4.0,1.0,14.0,3.0,14.0,3.0
2,id01,2024-06-28,4.0,1.0,0.0,0.0,4.0,1.0
3,id01,2024-06-29,1.0,1.0,0.0,0.0,1.0,1.0
4,id01,2024-06-30,2.0,1.0,0.0,0.0,2.0,1.0


### 📦 merge 데이터
- train, test 기간 서로 겹침

In [None]:
train = pd.read_csv('/content/drive/MyDrive/data/ch2025_metrics_train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/ch2025_submission_sample.csv')

# 일자변수 타입 변환
mACStatus2['lifelog_date'] = mACStatus2['lifelog_date'].astype(str)
mActivity2['lifelog_date'] = mActivity2['lifelog_date'].astype(str)
mAmbience2['lifelog_date'] = mAmbience2['lifelog_date'].astype(str)
mBle2['lifelog_date'] = mBle2['lifelog_date'].astype(str)
mGps2['lifelog_date'] = mGps2['lifelog_date'].astype(str)
mLight2['lifelog_date'] = mLight2['lifelog_date'].astype(str)
mScreenStatus2['lifelog_date'] = mScreenStatus2['lifelog_date'].astype(str)
mUsageStats2['lifelog_date'] = mUsageStats2['lifelog_date'].astype(str)
mWifi2['lifelog_date'] = mWifi2['lifelog_date'].astype(str)
wHr2['lifelog_date'] = wHr2['lifelog_date'].astype(str)
wLight2['lifelog_date'] = wLight2['lifelog_date'].astype(str)
wPedo2['lifelog_date'] = wPedo2['lifelog_date'].astype(str)

# ---- new ----

exeFlag['lifelog_date'] = exeFlag['lifelog_date'].astype(str)
sleepWakeCnt['lifelog_date'] = sleepWakeCnt['lifelog_date'].astype(str)

In [None]:
df_list = [
    mACStatus2,       # 1
    mActivity2,       # 2
    mAmbience2,       # 3
    mBle2,            # 4
    mGps2,            # 5
    mLight2,          # 6
    mScreenStatus2,   # 7
    mUsageStats2,     # 8
    mWifi2,           # 9
    wHr2,             # 10
    wLight2,          # 11
    wPedo2,           # 12
    # ---- new ----
    sleepWakeCnt,
    exeFlag
]

data = reduce(lambda left, right: pd.merge(left, right, on=['subject_id', 'lifelog_date'], how='outer'), df_list)
data['lifelog_date'] = data['lifelog_date'].astype(str)

# 중복체크
print(data.shape)
print(data[['subject_id','lifelog_date']].drop_duplicates().shape)

# merge
train2 = train.merge(data, on=['subject_id','lifelog_date'], how='left')
test2 = test.merge(data, on=['subject_id','lifelog_date'], how='left')

# 저장
print('# train  shape:',train.shape)
print('# train2 shape:',test2.shape)
print('# test   shape:',test.shape)
print('# test2  shape:',test2.shape)

(700, 166)
(700, 2)
# train  shape: (450, 9)
# train2 shape: (250, 173)
# test   shape: (250, 9)
# test2  shape: (250, 173)


In [None]:
# 저장
train2.to_parquet(f"/content/drive/MyDrive/data/train_0530.parquet")
test2.to_parquet(f"/content/drive/MyDrive/data/test_0530.parquet")

## 📌 모델 학습

In [141]:
train2 = pd.read_parquet(f"/content/drive/MyDrive/data/train_0530.parquet")
test2 = pd.read_parquet(f"/content/drive/MyDrive/data/test_0530.parquet")

In [142]:
train = train2.copy()
test = test2.copy()

# drop_features = ['afterwork_max_label','sleeptime_max_label','worktime_max_label']
drop_features = ['top_bssid','week_type'] #
drop_features = [i for i in drop_features if i in train.columns.tolist()]
print('# drop_features:',drop_features)
train = train.drop(columns=drop_features)
test = test.drop(columns=drop_features)

# drop_features: ['top_bssid', 'week_type']


### 이미지

In [143]:
img_model = 'resnet50' # resnet50, xception

# 데이터 읽기
img_features = pd.read_csv(f'/content/drive/MyDrive/data/ch2025_data_items/img_features_ch5_sleeptime_{img_model}_5.csv')
img_features = img_features[sorted(img_features.columns,reverse=True)]
img_features.columns = ['image_path']+['img'+i for i in img_features.columns if i not in ['image_path']]

# 정규표현식으로 추출
img_features['subject_id'] = img_features['image_path'].str.extract(r'user(id\d+)_')[0]
img_features['lifelog_date'] = img_features['image_path'].str.extract(r'_(\d{4}-\d{2}-\d{2})_')[0]

# check
img_features = img_features.drop(columns=['image_path'])
print(len(img_features))
display(img_features.head(1))

# add img features
train['lifelog_date'] = train['lifelog_date'].astype(str)
test['lifelog_date'] = test['lifelog_date'].astype(str)
train = train.merge(img_features,on=['subject_id','lifelog_date'],how='left')
test = test.merge(img_features,on=['subject_id','lifelog_date'],how='left')

700


Unnamed: 0,img4,img3,img2,img1,img0,subject_id,lifelog_date
0,0.27,0.0377,0.2072,-0.4265,1.1548,id01,2024-06-26


In [144]:
# 요일 컬럼 추가 (예: 월요일, 화요일, ...)
train['lifelog_date'] = pd.to_datetime(train['lifelog_date'])
test['lifelog_date'] = pd.to_datetime(test['lifelog_date'])

# 요일
weekday_map = {
    0: '월요일', 1: '화요일', 2: '수요일', 3: '목요일',
    4: '금요일', 5: '토요일', 6: '일요일'
}
train['weekday'] = train['lifelog_date'].dt.dayofweek.map(weekday_map)
test['weekday'] = test['lifelog_date'].dt.dayofweek.map(weekday_map)

# 월
train['month'] = train['lifelog_date'].dt.month
test['month'] = test['lifelog_date'].dt.month

# weekend
train['weekend'] = np.where(train['weekday'].isin(['토요일','일요일']),1,0)
test['weekend'] = np.where(test['weekday'].isin(['토요일','일요일']),1,0)

# weekend
train['weekend2'] = np.where(train['weekday'].isin(['토요일','금요일']),1,0)
test['weekend2'] = np.where(test['weekday'].isin(['토요일','금요일']),1,0)

# weekend
train['weekend3'] = np.where(train['weekday'].isin(['토요일','금요일','일요일']),1,0)
test['weekend3'] = np.where(test['weekday'].isin(['토요일','금요일','일요일']),1,0)

# 공휴일
공휴일 = [
     '2024-08-15'
    ,'2024-09-16'
    ,'2024-09-17'
    ,'2024-09-18'
    ,'2024-10-03'
    ,'2024-10-09'
]
train['공휴일'] = np.where(train['lifelog_date'].isin(공휴일),1,0)
test['공휴일'] = np.where(test['lifelog_date'].isin(공휴일),1,0)

# 주말 + 공휴일 묶어주기
# train['weekend'] = np.where( ((train['weekend']==0) & (train['공휴일']==1)), 1, train['weekend'])
# test['weekend'] = np.where( ((test['weekend']==0) & (test['공휴일']==1)), 1, test['weekend'])

In [145]:
# 인코딩1
train['weekday'] = train['weekday'].map(dict([(j,i) for i,j in weekday_map.items()]))
test['weekday'] = test['weekday'].map(dict([(j,i) for i,j in weekday_map.items()]))
# train = pd.get_dummies(train,columns=['weekday'],drop_first=True)
# test = pd.get_dummies(test,columns=['weekday'],drop_first=True)

# # 인코딩2
# a1_map = {'weekday': 1, 'weekend':2}
# train['week_type'] = train['week_type'].map(a1_map)
# test['week_type'] = test['week_type'].map(a1_map)

# # 인코딩3
# a1_map = {'weekday': 1, 'weekend':2}
# train['week_type_lag1'] = train['week_type_lag1'].map(a1_map)
# test['week_type_lag1'] = test['week_type_lag1'].map(a1_map)

In [146]:
# 숫자형 컬럼만 선택해서 결측값 -1로 채우기
train[train.select_dtypes(include='number').columns] = train.select_dtypes(include='number').fillna(-1)
test[test.select_dtypes(include='number').columns] = test.select_dtypes(include='number').fillna(-1)

In [147]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [148]:
lgb_A = 0.3
xgb_B = 0.3
cat_C = 0.4

In [149]:
def get_oof_predictions(X, y, lgb_params, xgb_params, n_splits=5, is_multiclass=False, num_class=None, early_stop=False):
    oof_preds_lgb = np.zeros(len(X))
    oof_preds_xgb = np.zeros(len(X))
    oof_preds_cat = np.zeros(len(X))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    for train_idx, valid_idx in skf.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # LightGBM
        if is_multiclass:
            lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=num_class)
        else:
            lgb_model = LGBMClassifier(**lgb_params)

        # XGBoost
        if is_multiclass:
            xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=num_class)
        else:
            xgb_model = XGBClassifier(**xgb_params)

        # CatBoost
        if is_multiclass:
            cat_model = CatBoostClassifier(**common_params_cat2, objective='MultiClass', classes_count=num_class)
        else:
            cat_model = CatBoostClassifier(**common_params_cat)

        if early_stop:
            lgb_model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                callbacks=[early_stopping(stopping_rounds=100, verbose=False)]
            )
            xgb_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
            cat_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
        else:
            lgb_model.fit(X_train, y_train)
            xgb_model.fit(X_train, y_train)
            cat_model.fit(X_train, y_train)

        # Get predictions
        lgb_preds = lgb_model.predict(X_valid)
        xgb_preds = xgb_model.predict(X_valid)
        cat_preds = cat_model.predict(X_valid).ravel()  # ✅ 2차원 → 1차원

        # Store predictions
        oof_preds_lgb[valid_idx] = lgb_preds
        oof_preds_xgb[valid_idx] = xgb_preds
        oof_preds_cat[valid_idx] = cat_preds

    # Ensemble predictions (7:3 ratio)
    oof_preds = lgb_A * oof_preds_lgb + xgb_B * oof_preds_xgb + cat_C * oof_preds_cat

    if not is_multiclass:
        oof_preds = (oof_preds > 0.5).astype(int)
    else:
        oof_preds = np.round(oof_preds).astype(int)

    return oof_preds

In [150]:
def run_basemodel(train, test, valid_ids, common_params, n_splits, random_state=42, early_stop=False):

    train_df = train.copy()
    test_df = test.copy()

    submission_final = test_df[['subject_id', 'sleep_date', 'lifelog_date']].copy()
    submission_final['lifelog_date'] = pd.to_datetime(submission_final['lifelog_date']).dt.date

    # 타겟
    targets_binary = ['Q1', 'Q2', 'Q3', 'S2', 'S3']
    targets_binary_name = ['기상직후수면질','취침전신체적피로','취침전스트레스','수면효율','수면잠들기시간']
    target_multiclass = 'S1'
    all_targets = targets_binary + [target_multiclass]

    # 노이즈 수준 설정
    def add_noise(series, noise_level, seed=3):
        rng = np.random.default_rng(seed)
        return series * (1 + noise_level * rng.standard_normal(len(series)))

    noise_level = 0.015  # 필요에 따라 조정

    # 타겟인코딩
    # m = 0: 스무딩 없이 범주별 평균만 사용합니다. 관측 수가 많은 범주에는 적합하지만, 적은 경우 과적합 위험이 있습니다.
    # m = 1~10: 일반적인 기본값으로, 대부분의 상황에서 안정적인 성능을 보입니다.
    # m = 50~300: 관측 수가 매우 적은 범주가 많거나 데이터가 희소한 경우에 유용합니다.
    for tgt in all_targets:

      encoder_feats = ['subject_id','month','weekend'] # 'weekday', 'subject_id','month','weekend'

      #### 타겟인코딩1

      subject_mean = train_df.groupby(encoder_feats)[tgt].mean().rename(f'{tgt}_te')
      train_df = train_df.merge(subject_mean, on=encoder_feats, how='left')
      test_df = test_df.merge(subject_mean, on=encoder_feats, how='left')
      global_mean = train_df[tgt].mean()
      test_df[f'{tgt}_te'] = test_df[f'{tgt}_te'].fillna(global_mean)

      # 노이즈 추가
      train_df[f'{tgt}_te'] = add_noise(train_df[f'{tgt}_te'], noise_level)
      test_df[f'{tgt}_te'] = add_noise(test_df[f'{tgt}_te'], noise_level)

      #### 타겟인코딩2

      # 새로운 범주형 열 생성
      train_df['TMP'] = train_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)
      test_df['TMP'] = test_df[encoder_feats].applymap(str).apply(lambda x: ''.join(x) ,axis=1)

      # 인코더
      encoder = TargetEncoder(cols=['TMP'], smoothing=300) # 40
      encoder.fit(train_df[['TMP']], train_df[tgt])

      # 인코딩 결과를 새로운 열에 저장
      train_df[f'{tgt}_te2'] = encoder.transform(train_df[['TMP']])
      test_df[f'{tgt}_te2'] = encoder.transform(test_df[['TMP']])

      # 노이즈 추가
      train_df[f'{tgt}_te2'] = add_noise(train_df[f'{tgt}_te2'], noise_level)
      test_df[f'{tgt}_te2'] = add_noise(test_df[f'{tgt}_te2'], noise_level)

      # 불필요한 변수 제거
      train_df = train_df.drop(columns=['TMP'])
      test_df = test_df.drop(columns=['TMP'])


    # 인코딩
    PK = ['sleep_date', 'lifelog_date', 'subject_id']
    encoder = LabelEncoder()
    categorical_features = [i for i in train_df.select_dtypes(include=['object', 'category']).columns if i not in PK+['pk']]
    for col in categorical_features:
        print(col)
        train_df[col] = encoder.fit_transform(train_df[col])
        test_df[col] = encoder.fit_transform(test_df[col])


    # X
    X = train_df.drop(columns=PK + all_targets)
    test_X = test_df.drop(columns=PK + all_targets)
    print(f'# X shape: {X.shape}')
    print(f'# test_X shape: {test_X.shape}')

    print('\n STEP1: 실험 결과 확인')
    print("=============== Validation Results ==============")
    total_avg_f1s = []
    best_iteration_temp = {k: [] for k in all_targets}

    val_f1 = []
    for col in targets_binary:
        # binary
        y = train_df[col]

        valid_ids['pk'] = valid_ids['subject_id']+valid_ids['sleep_date']
        train_df['pk'] = train_df['subject_id']+train_df['sleep_date']

        X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
        X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
        y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
        y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

        # Get parameters for both models
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
            'n_estimators': 1000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state
        }

        # Train LightGBM
        lgb_model = LGBMClassifier(**lgb_params)
        if early_stop:
            lgb_model.fit(
                X_train, y_train,
                eval_set=[(X_train, y_train), (X_valid, y_valid)],
                callbacks=[early_stopping(stopping_rounds=100,verbose=False)]
            )
            best_iteration_temp[col].append(lgb_model.best_iteration_)
        else:
            lgb_model.fit(X_train, y_train)
            best_iteration_temp[col].append(1000)

        # Train XGBoost
        xgb_model = XGBClassifier(**xgb_params)
        if early_stop:
            xgb_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
        else:
            xgb_model.fit(X_train, y_train)

        # Train Catboost
        cat_model = CatBoostClassifier(**common_params_cat, loss_function='Logloss')
        if early_stop:
            cat_model.fit(
                X_train, y_train,
                eval_set=[(X_valid, y_valid)],
                early_stopping_rounds=100,
                verbose=False
            )
        else:
            cat_model.fit(X_train, y_train)

        # Get predictions and ensemble
        lgb_pred_valid = lgb_model.predict_proba(X_valid)[:, 1]
        xgb_pred_valid = xgb_model.predict_proba(X_valid)[:, 1]
        cat_pred_valid = cat_model.predict_proba(X_valid)[:, 1]
        pred_valid = (lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + cat_C * cat_pred_valid  > 0.5).astype(int)

        f1 = f1_score(y_valid, pred_valid, average='macro')
        val_f1.append(f1)

    # multiclass
    y = train_df[target_multiclass]

    X_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
    X_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),X.columns.tolist()].reset_index(drop=True).copy()
    y_valid = train_df.loc[train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()
    y_train = train_df.loc[~train_df['pk'].isin(valid_ids['pk']),y.name].reset_index(drop=True).copy()

    # Get parameters for both models
    lgb_params = common_params['S1'].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': random_state
    }

    # Train LightGBM
    lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=3)
    if early_stop:
        lgb_model.fit(
            X_train, y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            callbacks=[early_stopping(stopping_rounds=100,verbose=False)]
        )
        best_iteration_temp[target_multiclass].append(lgb_model.best_iteration_)
    else:
        lgb_model.fit(X_train, y_train)
        best_iteration_temp[target_multiclass].append(1000)

    # Train XGBoost
    xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=3)
    if early_stop:
        xgb_model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=False
        )
    else:
        xgb_model.fit(X_train, y_train)

    # Train Catboost
    cat_model = CatBoostClassifier(**common_params_cat2, loss_function='MultiClass', classes_count=3)
    if early_stop:
        cat_model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=False
        )
    else:
        cat_model.fit(X_train, y_train)

    # Get predictions and ensemble
    lgb_pred_valid = lgb_model.predict_proba(X_valid)
    xgb_pred_valid = xgb_model.predict_proba(X_valid)
    cat_pred_valid = cat_model.predict_proba(X_valid)
    pred_valid = np.argmax(lgb_A * lgb_pred_valid + xgb_B * xgb_pred_valid + cat_C * cat_pred_valid, axis=1)

    f1 = f1_score(y_valid, pred_valid, average='macro')
    val_f1.append(f1)

    avg_f1 = np.mean(val_f1)
    total_avg_f1s.append(avg_f1)
    detail = " ".join([f"{name}({tname}):{score:.4f}" for name, tname, score in zip(targets_binary + [target_multiclass], targets_binary_name + ['S1'], val_f1)])
    print(f" 평균 F1: {avg_f1:.4f} / [상세] {detail}")

    best_iteration_dict = {k: max(best_iteration_temp[k]) for k in all_targets}

    if early_stop==True:
      print("\n[best_iteration_dict]")
      for k, v in best_iteration_dict.items():
          print(f"{k}: {v}")


    print(f"# 전체 평균 F1: {np.mean(total_avg_f1s):.4f}")
    print("================================================")

    # modoling with 100% train & no valid
    print('\n STEP2: 전체 데이터로 모델 재학습')
    print("====== modoling with 100% train & no valid =====")

    # binary
    binary_preds = {}
    binary_preds_proba = {}
    for col in targets_binary:
        # Get parameters for both models
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
            'n_estimators': 1000,
            'learning_rate': 0.01,
            'max_depth': 6,
            'min_child_weight': 1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': random_state
        }

        y = train_df[col]

        if early_stop:
            lgb_params['n_estimators'] = best_iteration_dict[col]
            xgb_params['n_estimators'] = best_iteration_dict[col]

        # Train LightGBM
        lgb_model = LGBMClassifier(**lgb_params)
        lgb_model.fit(X, y)

        # Train XGBoost
        xgb_model = XGBClassifier(**xgb_params)
        xgb_model.fit(X, y)

        # Train CatBoost
        cat_model = CatBoostClassifier(**common_params_cat)
        cat_model.fit(X, y)

        # Get predictions and ensemble
        lgb_pred = lgb_model.predict_proba(test_X)[:, 1]
        xgb_pred = xgb_model.predict_proba(test_X)[:, 1]
        cat_pred = cat_model.predict_proba(test_X)[:, 1]
        binary_preds[col] = (lgb_A * lgb_pred + xgb_B * xgb_pred + cat_C * cat_pred > 0.5).astype(int)
        binary_preds_proba[col] = lgb_A * lgb_model.predict_proba(test_X) + xgb_B * xgb_model.predict_proba(test_X) + cat_C * cat_model.predict_proba(test_X)

        # Feature importance (using LightGBM's importance)
        fi_df = pd.DataFrame({'feature': X.columns, 'importance': lgb_model.feature_importances_})
        top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
        feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
        print(f"[{col}] {feat_str}")

    # multiclass
    y = train_df['S1']

    # Get parameters for both models
    lgb_params = common_params['S1'].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': random_state
    }

    if early_stop:
        lgb_params['n_estimators'] = best_iteration_dict['S1']
        xgb_params['n_estimators'] = best_iteration_dict['S1']

    # Train LightGBM
    lgb_model = LGBMClassifier(**lgb_params, objective='multiclass', num_class=3)
    lgb_model.fit(X, y)

    # Train XGBoost
    xgb_model = XGBClassifier(**xgb_params, objective='multi:softmax', num_class=3)
    xgb_model.fit(X, y)

    # Train CatBoost
    cat_model = CatBoostClassifier(**common_params_cat2, objective='MultiClass', classes_count=3)
    cat_model.fit(X, y)

    # Get predictions and ensemble
    lgb_pred = lgb_model.predict_proba(test_X)
    xgb_pred = xgb_model.predict_proba(test_X)
    cat_pred = cat_model.predict_proba(test_X)
    multiclass_pred = np.argmax(lgb_A * lgb_pred + xgb_B * xgb_pred + cat_C * cat_pred, axis=1)
    multiclass_pred_proba = lgb_A * lgb_pred + xgb_B * xgb_pred + cat_C * cat_pred

    # Feature importance
    fi_df = pd.DataFrame({'feature': X.columns, 'importance': lgb_model.feature_importances_})
    top10 = fi_df.sort_values(by='importance', ascending=False).head(10)
    feat_str = ", ".join([f"{row['feature']}({int(row['importance'])})" for _, row in top10.iterrows()])
    print(f"[S1] {feat_str}")

    # 예측 저장
    submission_final['S1'] = multiclass_pred
    for col in targets_binary:
      submission_final[col] = binary_preds[col]
    submission_final = submission_final[['subject_id', 'sleep_date', 'lifelog_date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']]
    fname = f"submission_{np.mean(total_avg_f1s)}.csv"
    submission_final.to_csv(fname, index=False)
    print(f"# {fname} 저장 완료")
    print(f"# submission shape:{submission_final.shape}")
    print("================================================")

    # 확률 결과 추가
    submission_proba = submission_final.copy()
    for col in targets_binary:
        for i in range(2):
            submission_proba[f'{col}_class{i}_proba'] = binary_preds_proba[col][:, i]
    for i in range(3):
        submission_proba[f'S1_class{i}_proba'] = multiclass_pred_proba[:, i]

    # 저장
    fname_proba = f"submission_with_proba_{np.mean(total_avg_f1s):.4f}.csv"
    submission_proba.to_csv(fname_proba, index=False)
    print(f"# {fname_proba} 저장 완료 (확률 포함)")

    # 모델별 예측결과 비율 비교
    a11 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
    a13 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
    a12 = train_df[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
    a21 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].sum()
    a23 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].apply(len)
    a22 = submission_final[['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']].mean()
    result = pd.concat([a11, a13, a12, a21, a23, a22], axis=1)
    result.columns = ['학습sum','학습len','학습mean','테스트sum','테스트len','테스트mean']
    print('\n STEP3: 예측결과 비교표')
    display(result)

    # === STEP4: OOF 예측 생성 (train set에 대해) ===

    # n_splits = 10
    mask = train['month'] != 6
    print(f'# k-fold: {n_splits}')
    print(f'# train: {len(y[mask])}')

    oof_f1 = []
    print('\n STEP4: OOF 예측 생성')
    oof_result = train_df[['subject_id', 'sleep_date', 'lifelog_date']].copy()
    for col in targets_binary:
        lgb_params = common_params[col].copy()
        lgb_params['random_state'] = random_state

        xgb_params = {
          'n_estimators': 1000,
          "learning_rate": 0.01,
          'reg_lambda': 1,
          'max_depth': 6,
          'n_jobs': -1,
          'subsample': 0.8,
          'colsample_bylevel': 0.8,
          'min_child_weight': 1,
          'max_bin': 200,
          'tree_method': 'hist',
          'random_state': random_state,
        }

        y = train_df[col]
        oof_preds = get_oof_predictions(X, y, lgb_params, xgb_params, n_splits=n_splits, is_multiclass=False, early_stop=early_stop)
        oof_result[col] = oof_preds
        f1 = f1_score(y[mask], oof_preds[mask], average='macro')
        oof_f1.append(f1)
        print(f"[OOF - {col}] F1 score: {f1:.4f}")

    # multiclass
    col = 'S1'
    lgb_params = common_params[col].copy()
    lgb_params['random_state'] = random_state

    xgb_params = {
      'n_estimators': 1000,
      "learning_rate": 0.01,
      'reg_lambda': 1,
      'max_depth': 6,
      'n_jobs': -1,
      'subsample': 0.8,
      'colsample_bylevel': 0.8,
      'min_child_weight': 1,
      'max_bin': 200,
      'tree_method': 'hist',
      'random_state': random_state,
    }


    y = train_df[col]
    oof_preds = get_oof_predictions(X, y, lgb_params, xgb_params, n_splits=n_splits, is_multiclass=True, num_class=3, early_stop=early_stop)
    oof_result[col] = oof_preds
    f1 = f1_score(y[mask], oof_preds[mask], average='macro')
    oof_f1.append(f1)
    print(f"[OOF - {col}] F1 score: {f1:.4f}")
    print(f"[OOF] F1 score: {np.mean(oof_f1):.4f}")

    # oof_result 저장
    fname = f"oof_result_{np.mean(total_avg_f1s)}.csv"
    oof_result.to_csv(fname, index=False)
    print(f"# {fname} 저장 완료")

    return submission_final, oof_result

### run

In [None]:
%%time

# 공통 하이퍼파라미터
common_params = {
  'n_estimators': 5000,
  "learning_rate": 0.01,
  'lambda_l1': 5,
  'lambda_l2': 1,
  'n_jobs': -1,
  'verbosity': -1
}

# 모델별 세부 하이퍼파라미터
best_param_dict = {}

# 공통 하이퍼파라미터 대체 (이상한 모델의 경우)
best_param_dict['Q3'] = common_params
best_param_dict['S1'] = common_params
best_param_dict['S2'] = common_params
best_param_dict['S3'] = common_params
best_param_dict['Q1'] = common_params
best_param_dict['Q2'] = common_params

common_params_cat = {
    'iterations': 1000,           # n_estimators에 해당
    'learning_rate': 0.01,
    'l2_leaf_reg': 1,             # reg_lambda에 해당
    'depth': 6,                   # max_depth에 해당
    'thread_count': -1,           # n_jobs에 해당
    # 'subsample': 0.8,
    'rsm': 0.8,                   # colsample_bylevel에 해당
    'min_data_in_leaf': 1,        # min_child_weight에 유사
    'border_count': 200,          # max_bin에 해당
    'task_type': 'CPU',           # 'hist'에 대응
    'random_state' : 41,
    'verbose' : False
}

common_params_cat2 = {
    'iterations': 1000,           # n_estimators에 해당
    'learning_rate': 0.01,
    'class_weights': [1.048, 0.670, 1.807],  # [0, 1, 2] 순서
    'l2_leaf_reg': 1,             # reg_lambda에 해당
    'depth': 6,                   # max_depth에 해당
    'thread_count': -1,           # n_jobs에 해당
    # 'subsample': 0.8,
    'rsm': 0.8,                   # colsample_bylevel에 해당
    'min_data_in_leaf': 1,        # min_child_weight에 유사
    'border_count': 200,          # max_bin에 해당
    'task_type': 'CPU',           # 'hist'에 대응
    'random_state' : 41,
    'verbose' : False
}

"""
# 평균 F1: 0.6451 / [상세] Q1(기상직후수면질):0.7229 Q2(취침전신체적피로):0.7588 Q3(취침전스트레스):0.6520 S2(수면효율):0.5881 S3(수면잠들기시간):0.6822 S1(S1):0.4665
# 전체 평균 F1: 0.6451
# [OOF] F1 score: 0.6611

	학습sum	학습len	학습mean	테스트sum	테스트len	테스트mean
Q1	223	450	0.4956	130	250	0.5200
Q2	253	450	0.5622	155	250	0.6200
Q3	270	450	0.6000	173	250	0.6920
S1	390	450	0.8667	208	250	0.8320
S2	293	450	0.6511	170	250	0.6800
S3	298	450	0.6622	169	250	0.6760
"""

submission_final, oof_result = run_basemodel(train, test, valid_ids, best_param_dict, n_splits=5, random_state=41, early_stop=False)

week_type_lag1
# X shape: (450, 185)
# test_X shape: (250, 185)

 STEP1: 실험 결과 확인
 평균 F1: 0.6440 / [상세] Q1(기상직후수면질):0.7229 Q2(취침전신체적피로):0.7083 Q3(취침전스트레스):0.6603 S2(수면효율):0.6179 S3(수면잠들기시간):0.6992 S1(S1):0.4554
# 전체 평균 F1: 0.6440

 STEP2: 전체 데이터로 모델 재학습
[Q1] Q1_te2(526), img1(387), wake_time_ratio(369), light_night_mean(340), wake_time_diff_lag1(275), Q1_te(249), wake_time(244), img4(219), ble_rssi_max_afterwork(204), beforebed_통화_time(198)
[Q2] Q2_te2(2229), img2(282), wake_up_early_minutes(276), Q2_te(275), wlight_evening_mean(258), rolling_wake_time_3d(256), sleep_time_diff(220), activehour_screen_time_vs_avg_pct(201), avg_rssi(197), activehour_total_screen_time(193)
[Q3] Q3_te2(2648), light_max(393), sleep_duration_diff_lag1(237), activehour_통화_time(220), sleep_duration_min(206), lat_change(160), hr_morning_max(158), activehour_NAVER_time(158), Q3_te(157), hr_evening_max(140)
[S2] S2_te2(1973), S2_te(453), beforebed_total_screen_time(268), wake_time_diff_lag1(255), light_max(236), 

Unnamed: 0,학습sum,학습len,학습mean,테스트sum,테스트len,테스트mean
Q1,223,450,0.4956,126,250,0.504
Q2,253,450,0.5622,153,250,0.612
Q3,270,450,0.6,175,250,0.7
S1,390,450,0.8667,207,250,0.828
S2,293,450,0.6511,170,250,0.68
S3,298,450,0.6622,165,250,0.66


# k-fold: 5
# train: 392

 STEP4: OOF 예측 생성
[OOF - Q1] F1 score: 0.7284
