<a href="https://colab.research.google.com/github/jacobgreen4477/Construction-Equipment-Oil-Condition-Classification-AI-Competition/blob/main/ETRI_v1_0_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

> title : 제 4회 ETRI 휴먼이해 인공지능 논문경진대회 <br>
> author : hjy <br>

In [None]:
! pip install haversine
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
from tqdm.auto import tqdm
from collections import Counter
from collections import Counter
from scipy.stats import entropy
from haversine import haversine  # 설치 필요: pip install haversine

warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# pandas 옵션
pd.set_option('display.max_columns', 999)
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%0.4f' % x)

In [None]:
def plot_rssi_by_subject_topN(df, TOPN):
    """
    subject_id별로 가장 자주 등장한 상위 N개 BSSID에 대해 RSSI 시계열 시각화
    - RSSI -70 이하 신호는 제외 (None 처리)
    - 선 투명도 50%
    """
    grouped = df.groupby("subject_id")

    for subject, group in grouped:
        timestamps = []
        bssid_counter = Counter()

        # 모든 BSSID 등장 횟수 카운트
        for _, row in group.iterrows():
            bssid_counter.update(row['bssid'])

        # TOP N BSSID 추출
        target_bssids = [b for b, _ in bssid_counter.most_common(TOPN)]
        time_series = {bssid: [] for bssid in target_bssids}

        # 시간별 RSSI 수집 (단, -60 이하는 제거)
        for _, row in group.iterrows():
            timestamps.append(pd.to_datetime(row['timestamp']))
            row_bssid = row['bssid']
            row_rssi = row['rssi']

            for bssid in target_bssids:
                if bssid in row_bssid:
                    idx = row_bssid.index(bssid)
                    rssi_value = row_rssi[idx]
                    if rssi_value > -60:
                        time_series[bssid].append(rssi_value)
                    else:
                        time_series[bssid].append(None)  # 약한 신호는 제외
                else:
                    time_series[bssid].append(None)  # 없는 BSSID

        # 시각화
        plt.figure(figsize=(16, 7))
        for bssid in target_bssids:
            plt.plot(timestamps, time_series[bssid], label=bssid, marker='o', alpha=0.5)

        plt.title(f"[{subject}] 시간별 RSSI 변화 (TOP {TOPN}, -60 이상만)", fontsize=14)
        plt.xlabel("시간")
        plt.ylabel("RSSI (dBm)")
        plt.xticks(rotation=45)
        plt.legend(title="BSSID", bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

In [None]:
def filter_strong_rssi(df, threshold=-60):
    """
    각 row의 RSSI 값 중 threshold 이하인 항목 제거

    Parameters:
    - df: DataFrame with 'bssid' and 'rssi' as lists
    - threshold: int, RSSI 값 기준 (기본: -60)

    Returns:
    - 필터링된 DataFrame (in-place 수정 아님)
    """
    filtered_df = df.copy()

    def filter_row(row):
        bssids = row['bssid']
        rssis = row['rssi']
        # RSSI > threshold 조건 만족하는 항목만 추출
        filtered = [(b, r) for b, r in zip(bssids, rssis) if r > threshold]
        if filtered:
            new_bssids, new_rssis = zip(*filtered)
            return pd.Series({'bssid': list(new_bssids), 'rssi': list(new_rssis)})
        else:
            return pd.Series({'bssid': [], 'rssi': []})

    filtered_df[['bssid', 'rssi']] = filtered_df.apply(filter_row, axis=1)
    return filtered_df

In [None]:
def daily_wifi_features_by_user(df):
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    features = []

    # subject_id + lifelog_date 기준 그룹화
    grouped = df.groupby(['subject_id', 'lifelog_date'])

    for (subject_id, date), group in grouped:
        scan_count = len(group)
        bssid_flat = sum(group['bssid'], [])  # flatten
        rssi_flat = sum(group['rssi'], [])    # flatten

        unique_bssid_count = len(set(bssid_flat))
        avg_rssi = sum(rssi_flat) / len(rssi_flat) if rssi_flat else None
        max_rssi = max(rssi_flat) if rssi_flat else None
        min_rssi = min(rssi_flat) if rssi_flat else None
        strong_rssi_ratio = sum(1 for r in rssi_flat if r > -60) / len(rssi_flat) if rssi_flat else 0
        empty_scan_count = sum(1 for b in group['bssid'] if len(b) == 0)

        # 가장 많이 탐지된 BSSID
        bssid_counter = Counter(bssid_flat)
        top_bssid, top_bssid_count = bssid_counter.most_common(1)[0] if bssid_counter else (None, 0)

        first_time = group['timestamp'].min()
        last_time = group['timestamp'].max()
        hour_span = (last_time - first_time).total_seconds() / 60  # 분 단위

        features.append({
            'subject_id': subject_id,
            'lifelog_date': date,
            'scan_count': scan_count,
            'unique_bssid_count': unique_bssid_count,
            'avg_rssi': avg_rssi,
            'max_rssi': max_rssi,
            'min_rssi': min_rssi,
            'strong_signal_ratio': strong_rssi_ratio,
            'empty_scan_count': empty_scan_count,
            'top_bssid': top_bssid,
            'top_bssid_count': top_bssid_count,
            #'time_first_seen': first_time,
            # 'time_last_seen': last_time,
            'hour_span_minutes': hour_span
        })

    return pd.DataFrame(features)

In [None]:
def generate_derived_features(df,
                             calculate_mean=True,
                             calculate_max=True,
                             calculate_entropy=True,
                             calculate_presence=False,
                             target_label=None):

    # 시간 필터링 함수
    def filter_night_hours(ts):
        hour = pd.to_datetime(ts).dt.hour
        return (hour >= 21) | (hour < 8)

    # 데이터 파싱 함수
    def parse_list(x):
        if isinstance(x, str):
            cleaned = x.strip('[]')
            items = [item.strip() for item in cleaned.split(',')]
            try:
                return list(map(float, items)) if '.' in cleaned else items
            except:
                return items
        return x

    # 1. 시간 필터링
    df = df[filter_night_hours(df['timestamp'])].copy()

    # 2. 데이터 전처리
    df = df.assign(
        labels=df['labels'].apply(parse_list),
        prob=df['prob'].apply(parse_list)
    ).explode(['labels', 'prob'])

    # 3. 타입 변환
    df['prob'] = pd.to_numeric(df['prob'], errors='coerce')
    df = df.dropna(subset=['prob'])

    # 4. 그룹화 기준
    group_keys = ['subject_id', 'lifelog_date']
    merged_df = pd.DataFrame()

    # 5. 평균 확률 계산
    if calculate_mean:
        mean_prob = df.groupby(group_keys + ['labels'])['prob'].mean().unstack().reset_index()
        merged_df = mean_prob.copy()

    # 6. 최대 확률 레이블 (조건 추가)
    if calculate_max:
        max_prob = df.loc[df.groupby(group_keys)['prob'].idxmax()][group_keys + ['labels', 'prob']]
        max_prob = max_prob.rename(columns={'labels': 'max_label', 'prob': 'max_prob'})

        # 조건 적용: max_prob < 0.1 → max_label = NaN
        max_prob['max_label'] = max_prob['max_label'].where(
            max_prob['max_prob'] >= 0.1,  # 조건
            np.nan  # 조건 미충족 시 NaN 할당
        )

        if not merged_df.empty:
            merged_df = pd.merge(merged_df, max_prob, on=group_keys, how='left')
        else:
            merged_df = max_prob.copy()

    # 7. 엔트로피 계산
    if calculate_entropy:
        entropy_df = df.groupby(group_keys + ['labels'])['prob'].mean() \
                     .groupby(group_keys).apply(entropy).reset_index(name='entropy')
        if not merged_df.empty:
            merged_df = pd.merge(merged_df, entropy_df, on=group_keys, how='left')
        else:
            merged_df = entropy_df.copy()

    # 8. 레이블 존재 여부
    if calculate_presence and target_label:
        presence_df = df.groupby(group_keys)['labels'] \
                      .apply(lambda x: (x == target_label).any().astype(int)) \
                      .reset_index(name=f'has_{target_label}')
        if not merged_df.empty:
            merged_df = pd.merge(merged_df, presence_df, on=group_keys, how='left')
        else:
            merged_df = presence_df.copy()

    # 9. 결과 정제
    if not merged_df.empty:
        merged_df = merged_df.fillna({f'has_{target_label}': 0})
        cols = [c for c in merged_df if c not in group_keys] + group_keys
        merged_df = merged_df[cols]

    return merged_df.reset_index(drop=True)

In [62]:
def calculate_daily_metrics(group):

    # 기본값 설정
    default_values = {
        'daily_avg_speed': 0,
        'daily_max_altitude': 0,
        'daily_lat_std': 0,
        'active_minutes': 0,
        'movement_ratio': 0,
        'centroid_lat': 0,
        'centroid_lon': 0,
        'altitude_change': 0,
        'total_distance(m)': 0,
        'max_speed': 0
    }

    try:


      """그룹별 모든 메트릭을 계산하는 통합 함수"""
      # 리스트 데이터 추출 (예외 처리 추가)
      all_speeds = np.concatenate(group['speed'].tolist()) if 'speed' in group else []
      all_alts = np.concatenate(group['altitude'].tolist()) if 'altitude' in group else []
      all_lats = np.concatenate(group['latitude'].tolist()) if 'latitude' in group else []
      all_lons = np.concatenate(group['longitude'].tolist()) if 'longitude' in group else []

      # 기본 통계
      avg_speed = np.mean(all_speeds) if len(all_speeds) > 0 else 0
      max_alt = np.max(all_alts) if len(all_alts) > 0 else 0
      lat_std = np.std(all_lats) if len(all_lats) > 0 else 0

      # 이동 특성
      active_mins = group.shape[0]  # 관측치 수 = 활동 분
      movement_ratio = (all_speeds > 1.0).mean() if len(all_speeds) > 0 else 0

      # 공간 특성
      centroid_lat = np.mean(all_lats) if len(all_lats) > 0 else 0
      centroid_lon = np.mean(all_lons) if len(all_lons) > 0 else 0
      alt_change = all_alts[-1] - all_alts[0] if len(all_alts) > 0 else 0

      # 이동 거리 계산 (Haversine)
      total_dist = 0.0
      if len(all_lats) > 1:
          for i in range(len(all_lats)-1):
              coord1 = (all_lats[i], all_lons[i])
              coord2 = (all_lats[i+1], all_lons[i+1])
              total_dist += haversine(coord1, coord2, unit='m')  # 미터 단위

    except Exception as e:
        print(f"Error processing group: {e}")
        return pd.Series(default_values)

    return pd.Series({
        # 기본 통계
        'daily_avg_speed': avg_speed,
        'daily_max_altitude': max_alt,
        'daily_lat_std': lat_std,

        # 이동 특성
        'active_minutes': active_mins,
        'movement_ratio': movement_ratio,

        # 공간 특성
        'centroid_lat': centroid_lat,
        'centroid_lon': centroid_lon,
        'altitude_change': alt_change,
        'total_distance(m)': total_dist,

        # 추가 메트릭
        'max_speed': np.max(all_speeds) if len(all_speeds) > 0 else 0
    })

### 데이터 읽기

In [None]:
path = '/content/drive/MyDrive/data/ch2025_data_items/'

# 1
ch2025_mACStatus = pd.read_parquet(path+'ch2025_mACStatus.parquet')
ch2025_mActivity = pd.read_parquet(path+'ch2025_mActivity.parquet')
ch2025_mAmbience = pd.read_parquet(path+'ch2025_mAmbience.parquet')
ch2025_mBle = pd.read_parquet(path+'ch2025_mBle.parquet')
ch2025_mGps = pd.read_parquet(path+'ch2025_mGps.parquet')
ch2025_mLight = pd.read_parquet(path+'ch2025_mLight.parquet')
ch2025_mScreenStatus = pd.read_parquet(path+'ch2025_mScreenStatus.parquet')
ch2025_mUsageStats = pd.read_parquet(path+'ch2025_mUsageStats.parquet')
ch2025_mWifi = pd.read_parquet(path+'ch2025_mWifi.parquet')
ch2025_wHr = pd.read_parquet(path+'ch2025_wHr.parquet')
ch2025_wLight = pd.read_parquet(path+'ch2025_wLight.parquet')
ch2025_wPedo = pd.read_parquet(path+'ch2025_wPedo.parquet')

# 2
train = pd.read_csv('/content/drive/MyDrive/data/ch2025_metrics_train.csv')
test = pd.read_csv('/content/drive/MyDrive/data/ch2025_submission_sample.csv')

### mWifi
- Wifi devices around individual subject.

In [None]:
def extract_wifi_info(row):
    wifi_data = row['m_wifi']
    bssids = [item['bssid'] for item in wifi_data]
    rssis = [item['rssi'] for item in wifi_data]
    return pd.Series({'bssid': bssids, 'rssi': rssis})

ch2025_mWifi[['bssid', 'rssi']] = ch2025_mWifi.apply(extract_wifi_info, axis=1)
ch2025_mWifi['lifelog_date'] = ch2025_mWifi['timestamp'].astype(str).str[:10]
ch2025_mWifi = ch2025_mWifi.drop(columns=['m_wifi'])
ch2025_mWifi.head(1)

Unnamed: 0,subject_id,timestamp,bssid,rssi,lifelog_date
0,id01,2024-06-26 12:03:00,"[a0:0f:37:9a:5d:8b, a0:0f:37:9a:5d:8c, a0:0f:37:9a:5d:8d, a0:0f:37:9a:5d:8e, a0:0f:37:9a:5d:8f, a0:0f:37:96:56:ef, 88:36:6c:86:75:84, a0:0f:37:96:56:ee, a0:0f:37:96:56:ed, 86:25:19:b5:b2:a5, a0:0f:37:96:56:ec, 1e:39:29:8e:fb:e9, 52:c2:e8:c7:9b:e4, a0:0f:37:96:56:eb, 12:e3:c7:09:20:34, 58:86:94:4a:08:b8, 90:9f:33:28:d0:2e, 00:26:66:bc:4e:18, f6:0a:f4:43:4b:ba, 10:e3:c7:09:20:35, 10:e3:c7:09:20:34, 1c:39:29:48:04:92, 12:e3:c7:07:9d:df, 86:25:19:c3:44:07, a0:0f:37:9a:37:2f, a0:0f:37:9a:37:2e, a0:0f:37:9a:37:2d, 0a:09:b4:74:05:ec, a0:0f:37:9a:37:2c, a0:0f:37:9a:37:2b, 0a:09:b4:74:05:eb, c0:25:2f:d8:c1:a6, 16:7f:67:bb:fa:f8, 3c:f3:92:ff:00:01, 06:09:b4:74:05:ec, 06:09:b4:74:05:eb, 12:e3:c7:0a:74:d1, 88:36:6c:a9:6f:8e, 02:e3:c7:09:20:34, 00:09:b4:74:05:eb, 00:09:b4:74:05:ec, 00:1d:93:93:cf:fe, 8e:e2:ac:a5:9d:15]","[-78, -78, -78, -78, -78, -58, -72, -58, -58, -61, -58, -71, -82, -58, -88, -82, -78, -85, -45, -63, -89, -82, -83, -84, -76, -76, -76, -72, -76, -76, -59, -82, -79, -82, -72, -59, -78, -63, -88, -60, -72, -19, -72]",2024-06-26


In [None]:
# wifi 약신호 제거
# ch2025_mWifi = filter_strong_rssi(ch2025_mWifi, threshold=-60)
# ch2025_mWifi.head(1)

In [None]:
ch2025_mWifi_daily = daily_wifi_features_by_user(ch2025_mWifi)
ch2025_mWifi_daily.head()

Unnamed: 0,subject_id,lifelog_date,scan_count,unique_bssid_count,avg_rssi,max_rssi,min_rssi,strong_signal_ratio,empty_scan_count,top_bssid,top_bssid_count,hour_span_minutes
0,id01,2024-06-26,69,393,-70.1964,-19,-91,0.2309,0,86:25:19:9f:9b:be,19,716.0
1,id01,2024-06-27,126,357,-69.0629,-26,-92,0.2701,0,04:09:a5:3a:c8:6a,54,1430.0
2,id01,2024-06-28,118,376,-69.0941,-26,-92,0.2594,0,04:09:a5:3a:c8:6a,47,1430.0
3,id01,2024-06-29,134,258,-67.7897,-24,-91,0.3063,0,04:09:a5:3a:c8:6a,117,1420.0
4,id01,2024-06-30,108,242,-68.2999,-23,-90,0.2946,0,04:09:a5:3a:c8:6a,70,1310.0


### mAmbience
- Ambient sound identification labels and their respective probabilities.

In [None]:
# - mAmbience: Ambient sound identification labels and their respective probabilities.

def extract_labels_and_probs(row):
    items = row['m_ambience']
    labels = [item[0] for item in items]
    probs = [item[1] for item in items]
    return pd.Series({'labels': labels, 'prob': probs})

ch2025_mAmbience[['labels', 'prob']]  = ch2025_mAmbience.apply(extract_labels_and_probs, axis=1)
ch2025_mAmbience['lifelog_date'] = ch2025_mAmbience['timestamp'].astype(str).str[:10]
ch2025_mAmbience = ch2025_mAmbience.drop(columns=['m_ambience'])
ch2025_mAmbience.head(1)

Unnamed: 0,subject_id,timestamp,labels,prob,lifelog_date
0,id01,2024-06-26 13:00:10,"[Music, Vehicle, Motor vehicle (road), Outside, urban or manmade, Outside, rural or natural, Car, Speech, Inside, large room or hall, Truck, Sound effect]","[0.30902618, 0.081680894, 0.04035286, 0.037144363, 0.032663062, 0.03199804, 0.029806137, 0.01684492, 0.016206821, 0.01591479]",2024-06-26


In [None]:
ch2025_mAmbience_daily = generate_derived_features(
    ch2025_mAmbience,
    calculate_mean=True,
    calculate_max=True,
    calculate_entropy=True,
    calculate_presence=True,
    target_label='Snoring'
)

In [None]:
ch2025_mAmbience_daily.head()

Unnamed: 0,A capella,"Accelerating, revving, vroom",Accordion,Acoustic guitar,Afrobeat,Air brake,Air conditioning,"Air horn, truck horn",Aircraft,Aircraft engine,Alarm,Alarm clock,Ambient music,Ambulance (siren),Animal,Applause,Arrow,Artillery fire,Babbling,"Baby cry, infant cry",Baby laughter,Background music,Bagpipes,Bang,Banjo,Bark,Basketball bounce,Bass drum,Bass guitar,Bathtub (filling or washing),Beatboxing,"Bee, wasp, etc.","Beep, bleep",Bell,Bellow,Belly laugh,Bicycle,Bicycle bell,Bird,"Bird flight, flapping wings","Bird vocalization, bird call, bird song",Biting,Bleat,Blender,Bluegrass,Blues,"Boat, Water vehicle",Boiling,Boing,Boom,Bouncing,Bow-wow,Bowed string instrument,Brass instrument,Breaking,Breathing,"Burping, eructation","Burst, pop",Bus,Busy signal,Buzz,Buzzer,Cacophony,Camera,"Canidae, dogs, wolves",Cap gun,Car,Car alarm,Car passing by,Carnatic music,Cash register,Cat,Caterwaul,"Cattle, bovinae",Caw,Cello,Chainsaw,Change ringing (campanology),Chant,Chatter,Cheering,"Chewing, mastication","Chicken, rooster",Child singing,"Child speech, kid speaking",Children playing,Children shouting,Chime,"Chink, clink",Chirp tone,"Chirp, tweet",Choir,Chop,Chopping (food),Christian music,Christmas music,"Chuckle, chortle",Church bell,Civil defense siren,Clang,Clapping,Clarinet,Classical music,Clatter,Clickety-clack,Clicking,Clip-clop,Clock,Cluck,Coin (dropping),Computer keyboard,Conversation,Coo,Cough,Country,Cowbell,Crack,Crackle,Creak,Cricket,Croak,Crow,Crowd,"Crowing, cock-a-doodle-doo","Crumpling, crinkling",Crunch,Crushing,"Crying, sobbing",Cupboard open or close,"Cutlery, silverware",Cymbal,Dance music,"Dental drill, dentist's drill",Dial tone,Didgeridoo,Ding,Ding-dong,Disco,"Dishes, pots, and pans",Distortion,Dog,"Domestic animals, pets",Door,Doorbell,Double bass,Drawer open or close,Drill,Drip,Drum,Drum and bass,Drum kit,Drum machine,Drum roll,Dubstep,Duck,Echo,Effects unit,Electric guitar,Electric piano,"Electric shaver, electric razor",Electric toothbrush,Electronic dance music,Electronic music,Electronic organ,Electronic tuner,Electronica,Emergency vehicle,Engine,Engine knocking,Engine starting,Environmental noise,Eruption,Exciting music,Explosion,Fart,Field recording,Filing (rasp),Fill (with liquid),Finger snapping,Fire,Fire alarm,"Fire engine, fire truck (siren)",Firecracker,Fireworks,"Fixed-wing aircraft, airplane",Flamenco,Flap,Flute,"Fly, housefly",Foghorn,Folk music,Fowl,Frog,Frying (food),Funk,Fusillade,Gargling,Gasp,Gears,Giggle,Glass,Glockenspiel,Goat,Gobble,Gong,Goose,Gospel music,Groan,Growling,Grunt,Guitar,"Gunshot, gunfire",Gurgling,Gush,Hair dryer,Hammer,Hammond organ,Hands,Happy music,Harmonic,Harmonica,Harp,Harpsichord,Heart murmur,"Heart sounds, heartbeat",Heavy engine (low frequency),Heavy metal,Helicopter,Hi-hat,Hiccup,Hip hop music,Hiss,Honk,Hoot,Horse,House music,Howl,"Hubbub, speech noise, speech babble",Hum,Humming,"Ice cream truck, ice cream van",Idling,Independent music,Insect,"Inside, large room or hall","Inside, public space","Inside, small room",Jackhammer,Jazz,Jet engine,Jingle (music),Jingle bell,"Jingle, tinkle",Keyboard (musical),Keys jangling,Knock,Laughter,Lawn mower,Light engine (high frequency),Liquid,"Livestock, farm animals, working animals",Lullaby,Machine gun,Mains hum,Mallet percussion,Mandolin,Mantra,Maraca,"Marimba, xylophone",Mechanical fan,Mechanisms,Medium engine (mid frequency),Meow,Microwave oven,Middle Eastern music,Moo,Mosquito,Motor vehicle (road),"Motorboat, speedboat",Motorcycle,Mouse,Music,Music for children,Music of Africa,Music of Asia,Music of Bollywood,Music of Latin America,Musical instrument,"Narration, monologue","Neigh, whinny",New-age music,Noise,Ocean,Oink,Opera,Orchestra,Organ,"Outside, rural or natural","Outside, urban or manmade",Owl,Pant,Patter,Percussion,Piano,Pig,"Pigeon, dove",Ping,Pink noise,Pizzicato,Plop,Plucked string instrument,Police car (siren),Pop music,Pour,Power tool,"Power windows, electric windows",Printer,"Propeller, airscrew",Pulleys,Pulse,Pump (liquid),Punk rock,Purr,Quack,"Race car, auto racing",Radio,Rail transport,"Railroad car, train wagon",Rain,Rain on surface,Raindrop,Rapping,"Ratchet, pawl",Rattle,Rattle (instrument),Reggae,Reversing beeps,Rhythm and blues,Rimshot,Ringtone,Roar,"Roaring cats (lions, tigers)",Rock and roll,Rock music,"Rodents, rats, mice",Roll,"Rowboat, canoe, kayak",Rub,Rumble,Run,Rustle,Rustling leaves,Sad music,"Sailboat, sailing ship",Sampler,Sanding,Sawing,Saxophone,Scary music,Scissors,Scrape,Scratch,Scratching (performance technique),Screaming,Sewing machine,Shatter,Sheep,Ship,Shout,Shuffle,Shuffling cards,Sidetone,Sigh,Silence,Sine wave,Singing,Singing bowl,Single-lens reflex camera,Sink (filling or washing),Siren,Sitar,Sizzle,Ska,Skateboard,Skidding,Slam,"Slap, smack",Sliding door,Slosh,"Smash, crash","Smoke detector, smoke alarm",Snake,Snare drum,Sneeze,Snicker,Sniff,Snoring,Snort,Sonar,Song,Soul music,Sound effect,Soundtrack music,Speech,Speech synthesizer,"Splash, splatter",Splinter,Spray,Squawk,Squeak,Squeal,Squish,Static,Steam,Steam whistle,"Steel guitar, slide guitar",Steelpan,Stir,Stomach rumble,Stream,Strum,"Subway, metro, underground",Swing music,Synthesizer,Synthetic singing,Tabla,Tambourine,Tap,Tapping (guitar technique),Tearing,Techno,Telephone,Telephone bell ringing,"Telephone dialing, DTMF",Television,Tender music,Theme music,Theremin,Throat clearing,Throbbing,"Thump, thud",Thunder,Thunderstorm,Thunk,Tick,Tick-tock,Timpani,Tire squeal,Toilet flush,Tools,Toot,Toothbrush,Traditional music,"Traffic noise, roadway noise",Train,Train horn,Train wheels squealing,Train whistle,Trance music,"Trickle, dribble",Truck,Trumpet,Tubular bells,Tuning fork,Turkey,Typewriter,Typing,Ukulele,Vacuum cleaner,Vehicle,"Vehicle horn, car horn, honking",Vibraphone,Vibration,Video game music,"Violin, fiddle",Vocal music,"Wail, moan","Walk, footsteps",Water,"Water tap, faucet",Waterfall,"Waves, surf",Wedding music,"Whack, thwack",Whale vocalization,Wheeze,Whimper,Whimper (dog),Whip,Whir,Whispering,Whistle,Whistling,White noise,Whoop,"Whoosh, swoosh, swish",Wild animals,Wind,Wind chime,"Wind instrument, woodwind instrument",Wind noise (microphone),Wood,Wood block,Writing,Yell,Yip,Yodeling,Zing,Zipper (clothing),Zither,max_label,max_prob,entropy,has_Snoring,subject_id,lifelog_date
0,,,,,,,,,,,,,,,0.1009,,0.0888,0.1467,0.0,,,,,0.0674,,,,,,,,0.2427,,,,,0.029,,0.0469,,,,,,,,,0.0449,,,,,,,,0.0399,,0.1173,,,,,,0.0357,,0.0409,0.1096,,,,,0.033,,,,,,,,,,,,,0.0032,,,,,,,,,,,,,,,,,,,,,,0.026,,,,,0.0012,0.0413,,,,,,,0.0462,,,0.012,,,,,,0.0448,,,,,,,,,,0.0204,0.0112,0.0915,0.0998,0.0666,,,0.0531,,,,,,,,,,,0.0066,,,0.2428,,,,,,,,,,,,,,0.2481,,,,,0.0336,0.0436,,,0.1467,0.2481,,,,,0.2482,,,,0.0203,0.0423,,,,,,,0.0282,,,,,,,,,,,0.0854,,0.0431,,,,0.0178,,,,,,,,,,,,0.0631,,0.0835,,,0.0406,,,,0.6191,,,,,0.1215,0.0113,0.0076,0.0369,,,,,,,,,,,,,0.1951,0.0444,,0.0649,0.6394,,,,,,0.094,0.0716,,,,,,,0.1039,0.0572,0.025,0.0594,0.0195,,,,,,,0.0014,,,0.4705,,,,,,0.0998,0.0479,0.0392,,0.0897,,,,0.0875,0.0049,,,,,,0.0115,,,,,,,,,,,,,,,,,,,,,0.0213,,,,,,,,,0.0181,0.0096,0.087,,,,0.0963,,0.0831,0.065,,,,,,,,0.1561,,,,,,,,,0.0003,,0.0236,,,0.8215,,0.06,,,,,,0.0216,,0.054,,,,0.0824,,,,0.0606,,0.0545,,,,0.0409,,,,,,0.1298,0.0,,,0.1314,,,,,,0.0979,,,,,,,,,,,,,,,,,,,,,0.0012,,,,,0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0226,,0.0155,0.0797,,,0.008,,,,,0.059,0.0315,,,,,,,,,0.0485,0.2212,,,,,0.0656,0.0084,,0.0871,0.0666,,,,0.0556,,0.0708,,,,,,,Silence,1.0,4.0694,0,id01,2024-06-26
1,,,,,,,,,,,,,,,0.1009,,0.0888,0.1467,0.0,,,,,0.0674,,,,,,,,0.2427,,,,,0.029,,0.0469,,,,,,,,,0.0449,,,,,,,,0.0399,,0.1173,,,,,,0.0357,,0.0409,0.1096,,,,,0.033,,,,,,,,,,,,,0.0032,,,,,,,,,,,,,,,,,,,,,,0.026,,,,,0.0012,0.0413,,,,,,,0.0462,,,0.012,,,,,,0.0448,,,,,,,,,,0.0204,0.0112,0.0915,0.0998,0.0666,,,0.0531,,,,,,,,,,,0.0066,,,0.2428,,,,,,,,,,,,,,0.2481,,,,,0.0336,0.0436,,,0.1467,0.2481,,,,,0.2482,,,,0.0203,0.0423,,,,,,,0.0282,,,,,,,,,,,0.0854,,0.0431,,,,0.0178,,,,,,,,,,,,0.0631,,0.0835,,,0.0406,,,,0.6191,,,,,0.1215,0.0113,0.0076,0.0369,,,,,,,,,,,,,0.1951,0.0444,,0.0649,0.6394,,,,,,0.094,0.0716,,,,,,,0.1039,0.0572,0.025,0.0594,0.0195,,,,,,,0.0014,,,0.4705,,,,,,0.0998,0.0479,0.0392,,0.0897,,,,0.0875,0.0049,,,,,,0.0115,,,,,,,,,,,,,,,,,,,,,0.0213,,,,,,,,,0.0181,0.0096,0.087,,,,0.0963,,0.0831,0.065,,,,,,,,0.1561,,,,,,,,,0.0003,,0.0236,,,0.8215,,0.06,,,,,,0.0216,,0.054,,,,0.0824,,,,0.0606,,0.0545,,,,0.0409,,,,,,0.1298,0.0,,,0.1314,,,,,,0.0979,,,,,,,,,,,,,,,,,,,,,0.0012,,,,,0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0226,,0.0155,0.0797,,,0.008,,,,,0.059,0.0315,,,,,,,,,0.0485,0.2212,,,,,0.0656,0.0084,,0.0871,0.0666,,,,0.0556,,0.0708,,,,,,,,0.0,4.0694,0,id01,2024-06-26
2,,,,,,,,,,,,,,,0.1009,,0.0888,0.1467,0.0,,,,,0.0674,,,,,,,,0.2427,,,,,0.029,,0.0469,,,,,,,,,0.0449,,,,,,,,0.0399,,0.1173,,,,,,0.0357,,0.0409,0.1096,,,,,0.033,,,,,,,,,,,,,0.0032,,,,,,,,,,,,,,,,,,,,,,0.026,,,,,0.0012,0.0413,,,,,,,0.0462,,,0.012,,,,,,0.0448,,,,,,,,,,0.0204,0.0112,0.0915,0.0998,0.0666,,,0.0531,,,,,,,,,,,0.0066,,,0.2428,,,,,,,,,,,,,,0.2481,,,,,0.0336,0.0436,,,0.1467,0.2481,,,,,0.2482,,,,0.0203,0.0423,,,,,,,0.0282,,,,,,,,,,,0.0854,,0.0431,,,,0.0178,,,,,,,,,,,,0.0631,,0.0835,,,0.0406,,,,0.6191,,,,,0.1215,0.0113,0.0076,0.0369,,,,,,,,,,,,,0.1951,0.0444,,0.0649,0.6394,,,,,,0.094,0.0716,,,,,,,0.1039,0.0572,0.025,0.0594,0.0195,,,,,,,0.0014,,,0.4705,,,,,,0.0998,0.0479,0.0392,,0.0897,,,,0.0875,0.0049,,,,,,0.0115,,,,,,,,,,,,,,,,,,,,,0.0213,,,,,,,,,0.0181,0.0096,0.087,,,,0.0963,,0.0831,0.065,,,,,,,,0.1561,,,,,,,,,0.0003,,0.0236,,,0.8215,,0.06,,,,,,0.0216,,0.054,,,,0.0824,,,,0.0606,,0.0545,,,,0.0409,,,,,,0.1298,0.0,,,0.1314,,,,,,0.0979,,,,,,,,,,,,,,,,,,,,,0.0012,,,,,0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0226,,0.0155,0.0797,,,0.008,,,,,0.059,0.0315,,,,,,,,,0.0485,0.2212,,,,,0.0656,0.0084,,0.0871,0.0666,,,,0.0556,,0.0708,,,,,,,,0.0,4.0694,0,id01,2024-06-26
3,,,,,,,,,,,,,,,0.1009,,0.0888,0.1467,0.0,,,,,0.0674,,,,,,,,0.2427,,,,,0.029,,0.0469,,,,,,,,,0.0449,,,,,,,,0.0399,,0.1173,,,,,,0.0357,,0.0409,0.1096,,,,,0.033,,,,,,,,,,,,,0.0032,,,,,,,,,,,,,,,,,,,,,,0.026,,,,,0.0012,0.0413,,,,,,,0.0462,,,0.012,,,,,,0.0448,,,,,,,,,,0.0204,0.0112,0.0915,0.0998,0.0666,,,0.0531,,,,,,,,,,,0.0066,,,0.2428,,,,,,,,,,,,,,0.2481,,,,,0.0336,0.0436,,,0.1467,0.2481,,,,,0.2482,,,,0.0203,0.0423,,,,,,,0.0282,,,,,,,,,,,0.0854,,0.0431,,,,0.0178,,,,,,,,,,,,0.0631,,0.0835,,,0.0406,,,,0.6191,,,,,0.1215,0.0113,0.0076,0.0369,,,,,,,,,,,,,0.1951,0.0444,,0.0649,0.6394,,,,,,0.094,0.0716,,,,,,,0.1039,0.0572,0.025,0.0594,0.0195,,,,,,,0.0014,,,0.4705,,,,,,0.0998,0.0479,0.0392,,0.0897,,,,0.0875,0.0049,,,,,,0.0115,,,,,,,,,,,,,,,,,,,,,0.0213,,,,,,,,,0.0181,0.0096,0.087,,,,0.0963,,0.0831,0.065,,,,,,,,0.1561,,,,,,,,,0.0003,,0.0236,,,0.8215,,0.06,,,,,,0.0216,,0.054,,,,0.0824,,,,0.0606,,0.0545,,,,0.0409,,,,,,0.1298,0.0,,,0.1314,,,,,,0.0979,,,,,,,,,,,,,,,,,,,,,0.0012,,,,,0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0226,,0.0155,0.0797,,,0.008,,,,,0.059,0.0315,,,,,,,,,0.0485,0.2212,,,,,0.0656,0.0084,,0.0871,0.0666,,,,0.0556,,0.0708,,,,,,,,0.0,4.0694,0,id01,2024-06-26
4,,,,,,,,,,,,,,,0.1009,,0.0888,0.1467,0.0,,,,,0.0674,,,,,,,,0.2427,,,,,0.029,,0.0469,,,,,,,,,0.0449,,,,,,,,0.0399,,0.1173,,,,,,0.0357,,0.0409,0.1096,,,,,0.033,,,,,,,,,,,,,0.0032,,,,,,,,,,,,,,,,,,,,,,0.026,,,,,0.0012,0.0413,,,,,,,0.0462,,,0.012,,,,,,0.0448,,,,,,,,,,0.0204,0.0112,0.0915,0.0998,0.0666,,,0.0531,,,,,,,,,,,0.0066,,,0.2428,,,,,,,,,,,,,,0.2481,,,,,0.0336,0.0436,,,0.1467,0.2481,,,,,0.2482,,,,0.0203,0.0423,,,,,,,0.0282,,,,,,,,,,,0.0854,,0.0431,,,,0.0178,,,,,,,,,,,,0.0631,,0.0835,,,0.0406,,,,0.6191,,,,,0.1215,0.0113,0.0076,0.0369,,,,,,,,,,,,,0.1951,0.0444,,0.0649,0.6394,,,,,,0.094,0.0716,,,,,,,0.1039,0.0572,0.025,0.0594,0.0195,,,,,,,0.0014,,,0.4705,,,,,,0.0998,0.0479,0.0392,,0.0897,,,,0.0875,0.0049,,,,,,0.0115,,,,,,,,,,,,,,,,,,,,,0.0213,,,,,,,,,0.0181,0.0096,0.087,,,,0.0963,,0.0831,0.065,,,,,,,,0.1561,,,,,,,,,0.0003,,0.0236,,,0.8215,,0.06,,,,,,0.0216,,0.054,,,,0.0824,,,,0.0606,,0.0545,,,,0.0409,,,,,,0.1298,0.0,,,0.1314,,,,,,0.0979,,,,,,,,,,,,,,,,,,,,,0.0012,,,,,0.02,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0226,,0.0155,0.0797,,,0.008,,,,,0.059,0.0315,,,,,,,,,0.0485,0.2212,,,,,0.0656,0.0084,,0.0871,0.0666,,,,0.0556,,0.0708,,,,,,,,0.0,4.0694,0,id01,2024-06-26


### mGps
- Multiple GPS coordinates measured within a single minute using the smartphone.

In [54]:
def extract_gps_info(row):
    m_data = row['m_gps']
    altitude = [item['altitude'] for item in m_data]
    latitude = [item['latitude'] for item in m_data]
    longitude = [item['longitude'] for item in m_data]
    speed = [item['speed'] for item in m_data]
    return pd.Series({'altitude': altitude, 'latitude': latitude, 'longitude': longitude, 'speed': speed})

ch2025_mGps[['altitude','latitude','longitude','speed']] = ch2025_mGps.apply(extract_gps_info, axis=1)
ch2025_mGps['lifelog_date'] = ch2025_mGps['timestamp'].astype(str).str[:10]
ch2025_mGps = ch2025_mGps.drop(columns=['m_gps'])
ch2025_mGps.head(1)

In [63]:
ch2025_mGps_daily = ch2025_mGps.groupby(['subject_id', 'lifelog_date']).apply(calculate_daily_metrics).reset_index()
ch2025_mGps_daily.head()

Unnamed: 0,subject_id,lifelog_date,daily_avg_speed,daily_max_altitude,daily_lat_std,active_minutes,movement_ratio,centroid_lat,centroid_lon,altitude_change,total_distance(m),max_speed
0,id01,2024-06-26,0.6605,136.9,0.0106,707.0,0.1034,0.2361,0.1302,-6.7,29113.6162,28.22
1,id01,2024-06-27,1.0705,159.3,0.0078,1439.0,0.1401,0.2359,0.113,0.0,93470.9262,36.6356
2,id01,2024-06-28,0.8132,133.8,0.0087,1418.0,0.1105,0.2346,0.1108,0.0,68632.0779,49.5476
3,id01,2024-06-29,0.5638,142.7,0.0042,1440.0,0.0865,0.2318,0.0959,0.6,47104.6264,31.0703
4,id01,2024-06-30,0.4235,133.8,0.0022,1440.0,0.0494,0.2314,0.1042,-0.3,41159.5137,33.823


### mUsageStats
- Indicates which apps were used on the smartphone and for how long.

In [65]:
def extract_mUsageStats_info(row):
    m_data = row['m_usage_stats']
    app_name = [item['app_name'] for item in m_data]
    total_time = [item['total_time'] for item in m_data]
    return pd.Series({'app_name': app_name, 'total_time': total_time})

ch2025_mUsageStats[['app_name', 'total_time']] = ch2025_mUsageStats.apply(extract_mUsageStats_info, axis=1)
ch2025_mUsageStats['lifelog_date'] = ch2025_mUsageStats['timestamp'].astype(str).str[:10]
ch2025_mUsageStats = ch2025_mUsageStats.drop(columns=['m_usage_stats'])
ch2025_mUsageStats.head(1)

Unnamed: 0,subject_id,timestamp,app_name,total_time,lifelog_date
0,id01,2024-06-26 13:00:00,"[ 캐시워크, NAVER, ✝️성경일독Q]","[69, 549, 7337]",2024-06-26


In [66]:
ch2025_mUsageStats.head()

Unnamed: 0,subject_id,timestamp,app_name,total_time,lifelog_date
0,id01,2024-06-26 13:00:00,"[ 캐시워크, NAVER, ✝️성경일독Q]","[69, 549, 7337]",2024-06-26
1,id01,2024-06-26 13:10:00,"[통화, 토스, 전화, 카카오톡, NAVER, ✝️성경일독Q, One UI 홈]","[26419, 119896, 59284, 6744, 67042, 1504, 209417]",2024-06-26
2,id01,2024-06-26 13:20:00,"[메시지, One UI 홈]","[388651, 211334]",2024-06-26
3,id01,2024-06-26 13:30:00,"[메시지, ✝️성경일독Q]","[211633, 805]",2024-06-26
4,id01,2024-06-26 13:50:00,"[카카오톡, 캐시워크, ✝️성경일독Q, One UI 홈]","[35446, 105, 42402, 40225]",2024-06-26


### 학습 & 테스트 데이터

In [None]:
train2 = train.merge(ch2025_mWifi_daily,on=['subject_id','lifelog_date'],how='left')
train2.head()

Unnamed: 0,subject_id,sleep_date,lifelog_date,Q1,Q2,Q3,S1,S2,S3,scan_count,unique_bssid_count,avg_rssi,max_rssi,min_rssi,strong_signal_ratio,empty_scan_count,top_bssid,top_bssid_count,hour_span_minutes
0,id01,2024-06-27,2024-06-26,0,0,0,0,0,1,69.0,393.0,-70.1964,-19.0,-91.0,0.2309,0.0,86:25:19:9f:9b:be,19.0,716.0
1,id01,2024-06-28,2024-06-27,0,0,0,0,1,1,126.0,357.0,-69.0629,-26.0,-92.0,0.2701,0.0,04:09:a5:3a:c8:6a,54.0,1430.0
2,id01,2024-06-29,2024-06-28,1,0,0,1,1,1,118.0,376.0,-69.0941,-26.0,-92.0,0.2594,0.0,04:09:a5:3a:c8:6a,47.0,1430.0
3,id01,2024-06-30,2024-06-29,1,0,1,2,0,0,134.0,258.0,-67.7897,-24.0,-91.0,0.3063,0.0,04:09:a5:3a:c8:6a,117.0,1420.0
4,id01,2024-07-01,2024-06-30,0,1,1,1,1,1,108.0,242.0,-68.2999,-23.0,-90.0,0.2946,0.0,04:09:a5:3a:c8:6a,70.0,1310.0


In [None]:
test2 = test.merge(ch2025_mWifi_daily,on=['subject_id','lifelog_date'],how='left')
test2.head()

Unnamed: 0,subject_id,sleep_date,lifelog_date,Q1,Q2,Q3,S1,S2,S3,scan_count,unique_bssid_count,avg_rssi,max_rssi,min_rssi,strong_signal_ratio,empty_scan_count,top_bssid,top_bssid_count,hour_span_minutes
0,id01,2024-07-31,2024-07-30,0,0,0,0,0,0,115.0,910.0,-73.8566,-29.0,-93.0,0.1451,0.0,86:25:19:9f:9b:be,42.0,1400.0
1,id01,2024-08-01,2024-07-31,0,0,0,0,0,0,135.0,492.0,-69.2624,-33.0,-93.0,0.2579,0.0,04:09:a5:3a:c8:6a,48.0,1430.0
2,id01,2024-08-02,2024-08-01,0,0,0,0,0,0,124.0,397.0,-72.1787,-33.0,-92.0,0.2053,0.0,86:25:19:9f:9b:be,49.0,1428.0
3,id01,2024-08-03,2024-08-02,0,0,0,0,0,0,132.0,366.0,-69.7404,-27.0,-93.0,0.2188,0.0,04:09:a5:3a:c8:6a,53.0,1430.0
4,id01,2024-08-04,2024-08-03,0,0,0,0,0,0,107.0,312.0,-69.6403,-21.0,-93.0,0.2214,0.0,04:09:a5:3a:c8:6a,77.0,1420.0
