In [None]:
from pathlib import Path
import json, numpy as np, pandas as pd

# Locate the project root: when the kernel runs from a folder named "notebook"
# the root is that folder's parent, otherwise it is the working directory itself.
if Path.cwd().name == "notebook":
    ROOT = Path.cwd().resolve().parent
else:
    ROOT = Path.cwd()

# Input and output locations, all derived from the root.
RAW = ROOT / "data" / "raw"
PROC = ROOT / "data" / "processed" / "etrilifelog"
ART = ROOT / "artifacts"

# Output folders are created up front so later cells can write into them.
for out_dir in (PROC, ART):
    out_dir.mkdir(parents=True, exist_ok=True)

CSV_PATH = RAW / "ch2025_metrics_train.csv"
ITEMS_DIR = RAW / "ch2025_data_items"

In [4]:
# Read the training metrics table, then show a preview, its shape and its column names.
df = pd.read_csv(CSV_PATH)
(df.head(), df.shape, list(df.columns))

(  subject_id  sleep_date lifelog_date  Q1  Q2  Q3  S1  S2  S3
 0       id01  2024-06-27   2024-06-26   0   0   0   0   0   1
 1       id01  2024-06-28   2024-06-27   0   0   0   0   1   1
 2       id01  2024-06-29   2024-06-28   1   0   0   1   1   1
 3       id01  2024-06-30   2024-06-29   1   0   1   2   0   0
 4       id01  2024-07-01   2024-06-30   0   1   1   1   1   1,
 (450, 9),
 ['subject_id',
  'sleep_date',
  'lifelog_date',
  'Q1',
  'Q2',
  'Q3',
  'S1',
  'S2',
  'S3'])

In [None]:
# Parse both date columns into datetime values.
for date_col in ("sleep_date", "lifelog_date"):
    df[date_col] = pd.to_datetime(df[date_col])

# Store every target label as int8 and the subject identifier as a categorical.
for target in ("Q1", "Q2", "Q3", "S1", "S2", "S3"):
    df[target] = df[target].astype("int8")
df["subject_id"] = df["subject_id"].astype("category")

# One row is expected per (subject, lifelog day); report repeats and keep the first of each.
key_cols = ["subject_id", "lifelog_date"]
dup = df.duplicated(key_cols).sum()
print("duplicates:", dup)
if dup > 0:
    df = df.drop_duplicates(subset=key_cols).copy()

# Calendar features taken from the lifelog day (Monday is 0, so 5 and 6 are the weekend).
df["dow"] = df["lifelog_date"].dt.dayofweek.astype("int8")
df["is_weekend"] = df["dow"].ge(5).astype("int8")

preview_cols = [
    "subject_id", "lifelog_date", "sleep_date", "dow", "is_weekend",
    "Q1", "Q2", "Q3", "S1", "S2", "S3",
]
df.loc[:, preview_cols].head()

duplicates: 0


Unnamed: 0,subject_id,lifelog_date,sleep_date,dow,is_weekend,Q1,Q2,Q3,S1,S2,S3
0,id01,2024-06-26,2024-06-27,2,0,0,0,0,0,0,1
1,id01,2024-06-27,2024-06-28,3,0,0,0,0,0,1,1
2,id01,2024-06-28,2024-06-29,4,0,1,0,0,1,1,1
3,id01,2024-06-29,2024-06-30,5,1,1,0,1,2,0,0
4,id01,2024-06-30,2024-07-01,6,1,0,1,1,1,1,1


In [None]:
# Summarise the metrics table (row count, subject count, covered lifelog dates and
# per-target label counts) and save the summary as a JSON artifact.
first_day = df["lifelog_date"].min()
last_day = df["lifelog_date"].max()

target_counts = {}
for target in ("Q1", "Q2", "Q3", "S1", "S2", "S3"):
    # dropna=False keeps a separate bucket for missing labels, if there are any.
    target_counts[target] = df[target].value_counts(dropna=False).to_dict()

catalog = {
    "dataset": "ETRI Lifelog 2024",
    "rows": len(df),
    "subjects": int(df["subject_id"].nunique()),
    "date_range": {
        "min": first_day.strftime("%Y-%m-%d"),
        "max": last_day.strftime("%Y-%m-%d"),
    },
    "targets": target_counts,
}

catalog_json = json.dumps(catalog, ensure_ascii=False, indent=2)
(ART / "dataset_catalog.json").write_text(catalog_json)
catalog

{'dataset': 'ETRI Lifelog 2024',
 'rows': 450,
 'subjects': 10,
 'date_range': {'min': '2024-06-03', 'max': '2024-11-14'},
 'targets': {'Q1': {0: 227, 1: 223},
  'Q2': {1: 253, 0: 197},
  'Q3': {1: 270, 0: 180},
  'S1': {1: 224, 0: 143, 2: 83},
  'S2': {1: 293, 0: 157},
  'S3': {1: 298, 0: 152}}}

In [None]:
# Fix the column order, sort by subject and lifelog day with a fresh 0..n-1 index,
# and persist the result both as parquet and as CSV.
cols = [
    "subject_id", "lifelog_date", "sleep_date", "dow", "is_weekend",
    "Q1", "Q2", "Q3", "S1", "S2", "S3",
]
ds = df.loc[:, cols].sort_values(by=["subject_id", "lifelog_date"], ignore_index=True)

parquet_path = PROC / "metrics_only.parquet"
csv_path = PROC / "metrics_only.csv"
ds.to_parquet(parquet_path, compression="snappy", index=False)
ds.to_csv(csv_path, index=False)

ds.head()

Unnamed: 0,subject_id,lifelog_date,sleep_date,dow,is_weekend,Q1,Q2,Q3,S1,S2,S3
0,id01,2024-06-26,2024-06-27,2,0,0,0,0,0,0,1
1,id01,2024-06-27,2024-06-28,3,0,0,0,0,0,1,1
2,id01,2024-06-28,2024-06-29,4,0,1,0,0,1,1,1
3,id01,2024-06-29,2024-06-30,5,1,1,0,1,2,0,0
4,id01,2024-06-30,2024-07-01,6,1,0,1,1,1,1,1


In [None]:
# Draw a reproducible background sample of at most B_SIZE rows, stratified by
# subject and weekend flag, then record the sampled row labels and the mean of
# every target over that sample.
SEED = 42
B_SIZE = 512
rng = np.random.default_rng(SEED)

# One stratum per "<subject>|<is_weekend>" string.
subject_str = ds["subject_id"].astype(str)
weekend_str = ds["is_weekend"].astype(str)
key = subject_str + "|" + weekend_str

n_total = len(ds)
idxs = []
for _, stratum in ds.groupby(key):
    n_stratum = len(stratum)
    # Proportional share of B_SIZE: at least one row, never more than the stratum holds.
    quota = min(max(1, round(B_SIZE * n_stratum / n_total)), n_stratum)
    idxs += rng.choice(stratum.index.values, size=quota, replace=False).tolist()

# Rounding can overshoot the budget; trim back to B_SIZE with a further random draw.
if len(idxs) > B_SIZE:
    idxs = list(rng.choice(np.array(idxs), size=B_SIZE, replace=False))
b_idx = sorted(int(i) for i in idxs)

# NOTE(review): S1 takes the values 0/1/2 in the dataset catalog output, so its entry
# here is an average label value rather than a proportion of positives.
base_rates = {}
for target in ("Q1", "Q2", "Q3", "S1", "S2", "S3"):
    base_rates[target] = float(ds.loc[b_idx, target].mean())

background_payload = {"seed": SEED, "size": len(b_idx), "index": b_idx}
(ART / "background_idx.json").write_text(json.dumps(background_payload, ensure_ascii=False, indent=2))
(ART / "base_rates.json").write_text(json.dumps(base_rates, ensure_ascii=False, indent=2))

len(b_idx), base_rates

(450,
 {'Q1': 0.4955555555555556,
  'Q2': 0.5622222222222222,
  'Q3': 0.6,
  'S1': 0.8666666666666667,
  'S2': 0.6511111111111111,
  'S3': 0.6622222222222223})

In [None]:
import pyarrow as pa, pyarrow.parquet as pq

# Inventory every parquet file directly under ITEMS_DIR: record its path, row count,
# column count, column types and a best-guess time column, then save the inventory
# as a JSON artifact.
TIME_COL_NAMES = ("timestamp", "time", "dt", "datetime", "event_time", "ts")

report = {}
for p in sorted(ITEMS_DIR.glob("*.parquet")):
    # The schema and row count live in the parquet footer, so read only that
    # instead of loading the whole table into memory.
    schema = pq.read_schema(p)
    n_rows = pq.read_metadata(p).num_rows
    columns = {field.name: str(field.type) for field in schema}
    # First column whose lower-cased name is one of the usual time-column names.
    cand = [name for name in schema.names if name.lower() in TIME_COL_NAMES]
    report[p.name] = {
        "path": str(p),
        "n_rows": int(n_rows),
        # Count top-level columns from the Arrow schema, as Table.num_columns does.
        "n_cols": len(schema.names),
        "columns": columns,
        "time_col_guess": cand[0] if cand else None,
    }

# ensure_ascii=False can emit non-ASCII characters (the report stores filesystem
# paths), so pin the file encoding instead of relying on the platform default.
(ART / "items_schema.json").write_text(
    json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8"
)

# Preview: the first five file names and the columns of the first file.
# Falls back to an empty mapping when no parquet files were found.
first_columns = next(iter(report.values()))["columns"] if report else {}
list(report)[:5], first_columns

(['ch2025_mACStatus.parquet',
  'ch2025_mActivity.parquet',
  'ch2025_mAmbience.parquet',
  'ch2025_mBle.parquet',
  'ch2025_mGps.parquet'],
 {'subject_id': 'string', 'timestamp': 'timestamp[ns]', 'm_charging': 'int64'})