 드라이브 마운트
 * local로 가져와서 압축풀기

In [None]:
import os, shutil, zipfile
from pathlib import Path

# Mount Google Drive when running in Colab (detected via the COLAB_GPU /
# COLAB_TPU_ADDR environment variables).
IN_COLAB = "COLAB_GPU" in os.environ or "COLAB_TPU_ADDR" in os.environ
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive")

# Drive folder that holds the original zip archives.
DRIVE_DIR = Path("/content/drive/MyDrive/3.개방데이터")

# Local working folder; any existing copy is deleted and recreated below.
LOCAL_DIR = Path("/content/data_71802_local")

# Option: copy each zip to local disk first and extract from that copy.
KEEP_LOCAL_ZIPS = False

# 0) Wipe the local folder so every run starts from scratch.
if LOCAL_DIR.exists():
    shutil.rmtree(LOCAL_DIR)
LOCAL_DIR.mkdir(parents=True, exist_ok=True)

# 1) Find every zip under DRIVE_DIR (recursive; sorted for a stable order).
zips = sorted(DRIVE_DIR.rglob("*.zip"))
print("[INFO] zip found:", len(zips))
assert len(zips) > 0, "DRIVE_DIR에서 zip을 못 찾았어. DRIVE_DIR 경로 확인해줘."

# 2) Root folder that receives the extracted archives.
EXTRACT_ROOT = LOCAL_DIR / "extracted"
EXTRACT_ROOT.mkdir(parents=True, exist_ok=True)

# 3) Extract each zip into its own folder, EXTRACT_ROOT/<zip stem>.
for i, zp in enumerate(zips, 1):
    print(f"[{i}/{len(zips)}] unzip:", zp)

    # Choose the file to open: a local copy, or the archive on Drive.
    if KEEP_LOCAL_ZIPS:
        local_zip = LOCAL_DIR / "zips" / zp.name
        local_zip.parent.mkdir(parents=True, exist_ok=True)
        if not local_zip.exists():
            shutil.copy2(zp, local_zip)
        zip_to_open = local_zip
    else:
        # Extract straight from Drive (can be slow). If it is too slow, set
        # KEEP_LOCAL_ZIPS = True to copy locally before extracting.
        zip_to_open = zp

    out_dir = EXTRACT_ROOT / zp.stem
    out_dir.mkdir(parents=True, exist_ok=True)

    # Cheap "already extracted" check: skip when the folder has any content.
    # NOTE(review): LOCAL_DIR is wiped above, so within one run this can only
    # trigger when two zips share the same stem; the later archive is then
    # skipped without being extracted.
    if any(out_dir.rglob("*")):
        print("  -> already extracted, skip")
        continue

    # Best effort: a failing archive is reported and the loop continues.
    try:
        with zipfile.ZipFile(zip_to_open, "r") as zf:
            zf.extractall(out_dir)
        print("  -> done")
    except Exception as e:
        print("  -> [WARN] failed:", e)

# 4) Placeholder for cleaning up local zip copies.
# NOTE(review): this is a no-op for both settings of KEEP_LOCAL_ZIPS; local
# copies made under LOCAL_DIR/"zips" are never deleted by this cell.
if not KEEP_LOCAL_ZIPS:
    # No local copies were made, so there is nothing to remove.
    pass

# Final BASE_DIR: root of the extracted data.
BASE_DIR = EXTRACT_ROOT
print("[INFO] BASE_DIR:", BASE_DIR)

# Sanity check: count extracted files per extension of interest.
exts = {"csv":0,"bin":0,"json":0,"zip":0}
for p in BASE_DIR.rglob("*"):
    if p.is_file():
        s = p.suffix.lower().lstrip(".")
        if s in exts: exts[s] += 1
print("[INFO] extracted counts:", exts)


Mounted at /content/drive
[INFO] zip found: 683
[1/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Other/Other.zip
  -> done
[2/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_0901_0812.zip
  -> done
[3/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_0902_1253.zip
  -> done
[4/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_0902_2013.zip
  -> done
[5/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_0903_1018.zip
  -> done
[6/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_0903_1407.zip
  -> done
[7/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_1027_0724.zip
  -> done
[8/683] unzip: /content/drive/MyDrive/3.개방데이터/1.데이터/Training/01.원천데이터/TS_agv_01_agv01_1027_1405.zip
  -> done
[9/683] unzip: /content/drive/M

#2) complete_keys 없으면 자동 생성 → episodes/samples까지 한 번에

In [None]:
from pathlib import Path
import re
from collections import defaultdict

# ✅ BASE_DIR 확인(압축 풀린 루트)
BASE_DIR = Path("/content/data_71802_local/extracted")

# --- records + complete_keys 자동 생성 ---
stem_pat = re.compile(r"^(?P<prefix>[a-zA-Z]+)(?P<id>\d+)_(?P<mmdd>\d{4})_(?P<hms>\d{6})$")

def build_records_by_stem(base_dir: Path):
    """Index csv/bin/json files under ``base_dir`` by their parsed file stem.

    A stem must match ``stem_pat`` (letters, digits, ``_MMDD_``, six digits).
    The key is ``(prefix lower-cased, id as int, mmdd, hms)``.

    Returns:
        ``(records, complete_keys)``: ``records`` maps each key to
        ``{extension: path string}``; ``complete_keys`` lists the keys that
        have all three of csv, bin and json. Both counts are printed.
    """
    wanted = ("csv", "bin", "json")
    records = {}
    for path in base_dir.rglob("*"):
        kind = path.suffix.lower().lstrip(".")
        if kind not in wanted or not path.is_file():
            continue
        hit = stem_pat.match(path.stem)
        if hit is None:
            continue
        key = (hit["prefix"].lower(), int(hit["id"]), hit["mmdd"], hit["hms"])
        # A later file with the same key and extension replaces the earlier one.
        records.setdefault(key, {})[kind] = str(path)

    # Only wanted extensions are stored, so "all present" means a full count.
    complete_keys = [key for key, files in records.items() if len(files) == len(wanted)]
    print("records:", len(records))
    print("complete_keys:", len(complete_keys))
    return records, complete_keys

# complete_keys가 없으면 생성
if "complete_keys" not in globals() or "records" not in globals():
    records, complete_keys = build_records_by_stem(BASE_DIR)

# --- episodes / samples 생성 ---
PAST = 30
HORIZON = 30

def build_episodes(keys):
    """Group record keys into episodes.

    Keys are ``(prefix, id, mmdd, hms)`` tuples. All keys sharing
    ``(prefix, id, mmdd)`` form one episode, ordered by ``hms``.

    Returns:
        A list of ``(episode_id, sorted_keys)`` pairs, in first-seen order of
        the episode ids.
    """
    grouped = {}
    for prefix, did, mmdd, hms in keys:
        grouped.setdefault((prefix, did, mmdd), []).append((prefix, did, mmdd, hms))
    return [
        (epi_id, sorted(members, key=lambda member: member[3]))
        for epi_id, members in grouped.items()
    ]

episodes = build_episodes(complete_keys)
print("episodes:", len(episodes))

def build_samples(episodes, past=PAST, horizon=HORIZON):
    """Enumerate every ``(episode_index, t)`` with a full window and target.

    ``t`` runs from ``past - 1`` (so ``past`` frames end at ``t``) up to
    ``len(keys) - horizon - 1`` (so index ``t + horizon`` still exists).
    """
    return [
        (epi_idx, t)
        for epi_idx, (_, ks) in enumerate(episodes)
        for t in range(past - 1, len(ks) - horizon)
    ]

samples = build_samples(episodes)
print("samples:", len(samples))

# (검증) 에피소드 길이 분포 조금 보기
lens = [len(ks) for _, ks in episodes]
print("episode length min/mean/max:", min(lens), sum(lens)/len(lens), max(lens))


records: 124263
complete_keys: 111870
episodes: 185
samples: 100955
episode length min/mean/max: 300 604.7027027027027 1504


#3) 센서/라벨 로더 + valid 필터링(라벨 None 제거)

In [None]:
import numpy as np, pandas as pd, csv, json
from pathlib import Path

SENSOR8 = ["PM1.0","PM2.5","PM10","NTC","CT1","CT2","CT3","CT4"]

def load_sensor_csv(path: str) -> np.ndarray:
    """Read one sensor CSV and return its 8 readings in ``SENSOR8`` order.

    Two layouts are handled:

    (A) "meta log": lines shaped ``sensor_data,<name>,<field>,<value>``. The
        ``value`` field of each SENSOR8 name is used; when a name repeats, the
        last parseable value wins.
    (B) table: a delimited table. The last row is used, either from the
        SENSOR8 columns or, failing that, from the first 8 numeric columns.

    Returns:
        float32 array of shape (8,).

    Raises:
        ValueError: a sensor is missing in layout (A), the table has no data
            rows, or neither layout can be parsed.
        OSError: the file cannot be read.
    """
    raw = Path(path).read_text(encoding="utf-8", errors="replace")
    lines = [ln.strip().lstrip("\ufeff") for ln in raw.splitlines() if ln.strip()]

    # (A) meta-log layout, detected from the first 80 non-empty lines.
    if any(ln.startswith("sensor_data,") for ln in lines[:80]):
        vals = {k: None for k in SENSOR8}
        for ln in lines:
            parts = [p.strip().strip('"') for p in ln.split(",")]
            if len(parts) < 4 or parts[0] != "sensor_data":
                continue
            name, field = parts[1], parts[2]
            if name in vals and field == "value":
                try:
                    vals[name] = float(parts[3])
                except ValueError:
                    # Unparseable reading: keep any earlier value for this sensor.
                    pass
        if any(vals[k] is None for k in SENSOR8):
            raise ValueError(f"Missing sensor values in {path}")
        return np.array([vals[k] for k in SENSOR8], dtype=np.float32)

    # (B) table layout: guess the delimiter, defaulting to a comma.
    try:
        sep = csv.Sniffer().sniff(raw[:2048], delimiters=[",",";","\t","|"]).delimiter
    except csv.Error:
        sep = ","
    df = pd.read_csv(path, sep=sep, engine="python", on_bad_lines="skip")
    if df.empty:
        # Without this guard, iloc[-1] below fails with an opaque IndexError.
        raise ValueError(f"No data rows in sensor csv: {path}")
    df.columns = [str(c).strip() for c in df.columns]
    if set(SENSOR8).issubset(set(df.columns)):
        return df.iloc[-1][SENSOR8].astype(np.float32).to_numpy()

    # Fallback heuristic: take the first 8 numeric columns of the last row.
    num_df = df.select_dtypes(include=["number"])
    if num_df.shape[1] >= 8:
        return num_df.iloc[-1, :8].astype(np.float32).to_numpy()
    raise ValueError(f"Cannot parse sensor csv: {path}")

LABEL_KEYS = ["label","Label","state","State","status","Status","target","Target","class","Class","y","Y"]

def _find_first_int_like(obj):
    if isinstance(obj, dict):
        for k,v in obj.items():
            if isinstance(k, str) and any(x in k.lower() for x in ["label","state","status","class","target"]):
                out = _find_first_int_like(v)
                if out is not None: return out
            out = _find_first_int_like(v)
            if out is not None: return out
    elif isinstance(obj, list):
        for it in obj:
            out = _find_first_int_like(it)
            if out is not None: return out
    else:
        try:
            val = int(float(obj))
            if 0 <= val <= 3: return val
        except:
            return None
    return None

def load_label_json(path: str):
    """Heuristically extract a 0..3 label from a JSON file.

    Top-level keys listed in ``LABEL_KEYS`` are tried first, then the whole
    document is searched with ``_find_first_int_like``.

    Returns:
        The label as an int, or ``None`` when the file cannot be read or
        parsed, or when no candidate value is found.
    """
    try:
        d = json.loads(Path(path).read_text(encoding="utf-8", errors="replace"))
    except (OSError, ValueError):
        # Unreadable file or invalid JSON (JSONDecodeError is a ValueError).
        # Narrower than a bare except, so Ctrl-C is no longer swallowed.
        return None
    if isinstance(d, dict):
        for k in LABEL_KEYS:
            if k in d:
                v = _find_first_int_like(d[k])
                if v is not None:
                    return v
    return _find_first_int_like(d)

# --- valid filtering ---
# Keep only the samples whose target file (HORIZON steps after t in the same
# episode) yields a label; samples with a None label are dropped.
samples_valid=[]
for epi_idx, t in samples:
    _, ks = episodes[epi_idx]
    yk = ks[t+HORIZON]
    y = load_label_json(records[yk]["json"])
    if y is not None:
        samples_valid.append((epi_idx,t))

print("samples_valid:", len(samples_valid))


samples_valid: 100955


In [None]:
import random
from collections import defaultdict

SEED = 42
random.seed(SEED)

def split_by_episode(samples_valid, ratios=(0.8,0.1,0.1), seed=SEED):
    """Split samples into train/val/test so that no episode spans two splits.

    Episode ids are shuffled with a private ``random.Random(seed)``. This
    yields the same order as seeding the global generator with ``seed``, but
    no longer resets the process-wide RNG as a side effect.

    Args:
        samples_valid: iterable of ``(episode_index, t)`` pairs.
        ratios: train/val/test fractions of the episode count. Train and val
            sizes are truncated; test receives the remaining episodes.
        seed: seed for the episode shuffle.

    Returns:
        ``(train, val, test, (n_train_epi, n_val_epi, n_test_epi))``.
    """
    rng = random.Random(seed)
    epi_to = defaultdict(list)
    for e, t in samples_valid:
        epi_to[e].append((e, t))
    epi_ids = list(epi_to)
    rng.shuffle(epi_ids)
    n = len(epi_ids)
    n_tr = int(n * ratios[0])
    n_va = int(n * ratios[1])
    # Sets are kept so the sample order inside each split matches the
    # previous behaviour.
    tr = set(epi_ids[:n_tr])
    va = set(epi_ids[n_tr:n_tr + n_va])
    te = set(epi_ids[n_tr + n_va:])
    train = [s for e in tr for s in epi_to[e]]
    val = [s for e in va for s in epi_to[e]]
    test = [s for e in te for s in epi_to[e]]
    return train, val, test, (len(tr), len(va), len(te))

train_s, val_s, test_s, epi_counts = split_by_episode(samples_valid)
print("episode split:", epi_counts)
print("sample split :", len(train_s), len(val_s), len(test_s))


episode split: (148, 18, 19)
sample split : 80153 9469 11333


#5) 열화상 캐시(속도) — 먼저 10,000개만

In [None]:
import numpy as np
from pathlib import Path

CACHE_DIR = Path("/content/cache_thermal_celsius")
CACHE_DIR.mkdir(parents=True, exist_ok=True)

H, W = 120, 160
NPIX = H*W

def _score_kelvin_like(x):
    finite = np.isfinite(x)
    if finite.mean() < 0.98: return -1e9
    xf = x[finite]
    mu, sd = float(xf.mean()), float(xf.std())
    score = 0.0
    if 150 <= mu <= 450: score += 100
    score -= abs(mu - 300) * 0.2
    if sd > 250: score -= (sd - 250) * 2
    return score

def decode_thermal_bin(path: str) -> np.ndarray:
    """Decode the trailing H x W 16-bit frame of a thermal ``.bin`` file.

    Only the last ``NPIX * 2`` bytes are used. The frame is interpreted as
    uint16 and int16, each with and without byte swapping, at scales 1.0, 0.1
    and 0.01. The candidate with the highest ``_score_kelvin_like`` wins; on a
    tie the earliest candidate is kept.

    Returns:
        float32 array of shape (H, W), or ``None`` if no candidate scores
        above -1e9.
    """
    payload = Path(path).read_bytes()[-(NPIX * 2):]
    winner, winner_score = None, -1e9
    for dtype in (np.uint16, np.int16):
        for swapped in (False, True):
            raw = np.frombuffer(payload, dtype=dtype)
            if swapped:
                raw = raw.byteswap()
            frame = raw.reshape(H, W).astype(np.float32)
            for scale in (1.0, 0.1, 0.01):
                candidate = frame * scale
                score = _score_kelvin_like(candidate)
                if score > winner_score:
                    winner, winner_score = candidate, score
    return winner

def thermal_preprocess(bin_path: str, invalid_k_le=1.0, clip_c=(-50.0,200.0)):
    """Decode a thermal frame and return it in Celsius with outliers filled.

    Steps: values ``<= invalid_k_le`` become NaN, 273.15 is subtracted, values
    outside ``clip_c`` become NaN, and every NaN is replaced by the median of
    the remaining pixels.

    NOTE(review): if every pixel is invalid the median is itself NaN, so the
    result would still contain NaNs. Confirm whether that can occur.
    """
    kelvin = decode_thermal_bin(bin_path).astype(np.float32)
    celsius = np.where(kelvin <= invalid_k_le, np.nan, kelvin) - 273.15
    lo, hi = clip_c
    out_of_range = (celsius < lo) | (celsius > hi)
    celsius = np.where(out_of_range, np.nan, celsius)
    fill = float(np.nanmedian(celsius))
    return np.nan_to_num(celsius, nan=fill)

def cache_one(bin_path: str) -> bool:
    """Preprocess one thermal file into ``CACHE_DIR/<stem>.npy`` (best effort).

    The array is written to a temporary file and then renamed into place, so
    an interrupted write cannot leave a truncated ``.npy`` that a later run
    would accept as cached.

    Returns:
        True if the cache file already existed or was written, False if
        preprocessing or writing failed.
    """
    out = CACHE_DIR / (Path(bin_path).stem + ".npy")
    if out.exists():
        return True
    # The name keeps the .npy suffix so np.save does not append another one.
    tmp = out.with_name(out.name + ".tmp.npy")
    try:
        np.save(tmp, thermal_preprocess(bin_path).astype(np.float32))
        tmp.replace(out)
        return True
    except Exception:
        # Deliberate best effort: report failure to the caller. Unlike a bare
        # except, this lets KeyboardInterrupt through.
        tmp.unlink(missing_ok=True)
        return False

def load_thermal_cached(bin_path: str) -> np.ndarray:
    """Return the preprocessed frame for ``bin_path`` as float32.

    Reads ``CACHE_DIR/<stem>.npy`` when it exists; otherwise the frame is
    preprocessed on the fly. Nothing is written to the cache here.
    """
    cached = CACHE_DIR / f"{Path(bin_path).stem}.npy"
    frame = np.load(cached) if cached.exists() else thermal_preprocess(bin_path)
    return frame.astype(np.float32)

def bins_from_samples(samples_list):
    """Return the unique ``.bin`` paths needed by the given samples.

    Each ``(episode_index, t)`` sample contributes the frames at ``t - 29``,
    ``t - 14`` and ``t`` of its episode.
    """
    unique_bins = {
        records[episodes[epi_idx][1][t - back]]["bin"]
        for epi_idx, t in samples_list
        for back in (29, 14, 0)
    }
    return list(unique_bins)

# Unique thermal files referenced by the train and validation samples.
need_bins = list(set(bins_from_samples(train_s) + bins_from_samples(val_s)))
print("need_bins:", len(need_bins))

# Warm the cache with the first 10,000 files only.
# NOTE(review): the printed denominator is hard-coded, so it is misleading
# when need_bins holds fewer than 10,000 entries.
ok=0
for p in need_bins[:10000]:
    ok += int(cache_one(p))
print("cached:", ok, "/ 10000")


need_bins: 94436
cached: 10000 / 10000


#6) sensor mean/std(train only)

In [None]:
def compute_sensor_stats(train_s, max_rows=20000):
    """Return the per-sensor ``(mean, std + 1e-6)`` of the reading at ``t``.

    Only the first ``max_rows`` samples of ``train_s`` are read.

    NOTE(review): the samples are taken from the head of the list, not drawn
    at random. If ``train_s`` is grouped by episode, the statistics come from
    the leading episodes only; confirm that is acceptable.
    """
    rows = []
    for count, (epi_idx, t) in enumerate(train_s):
        if count >= max_rows:
            break
        frame_key = episodes[epi_idx][1][t]
        rows.append(load_sensor_csv(records[frame_key]["csv"]))
    matrix = np.stack(rows, axis=0).astype(np.float32)
    mean = matrix.mean(axis=0)
    std = matrix.std(axis=0) + 1e-6
    return mean, std

def zscore_clip_sensor(sw, mu, sd, clip=3.0):
    """Standardise ``sw`` with ``mu``/``sd`` and clamp to ``[-clip, clip]``.

    Returns the result as float32.
    """
    standardized = (sw - mu) / sd
    bounded = np.clip(standardized, -clip, clip)
    return bounded.astype(np.float32)

sensor_mean, sensor_std = compute_sensor_stats(train_s)
print("sensor_mean:", sensor_mean)
print("sensor_std :", sensor_std)


sensor_mean: [11.67935  16.089    27.73555  32.233047  6.949652 25.268726 14.244119
  6.012115]
sensor_std : [ 8.065869  9.157165 17.152046  8.07791  18.231245 34.726257 23.67867
 13.208   ]


#8) Dataset/DataLoader + 속도검증

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import time

IMG_SIZE = 224
IMG_MEAN = 43.715
IMG_STD  = 4.673

def resize_img(img_120x160: np.ndarray, out=IMG_SIZE) -> np.ndarray:
    """Bilinearly resize a 2-D image to ``(out, out)`` and return a NumPy array."""
    # interpolate expects (N, C, H, W), so add batch and channel axes.
    batch = torch.tensor(img_120x160, dtype=torch.float32)[None, None]
    resized = torch.nn.functional.interpolate(
        batch,
        size=(out, out),
        mode="bilinear",
        align_corners=False,
    )
    return resized[0, 0].numpy()

class ForecastSingleDataset(Dataset):
    """Forecasting dataset: PAST sensor steps + 3 thermal frames -> label at t+HORIZON.

    Each item is built from one ``(episode_index, t)`` sample and contains:

    * ``sensor_window``: (PAST, 8) z-scored, clipped sensor readings ending at t.
    * ``ir_images``: (3, IMG_SIZE, IMG_SIZE) frames at t-29, t-14 and t,
      normalised with IMG_MEAN / IMG_STD.
    * ``target``: the label of the frame at t+HORIZON, as a float scalar.

    Args:
        samples_list: sequence of ``(episode_index, t)`` pairs.
        label_loader: optional callable ``path -> label or None`` used to read
            the target. When omitted, the module-level ``load_label_json`` is
            looked up at item time, as before. Pass a stricter parser here
            instead of editing the class.
    """

    def __init__(self, samples_list, label_loader=None):
        self.samples = samples_list
        self.label_loader = label_loader

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        epi_idx, t = self.samples[idx]
        _, ks = episodes[epi_idx]

        # Sensor window (PAST, 8): the PAST steps ending at t.
        win_keys = ks[t-(PAST-1):t+1]
        sw = np.stack([load_sensor_csv(records[k]["csv"]) for k in win_keys], axis=0).astype(np.float32)
        sw = zscore_clip_sensor(sw, sensor_mean, sensor_std, clip=3.0)

        # Image stack (3, IMG_SIZE, IMG_SIZE): frames at t-29, t-14 and t.
        im1 = resize_img(load_thermal_cached(records[ks[t-29]]["bin"]))
        im2 = resize_img(load_thermal_cached(records[ks[t-14]]["bin"]))
        im3 = resize_img(load_thermal_cached(records[ks[t]]["bin"]))
        img = np.stack([im1, im2, im3], axis=0).astype(np.float32)

        # Z-score in Celsius with the fixed IMG_MEAN / IMG_STD constants.
        img = (img - IMG_MEAN) / (IMG_STD + 1e-8)

        # Target: label of the frame HORIZON steps after t.
        loader = self.label_loader if self.label_loader is not None else load_label_json
        label_path = records[ks[t+HORIZON]]["json"]
        label = loader(label_path)
        if label is None:
            # Samples are expected to be pre-filtered; fail with the path
            # rather than a bare float(None) TypeError.
            raise ValueError(f"No label could be parsed from {label_path}")
        y = float(label)

        return {
            "sensor_window": torch.tensor(sw, dtype=torch.float32),
            "ir_images": torch.tensor(img, dtype=torch.float32),
            "target": torch.tensor(y, dtype=torch.float32),
        }

# One dataset per split.
ds_train = ForecastSingleDataset(train_s)
ds_val   = ForecastSingleDataset(val_s)
ds_test  = ForecastSingleDataset(test_s)

# Only the training loader shuffles.
dl_train = DataLoader(ds_train, batch_size=32, shuffle=True,  num_workers=2, pin_memory=True)
dl_val   = DataLoader(ds_val,   batch_size=32, shuffle=False, num_workers=2, pin_memory=True)
dl_test  = DataLoader(ds_test,  batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

# Sanity check: shapes and target of a single item.
b = ds_train[0]
print("sample check:", b["sensor_window"].shape, b["ir_images"].shape, b["target"].item())

# Loading-speed check: time the first five training batches.
t0 = time.time()
for i, batch in enumerate(dl_train):
    if i == 5:
        break
print("5 batches load time(sec):", time.time() - t0)


sample check: torch.Size([30, 8]) torch.Size([3, 224, 224]) 1.0
5 batches load time(sec): 2.4176673889160156


#9) MMT 모델 + 학습 루프(안전장치 포함: steps 로그)
# ❌ 아래 코드 학습 잘못된거임!!! 맨 마지막 페이지 봐야함!!!

In [None]:
import torch.nn as nn
import torch.optim as optim
import numpy as np

class MMT(nn.Module):
    """Transformer encoder over image-patch and sensor tokens with a scalar output.

    The input sequence is ``[CLS] + image patch tokens + sensor step tokens``.
    The prediction is a linear head applied to the encoded CLS token.
    """

    def __init__(self, d_model=256, nhead=8, layers=4, patch=16):
        super().__init__()
        self.patch = patch
        # Number of non-overlapping patches per image (196 when IMG_SIZE=224, patch=16).
        self.img_tokens = (IMG_SIZE // patch) * (IMG_SIZE // patch)  # 196

        # A flattened patch has patch*patch*3 values (3 image channels assumed).
        self.img_proj = nn.Linear(patch*patch*3, d_model)
        # Each sensor step has 8 features.
        self.sen_proj = nn.Linear(8, d_model)

        # Learned CLS token, and positional embedding for CLS + patches + PAST steps.
        self.cls = nn.Parameter(torch.zeros(1,1,d_model))
        self.pos = nn.Parameter(torch.zeros(1,1+self.img_tokens+PAST, d_model))

        enc = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.enc = nn.TransformerEncoder(enc, num_layers=layers)

        self.head = nn.Linear(d_model, 1)
        nn.init.trunc_normal_(self.cls, std=0.02)
        nn.init.trunc_normal_(self.pos, std=0.02)

    def img_to_tokens(self, x):
        """Cut ``x`` (B, C, H, W) into patch x patch tiles and project each to d_model."""
        B,C,H,W = x.shape
        p = self.patch
        # Shape comments below assume C=3, H=W=224 and patch=16.
        patches = x.unfold(2,p,p).unfold(3,p,p)          # (B,3,14,14,16,16)
        patches = patches.permute(0,2,3,1,4,5).contiguous()
        # The view requires the patch count to equal self.img_tokens.
        patches = patches.view(B, self.img_tokens, C*p*p) # (B,196,3*256)
        return self.img_proj(patches)                     # (B,196,d)

    def forward(self, sensor_window, ir_images):
        """Return one scalar prediction per batch element, shape (B,)."""
        B = sensor_window.size(0)
        tok_img = self.img_to_tokens(ir_images)           # (B,196,d)
        tok_sen = self.sen_proj(sensor_window)            # (B,30,d)

        cls = self.cls.expand(B,-1,-1)                    # (B,1,d)
        x = torch.cat([cls, tok_img, tok_sen], dim=1)     # (B,227,d)
        x = x + self.pos[:, :x.size(1), :]
        h = self.enc(x)
        # Position 0 holds the CLS token; its encoding feeds the head.
        return self.head(h[:,0]).squeeze(1)               # (B,)

def evaluate(model, loader, device):
    """Return the mean squared error of ``model`` over every sample in ``loader``.

    The loss is summed over samples and divided by the sample count. Averaging
    per-batch means, as before, over-weights a short final batch and disagrees
    with the sample-weighted training loss.

    Args:
        model: module called as ``model(sensor_window, ir_images)``.
        loader: iterable of dict batches with keys ``sensor_window``,
            ``ir_images`` and ``target``.
        device: device the batch tensors are moved to.

    Returns:
        The MSE as a Python float, or ``1e9`` when the loader yields no samples.
    """
    model.eval()
    loss_fn = nn.MSELoss(reduction="sum")
    total = 0.0
    count = 0
    with torch.no_grad():
        for batch in loader:
            sen = batch["sensor_window"].to(device)
            img = batch["ir_images"].to(device)
            y = batch["target"].to(device)
            yhat = model(sen, img)
            total += loss_fn(yhat, y).item()
            count += y.numel()
    return total / count if count else 1e9

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)

model = MMT().to(device)
opt = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
loss_fn = nn.MSELoss()

# Best validation loss seen so far; the matching weights are saved below.
best = 1e9
EPOCHS = 5

for epoch in range(1, EPOCHS+1):
    model.train()
    # Running sum of per-sample loss, and the sample count for this epoch.
    run=0.0; n=0

    for step, batch in enumerate(dl_train, 1):
        sen = batch["sensor_window"].to(device)
        img = batch["ir_images"].to(device)
        y   = batch["target"].to(device)

        opt.zero_grad(set_to_none=True)
        yhat = model(sen, img)
        loss = loss_fn(yhat, y)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        # Weight the batch-mean loss by batch size for a per-sample epoch mean.
        run += loss.item() * sen.size(0)
        n += sen.size(0)

        # Progress log every 200 steps.
        if step % 200 == 0:
            print(f"  step {step} | batch_loss {loss.item():.4f}")

    tr_loss = run/max(n,1)
    va_loss = evaluate(model, dl_val, device)
    print(f"Epoch {epoch} | train {tr_loss:.4f} | val {va_loss:.4f}")

    # Checkpoint whenever the validation loss improves.
    if va_loss < best:
        best = va_loss
        torch.save(model.state_dict(), "/content/mmt_best.pt")
        print("  saved best.")


device: cuda
  step 200 | batch_loss 0.0007
  step 400 | batch_loss 0.0004
  step 600 | batch_loss 0.0001
  step 800 | batch_loss 0.0001
  step 1000 | batch_loss 0.0004
  step 1200 | batch_loss 0.0001
  step 1400 | batch_loss 0.0003
  step 1600 | batch_loss 0.0001
  step 1800 | batch_loss 0.0001
  step 2000 | batch_loss 0.0000
  step 2200 | batch_loss 0.0001
  step 2400 | batch_loss 0.0002
Epoch 1 | train 0.0151 | val 0.0007
  saved best.
  step 200 | batch_loss 0.0000
  step 400 | batch_loss 0.0000
  step 600 | batch_loss 0.0000
  step 800 | batch_loss 0.0000
  step 1000 | batch_loss 0.0000
  step 1200 | batch_loss 0.0001
  step 1400 | batch_loss 0.0001
  step 1600 | batch_loss 0.0001
  step 1800 | batch_loss 0.0000
  step 2000 | batch_loss 0.0000
  step 2200 | batch_loss 0.0000
  step 2400 | batch_loss 0.0001
Epoch 2 | train 0.0001 | val 0.0012
  step 200 | batch_loss 0.0000
  step 400 | batch_loss 0.0002
  step 600 | batch_loss 0.0000
  step 800 | batch_loss 0.0001
  step 1000 | bat

#10) 테스트 평가 + (선택) 라벨 반올림 정확도

In [None]:
# Load the best checkpoint saved during training.
model.load_state_dict(torch.load("/content/mmt_best.pt", map_location=device))

test_mse = evaluate(model, dl_test, device)
print("TEST MSE:", test_mse)

# For reference: round predictions to labels 0..3 and report accuracy too.
model.eval()
ys=[]; yh=[]
with torch.no_grad():
    for batch in dl_test:
        sen = batch["sensor_window"].to(device)
        img = batch["ir_images"].to(device)
        y   = batch["target"].cpu().numpy()
        pred = model(sen,img).cpu().numpy()
        ys.append(y); yh.append(pred)

ys = np.concatenate(ys)
yh = np.concatenate(yh)

# Round to the nearest integer and clamp to the valid label range.
yh_round = np.clip(np.rint(yh), 0, 3)
acc = (yh_round == ys).mean()
mae = np.mean(np.abs(yh - ys))
print("TEST rounded accuracy:", acc)
print("TEST MAE:", mae)


TEST MSE: 7.574634672008114e-05
TEST rounded accuracy: 1.0
TEST MAE: 0.008703235


---

#지금 결과는 “너무 좋아서 의심해야 하는” 전형적인 패턴
* rounded accuracy = 1.0, train loss가 0.0000에 가깝고, MAE≈0.0087이면 거의 완벽 예측인데, 현실 데이터(열화 예지/상태 예측)에서는 보통 이렇게 안 나와.

* 즉, 모델이 천재라서라기보다 누수(leakage) 또는 라벨이 사실상 고정/

In [None]:
ok=0
for p in need_bins:
    ok += int(cache_one(p))
print("cached total:", ok, "/", len(need_bins))


cached total: 94436 / 94436


#검증

# 가장 흔한 원인 3가지


(A) 라벨이 에피소드 내에서 거의 변하지 않는다

에피소드마다 상태(0~3)가 거의 고정이면, 미래(t+30)도 지금(t)과 같아서 그냥 “현재 상태”만 알아도 맞춤.

✅ 체크: 에피소드 내 라벨 변화율(전이 횟수) 보기



(B) 우리가 만든 “에피소드” 정의가 너무 커서, split이 사실상 같은 흐름을 공유한다

episode split 자체는 했지만, episode를 (prefix,id,mmdd)로 묶었잖아.
만약 같은 운행이 날짜/폴더가 달라져도 같은 시퀀스가 분산되어 있거나, 혹은 파일명 규칙이 섞여 “다른 에피소드”로 잘못 쪼개졌다면 누수가 생길 수 있어.

✅ 체크: train/val/test에 동일한 img-id나 동일 stem이 들어가는지 확인



(C) 라벨 JSON을 잘못 파싱해서 “항상 0 또는 1” 같은 값만 뽑는 경우

load_label_json()이 중첩 구조에서 엉뚱한 숫자(예: 장비번호, 누적일수)를 라벨로 착각해 거의 일정한 값만 반환할 수 있어.

✅ 체크: 라벨 분포(0/1/2/3)와 JSON 원문에서 어떤 경로로 뽑혔는지 샘플링 확인

#검증 1) 라벨 분포(Train/Val/Test 각각)

In [None]:
import numpy as np

def label_dist(samples_list, n=20000):
    """Count target labels (``load_label_json``) over the first ``n`` samples.

    Returns a dict mapping each distinct label to its count.
    """
    targets = [
        load_label_json(records[episodes[epi_idx][1][t + HORIZON]]["json"])
        for epi_idx, t in samples_list[:n]
    ]
    labels, counts = np.unique(np.array(targets), return_counts=True)
    return dict(zip(labels.tolist(), counts.tolist()))

print("train label dist:", label_dist(train_s))
print("val   label dist:", label_dist(val_s))
print("test  label dist:", label_dist(test_s))


train label dist: {1: 20000}
val   label dist: {1: 9469}
test  label dist: {1: 11333}


#검증 2) “현재(t)” 라벨과 “미래(t+30)” 라벨이 얼마나 같은지

In [None]:
def same_rate(samples_list, n=20000):
    """Fraction of the first ``n`` samples whose label at t equals the label at t+HORIZON.

    Samples where either label is None are ignored. Returns None when no
    sample is usable.
    """
    matches = []
    for epi_idx, t in samples_list[:n]:
        ks = episodes[epi_idx][1]
        y_now = load_label_json(records[ks[t]]["json"])
        y_fut = load_label_json(records[ks[t + HORIZON]]["json"])
        if y_now is not None and y_fut is not None:
            matches.append(y_now == y_fut)
    if not matches:
        return None
    return sum(matches) / len(matches)

print("train y(t)==y(t+30):", same_rate(train_s))
print("val   y(t)==y(t+30):", same_rate(val_s))
print("test  y(t)==y(t+30):", same_rate(test_s))


train y(t)==y(t+30): 1.0
val   y(t)==y(t+30): 1.0
test  y(t)==y(t+30): 1.0


#검증 3) 에피소드 내부 라벨 전이(변화) 횟수

In [None]:
def episode_transitions(epi_idx, max_len=1200):
    """Count label changes over the first ``max_len`` frames of one episode.

    Labels come from ``load_label_json``; frames whose label is None are
    dropped before counting.

    Returns:
        ``(number_of_labelled_frames, number_of_transitions)``.
    """
    _, ks = episodes[epi_idx]
    limit = min(len(ks), max_len)
    parsed = (load_label_json(records[ks[i]]["json"]) for i in range(limit))
    labels = [y for y in parsed if y is not None]
    changes = sum(1 for prev, cur in zip(labels, labels[1:]) if prev != cur)
    return len(labels), changes

# 몇 개만 보기
for epi_idx in [0, 1, 2, 3, 4]:
    L, tr = episode_transitions(epi_idx)
    print(f"episode {epi_idx}: len={L}, transitions={tr}")


episode 0: len=361, transitions=0
episode 1: len=903, transitions=0
episode 2: len=1200, transitions=0
episode 3: len=1200, transitions=0
episode 4: len=1200, transitions=0


In [None]:
print("y true min/max:", ys.min(), ys.max())
print("y pred min/max:", yh.min(), yh.max())
print("pred sample:", yh[:10])


y true min/max: 1.0 1.0
y pred min/max: 1.0085742 1.0087517
pred sample: [1.0087075 1.0086893 1.0086706 1.0087031 1.0087147 1.0086861 1.0087025
 1.0086948 1.0086625 1.0086873]


#결론

 파이프라인에서 “라벨(y)”은

* train/val/test 전부 1만 존재하고

* y(t) == y(t+30) 항상 참

* 에피소드 내부 전이(변화) 0

즉 모델이 잘한 게 아니라, 우리가 가져온 y가 ‘항상 1’로 고정돼 있어서 정확도 1.0이 나온 거야.

이건 2가지 중 하나야:

1. 라벨 JSON에서 우리가 뽑는 값이 “진짜 라벨”이 아니라, JSON 안에 있는 어떤 숫자(예: 장비ID/운영일/카운트 등)를 잘못 집어서 항상 1이 나오는 경우

2. AIHub 라벨 파일 자체가 “정상(1)”만 포함된 subset인데, 그럼 데이터셋 목표와 안 맞음(가능성 낮음).
→ 네 데이터 규모/구성상 1)일 확률이 거의 확실.

# “진짜 라벨 경로”를 찾아서 파서를 고정해야 함

지금 필요한 건 라벨 JSON 구조를 실제로 확인해서,

어떤 키/경로에 0~3이 있는지 찾아내고

그 경로로만 라벨을 읽도록 load_label_json()을 바꾸는 것.

#1) 라벨 JSON 30개를 “키 구조”까지 자동 분석해서 후보를 뽑는 코드

In [None]:
import json
from pathlib import Path
from collections import defaultdict, Counter
import random

# complete_keys에서 json 경로 샘플링
json_paths = [records[k]["json"] for k in complete_keys]
random.seed(42)
sample_paths = random.sample(json_paths, 30)

def collect_paths(obj, prefix=""):
    """Flatten a nested JSON-like object into ``(path, leaf_value)`` pairs.

    Dict keys are appended as ``.key`` and list items as ``[i]``. Only the
    first 20 items of any list are visited.
    """
    if isinstance(obj, dict):
        children = [(f".{k}", v) for k, v in obj.items()]
    elif isinstance(obj, list):
        children = [(f"[{i}]", v) for i, v in enumerate(obj[:20])]
    else:
        return [(prefix, obj)]

    leaves = []
    for suffix, child in children:
        leaves.extend(collect_paths(child, prefix + suffix))
    return leaves

path_counter = Counter()
value_examples = defaultdict(list)

for p in sample_paths:
    try:
        d = json.loads(Path(p).read_text(encoding="utf-8", errors="replace"))
    except:
        continue

    for path, val in collect_paths(d, prefix="$"):
        # 0~3 후보(정수/실수/문자열 모두)
        try:
            vv = int(float(val))
            if 0 <= vv <= 3:
                path_counter[path] += 1
                if len(value_examples[path]) < 5:
                    value_examples[path].append(vv)
        except:
            pass

print("=== candidate paths that yield values in [0..3] ===")
for path, cnt in path_counter.most_common(20):
    print(f"{cnt:>2}x  {path}  examples={value_examples[path]}")


=== candidate paths that yield values in [0..3] ===
30x  $.meta_info[0].duration_time  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].PM10[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].PM2.5[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].PM1.0[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].NTC[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].CT1[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].CT2[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].CT3[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.sensor_data[0].CT4[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.annotations[0].tagging[0].state  examples=[0, 0, 3, 0, 0]
30x  $.external_data[0].ex_temperature[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.external_data[0].ex_humidity[0].trend  examples=[1, 1, 1, 1, 1]
30x  $.external_data[0].ex_illuminance[0].trend  examples=[1, 1, 1, 1, 1]
28x  $.sensor_data[0].CT1[0].value  examples=[2, 2, 2, 1, 2]
23x  $.sensor_data[0].CT3[0].value  examples=[0, 0

#2) 지금 당장 “항상 1”이 뽑히는 이유를 확인하는 코드 (현재 파서 디버그)

In [None]:
def debug_label(path: str):
    """Return ``(label parsed by load_label_json, raw decoded JSON)`` for one file."""
    parsed_doc = json.loads(Path(path).read_text(encoding="utf-8", errors="replace"))
    return load_label_json(path), parsed_doc

for p in sample_paths[:5]:
    y, d = debug_label(p)
    print("json:", p)
    print("parsed y:", y)
    # 상위 키만 출력
    if isinstance(d, dict):
        print("top keys:", list(d.keys())[:30])
    print("-"*60)


json: /content/data_71802_local/extracted/VL_oht_18_oht18_0826_2155/oht18_0826_215736.json
parsed y: 1
top keys: ['meta_info', 'sensor_data', 'ir_data', 'annotations', 'external_data']
------------------------------------------------------------
json: /content/data_71802_local/extracted/TL_oht_15_oht15_0826_2126/oht15_0826_212628.json
parsed y: 1
top keys: ['meta_info', 'sensor_data', 'ir_data', 'annotations', 'external_data']
------------------------------------------------------------
json: /content/data_71802_local/extracted/TL_oht_02_oht02_0920_1020/oht02_0920_102525.json
parsed y: 1
top keys: ['meta_info', 'sensor_data', 'ir_data', 'annotations', 'external_data']
------------------------------------------------------------
json: /content/data_71802_local/extracted/TL_agv_14_agv14_0903_0650/agv14_0903_065301.json
parsed y: 1
top keys: ['meta_info', 'sensor_data', 'ir_data', 'annotations', 'external_data']
------------------------------------------------------------
json: /content/d

✅ 진짜 라벨:
 `$.annotations[0].tagging[0].state` (예시가 0, 3 등으로 변함)

#1) 라벨 파서 교체 (정답: annotations→tagging→state)

In [None]:
import json
from pathlib import Path

def load_label_json_strict(path: str):
    """Read the label from the fixed path ``$.annotations[0].tagging[0].state``.

    Returns:
        The label as an int in 0..3, or ``None`` when the file cannot be read
        or parsed, the path is absent, or the value is not a number in range.
        Callers filter out the ``None`` samples.
    """
    try:
        d = json.loads(Path(path).read_text(encoding="utf-8", errors="replace"))
    except (OSError, ValueError):
        # Unreadable file or invalid JSON (JSONDecodeError is a ValueError).
        return None

    try:
        v = int(float(d["annotations"][0]["tagging"][0]["state"]))
    except (KeyError, IndexError, TypeError, ValueError, OverflowError):
        # Missing key or index, wrong container type, or a non-numeric value.
        return None
    return v if 0 <= v <= 3 else None


#2) 라벨 분포/전이 재검증 (이제 정상이어야 함)

In [None]:
import numpy as np
import random

def label_dist_strict(samples_list, n=20000):
    """Count strict target labels over the first ``n`` samples.

    Samples whose strict label is None are skipped.

    Returns:
        ``(dict mapping label -> count, number of labelled samples)``.
    """
    parsed = (
        load_label_json_strict(records[episodes[epi_idx][1][t + HORIZON]]["json"])
        for epi_idx, t in samples_list[:n]
    )
    kept = np.array([y for y in parsed if y is not None])
    labels, counts = np.unique(kept, return_counts=True)
    return dict(zip(labels.tolist(), counts.tolist())), len(kept)

# samples_valid 다시 만들기
samples_valid=[]
for epi_idx, t in samples:
    _, ks = episodes[epi_idx]
    y = load_label_json_strict(records[ks[t+HORIZON]]["json"])
    if y is not None:
        samples_valid.append((epi_idx,t))

print("samples_valid (strict):", len(samples_valid))

# 분포 보기(전체 samples_valid 기준이 아니라, split 전이라 임시로 일부만)
dist, m = label_dist_strict(samples_valid, n=min(20000, len(samples_valid)))
print("label dist (strict, sample):", dist, "n_used=", m)

# y(t)==y(t+30) 비율 다시 보기
def same_rate_strict(samples_list, n=20000):
    """Fraction of the first ``n`` samples whose strict label at t equals the one at t+HORIZON.

    This is the accuracy of a "copy the current label" baseline. Samples with
    a None label on either side are ignored. Returns None when no sample is
    usable.
    """
    matches = []
    for epi_idx, t in samples_list[:n]:
        ks = episodes[epi_idx][1]
        y_now = load_label_json_strict(records[ks[t]]["json"])
        y_fut = load_label_json_strict(records[ks[t + HORIZON]]["json"])
        if y_now is not None and y_fut is not None:
            matches.append(y_now == y_fut)
    if not matches:
        return None
    return sum(matches) / len(matches)

print("y(t)==y(t+30) strict:", same_rate_strict(samples_valid, n=min(20000, len(samples_valid))))

# 에피소드 전이 다시 확인
def episode_transitions_strict(epi_idx, max_len=1200):
    """Count strict-label changes over the first ``max_len`` frames of one episode.

    Frames whose strict label is None are dropped before counting.

    Returns:
        ``(number_of_labelled_frames, number_of_transitions, first_20_labels)``.
    """
    _, ks = episodes[epi_idx]
    limit = min(len(ks), max_len)
    parsed = (load_label_json_strict(records[ks[i]]["json"]) for i in range(limit))
    labels = [y for y in parsed if y is not None]
    changes = sum(1 for prev, cur in zip(labels, labels[1:]) if prev != cur)
    return len(labels), changes, (labels[:20] if labels else [])

for epi_idx in [0,1,2,3,4]:
    L,tr,head = episode_transitions_strict(epi_idx)
    print(f"episode {epi_idx}: len={L}, transitions={tr}, head={head}")


samples_valid (strict): 100955
label dist (strict, sample): {0: 7980, 1: 5108, 2: 5082, 3: 1830} n_used= 20000
y(t)==y(t+30) strict: 0.6685
episode 0: len=361, transitions=3, head=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
episode 1: len=903, transitions=11, head=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
episode 2: len=1200, transitions=15, head=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
episode 3: len=1200, transitions=15, head=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
episode 4: len=1200, transitions=15, head=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


#3) split 다시 만들기 (strict samples_valid 기준)

In [None]:
import random
from collections import defaultdict

SEED = 42
random.seed(SEED)

def split_by_episode(samples_valid, ratios=(0.8,0.1,0.1), seed=SEED):
    """Split samples into train/val/test so that no episode spans two splits.

    Episode ids are shuffled with a private ``random.Random(seed)``. This
    yields the same order as seeding the global generator with ``seed``, but
    no longer resets the process-wide RNG as a side effect.

    Args:
        samples_valid: iterable of ``(episode_index, t)`` pairs.
        ratios: train/val/test fractions of the episode count. Train and val
            sizes are truncated; test receives the remaining episodes.
        seed: seed for the episode shuffle.

    Returns:
        ``(train, val, test, (n_train_epi, n_val_epi, n_test_epi))``.
    """
    rng = random.Random(seed)
    epi_to = defaultdict(list)
    for e, t in samples_valid:
        epi_to[e].append((e, t))
    epi_ids = list(epi_to)
    rng.shuffle(epi_ids)
    n = len(epi_ids)
    n_tr = int(n * ratios[0])
    n_va = int(n * ratios[1])
    # Sets are kept so the sample order inside each split matches the
    # previous behaviour.
    tr = set(epi_ids[:n_tr])
    va = set(epi_ids[n_tr:n_tr + n_va])
    te = set(epi_ids[n_tr + n_va:])
    train = [s for e in tr for s in epi_to[e]]
    val = [s for e in va for s in epi_to[e]]
    test = [s for e in te for s in epi_to[e]]
    return train, val, test, (len(tr), len(va), len(te))

train_s, val_s, test_s, epi_counts = split_by_episode(samples_valid)
print("episode split:", epi_counts)
print("sample split :", len(train_s), len(val_s), len(test_s))

# Pass n explicitly so each histogram covers the WHOLE split. With the default
# n the recorded train histogram summed to exactly 20,000 of 80,153 samples,
# i.e. it described only the first few episodes rather than the train split.
print("train label dist:", label_dist_strict(train_s, n=len(train_s))[0])
print("val   label dist:", label_dist_strict(val_s, n=len(val_s))[0])
print("test  label dist:", label_dist_strict(test_s, n=len(test_s))[0])


episode split: (148, 18, 19)
sample split : 80153 9469 11333
train label dist: {0: 7966, 1: 5118, 2: 5116, 3: 1800}
val   label dist: {0: 4715, 1: 1994, 2: 2010, 3: 750}
test  label dist: {0: 4502, 1: 2879, 2: 2902, 3: 1050}


In [None]:
# Target lookup to use inside the Dataset: strict label at t+HORIZON, as float.
# NOTE(review): this cell defines neither `ks` nor `t`; presumably it is a
# fragment meant to be pasted into __getitem__ rather than run on its own.
y = float(load_label_json_strict(records[ks[t+HORIZON]]["json"]))


이제야 “예측 문제”가 제대로 정의된 상태야.

* 라벨 분포가 0~3으로 잘 나뉨(샘플 기준 0이 가장 많고 3이 가장 적은 불균형 존재)

* y(t)==y(t+30)이 0.6685 → “그냥 현재값 복사” 베이스라인 정확도는 약 66.9%

* 에피소드 전이도 생김(3~15회) → 미래예측 의미 있음

---

이제 해야 할 건 딱 2가지 수정이야:

1. Dataset의 target을 load_label_json_strict로 교체

2. 평가는 회귀(MSE/MAE) + 분류(rounded accuracy + confusion)를 같이 보되, 불균형 때문에 macro F1도 같이 보는 게 좋아.

#A) split 다시 만들고(train/val/test)

In [None]:
# Rebuild the episode-level train/val/test split from the strict valid samples
# (same call as the earlier split cell).
train_s, val_s, test_s, epi_counts = split_by_episode(samples_valid)
print("episode split:", epi_counts)
print("sample split :", len(train_s), len(val_s), len(test_s))


episode split: (148, 18, 19)
sample split : 80153 9469 11333


#B) Dataset 수정: target만 strict로

In [None]:
# Before:
# y = float(load_label_json(records[ks[t+HORIZON]]["json"]))

# After: same lookup, but parsed with the strict label reader.
y = float(load_label_json_strict(records[ks[t+HORIZON]]["json"]))


In [None]:
# Spot-check a single item: print the target of the first training sample.
b = ds_train[0]
print("target example:", b["target"].item())


target example: 1.0


#C) (권장) 불균형 보정: weighted loss로 학습 안정화

In [None]:
import numpy as np
from collections import Counter

def get_train_label_counts(train_s, max_n=50000):
    """Count strict t+HORIZON labels over the first ``max_n`` training samples.

    Args:
        train_s: sequence of ``(episode_index, t)`` pairs.
        max_n: only ``train_s[:max_n]`` is inspected.

    Returns:
        ``collections.Counter`` mapping integer label -> occurrence count.

    Samples whose strict label is ``None`` are skipped. Previously such a
    sample crashed the whole count with ``int(None)``; the other label helpers
    in this notebook already skip ``None``.
    """
    counts = Counter()
    for epi_idx, t in train_s[:max_n]:
        _, ks = episodes[epi_idx]
        y = load_label_json_strict(records[ks[t + HORIZON]]["json"])
        if y is None:
            continue
        counts[int(y)] += 1
    return counts

cnt = get_train_label_counts(train_s)
print("train label counts:", cnt)

# Inverse-frequency weights for classes 0..3.
total = sum(cnt.values())
# A class absent from `cnt` is treated as count 1, which gives it a very large weight.
w = np.array([total/(cnt.get(i,1)) for i in range(4)], dtype=np.float32)
w = w / w.mean()  # rescale so the mean weight is 1
print("class weights:", w)
# NOTE: `torch` is not imported in this cell; it must already be in scope.
class_w = torch.tensor(w)


train label counts: Counter({0: 20537, 2: 12536, 1: 12457, 3: 4470})
class weights: [0.4503855 0.7425196 0.7378404 2.0692544]


In [None]:
import numpy as np
from collections import Counter

def get_train_label_counts(train_s, max_n=50000):
    """Count strict t+HORIZON labels over the first ``max_n`` training samples.

    Args:
        train_s: sequence of ``(episode_index, t)`` pairs.
        max_n: only ``train_s[:max_n]`` is inspected.

    Returns:
        ``collections.Counter`` mapping integer label -> occurrence count.

    Samples whose strict label is ``None`` are skipped. Previously such a
    sample crashed the whole count with ``int(None)``; the other label helpers
    in this notebook already skip ``None``.
    """
    counts = Counter()
    for epi_idx, t in train_s[:max_n]:
        _, ks = episodes[epi_idx]
        y = load_label_json_strict(records[ks[t + HORIZON]]["json"])
        if y is None:
            continue
        counts[int(y)] += 1
    return counts

cnt = get_train_label_counts(train_s)
print("train label counts:", cnt)

# Inverse-frequency weights for classes 0..3.
total = sum(cnt.values())
# A class absent from `cnt` is treated as count 1, which gives it a very large weight.
w = np.array([total/(cnt.get(i,1)) for i in range(4)], dtype=np.float32)
w = w / w.mean()  # rescale so the mean weight is 1
print("class weights:", w)
# NOTE: `torch` is not imported in this cell; it must already be in scope.
class_w = torch.tensor(w)


train label counts: Counter({0: 20537, 2: 12536, 1: 12457, 3: 4470})
class weights: [0.4503855 0.7425196 0.7378404 2.0692544]


In [None]:
import torch
import torch.nn as nn

class WeightedMSE(nn.Module):
    """Mean squared error where each sample is scaled by its class weight.

    The class of a sample is its target rounded to the nearest integer and
    clamped to the valid index range of the weight vector.

    Args:
        class_w: 1-D tensor of per-class weights; stored as a float buffer so
            it follows the module across devices.
    """

    def __init__(self, class_w: torch.Tensor):
        super().__init__()
        self.register_buffer("w", class_w.float())

    def forward(self, pred, target):
        """Return the weighted MSE between ``pred`` and ``target`` (a scalar)."""
        # A (B, 1) prediction against a (B,) target would silently broadcast
        # to a (B, B) error matrix. Align shapes when the element counts
        # agree; otherwise keep ordinary broadcasting.
        if pred.shape != target.shape and pred.numel() == target.numel():
            pred = pred.reshape(target.shape)

        # Upper bound follows the weight vector instead of a hard-coded 3, so
        # any number of classes indexes safely.
        idx = torch.clamp(target.round().long(), 0, self.w.numel() - 1)
        ww = self.w[idx]
        return torch.mean(ww * (pred - target) ** 2)

# Build the training loss; relies on `class_w` and `device` already being defined.
loss_fn = WeightedMSE(class_w.to(device))


#D) 평가: MSE/MAE + rounded accuracy + confusion + macro F1

In [None]:
import numpy as np

def eval_metrics(model, loader, device, num_classes=4):
    """Evaluate a scalar-output model with regression and rounded-class metrics.

    Args:
        model: module called as ``model(sensor_window, ir_images)``.
        loader: iterable of batches with keys ``"sensor_window"``,
            ``"ir_images"`` and ``"target"``.
        device: device the inputs are moved to.
        num_classes: number of integer classes; predictions are rounded and
            clipped into ``[0, num_classes - 1]``.

    Returns:
        dict with ``mse``, ``mae``, ``acc_round``, ``macro_f1`` and ``cm``
        (``num_classes x num_classes`` confusion matrix, rows = true class).

    Raises:
        ValueError: if ``loader`` yields no batches.

    Differences from the previous version:
      * predictions and targets are flattened per batch, so a ``(B, 1)``
        model output no longer broadcasts against ``(B,)`` targets;
      * targets are rounded rather than truncated when cast to int;
      * macro-F1 averages only over classes that occur in the targets or the
        predictions. Averaging over absent classes reported 0.25 for a
        perfect single-class result.
    """
    model.eval()
    ys = []
    yh = []
    with torch.no_grad():
        for batch in loader:
            sen = batch["sensor_window"].to(device)
            img = batch["ir_images"].to(device)
            ys.append(batch["target"].cpu().numpy().reshape(-1))
            yh.append(model(sen, img).cpu().numpy().reshape(-1))
    if not ys:
        raise ValueError("eval_metrics: loader produced no batches")

    y = np.concatenate(ys)
    p = np.concatenate(yh)

    mse = np.mean((p - y) ** 2)
    mae = np.mean(np.abs(p - y))

    pr = np.clip(np.rint(p), 0, num_classes - 1).astype(int)
    yt = np.rint(y).astype(int)

    acc = (pr == yt).mean()

    # Confusion matrix: rows are true classes, columns are predictions.
    cm = np.zeros((num_classes, num_classes), dtype=int)
    np.add.at(cm, (yt, pr), 1)

    # Per-class F1 = 2TP / (2TP + FP + FN); a zero denominator means the
    # class never appears in targets or predictions.
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    denom = 2 * tp + fp + fn
    present = denom > 0
    macro_f1 = float((2 * tp[present] / denom[present]).mean()) if present.any() else 0.0

    return {"mse": float(mse), "mae": float(mae), "acc_round": float(acc), "macro_f1": macro_f1, "cm": cm}

# After training: report metrics on the validation and test loaders.
print("VAL :", eval_metrics(model, dl_val, device))
print("TEST:", eval_metrics(model, dl_test, device))


VAL : {'mse': 7.575827476102859e-05, 'mae': 0.008703900501132011, 'acc_round': 1.0, 'macro_f1': 0.24999999987497357, 'cm': array([[   0,    0,    0,    0],
       [   0, 9469,    0,    0],
       [   0,    0,    0,    0],
       [   0,    0,    0,    0]])}
TEST: {'mse': 7.574661867693067e-05, 'mae': 0.008703234605491161, 'acc_round': 1.0, 'macro_f1': 0.24999999987497795, 'cm': array([[    0,     0,     0,     0],
       [    0, 11333,     0,     0],
       [    0,     0,     0,     0],
       [    0,     0,     0,     0]])}


---

#E) 가장 중요한 베이스라인 2개(꼭 비교해야 함)

In [None]:
def baseline_hold(samples_list, n=20000):
    """Accuracy of predicting the t+HORIZON label by copying the label at t.

    Args:
        samples_list: sequence of ``(episode_index, t)`` pairs.
        n: only ``samples_list[:n]`` is evaluated.

    Returns:
        Fraction of evaluated samples whose current label equals the future
        label, or ``nan`` when no sample has both labels.

    Samples where either strict label is ``None`` are skipped.
    """
    ys = []
    pr = []
    for epi_idx, t in samples_list[:n]:
        _, ks = episodes[epi_idx]
        y_now = load_label_json_strict(records[ks[t]]["json"])
        y_fut = load_label_json_strict(records[ks[t + HORIZON]]["json"])
        if y_now is None or y_fut is None:
            continue
        ys.append(y_fut)
        pr.append(y_now)

    # Explicit empty case: avoids numpy's "mean of empty slice" warning.
    if not ys:
        return float("nan")

    # A stray non-ASCII character after this statement made the original
    # function a SyntaxError.
    ys = np.array(ys)
    pr = np.array(pr)
    return (ys == pr).mean()

# Hold baseline on the test split (n caps how many samples are evaluated).
print("baseline hold acc (test):", baseline_hold(test_s, n=20000))


baseline hold acc (test): 0.6744021882996559


In [None]:
def baseline_majority(samples_list, majority=0, n=20000):
    """Accuracy of always predicting ``majority`` for the t+HORIZON label.

    Looks at the first ``n`` ``(episode_index, t)`` pairs, drops those whose
    strict future label is ``None``, and returns the share of the remaining
    labels equal to ``majority``.
    """
    future_labels = []
    for epi_idx, t in samples_list[:n]:
        _, keys = episodes[epi_idx]
        future_labels.append(load_label_json_strict(records[keys[t + HORIZON]]["json"]))

    kept = np.array([label for label in future_labels if label is not None])
    hits = kept == majority
    return hits.mean()

# Majority baseline on the test split, always predicting class 0.
print("baseline majority-0 acc (test):", baseline_majority(test_s, 0, n=20000))


baseline majority-0 acc (test): 0.3972469778522898


#hold baseline 테스트 정확도(정식 계산)

In [None]:
# Same hold baseline with a larger cap. The recorded test split has 11,333
# samples, so both n=20000 and n=50000 already cover all of it.
print("baseline hold acc (test):", baseline_hold(test_s, n=50000))


baseline hold acc (test): 0.6744021882996559


In [None]:
# After ds_train has been built: look at the first 20 targets.
# These are the first samples in dataset order, not a random draw.
tmp = [ds_train[i]["target"].item() for i in range(20)]
print("targets sample:", tmp)
print("unique:", sorted(set(int(x) for x in tmp)))


targets sample: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
unique: [1]


결과

✅ 너 Dataset이 아직도 “옛 load_label_json(항상 1)”을 쓰고 있거나,
혹은 strict 함수로 바꿨더라도 t+HORIZON이 아니라 다른 경로/다른 json을 읽고 있어서 결과가 다시 1로 고정된 거야.

지금 네 검증(라벨 분포, 전이)은 strict 파서로 정상이었잖아.
그런데 Dataset 샘플이 전부 1이면, Dataset 내부 y만 잘못 연결된 것이 확실해.

#✅ 1) Dataset 강제 교체 (strict 라벨 + 디버그 출력 포함)

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset

class ForecastSingleDatasetStrict(Dataset):
    """Forecasting dataset whose target is the strict label at ``t + HORIZON``.

    Each item is a dict with ``"sensor_window"`` (normalised sensor rows for
    the ``PAST`` steps ending at ``t``), ``"ir_images"`` (three resized,
    normalised thermal frames taken at ``t-29``, ``t-14`` and ``t``) and
    ``"target"`` (float32 scalar).

    Args:
        samples_list: sequence of ``(episode_index, t)`` pairs.
        debug: when true, the first five indices print the label JSON path
            and the parsed label.
    """

    def __init__(self, samples_list, debug=False):
        self.samples = samples_list
        self.debug = debug

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        epi_idx, t = self.samples[idx]
        _, ks = episodes[epi_idx]

        # Sensor window: PAST consecutive records ending at t.
        sensor_rows = [load_sensor_csv(records[k]["csv"]) for k in ks[t - (PAST - 1):t + 1]]
        sensor = np.stack(sensor_rows, axis=0).astype(np.float32)
        sensor = zscore_clip_sensor(sensor, sensor_mean, sensor_std, clip=3.0)

        # Image stack: frames 29 and 14 steps back, plus the current frame.
        frames = [
            resize_img(load_thermal_cached(records[ks[t - lag]]["bin"]))
            for lag in (29, 14, 0)
        ]
        images = np.stack(frames, axis=0).astype(np.float32)
        images = (images - IMG_MEAN) / (IMG_STD + 1e-8)

        # Strict target: label of the record HORIZON steps ahead.
        json_path = records[ks[t + HORIZON]]["json"]
        label = load_label_json_strict(json_path)

        # In debug mode show the JSON path and label for the first few items.
        if self.debug and idx < 5:
            print("[DEBUG] idx", idx, "t", t, "json:", json_path, "y:", label)

        # Defensive fallback: a missing label is mapped to 0.
        target_value = 0 if label is None else label

        return {
            "sensor_window": torch.tensor(sensor, dtype=torch.float32),
            "ir_images": torch.tensor(images, dtype=torch.float32),
            "target": torch.tensor(float(target_value), dtype=torch.float32),
        }


#✅ 2) ds_train을 이걸로 다시 만들고, debug로 라벨 확인

In [None]:
# Rebuild ds_train with debug printing on, then inspect the first 50 targets
# (first samples in dataset order, not a random draw).
ds_train = ForecastSingleDatasetStrict(train_s, debug=True)

tmp = [ds_train[i]["target"].item() for i in range(50)]
print("targets sample:", tmp[:20])
print("unique:", sorted(set(int(x) for x in tmp)))


[DEBUG] idx 0 t 29 json: /content/data_71802_local/extracted/TL_agv_06_agv06_0902_1356/agv06_0902_135727.json y: 0
[DEBUG] idx 1 t 30 json: /content/data_71802_local/extracted/TL_agv_06_agv06_0902_1356/agv06_0902_135728.json y: 0
[DEBUG] idx 2 t 31 json: /content/data_71802_local/extracted/TL_agv_06_agv06_0902_1356/agv06_0902_135729.json y: 0
[DEBUG] idx 3 t 32 json: /content/data_71802_local/extracted/TL_agv_06_agv06_0902_1356/agv06_0902_135730.json y: 0
[DEBUG] idx 4 t 33 json: /content/data_71802_local/extracted/TL_agv_06_agv06_0902_1356/agv06_0902_135731.json y: 0
targets sample: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
unique: [0]


#✅ 3) 추가로 “Dataset이 쓰는 strict y”와 “직접 계산한 y”가 같은지 검증

In [None]:
# Label as returned by the dataset.
y_ds = int(ds_train[0]["target"].item())

# The same sample, computed directly from the JSON record at t+HORIZON.
epi_idx, t = train_s[0]
_, ks = episodes[epi_idx]
y_direct = load_label_json_strict(records[ks[t+HORIZON]]["json"])

print("y_ds:", y_ds)
print("y_direct:", y_direct)


[DEBUG] idx 0 t 29 json: /content/data_71802_local/extracted/TL_agv_06_agv06_0902_1356/agv06_0902_135727.json y: 0
y_ds: 0
y_direct: 0


지금 debug를 보면:

t=29인데 라벨을 읽는 json이 ...135727.json

즉 “현재 t 시점의 json”을 읽고 있어.

그런데 우리가 원하는 건 t+30 (HORIZON) 시점 라벨이야.

원래 코드에서 json_path = records[ks[t+HORIZON]]["json"]로 되어 있는데, 출력은 그게 아닌 것처럼 보임.

왜냐면, 네 파일명 규칙상 1356xx는 초 단위로 증가하는데,
t=29면 t+30은 59초 뒤라서 1357xx가 아니라 1356xx에서 +59초가 되어야 해서 숫자가 달라져야 해.
그런데 지금은 t=29 → 135727, t=30 → 135728… 완전 현재 인덱스랑 1:1로 움직임이야.
즉 HORIZON이 0으로 잡혀있거나, Dataset 안에서 HORIZON이 덮였거나, 혹은 ks 리스트 자체가 이미 “t+30만큼 밀린 리스트”로 들어온 상태야.

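가설: HORIZON 적용이 깨져서 “미래 라벨”을 못 보고 “현재 라벨”만 보고 있을 수 있어.
(※ 이 가설은 바로 아래 진단에서 기각된다: t=29의 stem은 135657이고, 135727은 정확히 30초 뒤인 t+30의 파일이므로 HORIZON은 정상 적용되고 있다.)
다만 샘플 50개가 전부 0인 것 자체는 자연스러움(해당 에피소드 초반이 0 상태로 고정돼 있으니까).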

---

#1) HORIZON이 진짜 30인지 먼저 확정 (가장 빠른 진단)

In [None]:
# Confirm the value of HORIZON and compare the file stems at t, t+30 and
# t+HORIZON for the first training sample. The literal 30 is intentional:
# it is checked against HORIZON.
print("HORIZON =", HORIZON)

epi_idx, t = train_s[0]
_, ks = episodes[epi_idx]

print("t =", t)
print("stem(t)      =", Path(records[ks[t]]["json"]).stem)
print("stem(t+30)   =", Path(records[ks[t+30]]["json"]).stem)
print("stem(t+H)    =", Path(records[ks[t+HORIZON]]["json"]).stem)


HORIZON = 30
t = 29
stem(t)      = agv06_0902_135657
stem(t+30)   = agv06_0902_135727
stem(t+H)    = agv06_0902_135727


#2) Dataset이 “진짜로 t+HORIZON을 쓰는지” 강제 검증

In [None]:
class ForecastSingleDatasetStrict(Dataset):
    """Target-only variant of the dataset, used to verify the label lookup.

    Items contain just ``"target"``: the strict label at ``t + HORIZON`` as a
    float32 scalar. Sensor and image loading are omitted for speed.

    Args:
        samples_list: sequence of ``(episode_index, t)`` pairs.
        debug: when true, the first three indices print the current and
            future file stems together with their labels.
    """

    def __init__(self, samples_list, debug=False):
        self.samples = samples_list
        self.debug = debug

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        epi_idx, t = self.samples[idx]
        _, ks = episodes[epi_idx]

        # Strict target: label of the record HORIZON steps ahead.
        jp_fut = records[ks[t + HORIZON]]["json"]
        y_fut = load_label_json_strict(jp_fut)

        if self.debug and idx < 3:
            # The current-time label is needed only for this printout, so it
            # is read here instead of on every item (halves label file reads).
            jp_now = records[ks[t]]["json"]
            y_now = load_label_json_strict(jp_now)
            print("[DEBUG] idx", idx, "t", t, "H", HORIZON)
            print("   now:", Path(jp_now).stem, "y_now:", y_now)
            print("   fut:", Path(jp_fut).stem, "y_fut:", y_fut)

        # Everything except the target is skipped on purpose.
        return {"target": torch.tensor(float(y_fut), dtype=torch.float32)}


In [None]:
# Fetch the first three items only to trigger the debug printout.
ds_dbg = ForecastSingleDatasetStrict(train_s, debug=True)
for i in range(3):
    _ = ds_dbg[i]


[DEBUG] idx 0 t 29 H 30
   now: agv06_0902_135657 y_now: 0
   fut: agv06_0902_135727 y_fut: 0
[DEBUG] idx 1 t 30 H 30
   now: agv06_0902_135658 y_now: 0
   fut: agv06_0902_135728 y_fut: 0
[DEBUG] idx 2 t 31 H 30
   now: agv06_0902_135659 y_now: 0
   fut: agv06_0902_135729 y_fut: 0


결론: 정상 / 지금 출력은 “HORIZON이 제대로 적용되고 있고, t+30 시점 라벨을 정확히 읽고 있다”는 뜻이야.

* t=29의 현재 stem: 135657

* t+30의 stem: 135727 (정확히 30초 뒤)

* Dataset debug도 now/fut가 다르게 찍힘 ✅

그런데 targets sample이 전부 0으로 나온 이유는 딱 하나:

네가 뽑은 ds_train[0..49]는 같은 에피소드 초반 구간이고, 그 구간의 t+30 라벨이 아직 전부 0이라서 그래.

아까 너가 “전체 분포”는 0~3이 섞여 있다고 확인했잖아.
즉, Dataset 자체는 정상이고, 단지 “앞부분만 찍어보면 0만 보인다”는 거야.

---

#1) Dataset 타깃 랜덤 샘플로 유니크 확인(필수)

In [None]:
import random, numpy as np

def sample_targets(ds, k=2000, seed=42):
    """Return the label histogram of a random sample of dataset targets.

    Args:
        ds: indexable dataset whose items expose ``item["target"].item()``.
        k: number of items to draw without replacement; capped at ``len(ds)``
            so small datasets no longer raise ``ValueError``.
        seed: seed for the draw.

    Returns:
        dict mapping integer label -> count among the sampled items.

    The draw uses a private ``random.Random(seed)``. It selects the same
    indices as ``random.seed(seed); random.sample(...)`` but does not reset
    the global ``random`` state.
    """
    rng = random.Random(seed)
    k = min(k, len(ds))
    idxs = rng.sample(range(len(ds)), k)
    ys = [int(ds[i]["target"].item()) for i in idxs]
    uniq, cnt = np.unique(ys, return_counts=True)
    return dict(zip(uniq.tolist(), cnt.tolist()))

# Rebuild ds_train without debug output and print the label histogram of
# 5,000 randomly drawn targets.
ds_train = ForecastSingleDatasetStrict(train_s, debug=False)
print("random target dist (train):", sample_targets(ds_train, k=5000))


random target dist (train): {0: 2182, 1: 1217, 2: 1197, 3: 404}


#2) 모델이 hold baseline(0.6744)을 이기는지

In [None]:
class ForecastSingleDatasetStrictCls(Dataset):
    """Classification dataset: inputs at ``t``, integer label at ``t + HORIZON``.

    Each item is a dict with ``"sensor_window"`` (normalised sensor rows for
    the ``PAST`` steps ending at ``t``), ``"ir_images"`` (three resized,
    normalised thermal frames taken at ``t-29``, ``t-14`` and ``t``) and
    ``"target"`` (int64 scalar).

    Args:
        samples_list: sequence of ``(episode_index, t)`` pairs.
    """

    def __init__(self, samples_list):
        self.samples = samples_list

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        epi_idx, t = self.samples[idx]
        _, ks = episodes[epi_idx]

        # Sensor window: PAST consecutive records ending at t.
        sensor_rows = [load_sensor_csv(records[k]["csv"]) for k in ks[t - (PAST - 1):t + 1]]
        sensor = np.stack(sensor_rows, axis=0).astype(np.float32)
        sensor = zscore_clip_sensor(sensor, sensor_mean, sensor_std, clip=3.0)

        # Image stack: frames 29 and 14 steps back, plus the current frame.
        frames = [
            resize_img(load_thermal_cached(records[ks[t - lag]]["bin"]))
            for lag in (29, 14, 0)
        ]
        images = np.stack(frames, axis=0).astype(np.float32)
        images = (images - IMG_MEAN) / (IMG_STD + 1e-8)

        # Strict label of the record HORIZON steps ahead, as a class index.
        label = int(load_label_json_strict(records[ks[t + HORIZON]]["json"]))

        return {
            "sensor_window": torch.tensor(sensor, dtype=torch.float32),
            "ir_images": torch.tensor(images, dtype=torch.float32),
            "target": torch.tensor(label, dtype=torch.long),
        }


In [None]:
# Classification datasets for each split.
ds_train = ForecastSingleDatasetStrictCls(train_s)
ds_val   = ForecastSingleDatasetStrictCls(val_s)
ds_test  = ForecastSingleDatasetStrictCls(test_s)

# Only the training loader shuffles; val/test keep dataset order.
# NOTE: `DataLoader` is not imported in this cell; it must already be in scope.
dl_train = DataLoader(ds_train, batch_size=32, shuffle=True,  num_workers=2, pin_memory=True)
dl_val   = DataLoader(ds_val,   batch_size=32, shuffle=False, num_workers=2, pin_memory=True)
dl_test  = DataLoader(ds_test,  batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

# Quick check: labels among the first 50 items (dataset order), then the
# histogram of a random sample.
tmp = [ds_train[i]["target"].item() for i in range(50)]
print("first50 unique:", sorted(set(tmp)))
print("random dist:", sample_targets(ds_train, k=5000))


first50 unique: [0]
random dist: {0: 2182, 1: 1217, 2: 1197, 3: 404}


#3) MMT 분류 모델 + class weight CE loss

In [None]:
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter

class MMT_Cls(nn.Module):
    """Transformer classifier over image-patch tokens and sensor-step tokens.

    The input sequence is ``[CLS] + image patch tokens + sensor step tokens``
    with a learned positional embedding; the class logits are read from the
    encoder output at the CLS position.

    Args:
        d_model: token embedding width.
        nhead: number of attention heads per encoder layer.
        layers: number of encoder layers.
        patch: side length of the square image patches.
        num_classes: number of output logits.
        img_size: image side length; ``None`` uses the global ``IMG_SIZE``.
        past: number of sensor steps; ``None`` uses the global ``PAST``.
        sensor_dim: features per sensor step (previously a literal 8).
        in_chans: image channels (previously a literal 3).

    The four trailing parameters are new and optional; ``MMT_Cls()`` builds
    the same architecture, with the same parameter names, as before.
    """

    def __init__(self, d_model=256, nhead=8, layers=4, patch=16, num_classes=4,
                 img_size=None, past=None, sensor_dim=8, in_chans=3):
        super().__init__()
        if img_size is None:
            img_size = IMG_SIZE
        if past is None:
            past = PAST

        self.patch = patch
        self.img_tokens = (img_size // patch) * (img_size // patch)

        self.img_proj = nn.Linear(patch * patch * in_chans, d_model)
        self.sen_proj = nn.Linear(sensor_dim, d_model)

        self.cls = nn.Parameter(torch.zeros(1, 1, d_model))
        self.pos = nn.Parameter(torch.zeros(1, 1 + self.img_tokens + past, d_model))

        enc = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.enc = nn.TransformerEncoder(enc, num_layers=layers)
        self.head = nn.Linear(d_model, num_classes)

        nn.init.trunc_normal_(self.cls, std=0.02)
        nn.init.trunc_normal_(self.pos, std=0.02)

    def img_to_tokens(self, x):
        """Cut ``x`` of shape (B, C, H, W) into patches and project them to tokens."""
        B, C, H, W = x.shape
        p = self.patch
        patches = x.unfold(2, p, p).unfold(3, p, p)
        # Move the patch-grid axes ahead of the channel axis, then flatten
        # each patch into one vector.
        patches = patches.permute(0, 2, 3, 1, 4, 5).contiguous()
        patches = patches.view(B, self.img_tokens, C * p * p)
        return self.img_proj(patches)

    def forward(self, sensor_window, ir_images):
        """Return class logits of shape (B, num_classes)."""
        B = sensor_window.size(0)
        tok_img = self.img_to_tokens(ir_images)
        tok_sen = self.sen_proj(sensor_window)
        cls = self.cls.expand(B, -1, -1)
        x = torch.cat([cls, tok_img, tok_sen], dim=1)
        x = x + self.pos[:, :x.size(1), :]
        h = self.enc(x)
        return self.head(h[:, 0])

device = "cuda" if torch.cuda.is_available() else "cpu"

# Class weights from the labels of (at most) the first 50,000 training samples.
# Labels are read straight from the JSON records instead of through
# ds_train[i], which would also load a full sensor window and three images per
# sample just to discard them. The value is the same as ds_train[i]["target"].
ys = [
    int(load_label_json_strict(records[episodes[e][1][tt + HORIZON]]["json"]))
    for e, tt in ds_train.samples[:50000]
]
cnt = Counter(ys)
total = sum(cnt.values())
# Inverse frequency; a class missing from the sample is treated as count 1.
w = np.array([total/(cnt.get(i,1)) for i in range(4)], dtype=np.float32)
w = w / w.mean()  # rescale so the mean weight is 1
class_w = torch.tensor(w, device=device)
print("train label counts(sample):", cnt)
print("class weights:", w)

model = MMT_Cls().to(device)
criterion = nn.CrossEntropyLoss(weight=class_w)
opt = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)


train label counts(sample): Counter({0: 20537, 2: 12536, 1: 12457, 3: 4470})
class weights: [0.4503855 0.7425196 0.7378404 2.0692544]


#4) 평가 함수(accuracy + macroF1 + confusion)

In [None]:
def eval_cls(model, loader, device, num_classes=4):
    """Evaluate a classifier: accuracy, macro-F1 and confusion matrix.

    Args:
        model: module called as ``model(sensor_window, ir_images)`` that
            returns logits of shape (B, num_classes).
        loader: iterable of batches with keys ``"sensor_window"``,
            ``"ir_images"`` and ``"target"`` (integer class indices).
        device: device the inputs are moved to.
        num_classes: size of the confusion matrix.

    Returns:
        ``(accuracy, macro_f1, cm)`` where ``cm`` is indexed
        ``cm[true, predicted]``.

    Raises:
        ValueError: if ``loader`` yields no batches.

    Macro-F1 averages only over classes that occur in the targets or the
    predictions, so a class absent from both no longer counts as F1 = 0.
    """
    model.eval()
    ys = []
    pr = []
    with torch.no_grad():
        for batch in loader:
            sen = batch["sensor_window"].to(device)
            img = batch["ir_images"].to(device)
            y = batch["target"].cpu().numpy()
            logits = model(sen, img).cpu().numpy()
            ys.append(y)
            pr.append(logits.argmax(axis=1))
    if not ys:
        raise ValueError("eval_cls: loader produced no batches")

    y = np.concatenate(ys).astype(np.int64)
    p = np.concatenate(pr).astype(np.int64)
    acc = (y == p).mean()

    # Confusion matrix: rows are true classes, columns are predictions.
    cm = np.zeros((num_classes, num_classes), dtype=int)
    np.add.at(cm, (y, p), 1)

    # Per-class F1 = 2TP / (2TP + FP + FN); a zero denominator means the
    # class never appears in targets or predictions.
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    denom = 2 * tp + fp + fn
    present = denom > 0
    macro_f1 = float((2 * tp[present] / denom[present]).mean()) if present.any() else 0.0

    return float(acc), macro_f1, cm


#5) 학습(5 epoch) + hold baseline(0.6744) 넘는지 확인

In [None]:
# Train for EPOCHS epochs and keep the checkpoint with the best validation
# accuracy. `best` starts below any reachable accuracy so epoch 1 always saves.
best = -1
EPOCHS = 5

for epoch in range(1, EPOCHS+1):
    model.train()
    # run: sum of per-sample losses; n: samples seen this epoch.
    run=0.0; n=0
    for step, batch in enumerate(dl_train, 1):
        sen = batch["sensor_window"].to(device)
        img = batch["ir_images"].to(device)
        y   = batch["target"].to(device)

        opt.zero_grad(set_to_none=True)
        logits = model(sen,img)
        loss = criterion(logits, y)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()

        # Weight the batch loss by batch size so tr_loss is a per-sample mean.
        run += loss.item()*sen.size(0)
        n += sen.size(0)

        if step % 300 == 0:
            print(f"  step {step} | loss {loss.item():.4f}")

    tr_loss = run/max(n,1)
    val_acc, val_f1, _ = eval_cls(model, dl_val, device)
    print(f"Epoch {epoch} | train loss {tr_loss:.4f} | val acc {val_acc:.4f} | val macroF1 {val_f1:.4f}")

    # Model selection uses validation accuracy only; macro-F1 is reported but
    # does not affect which checkpoint is kept.
    if val_acc > best:
        best = val_acc
        torch.save(model.state_dict(), "/content/mmt_cls_best.pt")
        print("  saved best.")


  step 300 | loss 0.6095
  step 600 | loss 0.4940
  step 900 | loss 0.5936
  step 1200 | loss 0.4337
  step 1500 | loss 0.4261
  step 1800 | loss 0.3146
  step 2100 | loss 0.5006
  step 2400 | loss 0.6567
Epoch 1 | train loss 0.6978 | val acc 0.8287 | val macroF1 0.7909
  saved best.
  step 300 | loss 0.4357
  step 600 | loss 0.4063
  step 900 | loss 0.2326
  step 1200 | loss 0.4056
  step 1500 | loss 0.4677
  step 1800 | loss 0.4023
  step 2100 | loss 0.4347
  step 2400 | loss 0.5074
Epoch 2 | train loss 0.4781 | val acc 0.7798 | val macroF1 0.7752
  step 300 | loss 0.3743
  step 600 | loss 0.4119
  step 900 | loss 0.2725
  step 1200 | loss 0.4307
  step 1500 | loss 0.3747
  step 1800 | loss 0.2696
  step 2100 | loss 0.3951
  step 2400 | loss 0.2141
Epoch 3 | train loss 0.4273 | val acc 0.7576 | val macroF1 0.7726
  step 300 | loss 0.4003
  step 600 | loss 0.3338
  step 900 | loss 0.2821
  step 1200 | loss 0.6968
  step 1500 | loss 0.4942
  step 1800 | loss 0.6574
  step 2100 | loss 0

In [None]:
# Reload the best-validation checkpoint and evaluate it once on the test split.
# The hold-baseline figure in the message is a hard-coded literal, not recomputed.
model.load_state_dict(torch.load("/content/mmt_cls_best.pt", map_location=device))
test_acc, test_f1, test_cm = eval_cls(model, dl_test, device)
print("TEST acc:", test_acc, " (hold baseline=0.6744)")
print("TEST macroF1:", test_f1)
print("TEST confusion:\n", test_cm)


TEST acc: 0.8103767757875232  (hold baseline=0.6744)
TEST macroF1: 0.8036109486000743
TEST confusion:
 [[3552  787    0  163]
 [ 440 2105  334    0]
 [  63   13 2629  197]
 [  85    0   67  898]]


결론:
재현 파이프라인이 제대로 돌아갔고, 모델이 의미 있게 미래를 예측하고 있다는 강한 증거야.

* TEST acc 0.8104 vs hold 0.6744 → +0.1360 (약 13.6%p) 개선 (엄청 큼)

* macro F1 0.8036 → 불균형(특히 3번 클래스 적음)에서도 균형 있게 맞춘 편

* confusion도 “대각선 우세”라서 전반적으로 잘 맞음

#결론 = 회귀에서 분류로 바꿈!!!
>  왜 분류로 바꿨냐(너 상황에서 현실적인 이유)


1) 회귀(MSE)에서 라벨이 잘못되면 티가 덜 나고 “0에 수렴” 착시가 생김

* 너가 겪은 것처럼 y가 1로 고정되면 MSE는 바로 0으로 떨어져서 “학습이 잘 됐나?”처럼 보임.

* 즉, 회귀는 라벨 버그를 더 늦게 발견할 수 있어.

2) 분류는 “라벨 분포/전이/혼동”이 바로 드러남

* CE로 돌리면 label dist가 이상하면 바로 티가 나고,

* confusion matrix로 “진짜로 0~3을 구분하는지”가 한 번에 보임.

* 그래서 라벨 파싱(state 경로 고정)이 제대로 됐는지 확인하는 용도로 분류가 빠름.

3) 너희는 “hold baseline(0.674)”을 넘는지 빠르게 확인해야 했음

* 분류 지표(acc/macroF1)가 “기준선”을 잡기에 쉬워서 우선 확인한 거야.

* 즉, 분류는 재현 목적이 아니라 디버깅/검증용 브릿지로 쓴 거고,
라벨이 정상이라는 게 확인됐으니 이제 회귀로 돌아가면 돼.

---

#분류 -> 회귀로 다시 바꾸기

1) 모델 head

* 분류: Linear(d_model, 4) (logits)

* 회귀: Linear(d_model, 1) (scalar)



2) loss

* 분류: CrossEntropyLoss

* 회귀: MSELoss 또는 SmoothL1Loss(Huber) (노이즈 있으면 Huber 추천)



3) target 타입

* 분류: y.long()

* 회귀: y.float()



4) 평가

* RMSE/MAE를 기본으로 찍고

* 필요하면 round/clip해서 “참고용” accuracy/macroF1도 같이 찍기