#드라이브 마운트

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


#1) 폴더 구조 점검 + zip 파일 스캔(자동)

아래에서 DATA_ROOT만 네 드라이브 구조에 맞게 바꿔줘.

In [None]:
import os
from collections import Counter
from tqdm.auto import tqdm

DATA_ROOT = "/content/drive/MyDrive/3.개방데이터/1.데이터"  # ✅ 여기만 맞춰줘

TRAIN_ROOT = os.path.join(DATA_ROOT, "Training")
VAL_ROOT   = os.path.join(DATA_ROOT, "Validation")

def tree_preview(root, max_depth=2, max_items=30):
    root = os.path.abspath(root)
    print(f"\n📁 TREE PREVIEW: {root}")
    if not os.path.exists(root):
        print("❌ NOT EXISTS")
        return
    shown = 0
    for cur, dirs, files in os.walk(root):
        rel = os.path.relpath(cur, root)
        depth = 0 if rel == "." else rel.count(os.sep) + 1
        if depth > max_depth:
            continue
        indent = "  " * depth
        print(f"{indent}- {os.path.basename(cur)}/")
        # 폴더/파일 일부만
        for d in sorted(dirs)[:5]:
            pass
        for f in sorted(files)[:5]:
            print(f"{indent}    • {f}")
            shown += 1
            if shown >= max_items:
                print(f"{indent}    ... (truncated)")
                return

tree_preview(TRAIN_ROOT, max_depth=3)
tree_preview(VAL_ROOT,   max_depth=3)



📁 TREE PREVIEW: /content/drive/MyDrive/3.개방데이터/1.데이터/Training
- Training/
  - 02.라벨링데이터/
      • TL_agv_01_agv01_0901_0812.zip
      • TL_agv_01_agv01_0901_0812.zip.part0
      • TL_agv_01_agv01_0902_1253.zip
      • TL_agv_01_agv01_0902_1253.zip.part0
      • TL_agv_01_agv01_0902_2013.zip
  - 01.원천데이터/
      • TS_agv_01_agv01_0901_0812.zip
      • TS_agv_01_agv01_0901_0812.zip.part0
      • TS_agv_01_agv01_0902_1253.zip
      • TS_agv_01_agv01_0902_1253.zip.part0
      • TS_agv_01_agv01_0902_2013.zip

📁 TREE PREVIEW: /content/drive/MyDrive/3.개방데이터/1.데이터/Validation
- Validation/
  - 01.원천데이터/
      • VS_agv_17_agv17_0902_1039.zip
      • VS_agv_17_agv17_0902_1039.zip.part0
      • VS_agv_17_agv17_0902_1908.zip
      • VS_agv_17_agv17_0902_1908.zip.part0
      • VS_agv_17_agv17_0903_0732.zip
  - 02.라벨링데이터/
      • VL_agv_17_agv17_0902_1039.zip
      • VL_agv_17_agv17_0902_1039.zip.part0
      • VL_agv_17_agv17_0902_1908.zip
      • VL_agv_17_agv17_0902_190

#2) Training/Validation 내부 zip 파일을 “prefix별(TS/TL/VS/VL)”로 자동 탐색

In [None]:
import re

ZIP_PAT = re.compile(r"^(TS|TL|VS|VL)_(agv|oht)_(\d+)_((?:agv|oht)\d+)_(\d{4})_(\d{4})\.zip$", re.I)

def scan_zips(root):
    zips = []
    for cur, _, files in os.walk(root):
        for f in files:
            if f.lower().endswith(".zip"):
                zips.append(os.path.join(cur, f))
    return sorted(zips)

def group_by_prefix(zip_paths):
    groups = {"TS":[], "TL":[], "VS":[], "VL":[], "OTHER":[]}
    for p in zip_paths:
        base = os.path.basename(p)
        m = ZIP_PAT.match(base)
        if m:
            groups[m.group(1).upper()].append(p)
        else:
            groups["OTHER"].append(p)
    return groups

train_zips = scan_zips(TRAIN_ROOT)
val_zips   = scan_zips(VAL_ROOT)

train_g = group_by_prefix(train_zips)
val_g   = group_by_prefix(val_zips)

print("\n✅ TRAIN zip counts:", {k: len(v) for k,v in train_g.items()})
print("✅ VAL   zip counts:", {k: len(v) for k,v in val_g.items()})

print("\n[TRAIN sample]")
for k in ["TS","TL"]:
    if train_g[k]:
        print(k, os.path.basename(train_g[k][0]))
print("\n[VAL sample]")
for k in ["VS","VL"]:
    if val_g[k]:
        print(k, os.path.basename(val_g[k][0]))



✅ TRAIN zip counts: {'TS': 303, 'TL': 303, 'VS': 0, 'VL': 0, 'OTHER': 0}
✅ VAL   zip counts: {'TS': 0, 'TL': 0, 'VS': 38, 'VL': 38, 'OTHER': 0}

[TRAIN sample]
TS TS_agv_01_agv01_0901_0812.zip
TL TL_agv_01_agv01_0901_0812.zip

[VAL sample]
VS VS_agv_17_agv17_0902_1039.zip
VL VL_agv_17_agv17_0902_1039.zip


#3) (속도↑) zip을 로컬(/content)로 “필요한 것만” 복사

Colab 로컬 디스크 용량이 제한이라서, 일단 TS/TL만 또는 zip 일부만 복사하는 게 안전해.
아래는 “prefix 지정 + 최대 몇 개”로 복사하는 방식.

In [None]:
import shutil
from pathlib import Path

LOCAL_ZIP_DIR = Path("/content/local_zips")
LOCAL_ZIP_DIR.mkdir(parents=True, exist_ok=True)

def copy_zips_to_local(zip_list, dst_dir, max_n=None):
    if max_n is not None:
        zip_list = zip_list[:max_n]
    copied = []
    for p in tqdm(zip_list, desc=f"Copy to {dst_dir}"):
        dst = dst_dir / os.path.basename(p)
        if dst.exists() and dst.stat().st_size == os.path.getsize(p):
            copied.append(str(dst))
            continue
        shutil.copy2(p, dst)   # ✅ Drive → local
        copied.append(str(dst))
    return copied

# ✅ 예시: Training TS/TL 각각 20개만 로컬로
local_TS = copy_zips_to_local(train_g["TS"], LOCAL_ZIP_DIR, max_n=20)
local_TL = copy_zips_to_local(train_g["TL"], LOCAL_ZIP_DIR, max_n=20)

print("local_TS:", len(local_TS), "local_TL:", len(local_TL))
print("sample local:", local_TS[0] if local_TS else None)


Copy to /content/local_zips:   0%|          | 0/20 [00:00<?, ?it/s]

Copy to /content/local_zips:   0%|          | 0/20 [00:00<?, ?it/s]

local_TS: 20 local_TL: 20
sample local: /content/local_zips/TS_agv_01_agv01_0901_0812.zip


#4) 로컬 zip 압축해제(진행률 + 빠르게)

zip 하나당 내부 파일 수가 많아서, zip 단위 진행률 + 파일 단위 진행률 둘 다 넣었어.

In [None]:
import zipfile

EXTRACT_DIR = Path("/content/extracted")
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)

def unzip_one(zip_path, out_dir):
    zip_path = Path(zip_path)
    out_sub = out_dir / zip_path.stem
    out_sub.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(zip_path, "r") as z:
        members = z.namelist()
        for m in tqdm(members, desc=f"Extract {zip_path.name}", leave=False):
            z.extract(m, out_sub)
    return str(out_sub)

def unzip_many(zip_paths, out_dir, max_n=None):
    if max_n is not None:
        zip_paths = zip_paths[:max_n]
    extracted = []
    for zp in tqdm(zip_paths, desc="Unzip zips"):
        extracted.append(unzip_one(zp, out_dir))
    return extracted

# ✅ 예시: 로컬 TS/TL 각각 5개씩만 풀기
ex_TS = unzip_many(local_TS, EXTRACT_DIR, max_n=5)
ex_TL = unzip_many(local_TL, EXTRACT_DIR, max_n=5)

print("extracted TS sample:", ex_TS[0] if ex_TS else None)
print("extracted TL sample:", ex_TL[0] if ex_TL else None)


Unzip zips:   0%|          | 0/5 [00:00<?, ?it/s]

Extract TS_agv_01_agv01_0901_0812.zip:   0%|          | 0/722 [00:00<?, ?it/s]

Extract TS_agv_01_agv01_0902_1253.zip:   0%|          | 0/722 [00:00<?, ?it/s]

Extract TS_agv_01_agv01_0902_2013.zip:   0%|          | 0/722 [00:00<?, ?it/s]

Extract TS_agv_01_agv01_0903_1018.zip:   0%|          | 0/722 [00:00<?, ?it/s]

Extract TS_agv_01_agv01_0903_1407.zip:   0%|          | 0/722 [00:00<?, ?it/s]

Unzip zips:   0%|          | 0/5 [00:00<?, ?it/s]

Extract TL_agv_01_agv01_0901_0812.zip:   0%|          | 0/361 [00:00<?, ?it/s]

Extract TL_agv_01_agv01_0902_1253.zip:   0%|          | 0/361 [00:00<?, ?it/s]

Extract TL_agv_01_agv01_0902_2013.zip:   0%|          | 0/361 [00:00<?, ?it/s]

Extract TL_agv_01_agv01_0903_1018.zip:   0%|          | 0/361 [00:00<?, ?it/s]

Extract TL_agv_01_agv01_0903_1407.zip:   0%|          | 0/361 [00:00<?, ?it/s]

extracted TS sample: /content/extracted/TS_agv_01_agv01_0901_0812
extracted TL sample: /content/extracted/TL_agv_01_agv01_0901_0812


In [None]:
def count_exts(folder):
    c = Counter()
    for cur, _, files in os.walk(folder):
        for f in files:
            ext = os.path.splitext(f)[1].lower()
            c[ext] += 1
    return c

print("\n[Check extracted TS]")
if ex_TS:
    print(ex_TS[0], count_exts(ex_TS[0]).most_common(10))

print("\n[Check extracted TL]")
if ex_TL:
    print(ex_TL[0], count_exts(ex_TL[0]).most_common(10))



[Check extracted TS]
/content/extracted/TS_agv_01_agv01_0901_0812 [('.csv', 361), ('.bin', 361)]

[Check extracted TL]
/content/extracted/TL_agv_01_agv01_0901_0812 [('.json', 361)]


In [None]:
 import os
from pathlib import Path

ts_dir = Path("/content/extracted/TS_agv_01_agv01_0901_0812")
tl_dir = Path("/content/extracted/TL_agv_01_agv01_0901_0812")

def stems_in_dir(d, ext):
    return set([p.stem for p in d.glob(f"*{ext}")])

csv_stems  = stems_in_dir(ts_dir, ".csv")
bin_stems  = stems_in_dir(ts_dir, ".bin")
json_stems = stems_in_dir(tl_dir, ".json")

common = csv_stems & bin_stems & json_stems
print("csv:", len(csv_stems), "bin:", len(bin_stems), "json:", len(json_stems))
print("common( csv∩bin∩json ):", len(common))

print("\nmissing bin for csv:", len(csv_stems - bin_stems))
print("missing json for csv:", len(csv_stems - json_stems))
print("missing csv for json:", len(json_stems - csv_stems))

# 샘플 5개만 보기
print("\ncommon sample:", list(sorted(common))[:5])


csv: 361 bin: 361 json: 361
common( csv∩bin∩json ): 361

missing bin for csv: 0
missing json for csv: 0
missing csv for json: 0

common sample: ['agv01_0901_081240', 'agv01_0901_081241', 'agv01_0901_081242', 'agv01_0901_081243', 'agv01_0901_081244']


#❌ 3), 4)버리기❌ -> 다시 전체 파일 압축해제

In [None]:
import shutil, os
from pathlib import Path

LOCAL_ZIP_DIR = Path("/content/local_zips")
EXTRACT_DIR   = Path("/content/extracted")

for p in [LOCAL_ZIP_DIR, EXTRACT_DIR]:
    if p.exists():
        shutil.rmtree(p)
    p.mkdir(parents=True, exist_ok=True)

print("✅ reset:", str(LOCAL_ZIP_DIR), str(EXTRACT_DIR))


✅ reset: /content/local_zips /content/extracted


In [None]:
import re
from tqdm.auto import tqdm

DATA_ROOT = "/content/drive/MyDrive/3.개방데이터/1.데이터"  # ✅ 너 경로 맞게
TRAIN_ROOT = os.path.join(DATA_ROOT, "Training")
VAL_ROOT   = os.path.join(DATA_ROOT, "Validation")

ZIP_PAT = re.compile(r"^(TS|TL|VS|VL)_(agv|oht)_(\d+)_((?:agv|oht)\d+)_(\d{4})_(\d{4})\.zip$", re.I)

def scan_zips(root):
    out = []
    for cur, _, files in os.walk(root):
        for f in files:
            if f.lower().endswith(".zip"):
                out.append(os.path.join(cur, f))
    return sorted(out)

def parse_info(path):
    base = os.path.basename(path)
    m = ZIP_PAT.match(base)
    if not m:
        return None
    prefix, eq_type, eq_id, eq_str, mmdd, hhmm = m.groups()
    return {
        "path": path,
        "prefix": prefix.upper(),
        "eq_type": eq_type.lower(),
        "eq_id": int(eq_id),
        "zip_key": f"{eq_str}_{mmdd}_{hhmm}"
    }

train_all = [parse_info(p) for p in scan_zips(TRAIN_ROOT)]
val_all   = [parse_info(p) for p in scan_zips(VAL_ROOT)]

train_all = [x for x in train_all if x is not None]
val_all   = [x for x in val_all if x is not None]

def group_infos(infos):
    g = {"TS":[], "TL":[], "VS":[], "VL":[]}
    for x in infos:
        if x["prefix"] in g:
            g[x["prefix"]].append(x)
    return g

train_g = group_infos(train_all)
val_g   = group_infos(val_all)

print("✅ TRAIN:", {k: len(v) for k,v in train_g.items()})
print("✅ VAL  :", {k: len(v) for k,v in val_g.items()})
print("sample TS:", os.path.basename(train_g["TS"][0]["path"]) if train_g["TS"] else None)
print("sample TL:", os.path.basename(train_g["TL"][0]["path"]) if train_g["TL"] else None)


✅ TRAIN: {'TS': 303, 'TL': 303, 'VS': 0, 'VL': 0}
✅ VAL  : {'TS': 0, 'TL': 0, 'VS': 38, 'VL': 38}
sample TS: TS_agv_01_agv01_0901_0812.zip
sample TL: TL_agv_01_agv01_0901_0812.zip


In [None]:
import random
random.seed(42)

MODE = "balanced"   # ✅ "all" 또는 "balanced"

N_TRAIN_PER_ID = 10   # train id(1~14) 당 TS/TL zip 몇 개 가져올지
N_VAL_PER_ID   = 10   # val   id(15~16) 당 VS/VL zip 몇 개 가져올지
N_TEST_PER_ID  = 10   # test  id(17~18) 당 VS/VL zip 몇 개 가져올지  (Validation 폴더에서 뽑힘)

def pick_by_id(infos, id_list, n_per_id):
    picked = []
    for i in id_list:
        cand = [x for x in infos if x["eq_id"] == i]
        if not cand:
            continue
        random.shuffle(cand)
        picked.extend(cand[:min(n_per_id, len(cand))])
    return picked

if MODE == "all":
    sel_TS = train_g["TS"]
    sel_TL = train_g["TL"]
    sel_VS = val_g["VS"]
    sel_VL = val_g["VL"]
else:
    # train: 1~14는 TS/TL에서
    train_ids = list(range(1,15))
    sel_TS = pick_by_id(train_g["TS"], train_ids, N_TRAIN_PER_ID)
    sel_TL = pick_by_id(train_g["TL"], train_ids, N_TRAIN_PER_ID)

    # val: 15~16은 VS/VL에서
    val_ids = [15,16]
    sel_VS_val = pick_by_id(val_g["VS"], val_ids, N_VAL_PER_ID)
    sel_VL_val = pick_by_id(val_g["VL"], val_ids, N_VAL_PER_ID)

    # test: 17~18도 VS/VL에서
    test_ids = [17,18]
    sel_VS_test = pick_by_id(val_g["VS"], test_ids, N_TEST_PER_ID)
    sel_VL_test = pick_by_id(val_g["VL"], test_ids, N_TEST_PER_ID)

    sel_VS = sel_VS_val + sel_VS_test
    sel_VL = sel_VL_val + sel_VL_test

print("✅ selected counts:",
      "TS", len(sel_TS), "TL", len(sel_TL), "VS", len(sel_VS), "VL", len(sel_VL))

# zip_key 기준으로 TS-TL / VS-VL 짝 맞는지(필수)
def keys(infos): return set([x["zip_key"] for x in infos])
print("train zip_key intersect:", len(keys(sel_TS) & keys(sel_TL)), "/", min(len(sel_TS), len(sel_TL)))
print("val/test zip_key intersect:", len(keys(sel_VS) & keys(sel_VL)), "/", min(len(sel_VS), len(sel_VL)))


✅ selected counts: TS 140 TL 140 VS 20 VL 20
train zip_key intersect: 79 / 140
val/test zip_key intersect: 9 / 20


In [None]:
import shutil
from tqdm.auto import tqdm

def keep_pairs(A, B):
    kb = {x["zip_key"]: x for x in B}
    outA, outB = [], []
    for a in A:
        b = kb.get(a["zip_key"])
        if b is not None:
            outA.append(a); outB.append(b)
    return outA, outB

sel_TS, sel_TL = keep_pairs(sel_TS, sel_TL)
sel_VS, sel_VL = keep_pairs(sel_VS, sel_VL)

print("✅ paired counts:",
      "TS", len(sel_TS), "TL", len(sel_TL), "VS", len(sel_VS), "VL", len(sel_VL))

def copy_infos(infos, dst_dir):
    copied = []
    for x in tqdm(infos, desc=f"Copy to {dst_dir}"):
        src = x["path"]
        dst = dst_dir / os.path.basename(src)
        if dst.exists() and dst.stat().st_size == os.path.getsize(src):
            copied.append(str(dst))
            continue
        shutil.copy2(src, dst)
        copied.append(str(dst))
    return copied

local_TS = copy_infos(sel_TS, LOCAL_ZIP_DIR)
local_TL = copy_infos(sel_TL, LOCAL_ZIP_DIR)
local_VS = copy_infos(sel_VS, LOCAL_ZIP_DIR)
local_VL = copy_infos(sel_VL, LOCAL_ZIP_DIR)

print("✅ local zip files:", len(list(LOCAL_ZIP_DIR.glob("*.zip"))))
print("sample:", next(iter(LOCAL_ZIP_DIR.glob("*.zip"))).name if list(LOCAL_ZIP_DIR.glob("*.zip")) else None)


✅ paired counts: TS 79 TL 79 VS 9 VL 9


Copy to /content/local_zips:   0%|          | 0/79 [00:00<?, ?it/s]

Copy to /content/local_zips:   0%|          | 0/79 [00:00<?, ?it/s]

Copy to /content/local_zips:   0%|          | 0/9 [00:00<?, ?it/s]

Copy to /content/local_zips:   0%|          | 0/9 [00:00<?, ?it/s]

✅ local zip files: 176
sample: TS_oht_12_oht12_0828_0420.zip


In [None]:
from collections import Counter

def count_exts(folder):
    c = Counter()
    for cur, _, files in os.walk(folder):
        for f in files:
            c[os.path.splitext(f)[1].lower()] += 1
    return c

# 샘플 1개씩만 확인
sample_TS = next((p for p in EXTRACT_DIR.iterdir() if p.name.startswith("TS_")), None)
sample_TL = next((p for p in EXTRACT_DIR.iterdir() if p.name.startswith("TL_")), None)
sample_VS = next((p for p in EXTRACT_DIR.iterdir() if p.name.startswith("VS_")), None)
sample_VL = next((p for p in EXTRACT_DIR.iterdir() if p.name.startswith("VL_")), None)

print("[Check extracted TS]", sample_TS, count_exts(sample_TS) if sample_TS else None)
print("[Check extracted TL]", sample_TL, count_exts(sample_TL) if sample_TL else None)
print("[Check extracted VS]", sample_VS, count_exts(sample_VS) if sample_VS else None)
print("[Check extracted VL]", sample_VL, count_exts(sample_VL) if sample_VL else None)


[Check extracted TS] None None
[Check extracted TL] None None
[Check extracted VS] None None
[Check extracted VL] None None


In [None]:
from pathlib import Path
import re, os
from collections import Counter

LOCAL_ZIP_DIR = Path("/content/local_zips")
EXTRACT_DIR   = Path("/content/extracted")

print("LOCAL_ZIP_DIR exists:", LOCAL_ZIP_DIR.exists())
print("EXTRACT_DIR exists:", EXTRACT_DIR.exists())

zips = sorted(LOCAL_ZIP_DIR.glob("*.zip"))
print("local zip count:", len(zips))
print("zip sample:", [p.name for p in zips[:10]])

cnt = Counter([p.name.split("_")[0] for p in zips])  # TS/TL/VS/VL
print("prefix counts:", cnt)


LOCAL_ZIP_DIR exists: True
EXTRACT_DIR exists: True
local zip count: 176
zip sample: ['TL_agv_01_agv01_1027_0724.zip', 'TL_agv_01_agv01_1027_1405.zip', 'TL_agv_02_agv02_0902_1306.zip', 'TL_agv_03_agv03_0902_1320.zip', 'TL_agv_03_agv03_0902_2043.zip', 'TL_agv_03_agv03_1027_0745.zip', 'TL_agv_04_agv04_0901_0918.zip', 'TL_agv_05_agv05_0901_0924.zip', 'TL_agv_05_agv05_0902_1345.zip', 'TL_agv_05_agv05_0903_1059.zip']
prefix counts: Counter({'TL': 79, 'TS': 79, 'VL': 9, 'VS': 9})


In [None]:
ex_dirs = sorted([p for p in EXTRACT_DIR.iterdir() if p.is_dir()]) if EXTRACT_DIR.exists() else []
print("extracted dir count:", len(ex_dirs))
print("extracted dir sample:", [p.name for p in ex_dirs[:20]])


extracted dir count: 0
extracted dir sample: []


In [None]:
# extracted에 폴더가 있는데 TS_ 로 시작하지 않는 경우를 잡아냄
if EXTRACT_DIR.exists():
    dirs = [p for p in EXTRACT_DIR.iterdir() if p.is_dir()]
    bad = [p.name for p in dirs if not (p.name.startswith("TS_") or p.name.startswith("TL_") or p.name.startswith("VS_") or p.name.startswith("VL_"))]
    print("non-standard extracted dirs (sample):", bad[:20])


non-standard extracted dirs (sample): []


In [None]:
import zipfile
from tqdm.auto import tqdm
from pathlib import Path

LOCAL_ZIP_DIR = Path("/content/local_zips")
EXTRACT_DIR   = Path("/content/extracted")
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)

zip_paths = sorted([str(p) for p in LOCAL_ZIP_DIR.glob("*.zip")])
print("to unzip:", len(zip_paths))
print("zip sample:", zip_paths[0] if zip_paths else None)

failed = []

for zp in tqdm(zip_paths, desc="Unzipping"):
    zp = Path(zp)
    out_sub = EXTRACT_DIR / zp.stem
    out_sub.mkdir(parents=True, exist_ok=True)
    try:
        with zipfile.ZipFile(zp, "r") as z:
            z.extractall(out_sub)
    except Exception as e:
        failed.append((zp.name, str(e)))

print("✅ unzip done. failed:", len(failed))
if failed:
    print("failed sample:", failed[:5])

# prefix별 extracted 폴더 개수 체크
dirs = [p for p in EXTRACT_DIR.iterdir() if p.is_dir()]
print("extracted total dirs:", len(dirs))
print("TS dirs:", sum(d.name.startswith("TS_") for d in dirs))
print("TL dirs:", sum(d.name.startswith("TL_") for d in dirs))
print("VS dirs:", sum(d.name.startswith("VS_") for d in dirs))
print("VL dirs:", sum(d.name.startswith("VL_") for d in dirs))
print("dir sample:", [d.name for d in dirs[:10]])


to unzip: 176
zip sample: /content/local_zips/TL_agv_01_agv01_1027_0724.zip


Unzipping:   0%|          | 0/176 [00:00<?, ?it/s]

✅ unzip done. failed: 0
extracted total dirs: 176
TS dirs: 79
TL dirs: 79
VS dirs: 9
VL dirs: 9
dir sample: ['TS_agv_05_agv05_0903_1059', 'TS_oht_02_oht02_0901_1218', 'TL_oht_05_oht05_0827_0129', 'TS_agv_13_agv13_1026_2235', 'TS_oht_10_oht10_0920_1415', 'TL_agv_10_agv10_1027_1008', 'TS_oht_05_oht05_0914_0632', 'TL_agv_05_agv05_0902_1345', 'TS_oht_14_oht14_0827_0424', 'TS_agv_01_agv01_1027_1405']


완벽하게 정상화됐어.

✅ 이제 /content/extracted 아래에 TS 79 / TL 79 / VS 9 / VL 9 폴더가 제대로 생겼고, 압축해제 실패 0이니까 다음 단계로 바로 넘어가면 돼.

#1) 먼저 “회귀 타깃 y가 json에 진짜 있는지” 확인 (필수)

아래 코드는 TL/VL 폴더에서 json 하나를 까서 키 구조를 출력해.
여기서 y가 어디 있는지 확정해야 회귀 df를 만들 수 있어.

In [None]:
import os, json, random
from pathlib import Path

EXTRACT_DIR = Path("/content/extracted")

# TL 또는 VL 중 하나에서 json 1개 샘플
label_dirs = [p for p in EXTRACT_DIR.iterdir() if p.is_dir() and (p.name.startswith("TL_") or p.name.startswith("VL_"))]
print("label dirs:", len(label_dirs))

sample_dir = random.choice(label_dirs)
json_files = sorted(sample_dir.glob("*.json"))
print("sample label dir:", sample_dir.name, "json count:", len(json_files))

sample_json = random.choice(json_files)
obj = json.loads(sample_json.read_text(encoding="utf-8"))

print("\njson top keys:", list(obj.keys())[:30])
# annotations 구조도 같이 보기
ann = obj.get("annotations", None)
print("annotations type:", type(ann), "preview:", ann if isinstance(ann,(dict,list)) else ann)


label dirs: 88
sample label dir: TL_oht_07_oht07_0920_1030 json count: 301

json top keys: ['meta_info', 'sensor_data', 'ir_data', 'annotations', 'external_data']
annotations type: <class 'list'> preview: [{'tagging': [{'annotation_type': 'tagging', 'state': '1'}]}]


In [None]:
import json
from pathlib import Path
import random
import re

EXTRACT_DIR = Path("/content/extracted")
label_dirs = [p for p in EXTRACT_DIR.iterdir() if p.is_dir() and (p.name.startswith("TL_") or p.name.startswith("VL_"))]
sample_dir = random.choice(label_dirs)
json_files = sorted(sample_dir.glob("*.json"))
sample_json = random.choice(json_files)

obj = json.loads(sample_json.read_text(encoding="utf-8"))

KEY_HINT = re.compile(r"(depth|height|target|value|label|score|reg|y)", re.I)

hits = []

def walk(x, path=""):
    if isinstance(x, dict):
        for k, v in x.items():
            p2 = f"{path}.{k}" if path else k
            if KEY_HINT.search(k):
                hits.append(("KEY_HINT", p2, type(v).__name__, str(v)[:120]))
            walk(v, p2)
    elif isinstance(x, list):
        for i, v in enumerate(x):
            walk(v, f"{path}[{i}]")
    else:
        if isinstance(x, (int, float)) and not isinstance(x, bool):
            hits.append(("NUM", path, type(x).__name__, str(x)))

walk(obj)

print("sample:", sample_dir.name, sample_json.name)
print("top keys:", list(obj.keys()))
print("\n---- hits ----")
for h in hits[:80]:
    print(h)
print("\nTotal hits:", len(hits))


sample: TL_oht_12_oht12_0901_1353 oht12_0901_135922.json
top keys: ['meta_info', 'sensor_data', 'ir_data', 'annotations', 'external_data']

---- hits ----
('KEY_HINT', 'meta_info[0].sensor_types', 'str', 'NTC, PM10, PM2.5, PM1.0, CT1, CT2, CT3, CT4')
('KEY_HINT', 'meta_info[0].cumulative_operating_day', 'str', '18')
('KEY_HINT', 'meta_info[0].equipment_history', 'str', '13')
('KEY_HINT', 'sensor_data[0].PM10[0].value', 'float', '20.0')
('NUM', 'sensor_data[0].PM10[0].value', 'float', '20.0')
('KEY_HINT', 'sensor_data[0].PM2.5[0].value', 'float', '12.0')
('NUM', 'sensor_data[0].PM2.5[0].value', 'float', '12.0')
('KEY_HINT', 'sensor_data[0].PM1.0[0].value', 'float', '8.0')
('NUM', 'sensor_data[0].PM1.0[0].value', 'float', '8.0')
('KEY_HINT', 'sensor_data[0].NTC[0].value', 'float', '27.8')
('NUM', 'sensor_data[0].NTC[0].value', 'float', '27.8')
('KEY_HINT', 'sensor_data[0].CT1[0].value', 'float', '2.71')
('NUM', 'sensor_data[0].CT1[0].value', 'float', '2.71')
('KEY_HINT', 'sensor_data[0].

#2) ML 회귀로 바로 되는 “정답 라벨 추출” (핵심 코드)

아래 함수만 제대로 쓰면, CatBoost/RandomForest/LightGBM 전부 회귀로 학습 가능해

In [None]:
def extract_y_from_label_json(js):
    """
    TL_*.json 파싱 결과(dict)에서 회귀 타깃 y(0.0~3.0)를 뽑는다.
    dataset 샘플 기준: js["annotations"][0]["tagging"][0]["state"] == "1" 같은 형태
    """
    ann = js.get("annotations", None)
    if not ann or not isinstance(ann, list):
        return None

    # annotations[0]에 tagging 리스트가 들어있는 케이스를 우선 처리
    tagging = ann[0].get("tagging", None) if isinstance(ann[0], dict) else None
    if not tagging or not isinstance(tagging, list):
        return None

    state = tagging[0].get("state", None) if isinstance(tagging[0], dict) else None
    if state is None:
        return None

    # "1" 같은 문자열 → 회귀 타깃 float
    try:
        return float(state)
    except:
        return None


In [None]:
!pip -q install catboost tqdm

import os, re, json, glob
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.1/97.1 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
EXTRACT_DIR = "/content/extracted"  # 너가 압축 푼 위치

def list_dirs(prefix):
    return sorted(glob.glob(os.path.join(EXTRACT_DIR, f"{prefix}_*")))

TS_dirs = list_dirs("TS")
TL_dirs = list_dirs("TL")
VS_dirs = list_dirs("VS")
VL_dirs = list_dirs("VL")

print("TS:", len(TS_dirs), "TL:", len(TL_dirs), "VS:", len(VS_dirs), "VL:", len(VL_dirs))
print("sample TS:", os.path.basename(TS_dirs[0]) if TS_dirs else None)
print("sample TL:", os.path.basename(TL_dirs[0]) if TL_dirs else None)


TS: 79 TL: 79 VS: 9 VL: 9
sample TS: TS_agv_01_agv01_1027_0724
sample TL: TL_agv_01_agv01_1027_0724


In [None]:
PAT = re.compile(r"^(TS|TL|VS|VL)_(agv|oht)_(\d+)_((?:agv|oht)\d+)_(\d{4})_(\d{4})$")

def parse_dirname(dirname):
    """
    return dict:
      prefix, equipment_type, equipment_num(int), equipment_id(str), mmdd, hhmm, zip_key
    """
    base = os.path.basename(dirname)
    m = PAT.match(base)
    if not m:
        return None
    prefix, eq_type, eq_num, eq_id, mmdd, hhmm = m.groups()
    eq_num = int(eq_num)
    zip_key = f"{eq_id}_{mmdd}_{hhmm}"   # 예: agv05_0903_1059
    return {
        "prefix": prefix,
        "equipment_type": eq_type,
        "equipment_num": eq_num,
        "equipment_id": eq_id,
        "mmdd": mmdd,
        "hhmm": hhmm,
        "zip_key": zip_key,
        "dir": dirname
    }

TS_meta = [parse_dirname(d) for d in TS_dirs]
TL_meta = [parse_dirname(d) for d in TL_dirs]
TS_meta = [x for x in TS_meta if x]
TL_meta = [x for x in TL_meta if x]

ts_by_zip = {x["zip_key"]: x["dir"] for x in TS_meta}
tl_by_zip = {x["zip_key"]: x["dir"] for x in TL_meta}

common_zip = sorted(set(ts_by_zip.keys()) & set(tl_by_zip.keys()))
print("paired zip_key:", len(common_zip))
print("sample zip_key:", common_zip[:5])


paired zip_key: 79
sample zip_key: ['agv01_1027_0724', 'agv01_1027_1405', 'agv02_0902_1306', 'agv03_0902_1320', 'agv03_0902_2043']


#3) 라벨(y) 추출 (회귀 타깃)

네가 확인한 구조 기준: annotations[0]["tagging"][0]["state"]

In [None]:
def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def extract_y_from_json(js):
    ann = js.get("annotations")
    if not ann or not isinstance(ann, list):
        return None
    if not isinstance(ann[0], dict):
        return None
    tagging = ann[0].get("tagging")
    if not tagging or not isinstance(tagging, list) or not isinstance(tagging[0], dict):
        return None
    state = tagging[0].get("state")
    if state is None:
        return None
    try:
        return float(state)  # "1" -> 1.0
    except:
        return None


#4) “샘플 단위(stem)”로 TS(csv)와 TL(json) 매칭해서 df_all 만들기

In [None]:
#csv → 센서 feature 만들기 (빠른 버전: 첫 행 사용)
def read_first_row_csv(csv_path):
    # 매우 빠름(첫 row만). csv 구조가 커도 부담 적음.
    df = pd.read_csv(csv_path)
    row = df.iloc[0].to_dict()
    return row
#zip_key 1개에서 레코드들 만들기
def build_records_from_one_zip(zip_key, ts_dir, tl_dir):
    # 파일 목록
    csvs = {os.path.splitext(os.path.basename(p))[0]: p for p in glob.glob(os.path.join(ts_dir, "*.csv"))}
    jsons = {os.path.splitext(os.path.basename(p))[0]: p for p in glob.glob(os.path.join(tl_dir, "*.json"))}

    common = sorted(set(csvs.keys()) & set(jsons.keys()))
    records = []

    if len(common) == 0:
        return records

    # equipment 정보는 zip_key에서 복원 가능하지만, dir명에서 뽑는 게 더 안전
    ts_info = parse_dirname(os.path.basename(ts_dir))
    for stem in common:
        feat = read_first_row_csv(csvs[stem])
        js = load_json(jsons[stem])
        y = extract_y_from_json(js)
        if y is None:
            continue

        rec = {
            "zip_key": zip_key,
            "sample_key": stem,
            "equipment_id": ts_info["equipment_id"],
            "equipment_type": ts_info["equipment_type"],
            "equipment_num": ts_info["equipment_num"],
            "y": y
        }
        # 센서 피처 붙이기 (숫자만 남기기)
        for k, v in feat.items():
            rec[k] = v
        records.append(rec)

    return records
  #전체 df_all 생성
records = []
for zk in tqdm(common_zip, desc="build df_all from paired TS/TL"):
    ts_dir = ts_by_zip[zk]
    tl_dir = tl_by_zip[zk]
    records.extend(build_records_from_one_zip(zk, ts_dir, tl_dir))

df_all = pd.DataFrame(records)
print("df_all shape:", df_all.shape)
display(df_all.head())
print("y unique:", sorted(df_all["y"].unique())[:20], " ...")
print(df_all["y"].value_counts().sort_index())


build df_all from paired TS/TL:   0%|          | 0/79 [00:00<?, ?it/s]

df_all shape: (25935, 14)


Unnamed: 0,zip_key,sample_key,equipment_id,equipment_type,equipment_num,y,NTC,PM1.0,PM2.5,PM10,CT1,CT2,CT3,CT4
0,agv01_1027_0724,agv01_1027_072426,agv01,agv,1,0.0,21.0,17.0,23.0,40.0,1.8,74.88,49.96,19.91
1,agv01_1027_0724,agv01_1027_072427,agv01,agv,1,0.0,21.4,19.0,23.0,41.0,1.79,75.0,49.98,19.95
2,agv01_1027_0724,agv01_1027_072428,agv01,agv,1,0.0,21.44,18.0,22.0,39.0,1.98,75.0,49.99,19.99
3,agv01_1027_0724,agv01_1027_072429,agv01,agv,1,0.0,21.6,18.0,22.0,39.0,1.86,74.92,50.08,19.91
4,agv01_1027_0724,agv01_1027_072430,agv01,agv,1,0.0,21.54,18.0,22.0,40.0,1.78,75.13,49.97,19.97


y unique: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0)]  ...
y
0.0    13548
1.0     5233
2.0     5264
3.0     1890
Name: count, dtype: int64


#5) 전처리 : 숫자 컬럼만, 결측 처리

In [None]:
ID_COLS = ["zip_key","sample_key","equipment_id","equipment_type","equipment_num","y"]

# object -> numeric 가능한 건 변환, 안되면 NaN
X = df_all.drop(columns=["y"])
for c in X.columns:
    if c in ["zip_key","sample_key","equipment_id","equipment_type"]:
        continue
    X[c] = pd.to_numeric(X[c], errors="coerce")

# equipment_type 원핫 (RF는 문자열 못씀)
X = pd.get_dummies(X, columns=["equipment_type"], drop_first=False)

y = df_all["y"].astype(float).values

# 결측 채우기
X = X.fillna(X.median(numeric_only=True))

print("X shape:", X.shape, "y shape:", y.shape)


X shape: (25935, 14) y shape: (25935,)


#6) 장비 ID 기준 split (1 ~ 14/ 15 ~ 16 / 17 ~ 18)

너는 “test=17~18”도 쓰고 싶지만,

지금 로컬 zip은 TS/TL 79쌍 + VS/VL 9쌍이라서
17~18이 로컬에 없으면 test가 0이 될 수 있어.


일단 split 코드는 아래처럼 두고, test가 비면 “VAL만” 우선 돌려도 됨.

In [None]:
eqnum = df_all["equipment_num"].values

train_mask = (eqnum >= 1) & (eqnum <= 14)
val_mask   = (eqnum >= 15) & (eqnum <= 16)
test_mask  = (eqnum >= 17) & (eqnum <= 18)

def split_xy(X, y, mask):
    return X.loc[mask].copy(), y[mask].copy()

X_train, y_train = split_xy(X, y, train_mask)
X_val,   y_val   = split_xy(X, y, val_mask)
X_test,  y_test  = split_xy(X, y, test_mask)

print("split rows:", len(X_train), len(X_val), len(X_test))


split rows: 25935 0 0


In [None]:
def eval_reg(y_true, y_pred, name=""):
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2   = r2_score(y_true, y_pred)
    out = {"model": name, "MAE": mae, "RMSE": rmse, "R2": r2}
    return out


In [None]:
cat = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=200,
    early_stopping_rounds=200
)

cat.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

pred_val_cat  = cat.predict(X_val)
res_val_cat   = eval_reg(y_val, pred_val_cat, "CatBoost(sensor)")

results = [res_val_cat]

if len(X_test) > 0:
    pred_test_cat = cat.predict(X_test)
    results.append(eval_reg(y_test, pred_test_cat, "CatBoost(sensor) [TEST]"))

pd.DataFrame(results)


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="agv01_1027_0724": Cannot convert 'agv01_1027_0724' to float

In [None]:
print("equipment_num unique:", sorted(df_all["equipment_num"].unique().tolist()))
print(df_all["equipment_num"].value_counts().sort_index())

equipment_num unique: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
equipment_num
1     1685
2     2227
3     2046
4     1866
5     2347
6     1263
7     1985
8     1324
9     1985
10    2227
11    1985
12    1745
13    1625
14    1625
Name: count, dtype: int64


* 지금 출력 결과를 보면 로컬에 1 ~ 14 장비만 들어와 있어서 val/test(15 ~ 18)가 0인 게 “정상”이었고, 원래 논문 split(1 ~ 14/15 ~ 16/17 ~ 18)을 그대로 쓰려면 로컬에 15~18 장비 zip을 반드시 추가로 가져와야 해.

* 아래 코드는 “새 코랩에서 그대로” 돌릴 수 있게

1) 원본 Drive에서 15~18 장비의 TS/TL/VS/VL zip만 찾아서 로컬(/content/local_zips)로 복사 →
2) 압축해제(/content/extracted) →
3) 빠른 검증까지 한 번에 정리한 버전이야.
(진행률 tqdm 포함)

In [None]:
from pathlib import Path

# ✅ 너 Drive 원본 zip들이 있는 루트 폴더 (여기만 맞춰줘)
DRIVE_ROOT = Path("/content/drive/MyDrive/3.개방데이터/1.데이터")

# 보통 이런 구조였지:
# DRIVE_ROOT/Training/01.원천데이터/TS_*.zip
# DRIVE_ROOT/Training/02.라벨링데이터/TL_*.zip
# DRIVE_ROOT/Validation/01.원천데이터/VS_*.zip
# DRIVE_ROOT/Validation/02.라벨링데이터/VL_*.zip
TRAIN_TS_DIR = DRIVE_ROOT/"Training"/"01.원천데이터"
TRAIN_TL_DIR = DRIVE_ROOT/"Training"/"02.라벨링데이터"
VAL_VS_DIR   = DRIVE_ROOT/"Validation"/"01.원천데이터"
VAL_VL_DIR   = DRIVE_ROOT/"Validation"/"02.라벨링데이터"

LOCAL_ZIP_DIR = Path("/content/local_zips")
EXTRACT_DIR   = Path("/content/extracted")


#1) (핵심) 15~18 장비 zip만 “페어링(같은 장비/시간 TS↔TL)”해서 로컬로 복사

페어링을 먼저 하는 이유: TS만 가져오거나 TL만 가져오면 df_all 구성할 때 다시 누락이 생김.

“장비 편향” 줄이려면 15~18에서 가능한 전부 가져오는 게 가장 깔끔함.

In [None]:
import re, shutil
from tqdm.auto import tqdm
from collections import Counter

LOCAL_ZIP_DIR.mkdir(parents=True, exist_ok=True)
EXTRACT_DIR.mkdir(parents=True, exist_ok=True)

# 파일명 예:
# TS_agv_16_agv16_0920_1756.zip
# TL_oht_15_oht15_0901_0812.zip
ZIP_RE = re.compile(r"^(TS|TL|VS|VL)_(agv|oht)_(\d+)_((?:agv|oht)\d+)_(\d{4})_(\d{4})\.zip$")

def parse_zip(p: Path):
    m = ZIP_RE.match(p.name)
    if not m:
        return None
    prefix, eq_type, eq_num, eq_id, mmdd, hhmm = m.groups()
    eq_num = int(eq_num)
    zip_key = f"{eq_id}_{mmdd}_{hhmm}"   # agv16_0920_1756 같은 키
    return dict(prefix=prefix, eq_type=eq_type, eq_num=eq_num, eq_id=eq_id, mmdd=mmdd, hhmm=hhmm, zip_key=zip_key, path=p)

def list_zips(dir_path: Path, prefix: str):
    # dir 내부 zip 전부 스캔
    z = []
    for p in dir_path.glob(f"{prefix}_*.zip"):
        info = parse_zip(p)
        if info:
            z.append(info)
    return z

TS_all = list_zips(TRAIN_TS_DIR, "TS")
TL_all = list_zips(TRAIN_TL_DIR, "TL")
VS_all = list_zips(VAL_VS_DIR, "VS")
VL_all = list_zips(VAL_VL_DIR, "VL")

print("found:", len(TS_all), len(TL_all), len(VS_all), len(VL_all))

TARGET_EQ = {15,16,17,18}

def filter_by_eq(zlist):
    return [d for d in zlist if d["eq_num"] in TARGET_EQ]

TS_1518 = filter_by_eq(TS_all)
TL_1518 = filter_by_eq(TL_all)
VS_1518 = filter_by_eq(VS_all)
VL_1518 = filter_by_eq(VL_all)

print("15~18 raw:", len(TS_1518), len(TL_1518), len(VS_1518), len(VL_1518))

# ✅ 페어링: TS/TL은 zip_key 기준으로 교집합만
TS_map = {d["zip_key"]: d for d in TS_1518}
TL_map = {d["zip_key"]: d for d in TL_1518}
paired_keys_train = sorted(set(TS_map) & set(TL_map))

# ✅ (선택) Validation도 VS/VL 페어링(있으면 같이 복사)
VS_map = {d["zip_key"]: d for d in VS_1518}
VL_map = {d["zip_key"]: d for d in VL_1518}
paired_keys_val = sorted(set(VS_map) & set(VL_map))

print("paired train keys:", len(paired_keys_train), "paired val keys:", len(paired_keys_val))
print("train eq count:", Counter([TS_map[k]["eq_num"] for k in paired_keys_train]))

def copy_infos(infos, dst_dir: Path):
    for d in tqdm(infos, desc=f"Copy to {dst_dir}", leave=False):
        src = d["path"]
        dst = dst_dir/src.name
        if not dst.exists():
            shutil.copy2(src, dst)

# 실제 복사할 목록 만들기
to_copy = []
to_copy += [TS_map[k] for k in paired_keys_train]
to_copy += [TL_map[k] for k in paired_keys_train]
to_copy += [VS_map[k] for k in paired_keys_val]
to_copy += [VL_map[k] for k in paired_keys_val]

print("to_copy total:", len(to_copy))
copy_infos(to_copy, LOCAL_ZIP_DIR)

print("✅ local zip count now:", len(list(LOCAL_ZIP_DIR.glob("*.zip"))))
print("sample:", next(iter(LOCAL_ZIP_DIR.glob("*.zip"))))


found: 303 303 38 38
15~18 raw: 38 38 38 38
paired train keys: 38 paired val keys: 38
train eq count: Counter({15: 19, 16: 19})
to_copy total: 152


Copy to /content/local_zips:   0%|          | 0/152 [00:00<?, ?it/s]

✅ local zip count now: 310
sample: /content/local_zips/VS_oht_18_oht18_0902_0245.zip


#2) 로컬 zip(추가된 15~18 포함)을 압축해제

이미 같은 폴더명이 있으면 스킵하도록(속도) 처리했어.

In [None]:
import zipfile

def unzip_all(local_zip_dir: Path, extract_dir: Path):
    zips = sorted(local_zip_dir.glob("*.zip"))
    failed = []
    for zp in tqdm(zips, desc="Unzipping"):
        out_dir = extract_dir / zp.stem
        if out_dir.exists() and any(out_dir.iterdir()):
            continue
        out_dir.mkdir(parents=True, exist_ok=True)
        try:
            with zipfile.ZipFile(zp, "r") as z:
                z.extractall(out_dir)
        except Exception as e:
            failed.append((zp.name, str(e)))
    return failed

failed = unzip_all(LOCAL_ZIP_DIR, EXTRACT_DIR)
print("✅ unzip done. failed:", len(failed))
if failed[:3]:
    print("failed sample:", failed[:3])

# 빠른 검증: 폴더 카운트
dirs = [p for p in EXTRACT_DIR.iterdir() if p.is_dir()]
print("extracted total dirs:", len(dirs))
print("sample dirs:", [d.name for d in dirs[:10]])


Unzipping:   0%|          | 0/310 [00:00<?, ?it/s]

✅ unzip done. failed: 0
extracted total dirs: 310
sample dirs: ['TS_agv_05_agv05_0903_1059', 'TS_oht_02_oht02_0901_1218', 'VL_agv_18_agv18_0902_1929', 'TL_oht_05_oht05_0827_0129', 'TL_oht_16_oht16_0830_0658', 'TL_agv_15_agv15_0902_0957', 'VS_oht_17_oht17_0827_0215', 'TS_oht_16_oht16_0830_0658', 'VS_oht_18_oht18_0827_0454', 'TS_oht_15_oht15_0829_0605']


#3) “진짜로 15~18이 들어왔는지” 최종 확인(가장 중요)

압축해제 폴더명에서 장비번호를 뽑아 분포를 찍는다.

In [None]:
DIR_RE = re.compile(r"^(TS|TL|VS|VL)_(agv|oht)_(\d+)_")

eq_nums = []
for d in EXTRACT_DIR.iterdir():
    if not d.is_dir():
        continue
    m = DIR_RE.match(d.name)
    if m:
        eq_nums.append(int(m.group(3)))

eq_nums = sorted(set(eq_nums))
print("✅ extracted equipment_num unique:", eq_nums)


✅ extracted equipment_num unique: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]


#회귀 파이프라인 전체 코드
#df_all >> slpit >> 모델 3개

In [None]:
import os, re, json
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

EXTRACT_DIR = Path("/content/extracted")

DIR_RE = re.compile(r"^(TS|TL|VS|VL)_(agv|oht)_(\d+)_((?:agv|oht)\d+)_(\d{4})_(\d{4})$")

def parse_extracted_dirname(dname: str):
    m = DIR_RE.match(dname)
    if not m:
        return None
    prefix, eq_type, eq_num, eq_id, mmdd, hhmm = m.groups()
    eq_num = int(eq_num)
    zip_key = f"{eq_id}_{mmdd}_{hhmm}"
    return dict(prefix=prefix, eq_type=eq_type, eq_num=eq_num, eq_id=eq_id, mmdd=mmdd, hhmm=hhmm, zip_key=zip_key)

def list_files_by_stem(folder: Path, exts=(".csv", ".json", ".bin")):
    out = {}
    for p in folder.iterdir():
        if p.is_file() and p.suffix.lower() in exts:
            out[p.stem] = p
    return out

# ---- sensor csv: 첫 row에서 숫자 feature만 뽑기 ----
def read_first_row_features(csv_path: Path):
    df = pd.read_csv(csv_path)
    if len(df) == 0:
        return None
    row = df.iloc[0].to_dict()
    feat = {}
    for k, v in row.items():
        try:
            feat[k] = float(v)
        except:
            pass
    return feat if feat else None

# ---- thermal bin decode (너가 이미 성공한 포맷: 240x320, header=128, uint16) ----
def decode_thermal_bin_oht_agv(raw: bytes, H=240, W=320, header=128, dtype=np.uint16):
    n = len(raw)
    expect = header + H*W*np.dtype(dtype).itemsize
    if n != expect:
        return None, {"reason":"SIZE_MISMATCH", "n":n, "expect":expect}
    buf = raw[header:]
    arr = np.frombuffer(buf, dtype=dtype).reshape(H, W)
    # 0~1 normalize float32
    img = (arr.astype(np.float32) / np.iinfo(dtype).max)
    meta = {"reason":"OK", "n":n, "header":header, "shape":(H,W), "dtype":str(dtype)}
    return img, meta

def read_thermal(bin_path: Path):
    raw = bin_path.read_bytes()
    return decode_thermal_bin_oht_agv(raw)

# ---- 회귀 y: ir_data.temp_max.value_TGmx (없으면 None) ----
def extract_y_reg_from_json(json_path: Path):
    j = json.loads(json_path.read_text(encoding="utf-8"))
    try:
        v = j["ir_data"][0]["temp_max"][0]["value_TGmx"]
        return float(v)
    except:
        return None


In [None]:
def build_df_all_from_extracted(extract_dir: Path, use_prefixes=("TS","TL","VS","VL")):
    # prefix별 폴더 모으기
    folders = [p for p in extract_dir.iterdir() if p.is_dir()]
    by_prefix = {}
    for fd in folders:
        info = parse_extracted_dirname(fd.name)
        if not info or info["prefix"] not in use_prefixes:
            continue
        by_prefix.setdefault(info["prefix"], []).append((fd, info))

    # TS↔TL, VS↔VL zip_key로 페어링
    def map_by_zipkey(items):
        return {info["zip_key"]: (fd, info) for fd, info in items}

    TS_map = map_by_zipkey(by_prefix.get("TS", []))
    TL_map = map_by_zipkey(by_prefix.get("TL", []))
    VS_map = map_by_zipkey(by_prefix.get("VS", []))
    VL_map = map_by_zipkey(by_prefix.get("VL", []))

    pairs = []
    for k in sorted(set(TS_map)&set(TL_map)):
        pairs.append(("train", k, TS_map[k], TL_map[k]))
    for k in sorted(set(VS_map)&set(VL_map)):
        pairs.append(("valtest", k, VS_map[k], VL_map[k]))

    records = []
    for split_group, zip_key, (ts_fd, ts_info), (tl_fd, tl_info) in tqdm(pairs, desc="build df_all from paired TS/TL & VS/VL"):
        # 파일 목록
        csvs = list_files_by_stem(ts_fd, exts=(".csv",))
        bins = list_files_by_stem(ts_fd, exts=(".bin",))
        jsons = list_files_by_stem(tl_fd, exts=(".json",))

        common = sorted(set(csvs) & set(jsons))  # csv stem과 json stem이 같은 것만
        if not common:
            continue

        for stem in common:
            feat = read_first_row_features(csvs[stem])
            if feat is None:
                continue

            y = extract_y_reg_from_json(jsons[stem])
            if y is None:
                continue

            rec = {
                "zip_key": zip_key,
                "sample_key": stem,
                "equipment_id": ts_info["eq_id"],
                "equipment_type": ts_info["eq_type"],
                "equipment_num": ts_info["eq_num"],
                "y": y,
            }
            # sensor feature
            for kf, vf in feat.items():
                rec[kf] = vf

            # thermal (bin이 있을 때만)
            if stem in bins:
                img, meta = read_thermal(bins[stem])
                rec["thermal"] = img
                rec["thermal_meta"] = meta
            else:
                rec["thermal"] = None
                rec["thermal_meta"] = {"reason":"BIN_NOT_FOUND"}

            records.append(rec)

    df_all = pd.DataFrame(records)
    return df_all

df_all = build_df_all_from_extracted(EXTRACT_DIR)
print("df_all shape:", df_all.shape)
display(df_all.head(3))
print("y stats:", df_all["y"].describe())
print("thermal available rate:", df_all["thermal"].apply(lambda x: isinstance(x, np.ndarray)).mean())


build df_all from paired TS/TL & VS/VL:   0%|          | 0/155 [00:00<?, ?it/s]

df_all shape: (50723, 16)


Unnamed: 0,zip_key,sample_key,equipment_id,equipment_type,equipment_num,y,NTC,PM1.0,PM2.5,PM10,CT1,CT2,CT3,CT4,thermal,thermal_meta
0,agv01_1027_0724,agv01_1027_072426,agv01,agv,1,41.87,21.0,17.0,23.0,40.0,1.8,74.88,49.96,19.91,"[[0.53142595, 0.645716, 0.27785152, 0.25102618...","{'reason': 'OK', 'n': 153728, 'header': 128, '..."
1,agv01_1027_0724,agv01_1027_072427,agv01,agv,1,41.9,21.4,19.0,23.0,41.0,1.79,75.0,49.98,19.95,"[[0.8533455, 0.65333027, 0.32832837, 0.2510261...","{'reason': 'OK', 'n': 153728, 'header': 128, '..."
2,agv01_1027_0724,agv01_1027_072428,agv01,agv,1,41.81,21.44,18.0,22.0,39.0,1.98,75.0,49.99,19.99,"[[0.46666667, 0.46666667, 0.34166476, 0.251026...","{'reason': 'OK', 'n': 153728, 'header': 128, '..."


y stats: count    50723.000000
mean        61.466383
std         19.245335
min         11.460000
25%         46.800000
50%         54.380000
75%         73.830000
max        172.060000
Name: y, dtype: float64
thermal available rate: 1.0


In [None]:
def split_by_equipment(df):
    eq = df["equipment_num"].values
    tr = (eq>=1) & (eq<=14)
    va = (eq>=15) & (eq<=16)
    te = (eq>=17) & (eq<=18)
    return df[tr].copy(), df[va].copy(), df[te].copy()

df_train, df_val, df_test = split_by_equipment(df_all)
print("split rows:", len(df_train), len(df_val), len(df_test))
print("train eq unique:", sorted(df_train["equipment_num"].unique()))
print("val eq unique:", sorted(df_val["equipment_num"].unique()))
print("test eq unique:", sorted(df_test["equipment_num"].unique()))


split rows: 25935 12394 12394
train eq unique: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14)]
val eq unique: [np.int64(15), np.int64(16)]
test eq unique: [np.int64(17), np.int64(18)]


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

ID_COLS = ["zip_key","sample_key","equipment_id","equipment_type","equipment_num","thermal","thermal_meta","y"]

def make_sensor_Xy(df):
    X = df.drop(columns=[c for c in ID_COLS if c in df.columns], errors="ignore")
    # equipment_type은 one-hot
    if "equipment_type" in df.columns:
        X = pd.get_dummies(pd.concat([df[["equipment_type"]], X], axis=1), columns=["equipment_type"], drop_first=False)
    # 전부 numeric 강제
    for c in X.columns:
        X[c] = pd.to_numeric(X[c], errors="coerce")
    X = X.fillna(X.median(numeric_only=True))
    y = df["y"].astype(float).values
    return X, y

def eval_reg(y_true, y_pred, name=""):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    r2 = r2_score(y_true, y_pred)
    return {"model":name, "MAE":mae, "RMSE":rmse, "R2":r2}


#CatBoost 오류해결

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def eval_reg(y_true, y_pred, name=""):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)

    mae  = mean_absolute_error(y_true, y_pred)
    mse  = mean_squared_error(y_true, y_pred)          # squared 파라미터 없이
    rmse = np.sqrt(mse)
    r2   = r2_score(y_true, y_pred)

    res = {"MAE": mae, "RMSE": rmse, "R2": r2}
    print(f"=== {name} ===")
    print(f"MAE : {mae:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"R2  : {r2:.6f}")
    return res


In [None]:
from catboost import CatBoostRegressor, Pool

# 예시: 이런 컬럼들은 범주형으로 취급
cat_cols = ["zip_key","sample_key","equipment_id","equipment_type"]
y = df_all["y"].values

X = df_all.drop(columns=["y"]).copy()

# 결측 처리(숫자만)
num_cols = [c for c in X.columns if c not in cat_cols]
X[num_cols] = X[num_cols].apply(pd.to_numeric, errors="coerce")
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# split 마스크는 너가 만든 train_mask/val_mask/test_mask 그대로 사용
X_train, y_train = X.loc[train_mask], y[train_mask]
X_val,   y_val   = X.loc[val_mask],   y[val_mask]
X_test,  y_test  = X.loc[test_mask],  y[test_mask]

train_pool = Pool(X_train, y_train, cat_features=cat_cols)
val_pool   = Pool(X_val,   y_val,   cat_features=cat_cols)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_cols)

cat = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=200,
)

cat.fit(train_pool, eval_set=val_pool, use_best_model=True)

pred_val  = cat.predict(val_pool)
pred_test = cat.predict(test_pool)

res_cat_val  = eval_reg(y_val,  pred_val,  "CatBoost / VAL")
res_cat_test = eval_reg(y_test, pred_test, "CatBoost / TEST")


IndexError: Boolean index has wrong length: 25935 instead of 50723

In [None]:
!pip -q install catboost

from catboost import CatBoostRegressor

X_train, y_train = make_sensor_Xy(df_train)
X_val, y_val = make_sensor_Xy(df_val)
X_test, y_test = make_sensor_Xy(df_test)

cat = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=200,
)

cat.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True)

pred_val_cat  = cat.predict(X_val)
pred_test_cat = cat.predict(X_test)

res_cat_val  = eval_reg(y_val, pred_val_cat,  "CatBoost / VAL")
res_cat_test = eval_reg(y_test, pred_test_cat,"CatBoost / TEST")
res_cat_val, res_cat_test


0:	learn: 17.4058792	test: 20.5628271	best: 20.5628271 (0)	total: 50.8ms	remaining: 1m 41s
200:	learn: 4.4148484	test: 11.4608332	best: 11.4260685 (109)	total: 719ms	remaining: 6.43s
400:	learn: 3.8039743	test: 11.4682430	best: 11.4260685 (109)	total: 1.4s	remaining: 5.58s
600:	learn: 3.4696875	test: 11.4845552	best: 11.4260685 (109)	total: 2.08s	remaining: 4.83s
800:	learn: 3.2425634	test: 11.4967544	best: 11.4260685 (109)	total: 2.8s	remaining: 4.19s
1000:	learn: 3.0739259	test: 11.5179028	best: 11.4260685 (109)	total: 3.51s	remaining: 3.5s
1200:	learn: 2.9432444	test: 11.5453939	best: 11.4260685 (109)	total: 4.33s	remaining: 2.88s
1400:	learn: 2.8316730	test: 11.5557542	best: 11.4260685 (109)	total: 5.36s	remaining: 2.29s
1600:	learn: 2.7368110	test: 11.5673900	best: 11.4260685 (109)	total: 6.18s	remaining: 1.54s
1800:	learn: 2.6495968	test: 11.5891295	best: 11.4260685 (109)	total: 7.02s	remaining: 775ms
1999:	learn: 2.5702754	test: 11.5963004	best: 11.4260685 (109)	total: 7.77s	rem

TypeError: got an unexpected keyword argument 'squared'

In [None]:
import numpy as np

# df_all이 현재 50723행짜리여야 함
eqnum = df_all["equipment_num"].astype(int).values

train_mask = (eqnum >= 1) & (eqnum <= 14)
val_mask   = (eqnum >= 15) & (eqnum <= 16)
test_mask  = (eqnum >= 17) & (eqnum <= 18)

print("mask lens:", len(train_mask), len(val_mask), len(test_mask))
print("split rows:", train_mask.sum(), val_mask.sum(), test_mask.sum())
print("train eq unique:", np.unique(eqnum[train_mask])[:10], "...")
print("val eq unique:", np.unique(eqnum[val_mask]))
print("test eq unique:", np.unique(eqnum[test_mask]))


mask lens: 50723 50723 50723
split rows: 25935 12394 12394
train eq unique: [ 1  2  3  4  5  6  7  8  9 10] ...
val eq unique: [15 16]
test eq unique: [17 18]


In [None]:
import pandas as pd

cat_cols = ["zip_key","sample_key","equipment_id","equipment_type"]
y = df_all["y"].astype(float).values

X = df_all.drop(columns=["y"]).copy()

# 숫자 변환 + 결측 처리 (cat_cols 제외)
num_cols = [c for c in X.columns if c not in cat_cols]
X[num_cols] = X[num_cols].apply(pd.to_numeric, errors="coerce")
X[num_cols] = X[num_cols].fillna(X[num_cols].median())

# ✅ 이제 길이 맞는 마스크로 split
X_train, y_train = X.loc[train_mask].copy(), y[train_mask].copy()
X_val,   y_val   = X.loc[val_mask].copy(),   y[val_mask].copy()
X_test,  y_test  = X.loc[test_mask].copy(),  y[test_mask].copy()

print("X shapes:", X_train.shape, X_val.shape, X_test.shape)
print("y shapes:", y_train.shape, y_val.shape, y_test.shape)


X shapes: (25935, 15) (12394, 15) (12394, 15)
y shapes: (25935,) (12394,) (12394,)


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def eval_reg(y_true, y_pred, name=""):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred) ** 0.5   # ✅ squared 파라미터 없이 호환
    r2   = r2_score(y_true, y_pred)
    out = {"model": name, "MAE": mae, "RMSE": rmse, "R2": r2}
    print(f"[{name}] MAE={mae:.4f} | RMSE={rmse:.4f} | R2={r2:.4f}")
    return out

results = []


In [None]:
from catboost import CatBoostRegressor

cat_features = [X.columns.get_loc(c) for c in ["zip_key","sample_key","equipment_id","equipment_type"]]

cat = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=200,
    early_stopping_rounds=200
)

cat.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    use_best_model=True
)

pred_val_cat  = cat.predict(X_val)
pred_test_cat = cat.predict(X_test)

results.append({**eval_reg(y_val,  pred_val_cat,  "CatBoost"), "split":"VAL"})
results.append({**eval_reg(y_test, pred_test_cat, "CatBoost"), "split":"TEST"})


0:	learn: 17.3867109	test: 20.5913318	best: 20.5913318 (0)	total: 29.5ms	remaining: 59s
200:	learn: 2.4176120	test: 12.4693286	best: 12.4672844 (197)	total: 5.25s	remaining: 47s
400:	learn: 1.8424336	test: 12.5725439	best: 12.4346334 (221)	total: 10.9s	remaining: 43.3s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 12.43463336
bestIteration = 221

Shrink model to first 222 iterations.
[CatBoost] MAE=7.8503 | RMSE=12.4346 | R2=0.6556
[CatBoost] MAE=6.3277 | RMSE=8.6878 | R2=0.7983


In [None]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd

# ✅ RF에 넣으면 안 되는 문자열 ID 컬럼들 (CatBoost에서는 cat_features로 쓰지만, RF는 못 씀)
DROP_ID_COLS = ["zip_key", "sample_key", "equipment_id"]  # 필요시 더 추가 가능

def make_rf_matrix(X_df):
    X_num = X_df.drop(columns=[c for c in DROP_ID_COLS if c in X_df.columns]).copy()
    # equipment_type만 원핫
    if "equipment_type" in X_num.columns:
        X_num = pd.get_dummies(X_num, columns=["equipment_type"], drop_first=False)
    # 혹시 남아있는 object 컬럼이 있으면 에러 내서 바로 잡기
    obj_cols = X_num.select_dtypes(include=["object"]).columns.tolist()
    if obj_cols:
        raise ValueError(f"RF input still has object cols: {obj_cols}")
    return X_num

Xo_train = make_rf_matrix(X_train)
Xo_val   = make_rf_matrix(X_val).reindex(columns=Xo_train.columns, fill_value=0)
Xo_test  = make_rf_matrix(X_test).reindex(columns=Xo_train.columns, fill_value=0)

rf = RandomForestRegressor(
    n_estimators=800,
    random_state=42,
    n_jobs=-1,
)

rf.fit(Xo_train, y_train)

pred_val_rf  = rf.predict(Xo_val)
pred_test_rf = rf.predict(Xo_test)

results.append({**eval_reg(y_val,  pred_val_rf,  "RandomForest"), "split":"VAL"})
results.append({**eval_reg(y_test, pred_test_rf, "RandomForest"), "split":"TEST"})


[RandomForest] MAE=7.0426 | RMSE=12.2733 | R2=0.6644
[RandomForest] MAE=5.5610 | RMSE=7.7539 | R2=0.8393


In [None]:
# =========================
# 1) Imports
# =========================
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from catboost import CatBoostRegressor

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# -------------------------
# RMSE/R2 계산 (sklearn 버전 이슈 회피)
# -------------------------
def eval_reg(y_true, y_pred, name=""):
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)
    mae = float(np.mean(np.abs(y_true - y_pred)))
    rmse = float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
    ss_res = float(np.sum((y_true - y_pred) ** 2))
    ss_tot = float(np.sum((y_true - np.mean(y_true)) ** 2))
    r2 = float(1.0 - ss_res / ss_tot) if ss_tot > 0 else np.nan
    print(f"[{name}] MAE={mae:.4f} | RMSE={rmse:.4f} | R2={r2:.4f}")
    return {"mae": mae, "rmse": rmse, "r2": r2}


# =========================
# 2) 센서 feature만 뽑기 (문자열 제거)
# =========================
ID_COLS = ["zip_key","sample_key","equipment_id","equipment_type","equipment_num","y","thermal","thermal_meta"]
drop_cols = [c for c in ID_COLS if c in df_all.columns]

# 센서 피처 후보
sensor_df = df_all.drop(columns=drop_cols, errors="ignore").copy()

# 혹시 object 섞이면 numeric으로 강제
for c in sensor_df.columns:
    if sensor_df[c].dtype == "object":
        sensor_df[c] = pd.to_numeric(sensor_df[c], errors="coerce")

sensor_df = sensor_df.fillna(sensor_df.median(numeric_only=True))

# equipment_type 원핫이 필요하면 여기서 (이미 X 만들 때 했으면 생략)
if "equipment_type" in df_all.columns:
    # 만약 sensor_df에 equipment_type이 남아있으면(위 drop에서 뺀 경우)
    pass

X_sensor = sensor_df
y = df_all["y"].astype(float).values

# split
X_train_s = X_sensor.loc[train_mask].copy()
X_val_s   = X_sensor.loc[val_mask].copy()
X_test_s  = X_sensor.loc[test_mask].copy()
y_train_  = y[train_mask]
y_val_    = y[val_mask]
y_test_   = y[test_mask]

print("sensor shapes:", X_train_s.shape, X_val_s.shape, X_test_s.shape)


# =========================
# 3) Thermal Dataset
# =========================
def has_thermal_series(s: pd.Series) -> np.ndarray:
    # thermal이 np.ndarray인 row만 True
    return s.apply(lambda x: isinstance(x, np.ndarray)).values

class ThermalDataset(Dataset):
    def __init__(self, df: pd.DataFrame, mask: np.ndarray):
        sub = df.loc[mask].copy()
        m = has_thermal_series(sub["thermal"])
        sub = sub.loc[m].reset_index(drop=True)

        self.y = sub["y"].astype(np.float32).values
        self.x = sub["thermal"].values  # object array of np.ndarray

        # 첫 샘플로 shape 체크
        if len(self.x) == 0:
            self.H, self.W = None, None
        else:
            self.H, self.W = self.x[0].shape

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        img = self.x[idx].astype(np.float32)  # (H,W) 0~1
        img = torch.from_numpy(img).unsqueeze(0)  # (1,H,W)
        y = torch.tensor(self.y[idx])
        return img, y

tr_th = ThermalDataset(df_all, train_mask)
va_th = ThermalDataset(df_all, val_mask)
te_th = ThermalDataset(df_all, test_mask)

print("thermal sizes:", len(tr_th), len(va_th), len(te_th), "shape:", tr_th.H, tr_th.W)

assert len(tr_th) > 0 and len(va_th) > 0 and len(te_th) > 0, \
    "thermal이 비어있습니다. df_all['thermal']에 실제 ndarray가 들어가 있는지 확인하세요."


# =========================
# 4) Thermal CNN (회귀 + 임베딩 추출)
# =========================
class ThermalCNN(nn.Module):
    def __init__(self, embed_dim=64):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((1,1))
        )
        self.embed = nn.Linear(64, embed_dim)
        self.head  = nn.Linear(embed_dim, 1)

    def forward(self, x):
        h = self.backbone(x).flatten(1)         # (B,64)
        e = torch.relu(self.embed(h))           # (B,embed_dim)
        y = self.head(e).squeeze(1)             # (B,)
        return y, e

device = "cuda" if torch.cuda.is_available() else "cpu"
model_th = ThermalCNN(embed_dim=64).to(device)

train_loader = DataLoader(tr_th, batch_size=256, shuffle=True, num_workers=2, pin_memory=True)
val_loader   = DataLoader(va_th, batch_size=256, shuffle=False, num_workers=2, pin_memory=True)
test_loader  = DataLoader(te_th, batch_size=256, shuffle=False, num_workers=2, pin_memory=True)

opt = torch.optim.Adam(model_th.parameters(), lr=1e-3)
loss_fn = nn.L1Loss()  # MAE 기준 (논문 스타일이면 MSE로 바꿔도 됨)

def run_epoch(loader, train=True):
    model_th.train(train)
    total_loss, n = 0.0, 0
    for xb, yb in loader:
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)
        pred, _ = model_th(xb)
        loss = loss_fn(pred, yb)
        if train:
            opt.zero_grad()
            loss.backward()
            opt.step()
        total_loss += float(loss.item()) * len(xb)
        n += len(xb)
    return total_loss / max(n, 1)

best_val = 1e18
best_state = None
patience, wait = 5, 0

for epoch in range(1, 51):
    tr_loss = run_epoch(train_loader, train=True)
    va_loss = run_epoch(val_loader, train=False)
    print(f"[ThermalCNN] epoch {epoch:02d} train_mae={tr_loss:.4f} val_mae={va_loss:.4f}")

    if va_loss < best_val - 1e-4:
        best_val = va_loss
        best_state = {k: v.detach().cpu().clone() for k, v in model_th.state_dict().items()}
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("Early stop.")
            break

model_th.load_state_dict(best_state)
model_th.eval()

@torch.no_grad()
def extract_cnn_preds_and_embeds(loader):
    preds, embeds = [], []
    for xb, _ in loader:
        xb = xb.to(device, non_blocking=True)
        yhat, e = model_th(xb)
        preds.append(yhat.detach().cpu().numpy())
        embeds.append(e.detach().cpu().numpy())
    return np.concatenate(preds), np.concatenate(embeds)

pred_val_cnn, emb_val = extract_cnn_preds_and_embeds(val_loader)
pred_test_cnn, emb_test = extract_cnn_preds_and_embeds(test_loader)

# train 쪽도 필요(퓨전 학습용)
pred_train_cnn, emb_train = extract_cnn_preds_and_embeds(train_loader)

print("cnn pred/emb shapes:", pred_train_cnn.shape, emb_train.shape)


# =========================
# 5) CatBoost(sensor) 회귀
# =========================
cat = CatBoostRegressor(
    loss_function="RMSE",
    iterations=2000,
    depth=8,
    learning_rate=0.05,
    random_seed=42,
    early_stopping_rounds=200,
    verbose=200
)

cat.fit(X_train_s, y_train_, eval_set=(X_val_s, y_val_), use_best_model=True)

pred_val_cat  = cat.predict(X_val_s)
pred_test_cat = cat.predict(X_test_s)
pred_train_cat = cat.predict(X_train_s)

_ = eval_reg(y_val_,  pred_val_cat,  "CatBoost / VAL")
_ = eval_reg(y_test_, pred_test_cat, "CatBoost / TEST")


# =========================
# 6) Late Fusion(ML 결합) : Ridge 회귀
#   입력 = [cat_pred, cnn_pred, cnn_embed(64)]
# =========================
def stack_fusion_features(cat_pred, cnn_pred, cnn_embed):
    cat_pred = np.asarray(cat_pred).reshape(-1, 1)
    cnn_pred = np.asarray(cnn_pred).reshape(-1, 1)
    return np.hstack([cat_pred, cnn_pred, cnn_embed])

Z_train = stack_fusion_features(pred_train_cat, pred_train_cnn, emb_train)
Z_val   = stack_fusion_features(pred_val_cat,   pred_val_cnn,   emb_val)
Z_test  = stack_fusion_features(pred_test_cat,  pred_test_cnn,  emb_test)

fusion = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0, random_state=42))
])
fusion.fit(Z_train, y_train_[:len(Z_train)])  # ThermalDataset에서 thermal있는 샘플만 썼으니 길이 맞춰야 함

# ⚠️ 위 y_train_ 길이 mismatch 가능성 대비:
# ThermalDataset은 "thermal 있는 row만 필터링" 했기 때문에
# y_train_도 동일 row만 써야 정확함.
# 아래에서 정확히 맞추는 방식(권장)을 추가로 제공할게.

pred_val_late  = fusion.predict(Z_val)
pred_test_late = fusion.predict(Z_test)

_ = eval_reg(y_val_[:len(pred_val_late)],  pred_val_late,  "LateFusion / VAL")
_ = eval_reg(y_test_[:len(pred_test_late)], pred_test_late, "LateFusion / TEST")


sensor shapes: (25935, 8) (12394, 8) (12394, 8)
thermal sizes: 25935 12394 12394 shape: 240 320
[ThermalCNN] epoch 01 train_mae=28.1119 val_mae=16.0555
[ThermalCNN] epoch 02 train_mae=14.0295 val_mae=15.9870
[ThermalCNN] epoch 03 train_mae=13.9745 val_mae=15.8764
[ThermalCNN] epoch 04 train_mae=13.9400 val_mae=15.8216
[ThermalCNN] epoch 05 train_mae=13.8945 val_mae=15.6660
[ThermalCNN] epoch 06 train_mae=13.8536 val_mae=15.5852
[ThermalCNN] epoch 07 train_mae=13.7592 val_mae=15.5348
[ThermalCNN] epoch 08 train_mae=13.7183 val_mae=15.4254
[ThermalCNN] epoch 09 train_mae=13.6665 val_mae=15.3849
[ThermalCNN] epoch 10 train_mae=13.6608 val_mae=15.3528
[ThermalCNN] epoch 11 train_mae=13.6014 val_mae=15.2791
[ThermalCNN] epoch 12 train_mae=13.5523 val_mae=15.1611
[ThermalCNN] epoch 13 train_mae=13.4899 val_mae=15.1456
[ThermalCNN] epoch 14 train_mae=13.3941 val_mae=14.9506
[ThermalCNN] epoch 15 train_mae=13.3055 val_mae=14.8754
[ThermalCNN] epoch 16 train_mae=13.2588 val_mae=14.7371
[Thermal

In [None]:
# thermal 있는 row만 남기는 mask(각 split별)
train_has_th = train_mask & has_thermal_series(df_all["thermal"])
val_has_th   = val_mask   & has_thermal_series(df_all["thermal"])
test_has_th  = test_mask  & has_thermal_series(df_all["thermal"])

X_train_s2 = X_sensor.loc[train_has_th].copy()
y_train2   = y[train_has_th]
X_val_s2   = X_sensor.loc[val_has_th].copy()
y_val2     = y[val_has_th]
X_test_s2  = X_sensor.loc[test_has_th].copy()
y_test2    = y[test_has_th]

print("aligned(sensor) shapes:", X_train_s2.shape, X_val_s2.shape, X_test_s2.shape)

# 이후 CatBoost는 X_train_s2/y_train2 로 학습
# pred_train_cat = cat.predict(X_train_s2) 같은 식으로 전부 s2 버전 사용


aligned(sensor) shapes: (25935, 8) (12394, 8) (12394, 8)


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_s, y_train_)
pred_val_rf  = rf.predict(X_val_s)
pred_test_rf = rf.predict(X_test_s)

_ = eval_reg(y_val_,  pred_val_rf,  "RandomForest / VAL")
_ = eval_reg(y_test_, pred_test_rf, "RandomForest / TEST")


[RandomForest / VAL] MAE=6.8496 | RMSE=12.3854 | R2=0.6583
[RandomForest / TEST] MAE=5.1710 | RMSE=7.5325 | R2=0.8483


In [None]:
results = []

def add_result(model_name, split, y_true, y_pred):
    m = eval_reg(y_true, y_pred, f"{model_name} / {split}")
    results.append({"model": model_name, "split": split, **m})

# CatBoost
add_result("CatBoost", "VAL",  y_val_,  pred_val_cat)
add_result("CatBoost", "TEST", y_test_, pred_test_cat)

# RandomForest
add_result("RandomForest", "VAL",  y_val_,  pred_val_rf)
add_result("RandomForest", "TEST", y_test_, pred_test_rf)

# LateFusion
add_result("LateFusion", "VAL",  y_val_[:len(pred_val_late)],  pred_val_late)
add_result("LateFusion", "TEST", y_test_[:len(pred_test_late)], pred_test_late)

df_res = pd.DataFrame(results).sort_values(["split","model"]).reset_index(drop=True)
display(df_res)

metric_cols = ["mae","rmse","r2"]
summary = df_res.groupby("split")[metric_cols].agg(["mean","std"]).round(6)
display(summary)

# 보기 좋게 mean±std
pretty = {}
for m in metric_cols:
    pretty[m] = summary[(m,"mean")].map(lambda x: f"{x:.4f}") + " ± " + summary[(m,"std")].map(lambda x: f"{x:.4f}")
pretty_df = pd.DataFrame(pretty)
display(pretty_df)


[CatBoost / VAL] MAE=6.6104 | RMSE=11.3124 | R2=0.7149
[CatBoost / TEST] MAE=5.1646 | RMSE=7.2132 | R2=0.8609
[RandomForest / VAL] MAE=6.8496 | RMSE=12.3854 | R2=0.6583
[RandomForest / TEST] MAE=5.1710 | RMSE=7.5325 | R2=0.8483
[LateFusion / VAL] MAE=6.6099 | RMSE=11.3084 | R2=0.7151
[LateFusion / TEST] MAE=5.2120 | RMSE=7.2296 | R2=0.8603


Unnamed: 0,model,split,mae,rmse,r2
0,CatBoost,TEST,5.164583,7.213199,0.860927
1,LateFusion,TEST,5.211997,7.229622,0.860293
2,RandomForest,TEST,5.170967,7.532484,0.848343
3,CatBoost,VAL,6.610371,11.312427,0.714923
4,LateFusion,VAL,6.609858,11.308386,0.715126
5,RandomForest,VAL,6.849641,12.385448,0.658277


Unnamed: 0_level_0,mae,mae,rmse,rmse,r2,r2
Unnamed: 0_level_1,mean,std,mean,std,mean,std
split,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
TEST,5.182516,0.02573,7.325102,0.179786,0.856521,0.00709
VAL,6.689957,0.138291,11.668753,0.620679,0.696108,0.032764


Unnamed: 0_level_0,mae,rmse,r2
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TEST,5.1825 ± 0.0257,7.3251 ± 0.1798,0.8565 ± 0.0071
VAL,6.6900 ± 0.1383,11.6688 ± 0.6207,0.6961 ± 0.0328
