# 알약 판별 EDA

In [1]:
# Add the notebook's parent folder to the import path so that project packages
# located there (e.g. `src`) can be imported from this notebook.
import sys
import os

# Guard the append: without it every re-run of this cell adds another duplicate
# entry to sys.path.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import json
import os
import glob
from collections import Counter
import pandas as pd


# 유틸리티 함수 임포트
from src.utils import get_device, init_logger

In [3]:
# Create the project logger and look up the compute device through the
# src.utils helpers, then log which device was chosen.
# NOTE(review): get_device() presumably selects the best available backend
# (the recorded run reported "mps") — confirm against src/utils.
logger = init_logger(name="healtheat_vision")  # the project name is specified only here
device = get_device()

logger.info(f"Using device: {device}")

Apple MPS GPU detected.
2025-12-15 18:01:51 - INFO - Using device: mps


In [4]:
from pathlib import Path
import json

# The repository root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parents[0]

# Both annotation folders live side by side under data/aihub_downloads:
# the raw source tree and the tree that holds the edited JSON files.
_aihub_dir = PROJECT_ROOT.joinpath("data", "aihub_downloads")
src_root = _aihub_dir / "raw_extracted_02_except"
dst_root = _aihub_dir / "raw_extracted_02_except_json_edited"

# Report whether each folder is present, together with its resolved location.
for _label, _folder in (("SRC exists:", src_root), ("DST exists:", dst_root)):
    print(_label, _folder.exists(), _folder)

SRC exists: True /Users/youuchul/Documents/github/03_projects/01_HealthEat Pill Detection Model/healtheat_vision/data/aihub_downloads/raw_extracted_02_except
DST exists: True /Users/youuchul/Documents/github/03_projects/01_HealthEat Pill Detection Model/healtheat_vision/data/aihub_downloads/raw_extracted_02_except_json_edited


In [5]:
from pathlib import Path
import json
import pprint

# Dry run of the category rewrite on a single annotation file. Everything
# happens in memory; nothing is written back to disk.

# 0) Pick one JSON file to test with
json_paths = sorted(src_root.rglob("*.json"))
print("aihub_downloads 내 JSON 수:", len(json_paths))

# Fail with a clear message instead of an opaque IndexError when nothing was found.
if not json_paths:
    raise FileNotFoundError(f"No JSON files found under {src_root}")

test_path = json_paths[0]  # just take the first file for now
print("테스트 파일:", test_path)

# Decode explicitly as UTF-8: the annotations contain Korean drug names and the
# platform default encoding is not UTF-8 on every system.
data = json.loads(test_path.read_text(encoding="utf-8"))


def _show_snapshot(label, coco):
    """Print the fields touched by the conversion so before/after can be compared."""
    print(f"\n[{label}] categories / annotations / images[0].dl_*")
    pprint.pp(coco["categories"])
    pprint.pp(coco["annotations"][:1])
    pprint.pp({k: coco["images"][0].get(k) for k in ["id", "dl_idx", "dl_name"]})


_show_snapshot("BEFORE", data)

# 1) Convert (in memory only)
img0 = data["images"][0]
dl_idx = img0.get("dl_idx")
dl_name = img0.get("dl_name")

# dl_idx is stored as a string (e.g. '249'); category_id and categories.id are
# safer as int. A missing dl_idx is kept as None rather than raising.
dl_idx_int = int(dl_idx) if dl_idx is not None else None

# Rebuild categories as a single entry derived from the image's dl_* fields
data["categories"] = [{
    "supercategory": "pill",
    "id": dl_idx_int,
    "name": dl_name
}]

# Point every annotation's category_id at the new dl_idx-based id
for ann in data.get("annotations", []):
    ann["category_id"] = dl_idx_int

_show_snapshot("AFTER", data)

aihub_downloads 내 JSON 수: 40415
테스트 파일: /Users/youuchul/Documents/github/03_projects/01_HealthEat Pill Detection Model/healtheat_vision/data/aihub_downloads/raw_extracted_02_except/K-000250-000573-002483-006192_json/K-000250/K-000250-000573-002483-006192_0_2_0_2_70_000_200.json

[BEFORE] categories / annotations / images[0].dl_*
[{'supercategory': 'pill', 'id': 1, 'name': 'Drug'}]
[{'area': 75888,
  'iscrowd': 0,
  'bbox': [553, 184, 272, 279],
  'category_id': 1,
  'ignore': 0,
  'segmentation': [],
  'id': 1,
  'image_id': 1}]
{'id': 1, 'dl_idx': '249', 'dl_name': '마그밀정(수산화마그네슘)'}

[AFTER] categories / annotations / images[0].dl_*
[{'supercategory': 'pill', 'id': 249, 'name': '마그밀정(수산화마그네슘)'}]
[{'area': 75888,
  'iscrowd': 0,
  'bbox': [553, 184, 272, 279],
  'category_id': 249,
  'ignore': 0,
  'segmentation': [],
  'id': 1,
  'image_id': 1}]
{'id': 1, 'dl_idx': '249', 'dl_name': '마그밀정(수산화마그네슘)'}


In [6]:
from pathlib import Path
import json

# Scan the raw annotation files and stop at the first one that cannot be
# loaded, printing the text around the failure so it can be inspected.
# Sorted so that "the first broken file" is the same file on every run
# (rglob order depends on the filesystem).
json_paths = sorted(src_root.rglob("*.json"))

for src_path in json_paths:
    # Skip anything inside an edited-output folder. Match by substring: the
    # output folder name only *ends* with "json_edited", so comparing against
    # whole path components would never match it.
    if any("json_edited" in part for part in src_path.parts):
        continue

    try:
        # Read once, explicitly as UTF-8 (the files contain Korean text), and
        # keep the text so the preview below does not need a second read.
        txt = src_path.read_text(encoding="utf-8")
    except UnicodeDecodeError as e:
        print("❌ 인코딩 깨짐 파일:", src_path)
        print("에러:", e)
        break

    try:
        _ = json.loads(txt)
    except json.JSONDecodeError as e:
        print("❌ JSON 깨짐 파일:", src_path)
        print("에러:", e)

        # Preview up to 200 characters on either side of the error position
        start = max(e.pos - 200, 0)
        end = min(e.pos + 200, len(txt))
        print("\n--- 에러 위치 근처 ---")
        print(txt[start:end])
        print("\n---------------------")
        break

❌ JSON 깨짐 파일: /Users/youuchul/Documents/github/03_projects/01_HealthEat Pill Detection Model/healtheat_vision/data/aihub_downloads/raw_extracted_02_except/K-012247-016551-033009-044199_json/K-016551-오류/K-012247-016551-033009-044199_0_2_0_2_90_000_200.json
에러: Expecting value: line 59 column 12 (char 1571)

--- 에러 위치 근처 ---
front_img": "",
			"mark_code_back_img": "",
			"mark_code_front": "",
			"mark_code_back": "",
			"change_date": "20190124",
			"id": 1
		}
	],
	"type": "instances",
	"annotations": [
		{
			"area": ,
			"iscrowd": 0,
			"bbox": [],
			"category_id": 1,
			"ignore": 0,
			"segmentation": [],
			"id": 1,
			"image_id": 1
		}
	],
	"categories": [
		{
			"supercategory": "pill",
			"id": 1,
			"name

---------------------


In [7]:
# Classify every raw annotation file that cannot be used: files that cannot be
# read/decoded as UTF-8 go to bad_encoding, files that decode but are not valid
# JSON go to bad_json. Each entry is a (path, error message) pair.
bad_encoding = []
bad_json = []

for src_path in json_paths:
    # Substring match: the edited-output folder name only ends with
    # "json_edited", so an exact path-component comparison never matches it.
    if any("json_edited" in part for part in src_path.parts):
        continue

    try:
        # Strict decoding is required here. With errors="replace" invalid bytes
        # are silently substituted, read_text never raises for them, and
        # bad_encoding would always stay empty.
        text = src_path.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError) as e:
        bad_encoding.append((str(src_path), str(e)))
        continue

    try:
        _ = json.loads(text)
    except json.JSONDecodeError as e:
        bad_json.append((str(src_path), str(e)))
        continue

print("인코딩 문제 파일 수:", len(bad_encoding))
print("JSON 문법 깨짐 파일 수:", len(bad_json))

인코딩 문제 파일 수: 0
JSON 문법 깨짐 파일 수: 2


In [8]:
# Collect the edited annotation files in a stable (sorted) order and report how many exist.
edited_paths = list(dst_root.rglob("*.json"))
edited_paths.sort()
print("edited JSON 수:", len(edited_paths))

# Load the first edited file as a spot check.
sample = edited_paths[0]
with sample.open(encoding="utf-8") as _fh:
    d = json.load(_fh)

# Show the category table next to the first annotation's category_id and the
# first image's dl_idx / dl_name so they can be compared by eye.
print(d["categories"])
_first_ann, _first_img = d["annotations"][0], d["images"][0]
print(_first_ann["category_id"], _first_img["dl_idx"], _first_img["dl_name"])

edited JSON 수: 40414
[{'supercategory': 'pill', 'id': 1, 'name': 'Drug'}]
1 249 마그밀정(수산화마그네슘)
