In [3]:
!pip install -q paddleocr ultralytics opencv-python-headless tqdm

[0m

### 데이터 준비


In [4]:
import os
import random
import shutil
import subprocess, textwrap
import zipfile
import glob, json, cv2, numpy as np
from pathlib import Path
from tqdm import tqdm
from paddleocr import PaddleOCR
from ultralytics import YOLO
from collections import defaultdict

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [5]:
# Print the current working directory (the cells below use absolute /content/... paths).
!pwd

/content


In [17]:
# %cd /content

/content


In [7]:
# Quietly (-q) extract the dataset archive into the current working directory.
!unzip -q /content/medicine.zip

In [8]:
# Source folders: raw images and their per-image JSON annotations.
src_images = Path("/content/원천데이터/TS1/result/medicine/images")
src_labels = Path("/content/라벨링데이터/TL1/result/medicine/annotations")

# Destination roots; images and labels go into parallel per-split sub-folders.
base_dir = Path("/content/data")
dst_images_base = base_dir / "01.원천데이터/OCR"
dst_labels_base = base_dir / "02.라벨링데이터/OCR"

# Split fractions. Only train_ratio and val_ratio are used below;
# the test split receives whatever remains.
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

# Collect the image files in sorted order, then shuffle with a fixed seed
# so the split is reproducible across runs.
image_files = sorted(
    f for f in src_images.iterdir()
    if f.suffix.lower() in (".jpg", ".jpeg", ".png")
)
random.seed(2025)
random.shuffle(image_files)

n_total = len(image_files)
n_train = int(n_total * train_ratio)
n_val = int(n_total * val_ratio)

# Consecutive slices of the shuffled list form the three splits.
splits = dict(
    train=image_files[:n_train],
    val=image_files[n_train:n_train + n_val],
    test=image_files[n_train + n_val:],
)

# 파일 복사
def copy_files(files, img_dst_dir, lbl_dst_dir, lbl_src_dir=None):
    """
    Copy images and their matching JSON labels into destination folders.

    Args:
        files: iterable of image paths to copy.
        img_dst_dir: folder receiving the images (created if missing).
        lbl_dst_dir: folder receiving the labels (created if missing).
        lbl_src_dir: folder searched for "<image stem>.json". When None, the
            module-level ``src_labels`` is used, which keeps existing
            three-argument calls working unchanged.

    An image without a label file is still copied; only its label is skipped.
    """
    # Accept both str and Path arguments.
    img_dst_dir = Path(img_dst_dir)
    lbl_dst_dir = Path(lbl_dst_dir)
    # Resolve the label source at call time so the default follows the global.
    lbl_src_dir = Path(src_labels if lbl_src_dir is None else lbl_src_dir)

    img_dst_dir.mkdir(parents=True, exist_ok=True)
    lbl_dst_dir.mkdir(parents=True, exist_ok=True)

    for img_path in files:
        img_path = Path(img_path)
        # Image
        shutil.copy(img_path, img_dst_dir / img_path.name)
        # Label: same stem, .json extension
        lbl_name = img_path.stem + ".json"
        lbl_path = lbl_src_dir / lbl_name
        if lbl_path.exists():
            shutil.copy(lbl_path, lbl_dst_dir / lbl_name)

# Materialise every split on disk under its own images/labels sub-folder.
for split, files in splits.items():
    copy_files(files, dst_images_base / split, dst_labels_base / split)

# Cell output: total image count followed by the size of each split.
{"총 이미지 수": n_total, **{name: len(members) for name, members in splits.items()}}

{'총 이미지 수': 485, 'train': 388, 'val': 48, 'test': 49}

## 라벨 변환

In [15]:
# ---------------- Base paths ----------------
DATA_ROOT = Path("/content/data")
YOLO_ROOT = Path("/content/yolo_obb")
# Create the images/ and labels/ folders for each split up front.
for split in ["train", "val", "test"]:
    (YOLO_ROOT / "images" / split).mkdir(parents=True, exist_ok=True)
    (YOLO_ROOT / "labels" / split).mkdir(parents=True, exist_ok=True)

# Lower-case image suffixes picked up by the label conversion.
# ".png" is included so this matches the dataset-split step, which also copies
# .png files; without it those images would be silently left unconverted.
IMG_EXTS = {".jpg", ".jpeg", ".png"}

In [20]:
# ---------------- 유틸 함수 ----------------
def order_quad(quad):
    """
    Order the four corners of a quadrilateral clockwise, starting top-left.

    Args:
        quad: array-like of shape (4, 2) holding (x, y) points in any order.

    Returns:
        float32 ndarray of shape (4, 2) ordered [lt, rt, rb, lb] in image
        coordinates (y grows downward).

    Raises:
        ValueError: if ``quad`` is not of shape (4, 2).

    The corners are sorted by angle around the centroid and the ring is then
    rotated so it starts at the corner with the smallest x + y. Picking each
    corner independently via min/max of x+y and x-y can select the same point
    twice for rotated (diamond-like) boxes, dropping a real corner; the angle
    sort always returns all four input points exactly once.
    """
    q = np.asarray(quad, dtype=np.float32)
    if q.shape != (4, 2):
        raise ValueError("quad must be shape (4,2)")

    center = q.mean(axis=0)
    # With y pointing down, increasing atan2 angle walks the corners clockwise.
    angles = np.arctan2(q[:, 1] - center[1], q[:, 0] - center[0])
    ring = q[np.argsort(angles, kind="stable")]

    # Rotate the ring so the top-left-most corner (min x + y) comes first.
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0)

def write_yolo_obb_txt(txt_path, quads_norm, cls_id=0):
    """
    Write quads as YOLO OBB label lines: ``cls x1 y1 x2 y2 x3 y3 x4 y4``.

    Args:
        txt_path: output file path; missing parent folders are created.
        quads_norm: iterable of quads, each flattening to 8 coordinates.
        cls_id: class index written at the start of every line.

    Entries that do not flatten to exactly 8 values are skipped. Lines are
    joined with "\\n" and no trailing newline is written.
    """
    def _as_row(coords):
        # Class id followed by the 8 coordinates, 6 decimals each.
        return f"{cls_id} " + " ".join(f"{v:.6f}" for v in coords.tolist())

    flattened = (np.asarray(q, dtype=np.float32).reshape(-1) for q in quads_norm)
    rows = [_as_row(flat) for flat in flattened if flat.size == 8]

    target = Path(txt_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text("\n".join(rows), encoding="utf-8")

def load_image_size(img_path):
    """
    Return the (width, height) of an image, or None when it cannot be read.

    The file is read with ``np.fromfile`` and decoded with ``cv2.imdecode``
    rather than ``cv2.imread`` so that non-ASCII (e.g. Korean) paths work.

    Returns None when the file is missing/unreadable, empty, or not decodable,
    so callers can treat every failure uniformly as "skip this image".
    """
    try:
        buf = np.fromfile(str(img_path), dtype=np.uint8)
    except OSError:
        # Missing or unreadable file.
        return None
    if buf.size == 0:
        # cv2.imdecode raises on an empty buffer instead of returning None.
        return None

    img = cv2.imdecode(buf, cv2.IMREAD_COLOR)
    if img is None:
        return None
    return img.shape[1], img.shape[0]  # (W, H)

def clamp01_xyxy(qn):
    """
    Clamp the x (column 0) and y (column 1) values of ``qn`` into [0, 1].

    The array is modified in place and the same object is returned.
    """
    for axis in (0, 1):
        qn[:, axis] = np.clip(qn[:, axis], 0.0, 1.0)
    return qn

def valid_quad(q, min_pix=3):
    """
    Minimum-size filter for a quad.

    Returns a truthy value only when the axis-aligned extent of ``q`` is at
    least ``min_pix`` along both x (column 0) and y (column 1).
    """
    xs, ys = q[:, 0], q[:, 1]
    span_x = xs.max() - xs.min()
    span_y = ys.max() - ys.min()
    return (span_x >= min_pix) and (span_y >= min_pix)

def to_pixel_coords(q, W, H):
    """
    Bring a quad to pixel scale.

    ``q`` may hold either pixel coordinates or coordinates normalised to
    [0, 1]. When the largest x and the largest y are both <= 1.5 the quad is
    treated as normalised and a scaled copy (x * W, y * H) is returned;
    otherwise the float32 array is returned as is.
    """
    pts = np.asarray(q, dtype=np.float32)
    looks_normalised = (pts[:, 0].max() <= 1.5) and (pts[:, 1].max() <= 1.5)
    if not looks_normalised:
        return pts

    # Work on a copy so the caller's array is left untouched.
    scaled = pts.copy()
    scaled[:, 0] = scaled[:, 0] * float(W)
    scaled[:, 1] = scaled[:, 1] * float(H)
    return scaled

In [21]:
# ---------------- 라벨 파서(스키마 맞게 키만 조정) ----------------
def read_quads_from_label(label_path):
    """
    Read text-region quads from one label JSON file.

    Expected layout:
        meta['annotations'][0]['polygons'][i]['points'] -> [{x: .., y: ..}, ...]

    Returns:
        list of float32 ndarrays of shape (4, 2); empty when the file has no
        annotations.

    NOTE(review): only the first entry of ``annotations`` is read — confirm
    that label files never carry polygons in later entries.
    """
    def _to_array(pts):
        # Preferred form: list of {"x": .., "y": ..} dicts -> (N, 2).
        try:
            return np.array([[float(pt["x"]), float(pt["y"])] for pt in pts], dtype=np.float32)
        except Exception:
            # Fallback for plain numeric lists; a flat list is folded into pairs.
            arr = np.array(pts, dtype=np.float32)
            return arr.reshape(-1, 2) if arr.ndim == 1 else arr

    with open(label_path, "r", encoding="utf-8") as f:
        meta = json.load(f)

    anns = meta.get("annotations", [])
    if not anns:
        return []

    first = anns[0]
    polygons = first.get("polygons", []) if isinstance(first, dict) else []

    quads = []
    for poly in polygons:
        pts = poly.get("points")
        if not pts or len(pts) < 4:
            continue

        p = _to_array(pts)

        # More than four points: approximate with the minimum-area rectangle.
        if p.shape[0] > 4:
            p = cv2.boxPoints(cv2.minAreaRect(p.astype(np.float32))).astype(np.float32)

        if p.shape == (4, 2):
            quads.append(p)

    return quads

In [22]:
# ---------------- 메인 변환 루프 ----------------
def convert_split(split):
    """
    Convert one dataset split into the YOLO OBB folder layout.

    For every image under DATA_ROOT/01.원천데이터/OCR/<split> that has a JSON
    label with the same stem, the label quads are converted to normalised
    YOLO OBB lines, the image bytes are copied to YOLO_ROOT/images/<split>
    and the label text is written to YOLO_ROOT/labels/<split>.

    An image is skipped when it has no label, cannot be decoded, or ends up
    with no usable quad. Kept/skipped counts are printed at the end.
    """
    src_img_dir = DATA_ROOT / f"01.원천데이터/OCR/{split}"
    src_lab_dir = DATA_ROOT / f"02.라벨링데이터/OCR/{split}"
    out_img_dir = YOLO_ROOT / f"images/{split}"
    out_lab_dir = YOLO_ROOT / f"labels/{split}"

    def _normalise(raw_quad, W, H):
        """Return the quad normalised to [0, 1], or None if it must be dropped."""
        # (0) bring normalised inputs to pixel scale
        px = to_pixel_coords(raw_quad, W, H)
        # (1) minimum-size check in pixels
        if not valid_quad(px, min_pix=3):
            return None
        # (2) consistent corner order
        px = order_quad(px)
        # (3) clip corners to the image bounds
        px[:, 0] = np.clip(px[:, 0], 0, W - 1)
        px[:, 1] = np.clip(px[:, 1], 0, H - 1)
        # (4) normalise by image size
        qn = px.astype(np.float32).copy()
        qn[:, 0] /= float(W)
        qn[:, 1] /= float(H)
        # (5) clamp into [0, 1]
        qn = clamp01_xyxy(qn)
        # (6) drop boxes that collapsed to (almost) zero extent
        if (qn[:, 0].max() - qn[:, 0].min()) < 1e-4 or (qn[:, 1].max() - qn[:, 1].min()) < 1e-4:
            return None
        return qn

    img_paths = sorted(
        p for p in src_img_dir.rglob("*")
        if p.is_file() and p.suffix.lower() in IMG_EXTS
    )
    # Labels are matched to images by file stem.
    lab_map = {p.stem: p for p in src_lab_dir.rglob("*.json")}

    print(f"[{split}] imgs:{len(img_paths)} labs(map):{len(lab_map)} dir:{src_img_dir}")

    kept = skipped = 0
    for img_path in tqdm(img_paths, desc=f"[{split}] label→OBB"):
        stem = img_path.stem
        lab_path = lab_map.get(stem)

        # The image is only decoded when a label exists for it.
        wh = load_image_size(img_path) if lab_path is not None else None
        if wh is None:
            skipped += 1
            continue
        W, H = wh

        candidates = (_normalise(q, W, H) for q in read_quads_from_label(lab_path))
        quads_final = [qn for qn in candidates if qn is not None]
        if not quads_final:
            skipped += 1
            continue

        # Copy the image byte-for-byte (it is not re-encoded).
        out_img_path = out_img_dir / f"{stem}{img_path.suffix}"
        out_img_path.parent.mkdir(parents=True, exist_ok=True)
        out_img_path.write_bytes(img_path.read_bytes())

        # Write the YOLO OBB label next to it.
        write_yolo_obb_txt(out_lab_dir / f"{stem}.txt", quads_final, cls_id=0)
        kept += 1

    print(f"[{split}] kept: {kept}, skipped: {skipped}")

# ---------------- 실행 ----------------
# Convert the three splits one after another, then report completion.
for split in ("train", "val", "test"):
    convert_split(split)
print("라벨 변환 완료한다")


[train] imgs:388 labs(map):388 dir:/content/data/01.원천데이터/OCR/train


[train] label→OBB: 100%|██████████| 388/388 [00:18<00:00, 20.81it/s]


[train] kept: 388, skipped: 0
[val] imgs:48 labs(map):48 dir:/content/data/01.원천데이터/OCR/val


[val] label→OBB: 100%|██████████| 48/48 [00:02<00:00, 19.81it/s]


[val] kept: 48, skipped: 0
[test] imgs:49 labs(map):49 dir:/content/data/01.원천데이터/OCR/test


[test] label→OBB: 100%|██████████| 49/49 [00:02<00:00, 22.64it/s]

[test] kept: 49, skipped: 0
라벨 변환 완료한다





## YOLOv8-OBB 탐지전용

In [34]:
# Write the dataset description consumed by the trainer.
YOLO_ROOT = Path("/content/yolo_obb")
data_yaml = YOLO_ROOT / "data.yaml"
data_yaml.write_text(
    "\n".join([
        f"path: {YOLO_ROOT}",
        "train: images/train",
        "val: images/val",
        "test: images/test",
        "names: [text]",
        "task: obb",
        "",  # keeps the trailing newline
    ]),
    encoding="utf-8",
)

print("data.yaml ->", data_yaml)

data.yaml -> /content/yolo_obb/data.yaml


In [35]:
# Pre-training check: print how many image and label files each split contains.
root = Path("/content/yolo_obb")
for s in ("train", "val", "test"):
    ni = sum(1 for _ in (root / "images" / s).glob("*.*"))
    nl = sum(1 for _ in (root / "labels" / s).glob("*.txt"))
    print(f"{s}: images={ni}, labels={nl}")

train: images=388, labels=388
val: images=48, labels=48
test: images=49, labels=49


In [36]:
# Load the YOLOv8-nano OBB checkpoint that training starts from.
model = YOLO("yolov8n-obb.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-obb.pt to 'yolov8n-obb.pt': 100%|██████████| 6.26M/6.26M [00:00<00:00, 25.9MB/s]


In [None]:
# Train the OBB detector on the dataset described by data.yaml.
# NOTE(review): device is pinned to "cpu"; with batch=64 at imgsz=896 this is
# likely very slow — confirm whether a GPU runtime is intended.
model.train(
    data="/content/yolo_obb/data.yaml",
    epochs=30,
    imgsz=896,    # adjust within 768~1024
    batch=64,
    cache=True,
    workers=2,
    device="cpu"
)

Ultralytics 8.3.177 🚀 Python-3.11.13 torch-2.6.0+cu124 CPU (Intel Xeon 2.20GHz)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=64, bgr=0.0, box=7.5, cache=True, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/content/yolo_obb/data.yaml, degrees=0.0, deterministic=True, device=cpu, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=30, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=896, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n-obb.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train2, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plots=True, pose=12.0

[34m[1mtrain: [0mScanning /content/yolo_obb/labels/train... 388 images, 0 backgrounds, 0 corrupt: 100%|██████████| 388/388 [00:14<00:00, 26.82it/s]

[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00002.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00003.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00017.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00054.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00055.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00062.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00063.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00064.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00066.jpg: corrupt JPEG restored and saved
[34m[1mtrain: [0m/content/yolo_obb/images/train/medicine_00067.jpg: co






[34m[1mtrain: [0mCaching images (0.5GB RAM): 100%|██████████| 388/388 [00:33<00:00, 11.47it/s]

[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, method='weighted_average', num_output_channels=3), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))





[34m[1mval: [0mFast image access ✅ (ping: 0.1±0.1 ms, read: 91.3±56.7 MB/s, size: 751.6 KB)


[34m[1mval: [0mScanning /content/yolo_obb/labels/val... 48 images, 0 backgrounds, 0 corrupt: 100%|██████████| 48/48 [00:01<00:00, 25.50it/s]

[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00077.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00392.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00474.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00502.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00519.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00524.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00535.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00546.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00554.jpg: corrupt JPEG restored and saved
[34m[1mval: [0m/content/yolo_obb/images/val/medicine_00597.jpg: corrupt JPEG restored and saved
[34m[1mv






[34m[1mval: [0mCaching images (0.1GB RAM): 100%|██████████| 48/48 [00:03<00:00, 15.60it/s]

Plotting labels to runs/obb/train2/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 63 weight(decay=0.0), 73 weight(decay=0.0005), 72 bias(decay=0.0)
Image sizes 896 train, 896 val
Using 0 dataloader workers
Logging results to [1mruns/obb/train2[0m
Starting training for 30 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


  0%|          | 0/7 [00:00<?, ?it/s]

## CROP

In [None]:
# Locate the trained weights.
# Ultralytics stores each run in its own folder (runs/obb/train, train2, ...)
# with the checkpoint at <run>/weights/best.pt, so the pattern must match the
# run folders themselves. Sorting by modification time picks the most recent
# run (a plain name sort would rank "train10" before "train2").
runs = sorted(
    Path("/content/runs/obb").glob("train*/weights/best.pt"),
    key=lambda p: p.stat().st_mtime,
)
BEST = str(runs[-1]) if runs else "/content/best-obb.pt"  # set the path by hand if needed
print("WEIGHTS:", BEST)

model = YOLO(BEST)
YOLO_IMG_DIR = Path("/content/yolo_obb/images")  # {train,val,test}
CROP_DIR = Path("/content/crops")                 # root folder for the crops
CROP_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# (4점 정렬 + 투시보정 + 옵션 강화)
def order_quad(pts):
    """
    Order four corner points clockwise as [tl, tr, br, bl].

    Args:
        pts: array-like reshapeable to (4, 2), (x, y) points in any order.

    Returns:
        float32 ndarray of shape (4, 2) in image coordinates (y grows down).

    The corners are sorted by angle around the centroid and the ring is
    rotated to start at the corner with the smallest x + y. Choosing corners
    independently from min/max of x+y and y-x can return the same point twice
    for rotated (diamond-like) boxes, which yields a degenerate quad for the
    perspective warp; the angle sort keeps all four points.
    """
    pts = np.asarray(pts, np.float32).reshape(4, 2)

    center = pts.mean(axis=0)
    # With y pointing down, increasing atan2 angle walks the corners clockwise.
    angles = np.arctan2(pts[:, 1] - center[1], pts[:, 0] - center[0])
    ring = pts[np.argsort(angles, kind="stable")]

    # Start the ring at the top-left-most corner (min x + y).
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0).astype(np.float32)

def warp_quad(img_bgr, quad, out_h=48):
    """
    Rectify the region inside ``quad`` into an upright crop of height ``out_h``.

    Args:
        img_bgr: source image array.
        quad: four (x, y) corner points in any order.
        out_h: height of the returned crop in pixels.

    Returns:
        The perspective-warped crop, ``out_h`` pixels high. Its width is the
        quad's width scaled by the same factor that maps the quad's height to
        ``out_h`` (at least 8 px), so the crop keeps the region's aspect ratio
        instead of being stretched or squashed.
    """
    q = order_quad(quad)
    # Edge lengths of the ordered quad: top/bottom give width, right/left give height.
    src_w = max(np.linalg.norm(q[0] - q[1]), np.linalg.norm(q[2] - q[3]))
    src_h = max(np.linalg.norm(q[1] - q[2]), np.linalg.norm(q[3] - q[0]))

    # Floor the source height at 1 px so a collapsed quad cannot blow up the width.
    scale = out_h / max(float(src_h), 1.0)
    W = max(int(round(float(src_w) * scale)), 8)

    dst = np.array([[0, 0], [W - 1, 0], [W - 1, out_h - 1], [0, out_h - 1]], np.float32)
    M = cv2.getPerspectiveTransform(q, dst)
    return cv2.warpPerspective(img_bgr, M, (W, out_h))

def enhance_local(crop_bgr, clahe_clip=2.0, unsharp_gain=0.2, gauss_sigma=0.8):
    """
    Boost local contrast of a BGR crop and optionally sharpen it.

    CLAHE (clip limit ``clahe_clip``, 8x8 tiles) is applied to the L channel
    in LAB space. When ``unsharp_gain`` is > 0 an unsharp mask follows: the
    result is blended with a Gaussian-blurred copy (sigma ``gauss_sigma``)
    using weights 1 + gain and -gain.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
    equalised = cv2.merge([clahe.apply(lightness), chan_a, chan_b])
    result = cv2.cvtColor(equalised, cv2.COLOR_LAB2BGR)

    if not unsharp_gain > 0:
        return result

    blurred = cv2.GaussianBlur(result, (0, 0), gauss_sigma)
    return cv2.addWeighted(result, 1.0 + unsharp_gain, blurred, -unsharp_gain, 0)

In [None]:
# 크롭 생성 함수
def create_crops_for_split(split="val", conf=0.25, iou=0.5, imgsz=1024, out_h=48, use_enhance=False):
    """
    Run the detector on one split and save a rectified crop per detection.

    Args:
        split: sub-folder of YOLO_IMG_DIR to process.
        conf, iou, imgsz: forwarded to the detector call.
        out_h: crop height passed to ``warp_quad``.
        use_enhance: when True, each crop goes through ``enhance_local``.

    Crops are written to CROP_DIR/<split>/ and one JSON line per crop is
    written to CROP_DIR/manifest_<split>.jsonl (the file is overwritten).

    Returns:
        (number of crops saved, manifest path)
    """
    src_dir = YOLO_IMG_DIR / split
    out_dir = CROP_DIR / split
    out_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = CROP_DIR / f"manifest_{split}.jsonl"

    image_suffixes = {".jpg", ".jpeg", ".png"}
    imgs = sorted(p for p in src_dir.glob("*.*") if p.suffix.lower() in image_suffixes)
    print(f"[{split}] images:", len(imgs))

    n_crops = 0
    with open(manifest_path, "w", encoding="utf-8") as mf:
        for img_path in tqdm(imgs, desc=f"crop {split}"):
            img = cv2.imread(str(img_path))
            if img is None:
                continue
            res = model(str(img_path), imgsz=imgsz, conf=conf, iou=iou, verbose=False)[0]

            # Pull the 4-point boxes, class ids and scores out of the OBB result.
            obb = getattr(res, "obb", None)
            if obb is None:
                quads, clsids, confs = [], [], []
            else:
                quads = obb.xyxyxyxy.cpu().numpy()        # (N,4,2)
                clsids = obb.cls.cpu().numpy().astype(int)
                confs = obb.conf.cpu().numpy()

            for k, quad in enumerate(quads):
                crop = warp_quad(img, quad, out_h=out_h)
                if use_enhance:
                    crop = enhance_local(crop)

                crop_path = out_dir / f"{img_path.stem}_obb{k:03d}.jpg"
                cv2.imwrite(str(crop_path), crop)

                # One manifest record per crop, linking it back to its source.
                rec = {
                    "src_img": str(img_path),
                    "crop": str(crop_path),
                    "quad": quad.tolist(),
                    "cls_id": int(clsids[k]) if k < len(clsids) else 0,
                    "conf": float(confs[k]) if k < len(confs) else 0.0,
                    "out_h": out_h,
                }
                mf.write(json.dumps(rec, ensure_ascii=False) + "\n")
                n_crops += 1

    print(f"[{split}] saved crops:", n_crops, "| manifest:", manifest_path)
    return n_crops, manifest_path


In [None]:
# Generate crops for every split with the same detector settings (no enhancement).
for crop_split in ("val", "test", "train"):
    _ = create_crops_for_split(crop_split, conf=0.25, iou=0.5, imgsz=1024, out_h=48, use_enhance=False)

In [None]:
# Show the manifest path and its first five records for a quick look.
split = "val"
m = Path(f"/content/crops/manifest_{split}.jsonl")
print(m)
head_lines = m.read_text(encoding="utf-8").splitlines()[:5]
print("\n".join(head_lines))

## 인식

In [None]:
# Create a CPU PaddleOCR instance configured for recognition only (det=False, rec=True).
# NOTE(review): these constructor keywords look like the PaddleOCR 2.x API —
# confirm the installed (unpinned) version still accepts them.
ocr = PaddleOCR(use_gpu=False, det=False, rec=True, lang='korean')  # 'korean' also covers Latin letters and digits
print("PaddleOCR(rec-only) ready.")

In [None]:
def enhance_local(crop_bgr, clahe_clip=2.0, unsharp_gain=0.2, gauss_sigma=0.8):
    """
    Boost local contrast of a BGR crop and optionally sharpen it.

    CLAHE (clip limit ``clahe_clip``, 8x8 tiles) is applied to the L channel
    in LAB space. When ``unsharp_gain`` is > 0 an unsharp mask follows: the
    result is blended with a Gaussian-blurred copy (sigma ``gauss_sigma``)
    using weights 1 + gain and -gain.

    NOTE(review): a function with this name and signature is also defined in
    the crop section of this notebook; this definition replaces it when run.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(clipLimit=clahe_clip, tileGridSize=(8, 8))
    equalised = cv2.merge([clahe.apply(lightness), chan_a, chan_b])
    result = cv2.cvtColor(equalised, cv2.COLOR_LAB2BGR)

    if not unsharp_gain > 0:
        return result

    blurred = cv2.GaussianBlur(result, (0, 0), gauss_sigma)
    return cv2.addWeighted(result, 1.0 + unsharp_gain, blurred, -unsharp_gain, 0)