<a href="https://colab.research.google.com/github/kimdeoksoo-71/Preprocessing/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Opening

In [None]:
# @title (1) Import Drive
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# @title (2) install

# Install pdf2image together with OpenCV, Pillow and NumPy
!pip install pdf2image opencv-python pillow numpy

# Install poppler-utils, the system package pdf2image uses to read PDFs
!apt-get install -y poppler-utils
!pip install pdf2image


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 2 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 0s (609 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 117540 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing

# 강대 X 38 문항별

In [None]:
# @title 강대X 38문항 문항별 : 문제 (600dpi, Grayscale 고정)
# === 설정: 600dpi 고해상도 / Grayscale / PDF 가시 영역 기준 크롭 ===

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
# Folder scanned for input PDFs, and folder that receives the cropped PNG files.
INPUT_DIR = "/content/drive/MyDrive/PBMAI/02_Input/Problem"
OUT_DIR   = "/content/drive/MyDrive/PBMAI/03_Output/crops"
os.makedirs(OUT_DIR, exist_ok=True)

# Spreadsheet (full URL or bare key) and the worksheet that holds one crop box per row.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1BIZV74AUr_A2KCsJJCOWEQ7KMIIXIReOSrnRCkGpKbs/edit?"
WORKSHEET_NAME = "문38X"

# Render resolution raised to 600 dpi.
# NOTE(review): RETRY_TIMES, OVERWRITE and SKIP_OUT_OF_RANGE_PAGES are not
# referenced by the code visible in this cell.
DPI                = 600
RETRY_TIMES        = 2
OVERWRITE          = False
SKIP_DONE          = True   # rows whose "status" reads "done" are skipped by the main loop
SKIP_OUT_OF_RANGE_PAGES = True

# Color options: force grayscale rendering.
GRAYSCALE_RENDER = True
FORCE_BW        = False  # kept False to get smooth grayscale rather than 1-bit black and white

# ----[2] Authentication and Google Sheets connection ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Default credentials of the authenticated Colab user, scoped for Sheets access
# and read-only Drive access.
creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive.readonly"])
gc = gspread.authorize(creds)

def safe_call(fn, *args, tries=10, base=2.0, max_wait=30.0, **kwargs):
    """Call ``fn(*args, **kwargs)`` and retry with exponential backoff on failure.

    Args:
        fn: Callable to invoke.
        *args, **kwargs: Forwarded to ``fn`` unchanged.
        tries: Maximum number of attempts.
        base: Delay in seconds before the second attempt; doubles every retry.
        max_wait: Upper bound for a single delay, in seconds.

    Returns:
        Whatever ``fn`` returns on the first successful attempt.

    Raises:
        The exception raised by the final attempt. The previous version ended
        with a bare ``raise`` outside any ``except`` block, which produced
        ``RuntimeError: No active exception to re-raise`` and hid the real error.

    Note:
        ``tries``, ``base`` and ``max_wait`` are consumed here and are not
        forwarded to ``fn``.
    """
    for attempt in range(tries):
        try:
            return fn(*args, **kwargs)
        except Exception:
            # Out of attempts: surface the real error instead of sleeping again.
            if attempt >= tries - 1:
                raise
            time.sleep(min(base * (2 ** attempt), max_wait))

def _extract_key(url_or_key:str):
    """Return the spreadsheet key found in a Google Sheets URL.

    When the text contains no ``/spreadsheets/d/<key>`` segment it is returned
    unchanged.
    """
    found = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if found is None:
        return url_or_key
    return found.group(1)

sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Reset the log columns and load the data ----
values_all = safe_call(ws.get_all_values)
if not values_all: raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2:
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])
    # Re-read after clearing. Building the DataFrame from the pre-clear snapshot
    # kept the old values of M:R (presumably the log columns such as "status")
    # in memory, so SKIP_DONE could skip rows whose status had just been wiped.
    values_all = safe_call(ws.get_all_values)

header = values_all[0][:]
col_idx = {name: header.index(name)+1 for name in header}
# Pad or trim every data row to the header width so the DataFrame constructor
# never fails on a row-length mismatch.
rows = [(rec + [""] * len(header))[:len(header)] for rec in values_all[1:]]
df = pd.DataFrame(rows, columns=header)
df["_rownum"] = df.index + 2  # 1-based sheet row number (row 1 is the header)

# ----[4] 이미지 처리 핵심 유틸리티 ----

def render_page_to_image(page, dpi=600, grayscale=True):
    """Rasterize one PDF page into a PIL image.

    The page is rendered with ``page.get_pixmap`` at a zoom of ``dpi / 72``,
    without an alpha channel, in the gray or RGB colorspace.

    Args:
        page: PyMuPDF page object.
        dpi: Target resolution.
        grayscale: When true, render in gray and build an "L" image;
            otherwise render in RGB.

    Returns:
        A PIL image built from the pixmap's raw samples.

    NOTE(review): the original note says the render follows the PDF's own
    crop (CropBox) — confirm against the PyMuPDF documentation.
    """
    scale = dpi / 72.0
    if grayscale:
        colorspace, pil_mode = fitz.csGRAY, "L"
    else:
        colorspace, pil_mode = fitz.csRGB, "RGB"
    pixmap = page.get_pixmap(
        matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace
    )
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def apply_inner_trim(img, row_item):
    """Shave extra margins off ``img`` using the row's ``inner_*`` values.

    ``inner_top`` / ``inner_bottom`` / ``inner_left`` / ``inner_right`` are read
    from ``row_item``, clamped to [0, 1] and applied as fractions of the image
    height or width. A blank cell counts as 0; a non-numeric cell raises
    ``ValueError``. If all four values are 0 the image is returned as is.
    """
    def _fraction(side):
        text = str(row_item.get(f"inner_{side}", 0)).strip()
        return max(0.0, min(1.0, float(text or 0)))

    top = _fraction("top")
    bottom = _fraction("bottom")
    left = _fraction("left")
    right = _fraction("right")
    if top + bottom + left + right == 0:
        return img

    width, height = img.size
    box = (
        int(left * width),
        int(top * height),
        width - int(right * width),
        height - int(bottom * height),
    )
    return img.crop(box)

# ----[5] Main loop ----
KST = timezone(timedelta(hours=9))
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR,"*.pdf")))


def _parse_pages(pages_field):
    """Expand a pages cell such as "1, 3-5" into a sorted list of page numbers.

    "a-b" is an inclusive range, matching how the other cells in this notebook
    read the column. The previous split on ``[,-]`` turned "3-5" into pages 3
    and 5 only. Tokens that are neither a number nor a range are ignored.
    """
    pages = set()
    for tok in str(pages_field).split(","):
        tok = tok.strip()
        if tok.isdecimal():
            pages.add(int(tok))
            continue
        lo, dash, hi = tok.partition("-")
        lo, hi = lo.strip(), hi.strip()
        if dash and lo.isdecimal() and hi.isdecimal():
            a, b = int(lo), int(hi)
            pages.update(range(min(a, b), max(a, b) + 1))
    return sorted(pages)


for _, row in tqdm(df.iterrows(), total=len(df), desc="문항 추출 중"):
    r = row.to_dict(); rownum = int(r["_rownum"])
    if SKIP_DONE and r.get("status","").lower()=="done": continue

    pages_list = _parse_pages(r.get("pages",""))
    if not pages_list: continue

    outputs = []
    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            # One unreadable PDF should not abort the whole run.
            tqdm.write(f"[row {rownum}] cannot open {pdf_path}: {e}")
            continue

        try:
            for p in pages_list:
                # Skip pages this PDF does not have; p == 0 would index doc[-1].
                if not 1 <= p <= len(doc): continue
                try:
                    # 1. Render the whole page at DPI in grayscale.
                    img = render_page_to_image(doc[p-1], DPI, GRAYSCALE_RENDER)
                    W, H = img.size

                    # 2. Crop with the sheet's x, y, w, h (fractions of the page, 0..1).
                    x, y = float(r['x']), float(r['y'])
                    w, h = float(r['w']), float(r['h'])
                    crop = img.crop((int(x*W), int(y*H), int((x+w)*W), int((y+h)*H)))

                    # 3. Apply the additional inner trim.
                    crop = apply_inner_trim(crop, r)

                    # 4. Save.
                    out_name = f"{pdf_stem}_p{p:03d}_{r['box_id']}.png"
                    out_path = os.path.join(OUT_DIR, out_name)
                    crop.save(out_path, "PNG", optimize=True)
                    outputs.append(out_name)
                except Exception as e:
                    # Report the failure instead of swallowing it with a bare except.
                    tqdm.write(f"[row {rownum}] {pdf_stem} p{p}: {e}")
        finally:
            doc.close()

    # Update the row's status in the sheet.
    st = "done" if outputs else "error"
    safe_call(ws.update_cell, rownum, col_idx["status"], st)
    safe_call(ws.update_cell, rownum, col_idx["processed_at"], datetime.now(KST).isoformat())

print(f"✅ 처리가 완료되었습니다. 결과물 확인: {OUT_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


문항 추출 중: 100%|██████████| 38/38 [00:00<00:00, 5368.26it/s]

✅ 처리가 완료되었습니다. 결과물 확인: /content/drive/MyDrive/PBMAI/03_Output/crops





In [None]:
# @title 강대X 38문항 2단통 : 해설 (600dpi, Grayscale 고정)
# === Colab: 600dpi 고해상도 / Grayscale / PDF 가시 영역 기준 크롭 ===

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
# Folder scanned for input PDFs, and folder that receives the cropped PNG files.
INPUT_DIR = "/content/drive/MyDrive/PBMAI/02_Input/Solution"
OUT_DIR   = "/content/drive/MyDrive/PBMAI/03_Output/crops"
os.makedirs(OUT_DIR, exist_ok=True)

# Spreadsheet (full URL or bare key) and the worksheet that holds one crop box per row.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1BIZV74AUr_A2KCsJJCOWEQ7KMIIXIReOSrnRCkGpKbs/edit?"
WORKSHEET_NAME = "해2단40"

# 600 dpi rendering.
# NOTE(review): RETRY_TIMES, OVERWRITE, SKIP_OUT_OF_RANGE_PAGES and
# OUTPUTS_VERBOSE are not referenced by the code visible in this cell.
DPI                = 600
RETRY_TIMES        = 2
OVERWRITE          = False
SKIP_DONE          = True   # rows whose "status" reads "done" are skipped by the main loop
SKIP_OUT_OF_RANGE_PAGES = True
OUTPUTS_VERBOSE    = False

# Image options: grayscale only.
# NOTE(review): FORCE_BW, BW_DITHER, BW_THRESHOLD and PAGE_TRIM_MODE are also
# unused in the visible code of this cell.
GRAYSCALE_RENDER = True
FORCE_BW        = False
BW_DITHER       = "NONE"
BW_THRESHOLD    = None

PAGE_TRIM_MODE = "none"

# ----[2] Authentication and Google Sheets connection ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Default credentials of the authenticated Colab user, scoped for Sheets access
# and read-only Drive access; refreshed first when not currently valid.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive.readonly"]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid: creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] gspread 안전 호출 ----
def safe_call(fn, *args, tries=10, base=2.0, jitter=1.0, max_wait=90.0, **kwargs):
    """Invoke ``fn(*args, **kwargs)``, retrying with jittered exponential backoff.

    Any ``Exception`` triggers a retry until ``tries`` attempts have been made;
    the exception from the last attempt propagates. The wait before retry ``i``
    is ``base * 2**i`` plus up to ``jitter`` seconds, capped at ``max_wait``.
    Returns ``fn``'s result on success.
    """
    final_attempt = tries - 1
    for attempt in range(tries):
        try:
            result = fn(*args, **kwargs)
        except Exception:
            if attempt >= final_attempt:
                raise
            pause = base * (2 ** attempt) + random.random() * jitter
            time.sleep(min(pause, max_wait))
        else:
            return result

def _extract_key(url_or_key:str):
    """Pull the key out of a ``/spreadsheets/d/<key>`` URL.

    Text without that segment is handed back untouched.
    """
    hit = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if not hit:
        return url_or_key
    return hit.group(1)

sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Reset the log columns and load the data ----
values_all = safe_call(ws.get_all_values)
if not values_all: raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2: safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# Make sure every column the pipeline reads or writes exists in the header row.
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
for col in NEEDED:
    if col not in header: header.append(col)
# Keyword arguments work with both the old (range, values) and the new
# (values, range_name) positional order of Worksheet.update.
safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}

# Re-read after clearing M:R and rewriting the header. The earlier snapshot
# still held the old log values (so SKIP_DONE acted on statuses that had just
# been wiped) and its rows are narrower than a header that gained columns.
values_all = safe_call(ws.get_all_values)
# Pad or trim each data row to the header width so the constructor cannot fail
# on a length mismatch.
rows = [(rec + [""] * len(header))[:len(header)] for rec in values_all[1:]]
df = pd.DataFrame(rows, columns=header)
df["_rownum"] = df.index + 2  # 1-based sheet row number (row 1 is the header)

# ----[4] Image-processing utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=600, grayscale=True):
    """Rasterize one PDF page into a PIL image.

    Renders via ``page.get_pixmap`` at ``dpi / 72`` zoom with no alpha channel,
    in gray or RGB colorspace depending on ``grayscale``. The PIL mode is "L"
    when the pixmap reports ``n == 1`` and "RGB" otherwise.
    """
    scale = dpi / 72.0
    colorspace = fitz.csGRAY if grayscale else fitz.csRGB
    pixmap = page.get_pixmap(
        matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace
    )
    if pixmap.n == 1:
        pil_mode = "L"
    else:
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def parse_pages_field(pages_field):
    """Expand a pages cell such as "1, 3-5" into a sorted list of unique ints.

    Comma-separated tokens are either a single number or an inclusive "a-b"
    range (given in either order). Malformed tokens are skipped.

    Changes from the previous version: only ``ValueError`` is caught (a bare
    ``except`` also swallowed KeyboardInterrupt), and single tokens are tested
    with ``isdecimal()`` because ``isdigit()`` accepts characters such as "²"
    that ``int()`` then rejects with an uncaught ValueError.
    """
    s = str(pages_field).strip()
    if not s:
        return []
    out = set()
    for tok in re.split(r"\s*,\s*", s):
        if not tok:
            continue
        if "-" in tok:
            a, b = tok.split("-", 1)
            try:
                a, b = int(a), int(b)
            except ValueError:
                continue  # malformed range such as "3-" or "a-b"
            out.update(range(min(a, b), max(a, b) + 1))
        elif tok.isdecimal():
            out.add(int(tok))
    return sorted(out)

def as_float(x, default=0.0):
    """Convert a sheet cell to a finite float, falling back to ``default``.

    ``x`` is stringified and stripped before conversion. ``default`` is returned
    when the text is not a number, and also for NaN/infinity, which would
    otherwise make the later ``int(round(...))`` pixel conversion raise.
    """
    import math  # local import: the notebook's import cell does not load math

    try:
        value = float(str(x).strip())
    except (TypeError, ValueError):
        return default
    return value if math.isfinite(value) else default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h into a pixel box ``(x1, y1, x2, y2)``.

    With ``unit == "px"`` the values are used as pixels. For any other unit
    (a missing or empty unit falls back to "frac") they are multiplied by the
    image width ``W`` or height ``H``. The left/top edges are floored at 0 and
    the right/bottom edges are capped at ``W`` / ``H``.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    left, top, width, height = (
        as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")
    )
    right, bottom = left + width, top + height
    if unit != "px":
        left, right = left * W, right * W
        top, bottom = top * H, bottom * H
    x1, y1, x2, y2 = (int(round(v)) for v in (left, top, right, bottom))
    return (max(0, x1), max(0, y1), min(W, x2), min(H, y2))

def apply_inner_trim(img, row_item):
    """Shave extra margins off ``img`` using the row's ``inner_*`` values.

    Each ``inner_left`` / ``inner_right`` / ``inner_top`` / ``inner_bottom``
    value is multiplied by the image width or height and truncated to whole
    pixels. If the four pixel amounts sum to 0 the image is returned as is.
    """
    W, H = img.size
    cut = {
        side: int(as_float(row_item.get(f"inner_{side}", 0)) * extent)
        for side, extent in (("left", W), ("right", W), ("top", H), ("bottom", H))
    }
    if sum(cut.values()) == 0:
        return img
    return img.crop((cut["left"], cut["top"], W - cut["right"], H - cut["bottom"]))

def save_with_tmp_and_verify(crop, out_path):
    """Write ``crop`` as a PNG into TMP_DIR, then move it to ``out_path``.

    The staging file reuses the base name of ``out_path``. Despite the name,
    no verification of the written file happens here.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    # Optimized PNG at maximum compression.
    crop.save(staging_path, format="PNG", optimize=True, compress_level=9)
    shutil.move(staging_path, out_path)

def trimmed_page_image(doc, page_idx):
    """Render 1-based page ``page_idx`` of ``doc`` with the notebook settings.

    Uses the module-level DPI and GRAYSCALE_RENDER values; no trimming is
    applied in this function.
    """
    return render_page_to_image(doc[page_idx - 1], DPI, grayscale=GRAYSCALE_RENDER)

# ----[5] Main loop ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR,"*.pdf")))


def KST_NOW():
    """Return the current KST time as an ISO-8601 string with second precision."""
    return datetime.now(KST).isoformat(timespec="seconds")


for _, row in tqdm(df.iterrows(), total=len(df), desc="해설 추출 중"):
    r = row.to_dict(); rownum = int(r["_rownum"])
    if SKIP_DONE and (r.get("status","").lower()=="done"): continue

    pages_list = parse_pages_field(r.get("pages",""))
    if not pages_list: continue

    outputs = []; errors = []

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try: doc = fitz.open(pdf_path)
        except Exception as e: errors.append(f"open_fail:{e}"); continue

        try:
            # Only pages that exist in this PDF are processed.
            for p in pages_list:
                if not 1 <= p <= len(doc): continue
                try:
                    img = trimmed_page_image(doc, p)
                    W, H = img.size
                    x1, y1, x2, y2 = rect_from_row(r, W, H)
                    crop = img.crop((x1, y1, x2, y2))
                    crop = apply_inner_trim(crop, r)

                    out_name = f"{pdf_stem}_p{p:03d}_{r['box_id']}.png"
                    out_path = os.path.join(OUT_DIR, out_name)
                    save_with_tmp_and_verify(crop, out_path)
                    outputs.append(out_name)
                except Exception as e: errors.append(f"page{p}_err:{e}")
        finally:
            # Close the document even if something unexpected escapes the loop.
            doc.close()

    # Update the sheet. Only the log cells that changed are written, in one
    # request. Writing the whole row back re-sent the values read at load time
    # (restoring log cells that had just been cleared) and replaced every other
    # cell in the row with its plain displayed value.
    st = "done" if outputs else "error"
    patch = {"status": st, "last_error": "; ".join(errors)[:500], "processed_at": KST_NOW()}
    safe_call(ws.batch_update, [
        {"range": rowcol_to_a1(rownum, col_idx[name]), "values": [[value]]}
        for name, value in patch.items()
    ])

print("✅ 모든 해설 파일 처리가 완료되었습니다.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  return fn(*args, **kwargs)


Found 4 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 40/40 [00:50<00:00,  1.25s/it]

✅ All rows processed across all PDFs (no pre-trim).





# 써킷 X 12 문항별

In [7]:
# @title 써킷X 12문항 문항별 : 문제 (600dpi, Grayscale 고정)
# === Colab: 써킷X_문제_12문항 (600dpi + Grayscale) ===
# 원본 PDF의 가시 영역(CropBox) 반영 및 고해상도 변환

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
# Folder scanned for input PDFs, and folder that receives the cropped PNG files.
INPUT_DIR = "/content/drive/MyDrive/PBMAI/02_Input/Problem"
OUT_DIR   = "/content/drive/MyDrive/PBMAI/03_Output/crops"
os.makedirs(OUT_DIR, exist_ok=True)

# Spreadsheet (full URL or bare key) and the worksheet that holds one crop box per row.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1BIZV74AUr_A2KCsJJCOWEQ7KMIIXIReOSrnRCkGpKbs/edit?"
WORKSHEET_NAME = "문12써킷"

# 600 dpi rendering.
# NOTE(review): RETRY_TIMES, OVERWRITE and SKIP_OUT_OF_RANGE_PAGES are not
# referenced by the code visible in this cell.
DPI                = 600
RETRY_TIMES        = 2
OVERWRITE          = True
SKIP_DONE          = False  # every row is processed, whatever its "status" says
SKIP_OUT_OF_RANGE_PAGES = True

# Image options: grayscale only. FORCE_BW is also unused in the visible code.
GRAYSCALE_RENDER = True
FORCE_BW        = False

# ----[2] Authentication and Google Sheets connection ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Default credentials of the authenticated Colab user, scoped for Sheets access
# and read-only Drive access; refreshed first when not currently valid.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive.readonly"]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid: creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] gspread 안전 호출 ----
def safe_call(fn, *args, tries=10, base=2.0, jitter=1.0, max_wait=90.0, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying with jittered exponential backoff.

    Args:
        fn: Callable to invoke; ``*args`` / ``**kwargs`` are forwarded to it.
        tries: Maximum number of attempts.
        base: Base delay in seconds; doubles on every retry.
        jitter: Upper bound of the random extra delay added to each wait.
        max_wait: Cap for a single delay, in seconds.

    Returns:
        The result of the first successful call.

    Raises:
        The exception raised by the final attempt. The previous version ended
        with a bare ``raise`` outside any ``except`` block, which produced
        ``RuntimeError: No active exception to re-raise`` and hid the real
        error, after one more pointless sleep.
    """
    for attempt in range(tries):
        try:
            return fn(*args, **kwargs)
        except Exception:
            if attempt >= tries - 1:
                raise
            time.sleep(min(base * (2 ** attempt) + random.random() * jitter, max_wait))

def _extract_key(url_or_key:str):
    """Return the ``<key>`` part of a ``/spreadsheets/d/<key>`` URL.

    Input that does not contain such a segment is returned as given.
    """
    match = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if match:
        return match.group(1)
    return url_or_key

sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Reset the log columns and load the data ----
values_all = safe_call(ws.get_all_values)
if not values_all: raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2: safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# Make sure every column the pipeline reads or writes exists in the header row.
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
for col in NEEDED:
    if col not in header: header.append(col)
# Keyword arguments work with both the old (range, values) and the new
# (values, range_name) positional order of Worksheet.update.
safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}

# Re-read after clearing M:R and rewriting the header. The earlier snapshot
# still held the old log values, and its rows are narrower than a header that
# gained columns, which makes the DataFrame constructor fail.
values_all = safe_call(ws.get_all_values)
# Pad or trim each data row to the header width.
rows = [(rec + [""] * len(header))[:len(header)] for rec in values_all[1:]]
df = pd.DataFrame(rows, columns=header)
df["_rownum"] = df.index + 2  # 1-based sheet row number (row 1 is the header)

# ----[4] Image-processing utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=600, grayscale=True):
    """Rasterize one PDF page into a PIL image.

    Renders via ``page.get_pixmap`` at ``dpi / 72`` zoom with no alpha channel.
    ``grayscale`` selects both the render colorspace (gray or RGB) and the PIL
    mode ("L" or "RGB").
    """
    scale = dpi / 72.0
    if grayscale:
        colorspace, pil_mode = fitz.csGRAY, "L"
    else:
        colorspace, pil_mode = fitz.csRGB, "RGB"
    pixmap = page.get_pixmap(
        matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace
    )
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def parse_pages_field(pages_field):
    """Expand a pages cell such as "1, 3-5" into a sorted list of unique ints.

    Comma-separated tokens are either a single number or an inclusive "a-b"
    range (given in either order). Malformed tokens are skipped.

    Changes from the previous version: only ``ValueError`` is caught (a bare
    ``except`` also swallowed KeyboardInterrupt), and single tokens are tested
    with ``isdecimal()`` because ``isdigit()`` accepts characters such as "²"
    that ``int()`` then rejects with an uncaught ValueError.
    """
    s = str(pages_field).strip()
    if not s:
        return []
    out = set()
    for tok in re.split(r"\s*,\s*", s):
        if not tok:
            continue
        if "-" in tok:
            a, b = tok.split("-", 1)
            try:
                a, b = int(a), int(b)
            except ValueError:
                continue  # malformed range such as "3-" or "a-b"
            out.update(range(min(a, b), max(a, b) + 1))
        elif tok.isdecimal():
            out.add(int(tok))
    return sorted(out)

def as_float(x, default=0.0):
    """Convert a sheet cell to a finite float, falling back to ``default``.

    ``x`` is stringified and stripped before conversion. ``default`` is returned
    when the text is not a number, and also for NaN/infinity, which would
    otherwise make the later ``int(round(...))`` pixel conversion raise.
    """
    import math  # local import: the notebook's import cell does not load math

    try:
        value = float(str(x).strip())
    except (TypeError, ValueError):
        return default
    return value if math.isfinite(value) else default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h into a pixel box ``(x1, y1, x2, y2)``.

    With ``unit == "px"`` the values are used as pixels. For any other unit
    (a missing or empty unit falls back to "frac") they are multiplied by the
    image width ``W`` or height ``H``. The left/top edges are floored at 0 and
    the right/bottom edges are capped at ``W`` / ``H``.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    left, top, width, height = (
        as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")
    )
    right, bottom = left + width, top + height
    if unit != "px":
        left, right = left * W, right * W
        top, bottom = top * H, bottom * H
    x1, y1, x2, y2 = (int(round(v)) for v in (left, top, right, bottom))
    return (max(0, x1), max(0, y1), min(W, x2), min(H, y2))

def apply_inner_trim(img, row_item):
    """Shave extra margins off ``img`` using the row's ``inner_*`` values.

    Each ``inner_left`` / ``inner_right`` / ``inner_top`` / ``inner_bottom``
    value is multiplied by the image width or height and truncated to whole
    pixels. If the four pixel amounts sum to 0 the image is returned as is.
    """
    W, H = img.size
    cut = {
        side: int(as_float(row_item.get(f"inner_{side}", 0)) * extent)
        for side, extent in (("left", W), ("right", W), ("top", H), ("bottom", H))
    }
    if sum(cut.values()) == 0:
        return img
    return img.crop((cut["left"], cut["top"], W - cut["right"], H - cut["bottom"]))

def save_with_tmp_and_verify(crop, out_path):
    """Write ``crop`` as a PNG into TMP_DIR, then move it to ``out_path``.

    The staging file reuses the base name of ``out_path``. Despite the name,
    no verification of the written file happens here.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    crop.save(staging_path, format="PNG", optimize=True, compress_level=9)
    shutil.move(staging_path, out_path)

# ----[5] Main loop ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR,"*.pdf")))


def KST_NOW():
    """Return the current KST time as an ISO-8601 string with second precision."""
    return datetime.now(KST).isoformat(timespec="seconds")


for _, row in tqdm(df.iterrows(), total=len(df), desc="문항 추출 중"):
    r = row.to_dict(); rownum = int(r["_rownum"])
    if SKIP_DONE and (r.get("status","").lower()=="done"): continue

    pages_list = parse_pages_field(r.get("pages",""))
    if not pages_list: continue

    # Value of the sheet's "filename" column (column L according to the author's note).
    custom_filename = str(r.get("filename", "")).strip()

    outputs, errors = [], []

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"pdf_err:{e}")
            continue

        try:
            target_pages = [p for p in pages_list if 1 <= p <= len(doc)]

            for p in target_pages:
                try:
                    img = render_page_to_image(doc[p-1], DPI, GRAYSCALE_RENDER)
                    W, H = img.size
                    x1, y1, x2, y2 = rect_from_row(r, W, H)
                    crop = img.crop((x1, y1, x2, y2))
                    crop = apply_inner_trim(crop, r)

                    # File name: PDF stem + "filename" value. Single-page rows
                    # keep the original naming. A row spanning several pages
                    # gets a page suffix, because every page used to be saved
                    # under the same name, leaving only the last one on disk.
                    # NOTE(review): rows with an empty "filename" still all map
                    # to "<pdf_stem>.png" and overwrite each other.
                    name_parts = [pdf_stem]
                    if custom_filename:
                        name_parts.append(custom_filename)
                    if len(target_pages) > 1:
                        name_parts.append(f"p{p:03d}")
                    out_name = "_".join(name_parts) + ".png"

                    out_path = os.path.join(OUT_DIR, out_name)
                    save_with_tmp_and_verify(crop, out_path)
                    outputs.append(out_name)
                except Exception as e: errors.append(f"p{p}_err:{e}")
        except Exception as e:
            errors.append(f"pdf_err:{e}")
        finally:
            # Previously a failure before doc.close() left the document open.
            doc.close()

    # Record the result. Only the log cells that changed are written, in one
    # request. Writing the whole row back re-sent the values read at load time
    # (restoring log cells that had just been cleared) and replaced every other
    # cell in the row with its plain displayed value.
    st = "done" if outputs else "error"
    patch = {"status": st, "last_error": "; ".join(errors)[:500], "processed_at": KST_NOW()}
    safe_call(ws.batch_update, [
        {"range": rowcol_to_a1(rownum, col_idx[name]), "values": [[value]]}
        for name, value in patch.items()
    ])

print("✅ 써킷X 문항 처리가 완료되었습니다.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  try: return fn(*args, **kwargs)
  try: return fn(*args, **kwargs)
문항 추출 중: 100%|██████████| 12/12 [01:15<00:00,  6.29s/it]

✅ 써킷X 문항 처리가 완료되었습니다.





In [11]:
# @title 써킷X 12문항 2단통 : 해설 (600dpi, Grayscale 고정)
# === Colab: 써킷X_해설_2단통 (고해상도 + 그레이스케일) ===

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
# Folder scanned for input PDFs, and folder that receives the cropped PNG files.
INPUT_DIR = "/content/drive/MyDrive/PBMAI/02_Input/Solution"
OUT_DIR   = "/content/drive/MyDrive/PBMAI/03_Output/crops"
os.makedirs(OUT_DIR, exist_ok=True)

# Spreadsheet (full URL or bare key) and the worksheet that holds one crop box per row.
# NOTE(review): this is the same worksheet name the 강대X solution cell uses,
# although this cell is titled for 써킷X — confirm it is the intended sheet.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1BIZV74AUr_A2KCsJJCOWEQ7KMIIXIReOSrnRCkGpKbs/edit?"
WORKSHEET_NAME = "해2단40"

# Resolution raised to 600 dpi.
# NOTE(review): RETRY_TIMES, OVERWRITE, SKIP_OUT_OF_RANGE_PAGES and
# OUTPUTS_VERBOSE are not referenced by the code visible in this cell.
DPI                = 600
RETRY_TIMES        = 2
OVERWRITE          = True
SKIP_DONE          = False  # every row is processed, whatever its "status" says
SKIP_OUT_OF_RANGE_PAGES = True
OUTPUTS_VERBOSE    = False

# Image options: grayscale only. FORCE_BW is also unused in the visible code.
GRAYSCALE_RENDER = True
FORCE_BW        = False

# Pre-trim disabled (the PDF's own visible area is used).
# PAGE_TRIM_MODE is not read by the code visible in this cell.
PAGE_TRIM_MODE = "none"

# ----[2] Authentication and Google Sheets connection ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Default credentials of the authenticated Colab user, scoped for Sheets access
# and read-only Drive access; refreshed first when not currently valid.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive.readonly"]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid: creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] gspread 안전 호출 ----
def safe_call(fn, *args, tries=10, base=2.0, jitter=1.0, max_wait=90.0, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying with jittered exponential backoff.

    Args:
        fn: Callable to invoke; ``*args`` / ``**kwargs`` are forwarded to it.
        tries: Maximum number of attempts.
        base: Base delay in seconds; doubles on every retry.
        jitter: Upper bound of the random extra delay added to each wait.
        max_wait: Cap for a single delay, in seconds.

    Returns:
        The result of the first successful call.

    Raises:
        The exception raised by the final attempt. The previous version ended
        with a bare ``raise`` outside any ``except`` block, which produced
        ``RuntimeError: No active exception to re-raise`` and hid the real
        error, after one more pointless sleep.
    """
    for attempt in range(tries):
        try:
            return fn(*args, **kwargs)
        except Exception:
            if attempt >= tries - 1:
                raise
            time.sleep(min(base * (2 ** attempt) + random.random() * jitter, max_wait))

def _extract_key(url_or_key:str):
    """Extract the spreadsheet key from a ``/spreadsheets/d/<key>`` URL.

    Any other text is passed through unchanged.
    """
    located = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if located is not None:
        return located.group(1)
    return url_or_key

sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Reset the log columns and load the data ----
values_all = safe_call(ws.get_all_values)
if not values_all: raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2: safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# Make sure every column the pipeline reads or writes exists in the header row.
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
for col in NEEDED:
    if col not in header: header.append(col)
# Keyword arguments work with both the old (range, values) and the new
# (values, range_name) positional order of Worksheet.update.
safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}

# Re-read after clearing M:R and rewriting the header. The earlier snapshot
# still held the old log values, and its rows are narrower than a header that
# gained columns, which makes the DataFrame constructor fail.
values_all = safe_call(ws.get_all_values)
# Pad or trim each data row to the header width.
rows = [(rec + [""] * len(header))[:len(header)] for rec in values_all[1:]]
df = pd.DataFrame(rows, columns=header)
df["_rownum"] = df.index + 2  # 1-based sheet row number (row 1 is the header)

# ----[4] Image-processing utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=600, grayscale=True):
    """Rasterize one PDF page into a PIL image.

    Renders via ``page.get_pixmap`` at ``dpi / 72`` zoom with no alpha channel.
    ``grayscale`` selects both the render colorspace (gray or RGB) and the PIL
    mode ("L" or "RGB").
    """
    factor = dpi / 72.0
    transform = fitz.Matrix(factor, factor)
    if grayscale:
        pixmap = page.get_pixmap(matrix=transform, alpha=False, colorspace=fitz.csGRAY)
        pil_mode = "L"
    else:
        pixmap = page.get_pixmap(matrix=transform, alpha=False, colorspace=fitz.csRGB)
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def parse_pages_field(pages_field):
    """Expand a pages cell such as "1, 3-5" into a sorted list of unique ints.

    Comma-separated tokens are either a single number or an inclusive "a-b"
    range (given in either order). Malformed tokens are skipped.

    Changes from the previous version: only ``ValueError`` is caught (a bare
    ``except`` also swallowed KeyboardInterrupt), and single tokens are tested
    with ``isdecimal()`` because ``isdigit()`` accepts characters such as "²"
    that ``int()`` then rejects with an uncaught ValueError.
    """
    s = str(pages_field).strip()
    if not s:
        return []
    out = set()
    for tok in re.split(r"\s*,\s*", s):
        if not tok:
            continue
        if "-" in tok:
            a, b = tok.split("-", 1)
            try:
                a, b = int(a), int(b)
            except ValueError:
                continue  # malformed range such as "3-" or "a-b"
            out.update(range(min(a, b), max(a, b) + 1))
        elif tok.isdecimal():
            out.add(int(tok))
    return sorted(out)

def as_float(x, default=0.0):
    """Convert a sheet cell to a finite float, falling back to ``default``.

    ``x`` is stringified and stripped before conversion. ``default`` is returned
    when the text is not a number, and also for NaN/infinity, which would
    otherwise make the later ``int(round(...))`` pixel conversion raise.
    """
    import math  # local import: the notebook's import cell does not load math

    try:
        value = float(str(x).strip())
    except (TypeError, ValueError):
        return default
    return value if math.isfinite(value) else default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h into a pixel box ``(x1, y1, x2, y2)``.

    With ``unit == "px"`` the values are used as pixels. For any other unit
    (a missing or empty unit falls back to "frac") they are multiplied by the
    image width ``W`` or height ``H``. The left/top edges are floored at 0 and
    the right/bottom edges are capped at ``W`` / ``H``.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    left, top, width, height = (
        as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")
    )
    right, bottom = left + width, top + height
    if unit != "px":
        left, right = left * W, right * W
        top, bottom = top * H, bottom * H
    x1, y1, x2, y2 = (int(round(v)) for v in (left, top, right, bottom))
    return (max(0, x1), max(0, y1), min(W, x2), min(H, y2))

def apply_inner_trim(img, row_item):
    """Shave extra margins off ``img`` using the row's ``inner_*`` values.

    Each ``inner_left`` / ``inner_right`` / ``inner_top`` / ``inner_bottom``
    value is multiplied by the image width or height and truncated to whole
    pixels. If the four pixel amounts sum to 0 the image is returned as is.
    """
    W, H = img.size
    cut = {
        side: int(as_float(row_item.get(f"inner_{side}", 0)) * extent)
        for side, extent in (("left", W), ("right", W), ("top", H), ("bottom", H))
    }
    if sum(cut.values()) == 0:
        return img
    return img.crop((cut["left"], cut["top"], W - cut["right"], H - cut["bottom"]))

def save_with_tmp_and_verify(crop, out_path):
    """Write ``crop`` as a PNG into TMP_DIR, then move it to ``out_path``.

    The staging file reuses the base name of ``out_path``. Despite the name,
    no verification of the written file happens here.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    crop.save(staging_path, format="PNG", optimize=True, compress_level=9)
    shutil.move(staging_path, out_path)

# ----[5] Main loop ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR,"*.pdf")))


def KST_NOW():
    """Return the current KST time as an ISO-8601 string with second precision."""
    return datetime.now(KST).isoformat(timespec="seconds")


for _, row in tqdm(df.iterrows(), total=len(df), desc="해설 추출 중"):
    r = row.to_dict(); rownum = int(r["_rownum"])
    if SKIP_DONE and (r.get("status","").lower()=="done"): continue

    pages_list = parse_pages_field(r.get("pages",""))
    if not pages_list: continue

    # Value of the sheet's "filename" column (column L according to the author's note).
    custom_filename = str(r.get("filename", "")).strip()

    outputs, errors = [], []

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"pdf_err:{e}")
            continue

        try:
            target_pages = [p for p in pages_list if 1 <= p <= len(doc)]

            for p in target_pages:
                try:
                    img = render_page_to_image(doc[p-1], DPI, GRAYSCALE_RENDER)
                    W, H = img.size
                    x1, y1, x2, y2 = rect_from_row(r, W, H)
                    crop = img.crop((x1, y1, x2, y2))
                    crop = apply_inner_trim(crop, r)

                    # File name: PDF stem + "filename" value. Single-page rows
                    # keep the original naming. A row spanning several pages
                    # gets a page suffix, because every page used to be saved
                    # under the same name, leaving only the last one on disk.
                    # NOTE(review): rows with an empty "filename" still all map
                    # to "<pdf_stem>.png" and overwrite each other.
                    name_parts = [pdf_stem]
                    if custom_filename:
                        name_parts.append(custom_filename)
                    if len(target_pages) > 1:
                        name_parts.append(f"p{p:03d}")
                    out_name = "_".join(name_parts) + ".png"

                    out_path = os.path.join(OUT_DIR, out_name)
                    save_with_tmp_and_verify(crop, out_path)
                    outputs.append(out_name)
                except Exception as e: errors.append(f"p{p}_err:{e}")
        except Exception as e:
            errors.append(f"pdf_err:{e}")
        finally:
            # Previously a failure before doc.close() left the document open.
            doc.close()

    # Record the result. Only the log cells that changed are written, in one
    # request. Writing the whole row back re-sent the values read at load time
    # (restoring log cells that had just been cleared) and replaced every other
    # cell in the row with its plain displayed value.
    st = "done" if outputs else "error"
    patch = {"status": st, "last_error": "; ".join(errors)[:500], "processed_at": KST_NOW()}
    safe_call(ws.batch_update, [
        {"range": rowcol_to_a1(rownum, col_idx[name]), "values": [[value]]}
        for name, value in patch.items()
    ])

print("✅ 써킷X 해설 처리가 완료되었습니다.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  try: return fn(*args, **kwargs)
  try: return fn(*args, **kwargs)
해설 추출 중:  20%|██        | 8/40 [00:24<01:38,  3.09s/it]


KeyboardInterrupt: 

# 강대 K

In [None]:
# @title  해설지 2단 100개 (사전 트림)
# === Colab: 2단 해설지 PDF 처리 (프리-트림 제거 버전) ===
#            + 워크시트 규격으로만 크롭
#            + 전체 PDF 일괄 처리 / 로그(M~R) 초기화 / gspread 백오프

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
INPUT_DIR = "/content/drive/MyDrive/PBMAI/02_Input/Solution"            # PDFs whose margins were already removed
OUT_DIR   = "/content/drive/MyDrive/PBMAI/03_Output/crops"     # folder that receives the results
os.makedirs(OUT_DIR, exist_ok=True)

# Spreadsheet (full URL or bare key) and the worksheet that holds one crop box per row.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1BIZV74AUr_A2KCsJJCOWEQ7KMIIXIReOSrnRCkGpKbs/edit?"
WORKSHEET_NAME = "2단통100"

# Render resolution and run flags. The rest of this cell is cut off in this
# view, so how each flag is consumed cannot be confirmed here.
DPI                = 300
RETRY_TIMES        = 2
OVERWRITE          = False
SKIP_DONE          = True
SKIP_OUT_OF_RANGE_PAGES = True
OUTPUTS_VERBOSE    = False

# ---- Image options ----
GRAYSCALE_RENDER = True   # default for render_page_to_image's ``grayscale`` argument
FORCE_BW        = False
BW_DITHER       = "NONE"
BW_THRESHOLD    = None

# Pre-trim removed: PAGE_TRIM_MODE is left at "none".
PAGE_TRIM_MODE = "none"

# ----[2] Authentication ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Default credentials of the authenticated Colab user, scoped for Sheets access
# and read-only Drive access; refreshed first when not currently valid.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.readonly",
]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid:
    creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] Safe gspread call with backoff ----
BACKOFF_TRIES = 10
BACKOFF_BASE = 2.0
BACKOFF_JITTER = 1.0
BACKOFF_MAX = 90.0

def safe_call(fn, *args, tries=BACKOFF_TRIES, base=BACKOFF_BASE,
              jitter=BACKOFF_JITTER, max_wait=BACKOFF_MAX, **kwargs):
    """Call ``fn(*args, **kwargs)`` with retry and jittered exponential backoff.

    An ``APIError`` is retried only when it looks transient: status code 429,
    500 or 503, or "quota" / "rate" appearing in the lower-cased response or
    message text. Transient errors may use every remaining attempt. Any other
    exception is retried at most three times. Once retries run out, or the
    error is not retryable, the exception propagates.

    The wait before retry ``i`` is ``base * 2**i`` plus up to ``jitter``
    seconds, capped at ``max_wait``.
    """
    def _pause(attempt):
        time.sleep(min(base*(2**attempt) + random.random()*jitter, max_wait))

    for i in range(tries):
        try:
            return fn(*args, **kwargs)
        except APIError as e:
            code = getattr(getattr(e, "response", None), "status_code", None)
            msg = (str(getattr(e, "response", "")) + " " + str(e)).lower()
            retryable = code in (429, 500, 503) or "quota" in msg or "rate" in msg
            if not (retryable and i < tries - 1):
                raise
            _pause(i)
        except Exception:
            if i >= min(3, tries-1):
                raise
            _pause(i)

def _extract_key(url_or_key:str):
    m = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    return m.group(1) if m else url_or_key

# Worksheet handle
sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Clear the log columns (M~R) below the header ----
values_all = safe_call(ws.get_all_values)
if not values_all:
    raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2:
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# ----[4] Header fix-up: append any required column that is missing ----
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
for col in NEEDED:
    if col not in header: header.append(col)
# Keyword arguments: the positional order of Worksheet.update() differs
# between gspread releases, the keyword names do not.
safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}   # 1-based column numbers

# ----[5] Load the data (re-read so the rows match the updated header) ----
records = safe_call(ws.get_all_values)
if len(records) < 2:
    # An empty row list yields an empty DataFrame that still has the header
    # columns; a placeholder row would not match the column count.
    print("데이터 행이 없습니다.")
df = pd.DataFrame(records[1:], columns=header)
df["_rownum"] = df.index + 2   # sheet row number (row 1 is the header)

# ----[6] Utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=300, grayscale=GRAYSCALE_RENDER):
    """Rasterize a PyMuPDF ``page`` at ``dpi`` and return it as a PIL image.

    The zoom factor is ``dpi / 72``. The PIL mode is picked from the pixmap's
    component count (1 -> "L", 4 -> "RGBA", anything else -> "RGB").
    """
    scale = dpi / 72.0
    if grayscale:
        colorspace = fitz.csGRAY
    else:
        colorspace = fitz.csRGB
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    if pixmap.n == 1:
        pil_mode = "L"
    elif pixmap.n == 4:
        pil_mode = "RGBA"
    else:
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def parse_pages_field(pages_field):
    """Parse a page spec such as ``"1, 3-5"`` into a sorted list of unique ints.

    Tokens are comma-separated; ``a-b`` is an inclusive range given in either
    order. Tokens that are not an integer or an integer range are skipped
    instead of raising, so one malformed cell cannot abort the whole run; a
    field with no usable token yields ``[]``.
    """
    text = str(pages_field).strip()
    if not text:
        return []
    pages = set()
    for tok in re.split(r"\s*,\s*", text):
        if not tok:
            continue
        try:
            if "-" in tok:
                first, last = (int(part) for part in tok.split("-", 1))
                pages.update(range(min(first, last), max(first, last) + 1))
            else:
                pages.add(int(tok))
        except ValueError:
            # Not a number / range (e.g. "abc", "7-x"): ignore this token.
            continue
    return sorted(pages)

def as_float(x, default=0.0):
    """Convert ``x`` to float via its stripped string form; return ``default`` when it does not parse."""
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        # Narrow on purpose: a bare except would also swallow KeyboardInterrupt.
        return default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h into a pixel box ``(x1, y1, x2, y2)`` clamped to ``W`` x ``H``.

    With unit "px" the values are used as pixels; otherwise (default "frac")
    they are multiplied by the image size. Raises ValueError when the clamped
    box has no area.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    left, top, width, height = (as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h"))
    if unit == "px":
        corners = (left, top, left + width, top + height)
    else:
        corners = (left * W, top * H, (left + width) * W, (top + height) * H)
    x1, y1, x2, y2 = (int(round(value)) for value in corners)
    # Keep the box inside the image.
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(W, x2)
    y2 = min(H, y2)
    if x2 <= x1 or y2 <= y1:
        raise ValueError(f"잘못된 박스: ({x1},{y1})~({x2},{y2})")
    return (x1, y1, x2, y2)

def apply_inner_trim(img, row_item):
    """Shave the row's inner_top/bottom/left/right fractions off ``img``.

    Each fraction is clamped to [0, 1]; when all four are zero the image is
    returned untouched.
    """
    def _fraction(key):
        return max(0.0, min(1.0, as_float(row_item.get(key, 0))))

    top = _fraction("inner_top")
    bottom = _fraction("inner_bottom")
    left = _fraction("inner_left")
    right = _fraction("inner_right")
    if top + bottom + left + right == 0:
        return img
    W, H = img.size
    cut_left = int(round(left * W))
    cut_right = int(round(right * W))
    cut_top = int(round(top * H))
    cut_bottom = int(round(bottom * H))
    return img.crop((cut_left, cut_top, W - cut_right, H - cut_bottom))

def maybe_binarize(img):
    """Return ``img`` as a 1-bit image when FORCE_BW is set, otherwise unchanged.

    With BW_THRESHOLD set, the image is auto-contrasted and thresholded;
    otherwise it is converted with the dithering selected by BW_DITHER.
    """
    if FORCE_BW:
        gray = img.convert("L")
        if BW_THRESHOLD is None:
            if BW_DITHER.upper() == "FS":
                dither_mode = Image.FLOYDSTEINBERG
            else:
                dither_mode = Image.NONE
            return gray.convert("1", dither=dither_mode)

        def _to_bw(p):
            return 255 if p >= BW_THRESHOLD else 0

        stretched = ImageOps.autocontrast(gray)
        return stretched.point(_to_bw, mode="1")
    return img

def build_output_filename(base_name,pdf_stem,page_idx,box_id,multi_page=False):
    """Build the PNG file name for one crop.

    With ``base_name``: ``<pdf_stem>_<base_name>.png``, plus a ``_pNNN`` page
    suffix when ``multi_page``. Without it: ``<pdf_stem>_pageNNN_box<id>.png``.
    """
    if not base_name:
        box_no = int(as_float(box_id, 0)) or 0
        return f"{pdf_stem}_page{page_idx:03d}_box{box_no}.png"
    name = base_name.strip()
    if not name.lower().endswith(".png"):
        name = f"{name}.png"
    if multi_page:
        stem, ext = os.path.splitext(name)
        name = f"{stem}_p{page_idx:03d}{ext}"
    return f"{pdf_stem}_{name}"

def verify_image(path):
    """Check that ``path`` is a readable image; return ``(ok, reason)`` with an empty reason on success."""
    if os.path.exists(path):
        if os.path.getsize(path) == 0:
            return (False, "file_size_zero")
        try:
            with Image.open(path) as handle:
                handle.verify()
        except Exception as err:
            return (False, f"PIL_verify_error:{err}")
        return (True, "")
    return (False, "file_not_found")

def save_with_tmp_and_verify(crop,out_path):
    """Save ``crop`` as PNG under TMP_DIR, verify it, then move it to ``out_path``.

    Raises IOError when the temporary file fails verification.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    # Optional 1-bit conversion happens right before saving.
    final_img = maybe_binarize(crop)
    final_img.save(staging_path, "PNG", optimize=True, compress_level=9)
    verified, reason = verify_image(staging_path)
    if not verified:
        raise IOError(f"save_verify_failed:{reason}")
    shutil.move(staging_path, out_path)

# Pre-trim fully removed: the page is rendered as-is.
def trimmed_page_image(doc,page_idx):
    """Render page ``page_idx`` (1-based) of ``doc`` at DPI without any trimming."""
    return render_page_to_image(doc[page_idx - 1], DPI, grayscale=GRAYSCALE_RENDER)

# ----[7] Collect the PDFs in the input folder ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR,"*.pdf")))
if not pdf_paths:
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
print(f"Found {len(pdf_paths)} PDFs")

# ----[8] Run: every sheet row (one crop box) is applied to every PDF ----
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

for _,row in tqdm(df.iterrows(),total=len(df),desc="Processing rows"):
    r=row.to_dict(); rownum=int(r["_rownum"])
    # The attempt counter is bumped even when the row is skipped below.
    attempts_val=int(as_float(r.get("attempts",0)))+1
    safe_call(ws.update_cell,rownum,col_idx["attempts"],str(attempts_val))
    if SKIP_DONE and (r.get("status","").lower()=="done"): continue

    pages_list=parse_pages_field(r.get("pages",""))
    if not pages_list:
        # Fall back to a single-page "page" column when the sheet has one.
        if "page" in header and str(r.get("page","")).strip():
            pages_list=[int(as_float(r.get("page"),0))]
        else:
            safe_call(ws.update_cell,rownum,col_idx["status"],"error")
            safe_call(ws.update_cell,rownum,col_idx["last_error"],"no_pages_specified")
            continue

    multi_page=len(pages_list)>1
    box_id=r.get("box_id",""); base_name=(r.get("filename","") or "").strip()
    outputs=[]; errors=[]; expected_total=0

    for pdf_path in pdf_paths:
        pdf_stem=os.path.splitext(os.path.basename(pdf_path))[0]
        try: doc=fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"open_fail:{pdf_stem}:{e}"); continue
        try:
            n_pages=len(doc)
            target_pages=[p for p in pages_list if 1<=p<=n_pages] if SKIP_OUT_OF_RANGE_PAGES else pages_list
            expected_total+=len(target_pages)

            for attempt in range(RETRY_TIMES):
                # Collected per attempt: a retry walks the pages again, and
                # appending straight to `outputs` would count pages twice and
                # could make a failed row look complete.
                saved=[]
                try:
                    for p in target_pages:
                        outname=build_output_filename(base_name,pdf_stem,p,box_id,multi_page)
                        out_path=os.path.join(OUT_DIR,outname)
                        if os.path.exists(out_path) and not OVERWRITE:
                            # Reuse an existing file only if it still verifies.
                            ok,_=verify_image(out_path)
                            if ok: saved.append(out_path); continue
                        img=trimmed_page_image(doc,p)
                        W,H=img.size
                        x1,y1,x2,y2=rect_from_row(r,W,H)
                        crop=img.crop((x1,y1,x2,y2))
                        crop=apply_inner_trim(crop,r)
                        save_with_tmp_and_verify(crop,out_path)
                        saved.append(out_path)
                    outputs.extend(saved)
                    break
                except Exception as e:
                    if attempt<RETRY_TIMES-1: time.sleep(0.1); continue
                    # Last attempt failed: keep the pages that did succeed.
                    outputs.extend(saved)
                    errors.append(f"{pdf_stem}:{e}")
        finally:
            # Close the document even if something unexpected was raised.
            doc.close()

    # Write the whole row back in one call.
    ok_cnt=len(outputs)
    st="done" if (expected_total>0 and ok_cnt==expected_total) else ("partial" if ok_cnt>0 else "error")
    outputs_cell="" if ok_cnt==0 else f"{ok_cnt}/{expected_total} files (e.g., {os.path.basename(outputs[0])})"
    patch={"status":st,"last_error":"" if st=="done" else "; ".join(errors)[:500],
           "outputs":outputs_cell,"processed_at":KST_NOW(),"attempts":str(attempts_val)}
    row_vals=[patch.get(h,r.get(h,"")) for h in header]
    a1=f"{rowcol_to_a1(rownum,1)}:{rowcol_to_a1(rownum,len(header))}"
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases, the keyword names do not.
    safe_call(ws.update,range_name=a1,values=[row_vals])
    time.sleep(0.1)   # small pause between rows

print("✅ All rows processed across all PDFs (no pre-trim).")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  return fn(*args, **kwargs)


Found 1 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 100/100 [01:25<00:00,  1.17it/s]

✅ All rows processed across all PDFs (no pre-trim).





In [None]:
# @title 해설지 3단 (600dpi, Grayscale 고정)
# === Colab: 3단 해설지 PDF 처리 (고해상도 + 그레이스케일) ===

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Google Drive 마운트 ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
INPUT_DIR = "/content/drive/MyDrive/PBMAI/02_Input/Solution"   # every *.pdf in this folder is processed
OUT_DIR   = "/content/drive/MyDrive/PBMAI/03_Output/crops"     # folder where the cropped PNGs are written
os.makedirs(OUT_DIR, exist_ok=True)

# Google Sheet (URL or bare key) and worksheet whose rows describe the crop boxes.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1BIZV74AUr_A2KCsJJCOWEQ7KMIIXIReOSrnRCkGpKbs/edit?"
WORKSHEET_NAME = "해3단"

# 600 dpi rendering
DPI                = 600
# NOTE(review): RETRY_TIMES, OVERWRITE, SKIP_OUT_OF_RANGE_PAGES and
# OUTPUTS_VERBOSE are not read by the code of this cell.
RETRY_TIMES        = 2
OVERWRITE          = False
SKIP_DONE          = True   # skip rows whose status is already "done"
SKIP_OUT_OF_RANGE_PAGES = True
OUTPUTS_VERBOSE    = False

# Image options: grayscale rendering
GRAYSCALE_RENDER = True
# NOTE(review): FORCE_BW, BW_DITHER, BW_THRESHOLD and PAGE_TRIM_MODE are not
# read by the code of this cell.
FORCE_BW        = False
BW_DITHER       = "NONE"
BW_THRESHOLD    = None

PAGE_TRIM_MODE = "none"

# ----[2] Authentication and Google Sheets connection ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Read/write scope for Sheets, read-only scope for Drive.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive.readonly"]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid: creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] gspread 안전 호출 ----
def safe_call(fn, *args, tries=10, base=2.0, jitter=1.0, max_wait=90.0, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying any exception with exponential backoff.

    Between attempts the delay is ``base * 2**i`` plus up to ``jitter``
    seconds, capped at ``max_wait``. After the last attempt the original
    exception is re-raised as-is (no sleep is spent after it).

    Raises:
        RuntimeError: if ``tries`` is less than 1, so ``fn`` was never called.
    """
    for i in range(tries):
        try:
            return fn(*args, **kwargs)
        except Exception:
            if i >= tries - 1:
                # Out of attempts: re-raise while the exception is still
                # active, so the caller sees the real error and traceback.
                raise
            time.sleep(min(base*(2**i) + random.random()*jitter, max_wait))
    raise RuntimeError("safe_call: tries must be at least 1")

def _extract_key(url_or_key:str):
    m = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    return m.group(1) if m else url_or_key

sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Clear the log columns and load the data ----
values_all = safe_call(ws.get_all_values)
if not values_all: raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2: safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# Append any required column that the sheet's header is missing.
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
for col in NEEDED:
    if col not in header: header.append(col)
# Keyword arguments: the positional order of Worksheet.update() differs
# between gspread releases, the keyword names do not.
safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}   # 1-based column numbers

# The rows were read before the header was extended, so pad them to the
# header width; otherwise the DataFrame constructor rejects the column count.
df = pd.DataFrame(
    [vals + [""] * (len(header) - len(vals)) for vals in values_all[1:]],
    columns=header,
)
df["_rownum"] = df.index + 2   # sheet row number (row 1 is the header)

# ----[4] Image processing utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=600, grayscale=True):
    """Rasterize a PyMuPDF ``page`` at ``dpi`` (zoom ``dpi / 72``) and return a PIL image.

    Grayscale renders give an "L" (8-bit) image, colour renders an "RGB" one.
    """
    scale = dpi / 72.0
    if grayscale:
        colorspace, pil_mode = fitz.csGRAY, "L"
    else:
        colorspace, pil_mode = fitz.csRGB, "RGB"
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def parse_pages_field(pages_field):
    """Parse a page spec such as ``"1, 3-5"`` into a sorted list of unique ints.

    Tokens are comma-separated; ``a-b`` is an inclusive range given in either
    order. Tokens that are not a digit string or an integer range are skipped.
    """
    text = str(pages_field).strip()
    if not text:
        return []
    pages = set()
    for tok in re.split(r"\s*,\s*", text):
        if not tok:
            continue
        try:
            if "-" in tok:
                first, last = (int(part) for part in tok.split("-", 1))
                pages.update(range(min(first, last), max(first, last) + 1))
            elif tok.isdigit():
                # isdigit() also accepts characters such as "²" that int()
                # rejects; the ValueError handler below covers that case.
                pages.add(int(tok))
        except ValueError:
            continue
    return sorted(pages)

def as_float(x, default=0.0):
    """Convert ``x`` to float via its stripped string form; return ``default`` when it does not parse."""
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        # Narrow on purpose: a bare except would also swallow KeyboardInterrupt.
        return default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h into a pixel box ``(x1, y1, x2, y2)`` clamped to ``W`` x ``H``.

    With unit "px" the values are used as pixels; otherwise (default "frac")
    they are multiplied by the image size.

    Raises:
        ValueError: when the clamped box has no area (for example an empty
            w/h cell), so the caller logs a clear reason instead of failing
            later while cropping or saving.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    x, y = as_float(row_item.get("x", 0)), as_float(row_item.get("y", 0))
    w, h = as_float(row_item.get("w", 0)), as_float(row_item.get("h", 0))
    if unit == "px":
        x1, y1 = int(round(x)), int(round(y))
        x2, y2 = int(round(x+w)), int(round(y+h))
    else:
        x1, y1 = int(round(x*W)), int(round(y*H))
        x2, y2 = int(round((x+w)*W)), int(round((y+h)*H))
    # Keep the box inside the image.
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(W, x2), min(H, y2)
    if x2 <= x1 or y2 <= y1:
        raise ValueError(f"잘못된 박스: ({x1},{y1})~({x2},{y2})")
    return (x1, y1, x2, y2)

def apply_inner_trim(img, row_item):
    """Shave the row's inner_left/right/top/bottom fractions off ``img``.

    Each fraction is scaled by the image size and truncated to whole pixels;
    the image is returned untouched when the four pixel amounts sum to zero.
    """
    width, height = img.size

    def _pixels(key, extent):
        return int(as_float(row_item.get(key, 0)) * extent)

    cut_left = _pixels("inner_left", width)
    cut_right = _pixels("inner_right", width)
    cut_top = _pixels("inner_top", height)
    cut_bottom = _pixels("inner_bottom", height)
    if cut_left + cut_right + cut_top + cut_bottom != 0:
        return img.crop((cut_left, cut_top, width - cut_right, height - cut_bottom))
    return img

def save_with_tmp_and_verify(crop, out_path):
    """Save ``crop`` as an optimized PNG under TMP_DIR, then move it to ``out_path``.

    Despite the name, this variant does not verify the saved file.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    crop.save(staging_path, format="PNG", optimize=True, compress_level=9)
    shutil.move(staging_path, out_path)

# ----[5] Run loop ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR,"*.pdf")))
if not pdf_paths:
    # Fail fast instead of marking every row "error" with no explanation.
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

for _, row in tqdm(df.iterrows(), total=len(df), desc="3단 해설 추출 중"):
    r = row.to_dict(); rownum = int(r["_rownum"])
    if SKIP_DONE and (r.get("status","").lower()=="done"): continue

    pages_list = parse_pages_field(r.get("pages",""))
    if not pages_list: continue

    outputs, errors = [], []
    expected_total = 0   # crops this row should produce across all PDFs

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        doc = None
        try:
            doc = fitz.open(pdf_path)
            # Requested pages that this PDF does not have are ignored.
            target_pages = [p for p in pages_list if 1 <= p <= len(doc)]
            expected_total += len(target_pages)

            for p in target_pages:
                try:
                    # Render the full page, crop the row's box, then apply the inner trim.
                    img = render_page_to_image(doc[p-1], DPI, GRAYSCALE_RENDER)
                    W, H = img.size
                    x1, y1, x2, y2 = rect_from_row(r, W, H)
                    crop = img.crop((x1, y1, x2, y2))
                    crop = apply_inner_trim(crop, r)

                    out_name = f"{pdf_stem}_p{p:03d}_{r['box_id']}.png"
                    out_path = os.path.join(OUT_DIR, out_name)
                    save_with_tmp_and_verify(crop, out_path)
                    outputs.append(out_name)
                except Exception as e: errors.append(f"p{p}_err:{e}")
        except Exception as e: errors.append(f"pdf_err:{e}")
        finally:
            # Close the document even when opening/reading it raised midway.
            if doc is not None: doc.close()

    # Write the result back. "done" only when every expected crop was
    # written; a row with some failures is "partial" so it is not skipped by
    # SKIP_DONE on the next run.
    if outputs and len(outputs) == expected_total:
        st = "done"
    elif outputs:
        st = "partial"
    else:
        st = "error"
    patch = {"status": st, "last_error": "; ".join(errors)[:500], "processed_at": KST_NOW()}
    row_vals = [patch.get(h, r.get(h,"")) for h in header]
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases, the keyword names do not.
    safe_call(ws.update, range_name=f"A{rownum}:{rowcol_to_a1(rownum, len(header))}", values=[row_vals])

print("✅ 3단 해설 처리가 완료되었습니다.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  try: return fn(*args, **kwargs)
  try: return fn(*args, **kwargs)
3단 해설 추출 중: 100%|██████████| 36/36 [00:49<00:00,  1.38s/it]

✅ 3단 해설 처리가 완료되었습니다.





In [None]:
# @title 해설지 3단
# === Colab: 3단 PDF 대응 / 페이지별 상하좌우 여백 트림(가변) / Sheet('강대X_해설_3단') 참조 크롭 ===
#            + 전체 PDF 일괄 처리 / 로그(M~R) 초기화 / gspread 보수적 백오프
#            + 그레이스케일 렌더링(기본) + 옵션 1비트 흑백 ===

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Google Drive 마운트 ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
INPUT_DIR = "/content/drive/MyDrive/PBMAI/input"            # every *.pdf in this folder is processed
OUT_DIR   = "/content/drive/MyDrive/PBMAI/output/crops"     # folder where the results are saved
os.makedirs(OUT_DIR, exist_ok=True)

SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1WyScwpGxWpKnpzrOjzyTC5qq8U3NeU3MzBa_I23ggz0/edit?"
WORKSHEET_NAME = "강대X_해설_3단"   # worksheet name, as requested

DPI                = 300
RETRY_TIMES        = 2          # image save retries (inside one row)
OVERWRITE          = False      # keep existing files
SKIP_DONE          = True       # skip rows with status=='done'
SKIP_OUT_OF_RANGE_PAGES = True  # ignore pages outside the PDF's range (not an error)
OUTPUTS_VERBOSE    = False      # write a summary to "outputs" instead of the paths

# ---- Image options ----
GRAYSCALE_RENDER = True   # render pages in grayscale
FORCE_BW        = False   # True: convert to 1-bit black/white right after cropping
BW_DITHER       = "NONE"  # "NONE" | "FS" (Floyd-Steinberg)
BW_THRESHOLD    = None    # threshold 0~255; None means the dither method is used

# Page pre-trim (margin removal)
# Requested: left/right 6%, bottom 8%, top 13% on page 1 and 10% on page 2 onward
PAGE_TRIM_MODE = "perpage_fraction"  # "none" | "fraction" (fixed) | "auto" | "perpage_fraction"
AUTO_TRIM_TOL = 245
AUTO_TRIM_PAD = 0
AUTO_TRIM_MAX_SHAVE = 0.20

# ----[2] Authentication ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Read/write scope for Sheets, read-only scope for Drive.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.readonly",
]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid:
    creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] Safe gspread calls (exponential backoff + jitter), conservative defaults ----
BACKOFF_TRIES  = 10    # maximum number of attempts
BACKOFF_BASE   = 2.0   # base delay in seconds, doubled on every retry
BACKOFF_JITTER = 1.0   # upper bound of the random extra delay, in seconds
BACKOFF_MAX    = 90.0  # cap on a single delay, in seconds

def safe_call(fn, *args, tries=BACKOFF_TRIES, base=BACKOFF_BASE,
              jitter=BACKOFF_JITTER, max_wait=BACKOFF_MAX, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying failures with exponential backoff.

    An ``APIError`` is retried on every attempt except the last when it looks
    transient (status 429/500/503, or "quota"/"rate" in its text). Any other
    exception is retried on at most the first three attempts. When no retry
    applies, the exception propagates unchanged.
    """
    for attempt in range(tries):
        try:
            return fn(*args, **kwargs)
        except APIError as err:
            status = getattr(getattr(err, "response", None), "status_code", None)
            text = (str(getattr(err, "response", "")) + " " + str(err)).lower()
            is_transient = (status in (429, 500, 503)) or ("quota" in text) or ("rate" in text)
            if not (is_transient and attempt < tries - 1):
                raise
        except Exception:
            if attempt >= min(3, tries - 1):
                raise
        # Only reached after a failure that is going to be retried.
        pause = min(base * (2 ** attempt) + random.random() * jitter, max_wait)
        time.sleep(pause)

def _extract_key(url_or_key:str):
    m = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    return m.group(1) if m else url_or_key

# Worksheet handle (every call goes through safe_call)
sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] At start-up: clear the log columns (M~R) ----
values_all = safe_call(ws.get_all_values)  # [[header...], [row2...], ...]
if not values_all:
    raise ValueError("시트가 비어 있습니다. 먼저 헤더를 만들어 주세요.")
last_row = len(values_all)
if last_row >= 2:
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# ----[4] Header fix-up (append whatever is missing) ----
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right","filename",
          "status","attempts","last_error","outputs","processed_at"]
base_header = header[:]
for col in NEEDED:
    if col not in header: header.append(col)
if header != base_header:
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases, the keyword names do not.
    safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}   # 1-based column numbers

# ----[5] Load the data (re-read to pick up the latest state) ----
records = safe_call(ws.get_all_values)
if len(records) < 2:
    # An empty row list yields an empty DataFrame that still has the header
    # columns; a placeholder row would not match the column count.
    print("데이터 행이 없습니다.")
df = pd.DataFrame(records[1:], columns=header)
df["_rownum"] = df.index + 2   # actual sheet row number

# ----[6] Utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=300, grayscale=GRAYSCALE_RENDER):
    """Rasterize a PyMuPDF ``page`` at ``dpi`` and return it as a PIL image.

    The zoom factor is ``dpi / 72``. The PIL mode is picked from the pixmap's
    component count (1 -> "L", 4 -> "RGBA", anything else -> "RGB").
    """
    scale = dpi / 72.0
    if grayscale:
        colorspace = fitz.csGRAY   # grayscale render
    else:
        colorspace = fitz.csRGB
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    if pixmap.n == 1:
        pil_mode = "L"
    elif pixmap.n == 4:
        pil_mode = "RGBA"
    else:
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def trim_by_fraction(img, margins):
    """Crop ``img`` by the fractional "left"/"right"/"top"/"bottom" margins.

    Missing keys count as 0. The resulting box is clamped to the image; if it
    ends up with no area, the image is returned unchanged.
    """
    width, height = img.size

    def _shave(side, extent):
        return int(round(float(margins.get(side, 0)) * extent))

    def _clamp(value, upper):
        return max(0, min(upper, value))

    left = _clamp(_shave("left", width), width)
    right = _clamp(width - _shave("right", width), width)
    top = _clamp(_shave("top", height), height)
    bottom = _clamp(height - _shave("bottom", height), height)
    if right <= left or bottom <= top:
        return img
    return img.crop((left, top, right, bottom))

def auto_trim_whitespace(img, tol=245, pad=0, max_shave_pct=0.2):
    """Crop ``img`` to the bounding box of its pixels that are not brighter than ``tol``.

    No side is cut by more than ``max_shave_pct`` of the image size, and the
    box is grown by ``pad`` pixels afterwards. The image is returned unchanged
    when no such pixel exists or the box has no area.
    """
    # Pixels brighter than tol become 0, everything else 255, so getbbox()
    # returns the bounds of the darker pixels.
    mask = img.convert("L").point(lambda p: 0 if p > tol else 255, mode="1")
    bounds = mask.getbbox()
    if not bounds:
        return img
    left, top, right, bottom = bounds
    width, height = img.size
    limit_x = int(width * max_shave_pct)
    limit_y = int(height * max_shave_pct)
    # Cap how far each side may be cut.
    left = max(0, min(left, limit_x))
    top = max(0, min(top, limit_y))
    right = min(width, max(right, width - limit_x))
    bottom = min(height, max(bottom, height - limit_y))
    # Add the padding back, staying inside the image.
    left = max(0, left - pad)
    top = max(0, top - pad)
    right = min(width, right + pad)
    bottom = min(height, bottom + pad)
    if right > left and bottom > top:
        return img.crop((left, top, right, bottom))
    return img

def page_margins_for(page_idx:int, n_pages:int):
    """Return the fractional pre-trim margins for a page.

    Rules as requested:
      - left/right: 6% on every page
      - bottom: 8% on every page
      - top: 13% on the first page, 10% on all others
    ``page_idx`` is 1-based; ``n_pages`` is accepted but not used.
    """
    is_first_page = page_idx == 1
    return {
        "top": 0.13 if is_first_page else 0.10,
        "bottom": 0.08,
        "left": 0.06,
        "right": 0.06,
    }

def parse_pages_field(pages_field):
    """Parse a page spec such as ``"1, 3-5"`` into a sorted list of unique ints.

    Tokens are comma-separated; ``a-b`` is an inclusive range given in either
    order. Tokens that are not an integer or an integer range are skipped
    instead of raising, so one malformed cell cannot abort the whole run; a
    field with no usable token yields ``[]``.
    """
    text = str(pages_field).strip()
    if not text:
        return []
    pages = set()
    for tok in re.split(r"\s*,\s*", text):
        if not tok:
            continue
        try:
            if "-" in tok:
                first, last = (int(part) for part in tok.split("-", 1))
                pages.update(range(min(first, last), max(first, last) + 1))
            else:
                pages.add(int(tok))
        except ValueError:
            # Not a number / range (e.g. "abc", "7-x"): ignore this token.
            continue
    return sorted(pages)

def as_float(x, default=0.0):
    """Convert ``x`` to float via its stripped string form; return ``default`` when it does not parse."""
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        # Narrow on purpose: a bare except would also swallow KeyboardInterrupt.
        return default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h into a pixel box ``(x1, y1, x2, y2)`` clamped to ``W`` x ``H``.

    With unit "px" the values are used as pixels; otherwise (default "frac")
    they are multiplied by the image size. Raises ValueError when the clamped
    box has no area.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    left, top, width, height = (as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h"))
    if unit == "px":
        corners = (left, top, left + width, top + height)
    else:
        corners = (left * W, top * H, (left + width) * W, (top + height) * H)
    x1, y1, x2, y2 = (int(round(value)) for value in corners)
    # Keep the box inside the image.
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(W, x2)
    y2 = min(H, y2)
    if x2 <= x1 or y2 <= y1:
        raise ValueError(f"잘못된 박스 사각형: ({x1},{y1})~({x2},{y2})")
    return (x1, y1, x2, y2)

def apply_inner_trim(img, row_item):
    """Shave the row's inner_top/bottom/left/right fractions off ``img``.

    Each fraction is clamped to [0, 1]; when all four are zero the image is
    returned untouched, otherwise the crop is delegated to trim_by_fraction.
    """
    fractions = {}
    for side, column in (("top", "inner_top"), ("bottom", "inner_bottom"),
                         ("left", "inner_left"), ("right", "inner_right")):
        raw = as_float(row_item.get(column, 0))
        fractions[side] = max(0.0, min(1.0, raw))
    if sum(fractions.values()) == 0:
        return img
    return trim_by_fraction(img, fractions)

def maybe_binarize(img):
    """Return ``img`` as a 1-bit image when FORCE_BW is set, otherwise unchanged.

    With BW_THRESHOLD set, the image is auto-contrasted and thresholded;
    otherwise it is converted with the dithering selected by BW_DITHER.
    """
    if FORCE_BW:
        gray = img.convert("L")
        if BW_THRESHOLD is None:
            if BW_DITHER.upper() == "FS":
                dither_mode = Image.FLOYDSTEINBERG
            else:
                dither_mode = Image.NONE
            return gray.convert("1", dither=dither_mode)

        def _to_bw(p):
            return 255 if p >= BW_THRESHOLD else 0

        stretched = ImageOps.autocontrast(gray)
        return stretched.point(_to_bw, mode="1")
    return img

def build_output_filename(base_name, pdf_stem, page_idx, box_id, multi_page=False):
    """Build the PNG file name for one crop.

    With ``base_name``: ``<pdf_stem>_<base_name>.png``, plus a ``_pNNN`` page
    suffix when ``multi_page``. Without it: ``<pdf_stem>_pageNNN_box<id>.png``.
    """
    if not base_name:
        box_no = int(as_float(box_id, 0)) or 0
        return f"{pdf_stem}_page{page_idx:03d}_box{box_no}.png"
    name = base_name.strip()
    if not name.lower().endswith(".png"):
        name = f"{name}.png"
    if multi_page:
        stem, ext = os.path.splitext(name)
        name = f"{stem}_p{page_idx:03d}{ext}"
    return f"{pdf_stem}_{name}"

def verify_image(path):
    """Check that ``path`` is a readable image; return ``(ok, reason)`` with an empty reason on success."""
    if os.path.exists(path):
        if os.path.getsize(path) == 0:
            return (False, "file_size_zero")
        try:
            with Image.open(path) as handle:
                handle.verify()
        except Exception as err:
            return (False, f"PIL_verify_error: {err}")
        return (True, "")
    return (False, "file_not_found")

def save_with_tmp_and_verify(crop, out_path):
    """Save ``crop`` as PNG under TMP_DIR, verify it, then move it to ``out_path``.

    Raises IOError when the temporary file fails verification.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    # Optional 1-bit conversion happens right before saving.
    final_img = maybe_binarize(crop)
    final_img.save(staging_path, "PNG", optimize=True, compress_level=9)  # size-optimized
    verified, reason = verify_image(staging_path)
    if not verified:
        raise IOError(f"save_verify_failed:{reason}")
    shutil.move(staging_path, out_path)

def trimmed_page_image(doc, page_idx):
    """Render page ``page_idx`` (1-based) of ``doc`` and apply the margin trim chosen by PAGE_TRIM_MODE.

    "perpage_fraction" uses page_margins_for, "fraction" a fixed 10% on every
    side, "auto" the whitespace auto-trim; any other mode returns the page
    untrimmed.
    """
    rendered = render_page_to_image(doc[page_idx - 1], DPI, grayscale=GRAYSCALE_RENDER)

    if PAGE_TRIM_MODE == "perpage_fraction":
        return trim_by_fraction(rendered, page_margins_for(page_idx, len(doc)))
    if PAGE_TRIM_MODE == "fraction":
        # Kept for a fixed-margin setup (currently unused).
        fixed = dict(top=0.10, bottom=0.10, left=0.10, right=0.10)
        return trim_by_fraction(rendered, fixed)
    if PAGE_TRIM_MODE == "auto":
        return auto_trim_whitespace(rendered, tol=AUTO_TRIM_TOL, pad=AUTO_TRIM_PAD,
                                    max_shave_pct=AUTO_TRIM_MAX_SHAVE)
    # "none": keep the original render.
    return rendered

# ----[7] Collect the PDFs in the input folder ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "*.pdf")))
if not pdf_paths:
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
print(f"Found {len(pdf_paths)} PDFs")

# ----[8] Run (row by row): every sheet row is applied to every PDF ----
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    r = row.to_dict()
    rownum = int(r["_rownum"])

    # Bump the attempt counter (done even when the row is skipped below).
    attempts_val = int(as_float(r.get("attempts", 0), 0)) + 1
    safe_call(ws.update_cell, rownum, col_idx["attempts"], str(attempts_val))

    if SKIP_DONE and (r.get("status","").lower() == "done"):
        continue

    pages_list = parse_pages_field(r.get("pages",""))
    if not pages_list:
        # Fall back to a single-page "page" column when the sheet has one.
        if "page" in header and str(r.get("page","")).strip():
            pages_list = [int(as_float(r.get("page"), 0))]
        else:
            safe_call(ws.update_cell, rownum, col_idx["status"], "error")
            safe_call(ws.update_cell, rownum, col_idx["last_error"], "no_pages_specified")
            continue

    multi_page = len(pages_list) > 1
    box_id = r.get("box_id","")
    base_name = (r.get("filename","") or "").strip()

    outputs = []
    errors  = []
    expected_total = 0

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"open_fail:{pdf_stem}:{e}")
            continue

        try:
            n_pages = len(doc)

            # Pages that will actually be processed for this PDF
            if SKIP_OUT_OF_RANGE_PAGES:
                target_pages = [p for p in pages_list if 1 <= p <= n_pages]
                if not target_pages:
                    continue
            else:
                target_pages = pages_list
            expected_total += len(target_pages)

            for attempt in range(RETRY_TIMES):
                # Collected per attempt: a retry walks the pages again, and
                # appending straight to `outputs` would count pages twice and
                # could make a failed row look complete.
                saved = []
                try:
                    for p in target_pages:
                        if not SKIP_OUT_OF_RANGE_PAGES and (p < 1 or p > n_pages):
                            raise IndexError(f"page_out_of_range:{pdf_stem}:{p}/{n_pages}")

                        outname = build_output_filename(base_name, pdf_stem, p, box_id, multi_page=multi_page)
                        out_path = os.path.join(OUT_DIR, outname)

                        if os.path.exists(out_path) and not OVERWRITE:
                            # Reuse an existing file only if it still verifies.
                            ok, _ = verify_image(out_path)
                            if ok:
                                saved.append(out_path)
                                continue

                        # Render the page and trim its margins per PAGE_TRIM_MODE
                        img = trimmed_page_image(doc, p)
                        W, H = img.size

                        # Crop the box given in the sheet, then apply the inner trim
                        x1,y1,x2,y2 = rect_from_row(r, W, H)
                        crop = img.crop((x1,y1,x2,y2))
                        crop = apply_inner_trim(crop, r)

                        save_with_tmp_and_verify(crop, out_path)
                        saved.append(out_path)
                    outputs.extend(saved)
                    break  # this PDF succeeded
                except Exception as e:
                    if attempt < RETRY_TIMES - 1:
                        time.sleep(0.1); continue
                    # Last attempt failed: keep the pages that did succeed.
                    outputs.extend(saved)
                    errors.append(f"{pdf_stem}:{e}")
        finally:
            # Close the document on every path, including unexpected errors.
            doc.close()

    # ---- Batch update of the row status ----
    ok_cnt = len(outputs)
    st = "done" if (expected_total > 0 and ok_cnt == expected_total) \
         else ("partial" if ok_cnt > 0 else "error")

    if OUTPUTS_VERBOSE:
        outputs_cell = " | ".join(outputs)
    else:
        if ok_cnt == 0:
            outputs_cell = ""
        else:
            sample = os.path.basename(outputs[0])
            outputs_cell = f"{ok_cnt}/{expected_total} files (e.g., {sample})"

    patch = {
        "status": st,
        "last_error": "" if st=="done" else "; ".join(errors)[:500],
        "outputs": outputs_cell,
        "processed_at": KST_NOW(),
        "attempts": str(attempts_val),
    }
    row_vals = [patch.get(h, r.get(h, "")) for h in header]
    a1 = f"{rowcol_to_a1(rownum, 1)}:{rowcol_to_a1(rownum, len(header))}"
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases, the keyword names do not.
    safe_call(ws.update, range_name=a1, values=[row_vals])

    time.sleep(0.1)  # ease off the rate limit

print("All rows processed across all PDFs.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 1 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 24/24 [00:20<00:00,  1.16it/s]

All rows processed across all PDFs.





# 평가원

In [None]:
# @title 평가원 46문항
# 평가원_문제_46문항 (2025년 평가원 문제 크롭 규격, 총 46문항)

# === Colab: 전체 PDF 일괄 처리 / 로그(M~R) 초기화 / gspread 전구간 보수적 백오프
#            + 그레이스케일 렌더링(기본) + 옵션 1비트 흑백 ===
!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Google Drive 마운트 ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
INPUT_DIR = "/content/drive/MyDrive/PBMAI/input"            # every *.pdf in this folder is processed
OUT_DIR   = "/content/drive/MyDrive/PBMAI/output/crops"     # folder where the results are saved
os.makedirs(OUT_DIR, exist_ok=True)

SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1WyScwpGxWpKnpzrOjzyTC5qq8U3NeU3MzBa_I23ggz0/edit?"
WORKSHEET_NAME = "평가원_문제_46"

DPI                = 300
RETRY_TIMES        = 2          # image save retries (inside one row)
OVERWRITE          = False      # keep existing files
SKIP_DONE          = True       # skip rows with status=='done'
SKIP_OUT_OF_RANGE_PAGES = True  # ignore pages outside the PDF's range (not an error)
OUTPUTS_VERBOSE    = False      # write a summary to "outputs" instead of the paths

# ---- Image options ----
GRAYSCALE_RENDER = True   # render pages in grayscale
FORCE_BW        = False   # True: convert to 1-bit black/white right after cropping
BW_DITHER       = "NONE"  # "NONE" | "FS" (Floyd-Steinberg)
BW_THRESHOLD    = None    # threshold 0~255; None means the dither method is used

# Page pre-trim (margin removal)
PAGE_TRIM_MODE = "fraction"  # "none" | "fraction" | "auto"
PAGE_TRIM = dict(top=0.13, bottom=0.10, left=0.10, right=0.10)   # fractional margins per side
AUTO_TRIM_TOL = 245
AUTO_TRIM_PAD = 0
AUTO_TRIM_MAX_SHAVE = 0.20

# ----[2] 인증 ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.readonly",
]
creds, _ = google.auth.default(scopes=SCOPES)
if not creds.valid:
    creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] Safe gspread call (exponential backoff + jitter), conservative settings ----
BACKOFF_TRIES  = 10
BACKOFF_BASE   = 2.0
BACKOFF_JITTER = 1.0
BACKOFF_MAX    = 90.0

def safe_call(fn, *args, tries=BACKOFF_TRIES, base=BACKOFF_BASE,
              jitter=BACKOFF_JITTER, max_wait=BACKOFF_MAX, **kwargs):
    """Call ``fn(*args, **kwargs)`` and retry failures with exponential backoff.

    An ``APIError`` is retried while attempts remain when its status code is
    429/500/503 or its text contains "quota" or "rate"; any other ``APIError``
    is re-raised at once. Every other ``Exception`` is retried at most
    ``min(3, tries - 1)`` times. The pause before retry ``i`` is
    ``base * 2**i`` plus a random jitter in ``[0, jitter)``, capped at
    ``max_wait`` seconds.

    Returns whatever ``fn`` returns (``None`` if ``tries`` is not positive,
    because ``fn`` is then never called).
    """
    def _pause(attempt_no):
        wait = base * (2 ** attempt_no) + random.random() * jitter
        time.sleep(min(wait, max_wait))

    last_attempt = tries - 1
    generic_retry_limit = min(3, last_attempt)

    for attempt_no in range(tries):
        try:
            return fn(*args, **kwargs)
        except APIError as err:
            status = getattr(getattr(err, "response", None), "status_code", None)
            text = (str(getattr(err, "response", "")) + " " + str(err)).lower()
            retryable = status in (429, 500, 503) or "quota" in text or "rate" in text
            if not retryable or attempt_no >= last_attempt:
                raise
            _pause(attempt_no)
        except Exception:
            if attempt_no >= generic_retry_limit:
                raise
            _pause(attempt_no)

def _extract_key(url_or_key: str):
    """Return the spreadsheet key found in a Google Sheets URL.

    When the text has no ``/spreadsheets/d/<key>`` segment it is returned
    unchanged, i.e. it is treated as already being a bare key.
    """
    found = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if found is None:
        return url_or_key
    return found.group(1)

# Worksheet handles (every gspread call goes through safe_call)
sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] On start: clear the log columns (M~R) ----
# NOTE(review): the range is addressed by fixed column letters, so this
# assumes the log columns sit at M..R in the worksheet — confirm.
values_all = safe_call(ws.get_all_values)  # [[header...], [row2...], ...]
if not values_all:
    raise ValueError("시트가 비어 있습니다. 먼저 헤더를 만들어 주세요.")
last_row = len(values_all)
if last_row >= 2:
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# ----[4] Header repair (append any column that is missing) ----
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right","filename",
          "status","attempts","last_error","outputs","processed_at"]
base_header = header[:]
for col in NEEDED:
    if col not in header:
        header.append(col)
if header != base_header:
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases and the old (range, values) order is deprecated.
    safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}

# ----[5] Load the data (re-read so the cleared columns / new header are reflected) ----
records = safe_call(ws.get_all_values)
if len(records) < 2:
    # Header only: records[1:] is already an empty list, which produces an
    # empty DataFrame. Appending an empty row here would make the DataFrame
    # constructor fail because the row width would not match the header.
    print("데이터 행이 없습니다.")
df = pd.DataFrame(records[1:], columns=header)
df["_rownum"] = df.index + 2   # actual sheet row number (row 1 is the header)

# ----[6] Utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=300, grayscale=GRAYSCALE_RENDER):
    """Rasterise a PyMuPDF page and return it as a PIL image.

    The page is scaled by ``dpi / 72`` in both directions and rendered without
    an alpha channel, in the gray colorspace when ``grayscale`` is true and in
    RGB otherwise.
    """
    scale = dpi / 72.0
    colorspace = fitz.csGRAY if grayscale else fitz.csRGB
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    # Map the pixmap's component count to a PIL mode; unknown counts fall back to RGB.
    if pix.n == 1:
        pil_mode = "L"
    elif pix.n == 4:
        pil_mode = "RGBA"
    else:
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pix.width, pix.height], pix.samples)

def trim_by_fraction(img, margins):
    """Cut fractional margins off each side of ``img``.

    ``margins`` maps "left"/"right"/"top"/"bottom" to a fraction of the image
    width or height (missing keys count as 0). The resulting box is clamped to
    the image; if nothing would remain, the image is returned unchanged.
    """
    W, H = img.size

    def _pixels(side, extent):
        return int(round(float(margins.get(side, 0)) * extent))

    def _clamp(value, upper):
        return max(0, min(upper, value))

    cut_left = _pixels("left", W)
    cut_right = _pixels("right", W)
    cut_top = _pixels("top", H)
    cut_bottom = _pixels("bottom", H)

    left, right = _clamp(cut_left, W), _clamp(W - cut_right, W)
    top, bottom = _clamp(cut_top, H), _clamp(H - cut_bottom, H)
    if right <= left or bottom <= top:
        return img
    return img.crop((left, top, right, bottom))

def auto_trim_whitespace(img, tol=245, pad=0, max_shave_pct=0.2):
    """Crop near-white borders from ``img``.

    Pixels brighter than ``tol`` (after grayscale conversion) count as
    background. At most ``max_shave_pct`` of the width/height is removed from
    any edge, the box is then grown by ``pad`` pixels and clamped to the image.
    The image is returned unchanged when no content is found or the box is
    empty.
    """
    content_mask = img.convert("L").point(lambda p: 0 if p > tol else 255, mode="1")
    bbox = content_mask.getbbox()
    if not bbox:
        return img

    left, top, right, bottom = bbox
    W, H = img.size
    limit_x = int(W * max_shave_pct)
    limit_y = int(H * max_shave_pct)

    # Cap the amount shaved from each edge.
    left = max(0, min(left, limit_x))
    top = max(0, min(top, limit_y))
    right = min(W, max(right, W - limit_x))
    bottom = min(H, max(bottom, H - limit_y))

    # Grow by the padding, staying inside the image.
    left = max(0, left - pad)
    top = max(0, top - pad)
    right = min(W, right + pad)
    bottom = min(H, bottom + pad)

    if right > left and bottom > top:
        return img.crop((left, top, right, bottom))
    return img

def parse_pages_field(pages_field):
    """Parse a page specification such as ``"1, 3-5"`` into a sorted list.

    Tokens are comma separated; a token containing "-" is an inclusive range
    (either order). Duplicates are removed. An empty specification yields an
    empty list. Raises ValueError if a token is not an integer.
    """
    text = str(pages_field).strip()
    if not text:
        return []
    pages = set()
    for token in re.split(r"\s*,\s*", text):
        if not token:
            continue
        first, dash, second = token.partition("-")
        if dash:
            a, b = int(first), int(second)
            pages.update(range(min(a, b), max(a, b) + 1))
        else:
            pages.add(int(token))
    return sorted(pages)

def as_float(x, default=0.0):
    """Convert ``x`` to float via its stripped string form.

    Returns ``default`` when the text is not a valid float (for example an
    empty cell or ``None``). Only conversion errors are handled, so
    KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        return default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h cells into a pixel box clamped to a W x H image.

    When the "unit" cell is "px" the values are pixels; any other value
    (default "frac") means fractions of the image width/height. Returns
    ``(x1, y1, x2, y2)``; raises ValueError if the clamped box is empty.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    x, y, w, h = [as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")]

    # Pixel values are used as-is; fractions are scaled by the image size.
    scale_x, scale_y = (1, 1) if unit == "px" else (W, H)
    left, top = int(round(x * scale_x)), int(round(y * scale_y))
    right, bottom = int(round((x + w) * scale_x)), int(round((y + h) * scale_y))

    left, top = max(0, left), max(0, top)
    right, bottom = min(W, right), min(H, bottom)
    if right <= left or bottom <= top:
        raise ValueError(f"잘못된 박스 사각형: ({left},{top})~({right},{bottom})")
    return (left, top, right, bottom)

def apply_inner_trim(img, row_item):
    """Trim the row's inner_top/bottom/left/right fractions off a crop.

    Each fraction is read with as_float and clamped to [0, 1]. When all four
    are zero the image is returned untouched; otherwise trimming is delegated
    to trim_by_fraction.
    """
    def _fraction(column):
        return max(0.0, min(1.0, as_float(row_item.get(column, 0))))

    margins = {
        "top": _fraction("inner_top"),
        "bottom": _fraction("inner_bottom"),
        "left": _fraction("inner_left"),
        "right": _fraction("inner_right"),
    }
    if sum(margins.values()) == 0:
        return img
    return trim_by_fraction(img, margins)

def maybe_binarize(img):
    """Optionally convert ``img`` to a 1-bit image, driven by FORCE_BW.

    With FORCE_BW off the image is returned unchanged. Otherwise it is turned
    to grayscale and either thresholded at BW_THRESHOLD (after autocontrast)
    or, when BW_THRESHOLD is None, converted with the dither selected by
    BW_DITHER ("FS" = Floyd-Steinberg, anything else = no dither).
    """
    if FORCE_BW:
        gray = img.convert("L")
        if BW_THRESHOLD is None:
            use_fs = BW_DITHER.upper() == "FS"
            return gray.convert("1", dither=Image.FLOYDSTEINBERG if use_fs else Image.NONE)
        stretched = ImageOps.autocontrast(gray)
        return stretched.point(lambda p: 255 if p >= BW_THRESHOLD else 0, mode="1")
    return img

def build_output_filename(base_name, pdf_stem, page_idx, box_id, multi_page=False):
    """Build the PNG file name for one crop.

    With a ``base_name`` the result is ``<pdf_stem>_<base_name>`` (".png" is
    appended if missing, and ``_pNNN`` is inserted before the extension when
    ``multi_page`` is true). Without one, the name is derived from the page
    index and the numeric value of ``box_id`` (0 if it is not a number).
    """
    if not base_name:
        box_no = int(as_float(box_id,0)) or 0
        return f"{pdf_stem}_page{page_idx:03d}_box{box_no}.png"

    name = base_name.strip()
    if not name.lower().endswith(".png"):
        name = f"{name}.png"
    if multi_page:
        stem, ext = os.path.splitext(name)
        name = f"{stem}_p{page_idx:03d}{ext}"
    return f"{pdf_stem}_{name}"

def verify_image(path):
    """Check that ``path`` is an existing, non-empty file that PIL can verify.

    Returns ``(True, "")`` on success, otherwise ``(False, reason)``.
    """
    if not os.path.exists(path):
        return False, "file_not_found"
    if os.path.getsize(path) == 0:
        return False, "file_size_zero"
    try:
        with Image.open(path) as handle:
            handle.verify()
    except Exception as e:
        return False, f"PIL_verify_error: {e}"
    return True, ""

def save_with_tmp_and_verify(crop, out_path):
    """Save ``crop`` as PNG via a temp file and move it to ``out_path``.

    The image is first passed through maybe_binarize, written under TMP_DIR,
    checked with verify_image and only then moved into place. Raises IOError
    ("save_verify_failed:<reason>") when verification fails. On any failure
    the temp file is removed so that TMP_DIR does not accumulate broken files.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    tmp_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    # Optional 1-bit conversion right before saving.
    crop = maybe_binarize(crop)
    try:
        crop.save(tmp_path, "PNG", optimize=True, compress_level=9)  # optimise file size
        ok, why = verify_image(tmp_path)
        if not ok:
            raise IOError(f"save_verify_failed:{why}")
        shutil.move(tmp_path, out_path)
    except Exception:
        # Best-effort cleanup; the original error is what matters to the caller.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def trimmed_page_image(doc, page_idx):
    """Render page ``page_idx`` (1-based) of ``doc`` at DPI and return it.

    Despite the name, no pre-trimming is applied in this cell: the rendered
    page is returned as-is and PAGE_TRIM_MODE is not consulted.
    """
    return render_page_to_image(doc[page_idx - 1], DPI, grayscale=GRAYSCALE_RENDER)


# ----[7] Collect the PDFs in the input folder ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "*.pdf")))
if not pdf_paths:
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
print(f"Found {len(pdf_paths)} PDFs")

# ----[8] Run (one worksheet row at a time) ----
# Each row describes one crop box. The box is applied to the listed pages of
# every PDF found above, then the row's log columns are written back.
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    r = row.to_dict()
    rownum = int(r["_rownum"])

    # Increment the attempt counter (written immediately, before any skip).
    attempts_val = int(as_float(r.get("attempts", 0), 0)) + 1
    safe_call(ws.update_cell, rownum, col_idx["attempts"], str(attempts_val))

    if SKIP_DONE and (r.get("status","").lower() == "done"):
        continue

    # A malformed "pages" cell must fail only this row, not abort the whole run.
    try:
        pages_list = parse_pages_field(r.get("pages",""))
    except ValueError as e:
        safe_call(ws.update_cell, rownum, col_idx["status"], "error")
        safe_call(ws.update_cell, rownum, col_idx["last_error"], f"bad_pages_field:{e}"[:500])
        continue

    if not pages_list:
        # Fall back to a single-page "page" column when the sheet has one.
        if "page" in header and str(r.get("page","")).strip():
            pages_list = [int(as_float(r.get("page"), 0))]
        else:
            safe_call(ws.update_cell, rownum, col_idx["status"], "error")
            safe_call(ws.update_cell, rownum, col_idx["last_error"], "no_pages_specified")
            continue

    multi_page = len(pages_list) > 1
    box_id = r.get("box_id","")
    base_name = (r.get("filename","") or "").strip()

    outputs = []
    errors  = []
    expected_total = 0

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"open_fail:{pdf_stem}:{e}")
            continue

        try:
            n_pages = len(doc)

            # Pages that will actually be processed for this PDF.
            if SKIP_OUT_OF_RANGE_PAGES:
                target_pages = [p for p in pages_list if 1 <= p <= n_pages]
                if not target_pages:
                    continue
            else:
                target_pages = pages_list
            expected_total += len(target_pages)

            pdf_outputs = []
            for attempt in range(RETRY_TIMES):
                # Start every attempt from scratch so a retry never counts a
                # file twice (which would make ok_cnt exceed expected_total).
                pdf_outputs = []
                try:
                    for p in target_pages:
                        if not SKIP_OUT_OF_RANGE_PAGES and (p < 1 or p > n_pages):
                            raise IndexError(f"page_out_of_range:{pdf_stem}:{p}/{n_pages}")

                        outname = build_output_filename(base_name, pdf_stem, p, box_id, multi_page=multi_page)
                        out_path = os.path.join(OUT_DIR, outname)

                        # Reuse an existing file when it still verifies.
                        if os.path.exists(out_path) and not OVERWRITE:
                            ok, _why = verify_image(out_path)
                            if ok:
                                pdf_outputs.append(out_path)
                                continue

                        img = trimmed_page_image(doc, p)
                        W, H = img.size
                        x1,y1,x2,y2 = rect_from_row(r, W, H)
                        crop = img.crop((x1,y1,x2,y2))
                        crop = apply_inner_trim(crop, r)

                        save_with_tmp_and_verify(crop, out_path)
                        pdf_outputs.append(out_path)
                    break  # this PDF succeeded
                except Exception as e:
                    if attempt < RETRY_TIMES - 1:
                        time.sleep(0.1)
                        continue
                    errors.append(f"{pdf_stem}:{e}")
            # On success this is the full list; after a final failure it holds
            # the files completed before the error (reported as "partial").
            outputs.extend(pdf_outputs)
        finally:
            doc.close()

    # ---- Batch-update the row's status ----
    ok_cnt = len(outputs)
    st = "done" if (expected_total > 0 and ok_cnt == expected_total) \
         else ("partial" if ok_cnt > 0 else "error")

    if OUTPUTS_VERBOSE:
        outputs_cell = " | ".join(outputs)
    else:
        if ok_cnt == 0:
            outputs_cell = ""
        else:
            sample = os.path.basename(outputs[0])
            outputs_cell = f"{ok_cnt}/{expected_total} files (e.g., {sample})"

    patch = {
        "status": st,
        "last_error": "" if st=="done" else "; ".join(errors)[:500],
        "outputs": outputs_cell,
        "processed_at": KST_NOW(),
        "attempts": str(attempts_val),
    }
    # Write the whole row back: log columns from `patch`, the rest as read.
    row_vals = [patch.get(h, r.get(h, "")) for h in header]
    a1 = f"{rowcol_to_a1(rownum, 1)}:{rowcol_to_a1(rownum, len(header))}"
    # Keyword arguments avoid the deprecated positional (range, values) order.
    safe_call(ws.update, range_name=a1, values=[row_vals])

    time.sleep(0.1)  # ease the rate limit

print("All rows processed across all PDFs.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 1 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 46/46 [00:39<00:00,  1.16it/s]

All rows processed across all PDFs.





In [None]:
# @title 평가원 가나형
# 평가원_문제_가나형 (2016~2020년 평가원 문제 크롭 규격, 총 30문항)

# === Colab: 전체 PDF 일괄 처리 / 로그(M~R) 초기화 / gspread 전구간 보수적 백오프
#            + 그레이스케일 렌더링(기본) + 옵션 1비트 흑백 ===
!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
INPUT_DIR = "/content/drive/MyDrive/PBMAI/input"            # every *.pdf in this folder is processed
OUT_DIR   = "/content/drive/MyDrive/PBMAI/output/crops"     # folder the cropped PNGs are written to
os.makedirs(OUT_DIR, exist_ok=True)

SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1WyScwpGxWpKnpzrOjzyTC5qq8U3NeU3MzBa_I23ggz0/edit?"
# NOTE(review): the cell title refers to the 가나형 layout, but the worksheet
# selected here is "평가원_문제_25" — confirm this is the intended sheet.
WORKSHEET_NAME = "평가원_문제_25"

DPI                = 300
RETRY_TIMES        = 2          # number of attempts per PDF inside one row (used as range(RETRY_TIMES))
OVERWRITE          = False      # keep existing output files that still verify
SKIP_DONE          = True       # skip rows whose status == 'done'
SKIP_OUT_OF_RANGE_PAGES = True  # pages outside a PDF's range are ignored (not reported as errors)
OUTPUTS_VERBOSE    = False      # write a short summary to "outputs" instead of every path

# ---- Image options ----
GRAYSCALE_RENDER = True   # render pages in grayscale
FORCE_BW        = False   # if True, convert each crop to 1-bit black/white (applied just before saving)
BW_DITHER       = "NONE"  # "NONE" | "FS" (Floyd-Steinberg)
BW_THRESHOLD    = None    # threshold 0~255; when None the dither method is used instead

# Page pre-trim (margin removal), applied by trimmed_page_image() before cropping
PAGE_TRIM_MODE = "fraction"  # "none" | "fraction" | "auto"
PAGE_TRIM = dict(top=0.13, bottom=0.10, left=0.10, right=0.10)  # fractions used in "fraction" mode
AUTO_TRIM_TOL = 245          # "auto" mode: pixels brighter than this count as background
AUTO_TRIM_PAD = 0            # "auto" mode: pixels added back around the detected content
AUTO_TRIM_MAX_SHAVE = 0.20   # "auto" mode: largest fraction removable from any edge

# ----[2] Authentication ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Read/write access to Sheets, read-only access to Drive.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.readonly",
]
creds, _ = google.auth.default(scopes=SCOPES)
# Refresh the default credentials if they are not currently valid.
if not creds.valid:
    creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] Safe gspread call (exponential backoff + jitter), conservative settings ----
BACKOFF_TRIES  = 10
BACKOFF_BASE   = 2.0
BACKOFF_JITTER = 1.0
BACKOFF_MAX    = 90.0

def safe_call(fn, *args, tries=BACKOFF_TRIES, base=BACKOFF_BASE,
              jitter=BACKOFF_JITTER, max_wait=BACKOFF_MAX, **kwargs):
    """Call ``fn(*args, **kwargs)`` and retry failures with exponential backoff.

    An ``APIError`` is retried while attempts remain when its status code is
    429/500/503 or its text contains "quota" or "rate"; any other ``APIError``
    is re-raised at once. Every other ``Exception`` is retried at most
    ``min(3, tries - 1)`` times. The pause before retry ``i`` is
    ``base * 2**i`` plus a random jitter in ``[0, jitter)``, capped at
    ``max_wait`` seconds.

    Returns whatever ``fn`` returns (``None`` if ``tries`` is not positive,
    because ``fn`` is then never called).
    """
    def _pause(attempt_no):
        wait = base * (2 ** attempt_no) + random.random() * jitter
        time.sleep(min(wait, max_wait))

    last_attempt = tries - 1
    generic_retry_limit = min(3, last_attempt)

    for attempt_no in range(tries):
        try:
            return fn(*args, **kwargs)
        except APIError as err:
            status = getattr(getattr(err, "response", None), "status_code", None)
            text = (str(getattr(err, "response", "")) + " " + str(err)).lower()
            retryable = status in (429, 500, 503) or "quota" in text or "rate" in text
            if not retryable or attempt_no >= last_attempt:
                raise
            _pause(attempt_no)
        except Exception:
            if attempt_no >= generic_retry_limit:
                raise
            _pause(attempt_no)

def _extract_key(url_or_key: str):
    """Return the spreadsheet key found in a Google Sheets URL.

    When the text has no ``/spreadsheets/d/<key>`` segment it is returned
    unchanged, i.e. it is treated as already being a bare key.
    """
    found = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if found is None:
        return url_or_key
    return found.group(1)

# Worksheet handles (every gspread call goes through safe_call)
sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] On start: clear the log columns (M~R) ----
# NOTE(review): the range is addressed by fixed column letters, so this
# assumes the log columns sit at M..R in the worksheet — confirm.
values_all = safe_call(ws.get_all_values)  # [[header...], [row2...], ...]
if not values_all:
    raise ValueError("시트가 비어 있습니다. 먼저 헤더를 만들어 주세요.")
last_row = len(values_all)
if last_row >= 2:
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# ----[4] Header repair (append any column that is missing) ----
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right","filename",
          "status","attempts","last_error","outputs","processed_at"]
base_header = header[:]
for col in NEEDED:
    if col not in header:
        header.append(col)
if header != base_header:
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases and the old (range, values) order is deprecated.
    safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}

# ----[5] Load the data (re-read so the cleared columns / new header are reflected) ----
records = safe_call(ws.get_all_values)
if len(records) < 2:
    # Header only: records[1:] is already an empty list, which produces an
    # empty DataFrame. Appending an empty row here would make the DataFrame
    # constructor fail because the row width would not match the header.
    print("데이터 행이 없습니다.")
df = pd.DataFrame(records[1:], columns=header)
df["_rownum"] = df.index + 2   # actual sheet row number (row 1 is the header)

# ----[6] Utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=300, grayscale=GRAYSCALE_RENDER):
    """Rasterise a PyMuPDF page and return it as a PIL image.

    The page is scaled by ``dpi / 72`` in both directions and rendered without
    an alpha channel, in the gray colorspace when ``grayscale`` is true and in
    RGB otherwise.
    """
    scale = dpi / 72.0
    colorspace = fitz.csGRAY if grayscale else fitz.csRGB
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    # Map the pixmap's component count to a PIL mode; unknown counts fall back to RGB.
    if pix.n == 1:
        pil_mode = "L"
    elif pix.n == 4:
        pil_mode = "RGBA"
    else:
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pix.width, pix.height], pix.samples)

def trim_by_fraction(img, margins):
    """Cut fractional margins off each side of ``img``.

    ``margins`` maps "left"/"right"/"top"/"bottom" to a fraction of the image
    width or height (missing keys count as 0). The resulting box is clamped to
    the image; if nothing would remain, the image is returned unchanged.
    """
    W, H = img.size

    def _pixels(side, extent):
        return int(round(float(margins.get(side, 0)) * extent))

    def _clamp(value, upper):
        return max(0, min(upper, value))

    cut_left = _pixels("left", W)
    cut_right = _pixels("right", W)
    cut_top = _pixels("top", H)
    cut_bottom = _pixels("bottom", H)

    left, right = _clamp(cut_left, W), _clamp(W - cut_right, W)
    top, bottom = _clamp(cut_top, H), _clamp(H - cut_bottom, H)
    if right <= left or bottom <= top:
        return img
    return img.crop((left, top, right, bottom))

def auto_trim_whitespace(img, tol=245, pad=0, max_shave_pct=0.2):
    """Crop near-white borders from ``img``.

    Pixels brighter than ``tol`` (after grayscale conversion) count as
    background. At most ``max_shave_pct`` of the width/height is removed from
    any edge, the box is then grown by ``pad`` pixels and clamped to the image.
    The image is returned unchanged when no content is found or the box is
    empty.
    """
    content_mask = img.convert("L").point(lambda p: 0 if p > tol else 255, mode="1")
    bbox = content_mask.getbbox()
    if not bbox:
        return img

    left, top, right, bottom = bbox
    W, H = img.size
    limit_x = int(W * max_shave_pct)
    limit_y = int(H * max_shave_pct)

    # Cap the amount shaved from each edge.
    left = max(0, min(left, limit_x))
    top = max(0, min(top, limit_y))
    right = min(W, max(right, W - limit_x))
    bottom = min(H, max(bottom, H - limit_y))

    # Grow by the padding, staying inside the image.
    left = max(0, left - pad)
    top = max(0, top - pad)
    right = min(W, right + pad)
    bottom = min(H, bottom + pad)

    if right > left and bottom > top:
        return img.crop((left, top, right, bottom))
    return img

def parse_pages_field(pages_field):
    """Parse a page specification such as ``"1, 3-5"`` into a sorted list.

    Tokens are comma separated; a token containing "-" is an inclusive range
    (either order). Duplicates are removed. An empty specification yields an
    empty list. Raises ValueError if a token is not an integer.
    """
    text = str(pages_field).strip()
    if not text:
        return []
    pages = set()
    for token in re.split(r"\s*,\s*", text):
        if not token:
            continue
        first, dash, second = token.partition("-")
        if dash:
            a, b = int(first), int(second)
            pages.update(range(min(a, b), max(a, b) + 1))
        else:
            pages.add(int(token))
    return sorted(pages)

def as_float(x, default=0.0):
    """Convert ``x`` to float via its stripped string form.

    Returns ``default`` when the text is not a valid float (for example an
    empty cell or ``None``). Only conversion errors are handled, so
    KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        return default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h cells into a pixel box clamped to a W x H image.

    When the "unit" cell is "px" the values are pixels; any other value
    (default "frac") means fractions of the image width/height. Returns
    ``(x1, y1, x2, y2)``; raises ValueError if the clamped box is empty.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    x, y, w, h = [as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")]

    # Pixel values are used as-is; fractions are scaled by the image size.
    scale_x, scale_y = (1, 1) if unit == "px" else (W, H)
    left, top = int(round(x * scale_x)), int(round(y * scale_y))
    right, bottom = int(round((x + w) * scale_x)), int(round((y + h) * scale_y))

    left, top = max(0, left), max(0, top)
    right, bottom = min(W, right), min(H, bottom)
    if right <= left or bottom <= top:
        raise ValueError(f"잘못된 박스 사각형: ({left},{top})~({right},{bottom})")
    return (left, top, right, bottom)

def apply_inner_trim(img, row_item):
    """Trim the row's inner_top/bottom/left/right fractions off a crop.

    Each fraction is read with as_float and clamped to [0, 1]. When all four
    are zero the image is returned untouched; otherwise trimming is delegated
    to trim_by_fraction.
    """
    def _fraction(column):
        return max(0.0, min(1.0, as_float(row_item.get(column, 0))))

    margins = {
        "top": _fraction("inner_top"),
        "bottom": _fraction("inner_bottom"),
        "left": _fraction("inner_left"),
        "right": _fraction("inner_right"),
    }
    if sum(margins.values()) == 0:
        return img
    return trim_by_fraction(img, margins)

def maybe_binarize(img):
    """Optionally convert ``img`` to a 1-bit image, driven by FORCE_BW.

    With FORCE_BW off the image is returned unchanged. Otherwise it is turned
    to grayscale and either thresholded at BW_THRESHOLD (after autocontrast)
    or, when BW_THRESHOLD is None, converted with the dither selected by
    BW_DITHER ("FS" = Floyd-Steinberg, anything else = no dither).
    """
    if FORCE_BW:
        gray = img.convert("L")
        if BW_THRESHOLD is None:
            use_fs = BW_DITHER.upper() == "FS"
            return gray.convert("1", dither=Image.FLOYDSTEINBERG if use_fs else Image.NONE)
        stretched = ImageOps.autocontrast(gray)
        return stretched.point(lambda p: 255 if p >= BW_THRESHOLD else 0, mode="1")
    return img

def build_output_filename(base_name, pdf_stem, page_idx, box_id, multi_page=False):
    """Build the PNG file name for one crop.

    With a ``base_name`` the result is ``<pdf_stem>_<base_name>`` (".png" is
    appended if missing, and ``_pNNN`` is inserted before the extension when
    ``multi_page`` is true). Without one, the name is derived from the page
    index and the numeric value of ``box_id`` (0 if it is not a number).
    """
    if not base_name:
        box_no = int(as_float(box_id,0)) or 0
        return f"{pdf_stem}_page{page_idx:03d}_box{box_no}.png"

    name = base_name.strip()
    if not name.lower().endswith(".png"):
        name = f"{name}.png"
    if multi_page:
        stem, ext = os.path.splitext(name)
        name = f"{stem}_p{page_idx:03d}{ext}"
    return f"{pdf_stem}_{name}"

def verify_image(path):
    """Check that ``path`` is an existing, non-empty file that PIL can verify.

    Returns ``(True, "")`` on success, otherwise ``(False, reason)``.
    """
    if not os.path.exists(path):
        return False, "file_not_found"
    if os.path.getsize(path) == 0:
        return False, "file_size_zero"
    try:
        with Image.open(path) as handle:
            handle.verify()
    except Exception as e:
        return False, f"PIL_verify_error: {e}"
    return True, ""

def save_with_tmp_and_verify(crop, out_path):
    """Save ``crop`` as PNG via a temp file and move it to ``out_path``.

    The image is first passed through maybe_binarize, written under TMP_DIR,
    checked with verify_image and only then moved into place. Raises IOError
    ("save_verify_failed:<reason>") when verification fails. On any failure
    the temp file is removed so that TMP_DIR does not accumulate broken files.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    tmp_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    # Optional 1-bit conversion right before saving.
    crop = maybe_binarize(crop)
    try:
        crop.save(tmp_path, "PNG", optimize=True, compress_level=9)  # optimise file size
        ok, why = verify_image(tmp_path)
        if not ok:
            raise IOError(f"save_verify_failed:{why}")
        shutil.move(tmp_path, out_path)
    except Exception:
        # Best-effort cleanup; the original error is what matters to the caller.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

def trimmed_page_image(doc, page_idx):
    """Render page ``page_idx`` (1-based) of ``doc`` and pre-trim its margins.

    PAGE_TRIM_MODE selects the trim: "fraction" cuts the PAGE_TRIM margins,
    "auto" removes near-white borders using the AUTO_TRIM_* settings, and any
    other value returns the rendered page unchanged.
    """
    rendered = render_page_to_image(doc[page_idx-1], DPI, grayscale=GRAYSCALE_RENDER)
    if PAGE_TRIM_MODE == "fraction":
        return trim_by_fraction(rendered, PAGE_TRIM)
    if PAGE_TRIM_MODE == "auto":
        return auto_trim_whitespace(
            rendered,
            tol=AUTO_TRIM_TOL,
            pad=AUTO_TRIM_PAD,
            max_shave_pct=AUTO_TRIM_MAX_SHAVE,
        )
    return rendered

# ----[7] Collect the PDFs in the input folder ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "*.pdf")))
if not pdf_paths:
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
print(f"Found {len(pdf_paths)} PDFs")

# ----[8] Run (one worksheet row at a time) ----
# Each row describes one crop box. The box is applied to the listed pages of
# every PDF found above, then the row's log columns are written back.
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    r = row.to_dict()
    rownum = int(r["_rownum"])

    # Increment the attempt counter (written immediately, before any skip).
    attempts_val = int(as_float(r.get("attempts", 0), 0)) + 1
    safe_call(ws.update_cell, rownum, col_idx["attempts"], str(attempts_val))

    if SKIP_DONE and (r.get("status","").lower() == "done"):
        continue

    # A malformed "pages" cell must fail only this row, not abort the whole run.
    try:
        pages_list = parse_pages_field(r.get("pages",""))
    except ValueError as e:
        safe_call(ws.update_cell, rownum, col_idx["status"], "error")
        safe_call(ws.update_cell, rownum, col_idx["last_error"], f"bad_pages_field:{e}"[:500])
        continue

    if not pages_list:
        # Fall back to a single-page "page" column when the sheet has one.
        if "page" in header and str(r.get("page","")).strip():
            pages_list = [int(as_float(r.get("page"), 0))]
        else:
            safe_call(ws.update_cell, rownum, col_idx["status"], "error")
            safe_call(ws.update_cell, rownum, col_idx["last_error"], "no_pages_specified")
            continue

    multi_page = len(pages_list) > 1
    box_id = r.get("box_id","")
    base_name = (r.get("filename","") or "").strip()

    outputs = []
    errors  = []
    expected_total = 0

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"open_fail:{pdf_stem}:{e}")
            continue

        try:
            n_pages = len(doc)

            # Pages that will actually be processed for this PDF.
            if SKIP_OUT_OF_RANGE_PAGES:
                target_pages = [p for p in pages_list if 1 <= p <= n_pages]
                if not target_pages:
                    continue
            else:
                target_pages = pages_list
            expected_total += len(target_pages)

            pdf_outputs = []
            for attempt in range(RETRY_TIMES):
                # Start every attempt from scratch so a retry never counts a
                # file twice (which would make ok_cnt exceed expected_total).
                pdf_outputs = []
                try:
                    for p in target_pages:
                        if not SKIP_OUT_OF_RANGE_PAGES and (p < 1 or p > n_pages):
                            raise IndexError(f"page_out_of_range:{pdf_stem}:{p}/{n_pages}")

                        outname = build_output_filename(base_name, pdf_stem, p, box_id, multi_page=multi_page)
                        out_path = os.path.join(OUT_DIR, outname)

                        # Reuse an existing file when it still verifies.
                        if os.path.exists(out_path) and not OVERWRITE:
                            ok, _why = verify_image(out_path)
                            if ok:
                                pdf_outputs.append(out_path)
                                continue

                        img = trimmed_page_image(doc, p)
                        W, H = img.size
                        x1,y1,x2,y2 = rect_from_row(r, W, H)
                        crop = img.crop((x1,y1,x2,y2))
                        crop = apply_inner_trim(crop, r)

                        save_with_tmp_and_verify(crop, out_path)
                        pdf_outputs.append(out_path)
                    break  # this PDF succeeded
                except Exception as e:
                    if attempt < RETRY_TIMES - 1:
                        time.sleep(0.1)
                        continue
                    errors.append(f"{pdf_stem}:{e}")
            # On success this is the full list; after a final failure it holds
            # the files completed before the error (reported as "partial").
            outputs.extend(pdf_outputs)
        finally:
            doc.close()

    # ---- Batch-update the row's status ----
    ok_cnt = len(outputs)
    st = "done" if (expected_total > 0 and ok_cnt == expected_total) \
         else ("partial" if ok_cnt > 0 else "error")

    if OUTPUTS_VERBOSE:
        outputs_cell = " | ".join(outputs)
    else:
        if ok_cnt == 0:
            outputs_cell = ""
        else:
            sample = os.path.basename(outputs[0])
            outputs_cell = f"{ok_cnt}/{expected_total} files (e.g., {sample})"

    patch = {
        "status": st,
        "last_error": "" if st=="done" else "; ".join(errors)[:500],
        "outputs": outputs_cell,
        "processed_at": KST_NOW(),
        "attempts": str(attempts_val),
    }
    # Write the whole row back: log columns from `patch`, the rest as read.
    row_vals = [patch.get(h, r.get(h, "")) for h in header]
    a1 = f"{rowcol_to_a1(rownum, 1)}:{rowcol_to_a1(rownum, len(header))}"
    # Keyword arguments avoid the deprecated positional (range, values) order.
    safe_call(ws.update, range_name=a1, values=[row_vals])

    time.sleep(0.1)  # ease the rate limit

print("All rows processed across all PDFs.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 1 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 30/30 [00:22<00:00,  1.32it/s]

All rows processed across all PDFs.





# 2단통 문항


In [None]:
# @title 강대X 38문항 2단통 : 문제 (사전 트림)
# === Colab: PDF 여백 이미 제거된 입력용 / 2단 시험지 좌우 분리 / 워크시트 규격 크롭 ===
#            + 전체 PDF 일괄 처리 / 로그(M~R) 초기화 / gspread 백오프
#            + 그레이스케일 렌더링(기본) + 옵션 1비트 흑백 ===

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Mount Google Drive ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
INPUT_DIR = "/content/drive/MyDrive/PBMAI/input"            # *.pdf files whose margins were already removed
OUT_DIR   = "/content/drive/MyDrive/PBMAI/output/crops"     # folder the cropped PNGs are written to
os.makedirs(OUT_DIR, exist_ok=True)

SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1WyScwpGxWpKnpzrOjzyTC5qq8U3NeU3MzBa_I23ggz0/edit?"
WORKSHEET_NAME = "강대X_문제_2단통"

# NOTE(review): the flags below are presumably consumed by the processing loop
# that follows the helpers in this cell — verify there.
DPI                = 300        # render resolution passed to render_page_to_image
RETRY_TIMES        = 2
OVERWRITE          = False
SKIP_DONE          = True
SKIP_OUT_OF_RANGE_PAGES = True
OUTPUTS_VERBOSE    = False

# ---- Image options ----
GRAYSCALE_RENDER = True     # render pages in grayscale
FORCE_BW        = False     # if True, maybe_binarize converts crops to 1-bit
BW_DITHER       = "NONE"    # "FS" selects Floyd-Steinberg; anything else means no dither
BW_THRESHOLD    = None      # fixed threshold; when None the dither method is used

# Pre-trim settings were removed: pages are rendered as-is in this cell.
PAGE_TRIM_MODE = "none"

# ----[2] Authentication ----
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# Read/write access to Sheets, read-only access to Drive.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.readonly",
]
creds, _ = google.auth.default(scopes=SCOPES)
# Refresh the default credentials if they are not currently valid.
if not creds.valid:
    creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] Safe gspread call (exponential backoff + jitter) ----
BACKOFF_TRIES  = 10
BACKOFF_BASE   = 2.0
BACKOFF_JITTER = 1.0
BACKOFF_MAX    = 90.0

def safe_call(fn, *args, tries=BACKOFF_TRIES, base=BACKOFF_BASE,
              jitter=BACKOFF_JITTER, max_wait=BACKOFF_MAX, **kwargs):
    """Call ``fn(*args, **kwargs)`` and retry failures with exponential backoff.

    An ``APIError`` is retried while attempts remain when its status code is
    429/500/503 or its text contains "quota" or "rate"; any other ``APIError``
    is re-raised at once. Every other ``Exception`` is retried at most
    ``min(3, tries - 1)`` times. The pause before retry ``i`` is
    ``base * 2**i`` plus a random jitter in ``[0, jitter)``, capped at
    ``max_wait`` seconds.

    Returns whatever ``fn`` returns (``None`` if ``tries`` is not positive,
    because ``fn`` is then never called).
    """
    def _pause(attempt_no):
        wait = base * (2 ** attempt_no) + random.random() * jitter
        time.sleep(min(wait, max_wait))

    last_attempt = tries - 1
    generic_retry_limit = min(3, last_attempt)

    for attempt_no in range(tries):
        try:
            return fn(*args, **kwargs)
        except APIError as err:
            status = getattr(getattr(err, "response", None), "status_code", None)
            text = (str(getattr(err, "response", "")) + " " + str(err)).lower()
            retryable = status in (429, 500, 503) or "quota" in text or "rate" in text
            if not retryable or attempt_no >= last_attempt:
                raise
            _pause(attempt_no)
        except Exception:
            if attempt_no >= generic_retry_limit:
                raise
            _pause(attempt_no)

def _extract_key(url_or_key: str):
    """Return the spreadsheet key found in a Google Sheets URL.

    When the text has no ``/spreadsheets/d/<key>`` segment it is returned
    unchanged, i.e. it is treated as already being a bare key.
    """
    found = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    if found is None:
        return url_or_key
    return found.group(1)

# Worksheet handles
sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Reset the log columns ----
# NOTE(review): the range is addressed by fixed column letters, so this
# assumes the log columns sit at M..R in the worksheet — confirm.
values_all = safe_call(ws.get_all_values)
if not values_all:
    raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2:
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# ----[4] Header repair (append any column that is missing) ----
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
header_changed = False
for col in NEEDED:
    if col not in header:
        header.append(col)
        header_changed = True
if header_changed:
    # Only write when columns were added (saves one API call per run).
    # Keyword arguments: the positional order of Worksheet.update() differs
    # between gspread releases and the old (range, values) order is deprecated.
    safe_call(ws.update, range_name='A1', values=[header])
col_idx = {name: header.index(name)+1 for name in header}

# ----[5] Load the data ----
records = safe_call(ws.get_all_values)
if len(records) < 2:
    # Header only: records[1:] is already an empty list, which produces an
    # empty DataFrame. Appending an empty row here would make the DataFrame
    # constructor fail because the row width would not match the header.
    print("데이터 행이 없습니다.")
df = pd.DataFrame(records[1:], columns=header)
df["_rownum"] = df.index + 2   # actual sheet row number (row 1 is the header)

# ----[6] Utilities ----
KST = timezone(timedelta(hours=9))
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=300, grayscale=GRAYSCALE_RENDER):
    """Rasterise a PyMuPDF page and return it as a PIL image.

    The page is scaled by ``dpi / 72`` in both directions and rendered without
    an alpha channel, in the gray colorspace when ``grayscale`` is true and in
    RGB otherwise.
    """
    scale = dpi / 72.0
    colorspace = fitz.csGRAY if grayscale else fitz.csRGB
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    # Map the pixmap's component count to a PIL mode; unknown counts fall back to RGB.
    if pix.n == 1:
        pil_mode = "L"
    elif pix.n == 4:
        pil_mode = "RGBA"
    else:
        pil_mode = "RGB"
    return Image.frombytes(pil_mode, [pix.width, pix.height], pix.samples)

def parse_pages_field(pages_field):
    """Parse a page specification such as ``"1, 3-5"`` into a sorted list.

    Tokens are comma separated; a token containing "-" is an inclusive range
    (either order). Duplicates are removed. An empty specification yields an
    empty list. Raises ValueError if a token is not an integer.
    """
    text = str(pages_field).strip()
    if not text:
        return []
    pages = set()
    for token in re.split(r"\s*,\s*", text):
        if not token:
            continue
        first, dash, second = token.partition("-")
        if dash:
            a, b = int(first), int(second)
            pages.update(range(min(a, b), max(a, b) + 1))
        else:
            pages.add(int(token))
    return sorted(pages)

def as_float(x, default=0.0):
    """Convert ``x`` to float via its stripped string form.

    Returns ``default`` when the text is not a valid float (for example an
    empty cell or ``None``). Only conversion errors are handled, so
    KeyboardInterrupt / SystemExit are no longer swallowed.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        return default

def rect_from_row(row_item, W, H):
    """Turn a row's x/y/w/h cells into a pixel box clamped to a W x H image.

    When the "unit" cell is "px" the values are pixels; any other value
    (default "frac") means fractions of the image width/height. Returns
    ``(x1, y1, x2, y2)``; raises ValueError if the clamped box is empty.
    """
    unit = (row_item.get("unit") or "frac").strip().lower()
    x, y, w, h = [as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")]

    # Pixel values are used as-is; fractions are scaled by the image size.
    scale_x, scale_y = (1, 1) if unit == "px" else (W, H)
    left, top = int(round(x * scale_x)), int(round(y * scale_y))
    right, bottom = int(round((x + w) * scale_x)), int(round((y + h) * scale_y))

    left, top = max(0, left), max(0, top)
    right, bottom = min(W, right), min(H, bottom)
    if right <= left or bottom <= top:
        raise ValueError(f"잘못된 박스: ({left},{top})~({right},{bottom})")
    return (left, top, right, bottom)

def apply_inner_trim(img, row_item):
    """Trim the row's inner_top/bottom/left/right fractions off a crop.

    Each fraction is read with as_float and clamped to [0, 1]. When all four
    are zero the image is returned untouched.

    Raises:
        ValueError: if the margins leave no area (for example left + right
            >= 1). Previously this case was passed to ``img.crop`` with a
            degenerate box, so the failure depended on how Pillow handled it.
    """
    margins = dict(
        top=max(0.0,min(1.0,as_float(row_item.get("inner_top",0)))),
        bottom=max(0.0,min(1.0,as_float(row_item.get("inner_bottom",0)))),
        left=max(0.0,min(1.0,as_float(row_item.get("inner_left",0)))),
        right=max(0.0,min(1.0,as_float(row_item.get("inner_right",0))))
    )
    if sum(margins.values()) == 0:
        return img
    W,H = img.size
    l=int(round(margins["left"]*W)); r=int(round(margins["right"]*W))
    t=int(round(margins["top"]*H)); b=int(round(margins["bottom"]*H))
    box = (l, t, W-r, H-b)
    # Fail early with a clear message instead of producing an empty image.
    if box[2] <= box[0] or box[3] <= box[1]:
        raise ValueError(f"inner_trim_leaves_empty_box:{box} in {W}x{H}")
    return img.crop(box)

def maybe_binarize(img):
    """Return ``img`` converted to 1-bit black/white when ``FORCE_BW`` is set.

    With ``BW_THRESHOLD`` set, the grayscale image is auto-contrasted and cut
    at that threshold. Otherwise PIL's own 1-bit conversion is used, with
    Floyd-Steinberg dithering only when ``BW_DITHER`` is ``"FS"``.
    When ``FORCE_BW`` is falsy the image is returned untouched.
    """
    if not FORCE_BW:
        return img

    gray = img.convert("L")
    if BW_THRESHOLD is None:
        use_fs = BW_DITHER.upper() == "FS"
        dither_mode = Image.FLOYDSTEINBERG if use_fs else Image.NONE
        return gray.convert("1", dither=dither_mode)

    stretched = ImageOps.autocontrast(gray)
    return stretched.point(lambda p: 255 if p >= BW_THRESHOLD else 0, mode="1")

def build_output_filename(base_name, pdf_stem, page_idx, box_id, multi_page=False):
    """Compose the PNG file name for one crop.

    With a ``base_name`` the result is ``<pdf_stem>_<base_name>.png`` (the
    ``.png`` suffix is added only if missing, case-insensitively), and
    ``_pNNN`` is inserted before the extension when ``multi_page`` is true.
    Without a ``base_name`` the name falls back to
    ``<pdf_stem>_pageNNN_box<box_id>.png``.
    """
    if not base_name:
        box_number = int(as_float(box_id, 0)) or 0
        return f"{pdf_stem}_page{page_idx:03d}_box{box_number}.png"

    file_name = base_name.strip()
    if not file_name.lower().endswith(".png"):
        file_name = f"{file_name}.png"
    if multi_page:
        # Keep pages of the same row apart by tagging the page number.
        stem, ext = os.path.splitext(file_name)
        file_name = f"{stem}_p{page_idx:03d}{ext}"
    return f"{pdf_stem}_{file_name}"

def verify_image(path):
    """Check that ``path`` is an existing, non-empty file PIL can verify.

    Returns ``(True, "")`` on success, otherwise ``(False, reason)`` where the
    reason is ``file_not_found``, ``file_size_zero`` or
    ``PIL_verify_error:<message>``.
    """
    if not os.path.exists(path):
        return False, "file_not_found"
    if os.path.getsize(path) == 0:
        return False, "file_size_zero"
    try:
        with Image.open(path) as handle:
            handle.verify()
    except Exception as exc:
        return False, f"PIL_verify_error:{exc}"
    return True, ""

def save_with_tmp_and_verify(crop, out_path):
    """Write ``crop`` as PNG to ``TMP_DIR``, verify it, then move it to ``out_path``.

    The image passes through ``maybe_binarize`` before saving.

    Raises:
        IOError: if the staged file fails ``verify_image``.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    maybe_binarize(crop).save(staging_path, "PNG", optimize=True, compress_level=9)

    verified, reason = verify_image(staging_path)
    if verified:
        shutil.move(staging_path, out_path)
        return
    raise IOError(f"save_verify_failed:{reason}")

# Pre-trim removed entirely: the page is rendered as-is.
def trimmed_page_image(doc, page_idx):
    """Render 1-based page ``page_idx`` of ``doc`` at ``DPI`` without any trimming.

    Raises:
        IndexError: if ``page_idx`` is outside ``1..len(doc)``. Without this
            check a value of 0 or below would become a negative index and
            select a page counted from the end of the document.
    """
    n_pages = len(doc)
    if not 1 <= page_idx <= n_pages:
        raise IndexError(f"page {page_idx} out of range (1..{n_pages})")
    return render_page_to_image(doc[page_idx - 1], DPI, grayscale=GRAYSCALE_RENDER)

# ----[7] Collect the PDFs in the input folder ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "*.pdf")))
if not pdf_paths:
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
print(f"Found {len(pdf_paths)} PDFs")

# ----[8] Run ----
# Current time in the KST timezone as an ISO string with second precision.
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

# Each worksheet row describes one crop box; it is applied to every PDF found above.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    r = row.to_dict()
    rownum = int(r["_rownum"])
    attempts_val = int(as_float(r.get("attempts", 0))) + 1
    safe_call(ws.update_cell, rownum, col_idx["attempts"], str(attempts_val))
    if SKIP_DONE and (r.get("status", "").lower() == "done"):
        continue

    # A malformed "pages" cell marks this row as an error instead of aborting the run.
    try:
        pages_list = parse_pages_field(r.get("pages", ""))
    except ValueError as e:
        safe_call(ws.update_cell, rownum, col_idx["status"], "error")
        safe_call(ws.update_cell, rownum, col_idx["last_error"], f"bad_pages:{e}"[:500])
        continue
    if not pages_list:
        # Fall back to a single-page "page" column when "pages" is empty.
        if "page" in header and str(r.get("page", "")).strip():
            pages_list = [int(as_float(r.get("page"), 0))]
        else:
            safe_call(ws.update_cell, rownum, col_idx["status"], "error")
            safe_call(ws.update_cell, rownum, col_idx["last_error"], "no_pages_specified")
            continue

    multi_page = len(pages_list) > 1
    box_id = r.get("box_id", "")
    base_name = (r.get("filename", "") or "").strip()
    outputs = []
    errors = []
    expected_total = 0

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"open_fail:{pdf_stem}:{e}")
            continue

        try:
            n_pages = len(doc)
            target_pages = [p for p in pages_list if 1 <= p <= n_pages] if SKIP_OUT_OF_RANGE_PAGES else pages_list
            expected_total += len(target_pages)

            for attempt in range(RETRY_TIMES):
                # Collected per attempt so a retry never counts the same page twice.
                attempt_outputs = []
                try:
                    for p in target_pages:
                        outname = build_output_filename(base_name, pdf_stem, p, box_id, multi_page)
                        out_path = os.path.join(OUT_DIR, outname)
                        # Reuse an existing, valid output unless overwriting is requested.
                        if os.path.exists(out_path) and not OVERWRITE:
                            ok, _why = verify_image(out_path)
                            if ok:
                                attempt_outputs.append(out_path)
                                continue
                        img = trimmed_page_image(doc, p)
                        W, H = img.size
                        x1, y1, x2, y2 = rect_from_row(r, W, H)
                        crop = img.crop((x1, y1, x2, y2))
                        crop = apply_inner_trim(crop, r)
                        save_with_tmp_and_verify(crop, out_path)
                        attempt_outputs.append(out_path)
                except Exception as e:
                    if attempt < RETRY_TIMES - 1:
                        time.sleep(0.1)
                        continue
                    errors.append(f"{pdf_stem}:{e}")
                # Reached on success, or after the final failed attempt (keeps its partial results).
                outputs.extend(attempt_outputs)
                break
        finally:
            doc.close()

    ok_cnt = len(outputs)
    st = "done" if (expected_total > 0 and ok_cnt == expected_total) else ("partial" if ok_cnt > 0 else "error")
    outputs_cell = "" if ok_cnt == 0 else f"{ok_cnt}/{expected_total} files (e.g., {os.path.basename(outputs[0])})"
    patch = {"status": st, "last_error": "" if st == "done" else "; ".join(errors)[:500],
             "outputs": outputs_cell, "processed_at": KST_NOW(), "attempts": str(attempts_val)}
    # Rewrite the whole row: patched log columns plus the unchanged input columns.
    row_vals = [patch.get(h, r.get(h, "")) for h in header]
    a1 = f"{rowcol_to_a1(rownum, 1)}:{rowcol_to_a1(rownum, len(header))}"
    # Keyword arguments keep the call independent of gspread's positional argument order.
    safe_call(ws.update, range_name=a1, values=[row_vals])
    time.sleep(0.1)

print("✅ All rows processed across all PDFs (no pre-trim).")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  return fn(*args, **kwargs)


Found 1 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 32/32 [00:37<00:00,  1.18s/it]

✅ All rows processed across all PDFs (no pre-trim).





In [None]:
# @title 써킷X 12문항 2단통 : 문제 (사전 트림)
# === Colab: 써킷X_문제_12문항 (프리-트림 제거 버전) ===
# 원본 PDF는 이미 여백 제거됨 → 워크시트 크롭 규격만 적용

!pip -q install pymupdf pillow tqdm gspread pandas google-auth

import os, re, io, glob, shutil, time, random
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image, ImageOps
from tqdm import tqdm
from datetime import datetime, timezone, timedelta

# ----[0] Google Drive 마운트 ----
from google.colab import drive
drive.mount('/content/drive')

# ----[1] Paths / settings ----
# Folder scanned for input PDFs and folder that receives the cropped PNGs.
INPUT_DIR = "/content/drive/MyDrive/PBMAI/input"
OUT_DIR   = "/content/drive/MyDrive/PBMAI/output/crops"
os.makedirs(OUT_DIR, exist_ok=True)

# Spreadsheet (URL or bare key) and worksheet holding the crop definitions.
SPREADSHEET_URL_OR_KEY = "https://docs.google.com/spreadsheets/d/1WyScwpGxWpKnpzrOjzyTC5qq8U3NeU3MzBa_I23ggz0/edit?"
WORKSHEET_NAME = "써킷X_2단통"

DPI                = 300    # render resolution passed to render_page_to_image
RETRY_TIMES        = 2      # attempts per PDF in the main loop
OVERWRITE          = False  # when False, an existing valid output file is reused
SKIP_DONE          = True   # skip rows whose status cell is already "done"
SKIP_OUT_OF_RANGE_PAGES = True  # drop page numbers outside 1..len(doc) instead of failing
OUTPUTS_VERBOSE    = False  # NOTE(review): not read by the code in this cell

# ---- Image options ----
GRAYSCALE_RENDER = True     # render pages in grayscale instead of RGB
FORCE_BW        = False     # when True, maybe_binarize converts crops to 1-bit
BW_DITHER       = "NONE"    # "FS" selects Floyd-Steinberg dithering in maybe_binarize
BW_THRESHOLD    = None      # when set, maybe_binarize thresholds at this value instead of dithering

# Pre-trim disabled
PAGE_TRIM_MODE = "none"     # NOTE(review): not read by the code in this cell

# ----[2] Authentication ----
# Authenticate the Colab user, then build a gspread client from the default credentials.
from google.colab import auth
auth.authenticate_user()

import gspread
import google.auth
from google.auth.transport.requests import Request
from gspread.exceptions import APIError
from gspread.utils import rowcol_to_a1

# OAuth scopes requested for the credentials.
SCOPES = [
    "https://www.googleapis.com/auth/spreadsheets",
    "https://www.googleapis.com/auth/drive.readonly",
]
creds, _ = google.auth.default(scopes=SCOPES)
# Refresh up front when the credentials are not currently valid.
if not creds.valid:
    creds.refresh(Request())
gc = gspread.authorize(creds)

# ----[2-1] Safe gspread calls ----
# Defaults for safe_call's retry/backoff behaviour (delays are passed to time.sleep).
BACKOFF_TRIES  = 10    # maximum number of attempts
BACKOFF_BASE   = 2.0   # base delay, doubled on each attempt
BACKOFF_JITTER = 1.0   # upper bound of the random jitter added to each delay
BACKOFF_MAX    = 90.0  # cap on a single delay

def safe_call(fn, *args, tries=BACKOFF_TRIES, base=BACKOFF_BASE,
              jitter=BACKOFF_JITTER, max_wait=BACKOFF_MAX, **kwargs):
    """Call ``fn(*args, **kwargs)`` with exponential backoff and return its result.

    An ``APIError`` is retried only when it looks transient (status 429/500/503,
    or "quota"/"rate" appears in its text) and attempts remain. Any other
    exception is retried on at most the first three attempts. When no retry
    applies, the exception propagates unchanged.
    """
    def _pause(attempt_no):
        # Exponential delay plus random jitter, capped at max_wait.
        time.sleep(min(base * (2 ** attempt_no) + random.random() * jitter, max_wait))

    last_index = tries - 1
    for attempt_no in range(tries):
        try:
            return fn(*args, **kwargs)
        except APIError as api_err:
            status = getattr(getattr(api_err, "response", None), "status_code", None)
            text = (str(getattr(api_err, "response", "")) + " " + str(api_err)).lower()
            retryable = (status in (429, 500, 503)) or ("quota" in text) or ("rate" in text)
            if not retryable or attempt_no >= last_index:
                raise
            _pause(attempt_no)
        except Exception:
            if attempt_no >= min(3, last_index):
                raise
            _pause(attempt_no)

def _extract_key(url_or_key:str):
    m = re.search(r"/spreadsheets/d/([a-zA-Z0-9-_]+)", url_or_key)
    return m.group(1) if m else url_or_key

# Worksheet handle
sh = safe_call(gc.open_by_key, _extract_key(SPREADSHEET_URL_OR_KEY))
ws = safe_call(sh.worksheet, WORKSHEET_NAME)

# ----[3] Clear the log columns (M~R) ----
values_all = safe_call(ws.get_all_values)
if not values_all:
    raise ValueError("시트가 비어 있습니다.")
last_row = len(values_all)
if last_row >= 2:
    # Row 1 is the header, so clearing starts at row 2.
    safe_call(ws.batch_clear, [f"M2:R{last_row}"])

# ----[4] Header fix-up ----
# Append any required column that the sheet's header row is missing, then write it back.
header = values_all[0][:]
NEEDED = ["pages","box_id","x","y","w","h","unit","inner_top","inner_bottom","inner_left","inner_right",
          "filename","status","attempts","last_error","outputs","processed_at"]
for col in NEEDED:
    if col not in header: header.append(col)
# Keyword arguments keep the call independent of gspread's positional argument order.
safe_call(ws.update, range_name='A1', values=[header])
# 1-based column number for each header name (as used by ws.update_cell).
col_idx = {name: header.index(name)+1 for name in header}

# ----[5] Load data ----
records = safe_call(ws.get_all_values)
if len(records) < 2:
    # Header-only sheet: build an empty frame. Padding the data with an empty
    # row would not match the header width and makes the DataFrame constructor fail.
    print("데이터 행이 없습니다.")
df = pd.DataFrame(records[1:], columns=header)
# Sheet row number of each record (row 1 is the header, data starts at row 2).
df["_rownum"] = df.index + 2

# ----[6] Utilities ----
# Fixed UTC+9 timezone used for the processed_at timestamps.
KST = timezone(timedelta(hours=9))
# Local scratch folder where crops are staged and verified before being moved to OUT_DIR.
TMP_DIR = "/content/tmp_crops"; os.makedirs(TMP_DIR, exist_ok=True)

def render_page_to_image(page, dpi=300, grayscale=GRAYSCALE_RENDER):
    """Rasterize a PyMuPDF page into a PIL image.

    The page is scaled by ``dpi / 72`` on both axes and rendered without an
    alpha channel, in grayscale or RGB depending on ``grayscale``.
    """
    scale = dpi / 72.0
    colorspace = fitz.csGRAY if grayscale else fitz.csRGB
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False, colorspace=colorspace)
    # Map the pixmap's component count to the matching PIL mode.
    pil_mode = {1: "L", 3: "RGB", 4: "RGBA"}.get(pixmap.n, "RGB")
    return Image.frombytes(pil_mode, [pixmap.width, pixmap.height], pixmap.samples)

def parse_pages_field(pages_field):
    """Expand a page spec such as ``"1,3-5"`` into a sorted list of unique ints.

    The value is stringified first, so numbers are accepted as well as text.
    Comma-separated tokens are either single page numbers or ``a-b`` ranges;
    a reversed range (``5-3``) is treated the same as ``3-5``. An empty spec
    yields ``[]``. A token that is not an integer raises ``ValueError``.
    """
    text = str(pages_field).strip()
    if not text:
        return []

    pages = set()
    # filter(None, ...) drops the empty tokens left by stray commas.
    for token in filter(None, re.split(r"\s*,\s*", text)):
        if "-" not in token:
            pages.add(int(token))
            continue
        first, last = (int(part) for part in token.split("-", 1))
        low, high = sorted((first, last))
        pages |= set(range(low, high + 1))
    return sorted(pages)

def as_float(x,default=0.0):
    """Convert ``x`` to float via its stripped string form, or return ``default``.

    Only conversion failures (``ValueError`` / ``TypeError``) fall back to the
    default; interrupts such as ``KeyboardInterrupt`` are no longer swallowed.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        return default

def rect_from_row(row_item,W,H):
    """Turn a row's ``x``/``y``/``w``/``h`` cells into a pixel box on a W x H image.

    When ``unit`` is ``"px"`` the numbers are used as pixels; any other value
    (default ``"frac"``) scales them by the image width/height. The box is
    clamped to the image and returned as ``(x1, y1, x2, y2)``.

    Raises:
        ValueError: if the clamped box has no area.
    """
    unit_name = (row_item.get("unit") or "frac").strip().lower()
    left, top, width, height = (
        as_float(row_item.get(key, 0)) for key in ("x", "y", "w", "h")
    )

    # Pixel units pass through unchanged (scale 1); fractions scale by the image size.
    scale_x, scale_y = (1, 1) if unit_name == "px" else (W, H)
    x1 = int(round(left * scale_x))
    y1 = int(round(top * scale_y))
    x2 = int(round((left + width) * scale_x))
    y2 = int(round((top + height) * scale_y))

    # Clamp to the image bounds.
    x1 = max(0, x1)
    y1 = max(0, y1)
    x2 = min(W, x2)
    y2 = min(H, y2)

    if x2 <= x1 or y2 <= y1:
        raise ValueError(f"잘못된 박스: ({x1},{y1})~({x2},{y2})")
    return (x1, y1, x2, y2)

def apply_inner_trim(img,row_item):
    """Shave the ``inner_top/bottom/left/right`` fractions off an already-cropped image.

    Each margin is read from ``row_item``, clamped to ``[0, 1]`` and applied as
    a fraction of the image's own width/height. When every margin is zero the
    input image is returned unchanged.
    """
    def _fraction(column):
        # Clamp the cell value into [0, 1].
        return max(0.0, min(1.0, as_float(row_item.get(column, 0))))

    top_f = _fraction("inner_top")
    bottom_f = _fraction("inner_bottom")
    left_f = _fraction("inner_left")
    right_f = _fraction("inner_right")
    if top_f + bottom_f + left_f + right_f == 0:
        return img

    width, height = img.size
    cut_left = int(round(left_f * width))
    cut_right = int(round(right_f * width))
    cut_top = int(round(top_f * height))
    cut_bottom = int(round(bottom_f * height))
    return img.crop((cut_left, cut_top, width - cut_right, height - cut_bottom))

def maybe_binarize(img):
    """Return ``img`` converted to 1-bit black/white when ``FORCE_BW`` is set.

    With ``BW_THRESHOLD`` set, the grayscale image is auto-contrasted and cut
    at that threshold. Otherwise PIL's own 1-bit conversion is used, with
    Floyd-Steinberg dithering only when ``BW_DITHER`` is ``"FS"``.
    When ``FORCE_BW`` is falsy the image is returned untouched.
    """
    if not FORCE_BW:
        return img

    gray = img.convert("L")
    if BW_THRESHOLD is None:
        use_fs = BW_DITHER.upper() == "FS"
        dither_mode = Image.FLOYDSTEINBERG if use_fs else Image.NONE
        return gray.convert("1", dither=dither_mode)

    stretched = ImageOps.autocontrast(gray)
    return stretched.point(lambda p: 255 if p >= BW_THRESHOLD else 0, mode="1")

def build_output_filename(base_name,pdf_stem,page_idx,box_id,multi_page=False):
    """Compose the PNG file name for one crop.

    With a ``base_name`` the result is ``<pdf_stem>_<base_name>.png`` (the
    ``.png`` suffix is added only if missing, case-insensitively), and
    ``_pNNN`` is inserted before the extension when ``multi_page`` is true.
    Without a ``base_name`` the name falls back to
    ``<pdf_stem>_pageNNN_box<box_id>.png``.
    """
    if not base_name:
        box_number = int(as_float(box_id, 0)) or 0
        return f"{pdf_stem}_page{page_idx:03d}_box{box_number}.png"

    file_name = base_name.strip()
    if not file_name.lower().endswith(".png"):
        file_name = f"{file_name}.png"
    if multi_page:
        # Keep pages of the same row apart by tagging the page number.
        stem, ext = os.path.splitext(file_name)
        file_name = f"{stem}_p{page_idx:03d}{ext}"
    return f"{pdf_stem}_{file_name}"

def verify_image(path):
    """Check that ``path`` is an existing, non-empty file PIL can verify.

    Returns ``(True, "")`` on success, otherwise ``(False, reason)`` where the
    reason is ``file_not_found``, ``file_size_zero`` or
    ``PIL_verify_error:<message>``.
    """
    if not os.path.exists(path):
        return False, "file_not_found"
    if os.path.getsize(path) == 0:
        return False, "file_size_zero"
    try:
        with Image.open(path) as handle:
            handle.verify()
    except Exception as exc:
        return False, f"PIL_verify_error:{exc}"
    return True, ""

def save_with_tmp_and_verify(crop,out_path):
    """Write ``crop`` as PNG to ``TMP_DIR``, verify it, then move it to ``out_path``.

    The image passes through ``maybe_binarize`` before saving.

    Raises:
        IOError: if the staged file fails ``verify_image``.
    """
    os.makedirs(TMP_DIR, exist_ok=True)
    staging_path = os.path.join(TMP_DIR, os.path.basename(out_path))
    maybe_binarize(crop).save(staging_path, "PNG", optimize=True, compress_level=9)

    verified, reason = verify_image(staging_path)
    if verified:
        shutil.move(staging_path, out_path)
        return
    raise IOError(f"save_verify_failed:{reason}")

# Pre-trim removed entirely: the page is rendered as-is.
def trimmed_page_image(doc,page_idx):
    """Render 1-based page ``page_idx`` of ``doc`` at ``DPI`` without any trimming.

    Raises:
        IndexError: if ``page_idx`` is outside ``1..len(doc)``. Without this
            check a value of 0 or below would become a negative index and
            select a page counted from the end of the document.
    """
    n_pages = len(doc)
    if not 1 <= page_idx <= n_pages:
        raise IndexError(f"page {page_idx} out of range (1..{n_pages})")
    return render_page_to_image(doc[page_idx - 1], DPI, grayscale=GRAYSCALE_RENDER)

# ----[7] Collect the PDFs in the input folder ----
pdf_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "*.pdf")))
if not pdf_paths:
    raise FileNotFoundError(f"INPUT_DIR에 PDF가 없습니다: {INPUT_DIR}")
print(f"Found {len(pdf_paths)} PDFs")

# ----[8] Run ----
# Current time in the KST timezone as an ISO string with second precision.
KST_NOW = lambda: datetime.now(KST).isoformat(timespec="seconds")

# Each worksheet row describes one crop box; it is applied to every PDF found above.
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    r = row.to_dict()
    rownum = int(r["_rownum"])
    attempts_val = int(as_float(r.get("attempts", 0))) + 1
    safe_call(ws.update_cell, rownum, col_idx["attempts"], str(attempts_val))
    if SKIP_DONE and (r.get("status", "").lower() == "done"):
        continue

    # A malformed "pages" cell marks this row as an error instead of aborting the run.
    try:
        pages_list = parse_pages_field(r.get("pages", ""))
    except ValueError as e:
        safe_call(ws.update_cell, rownum, col_idx["status"], "error")
        safe_call(ws.update_cell, rownum, col_idx["last_error"], f"bad_pages:{e}"[:500])
        continue
    if not pages_list:
        # Fall back to a single-page "page" column when "pages" is empty.
        if "page" in header and str(r.get("page", "")).strip():
            pages_list = [int(as_float(r.get("page"), 0))]
        else:
            safe_call(ws.update_cell, rownum, col_idx["status"], "error")
            safe_call(ws.update_cell, rownum, col_idx["last_error"], "no_pages_specified")
            continue

    multi_page = len(pages_list) > 1
    box_id = r.get("box_id", "")
    base_name = (r.get("filename", "") or "").strip()
    outputs, errors = [], []
    expected_total = 0

    for pdf_path in pdf_paths:
        pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            errors.append(f"open_fail:{pdf_stem}:{e}")
            continue

        try:
            n_pages = len(doc)
            target_pages = [p for p in pages_list if 1 <= p <= n_pages] if SKIP_OUT_OF_RANGE_PAGES else pages_list
            expected_total += len(target_pages)

            for attempt in range(RETRY_TIMES):
                # Collected per attempt so a retry never counts the same page twice.
                attempt_outputs = []
                try:
                    for p in target_pages:
                        outname = build_output_filename(base_name, pdf_stem, p, box_id, multi_page)
                        out_path = os.path.join(OUT_DIR, outname)
                        # Reuse an existing, valid output unless overwriting is requested.
                        if os.path.exists(out_path) and not OVERWRITE:
                            ok, _why = verify_image(out_path)
                            if ok:
                                attempt_outputs.append(out_path)
                                continue

                        img = trimmed_page_image(doc, p)
                        W, H = img.size
                        x1, y1, x2, y2 = rect_from_row(r, W, H)
                        crop = img.crop((x1, y1, x2, y2))
                        crop = apply_inner_trim(crop, r)
                        save_with_tmp_and_verify(crop, out_path)
                        attempt_outputs.append(out_path)
                except Exception as e:
                    if attempt < RETRY_TIMES - 1:
                        time.sleep(0.1)
                        continue
                    errors.append(f"{pdf_stem}:{e}")
                # Reached on success, or after the final failed attempt (keeps its partial results).
                outputs.extend(attempt_outputs)
                break
        finally:
            doc.close()

    ok_cnt = len(outputs)
    st = "done" if (expected_total > 0 and ok_cnt == expected_total) else ("partial" if ok_cnt > 0 else "error")
    outputs_cell = "" if ok_cnt == 0 else f"{ok_cnt}/{expected_total} files (e.g., {os.path.basename(outputs[0])})"
    patch = {"status": st, "last_error": "" if st == "done" else "; ".join(errors)[:500],
             "outputs": outputs_cell, "processed_at": KST_NOW(), "attempts": str(attempts_val)}
    # Rewrite the whole row: patched log columns plus the unchanged input columns.
    row_vals = [patch.get(h, r.get(h, "")) for h in header]
    a1 = f"{rowcol_to_a1(rownum, 1)}:{rowcol_to_a1(rownum, len(header))}"
    # Keyword arguments keep the call independent of gspread's positional argument order.
    safe_call(ws.update, range_name=a1, values=[row_vals])
    time.sleep(0.1)

print("✅ All rows processed (no pre-trim).")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  return fn(*args, **kwargs)


Found 1 PDFs


  return fn(*args, **kwargs)
Processing rows: 100%|██████████| 12/12 [00:08<00:00,  1.34it/s]

✅ All rows processed (no pre-trim).





# PDF, 이미지 파일 처리

In [None]:
# @title PNG 묶어서 PDF로 저장
!pip -q install pillow tqdm

import os, glob, unicodedata
from collections import defaultdict
from PIL import Image
from tqdm import tqdm

# 1) Force a fresh Drive mount
try:
    from google.colab import drive
    try:
        # Best effort: a failure to unmount is ignored on purpose.
        drive.flush_and_unmount()
    except Exception:
        pass
    drive.mount('/content/drive', force_remount=True)
except Exception as e:
    # Mount problems are only reported here; the path checks below decide whether to stop.
    print("Drive mount error:", e)

# 2) Locate the crops folder: try the default path first, otherwise search recursively
BASES = [
    "/content/drive/MyDrive",
    "/content/drive/My Drive",
]
IN_DIR = None
for base in BASES:
    cand = os.path.join(base, "PBMAI", "output", "crops")
    if os.path.exists(cand):
        IN_DIR = cand
        break

def find_crops_dir(start_dir):
    """Return the first ``.../output/crops`` directory found under ``start_dir``.

    The tree is searched recursively for entries named ``crops``; a hit counts
    only when its parent is named ``output``. Candidates are examined in
    sorted order so the result is deterministic. Returns ``None`` when
    nothing matches.
    """
    pattern = os.path.join(start_dir, "**", "crops")
    for path in sorted(glob.glob(pattern, recursive=True)):
        parts = path.replace("\\", "/").split("/")
        # Only the final two path components decide the match.
        if parts[-2:] == ["output", "crops"]:
            return path
    return None

# Default path not found: search each existing base's PBMAI folder recursively.
if IN_DIR is None:
    for base in BASES:
        if os.path.exists(base):
            guess = find_crops_dir(os.path.join(base, "PBMAI"))
            if guess:
                IN_DIR = guess
                break

# Nothing found by either method: stop here.
if IN_DIR is None:
    raise FileNotFoundError("PBMAI/output/crops 폴더를 찾지 못했습니다.")

# The combined PDFs go to a "crop_combine" folder in place of ".../output/crops".
OUT_DIR = IN_DIR.replace("/output/crops", "/crop_combine")
os.makedirs(OUT_DIR, exist_ok=True)

print("[INFO] IN_DIR  =", IN_DIR)
print("[INFO] OUT_DIR =", OUT_DIR)

# 3) Scan PNGs (and other images) and group them (file stem must end in a two-digit '_NN')
# Extensions picked up by scan_images.
EXTS = ["png","jpg","jpeg","tif","tiff","bmp","webp"]
# Preference order when the same base/sequence exists with several extensions (earlier wins).
PREFER_ORDER = ["png","jpg","jpeg","tif","tiff","bmp","webp"]

def normalize(s):
    """Return ``s`` in Unicode NFC (composed) form."""
    composed = unicodedata.normalize("NFC", s)
    return composed

def scan_images(indir):
    """Return the sorted paths of all image files directly inside ``indir``.

    A file counts as an image when its extension, compared case-insensitively,
    is listed in ``EXTS``; mixed-case extensions such as ``.Png`` are therefore
    included as well. Sub-folders are not descended into.
    """
    wanted = {ext.lower() for ext in EXTS}
    candidates = glob.glob(os.path.join(indir, "*"))
    return sorted(
        path for path in candidates
        if os.path.splitext(path)[1][1:].lower() in wanted
    )

def split_base_seq(path):
    """Split ``<base>_NN.<ext>`` into ``(base, NN)``.

    The file name is NFC-normalized and its extension dropped; the part after
    the last underscore must be exactly two digits. Returns ``None`` when the
    name does not follow that pattern.
    """
    stem = os.path.splitext(normalize(os.path.basename(path)))[0]
    prefix, underscore, suffix = stem.rpartition('_')
    digits = suffix.strip()
    if not underscore or len(digits) != 2 or not digits.isdigit():
        return None
    return prefix.strip(), int(digits)

def load_image_as_rgb(p):
    """Open the image at ``p`` in a mode suitable for writing into a PDF.

    ``RGB`` and ``L`` images are returned as opened; every other mode
    (transparency, palette, ...) is converted to ``RGB``.
    """
    image = Image.open(p)
    # Grayscale ("L") is kept as-is; convert it here too if uniform RGB pages are wanted.
    if image.mode in ("RGB", "L"):
        return image
    return image.convert("RGB")

all_files = scan_images(IN_DIR)
print(f"[INFO] 스캔 이미지 수: {len(all_files)}")
if not all_files:
    raise FileNotFoundError("crops 폴더에 이미지가 없습니다.")

groups = defaultdict(dict)  # base -> {seq: path}
skipped = []
for p in all_files:
    parsed = split_base_seq(p)
    if not parsed:
        skipped.append(os.path.basename(p))
        continue
    base, seq = parsed

    # When the same seq exists with several extensions, keep the preferred one (PNG first).
    cur = groups[base].get(seq)
    if cur is None:
        groups[base][seq] = p
    else:
        cur_ext = os.path.splitext(cur)[1][1:].lower()
        new_ext = os.path.splitext(p)[1][1:].lower()
        if PREFER_ORDER.index(new_ext) < PREFER_ORDER.index(cur_ext):
            groups[base][seq] = p

print(f"[INFO] 매칭 베이스: {len(groups)} / 총 페이지: {sum(len(v) for v in groups.values())}")
if skipped:
    print("[참고] 규칙 불일치 파일 예시:", skipped[:10])

if not groups:
    raise FileNotFoundError("'<베이스>_NN.ext' 형식이 하나도 매칭되지 않았습니다. (_01, _02 … 확인)")

# 4) Write one multi-page PDF per base, pages ordered by sequence number
errors = []
for base, seqmap in tqdm(groups.items(), desc="Combining PNG -> PDF"):
    opened = []  # every image opened for this group, closed again in `finally`
    try:
        pages = [seqmap[s] for s in sorted(seqmap)]
        for page_path in pages:
            opened.append(load_image_as_rgb(page_path))

        out_pdf = os.path.join(OUT_DIR, f"{normalize(base)}_crop.pdf")
        opened[0].save(out_pdf, "PDF", save_all=True, append_images=opened[1:], resolution=300.0)

        print(f"✔ {base}: {len(pages)} pages -> {out_pdf}")
    except Exception as e:
        # One failing group must not stop the others; it is reported at the end.
        errors.append((base, str(e)))
        print(f"✘ {base} -> {e}")
    finally:
        # Release image handles and pixel data before moving to the next group.
        for im in opened:
            im.close()

print("\n완료!" if not errors else f"\n실패 그룹: {errors}")


Mounted at /content/drive
[INFO] IN_DIR  = /content/drive/MyDrive/PBMAI/output/crops
[INFO] OUT_DIR = /content/drive/MyDrive/PBMAI/crop_combine
[INFO] 스캔 이미지 수: 32
[INFO] 매칭 베이스: 1 / 총 페이지: 32


Combining PNG -> PDF: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it]

✔ MOBI25FS012고_문제: 32 pages -> /content/drive/MyDrive/PBMAI/crop_combine/MOBI25FS012고_문제_crop.pdf

완료!





In [None]:
# @title 가로 600px로 이미지 일괄 리사이즈
# === 구글드라이브의 모든 이미지파일 가로 600px로 일괄 리사이즈 (비율 유지, 덮어쓰기) ===
# 실행환경: Google Colab
# ------------------------------------------------------------

!pip install pillow tqdm

import os
from PIL import Image
from tqdm import tqdm
from google.colab import drive

# [1] Mount Google Drive
drive.mount('/content/drive')

# [2] Folder whose images get resized
#   Note: use "/" as the separator, not the Windows-style "\"
folder_path = '/content/drive/MyDrive/20 문항 관리/INSERTIMG'

# [3] Image extensions that are processed
IMG_EXT = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.webp')

# [4] Walk the folder and shrink every image wider than 600px (aspect ratio kept, file overwritten)
count = 0
for filename in tqdm(os.listdir(folder_path), desc='Resizing'):
    if not filename.lower().endswith(IMG_EXT):
        continue
    file_path = os.path.join(folder_path, filename)
    try:
        with Image.open(file_path) as img:
            w, h = img.size
            if w <= 600:
                # Already narrow enough; leave the file untouched.
                continue
            new_h = int(h * (600 / w))
            img.resize((600, new_h), Image.LANCZOS).save(file_path)  # overwrite in place
            count += 1
    except Exception as e:
        print(f'⚠️ 오류: {filename} → {e}')

print(f'✅ 총 {count}개의 이미지를 600px로 리사이즈 완료했습니다.')


Mounted at /content/drive


Resizing: 100%|██████████| 38/38 [00:14<00:00,  2.63it/s]

✅ 총 38개의 이미지를 600px로 리사이즈 완료했습니다.





In [None]:
# @title CRUX PDF 파일 홀짝구별해 PNG로 분할 (홀 본문항, 짝 기출체크)

# 1. 필수 라이브러리 설치 (이미 설치했다면 생략 가능)
!apt-get install -y poppler-utils
!pip install pdf2image

from google.colab import drive
import os
from pathlib import Path
from pdf2image import convert_from_path

# Mount Google Drive
drive.mount('/content/drive')

# --- Path settings ---
# Folder scanned for source PDFs and folder that receives the per-page PNGs.
input_dir = '/content/drive/MyDrive/PBMAI/02_Input/Problem'
output_dir = '/content/drive/MyDrive/PBMAI/03_Output/crops'

# Create the output folder on first use.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

def process_pdfs_advanced():
    """Split every PDF in ``input_dir`` into per-page PNGs in ``output_dir``.

    Pages are rendered at 600 dpi, in grayscale, using the PDF's crop box.
    Consecutive page pairs share a sequence number: odd pages are saved as
    ``<stem>_<NNN>_A.png`` and even pages as ``<stem>_<NNN>_B.png``.

    Pages are converted in small batches rather than all at once, so memory
    use does not grow with the page count. PDFs are processed in sorted order.
    """
    # Local import: only this function needs the page-count helper.
    from pdf2image import pdfinfo_from_path

    pdf_files = sorted(f for f in os.listdir(input_dir) if f.lower().endswith('.pdf'))

    if not pdf_files:
        print("입력 폴더에 PDF 파일이 없습니다.")
        return

    batch_size = 10  # pages rendered per convert_from_path call

    for pdf_file in pdf_files:
        file_path = os.path.join(input_dir, pdf_file)
        file_name_stem = Path(pdf_file).stem

        print(f"처리 중: {pdf_file} (600dpi, Grayscale, Crop 적용)...")

        total_pages = int(pdfinfo_from_path(file_path)["Pages"])

        for first_page in range(1, total_pages + 1, batch_size):
            last_page = min(first_page + batch_size - 1, total_pages)
            # dpi=600: high resolution; grayscale=True: gray output;
            # use_cropbox=True: render the PDF's crop box.
            images = convert_from_path(
                file_path,
                dpi=600,
                grayscale=True,
                use_cropbox=True,
                first_page=first_page,
                last_page=last_page,
            )

            for page_num, image in enumerate(images, start=first_page):
                # Pages (1, 2) -> sequence 1, pages (3, 4) -> sequence 2, ...
                seq_num = ((page_num - 1) // 2) + 1
                suffix = "A" if page_num % 2 != 0 else "B"

                new_filename = f"{file_name_stem}_{seq_num:03d}_{suffix}.png"
                save_path = os.path.join(output_dir, new_filename)

                image.save(save_path, 'PNG')
                image.close()

        print(f"완료: {pdf_file} (총 {total_pages} 페이지)")

# Run the conversion for every PDF in input_dir, then report completion.
process_pdfs_advanced()
print("--- 모든 작업이 완료되었습니다 ---")

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.12).
0 upgraded, 0 newly installed, 0 to remove and 2 not upgraded.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
처리 중: 26CRUX미적분2권조판5_문제.pdf (600dpi, Grayscale, Crop 적용)...
완료: 26CRUX미적분2권조판5_문제.pdf (총 90 페이지)
--- 모든 작업이 완료되었습니다 ---
