In [3]:
import os
import re
import warnings
import pandas as pd
import fitz  # PyMuPDF
import pdfplumber
import pickle   # chunks 캐시 저장
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
# 🔧 설정
upload_dir = "upload_docs"
index_path = "faiss_index"
chunk_cache_path = os.path.join(index_path, "chunks.pkl")
faiss_file = os.path.join(index_path, "index.faiss")
pkl_file = os.path.join(index_path, "index.pkl")

embedding_model = OllamaEmbeddings(model="bge-m3")

warnings.filterwarnings("ignore", message="CropBox missing from /Page, defaulting to MediaBox")


In [None]:
✅ 목표: fitz로 텍스트 및 표 추출

In [None]:
# ✅ 문서 → 청크 변환
def process_pdfs_to_chunks():
    all_docs = []

    for file_name in os.listdir(upload_dir):
        if not file_name.endswith(".pdf"):
            continue

        file_path = os.path.join(upload_dir, file_name)
        print(f"📄 처리 중: {file_name}")

        # with pdfplumber.open(file_path) as pdf:
        pdf = fitz.open(file_path)
        for page_number in range(len(pdf)):
            page = pdf.load_page(page_number)
            text = page.get_text()
            tables = page.find_tables()
            table_summaries = []
            
            for table in tables:
                raw_table = table.extract()
                if raw_table and len(raw_table) > 1:
                    df = pd.DataFrame(raw_table[1:], columns=raw_table[0])
                    table_summaries.append(f"\n\n===TABLE===\n\n{df.to_markdown()}")
        
            ''' for table in tables:
                df = pd.DataFrame(table.extract()[1:], columns=table.extract()[0])
                table_summaries.append(f"\n\n===TABLE===\n\n{df.to_markdown()}") '''

            combined_text = text + "\n" + "\n".join(table_summaries)

            doc = Document(
                page_content=combined_text,
                metadata={
                    "university": file_name.replace("2025 ", "").replace(".pdf", ""),  # 대학명
                    "source": file_name,
                    "page": page_number + 1
                }
            )

            all_docs.append(doc)

    print("페이지별 청크 생성 중...")
    #splitter = SemanticChunker(embedding_model)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_documents(all_docs)

    print("캐시 저장 중...")
    with open(chunk_cache_path, "wb") as f:
        pickle.dump(chunks, f)

    print("✅ 캐시 저장 완료")
    return chunks


✅ 목표: fitz로 텍스트 추출, pdfplumber로 표 추출

In [8]:
chunk_pages = {
    #"2025 강원대.pdf": (3, 18),
    "2025 강원대.txt": (1, None),
    "2025 건국대.pdf": (3, None),
    "2025 경북대.pdf": (2, 8),
    "2025 경희대.pdf": (11, 20),
    "2025 고려대.pdf": (3, 17),
    "2025 동아대.pdf": (3, 16),
    "2025 부산대.pdf": (3, 17),
    "2025 서강대.pdf": (3, None),
    "2025 서울대.pdf": (5, 12),
    "2025 서울시립대.pdf": (4, 17),
    "2025 성균관대.pdf": (5, 16),
    "2025 아주대.pdf": (2, None),
    "2025 연세대.pdf": (3, 14),
    "2025 영남대.pdf": (5, 15),
    "2025 원광대.pdf": (2, 15),
    "2025 이화여대.pdf": (2, 10),
    "2025 인하대.pdf": (2, 14),
    "2025 전남대.pdf": (3, 15),
    "2025 전북대.pdf": (3, 12),
    "2025 제주대.pdf": (3, 16),
    "2025 중앙대.pdf": (2, 14),
    "2025 충남대.pdf": (3, 19),
    "2025 충북대.pdf": (3, 22),
    "2025 한국외대.pdf": (3, None),
    "2025 한양대.pdf": (3, 22)
}


In [5]:
import json
import yaml

# 불러오기
with open("university.yaml", "r", encoding="utf-8") as f:
    chunk_pages = yaml.safe_load(f)
    
# 저장
with open("university.json", "w", encoding="utf-8") as f:
    json.dump(chunk_pages, f, ensure_ascii=False, indent=2)


FileNotFoundError: [Errno 2] No such file or directory: 'university.yaml'

In [7]:
def reduce_spaces(text: str) -> str:
    return re.sub(r' {2,}', '', text)

def reduce_spaces_all(text: str) -> str:
    return re.sub(r' +', '', text)


In [None]:
def process_pdfs_to_chunks():
    all_docs = []

    for file_name in os.listdir(upload_dir):

        file_path = os.path.join(upload_dir, file_name)
        ext = os.path.splitext(file_name)[1].lower()
        print(f"📄 처리 중: {file_name}")

        if ext == ".pdf":
            # PDF 파일 처리        
            # PDF 객체를 연다
            pdf_fitz = fitz.open(file_path)
            # 페이지를 Skip하기 위한 정의
            total_pages = len(pdf_fitz)
            start_page, end_page = chunk_pages.get(file_name, (1, None))  # 기본값: 1페이지부터 끝까지
            start_idx = max(start_page - 1, 0)
            end_idx = min(end_page if end_page is not None else total_pages, total_pages)

            with pdfplumber.open(file_path) as pdf_plumber:
                for page_number in range(start_idx, end_idx):
                    text = ""
                    summaries = []

                    # === 텍스트 추출 (fitz) ===
                    try:
                        text = pdf_fitz.load_page(page_number).get_text()
                        text = reduce_spaces(text)
                    except Exception as e:
                        print(f"⚠️ 텍스트 추출 오류 (Page {page_number+1}): {e}")
                        text = ""

                    # === 표 추출 (pdfplumber) ===
                    try:
                        tables = pdf_plumber.pages[page_number].extract_tables()
                        for table in tables:
                            if not table or len(table) < 2:
                                continue
                            df = pd.DataFrame(table[1:], columns=table[0])
                            md_table = df.to_markdown()
                            md_table = reduce_spaces_all(md_table)
                            summaries.append(f"\n\n===TABLE===\n\n{md_table}")
                    except Exception as e:
                        print(f"⚠️ 표 추출 오류 (Page {page_number+1}): {e}")
                        
                    # === 문서 객체로 결합 ===
                    combined_text = "".join(summaries) + "\n" + "\n" + text
                    doc = Document(
                        page_content=combined_text,
                        metadata={
                            "university": file_name.replace("2025 ", "").rsplit(".", 1)[0],
                            "source": file_name,
                            "page": page_number + 1
                        }
                    )
                    all_docs.append(doc)
                    
        elif ext == ".txt":
            # TXT 파일 처리
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read().strip()
                combined_text = reduce_spaces(text)
                if text:
                    doc = Document(
                        page_content=combined_text,
                        metadata={
                            "university": file_name.replace("2025 ", "").rsplit(".", 1)[0],
                            "source": file_name,
                            "page": 1
                        }
                    )
                    all_docs.append(doc)
            
        else:
            print(f"❗ 지원하지 않는 파일 형식: {file_name}")

    print("🧩 페이지별 청크 생성 중...")
    splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
    chunks = splitter.split_documents(all_docs)

    print("💾 캐시 저장 중...")
    with open(chunk_cache_path, "wb") as f:
        pickle.dump(chunks, f)

    print("✅ 완료! 총 청크 수:", len(chunks))
    return chunks

chunks = process_pdfs_to_chunks()

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 강원대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 건국대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 경북대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


📄 처리 중: 2025 경희대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 고려대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 동아대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 부산대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 서강대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 서울대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 서울시립대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 성균관대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


📄 처리 중: 2025 아주대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 연세대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 영남대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 원광대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 이화여대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 인하대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 전남대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 전북대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 제주대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 중앙대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 충남대.pdf
MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict



CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 충북대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 한국외대.pdf
MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict



CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

📄 처리 중: 2025 한양대.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

🧩 페이지별 청크 생성 중...
💾 캐시 저장 중...
✅ 완료! 총 청크 수: 1598


In [None]:
chunks = process_pdfs_to_chunks()

In [148]:
with open(chunk_cache_path, "wb") as f:
        pickle.dump(chunks, f)

In [157]:
import fitz
file_path = "upload_docs\\2025 강원대.pdf"  # 파일 경로

doc = fitz.open(file_path)
page = doc.load_page(3)
text = reduce_spaces( page.get_text() )

print(text)

- 2 -
2. 모집군 및 전형유형별 모집인원
모집군
전형유형
모집인원
‘나’군
일반전형
37명
특별전형
 3명
합계
40명
 ※ 특별전형에 의하여 선발된 입학생 중 「국민기초생활보장법」에 따른 수급가구, 「국민기초생활보장법」에 
따른 차상위계층 확인서 발급 사업에 해당하는 가구(구 우선돌봄 차상위가구), 「국민기초생활보장법」에 따
라 자활사업에 참여하는 차상위가구에 해당하는 자를 특별전형에 의해 선발되는 전체 입학생의 30%(1명) 
이상으로 선발함
3. 비법학사 및 타대학쿼터, 강원지역 대학 출신자 선발 방법
모집군
쿼터구분
선발인원
비 고
‘나’군
비법학사
20명 이상
- 강원지역 대학 출신자 
선발비율 10%(4명) 이상
타대학 학사
24명 이상
 가. 비(非) 법학사의 범위
- 법학사 이외의 학사학위를 받은 자
- 비 법학을 주전공으로 하고, 법학을 부전공으로 하여 학사학위를 받은 자(법학을 주전공
으로 하고, 비 법학을 부전공으로 하여 학사학위를 받은 자는 비법학사로 불인정함)
- 법학사와 비법학사를 이중으로 취득한 자는 본인이 선택하여 법학사 또는 비법학사로 
지원가능하나, 이를 동시에 선택하여 지원은 불가함
나. 타대학의 범위
- 강원대학교 이외의 대학에서 학사학위를 받은 자
- 강원대학교 이외의 대학에서 학사학위를 받고 강원대학교 소속 대학원에서 석사 또는 
박사학위를 받은 자
- 타 대학에서 학사학위를 받은 후 강원대학교에서 학사편입학하여 학사학위를 받은 자 
(강원대학교를 졸업하고 타 대학에 학사편입을 했을 경우에는 인정하지 않음)
- 강원대학교 및 타 대학에서 복수의 학사학위를 취득한 자는 본인의 선택으로 강원대학교 
학사 또는 타 대학 학사로 지원할 수 있음. 단, 이를 동시에 선택하여 지원할 수 없음



In [144]:
# 예: 3페이지짜리 문서의 메타데이터 보기
target_doc = [d for d in chunks if d.metadata["page"] == 5 and d.metadata["university"] == "고려대"]
for doc in target_doc:
    print(doc.page_content)


고려대학교 법학전문대학원 신입생 선발요강3
 바. 등록 
- 합격 이후 등록 및 기타 안내는 홈페이지 공지에 따른다.
사. 추가 합격자 발표 및 등록
  -  미등록 등으로 결원이 발생할 경우, 추가합격자를 선발하여 개별통지를 통해 추가합격
자를 선발한다.
3. 입학정원과 전형별 모집인원
    ※2025학년도 결원 보충의 선발은 『법학전문대학원 설치*운영에 관한 법률 시행령』 개
정에 의해 진행 여부가 결정되며, 진행 시 별도 전형을 진행하지 않고 2024학년도에 발생
하는 결원 인원*에 따라 학생선발 비율*을 고려하여 예비후보자 순번대로 선발함. 단, 선
발인원은 입학전형의 10%를 초과하지 아니하는 범위로 함
    * 학생선발 비율: 일반전형/특별전형, 법학사/비법학사, 자교/타교 비율 등
    * 2024학년도 결원 인원의 확정일: 2025. 2. 28.(금)
* 학생선발 비율: 일반전형/특별전형, 법학사/비법학사, 자교/타교 비율 등
    * 2024학년도 결원 인원의 확정일: 2025. 2. 28.(금)
(1) 일반전형과 특별전형에 동시에 지원할 수 없으며 1개 전형만 선택하여 지원한다.
(2) 「법학전문대학원 설치․운영에 관한 법률」 제26조 제2항, 제3항 및 동 시행령에 근거하
여 법학사 이외의 학위소지자(비법학사) 및 고려대학교 이외의 대학에서 학사학위취득자
(타 대학졸업자)를 각각 입학정원의 1/3 이상 선발한다.
가. 법학을 부전공으로 이수한 비법학사는 비법학사로, 비법학을 부전공으로 이수한 법학사
는 법학사로 지원해야 한다.
나. 법학을 복수(이중)전공으로 이수한 비법학사 또는 비법학을 복수(이중)전공으로 이수한 
법학사는 본인의 선택으로 법학사 또는 비법학사 어느 한 곳으로만 지원하여야 한다.
다. 두 곳 이상의 대학(교)에서 모두 학사학위를 취득한 지원자는 그 중 한 곳을 선택하여 지
원해야 한다.
입학정원
다. 두 곳 이상의 대학(교)에서 모두 학사학위를 취득한 지원자는 그 중 한 곳을 선택하여 지
원해야 한다.
입학정원
전형

In [176]:
vectorstore = FAISS.from_documents(chunks, embedding_model)
vectorstore.save_local(index_path)

In [178]:
# ✅ 예시 질문
question = "고려대 일반전형과 특별전형의 모집인원을 알려주세요"

docs = vectorstore.similarity_search(question, k=3, filter={"university": "고려대", "page": 5})

In [180]:
docs
#print(f"출처: {docs.metadata.get('university')} {docs.metadata.get('source')} (페이지 {docs.metadata.get('page')})\n")

[Document(id='64931dad-e31b-490e-af78-b6258d99a1eb', metadata={'university': '고려대', 'source': '2025 고려대.pdf', 'page': 5}, page_content='* 2024학년도 결원 인원의 확정일: 2025. 2. 28.(금)\n(1) 일반전형과 특별전형에 동시에 지원할 수 없으며 1개 전형만 선택하여 지원한다.\n(2) 「법학전문대학원 설치․운영에 관한 법률」 제26조 제2항, 제3항 및 동 시행령에 근거하\n여 법학사 이외의 학위소지자(비법학사) 및 고려대학교 이외의 대학에서 학사학위취득자\n(타 대학졸업자)를 각각 입학정원의 1/3 이상 선발한다.\n가. 법학을 부전공으로 이수한 비법학사는 비법학사로, 비법학을 부전공으로 이수한 법학사\n는 법학사로 지원해야 한다.\n나. 법학을 복수(이중)전공으로 이수한 비법학사 또는 비법학을 복수(이중)전공으로 이수한 \n법학사는 본인의 선택으로 법학사 또는 비법학사 어느 한 곳으로만 지원하여야 한다.\n다. 두 곳 이상의 대학(교)에서 모두 학사학위를 취득한 지원자는 그 중 한 곳을 선택하여 지\n원해야 한다.\n입학정원\n전형유형\n전형별 모집인원\n‘나’군\n120명\n일반전형 \n111명 이내\n특별전형'),
 Document(id='3f3d9c67-d72a-45ae-8985-7876359bb99d', metadata={'university': '고려대', 'source': '2025 고려대.pdf', 'page': 5}, page_content='원해야 한다.\n입학정원\n전형유형\n전형별 모집인원\n‘나’군\n120명\n일반전형 \n111명 이내\n특별전형\n(1) 신체적 배려\n(2) 경제적 배려\n(3) 사회적 배려\n9 명 이상'),
 Document(id='266b27d8-60a5-4118-bb2f-c7bc061fe9e7', metadata={'university': '고려대', 'source': '2025 고려대.p

In [33]:
%pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [24]:
%pip install -U langchain-ollama



Note: you may need to restart the kernel to use updated packages.
