### 데이터 불러오기

In [1]:
import os
import pymupdf4llm
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

pdf_dir = "E:/work/MS_project_2/data/new_pdfs"  # 여러 PDF 파일이 들어 있는 디렉토리


### PDF-> 마크다운 형태로 변환
### source 정보 추가
from langchain.schema import Document

all_docs = []

for filename in os.listdir(pdf_dir):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, filename)
        md_path = pdf_path.replace(".pdf", ".md")

        # 1. PDF → Markdown 저장
        md_text = pymupdf4llm.to_markdown(pdf_path)
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(md_text)

        # 2. Markdown 로딩
        loader = TextLoader(md_path, encoding="utf-8")
        documents = loader.load()

        # 3. ✅ source 메타데이터 추가
        for doc in documents:
            doc.metadata["source"] = filename

        # 4. ✅ 분할 (source 정보가 포함된 문서 기준으로)
        docs_split = text_splitter.split_documents(documents)

        # 5. ✅ all_docs에 청크된 문서 저장
        all_docs.extend(docs_split)


Processing E:/work/MS_project_2/data/new_pdfs\(대전충남)25년1차청년매입임대_표준입주자모집공고문.pdf...
Processing E:/work/MS_project_2/data/new_pdfs\(정정공고문)25년1차청년매입임대_표준입주자모집공고문.pdf...
Processing E:/work/MS_project_2/data/new_pdfs\2025년 1차 대구경북 청년매입임대 입주자 모집 공고문.pdf...
Processing E:/work/MS_project_2/data/new_pdfs\2025년1차청년매입임대입주자모집공고문(광주전남).pdf...
Processing E:/work/MS_project_2/data/new_pdfs\25년 1차 청년매입임대 입주자 모집 공고문(강원지역본부).pdf...
Processing E:/work/MS_project_2/data/new_pdfs\25년1차청년매입임대입주자모집공고문.pdf...
Processing E:/work/MS_project_2/data/new_pdfs\아츠스테이영등포_입주자모집공고문.pdf...


### 임베딩 객체 생성

In [2]:
len(all_docs)

835

In [None]:
import os
import uuid
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores import AzureSearch

# ✅ 임베딩 객체 생성
embedding_api_key = ""
embedding_endpoint = ""
embedding_api_version = "2024-02-15-preview"
embedding_deployment = "text-embedding-3-small"

os.environ.pop("OPENAI_API_BASE", None)
os.environ.pop("BASE_URL", None)

embedding = AzureOpenAIEmbeddings(
    api_key=embedding_api_key,
    azure_endpoint=embedding_endpoint,
    model=embedding_deployment,
    openai_api_version=embedding_api_version
)

### 빈 인덱스 생성

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchIndex, SimpleField, SearchField, SearchFieldDataType,
    VectorSearch, HnswAlgorithmConfiguration, VectorSearchAlgorithmKind,
    VectorSearchProfile
)

ai_search_endpoint = ""
ai_search_api_key = ""
ai_search_index_name = "new_pdf_all_index"

index_client = SearchIndexClient(
    endpoint=ai_search_endpoint,
    credential=AzureKeyCredential(ai_search_api_key)
)

embedding_dim = 1536

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchField(name="content", type=SearchFieldDataType.String, searchable=True),
    SearchField(name="source", type=SearchFieldDataType.String, searchable=True, filterable=True),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=embedding_dim,
        vector_search_profile_name="default"
    )
]

vector_search = VectorSearch(
    profiles=[VectorSearchProfile(name="default", algorithm_configuration_name="my-algorithm")],
    algorithms=[HnswAlgorithmConfiguration(name="my-algorithm", kind=VectorSearchAlgorithmKind.HNSW)]
)

index = SearchIndex(
    name=ai_search_index_name,
    fields=fields,
    vector_search=vector_search
)

# ✅ 인덱스 재생성
if ai_search_index_name in [i.name for i in index_client.list_indexes()]:
    index_client.delete_index(ai_search_index_name)
index_client.create_index(index)
print("✅ Azure Search 인덱스 생성 완료")

✅ Azure Search 인덱스 생성 완료


### 벡터스토어 객체 생성 및 임베딩

In [5]:
vectorstore = AzureSearch(
    azure_search_endpoint=ai_search_endpoint,
    azure_search_key=ai_search_api_key,
    index_name=ai_search_index_name,
    embedding_function=lambda x: x  # 더미 함수로 에러 방지
)


### 배치 단위로 업로드

In [None]:
from langchain.schema import Document
import time

# ✅ 전체 리스트 준비
texts = [doc.page_content for doc in all_docs]
metadatas = [{"source": doc.metadata.get("source", "")} for doc in all_docs]
ids = [str(uuid.uuid4()) for _ in all_docs]

# ✅ 배치 처리 (예: 100개씩)
batch_size = 100

for start in range(0, len(texts), batch_size):
    end = start + batch_size
    batch_texts = texts[start:end]
    batch_metadatas = metadatas[start:end]
    batch_ids = ids[start:end]

    # ✅ 임베딩 (429 오류 방지를 위해 sleep 추가해도 됨)
    try:
        batch_embeddings = embedding.embed_documents(batch_texts)
    except Exception as e:
        print(f"❌ 임베딩 에러 발생: {e}")
        break

    # ✅ 업로드용 Azure 포맷 구성
    docs_to_upload = []
    for i in range(len(batch_texts)):
        docs_to_upload.append({
            "id": batch_ids[i],
            "content": batch_texts[i],
            "source": batch_metadatas[i]["source"],
            "embedding": batch_embeddings[i]
        })

    # ✅ Azure Search 업로드
    from azure.search.documents import SearchClient, IndexDocumentsBatch
    from azure.core.credentials import AzureKeyCredential

    search_client = SearchClient(
        endpoint=ai_search_endpoint,
        index_name=ai_search_index_name,
        credential=AzureKeyCredential(ai_search_api_key)
    )

    batch = IndexDocumentsBatch()
    batch.add_upload_actions(docs_to_upload)

    try:
        search_client.index_documents(batch=batch)
        print(f"✅ 업로드 완료: {start} ~ {end-1}")
    except Exception as e:
        print(f"❌ 업로드 에러 발생: {e}")
        break

    # ✅ 너무 빠르면 속도 제한 맞을 수 있으니 잠깐 쉬어가자
    time.sleep(1)


✅ 업로드 완료: 0 ~ 99
✅ 업로드 완료: 100 ~ 199
✅ 업로드 완료: 200 ~ 299
✅ 업로드 완료: 300 ~ 399
✅ 업로드 완료: 400 ~ 499
✅ 업로드 완료: 500 ~ 599
✅ 업로드 완료: 600 ~ 699
✅ 업로드 완료: 700 ~ 799
✅ 업로드 완료: 800 ~ 899
