In [None]:
import os
from dotenv import load_dotenv
from typing import Annotated, Dict, List, TypedDict, Any, Optional

from langchain_core.documents import Document
from langchain_core.runnables import RunnableConfig
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.output_parsers import StrOutputParser

from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_community.tools.tavily_search import TavilySearchResults

from kiwipiepy import Kiwi

from langgraph.graph import StateGraph, END
from langgraph.checkpoint.memory import MemorySaver

In [None]:
# Load variables from a .env file (if one is found) into the process environment,
# then read the two API keys. Each key is None when its variable is not set.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")
tavily_api_key = os.environ.get("TAVILY_API_KEY")

In [None]:
# Kiwi instance created once at module level; kiwi_tokenize() calls its tokenize() method.
kiwi = Kiwi()

# Kiwi tokenization helper
def kiwi_tokenize(text: str) -> List[str]:
    """
    Split ``text`` into token surface forms using the module-level Kiwi instance.

    Args:
        text: The string to tokenize.

    Returns:
        The ``form`` of every token returned by ``kiwi.tokenize(text)``, in
        order, leaving out tokens whose ``tag`` equals "SPACE".
    """
    surface_forms: List[str] = []
    for morpheme in kiwi.tokenize(text):
        # NOTE(review): confirm that Kiwi actually emits a "SPACE" tag; if it
        # never does, this guard is a no-op.
        if morpheme.tag == "SPACE":
            continue
        surface_forms.append(morpheme.form)
    return surface_forms

# Graph state
class GraphState(TypedDict):
    """
    Represents the state of the graph.

    Declared as a ``TypedDict``; the annotated names below are the keys it
    defines. Their meanings are inferred from names and types only —
    TODO confirm against the graph nodes that read and write them.
    """
    # presumably the user's input query — verify against the graph nodes
    query: str
    # a single Document (singular, unlike web_results) — TODO confirm this is intended
    document: Document
    # list of Documents; presumably web-search results — verify
    web_results: List[Document]
    # optional string; presumably the generated answer — verify
    answer: Optional[str]
    # optional string; purpose is not visible from this cell
    message: Optional[str]

In [4]:
from langchain_core.documents import Document

def row_to_doc(row) -> Document:
    """
    Build a Document describing one company from a single data row.

    Args:
        row: Mapping-like object (for example a pandas Series produced by
            ``DataFrame.iterrows``) indexed by the column names 'company',
            'industry', 'address', 'homepage', 'key_executive',
            'phone_number' and 'sales'.

    Returns:
        A Document whose ``page_content`` holds one "label: value" line per
        field, in the order above, and whose ``metadata`` is
        ``{"source": "company_data"}``.

    Raises:
        KeyError: typically, when the row lacks one of the columns above.

    Note:
        Values are interpolated with plain f-string formatting, so a missing
        value (NaN) shows up as the text "nan".
    """
    # (label, column) pairs in output order. Building the text line by line
    # keeps source-code indentation out of page_content: a triple-quoted
    # template would leave four leading spaces on every line after the first,
    # because str.strip() only trims the outer ends of the string.
    labeled_columns = (
        ("회사명", "company"),
        ("업종", "industry"),
        ("주소", "address"),
        ("홈페이지", "homepage"),
        ("대표자", "key_executive"),
        ("연락처", "phone_number"),
        ("매출", "sales"),
    )
    lines = [f"{label}: {row[column]}" for label, column in labeled_columns]
    content = "\n".join(lines)
    return Document(page_content=content.strip(), metadata={"source": "company_data"})


In [9]:
import pandas as pd

def load_documents_from_csv(path: str, chunksize: int = 100):
    """
    Lazily read a CSV file in chunks and yield one list of documents per chunk.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.
        chunksize: Number of CSV rows read per chunk.

    Yields:
        For each chunk, a list with ``row_to_doc(row)`` for every row whose
        'company' value is not missing. The list is empty when no row in the
        chunk has a company.

    Note:
        The chunked reader is used as a context manager (supported since
        pandas 1.2), so the underlying file is closed when the generator is
        exhausted or closed — including when the consumer stops iterating
        early.
    """
    with pd.read_csv(path, chunksize=chunksize) as reader:
        for chunk in reader:
            # Required-field check: drop rows without a company name up front
            # instead of testing each row inside the Python loop.
            rows_with_company = chunk[chunk["company"].notna()]
            yield [row_to_doc(row) for _, row in rows_with_company.iterrows()]

In [10]:
# Accumulate documents from the CSV batch by batch, printing the running total
# after each batch.
all_docs = []

# NOTE(review): the enumerate index `i` is not used anywhere in this cell.
for i, doc_batch in enumerate(load_documents_from_csv("company_data.csv", chunksize=100)):
    all_docs.extend(doc_batch)
    print(f"✅ {len(all_docs)}개 문서 처리 완료")

    # The cap is checked only after a whole batch has been added, so the final
    # count can go past 1000 when batch sizes do not line up with it.
    if len(all_docs) >= 1000:
        break  # for now, only test with up to 1000 documents

✅ 100개 문서 처리 완료
✅ 200개 문서 처리 완료
✅ 300개 문서 처리 완료
✅ 400개 문서 처리 완료
✅ 500개 문서 처리 완료
✅ 600개 문서 처리 완료
✅ 700개 문서 처리 완료
✅ 800개 문서 처리 완료
✅ 900개 문서 처리 완료
✅ 1000개 문서 처리 완료
