In [3]:
import os
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_elasticsearch import ElasticsearchStore
from langchain_community.document_loaders import PyPDFLoader

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Load settings from a local .env file into the process environment; the
# os.getenv(...) lookups in later cells read the API keys and URL from there.
load_dotenv()

True

In [9]:
# Shared embedding client used both for indexing documents and for embedding
# search queries. The API key is read from the environment, not hard-coded.
embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002',
                                   api_key=os.getenv('OPENAI_API_KEY'))

In [7]:
# Inspection cell: evaluates to the configured Elasticsearch URL (None if unset).
# NOTE(review): as the last expression of a cell the value is echoed into the
# saved notebook output -- consider removing this cell before sharing.
os.getenv('ELASTICSEARCH_URL')

In [10]:
import re
def clean_whitespace(text):
    """Normalize whitespace while keeping the text's line structure.

    Runs of horizontal whitespace (spaces, tabs, carriage returns, ...) are
    collapsed to a single space, and runs of newlines -- including blank or
    whitespace-only lines between them -- are collapsed to a single newline.
    Leading and trailing whitespace is stripped.

    Previously every whitespace run, newlines included, was turned into a
    space first, so the newline-collapsing step could never match and all
    line breaks were lost.

    Args:
        text: Raw text, e.g. the extracted content of one PDF page.

    Returns:
        The normalized text.
    """
    # [^\S\n] means "whitespace that is not a newline".
    text = re.sub(r'[^\S\n]+', ' ', text)
    # A newline together with adjacent spaces / further newlines -> one newline.
    text = re.sub(r' ?\n[ \n]*', '\n', text)
    return text.strip()

def remove_special_chars(text):
    """Drop every character that is not on the allow-list.

    Kept characters: Hangul syllables, ASCII letters and digits, whitespace,
    and the punctuation marks . , ; : ? ! ( ) [ ] ' " -
    Everything else (symbols such as %, @, #, /, _ ...) is deleted.
    """
    disallowed = re.compile(r'[^가-힣a-zA-Z0-9\s.,;:?!()\[\]\'"-]')
    return disallowed.sub('', text)

def remove_page_number(text):
    """Delete page markers of the form "Page <n>" or "페이지 <n>".

    The English marker is matched case-insensitively; the Korean marker is
    matched as written. Whitespace between the word and the number is
    optional. Only the marker itself is removed; surrounding text and
    spacing are left as they are.
    """
    marker_patterns = (
        (r'Page\s*\d+', re.IGNORECASE),
        (r'페이지\s*\d+', 0),
    )
    for pattern, flags in marker_patterns:
        text = re.sub(pattern, '', text, flags=flags)
    return text

def remove_email_url(text):
    """Remove e-mail addresses and http(s) URLs from the text.

    The e-mail pattern accepts multi-label and hyphenated domains
    (e.g. ``user@mail.example.co.kr``, ``user@my-site.com``) and ``+`` tags in
    the local part. The earlier lazy pattern stopped after the first
    ``label.label`` pair, leaving fragments such as ``.co.kr`` behind, and did
    not match hyphenated domains at all.

    Args:
        text: Input text. It should still contain ``@`` and ``/`` characters,
            i.e. this must run before any special-character stripping.

    Returns:
        The text with addresses and URLs deleted. Surrounding whitespace is
        left untouched.
    """
    # local-part @ domain-label ( . domain-label )+
    text = re.sub(r'\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b', '', text)
    # URL: scheme followed by everything up to the next whitespace.
    text = re.sub(r'http[s]?://\S+', '', text)
    return text

def preprocess_chunk(text):
    """Run the full text-cleaning pipeline on one piece of document text.

    Order matters:
      1. E-mail/URL removal runs first, while ``@`` and ``/`` are still
         present. Stripping special characters first (the previous order)
         deleted those characters, so addresses and URLs were never matched
         and survived in mangled form.
      2. Page markers are removed.
      3. Remaining characters outside the allow-list are stripped.
      4. Whitespace is normalized last, so gaps left by the removals above
         are cleaned up as well.

    Args:
        text: Raw text (e.g. one PDF page's content).

    Returns:
        The cleaned text.
    """
    text = remove_email_url(text)
    text = remove_page_number(text)
    text = remove_special_chars(text)
    text = clean_whitespace(text)
    return text

# 문서 로드 및 파싱

In [11]:
from glob import glob

In [12]:
# glob() returns matches in arbitrary, filesystem-dependent order, and the
# next cell picks the application document by position, so sort the paths
# to make that choice deterministic.
# NOTE(review): confirm the application PDF is still first under sorted order.
file_paths = sorted(glob('data/*/*'))

In [13]:
# The first path is treated as the application document; every remaining
# path is treated as a policy document.
apply_path, policy_paths = file_paths[0], file_paths[1:]

In [14]:
# Parse the application PDF selected above.
loader = PyPDFLoader(apply_path)
apply_documents = loader.load()  # each page is returned as a Document object

In [15]:
from tqdm import tqdm

# Parse every policy PDF. Each entry of policy_documents is the list of
# per-page Document objects for one file (one inner list per PDF).
policy_documents = [
    PyPDFLoader(pdf_path).load()
    for pdf_path in tqdm(policy_paths)
]

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:24<00:00, 96.00s/it]


# 텍스트 전처리

In [16]:
# Clean the text of every page in place; Document metadata is left untouched.
for page in tqdm(apply_documents):
    page.page_content = preprocess_chunk(page.page_content)

# Policy documents are grouped per PDF, so show one progress bar per file.
for pdf_pages in policy_documents:
    for page in tqdm(pdf_pages):
        page.page_content = preprocess_chunk(page.page_content)

100%|██████████████████████████████████████████████████████████████████████████████| 273/273 [00:00<00:00, 6203.11it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 7319.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 787/787 [00:00<00:00, 6195.45it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 391/391 [00:00<00:00, 12610.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 263/263 [00:00<00:00, 7512.58it/s]


In [17]:
# Split pages into chunks of at most 1000 characters with a 100-character
# overlap between consecutive chunks; the same splitter is reused for the
# policy documents in the next cell.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_apply_documents = splitter.split_documents(apply_documents)

In [18]:
# Chunk each policy PDF separately, keeping one list of chunks per file.
li_split_policy_documents = [
    splitter.split_documents(pdf_pages)
    for pdf_pages in policy_documents
]

In [19]:
# Flatten the per-file chunk lists into a single list, preserving order.
all_split_policy_documents = []
for file_chunks in li_split_policy_documents:
    all_split_policy_documents.extend(file_chunks)

In [20]:
# Elasticsearch-backed vector stores, one index per corpus. Both share the
# same connection settings and embedding model.
es_settings = {
    'es_url': os.getenv('ELASTICSEARCH_URL'),
    'es_api_key': os.getenv('ELASTICSEARCH_API_KEY'),
    'embedding': embedding_model,
}

# 2024 housing-subscription FAQ
embedding_apply = ElasticsearchStore(index_name="embedding_apply", **es_settings)

# Policy documents
embedding_policy = ElasticsearchStore(index_name="embedding_policy", **es_settings)

In [21]:
from tqdm import tqdm
def batch_upload(vector_store, docs, batch_size=50):
    """Add documents to a vector store in fixed-size batches.

    Args:
        vector_store: Object exposing ``add_documents(list_of_documents)``.
        docs: Sequence of documents to upload; sliced in order.
        batch_size: Maximum number of documents per ``add_documents`` call.
            The final batch may be smaller.

    A tqdm progress bar advances once per batch.
    """
    batch_starts = range(0, len(docs), batch_size)
    for start in tqdm(batch_starts):
        batch = docs[start:start + batch_size]
        vector_store.add_documents(batch)

In [22]:
# Index the application (FAQ) chunks using batch_upload's default batch size.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# index again -- confirm before executing it a second time.
batch_upload(embedding_apply, split_apply_documents)

100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:32<00:00,  4.64s/it]


In [23]:
# Index all policy chunks using batch_upload's default batch size.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# index again -- confirm before executing it a second time.
batch_upload(embedding_policy, all_split_policy_documents)

100%|██████████████████████████████████████████████████████████████████████████████████| 39/39 [02:51<00:00,  4.41s/it]


In [29]:
# Embed a sample Korean query (about moving from one housing-construction
# region to another).
# NOTE(review): this module-level query_vector is not read by any later cell
# shown here (hybrid_query computes its own) -- possibly a leftover experiment.
query_vector = embedding_model.embed_query('해당 주택건설지역에 거주하다가 다른 주택건설지역으로 이주')

In [63]:
# Re-open the FAQ index with hybrid (keyword + vector) retrieval enabled.
# `strategy` expects a retrieval-strategy object; the bare string 'hybrid'
# that was passed before is not one, so hybrid search was never configured.
# NOTE(review): ApproxRetrievalStrategy defaults to rrf=True; confirm the
# cluster supports RRF, otherwise pass rrf=False.
embedding_apply = ElasticsearchStore(
    es_api_key=os.getenv('ELASTICSEARCH_API_KEY'),
    es_url=os.getenv('ELASTICSEARCH_URL'),
    index_name="embedding_apply",
    embedding=embedding_model,
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True),
)

In [91]:
from langchain_elasticsearch import ElasticsearchRetriever

def hybrid_query(search_query: str):
    """Build an Elasticsearch request body combining keyword and kNN search.

    The body is a ``bool`` query with two ``should`` clauses:
      * a ``match`` on the ``text`` field, boosted by 0.2;
      * a ``knn`` clause on the ``embedding`` field using the embedding of
        ``search_query`` (k=10, num_candidates=50), boosted by 0.8.

    Args:
        search_query: The user's natural-language query.

    Returns:
        A dict suitable as the search request body.
    """
    keyword_clause = {
        "match": {
            "text": {"query": search_query, "boost": 0.2}
        }
    }
    vector_clause = {
        "knn": {
            "field": "embedding",
            "query_vector": embedding_model.embed_query(search_query),
            "k": 10,
            "num_candidates": 50,
            "boost": 0.8,
        }
    }
    return {"query": {"bool": {"should": [keyword_clause, vector_clause]}}}

# Retriever that queries the "embedding_apply" index with the body produced
# by hybrid_query and reads each hit's content from its "text" field.
# Connection settings come from the environment.
retriever = ElasticsearchRetriever.from_es_params(
    index_name="embedding_apply",
    body_func=hybrid_query,
    content_field="text",
    url=os.getenv('ELASTICSEARCH_URL'),
    api_key=os.getenv('ELASTICSEARCH_API_KEY')
)

# Try the retriever with a sample Korean query (about moving from one
# housing-construction region to another).
docs = retriever.invoke("해당 주택건설지역에 거주하다가 다른 주택건설지역으로 이주")