In [1]:
# Load environment variables from a .env file before anything else runs.
# NOTE(review): presumably this supplies the OpenAI API key needed by the LLM
# configured further down — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from llama_index.core import SimpleDirectoryReader

# Read a single PDF (the Korean-language E-9 visa guide). `docs` is the list of
# loaded documents that the later cells preview and index.
reader = SimpleDirectoryReader(input_files=["data/visa_docs/E-9 Visa Guide_한국어.pdf"])
docs = reader.load_data()

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Number of documents returned by the reader.
len(docs)
# docs

24

In [4]:
# Preview every loaded document: its page label, its full metadata dict, and
# the first 50 characters of its text, followed by blank lines as a separator.
for doc in docs:
    print(
        f"PAGE: {doc.metadata['page_label']}",
        f"METADATA: {doc.metadata}",
        f"TEXT: {doc.text[:50]}\n\n\n",
        sep="\n",
    )

PAGE: 1
METADATA: {'page_label': '1', 'file_name': 'E-9 Visa Guide_한국어.pdf', 'file_path': 'data/visa_docs/E-9 Visa Guide_한국어.pdf', 'file_type': 'application/pdf', 'file_size': 127027, 'creation_date': '2024-03-21', 'last_modified_date': '2024-03-21'}
TEXT: E-9
Visa
Guide
-
한국어
고용허가제
고용허가제
해당자와
활동범위
고용허가제
해



PAGE: 2
METADATA: {'page_label': '2', 'file_name': 'E-9 Visa Guide_한국어.pdf', 'file_path': 'data/visa_docs/E-9 Visa Guide_한국어.pdf', 'file_type': 'application/pdf', 'file_size': 127027, 'creation_date': '2024-03-21', 'last_modified_date': '2024-03-21'}
TEXT: 고용허가제
●
｢
외국인근로자의
고용
등에
관한
법률
｣
에
의거
,
사업주에게
외국인근로



PAGE: 3
METADATA: {'page_label': '3', 'file_name': 'E-9 Visa Guide_한국어.pdf', 'file_path': 'data/visa_docs/E-9 Visa Guide_한국어.pdf', 'file_type': 'application/pdf', 'file_size': 127027, 'creation_date': '2024-03-21', 'last_modified_date': '2024-03-21'}
TEXT: 고용허가제
해당자와
활동범위
●
활동범위
○
외국인근로자의
고용
등에
관한
법률의
규정에




PAGE: 4
METADATA: {'page_label': '4', 'file_name': 'E-9 Visa Guide_

In [None]:
# from llama_index.core.postprocessor import SimilarityPostprocessor
# postprocessor = SimilarityPostprocessor(similarity_cutoff=0.7)
# nodes = postprocessor.postprocess_nodes(docs)

In [89]:
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI

# Embedding model loaded through the HuggingFace integration.
# NOTE(review): normalize=True presumably normalizes the output vectors —
# confirm against the HuggingFaceEmbedding documentation.
embed_model = HuggingFaceEmbedding(
    model_name="jhgan/ko-sbert-nli",
    normalize=True,
)

# Register it as the default embed_model on the global Settings
Settings.embed_model = embed_model
# Default LLM registered on the global Settings.
Settings.llm = OpenAI(model="gpt-4-0613")

In [136]:
# """저장"""
# from llama_index.core import StorageContext
# from llama_index.vector_stores.faiss import FaissVectorStore
# from llama_index.embeddings.openai import OpenAIEmbedding
# import faiss


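# # dimensions of text-embedding-ada-002
# # NOTE(review): the FAISS dimension must match the embedding model actually in use — confirm before enabling this cell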
# d = 1536
# faiss_index = faiss.IndexFlatL2(d)

# vector_store = FaissVectorStore(faiss_index)
# storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir="./storage")

# index = VectorStoreIndex.from_documents(
#     docs, 
#     embedding=OpenAIEmbedding(mode=""), 
#     storage_context=storage_context,
#     show_progress=True
# )

# # Persist the vector data to disk
# index.storage_context.persist(persist_dir="./storage")

In [None]:
"""
방법1
"""
index = VectorStoreIndex.from_documents(docs, show_progress=True)

"""
방법2: nodes
"""
doc_nodes = Settings.node_parser.get_nodes_from_documents(docs)
index = VectorStoreIndex(doc_nodes, show_progress=True)

In [56]:
from llama_index.core.response.notebook_utils import display_source_node

def print_source_node(retrievals):
    """Display each retrieved node in turn via ``display_source_node``.

    Args:
        retrievals: Iterable of retrieved nodes; every item is forwarded to
            ``display_source_node`` with ``source_length=1500``.
    """
    shown_length = 1500
    for hit in retrievals:
        display_source_node(hit, source_length=shown_length)

In [103]:
# Evaluation questions (in Korean) asked against the E-9 visa guide.
# NOTE(review): entries tagged "#Bad" presumably produced poor results in
# earlier runs — confirm with the author.
questions = [
    "E-9 비자 신청 시 필요한 서류는 무엇인가요?",
    "고용허가서 신청 절차는 어떻게 되나요?",
    "재입국 특례 제도 신청을 위한 조건은 무엇인가요?",
    "비전문취업 비자의 일반적인 허용 업종 범위는 어디까지인가요?",
    "고용허가제 적용 대상 국가는 어떤 나라들인가요?",
    "외국인등록 시 제출해야 하는 마약검사확인서의 구체적인 기준과 유의사항은 무엇인가요?", #Bad
    "비자 연장을 위해 제출해야 하는 건강검진 항목은 무엇인가요?", #Bad
    "한국에서의 건강검진 기준은 외국인 근로자에게 어떤 영향을 미치나요?", #Bad
    "체류기간 연장을 위한 수수료는 얼마인가요?", #Bad
]

In [68]:
# Retriever over the vector index, configured with similarity_top_k=3.
base_retriever = index.as_retriever(similarity_top_k=3)

In [None]:
# Inspect the raw retrieval results for every question; only the retriever is
# called here, no query engine.
for question in questions:
    retrievals = base_retriever.retrieve(question)
    print(f"Question {question}")
    print_source_node(retrievals)

In [108]:
from llama_index.core.postprocessor import SimilarityPostprocessor

# Query engine built directly from the index: non-streaming, top-3 retrieval,
# with a SimilarityPostprocessor using similarity_cutoff=0.6.
# NOTE(review): the cutoff presumably drops retrieved nodes scoring below 0.6
# before answer synthesis — confirm against the postprocessor docs.
query_engine = index.as_query_engine(
    streaming=False, 
    similarity_top_k=3,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.6)
    ]
)

In [90]:
def print_source_nodes(response):
    """Print the text excerpt, metadata and score of each source node of a response.

    Args:
        response: Object exposing ``source_nodes``. Each item must provide
            ``.node.get_content()``, ``.node.metadata`` and ``.score``.

    Each node is printed as a ``-----`` separator followed by ``Text``,
    ``Metadata`` and ``Score`` lines. A missing score (``None``) is printed as
    ``N/A`` instead of raising ``TypeError``.
    """
    for source in response.source_nodes:
        print("-----")
        # Collapse the excerpt onto a single line and cap it at 1000 characters.
        text_fmt = source.node.get_content().strip().replace("\n", " ")[:1000]
        print(f"Text:\t {text_fmt} ...")
        print(f"Metadata:\t {source.node.metadata}")
        # The score can be None; applying the ':.3f' format to None would raise.
        score_fmt = "N/A" if source.score is None else f"{source.score:.3f}"
        print(f"Score:\t {score_fmt}")

In [119]:
import csv
def save_results_csv(filename, results):
    """Write question/answer records to ``data/<filename>.csv`` as UTF-8 CSV.

    Args:
        filename: Base name of the output file, without the ``.csv`` suffix.
        results: Iterable of dicts with the keys ``"question"`` and ``"answer"``,
            e.g. ``[{"question": "123", "answer": "text"}]``.
    """
    columns = ["question", "answer"]
    target = f"data/{filename}.csv"

    # Header row first, then one row per record.
    with open(target, "w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()
        for record in results:
            out.writerow(record)

In [None]:
# Ask every question through the current query engine, print the answer with
# its source nodes, and collect question/answer pairs for export.
results = []

for question in questions:
    print(f"Question: {question}\n")
    response = query_engine.query(question)
    # response.print_response_stream()
    print(f"{response.response}\n")
    print_source_nodes(response)
    print("\n\n\n")
    results.append({"question": question, "answer": response.response})


# save_results_csv(filename, results) requires the output base name as its
# first argument; the previous call passed only `results` and raised TypeError
# after all queries had already run. Writes data/QnA-VectorIndex.csv.
save_results_csv("QnA-VectorIndex", results)

In [126]:
from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer

# Wrap the base vector retriever in a RecursiveRetriever; "vector" is both the
# root id and the only key in retriever_dict. verbose=True makes the retriever
# log while it runs.
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": base_retriever},
    verbose=True,
)

response_synthesizer = get_response_synthesizer(response_mode="compact")

# NOTE: this rebinds `query_engine`, replacing the engine built earlier with
# index.as_query_engine(); cells run after this one use the recursive engine.
#
# The number of retrieved nodes is fixed by base_retriever (created with
# similarity_top_k=3). from_args() receives a ready-made retriever, so the
# similarity_top_k=3 argument previously passed here had no effect on retrieval
# and has been dropped to avoid implying otherwise.
query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.6)
    ],
)

In [None]:
# Re-run the question/answer loop against whatever `query_engine` is currently bound.
results = []

# NOTE(review): this rebinds the global `questions` to a single question, so
# re-running earlier cells afterwards will only see this one entry.
questions = ["비자 연장을 위해 제출해야 하는 건강검진 항목은 무엇인가요?"]

for question in questions:
    print(f"Question: {question}\n")
    response = query_engine.query(question)
    # response.print_response_stream()
    print(f"{response.response}\n")
    print_source_nodes(response)
    print("\n\n\n")
    results.append({"question": question, "answer": response.response})


# Export is currently disabled; uncomment to write data/QnA-RecursiveRetriever.csv.
# save_results_csv("QnA-RecursiveRetriever", results)