In [1]:
import logging
import sys
import os

# Send INFO-and-above log records to stdout so they show up in cell output.
# basicConfig already installs a stdout handler on the root logger; attaching
# an additional StreamHandler(sys.stdout) made every record print twice (once
# with the "LEVEL:name:" prefix and once bare), so only basicConfig is used.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys used by the embedding
# and reranking calls later in the notebook — confirm.
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import json

from llama_index.core.schema import TextNode

json_file_path = "../../../Documents/E-9_Visa_Docs.json"

# Decode explicitly as UTF-8: the corpus is Korean text, and relying on the
# platform default encoding fails or garbles the content on systems whose
# default is not UTF-8.
with open(json_file_path, 'r', encoding="utf-8") as f:
    visa_docs_json = json.load(f)

# Build one TextNode per JSON record, carrying over the text, the metadata and
# the lists of metadata keys to exclude for the LLM and for the embedding.
visa_nodes = [
    TextNode(
        text=doc["text"],
        metadata=doc["metadata"],
        excluded_llm_metadata_keys=doc["excluded_llm_metadata_keys"],
        excluded_embed_metadata_keys=doc["excluded_embed_metadata_keys"],
    )
    for doc in visa_docs_json
]

In [3]:
from llama_index.core.node_parser import MarkdownNodeParser

# Parser instance used to process the visa guide nodes loaded above.
parser = MarkdownNodeParser()

# Run the visa TextNodes through the markdown parser; the resulting node list
# is what gets embedded and indexed by the save cells further down.
visa_nodes_markdown = parser.get_nodes_from_documents(visa_nodes)

In [None]:
# Spot-check: print the content at index 2 before and after markdown parsing.
# NOTE(review): index 2 may not point at the same passage in both lists if the
# parser changes the number of nodes — confirm before comparing the two.
print(visa_nodes[2].get_content())
print(visa_nodes_markdown[2].get_content())

#### Save E9 Visa Guide Document

In [4]:
import chromadb
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Persist the visa guide embeddings in a local Chroma database.
db = chromadb.PersistentClient(path="./chroma")
chroma_collection = db.get_or_create_collection("E9_VISA_GUIDE")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection lives on disk and survives kernel restarts. Building the
# index unconditionally would re-embed the nodes and append them a second
# time on every fresh run, filling the store with duplicate vectors. Embed
# only when the collection is empty; otherwise re-attach to what is stored.
# Delete the "E9_VISA_GUIDE" collection (or ./chroma) to force a rebuild.
if chroma_collection.count() == 0:
    index = VectorStoreIndex(
        visa_nodes_markdown,
        storage_context=storage_context,
        show_progress=True
    )
else:
    index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Generating embeddings:   0%|          | 0/21 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


#### Save E9 Visa FAQ Document

##### Load JSON file

In [5]:
import json

from llama_index.core.schema import TextNode

json_file_path = "../../../Documents/E9_Faq.json"

# Decode explicitly as UTF-8: the FAQ is Korean text, and relying on the
# platform default encoding fails or garbles the content on systems whose
# default is not UTF-8.
with open(json_file_path, 'r', encoding="utf-8") as f:
    faq_json = json.load(f)

# One TextNode per FAQ entry, with the question and the answer combined into a
# single "Question: ...\nAnswer: ..." text so both are embedded together.
faq_nodes = [
    TextNode(
        text=f"Question: {doc['question']}\nAnswer: {doc['answer']}",
    )
    for doc in faq_json
]

In [None]:
# Display one FAQ node to sanity-check the combined question/answer text.
faq_nodes[1]

In [6]:
""" SAVE TO LOCAL"""
db = chromadb.PersistentClient(path="./chroma")
chroma_collection = db.get_or_create_collection("E9_VISA_FAQ")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

faq_index = VectorStoreIndex(
    faq_nodes,
    storage_context=storage_context,
    show_progress=True
)

Generating embeddings:   0%|          | 0/38 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


#### Merge E9 Visa Guide and FAQ into Chroma

In [7]:
""" SAVE TO LOCAL"""
db = chromadb.PersistentClient(path="./chroma")
chroma_collection = db.get_or_create_collection("E9_VISA_GUIDE_AND_FAQ")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

merge_index = VectorStoreIndex(
    visa_nodes_markdown + faq_nodes,
    storage_context=storage_context,
    show_progress=True
)

Generating embeddings:   0%|          | 0/59 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


#### Load the persisted Guide + FAQ index from Chroma

In [None]:
# Re-open the merged collection from disk without re-embedding anything.
# The save cells in this notebook persist to "./chroma"; pointing the client
# at a different directory would open a separate, empty database.
db = chromadb.PersistentClient(path="./chroma")
# get_collection (rather than get_or_create_collection) so a missing
# collection raises immediately instead of silently yielding an empty index.
chroma_collection = db.get_collection("E9_VISA_GUIDE_AND_FAQ")
chroma_vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

merge_index = VectorStoreIndex.from_vector_store(vector_store=chroma_vector_store)

#### Test retrieval with and without the Cohere reranker

In [16]:
from llama_index.postprocessor.cohere_rerank import CohereRerank

# Cohere multilingual reranker configured to keep the top 3 nodes.
reranker = CohereRerank(model="rerank-multilingual-v2.0", top_n=3)

In [None]:
class RerankingRetriever:
    """Wrap a retriever so its results are passed through node postprocessors.

    Retrievers returned by ``index.as_retriever`` do not apply
    ``node_postprocessors`` (that option belongs to query engines), so passing
    it there leaves the results un-reranked. This wrapper keeps the same
    ``retrieve(query)`` call shape while actually running the postprocessors.
    """

    def __init__(self, retriever, node_postprocessors):
        self._retriever = retriever
        self._node_postprocessors = list(node_postprocessors)

    def retrieve(self, query):
        """Retrieve nodes for ``query`` and apply each postprocessor in order."""
        # Accept either a plain string or an object exposing ``query_str``.
        query_str = query if isinstance(query, str) else query.query_str
        nodes = self._retriever.retrieve(query)
        for postprocessor in self._node_postprocessors:
            nodes = postprocessor.postprocess_nodes(nodes, query_str=query_str)
        return nodes


# Plain vector retriever: top 5 FAQ nodes by embedding similarity.
faq_retriever = faq_index.as_retriever(similarity_top_k=5)
# Same retrieval, then reranked by `reranker`.
faq_retriever_with_reranker = RerankingRetriever(
    faq_index.as_retriever(similarity_top_k=5),
    node_postprocessors=[reranker],
)

In [20]:
class RerankingRetriever:
    """Wrap a retriever so its results are passed through node postprocessors.

    Retrievers returned by ``index.as_retriever`` do not apply
    ``node_postprocessors`` (that option belongs to query engines), so passing
    it there leaves the results un-reranked. This wrapper keeps the same
    ``retrieve(query)`` call shape while actually running the postprocessors.
    """

    def __init__(self, retriever, node_postprocessors):
        self._retriever = retriever
        self._node_postprocessors = list(node_postprocessors)

    def retrieve(self, query):
        """Retrieve nodes for ``query`` and apply each postprocessor in order."""
        # Accept either a plain string or an object exposing ``query_str``.
        query_str = query if isinstance(query, str) else query.query_str
        nodes = self._retriever.retrieve(query)
        for postprocessor in self._node_postprocessors:
            nodes = postprocessor.postprocess_nodes(nodes, query_str=query_str)
        return nodes


# Plain vector retriever: top 5 guide/FAQ nodes by embedding similarity.
merge_retriever = merge_index.as_retriever(similarity_top_k=5)
# Same retrieval, then reranked by `reranker`.
merge_retriever_with_reranker = RerankingRetriever(
    merge_index.as_retriever(similarity_top_k=5),
    node_postprocessors=[reranker],
)

In [10]:
def pretty(retrievals):
    """Print each item of ``retrievals`` on its own line, in iteration order."""
    for rendered in map(str, retrievals):
        print(rendered)

In [21]:
# Ask both retrievers the same question so their results can be compared
# side by side: plain retriever first, reranker variant after the separator.
question = "고용허가서 신청 절차는 어떻게 되나요?"
retrievals = merge_retriever.retrieve(question)
retrievals2 = merge_retriever_with_reranker.retrieve(question)

pretty(retrievals)
print("---------------------------------------------------------------")
pretty(retrievals2)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
Node ID: 05c2a148-a0cf-49ab-8b19-80b961a58ea0
Text: 고용허가제 해당자가 근무처(직장)를 변경하는 절차 및 제출 서류         ● 근무처(직장) 변경 절차
○ 사용자와 근로계약 종료 후 1월 이내에 사업장 변경을 신청(고용노동부 고용센터)하여야 하며, 사업장 변경을 신청한 날로부터
3개월 이내에 근무처 변경허가(관할 출입국·외국인청(사무소ᆞ출장소))를 받아야 함             ○ 다만, 업무상
재해, 질병, 임신, 출산 등의 사유로 근무처변경 허가를 받을 수 없거나 신청할 수 없는 경우에는 사유해소일로부터 기간 계산
○ 사유 해당자는 체류기간 만료 전에 직업안정기관의 장이 발급한 사업장변경신청기간 연장접수확인서에...
Score:  0.737

Node ID: 32208183-4b29-46d6-a6b8-95e44474aa99
Text: Question: EPS 서류 만료된 경우 고용센터에서 자동으로 갱신해주시나요? 아니면 직접 신청해야 하나요?
그리고 그렇다면 갱신하는 데 얼마나 걸리나요? Answer: EPS(Employment Permit System, 고용허가제)
서류의 갱신은 자동으로 이루어지지 않습니다. 근로자나 고용주는 서류 만료 전에 직접 갱신을 신청해야 합니다. 갱신 절차는
고용센터를 통해 진행되며, 필요한 서류를 준비하여 제출해야 합니