In [1]:
import os
# Switch to the parent directory exactly once per kernel session. A bare
# os.chdir("../") climbs one directory further every time this cell is re-run,
# which silently breaks the relative "data/" path used by later cells.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()  # remembered so a re-run of this cell is a no-op

In [2]:
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def load_text_file(data, glob="*.txt", encoding=None):
    """Load the text files found in a directory as LangChain documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.
        glob: Filename pattern selecting which files are read. Defaults to
            ``"*.txt"``, the pattern this notebook originally hard-coded.
        encoding: Optional text encoding forwarded to ``TextLoader`` for each
            file (for example ``"utf-8"``). ``None`` (the default) forwards
            nothing, so the loader keeps its own default decoding.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matching files.
    """
    loader_options = {"glob": glob, "loader_cls": TextLoader}
    if encoding is not None:
        # DirectoryLoader passes loader_kwargs on to every TextLoader it builds.
        loader_options["loader_kwargs"] = {"encoding": encoding}
    return DirectoryLoader(data, **loader_options).load()

In [4]:
# Read the .txt files under data/ (relative to the current working directory).
extracted_data = load_text_file("data/")

In [5]:
# Chunk the data

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split; passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value this notebook originally hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [6]:
# Chunk the loaded documents and report how many pieces came out.
text_chunks = text_split(extracted_data=extracted_data)
print("Chunk data length:", len(text_chunks))

Chunk data length: 11600


In [7]:
# Embedding Model from Hugging Face
from langchain.embeddings import HuggingFaceEmbeddings
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Build the Hugging Face embedding model used for indexing and querying.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded. If a
            different model is chosen, its vector size must match the
            dimension of the Pinecone index the vectors are written to.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [8]:
# Instantiate the embedding model once; later cells reuse this object both for
# the probe query and when connecting to the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Embed a throwaway query to discover the vector size; the Pinecone index
# dimension is derived from len(query_result) in a later cell.
query_result = embeddings.embed_query(text="Manila")
print("Length:", len(query_result))

Length: 384


In [10]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client; the API key is taken from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))


index_name = "philbot"

# Create the index only when it does not exist yet. This keeps the cell safe to
# re-run (no "already exists" failure) while still letting a fresh
# Restart & Run All succeed without uncommenting code by hand.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(query_result),  # must equal the embedding size measured above
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [None]:
# Embed each chunk and upsert the embeddings into your Pinecone index.


from langchain_pinecone import PineconeVectorStore

# Upsert only while the index is still empty. No ids are supplied to
# from_documents(), so each call is expected to add a fresh copy of every chunk
# (the ids stored by this notebook are random UUIDs); re-running against a
# populated index would therefore duplicate the whole corpus.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings
    )

In [11]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore

# Attach to the index by name, reusing the same embedding model for queries.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [12]:
# Echo the vector store object; the cell output is its repr.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1e01063abd0>

In [13]:
# Similarity-search retriever configured with k=3 results per query.
retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

In [14]:
# Sanity-check retrieval on its own, before any LLM is involved.
retrieved_docs = retriever.invoke(
    "What is commune id for Calumpang, Naval, Biliran?"
)

In [21]:
# Show the documents the retriever returned for the test query.
retrieved_docs

[Document(id='5646c1c2-e3d6-4023-8e16-120c273735cc', metadata={'source': 'data\\district_summary.txt'}, page_content='Area Info: Calumpang, Naval, Biliran / Commune ID: 63_86746819142 / District ID: 63_8674681 / Province ID: 63_867\nArea Info: Capinahan, Naval, Biliran / Commune ID: 63_8674681698 / District ID: 63_8674681 / Province ID: 63_867\nArea Info: Caraycaray, Naval, Biliran / Commune ID: 63_86746819162 / District ID: 63_8674681 / Province ID: 63_867\nArea Info: Catmon, Naval, Biliran / Commune ID: 63_86746818554 / District ID: 63_8674681 / Province ID: 63_867'),
 Document(id='40008f89-7e56-4dd9-9884-8a1152dd6bc1', metadata={'source': 'data\\district_summary.txt'}, page_content='Area Info: Calatagbak, quezon, Palawan / Commune ID: 63_81156484446 / District ID: 63_8115648 / Province ID: 63_811\nArea Info: Calumpang, quezon, Palawan / Commune ID: 63_81156485508 / District ID: 63_8115648 / Province ID: 63_811\nArea Info: Isugod, quezon, Palawan / Commune ID: 63_81156487767 / Distri

In [15]:
from langchain_openai import OpenAI
# OpenAI LLM wrapper used to generate the final answer. No API key is passed
# explicitly; presumably it is read from the environment, so verify that
# OPENAI_API_KEY is set before running this cell.
llm = OpenAI(
    max_tokens=500,
    temperature=0.4,
)



In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answer-generation step.
# Adjacent string literals are concatenated verbatim, so each sentence carries
# its own trailing separator; without it the sentences run together
# ("tasks.Use", "addressIf").
# The JSON skeleton uses doubled braces ({{ }}) so the prompt template treats
# them as literal braces; {context} is the only placeholder in this string.
system_prompt = (
    "You are an assistant for finding the area info tasks. "
    "Use the following pieces of retrieved context to answer the given address. "
    "If the address is unknown, return empty value. "
    "Output in JSON format:\n"
    """{{
    "area_info":"",
    "commune_id":"",
    "district_id":"",
    "province_id":""
    }}"""
    "\n\n"
    "context: {context}"
)

# Two-message chat template: the system instructions first, then the user's
# question substituted for {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [22]:
# Inner chain: combines the prompt with the LLM over a set of documents.
# Outer chain: pairs the retriever with that inner chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [23]:
# Ask the full pipeline one question and print the value stored under "answer".
response = rag_chain.invoke(
    {"input": "What is commune id for Barangay Calumpang, Naval, Biliran?"}
)
print(response["answer"])

KeyError: 'Input to ChatPromptTemplate is missing variables {\'\\n    "area_info"\'}.  Expected: [\'\\n    "area_info"\', \'context\', \'input\'] Received: [\'input\', \'context\']\nNote: if you intended {\n    "area_info"} to be part of the string and not a variable, please escape it with double curly braces like: \'{{\n    "area_info"}}\'.\nFor troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_PROMPT_INPUT '