In [1]:
import os
import textwrap
from pathlib import Path
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import FlashrankRerank
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from llama_parse import LlamaParse
from dotenv import load_dotenv

In [288]:
from langchain.document_loaders import TextLoader, PyPDFLoader

In [332]:
text_loader = TextLoader("./message.pdf")
parsed_document = text_loader.load()

In [333]:
parsed_document[0].page_content

'The Youth Team is organizing a biweekly meeting next Thursday at Meeting Room 105. \nThis meeting is intended for all employees of the Youth Team to participate and discuss ongoing projects, share updates, and collaborate on future initiatives. \nThe meeting will serve as a platform for team members to share their progress, exchange ideas, and align on key objectives for the upcoming weeks. \nParticipants are encouraged to prepare any topics or concerns they would like to address during the meeting.'

In [2]:
load_dotenv()
os.environ['GROQ_API_KEY']=os.getenv("GROQ_API_KEY")
os.environ['LLAMA_API_KEY']=os.getenv("LLAMA_API_KEY")

In [178]:
# instruction='''The provided document is a research paper explaining about how Sharpness Aware Minimization helps in multi-task learning,
#                 It includes several explaination and results to discuss about the benefits of such approach.
#                 It also includes algorithm to the proposed approach.
#                 Try to precise while answering the questions'''

parser=LlamaParse(
    api_key=os.environ['LLAMA_API_KEY'],
    result_type="markdown",
    # parsing_instruction=instruction,
    max_timeout=5000,
)

In [179]:
# parsed_document=await parser.aload_data("./message.pdf")

Started parsing the file under job_id b3db2456-dc52-491d-a724-85a867e345f7


In [245]:
# parsed_document

[Document(id_='e4d12064-99b1-471d-97f7-27348e1332de', embedding=None, metadata={'file_path': './message.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Announcement\n\nHi team,\n\nThis is an announcement that we will be having our biweekly meeting for Youth Team next Thursday. I kindly encourage all the employees in the team to participate in it.Current date is 2024-09-28 20:10:51.394913', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')]

In [180]:
# parsed_document[0].text

'# Announcement\n\nHi team,\n\nThis is an announcement that we will be having our biweekly meeting for Youth Team next Thursday. I kindly encourage all the employees in the team to participate in it.'

In [334]:
from datetime import datetime
parsed_doc=parsed_document[0]
parsed_doc.page_content+= f"Current date is {datetime.now()}"
print(parsed_doc)

page_content='The Youth Team is organizing a biweekly meeting next Thursday at Meeting Room 105. \nThis meeting is intended for all employees of the Youth Team to participate and discuss ongoing projects, share updates, and collaborate on future initiatives. \nThe meeting will serve as a platform for team members to share their progress, exchange ideas, and align on key objectives for the upcoming weeks. \nParticipants are encouraged to prepare any topics or concerns they would like to address during the meeting.Current date is 2024-09-29 00:03:17.048518' metadata={'source': './message.pdf'}


In [335]:
doc_path=Path('data/parsed_doc.md')
doc_path.parent.mkdir(parents=True, exist_ok=True)

In [336]:
with doc_path.open("w") as f:
    f.write(parsed_doc.page_content)

In [137]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/csgrad/gsubrama/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [37]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/csgrad/gsubrama/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [337]:
loader=UnstructuredMarkdownLoader(doc_path)
loaded_doc=loader.load()

In [338]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=2048,chunk_overlap=100)
docs=text_splitter.split_documents(loaded_doc)
print(docs[0])

page_content='The Youth Team is organizing a biweekly meeting next Thursday at Meeting Room 105. \nThis meeting is intended for all employees of the Youth Team to participate and discuss ongoing projects, share updates, and collaborate on future initiatives. \nThe meeting will serve as a platform for team members to share their progress, exchange ideas, and align on key objectives for the upcoming weeks. \nParticipants are encouraged to prepare any topics or concerns they would like to address during the meeting.Current date is 2024-09-29 00:03:17.048518' metadata={'source': PosixPath('data/parsed_doc.md')}


In [302]:
embeddings=FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 13925.31it/s]


In [339]:
qdrant=Qdrant.from_documents(
    docs,
    embeddings,
    path='./db',
    collection_name="document_embeddings"
)

In [188]:
# query="Get me the details of the event"
# similar_text=qdrant.similarity_search_with_score(query)


In [340]:
qdrant.similarity_search_with_score("Give me the venue of event")


[(Document(page_content='The Youth Team is organizing a biweekly meeting next Thursday at Meeting Room 105. \nThis meeting is intended for all employees of the Youth Team to participate and discuss ongoing projects, share updates, and collaborate on future initiatives. \nThe meeting will serve as a platform for team members to share their progress, exchange ideas, and align on key objectives for the upcoming weeks. \nParticipants are encouraged to prepare any topics or concerns they would like to address during the meeting.Current date is 2024-09-29 00:03:17.048518', metadata={'source': 'data/parsed_doc.md', '_id': '72719caa3e584389b27bce46793afb0c', '_collection_name': 'document_embeddings'}),
  0.5328336720334097)]

In [341]:
retriver=qdrant.as_retriever(search_kwargs={"k":5})


In [342]:
retriver

VectorStoreRetriever(tags=['Qdrant', 'FastEmbedEmbeddings'], vectorstore=<langchain_community.vectorstores.qdrant.Qdrant object at 0x7f5d1c1ebdc0>, search_kwargs={'k': 5})

In [343]:
retrived_docs=retriver.invoke("Get the event details")
for doc in retrived_docs:
    print(doc)

page_content='The Youth Team is organizing a biweekly meeting next Thursday at Meeting Room 105. \nThis meeting is intended for all employees of the Youth Team to participate and discuss ongoing projects, share updates, and collaborate on future initiatives. \nThe meeting will serve as a platform for team members to share their progress, exchange ideas, and align on key objectives for the upcoming weeks. \nParticipants are encouraged to prepare any topics or concerns they would like to address during the meeting.Current date is 2024-09-29 00:03:17.048518' metadata={'source': 'data/parsed_doc.md', '_id': '72719caa3e584389b27bce46793afb0c', '_collection_name': 'document_embeddings'}


In [262]:
# compressor=FlashrankRerank(model="ms-macro-MiniLM-L-12-v2")
# compression_retriever=ContextualCompressionRetriever(
#     base_compressor=compressor,base_retriever=retriver
# )

In [263]:
llama=ChatGroq(temperature=0,model_name="llama3-70b-8192")

In [264]:
from datetime import datetime

In [344]:
date=datetime.now()
prompt_template=''' Using the message extract me the event summary, team participating, date of the event,venue also give a title in json format,
                    If Venue is not specified keep it Null, If specific date is not given calculate date from the present date. Present date is given as yy-mm-dd.
                    Dont give anyother response
                    context:{context}'''
prompt=PromptTemplate(
    template=prompt_template,input_variables=['context']
)

In [345]:
qa=RetrievalQA.from_chain_type(
    llm=llama,
    chain_type="stuff",
    retriever=retriver,
    return_source_documents=True,
    chain_type_kwargs={'prompt':prompt}
)

In [346]:
import json
qa.invoke("")['result']

'{\n"title": "Youth Team Biweekly Meeting",\n"event_summary": "Biweekly meeting to discuss ongoing projects, share updates, and collaborate on future initiatives",\n"team_participating": "Youth Team",\n"date_of_event": "2024-10-04",\n"venue": "Meeting Room 105"\n}'

In [347]:
response=qa.invoke("Giv")
print(response)

{'query': 'Giv', 'result': '{\n"title": "Youth Team Biweekly Meeting",\n"event_summary": "Biweekly meeting to discuss ongoing projects, share updates, and collaborate on future initiatives",\n"team_participating": "Youth Team",\n"date_of_event": "2024-10-04",\n"venue": "Meeting Room 105"\n}', 'source_documents': [Document(page_content='The Youth Team is organizing a biweekly meeting next Thursday at Meeting Room 105. \nThis meeting is intended for all employees of the Youth Team to participate and discuss ongoing projects, share updates, and collaborate on future initiatives. \nThe meeting will serve as a platform for team members to share their progress, exchange ideas, and align on key objectives for the upcoming weeks. \nParticipants are encouraged to prepare any topics or concerns they would like to address during the meeting.Current date is 2024-09-29 00:03:17.048518', metadata={'source': 'data/parsed_doc.md', '_id': '72719caa3e584389b27bce46793afb0c', '_collection_name': 'documen

In [348]:
def print_response(response):
    res=response['result']
    for text in res.split('\n'):
        if not text:
            print()
        print("\n".join(textwrap.wrap(text,100,break_long_words=False)))

In [349]:
def answer_question(query):
    response=qa.invoke(query)
    return response['result']

In [350]:
import re
ans=answer_question("")
match = re.search(r'({.*?})', ans, re.DOTALL)
if match:
    json_part = match.group(1)
    print(json_part)

{
"title": "Youth Team Biweekly Meeting",
"event_summary": "Biweekly meeting to discuss ongoing projects, share updates, and collaborate on future initiatives",
"team_participating": "Youth Team",
"date_of_event": "2024-10-04",
"venue": "Meeting Room 105"
}
