## Dependencies

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.messages import HumanMessage, AIMessage
from langchain_groq import ChatGroq
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
load_dotenv()
import hashlib


# Folder that the loading cells scan for .docx files. It is a relative,
# backslash-separated path, so it is resolved against whatever the working
# directory is when those cells run (i.e. after the chdir below).
docx_path = '.\\data\\text_data\\'

# Step up one directory from the current working directory.
# NOTE(review): the move is relative, so re-running this cell without a kernel
# restart climbs one more level each time.
os.chdir('../')
%pwd

'c:\\Projects\\python\\journAI'

## RAG pipeline experiments

### Call an LLM

In [2]:
# Chat model plus a parser that turns the model's reply into a plain string.
llm = ChatOpenAI(model='gpt-4o-mini')
output_parser = StrOutputParser()
# Call the model directly, then pass its reply through the parser by hand.
output_parser.invoke(llm.invoke('Tell me a joke'))

'Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!'

### Creating a chain

In [3]:
# Compose model and parser with `|` so one invoke runs both steps in order.
chain_1 = llm | output_parser
chain_1.invoke('Tell me a joke!')

'Why did the scarecrow win an award?\n\nBecause he was outstanding in his field!'

### Prompt templating

In [4]:
# Template with a single {topic} placeholder; invoking it with a dict fills the
# placeholder and returns the rendered prompt value (not a model response).
prompt = ChatPromptTemplate.from_template("Tell me a funny joke about {topic}")
prompt.invoke({'topic':'langchain'})

ChatPromptValue(messages=[HumanMessage(content='Tell me a funny joke about langchain', additional_kwargs={}, response_metadata={})])

In [5]:
# Full pipeline: render the template, send it to the model, parse the reply to text.
chain_2 = prompt | llm | output_parser
chain_2.invoke({'topic':'langchain'})

'Why did the LangChain developer bring a ladder to work?\n\nBecause they heard the API was on a whole new level!'

### LLM message templating

In [6]:
# The system template fixes the assistant's behaviour; the human template
# carries the {topic} placeholder that is filled at invoke time.
system = SystemMessagePromptTemplate.from_template('''You are a helpful AI assitant, who answers a given query in a brief manner and then provides a small description of your actions leading to the outcome.''')
human = HumanMessagePromptTemplate.from_template('Tell me a joke about a {topic}')


# Combine both templates (system first) and render them with a concrete topic.
prompt_2 = ChatPromptTemplate.from_messages([system,human])
prompt_2.invoke({'topic':'langchain'})

ChatPromptValue(messages=[SystemMessage(content='You are a helpful AI assitant, who answers a given query in a brief manner and then provides a small description of your actions leading to the outcome.', additional_kwargs={}, response_metadata={}), HumanMessage(content='Tell me a joke about a langchain', additional_kwargs={}, response_metadata={})])

In [7]:
# Same pipeline shape as chain_2, but driven by the system + human prompt.
chain_3 = prompt_2 | llm | output_parser
chain_3.invoke({'topic':'langchain'})

'Why did the langchain break up with its partner?\n\nBecause it couldn’t handle all the context! \n\nI created this joke by playing on the word "context," which is vital in language processing, and combining it with a humorous relationship twist.'

### Document loading and splitting

In [8]:
# Chunking policy: windows of up to 1000 characters (measured with len) that
# overlap their neighbour by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

# Collect the documents of every .docx file found directly under docx_path.
documents = []
for filename in os.listdir(docx_path):
    if not filename.endswith(".docx"):
        continue
    documents.extend(Docx2txtLoader(os.path.join(docx_path, filename)).load())

# Cut the loaded documents into overlapping chunks.
splits = text_splitter.split_documents(documents=documents)

In [9]:
# Peek at the first chunk to sanity-check its metadata and text.
splits[0]

Document(metadata={'source': '.\\data\\text_data\\The_Evolution_of_Space_Exploration_and_Its_Future_Prospects.docx'}, page_content='The Evolution of Space Exploration and Its Future Prospects\n\n# The Evolution of Space Exploration and Its Future Prospects\n\n## Introduction  \nSpace exploration has advanced significantly since the launch of Sputnik in 1957. From manned moon landings to Mars rovers and space telescopes, humanity continues to push the boundaries of exploration. This paper discusses the milestones, challenges, and future prospects of space exploration.\n\n## Major Milestones  \n\n### 1. The Space Race and Moon Landing  \nThe Cold War fueled competition between the US and the Soviet Union, culminating in the Apollo 11 moon landing in 1969.\n\n### 2. Space Stations and Shuttle Programs  \nThe International Space Station (ISS) and space shuttles have facilitated long-term research and human presence in space.\n\n### 3. Mars Exploration  \nRovers like Curiosity and Persevera

### Embedding

In [10]:
# Embed the text of every chunk with OpenAI embeddings (default settings) and
# display how many embeddings came back.
embeddings_1 = OpenAIEmbeddings()
embeded_docs = embeddings_1.embed_documents([split.page_content for split in splits])
len(embeded_docs)

  embeddings_1 = OpenAIEmbeddings()


10

In [11]:
# Same chunks embedded with the 'all-MiniLM-L6-v2' sentence-transformer model,
# for comparison with the OpenAI embeddings; display how many came back.
embeddings_2 = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')
embeded_docs_2 = embeddings_2.embed_documents([split.page_content for split in splits])
len(embeded_docs_2)


  embeddings_2 = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')





10

### Storing in a persistent ChromaDB

In [12]:
# Index the chunks (embedded with embeddings_1) in the Chroma collection
# 'test1', persisted under ./data/chroma_db/.
# NOTE(review): the dedup cell further down reports deleted duplicates, which
# suggests each run adds the same chunks to the persisted collection again — confirm.
collection_name = 'test1'
vectorstore = Chroma.from_documents(documents=splits,embedding=embeddings_1,
                                    collection_name=collection_name,persist_directory='./data/chroma_db/',
                                    )
query = 'What are the challenges of space exploration?'

# Ask the store for the two entries most similar to the query.
search = vectorstore.similarity_search(query=query,k=2)

In [13]:
def deduplicate_vectorstore(vectorstore):
    """Delete entries whose stored text repeats that of an earlier entry.

    Every record is read from the store's underlying collection, the text of
    each record is hashed, the first id seen for each distinct hash is kept and
    all later ids with the same hash are removed via ``vectorstore.delete``.
    A one-line summary is printed.

    Entries with no stored text (``None``) are skipped: they have no content to
    compare, so they are never treated as duplicates of one another.

    Args:
        vectorstore: Object exposing ``_collection.get()`` (returning a mapping
            with ``"ids"`` and ``"documents"``) and ``delete(ids=...)``.

    Returns:
        None.
    """
    data = vectorstore._collection.get()
    # `or []` covers both a missing key and an explicit None value.
    all_ids = data.get("ids") or []
    all_documents = data.get("documents") or []

    seen_hashes = set()
    duplicates = []

    for doc_id, doc in zip(all_ids, all_documents):
        if doc is None:
            # No text stored for this id; stringifying it would make every
            # such entry hash identically and get deleted by mistake.
            continue
        if isinstance(doc, str):
            content = doc
        elif isinstance(doc, dict):
            content = doc.get("page_content", "")
        else:
            content = str(doc)

        # The digest is only an equality key for the text, not a security measure.
        content_hash = hashlib.md5(content.encode("utf-8")).hexdigest()
        if content_hash in seen_hashes:
            duplicates.append(doc_id)
        else:
            seen_hashes.add(content_hash)

    if duplicates:
        vectorstore.delete(ids=duplicates)
        print(f"Deleted {len(duplicates)} duplicate document(s) from the vectorstore.")
    else:
        print("No duplicates found.")


In [14]:
# Remove repeated chunks from the store, then search again with a larger k to
# inspect what is left.
deduplicate_vectorstore(vectorstore)
search = vectorstore.similarity_search(query=query,k=10)
search

Deleted 10 duplicate document(s) from the vectorstore.


[Document(id='cc899e74-ff9f-4635-a41c-4ae5af64e211', metadata={'source': '.\\data\\text_data\\The_Evolution_of_Space_Exploration_and_Its_Future_Prospects.docx'}, page_content='### 3. Mars Exploration  \nRovers like Curiosity and Perseverance have provided invaluable data on the Martian surface, fueling future manned missions.\n\n### 4. Commercial Spaceflight  \nCompanies like SpaceX, Blue Origin, and Virgin Galactic are pioneering private space travel, making it more accessible.\n\n## Challenges of Space Exploration  \n- High costs and funding limitations  \n- Human health risks due to microgravity and radiation  \n- Sustainability and long-term habitability of other planets\n\n## Future Prospects  \n- Lunar colonization and NASA’s Artemis Program  \n- Mars missions and potential colonization  \n- Advances in propulsion systems for deep-space travel  \n- Search for extraterrestrial life and interstellar exploration\n\n## Conclusion  \nSpace exploration has come a long way, with promisi

### Retriever

In [15]:
# Wrap the store as a retriever configured with k=2, then query it directly.
retriever = vectorstore.as_retriever(search_kwargs={'k':2})
retriever.invoke('What are the challenges of space exploration?')

[Document(id='cc899e74-ff9f-4635-a41c-4ae5af64e211', metadata={'source': '.\\data\\text_data\\The_Evolution_of_Space_Exploration_and_Its_Future_Prospects.docx'}, page_content='### 3. Mars Exploration  \nRovers like Curiosity and Perseverance have provided invaluable data on the Martian surface, fueling future manned missions.\n\n### 4. Commercial Spaceflight  \nCompanies like SpaceX, Blue Origin, and Virgin Galactic are pioneering private space travel, making it more accessible.\n\n## Challenges of Space Exploration  \n- High costs and funding limitations  \n- Human health risks due to microgravity and radiation  \n- Sustainability and long-term habitability of other planets\n\n## Future Prospects  \n- Lunar colonization and NASA’s Artemis Program  \n- Mars missions and potential colonization  \n- Advances in propulsion systems for deep-space travel  \n- Search for extraterrestrial life and interstellar exploration\n\n## Conclusion  \nSpace exploration has come a long way, with promisi

## RAG pipeline

### Helper code

In [16]:
def deduplicate_vectorstore(vectorstore):
    """Delete entries whose stored text repeats that of an earlier entry.

    Every record is read from the store's underlying collection, the text of
    each record is hashed, the first id seen for each distinct hash is kept and
    all later ids with the same hash are removed via ``vectorstore.delete``.
    A one-line summary is printed.

    Entries with no stored text (``None``) are skipped: they have no content to
    compare, so they are never treated as duplicates of one another.

    Args:
        vectorstore: Object exposing ``_collection.get()`` (returning a mapping
            with ``"ids"`` and ``"documents"``) and ``delete(ids=...)``.

    Returns:
        None.
    """
    data = vectorstore._collection.get()
    # `or []` covers both a missing key and an explicit None value.
    all_ids = data.get("ids") or []
    all_documents = data.get("documents") or []

    seen_hashes = set()
    duplicates = []

    for doc_id, doc in zip(all_ids, all_documents):
        if doc is None:
            # No text stored for this id; stringifying it would make every
            # such entry hash identically and get deleted by mistake.
            continue
        if isinstance(doc, str):
            content = doc
        elif isinstance(doc, dict):
            content = doc.get("page_content", "")
        else:
            content = str(doc)

        # The digest is only an equality key for the text, not a security measure.
        content_hash = hashlib.md5(content.encode("utf-8")).hexdigest()
        if content_hash in seen_hashes:
            duplicates.append(doc_id)
        else:
            seen_hashes.add(content_hash)

    if duplicates:
        vectorstore.delete(ids=duplicates)
        print(f"Deleted {len(duplicates)} duplicate document(s) from the vectorstore.")
    else:
        print("No duplicates found.")

def docs2str(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line
        (``'\\n\\n'``); an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    separator = '\n\n'
    return separator.join(contents)

# Chunking policy: windows of up to 1000 characters (measured with len) that
# overlap their neighbour by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

# Collect the documents of every .docx file found directly under docx_path.
documents = []
for filename in os.listdir(docx_path):
    if not filename.endswith(".docx"):
        continue
    documents.extend(Docx2txtLoader(os.path.join(docx_path, filename)).load())

splits = text_splitter.split_documents(documents=documents)

# Index the chunks in the persisted 'test1' collection, then drop any entries
# whose text is already present.
collection_name = 'test1'
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    collection_name=collection_name,
    persist_directory='./data/chroma_db/',
)
deduplicate_vectorstore(vectorstore)

Deleted 10 duplicate document(s) from the vectorstore.


### Main

In [17]:
# Prompt: the system message receives the retrieved {context}; the human
# message receives the user's {query}.
system = SystemMessagePromptTemplate.from_template('''You are a helpful AI assitant, who answers a given query based on the context provided, in a brief manner and then provides a small description of your actions leading to the outcome.\nContext: \n\n{context}''')
human = HumanMessagePromptTemplate.from_template('Query: {query}')
prompt_3 = ChatPromptTemplate.from_messages([system, human])

# Retrieval is configured with k=3. Two chat models are created; the chain
# below is wired to llm_2.
retriever = vectorstore.as_retriever(search_kwargs={'k': 3})
llm = ChatOpenAI(model='gpt-4o-mini')
llm_2 = ChatGroq(model="mixtral-8x7b-32768")

# The input string fans out into both prompt variables: it is passed through
# unchanged as 'query', and it drives the retriever whose documents are joined
# into a single 'context' string by docs2str.
rag_chain = {'context': retriever | docs2str, 'query': RunnablePassthrough()} | prompt_3 | llm_2 | StrOutputParser()

query = 'What are the challenges of space exploration?'
response = rag_chain.invoke(query)
print(response)

The challenges of space exploration include high costs and funding limitations, human health risks due to microgravity and radiation, and sustainability and long-term habitability of other planets. Additionally, there are storage and intermittency issues, policy and regulatory challenges, and land and resource availability that pose barriers to space exploration.


## Context-Aware Retrieval experiments

### Save chat history

In [18]:
# Start the running transcript with the first question and the answer it got.
chat_history = [
    HumanMessage(content=query),
    AIMessage(content=response),
]

chat_history

[HumanMessage(content='What are the challenges of space exploration?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='The challenges of space exploration include high costs and funding limitations, human health risks due to microgravity and radiation, and sustainability and long-term habitability of other planets. Additionally, there are storage and intermittency issues, policy and regulatory challenges, and land and resource availability that pose barriers to space exploration.', additional_kwargs={}, response_metadata={})]

### Generate as per chat history

In [19]:
# Prompt that places the transcript in the system message and the new question
# in the human message.
system_2 = SystemMessagePromptTemplate.from_template('Answer the query based on chat history provided:\nChat history: {chat_history}')
human_2 = HumanMessagePromptTemplate.from_template('Query: {input}')
contextualized_prompt = ChatPromptTemplate.from_messages([system_2, human_2])

# Prompt -> model -> plain string; no retrieval is involved in this chain.
contextual_chain = contextualized_prompt | llm | StrOutputParser()

# Ask a follow-up, then append the new exchange to the transcript.
query = "What was the name of Mars Rover?"
response_2 = contextual_chain.invoke({'chat_history': chat_history, 'input': query})
chat_history += [HumanMessage(content=query), AIMessage(content=response_2)]
response_2


'The Mars rovers include several missions, with some of the most well-known being Spirit, Opportunity, Curiosity, and Perseverance. Each of these rovers has contributed significantly to our understanding of Mars.'

In [20]:
# Display the prompt object to check its input variables and message templates.
contextualized_prompt

ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['chat_history'], input_types={}, partial_variables={}, template='Answer the query based on chat history provided:\nChat history: {chat_history}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='Query: {input}'), additional_kwargs={})])

In [21]:
# The history-aware retriever sends the LLM's output to the vector store as the
# search query. The prompt therefore has to ask for a standalone reformulation
# of the latest question, not for an answer (which is what
# `contextualized_prompt` requests). The transcript is injected as real chat
# messages through MessagesPlaceholder instead of being stringified.
rephrase_prompt = ChatPromptTemplate.from_messages([
    ('system',
     'Given a chat history and the latest user question, which might reference '
     'context in the chat history, formulate a standalone question that can be '
     'understood without the chat history. Do NOT answer the question; only '
     'reformulate it if needed, otherwise return it as is.'),
    MessagesPlaceholder('chat_history'),
    ('human', '{input}'),
])
history_aware_retriever = create_history_aware_retriever(llm, retriever, rephrase_prompt)

history_aware_retriever.invoke({'chat_history': chat_history,'input': 'What was the name of the mission?'})

[Document(id='cc899e74-ff9f-4635-a41c-4ae5af64e211', metadata={'source': '.\\data\\text_data\\The_Evolution_of_Space_Exploration_and_Its_Future_Prospects.docx'}, page_content='### 3. Mars Exploration  \nRovers like Curiosity and Perseverance have provided invaluable data on the Martian surface, fueling future manned missions.\n\n### 4. Commercial Spaceflight  \nCompanies like SpaceX, Blue Origin, and Virgin Galactic are pioneering private space travel, making it more accessible.\n\n## Challenges of Space Exploration  \n- High costs and funding limitations  \n- Human health risks due to microgravity and radiation  \n- Sustainability and long-term habitability of other planets\n\n## Future Prospects  \n- Lunar colonization and NASA’s Artemis Program  \n- Mars missions and potential colonization  \n- Advances in propulsion systems for deep-space travel  \n- Search for extraterrestrial life and interstellar exploration\n\n## Conclusion  \nSpace exploration has come a long way, with promisi

### Final retriever

In [26]:
# Answering prompt, in message order: transcript (system), question (human),
# retrieved context (system).
system_3 = SystemMessagePromptTemplate.from_template('Answer the query based on chat history provided:\nChat history: {chat_history}')
human_3 = HumanMessagePromptTemplate.from_template('Query: {input}')
system_4 = SystemMessagePromptTemplate.from_template('Additional context: {context}')
qa_prompt = ChatPromptTemplate.from_messages([system_3, human_3, system_4])

# The documents chain fills {context} from retrieved documents; the retrieval
# chain puts the history-aware retriever in front of it.
qa_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)
rag_chain_2 = create_retrieval_chain(history_aware_retriever, qa_chain)

response_3 = rag_chain_2.invoke({'chat_history': chat_history, 'input': 'Name two mars missions.'})


In [28]:
# The chain's result is indexed by key; 'answer' holds the generated text.
response_3['answer']

'Two Mars missions are the Curiosity rover and the Perseverance rover.'