In [None]:
# Install the third-party packages this notebook relies on into the kernel's environment.
# NOTE(review): no versions are pinned, so a later re-run may pull newer releases
# than the ones this notebook was written against — consider pinning them.
%pip install chromadb
%pip install langchain
%pip install sentence-transformers
%pip install openai

In [None]:
# Import the LangChain components used in this notebook.
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma


In [None]:
# Instantiate the Hugging Face sentence-transformers model that turns text into
# embedding vectors; the same object is handed to Chroma for indexing and querying.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
# Load the Bhagavad Gita CSV into LangChain documents.
# "utf-8-sig" decodes UTF-8 and drops a leading byte-order mark when present.
# With plain "utf-8" the BOM ends up in the first column name, which is why the
# recorded query output in this notebook shows page_content starting with '\ufeff'.
loader = CSVLoader(file_path='./raw_data/bhagavad-gita.csv', encoding="utf-8-sig")
documents = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, no overlap between
# neighbouring chunks) ready for embedding.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
docs = text_splitter.split_documents(documents)


In [None]:
# Embed every chunk and build a Chroma collection stored under ./chroma_db.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# existing collection a second time — confirm before re-running.
db2 = Chroma.from_documents(
    docs,
    embeddings,
    persist_directory="./chroma_db",
)

In [None]:
# Open the Chroma collection stored under ./chroma_db, embedding queries with
# the same model that was used to build it.
db = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)

In [None]:
# Run a similarity search for the question against `db`, the vector store that
# was reloaded from ./chroma_db. Querying the reloaded store (rather than the
# `db2` handle returned while building the index) actually exercises the
# persisted data and keeps this cell working when the index-building cell is
# skipped because the index already exists on disk.
# NOTE: `docs` is rebound here from the list of chunks to the search hits; the
# cells that follow read the hits.
query = "What happens after death ?"
docs = db.similarity_search(query)
docs[0]


Document(page_content='\ufeffVerse: Verse 2.20\nEnlgish Translation: The Blessed Lord said: Dear ARJUNA, the ATMAN or Soul can neither be born nor can it die. It is forever immortal, eternal and ancient. The Soul in a body does not die when the body itself perishes and ceases to exist. The Soul always lives on.', metadata={'row': 65, 'source': 'C:/Users/ketan/Desktop/New folder (2)/data/bhagavad-gita.csv'})

In [None]:
# Concatenate the text of every retrieved verse, with a newline after each one.
verses = "".join(hit.page_content + "\n" for hit in docs)

In [None]:
# Prompt template: instructs the model to answer the question strictly from the
# supplied verses and to list those sources in its reply.
template = """
Given the following extracted parts of a long document ("SOURCES") and a question ("QUESTION").
Don't try to make up an answer and use the text in the SOURCES only for the answer. 
If you don't know the answer, just say that you don't know.
List down all the SOURCES in the response.

QUESTION: {question}
=========
SOURCES:
{verses}
=========
ANSWER:
"""

# The two placeholders above are filled in later via prompt.format(...).
prompt = PromptTemplate(
    input_variables=["question", "verses"],
    template=template,
)

In [None]:
# Helper that truncates a string to its first N whitespace-separated tokens.
def trim_string_by_tokens(input_string: str, num_tokens: int) -> str:
    """Return the first ``num_tokens`` whitespace-separated tokens of a string.

    "Tokens" here are whitespace-delimited words (``str.split()``), not model
    tokens. Runs of whitespace, including newlines, are collapsed to single
    spaces in the result.

    Args:
        input_string: Text to shorten.
        num_tokens: Maximum number of tokens to keep. Values larger than the
            number of tokens keep everything; zero or negative values keep
            nothing.

    Returns:
        The kept tokens joined by single spaces ("" when nothing is kept).
    """
    tokens = input_string.split()

    # Clamp to [0, len(tokens)]: a negative count would otherwise act as a
    # negative slice bound and silently drop tokens from the end instead of
    # producing an empty result.
    keep = max(0, min(num_tokens, len(tokens)))

    return ' '.join(tokens[:keep])


In [None]:
# Fill the template, then trim the result to at most 1500 whitespace-separated
# tokens so it fits in the model's context window.
# The previous `(final_prompt, 1500)[0]` only built a tuple and took its first
# element back, so no trimming ever happened.
# Note: trim_string_by_tokens joins the kept tokens with single spaces, so line
# breaks in the prompt are flattened.
final_prompt = prompt.format(question=query, verses=verses)
final_prompt = trim_string_by_tokens(final_prompt, 1500)
final_prompt



In [None]:
# Create the LLM client. It uses the OpenAI-style API but points at a local
# Zephyr 7B β (Mistral architecture) model served by LM Studio on localhost:1234.
# "TEST_KEY" is a placeholder — presumably the local server does not validate it.
llm = OpenAI(
    model="local-Zephyr-7B",
    base_url="http://localhost:1234/v1",
    openai_api_key="TEST_KEY",
    # `n` is the number of completions generated per prompt, not a length
    # limit; the cap on response length is `max_tokens`.
    max_tokens=1500,
)

In [None]:
# Send the final prompt to the LLM and show the generated answer as the cell output.
llm_res = llm(final_prompt)
llm_res