In [None]:
! pip -q install langchain, huggingface_hub tiktoken pypdf google-generativeai chromadb sentence_transformers 

In [None]:
# Install (or upgrade, via -U) langchain-community and unstructured.
! pip install -U langchain-community unstructured

In [None]:
# Install/upgrade the package that provides `langchain_google_genai` (imported below).
! pip install --upgrade --quiet langchain-google-genai

In [None]:
import textwrap
import os
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
import langchain
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from operator import itemgetter
from langchain.prompts import (ChatPromptTemplate, ChatMessagePromptTemplate,
                                PromptTemplate, SystemMessagePromptTemplate,
                                  HumanMessagePromptTemplate)
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough

from langchain.load import dumps, loads


In [None]:
def wrap_text(text, width=90):
    """Wrap ``text`` to ``width`` columns while keeping its existing newlines.

    The input is split on ``"\\n"``, each piece is wrapped independently with
    ``textwrap.fill``, and the pieces are joined back with ``"\\n"``. Empty
    lines stay empty, so paragraph breaks are preserved.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped string.
    """
    return "\n".join(
        textwrap.fill(segment, width=width) for segment in text.split("\n")
    )

In [None]:
# Read the key from Colab's `userdata` store and export it as an environment
# variable. Presumably this is how ChatGoogleGenerativeAI (constructed below
# without an explicit key) picks it up — confirm against the library docs.
GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [None]:
# Create the chat model used by every chain in this notebook, then run one
# throwaway prompt as a smoke test and print the text of the reply.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro")
results = llm.invoke("Write a ballad about Langchain")

print(results.content)

In [None]:
# Directory that holds the source documents.
# NOTE(review): this is a Google Drive path, so Drive is presumably mounted
# outside this notebook's visible cells — confirm before running.
data_path = "/content/drive/MyDrive/English"

In [None]:
# Load every .txt file under data_path (searching sub-directories too).
# The glob has to be a wildcard pattern: a bare ".txt" would only match a file
# literally named ".txt", so nothing would be loaded.
loader = DirectoryLoader(data_path, glob="**/*.txt", show_progress=True)
docs = loader.load()

In [None]:
# Keep only the first 50 loaded documents. Re-running this cell is harmless,
# but the full list is lost until the loader cell is run again.
docs = docs[:50]

In [None]:
# Show the text of the first document as a sanity check
# (raises IndexError if nothing was loaded).
print(docs[0].page_content)

In [None]:
# Concatenate the text of all documents into one string, skipping documents
# whose page_content is empty. No separator is inserted between documents.
raw_text = "".join(doc.page_content for doc in docs if doc.page_content)

In [None]:
# Print the whole concatenated corpus.
# NOTE(review): this can be very long; consider printing a slice instead.
print(raw_text)

In [None]:
# Splitter configuration: chunks of up to 500 units with 100 units of overlap
# between neighbouring chunks. Length is measured with `len`, i.e. in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len,
    is_separator_regex=False
)

In [None]:
# Split the concatenated corpus into chunks (a list of strings).
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks produced by the splitter.
len(texts)

In [None]:
# Embedding model identifier and the keyword arguments used when encoding.
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {"normalize_embeddings":True} # set True to compute cosine similarity

In [None]:
# Build the embedding function from the model name and encode options above.
# The device override is left commented out, so the library default is used.
embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    # model_kwargs={"device": "cuda"},
    encode_kwargs=encode_kwargs,

)

In [None]:
# Build the Chroma vector store from the text chunks and persist it to disk
# under ./chroma_db.
db = Chroma.from_texts(texts, embedding_function, persist_directory="./chroma_db")

In [None]:
# Sample query; ask the vector store for its 5 closest chunks (k=5).
query = "Tell me about Universal Studios Singapore"
db.similarity_search(query, k=5)

In [None]:
# Wrap the vector store as a retriever so it can be used inside LCEL chains.
retriever = db.as_retriever() # can add mmr fetch_k=20, search_type="mmr"

# Retrievers are runnables: `invoke` is the supported entry point and replaces
# the deprecated `get_relevant_documents`. It returns the matching documents.
retriever.invoke(query)

In [None]:
# Answering prompt: the model is told to use only the supplied {context}
# when answering {question}.
template = """

    Answer the question based only on the following context:
    {context}

    Question:{question}
"""

In [None]:
# Chat prompt built from the answering template ({context} and {question}).
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Basic RAG chain. The input string is sent both to the retriever (to fill
# "context") and passed through unchanged as "question"; the filled prompt
# goes to the LLM and the reply is reduced to a plain string.
chain = (
    {"context": retriever, "question":RunnablePassthrough()}

    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the basic RAG chain on one question and print the wrapped answer.
text_reply = chain.invoke("Tell me about Universal Studio Singapore")

print(wrap_text(text_reply))

In [None]:
# Query-generation prompt: asks the model to expand one question into several
# search queries. The human template's placeholder is {question}, so the
# declared input variables must use that same name (the chains built on this
# prompt are invoked with {"question": ...}).
prompt_1 = ChatPromptTemplate(
    input_variables=["question"],
    messages=[
        SystemMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=[],
                template="You are a helpful assistant that generates multiple search queries based on a single input query",
            )
        ),
        HumanMessagePromptTemplate(
            prompt=PromptTemplate(
                input_variables=["question"],
                template="Generate multiple search queries related to: {question} \n OUTPUT (4 queries):",
            )
        ),
    ],
)

In [None]:
# Disabled alternative: the query-generation prompt written with
# ChatPromptTemplate.from_messages. It sits inside a string literal, so none
# of it is executed; the cell only displays the string.
# NOTE(review): the user message contains "/n", which looks like a typo for "\n".
'''
prompt = ChatPromptTemplate. from_messages([
("system", "You are a helpful assistant that generates multiple search queries based on a single input query."),
("user", "Generate multiple search queries related to: {question}/n OUTPUT (4 queries):"),
])
'''

In [None]:
# Seed query that is later passed to the RAG-fusion chain as "question".
original_query = "universal Studios Singapore"

In [None]:
# Query-expansion chain: prompt -> LLM -> plain string -> list of queries.
# The reply is split on newlines; blank lines are dropped and surrounding
# whitespace is stripped so that no empty query reaches the retriever.
generate_queries = (
    prompt_1
    | llm
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

In [None]:
# Display the composed runnable to inspect its steps.
generate_queries

In [None]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Merge several ranked result lists into one ranking (reciprocal rank fusion).

    Every document contributes ``1 / (rank + k)`` for each list it appears in,
    where ``rank`` is its zero-based position in that list. Documents are keyed
    by their serialized form (``dumps``), so a document returned for several
    queries accumulates a single combined score.

    Args:
        results: One ranked list of documents per query. Each list is assumed
            to be ordered from most to least relevant.
        k: Constant added to the rank in the denominator (default 60).

    Returns:
        A list of ``(document, score)`` tuples sorted by descending fused
        score; documents are restored with ``loads``. Empty input yields ``[]``.
    """
    fused_scores = {}
    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # Serialize so that equal documents from different lists share a key.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # Highest fused score first; the sort is stable, so ties keep first-seen order.
    ordered = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ordered]

In [None]:
# Retrieval half of RAG-fusion: generate several queries, run the retriever
# once per query (`retriever.map()`), then fuse the result lists with
# reciprocal_rank_fusion.
rag_fusion_chain = generate_queries | retriever.map() | reciprocal_rank_fusion

In [None]:
# Turn on LangChain's global debug flag for the runs below.
# NOTE(review): newer LangChain releases expose this through
# `langchain.globals.set_debug` — confirm against the installed version.
langchain.debug = True

In [None]:
# Inspect the input schema the retrieval chain expects.
rag_fusion_chain.input_schema.schema()

In [None]:
# Run the retrieval chain; the result is the list of (document, score)
# tuples returned by reciprocal_rank_fusion.
rag_fusion_chain.invoke({"question": original_query})

In [None]:
# End-to-end RAG-fusion chain. Input: {"question": "<user question>"}.
#   context  -> documents retrieved for the generated queries, re-ranked by RRF
#   question -> the question string taken out of the input dict
# The answering prompt (`prompt`, which has {context} and {question}) is used
# here. `prompt_1` only generates search queries and has no {context} slot, so
# piping into it would discard the retrieved documents.
full_rag_fusion_chain = (
    {
        "context": rag_fusion_chain,
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Inspect the input schema the end-to-end chain expects.
full_rag_fusion_chain.input_schema.schema()

In [None]:
# Ask the end-to-end chain a question; the input is a dict with a "question" key.
full_rag_fusion_chain.invoke({"question": "Tell me about Singapore's nightlife scene?"})