<a href="https://colab.research.google.com/github/kmk4444/Retrieval-augmented-generation/blob/main/Part10_multiquery_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Requirements.txt**

In [4]:
!touch requirements.txt
!echo langchain_community >> requirements.txt
!echo langchain >> requirements.txt
!echo langchain-openai >> requirements.txt
!echo openai >> requirements.txt
!echo langchain-google-genai >> requirements.txt
!echo cohere >> requirements.txt
!echo faiss-cpu >> requirements.txt
!echo streamlit >> requirements.txt
!echo python-dotenv >> requirements.txt
!echo llama-index >> requirements.txt
!echo pypdf >> requirements.txt
!echo chromadb >> requirements.tx
!echo beautifulsoup4 >> requirements.tx
!echo matplotlib >> requirements.tx
!echo rank_bm25 >> requirements.tx
!echo replicate >> requirements.txt

**Bash/command**

In [5]:
pip install -r requirements.txt

Collecting langchain_community (from -r requirements.txt (line 12))
  Downloading langchain_community-0.2.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: langchain_community
Successfully installed langchain_community-0.2.0


In [6]:
%%writefile multiqueryhelper.py

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import cohere
import os
from dotenv import load_dotenv

#load_dotenv()

#my_key_openai = os.getenv("openai_apikey")
#my_key_google = os.getenv("google_apikey")
#my_key_cohere = os.getenv("cohere_apikey")

my_key_openai="---"
my_key_google="---"
my_key_cohere="---"

llm_gemini = ChatGoogleGenerativeAI(google_api_key=my_key_google, model="gemini-pro")
llm_openai = ChatOpenAI(api_key=my_key_openai)
embeddings = OpenAIEmbeddings(api_key=my_key_openai)
cohere_client = cohere.Client(api_key=my_key_cohere)

def generate_multi_query(original_prompt):
    multiquery_prompt = f"""Sen bir yapay zeka asistanısın.

    Bir vektör veri tabanından, kullanıcı sorusuna en fazla benzerlik gösteren dokümanların getirilmesi için, sana verilen kullanıcı girdisinin 3 farklı versiyonunu yazmakla görevlisin.

    Bunu yaparken amacın ise vektörleri karşılaştırırken kullanılan mesafe ölçümlerinin bazı sınırlılıklarını aşmak için, verilen soruyla ilgili birden çok bakış açısı geliştirerek kullanıcıya yardımcı olmak.

    Bu yazacağın alternatif soruları ayrı ayrı satırlarda olacak şekilde yaz.
    Alternatif soruları yazarken bunların 1, 2, 3 gibi numaralandırmalar koyma.

    Kullanıcı girdisi şöyle: {original_prompt}"""

    generated_queries = llm_openai.invoke(input=multiquery_prompt) # chatbot created several questions.

    temp_list = generated_queries.content.strip().split("\n") # we prepared output for chatbot

    #we created a list to save user question and chatbot questions.
    query_list = [original_prompt]
    query_list.extend(temp_list)

    return query_list

def get_relevant_documents(target_url,prompt):

  loader = WebBaseLoader(target_url)

  raw_documents = loader.load()

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=0,
      length_function=len
  )

  splitted_documents = text_splitter.split_documents(raw_documents)

  custom_documents=[]

  for i, raw_doc in enumerate(splitted_documents):

    new_doc = Document(
        page_content = raw_doc.page_content,
        metadata = {
            "source" : raw_doc.metadata["source"],
            "title" : raw_doc.metadata["title"],
            "description" : raw_doc.metadata["description"],
            "language" : raw_doc.metadata["language"],
            "doc_id" : i
        }
    )

    custom_documents.append(new_doc)

  vectorstore = FAISS.from_documents(custom_documents,embeddings)
  retriever = vectorstore.as_retriever()

  relevant_documentss = retriever.get_relevant_documents(prompt)

  return relevant_documents

def run_rag(relevant_documents, prompt):
  context_data = ""

  for document in relevant_documents:
    context_data = context_data + " " + document.page_content

  final_prompt = f"""Şöyle bir sorum var: {prompt}
  Bu soruyu yanıtlamak için elimizde şu bilgiler var: {context_data} .
  Bu sorunun yanıtını vermek için yalnızca sana burada verdiğim eldeki bilgileri kullan. Bunların dışına asla çıkma.
  """
  AI_Response = llm_gemini.invoke(input=final_prompt)

  return AI_Response.content

def rag_with_url(target_url, prompt):

  loader = WebBaseLoader(target_url)

  raw_documents = loader.load()

  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=0,
      length_function=len
  )

  splitted_documents = text_splitter.split_documents(raw_documents)

  custom_documents=[]

  for i, raw_doc in enumerate(splitted_documents):

    new_doc = Document(
        page_content=raw_doc.page_content,
        metadata = {
            "source": raw_doc.metadata["source"],
            "title": raw_doc.metadata["title"],
            "description": raw_doc.metadata["description"],
            "language": raw_doc.metadata["language"],
            "doc_id":i
        }
    )

    custom_document.append(new_doc)

    vectorstore = FAISS.from_documents(custom_documents, embeddings)
    retriever = vectorstore.as_retriever()

    relevant_documents = retriever.get_relevant_documents(prompt)

    context_data = ""

    for document in relevant_documents:
      context_data = context_data + " " + document.page_content

    final_prompt = f"""Şöyle bir sorum var: {prompt}
    Bu soruyu yanıtlamak için elimizde şu bilgiler var: {context_data} .
    Bu sorunun yanıtını vermek için yalnızca sana burada verdiğim eldeki bilgileri kullan. Bunların dışına asla çıkma.
    """

    AI_Response = llm_gemini.invoke(input=final_prompt)

    return AI_Response.content, relevant_documents

def get_unique_documents(retrieved_documents):

  unique_docs = {}

  for doc in retrieved_documents:
    doc_id = doc.metada["doc_id"]

    if doc_id not in unique_docs:
      unique_docs[doc_id] = doc

  return list(unique_docs.values())

def get_reranked_documents(documents, query, document_count=4):

  document_contents = []

  for doc in documents:
    document_contents.append(doc.page_content)

  reranked_documents = cohere_client.rerank(
      model="rerank-multilingual-v2.0",
      query = query,
      documents=document_contents,
      top_n=document_count
  )

  reranked_documents_list = []

  # for reranked_doc in reranked_documents:
  #     reranked_documents_list.append(reranked_doc.document['text'])

  for reranked_doc in reranked_documents:
    reranked_documents_list.append(documents[reranked_doc.index])

  return  reranked_documents_list


In [None]:
%%writefile multiquery_rag.py
import streamlit as st
import multiqueryhelper

st.set_page_config(layout="wide")
st.title("Advanced RAG: Multi-Query | Sorgu Çeşitlendirme ile Bellek Genişletme Örneği")
st.divider()

col_input, col_docs, col_uniquedocs, col_rerankeddocs, col_response = st.columns([1,2,2,2,1])

with col_input:
    target_url = st.text_input(label="Hedef Web Adresini Giriniz", value="https://cbarkinozer.medium.com/reg%C3%BCle-edilmemi%C5%9F-yapay-zeka-teknolojileri-kullanman%C4%B1n-tehlikeleri-nelerdir-fa465da15491")
    original_prompt = st.text_input(label="Sorunuzu Giriniz:", value="Yapay zeka kullanımının yol açabileceği olumsuz durumlar nelerdir?")
    submit_btn = st.button(label="Gönder")
    st.divider()

with col_docs:
    st.empty()

with col_uniquedocs:
    st.empty()

with col_rerankeddocs:
    st.empty()

with col_response:
    st.empty()

if submit_btn:
  #Generate alternative queries and show
  with st.spinner("Soru havuzu oluşturuluyor...."):
    query_list = multiqueryhelper.generate_multi_query(original_prompt=original_prompt)

    col_input.markdown("SORU HAVUZU")
    st.divider()
    for query in query_list:
      col_input.markdown(f'**{query}**')

  #Get relevant documents for each query and show
  retrieved_documents = []

  for query in query_list:
    relevant_documents = multiqueryhelper.get_relevant_documents(target_url=target_url, prompt=query)

    retrieved_documents.extend(relevant_documents)

  col_docs.code(f"Bulunan Doküman Sayısı: {len(retrieved_documents)}")

  for retrieved_doc in retrieved_documents:
    col_docs.error(f"ID: {retrieved_doc.metadata['doc_id']} | {retrieved_doc.page_content}")

  #Get unique documents out of all retrieved documents and show
  final_documents = multiqueryhelper.get_unique_documents(retrieved_documents=retrieved_documents)

  col_uniquedocs.code(f"Bulunan Özgün Doküman Sayısı: {len(final_documents)}")

  for final_doc in final_documents:
    col_uniquedocs.warning(f"ID: {final_doc.metadata['doc_id']} | {final_doc.page_content}")

  #Get reranked documents and show

  reranked_docs = multiqueryhelper.get_reranked_documents(documents=final_documents, query=original_prompt)

  col_rerankeddocs.code(f"Yeniden Sıralanmış Doküman Sayısı: {len(reranked_docs)}")

  for reranked_doc in reranked_docs:
    col_rerankeddocs.info(f"ID: {reranked_doc.metadata['doc_id']} | {reranked_doc.page_content}")

  #Get AI response and show
  AI_Response = multiqueryhelper.run_rag(relevant_documents=reranked_docs, prompt=original_prompt)
  col_response.code("NİHAİ YANIT")
  col_response.success(AI_Response)



In [None]:
!npm install localtunnel
!streamlit run /content/multiquery_rag.py &>/content/logs.txt &
!npx localtunnel --port 8501