<a href="https://colab.research.google.com/github/kmk4444/Retrieval-augmented-generation/blob/main/RAG_app_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#raghelper.py
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS # it is vector database. we will use FAISS database. It is created by Facebook.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import CohereEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
import os
from dotenv import load_dotenv

# API keys are read from the environment (optionally populated from a local
# .env file by load_dotenv) so that real secrets never have to be written into
# this source file. When a variable is not set, the "---" placeholder is used,
# which is the value the keys were previously hard-coded to.
load_dotenv()
my_key_openai = os.getenv("openai_apikey", "---")
my_key_google = os.getenv("google_apikey", "---")
my_key_cohere = os.getenv("cohere_apikey", "---")
my_key_hf = os.getenv("huggingface_access_token", "---")

# Chat model that the helper functions in this module call to produce answers.
llm_gemini = ChatGoogleGenerativeAI(google_api_key=my_key_google, model="gemini-pro")

# Embedding model handed to FAISS when the document chunks are indexed.
embeddings = OpenAIEmbeddings(api_key=my_key_openai)

def ask_gemini(prompt):
  """Send ``prompt`` to the Gemini chat model and return the reply text.

  Args:
    prompt: Input forwarded unchanged to ``llm_gemini.invoke``.

  Returns:
    The ``content`` attribute of the model's response.
  """
  return llm_gemini.invoke(prompt).content

def rag_with_url(target_url, prompt):
  """Answer ``prompt`` using text retrieved from the page at ``target_url``.

  The page is loaded, split into chunks, embedded into a FAISS vector store,
  and the chunks returned by the retriever for ``prompt`` are inserted into
  the final prompt that is sent to the Gemini model.

  Args:
    target_url: URL handed to ``WebBaseLoader``.
    prompt: The user's question; used both as the retrieval query and as the
      question embedded in the final prompt.

  Returns:
    The ``content`` attribute of the model's response.
  """
  # Load the page; the loader returns a list of documents.
  loader = WebBaseLoader(target_url)
  raw_documents = loader.load()

  # Split the documents into chunks. The splitter also works with its default
  # parameters, but these values are worth tuning for answer quality.
  text_splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=0,
      length_function=len,  # chunk length is measured with the built-in len
  )
  splitted_document = text_splitter.split_documents(raw_documents)

  # FAISS embeds every chunk with the module-level embedding model and keeps
  # the resulting vectors in a vector store.
  vectorstore = FAISS.from_documents(splitted_document, embeddings)

  # The retriever does not return every chunk, only those it ranks as most
  # similar to the prompt (the count is the retriever's default).
  retriever = vectorstore.as_retriever()
  relevant_documents = retriever.get_relevant_documents(prompt)

  # Each document carries metadata and page_content; only the text is needed
  # as context. Every chunk is prefixed with a single space.
  context_data = "".join(" " + document.page_content for document in relevant_documents)

  final_prompt = f"""Şöyle bir sorum var: {prompt}
  Bu soruyu yanıtlamak için elimizde şu bilgiler var: {context_data} .
  Bu sorunun yanıtını vermek için yalnızca sana burada verdiğim eldeki bilgileri kullan. Bunların dışına asla çıkma.
  """
  # Send the context-augmented prompt. Passing the bare ``prompt`` here would
  # discard the retrieved context and turn this into a plain LLM call.
  AI_Response = llm_gemini.invoke(final_prompt)

  return AI_Response.content

def rag_with_pdf(filepath, prompt):
  """Answer ``prompt`` using text retrieved from the PDF at ``filepath``.

  The PDF is loaded, split into chunks, embedded into a FAISS vector store,
  and the chunks returned by the retriever for ``prompt`` are inserted into
  the final prompt that is sent to the Gemini model.

  Args:
    filepath: Path handed to ``PyPDFLoader``.
    prompt: The user's question; used both as the retrieval query and as the
      question embedded in the final prompt.

  Returns:
    A tuple ``(answer_text, relevant_documents)``: the ``content`` of the
    model's response and the list of documents returned by the retriever.
  """
  loaded_docs = PyPDFLoader(filepath).load()

  splitter = RecursiveCharacterTextSplitter(
      chunk_size=1000,
      chunk_overlap=0,
      length_function=len,
  )
  chunks = splitter.split_documents(loaded_docs)

  # Index the chunks with the module-level embedding model, then fetch the
  # ones the retriever ranks as most relevant to the question.
  index = FAISS.from_documents(chunks, embeddings)
  matches = index.as_retriever().get_relevant_documents(prompt)

  # Concatenate the retrieved text, each chunk prefixed with a single space.
  context_data = "".join(" " + match.page_content for match in matches)

  final_prompt = f"""Şöyle bir sorum var: {prompt}
  Bu soruyu yanıtlamak için elimizde şu bilgiler var: {context_data} .
  Bu sorunun yanıtını vermek için yalnızca sana burada verdiğim eldeki bilgileri kullan. Bunların dışına asla çıkma.
  """
  reply = llm_gemini.invoke(final_prompt)

  return reply.content, matches