In [1]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
import nltk
from langchain_text_splitters import NLTKTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from IPython.display import Markdown as md
from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from langchain_core.language_models.base import BaseLanguageModel
import os
import glob
load_dotenv()  
key = os.getenv("GOOGLE_API_KEY")
chat_model = ChatGoogleGenerativeAI(google_api_key=key, 
                                   model="gemini-1.5-flash-latest")
pdf_files = glob.glob("data/*.pdf")
pages = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    pages.extend(loader.load_and_split())
text_splitter = NLTKTextSplitter(chunk_size=5000, chunk_overlap=1000)

chunks = text_splitter.split_documents(pages)
embedding_model = GoogleGenerativeAIEmbeddings(google_api_key=key, model="models/embedding-001")
db = Chroma.from_documents(chunks, embedding_model, persist_directory="chroma_db_")
db.persist()
db_connection = Chroma(persist_directory="chroma_db_", embedding_function=embedding_model)

  from .autonotebook import tqdm as notebook_tqdm
  db.persist()
  db_connection = Chroma(persist_directory="chroma_db_", embedding_function=embedding_model)


In [2]:

retriever = db_connection.as_retriever(search_kwargs={"k": 10})
chat_template = ChatPromptTemplate.from_messages([
    SystemMessage(content="""You are a helpful academic assistant.
    Please answer the question using only the provided context. 
    Do not include any explanations or additional information beyond what is asked.
    If the context does not contain enough information, say "I don't know" rather than making up an answer."""),
    HumanMessagePromptTemplate.from_template("""Answer the question based on the given context.
    Context: {context}
    Question: {question}
    Answer: """)
])

output_parser = StrOutputParser()


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | chat_template
    | chat_model
    | output_parser
)

def base_rag(query):
    docs = retriever.invoke(query)
    
    context = format_docs(docs)
    
    chain = (
        {"context": lambda _: context, "question": lambda _: query}
        | chat_template
        | chat_model
        | output_parser
    )
    
    answer = chain.invoke(query)

    return {
        "question": query,
        "answer": answer,
        "contexts": [doc.page_content for doc in docs]
    }



In [3]:
from datasets import Dataset

examples = [
    {"question": "What is the role of aggregate functions in SQL?", "ground_truth": "They perform calculations on sets of values."},
    {"question": "Define relationship in the E-R model.", "ground_truth": "An association among several entities."},
    {"question": "What is the purpose of a canonical cover?", "ground_truth": "A minimal set of functional dependencies equivalent to the original."},
    {"question": "What is the main goal of a DBMS?", "ground_truth": "To provide efficient and convenient access to data."},
    {"question": "List three applications of DBMS.", "ground_truth": "Banking, Airlines, Manufacturing."},
    {"question": "How does UNION differ from INTERSECT in SQL?", "ground_truth": "UNION merges results, INTERSECT finds common rows."},
    {"question": "Define data independence.", "ground_truth": "Ability to modify schema at one level without affecting the next."},
    {"question": "What is a superkey?", "ground_truth": "A set of attributes that uniquely identify an entity."},
    {"question": "What is normalization in databases?", "ground_truth": "The process of structuring a relational database to reduce redundancy."},
    {"question": "What does the SELECT clause do in SQL?", "ground_truth": "Specifies the attributes to retrieve."},
    {"question": "What is a functional dependency?", "ground_truth": "A constraint between two sets of attributes."},
    {"question": "What is data redundancy in file systems?", "ground_truth": "Duplication of information across files."},
    {"question": "What is a candidate key?", "ground_truth": "A minimal superkey."},
    {"question": "What is a derived attribute in E-R model?", "ground_truth": "An attribute whose values can be derived from other attributes."},
   
]

In [4]:
dataset = Dataset.from_list(examples)
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

results = [base_rag(row["question"]) for row in dataset]

for col in ["answer", "contexts"]:
    if col in dataset.column_names:
        dataset = dataset.remove_columns(col)

dataset = dataset.add_column("answer", [r["answer"] for r in results])
dataset = dataset.add_column("contexts", [r["contexts"] for r in results])

from ragas import evaluate

my_llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key="key")
wrapped_llm = LangchainLLMWrapper(my_llm)

score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
)
print(score)


Evaluating: 100%|██████████| 56/56 [00:53<00:00,  1.04it/s]


{'faithfulness': 0.7857, 'answer_relevancy': 0.7003, 'context_precision': 0.9573, 'context_recall': 0.8571}
