In [2]:
import pandas as pd

In [3]:
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import CrossEncoder
from langchain_community.chat_message_histories import SQLChatMessageHistory
from groq import Groq

In [None]:
# === Initializing re-ranker, embedding models and FAISS ===
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.load_local("Backend/faiss_index_luddy", embedding_model, allow_dangerous_deserialization = 'True')


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [5]:
# === PATH TO FOLDER ===
folder_path = "Final_scraped_txts"  

# === LOAD TEXT FILES ===
def load_documents_from_folder(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            loader = TextLoader(file_path, encoding='utf-8')
            documents.extend(loader.load())
    return documents
docs = load_documents_from_folder(folder_path)
print(f"📄 Loaded {len(docs)} documents.")

📄 Loaded 633 documents.


In [6]:
# === SPLIT INTO CHUNKS ===
def split_documents(docs, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)


In [7]:
for doc in docs:
    web_url = ""
    web_url = doc.page_content[:400].split('\n')[0][14:]
    doc.metadata['webURL'] = web_url
    

In [8]:
split_docs = split_documents(docs)
print(f"Split into {len(split_docs)} chunks.")

Split into 4716 chunks.


In [9]:
split_docs[:10]

[Document(metadata={'source': 'Final_scraped_txts\\aaai.indiana.edu_.txt', 'webURL': 'https://aaai.indiana.edu/'}, page_content='[Source URL]: https://aaai.indiana.edu/'),
 Document(metadata={'source': 'Final_scraped_txts\\aaai.indiana.edu_.txt', 'webURL': 'https://aaai.indiana.edu/'}, page_content='Upcoming Events\nApr\n16\nAAAI 50th Exhibit\nApr\n17\nAAAI 50th Exhibit\nApr\n18\nAAAI 50th Exhibit\nApr\n19\nAAAI 50th Exhibit\nApr\n20\nAAAI 50th Exhibit\nApr\n21\nAAAI 50th Exhibit\nApr\n22\nAAAI 50th Exhibit\nApr\n23\nAAAI 50th Exhibit\nApr\n24\nAAAI 50th Exhibit\nApr\n25\nAAAI 50th Exhibit\nApr\n26\nAAAI 50th Exhibit\nApr\n26\nIU Soul Revue Spring Concert\nApr\n27\nAAAI 50th Exhibit\nApr\n28\nAAAI 50th Exhibit\nApr\n29\nAAAI 50th Exhibit\nApr\n30\nAAAI 50th Exhibit\nMay\n01\nAAAI 50th Exhibit\nMay\n02\nAAAI 50th Exhibit\nMay\n03\nAAAI 50th Exhibit\nMay\n03\nAfrican American Choral Ensemble Spring Concert\nMay\n04\nAAAI 50th Exhibit\nMay\n05\nAAAI 50th Exhibit\nMay\n06\nAAAI 50th Exhibi

In [10]:
dframe = []
for sd in split_docs:
    dframe.append("Source: "+sd.metadata['webURL']+"\n\n"+sd.page_content)
# print(*dframe, sep='\n')

In [11]:
df = pd.DataFrame(dframe, columns=["text"])
# print(df)

In [1]:
from giskard.rag import generate_testset, KnowledgeBase

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import giskard

# load ypur openai-api-key
os.environ["OPENAI_API_KEY"] = "sk-proj-gsk_abcdefg1234567890abcdefg1234567890abcdefg1234567890"

# Optional, setup a model (default LLM is gpt-4o, default embedding model is text-embedding-3-small)
giskard.llm.set_llm_model("gpt-4o")
giskard.llm.set_embedding_model("text-embedding-3-small")


In [14]:
knowledge_base = KnowledgeBase(df)

In [None]:
from giskard.rag import generate_testset

testset = generate_testset(
    knowledge_base,
    num_questions=60,
    
    agent_description="You are LuddyBot, a professional AI assistant for Indiana University's Luddy School, designed to provide clear and concise responses to student queries strictly related to IU, Luddy, or academic/student services. You include accurate IU/Luddy links, handle context shifts, track conversation flow, and avoid making up answers—politely declining off-topic or inappropriate questions.",
)

In [None]:
# Save the generated testset
testset.save("my_testset.jsonl")

In [None]:
# Save the generated testset
testset.save("rag_test_data.json")

In [20]:
# You can easily load it back
from giskard.rag import QATestset

loaded_testset = QATestset.load("my_testset.jsonl")

In [21]:
# Convert it to a pandas dataframe
df = loaded_testset.to_pandas()

In [32]:
test_set_df = testset.to_pandas()

In [33]:
from langchain.prompts import PromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
print(prompt.format(context="Here is some context", question="Here is a question"))


Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: Here is some context

Question: Here is a question



In [34]:
# === Initializing re-ranker, embedding models and FAISS ===
reranker_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.load_local("faiss_index_luddy", embedding_model, allow_dangerous_deserialization = 'True')

In [35]:
# === Function to perform hybrid search ===
def hybrid_search(query, top_k=20):
    semantic_results = db.similarity_search_with_score(query, k=top_k * 2)
    keyword_filtered = [(doc, score) for doc, score in semantic_results if query.lower() in doc.page_content.lower()]
    if len(keyword_filtered) >= top_k:
        return keyword_filtered[:top_k]
    else:
        extra_needed = top_k - len(keyword_filtered)
        additional = [item for item in semantic_results if item not in keyword_filtered]
        return keyword_filtered + additional[:extra_needed]

In [None]:
# === Final answer generation function ===
def generate_llama_answer(query, top_k=5, groq_api_key=None):
    # Step 1: Retrieve top documents
    hybrid_results = hybrid_search(query, top_k=20)
    pairs = [(query, doc.page_content) for doc, _ in hybrid_results]

    # Step 2: Rerank using cross-encoder
    scores = reranker_model.predict(pairs)
    reranked = list(zip([doc for doc, _ in hybrid_results], scores))
    reranked.sort(key=lambda x: x[1], reverse=True)
    top_docs = reranked[:top_k]
    print("----------------------- top-docs -------------------------------------")
    return top_docs
    print("------------------------------------------------------------")

In [None]:
# GROQ_API_KEY = "gsk_gsk_abcdefg1234567890abcdefg1234567890abcdefg1234567890"     # put your API key here or load from env
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

In [45]:
# === Final answer generation function ===
def generate_llama_answer(query, top_k=5):
    # Step 1: Retrieve top documents
    hybrid_results = hybrid_search(query, top_k=20)
    pairs = [(query, doc.page_content) for doc, _ in hybrid_results]

    # Step 2: Rerank using cross-encoder
    scores = reranker_model.predict(pairs)
    reranked = list(zip([doc for doc, _ in hybrid_results], scores))
    reranked.sort(key=lambda x: x[1], reverse=True)
    top_docs = reranked[:top_k]
    # print("----------------------- top-docs -------------------------------------")
    # print(top_docs[:1])
    # print("------------------------------------------------------------")

    # Step 3: Create context and prompt
    # context = "\n\n".join([doc.page_content for doc, _ in top_docs])
    context = "\n\n".join([f"Source: {doc.metadata.get('webURL', 'N/A')} \n{doc.page_content}"   for doc, _ in top_docs])
    # print("LLama context: ----> ", context)
    prompt = f"""Answer the following question in a clear and structured format based on the provided context. Please also provide with the necessary website links. Don't mention "context" or "based on context" in the response.
            Context:{context}
            Question: {query}
            Answer:"""

    # Step 4: Generate answer using Groq's LLaMA model
    client = Groq(api_key=GROQ_API_KEY)
    completion = client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[{"role": "user", "content": prompt}]
    )

    return completion.choices[0].message.content.strip()

In [46]:
generate_llama_answer('What should libraries do to prepare for challenges related to censorship?')

"According to the American Library Association (ALA), libraries should prepare themselves to deal with challenges related to censorship by:\n\n1. **Adopting appropriate policies and procedures**: Libraries should have clear policies and procedures in place to handle challenges and complaints about materials and services.\n\n2. **Practicing inclusion**: Libraries should counter censorship by providing inclusive materials, programs, and services that reflect diverse voices and perspectives.\n\n3. **Encouraging diverse collections and programming**: Libraries should proactively seek to include an abundance of resources and programming representing the greatest possible diversity of genres, ideas, and expressions.\n\n4. **Respectfully considering community objections and complaints**: Libraries should consider objections and complaints from the community, but should not allow controversy alone to dictate policy.\n\n5. **Discouraging self-censorship**: Governing bodies, administrators, and 

In [47]:
from giskard.rag import evaluate

report = evaluate(generate_llama_answer, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent: 100%|██████████| 60/60 [11:34<00:00, 11.57s/it]
CorrectnessMetric evaluation: 100%|██████████| 60/60 [00:50<00:00,  1.20it/s]


In [48]:
display(report)

In [49]:
report.to_html("report.html")

In [50]:
report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.9
conversational,0.0
distracting element,0.8
double,0.7
simple,0.7
situational,0.5


In [52]:
len(report.get_failures())

24

In [51]:
report.get_failures()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata,agent_answer,correctness,correctness_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3ab4ac57-9d65-4552-bf49-bb4b62d93db1,Which course is listed in the CYGLPOLBS progra...,The GNDR course is listed in the CYGLPOLBS pro...,Document 523: Source: https://bulletin.college...,[],"{'question_type': 'simple', 'seed_document_id'...",To identify the course that is listed in the C...,False,The agent stated that it cannot identify a spe...
989be6f9-ee49-4bd1-8d3e-cd59c7945e51,What awards and recognitions has Apu Kapadia r...,Apu Kapadia's work on accountable anonymity ea...,Document 3222: Source: https://luddy.indiana.e...,[],"{'question_type': 'simple', 'seed_document_id'...",Apu Kapadia has received the following awards ...,False,The agent provided a comprehensive list of awa...
7f599a33-e209-4be7-938b-0fb726e9725e,What are some of the master's programs offered...,Some of the master's programs offered include ...,Document 4368: Source: https://spea.indiana.ed...,[],"{'question_type': 'simple', 'seed_document_id'...",Indiana University's School of Public and Envi...,False,The agent only mentioned the Master of Arts in...
c175eaa1-b550-4038-a320-4db336f98e64,Under what specific theme will CNETS Professor...,"The theme of the panel is 'AI, society and org...",Document 612: Source: https://cnets.indiana.ed...,[],"{'question_type': 'complex', 'seed_document_id...",**Question:** Under what specific theme will C...,False,The agent stated that the theme is about explo...
911019bf-cd86-41ba-867b-05d4cde637f6,Could you guide me on where to find informatio...,You can find information about events at India...,Document 682: Source: https://cnets.indiana.ed...,[],"{'question_type': 'distracting element', 'seed...",To find information about events at Indiana Un...,False,The agent provided a detailed guide on finding...
ed27a5be-7360-4419-a826-17da72192238,If I am an international student at IU and wis...,If you wish to work more than 20 hours per wee...,Document 3888: Source: https://ois.iu.edu/stud...,[],"{'question_type': 'distracting element', 'seed...",To work more than 20 hours per week as an inte...,False,The agent provided a detailed process for obta...
f6f1f63e-b3eb-4ec8-b57b-cb45f9b513d4,"Hi, as a prospective international student int...",You can find the faculty directory for researc...,Document 3289: Source: https://luddy.indiana.e...,[],"{'question_type': 'situational', 'seed_documen...",To find the faculty directory for research are...,False,The agent provided a general method to find fa...
d7fdddad-10c2-467e-99cf-aa37a7296725,"Hi, as a prospective student considering enrol...",Any oral or written request should be referred...,Document 2002: Source: https://global.iu.edu/r...,[],"{'question_type': 'situational', 'seed_documen...",Here is a clear and structured answer to your ...,False,The agent provided contact information for adm...
7ea4be6f-bdf4-4713-8b9a-04953bf41ee8,"Hi, I'm considering applying to the master's p...",The retention rate for Oklahoma State Universi...,Document 1721: Source: https://fortune.com/edu...,[],"{'question_type': 'situational', 'seed_documen...","Based on the provided sources, I can help you ...",False,The agent provided the retention rate for Okla...
b6f0099d-c556-477b-95d0-f344231e1fc3,"Hi, I'm considering applying to the Master of ...",Jona Jaha had one-of-a-kind opportunities to l...,Document 4360: Source: https://spea.indiana.ed...,[],"{'question_type': 'situational', 'seed_documen...","Unfortunately, I couldn't find any information...",False,The agent did not provide specific information...
