In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")

# Validate the required API keys
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not found in .env file.")
if not tavily_api_key:
    raise ValueError("TAVILY_API_KEY not found in .env file.")

In [None]:
import pandas as pd
from transformers import pipeline
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import DataFrameLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
from langchain_openai import ChatOpenAI
import os
import getpass
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.llms import HuggingFaceHub

In [3]:
def get_reteriver() : 
    import pandas as pd
    ucsc_passage_df = pd.read_csv("passage.csv")
    ucsc_passge_data_loader = DataFrameLoader(ucsc_passage_df, page_content_column="passage")
    ucsc_passage_data = ucsc_passge_data_loader.load()
    text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    docs = text_splitter.split_documents(ucsc_passage_data)
    embeddings = HuggingFaceEmbeddings()
    db = FAISS.from_documents(docs, embeddings)
    return db.as_retriever()

retriever = get_reteriver()

Created a chunk of size 265, which is longer than the specified 200
Created a chunk of size 1411, which is longer than the specified 200
Created a chunk of size 294, which is longer than the specified 200
Created a chunk of size 395, which is longer than the specified 200
Created a chunk of size 298, which is longer than the specified 200
Created a chunk of size 324, which is longer than the specified 200
Created a chunk of size 470, which is longer than the specified 200
Created a chunk of size 662, which is longer than the specified 200
Created a chunk of size 451, which is longer than the specified 200
Created a chunk of size 245, which is longer than the specified 200
Created a chunk of size 226, which is longer than the specified 200
Created a chunk of size 580, which is longer than the specified 200
Created a chunk of size 1970, which is longer than the specified 200
Created a chunk of size 617, which is longer than the specified 200
Created a chunk of size 312, which is longer t

In [4]:
from langchain_tavily import TavilySearch
from langchain.schema import HumanMessage
from typing import List, TypedDict
import re
# initialize once with your key
tavily = TavilySearch(
    max_results=5,
    search_depth="advanced",
    include_answer=True,
    include_raw_content=True,
    api_key=tavily_api_key
)
from langchain.schema import HumanMessage, SystemMessage

In [5]:
# --- Helpers---
llm = ChatOpenAI(temperature=0.0, model_name="gpt-3.5-turbo")

def vector_search(query: str) -> List[str]:
    docs: List[Document] = retriever.get_relevant_documents(query)
    # extract only the page_content
    return [d.page_content for d in docs]
    

def web_search(query: str) -> List[str]:
    """
    Perform a Tavily search and return a list of raw-content strings,
    regardless of whether the hits come back as objects, dicts, or strings.
    """
   
    return tavily.invoke(query)  



def score_docs(docs: List[str], query: str) -> float:
    """
    Use LLM to rate relevance of `docs` to the UCSC-specific `query`
    on a 0.0–1.0 scale.
    """
    # Ensure UCSC context in the query rewrite
    refined_query = f"{query} (UCSC = University of California Santa Cruz)"

    # Build numbered list of docs
    doc_list = "\n".join(f"{i+1}. {d}" for i, d in enumerate(docs))
    prompt = (
        "You are evaluating how well each document helps answer a question about UCSC "
        "(University of California Santa Cruz). "
        "Rate relevance from 0.0 (irrelevant) to 1.0 (fully relevant).\n\n"
        f"Query: {refined_query}\n\n"
        f"Documents:\n{doc_list}\n\n"
        "Respond with ONLY a single number between 0.0 and 1.0."
    )

    # Invoke the LLM
    resp = llm.invoke([HumanMessage(content=prompt)])
    text = resp.content.strip()

    # Parse and clamp
    try:
        score = float(text)
        return max(0.0, min(1.0, score))
    except ValueError:
        return 0.0




def rewrite_query(query: str) -> str:
    """
    Rewrite for vector retrieval against a UCSC-focused FAISS index.
    Injects UCSC context and academic keywords for maximal semantic relevance.
    """
    prompt = (
        "You’re rewriting a student’s question into a concise vector-search query "
        "targeting UCSC documents.  \n"
        "- Always include “UCSC (University of California Santa Cruz)”.\n"
        "- Add academic keywords like course codes, department names, document types.\n"
        "- Keep it to 3–6 key terms or short phrases.\n\n"
        f"Original question: {query}\n\n"
        "Vector-search query:"
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    return resp.content.strip()





def rewrite_web_query(query: str) -> str:
    """
    Rewrite the user’s query to be web-search–friendly, explicitly
    mentioning UCSC (University of California Santa Cruz) and
    encouraging site-specific terms or forum keywords.
    """
    prompt = (
        "You’re converting a student’s question into an advanced web search query "
        "about UCSC (University of California Santa Cruz).  \n"
        "- Mention “UCSC (University of California Santa Cruz)” explicitly.\n"
        "- Favor site-specific qualifiers (e.g., site:ucsc.edu) or student/departmental forums.\n"
        "- Keep it concise—aim for a 5–8 word search string.\n\n"
        f"Original question: {query}\n\n"
        "Web search query:"
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    return resp.content.strip()


def synthesize_answer(context: str, query: str) -> str:
    """
    Synthesize a final, conversational answer by combining
    the UCSC-specific context with the original query.
    """
    system_msg = SystemMessage(content=(
    "You are a UCSC assistant. Only use the provided context and the original query to answer. "
    "Do NOT hallucinate or invent facts. If you’re not 100% sure, say “I’m sorry, I don’t know.”"
))

    user_prompt = (
        f"Original Query: {query}\n\n"
        f"Context from UCSC sources:\n{context}\n\n"
        "Based on this information and the query above, "
        "provide a clear and direct answer. "
        "If information is partial, focus on what you DO know rather than what you don't."
    )
    resp = llm.invoke([system_msg, HumanMessage(content=user_prompt)])
    return resp.content.strip()






import json

def break_into_cot_questions( query: str) -> List[str]:
    """
    Ask the LLM to return exactly a JSON list of subquestions,
    each mentioning UCSC (University of California Santa Cruz).
    """
    prompt = (
        "You are a UCSC (University of California Santa Cruz) assistant.  \n"
        "Break down this query into 3–5 logical subquestions.  \n"
        "Output MUST be valid JSON, a top-level array of strings.  \n\n"
        f"Query: {query}\n\n"
        "Example output:\n"
        '["What is …?", "How does …?"]\n\n'
        "Now your output:"
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    content = resp.content.strip()

    try:
        questions = json.loads(content)
        if isinstance(questions, list) and all(isinstance(q, str) for q in questions):
            return questions
    except json.JSONDecodeError:
        pass

    # Fallback if JSON fails
    return [q.strip() for q in content.split("\n") if q.strip()]


def score_answer(answer: str, query: str) -> float:
    """
    Ask the LLM how confident it is (0.0–1.0) that `answer` fully and correctly
    addresses `query` in a UCSC context.
    """
    prompt = (
        "You are grading the quality of an answer to a UCSC (University of California Santa Cruz) question.\n\n"
        f"Query: {query}\n\n"
        f"Answer: {answer}\n\n"
        "On a scale from 0.0 (not at all confident) to 1.0 (extremely confident), "
        "how well does this answer address the query accurately and completely? "
        "Respond with ONLY a single number."
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    text = resp.content.strip()
    try:
        score = float(text)
        return max(0.0, min(1.0, score))
    except ValueError:
        return 0.0

def rerank_docs(docs: List[str], query: str, top_k: int = 3) -> List[str]:
    """
    Use the LLM to select the top_k most relevant UCSC docs to the query.
    """
    numbered = "\n".join(f"{i+1}. {d}" for i, d in enumerate(docs))
    prompt = (
        f"You are selecting the {top_k} most relevant UCSC (University of California Santa Cruz) documents "
        f"for the query: {query}\n\n"
        f"Documents:\n{numbered}\n\n"
        "Respond with a JSON list of the 1-based indices of the top documents, e.g. [1,3,5]."
    )
    resp = llm.invoke([HumanMessage(content=prompt)])
    try:
        inds = json.loads(resp.content.strip())
        return [docs[i-1] for i in inds if 1 <= i <= len(docs)]
    except Exception:
        return docs[:top_k]



In [6]:

from langgraph.graph import StateGraph, START, END

# --- State Schema ---
class State(TypedDict):
    query: str
    cot_questions: List[str]
    docs: List[str]
    answer: str
    direct: int  


# --- Retriever Tool Stub ---
def retriever_tool(query: str, use_web: bool = True) -> str:
    # Vector search + rewrite + web fallback
    docs = vector_search(query)
    score = score_docs(docs, query)
    retries = 0
    while score < 0.6 and retries < 3:
        query = rewrite_query(query)
        docs = vector_search(query)
        score = score_docs(docs, query)
        retries += 1
    if  use_web and score < 0.6:
        web_docs = web_search(query)
        web_score = score_docs(web_docs, query)
        web_retries = 0
        while web_score < 0.6 and web_retries < 3:
            query = rewrite_web_query(query)
            web_docs = web_search(query)
            web_score = score_docs(web_docs, query)
            web_retries += 1
        docs = web_docs if web_score > score else docs
    return "\n".join(docs)




def agent_node(state: State) -> State:
    # 1) Initial retrieval by calling your Python function directly
    raw = retriever_tool(state["query"], use_web=False)
    initial_docs = raw.split("\n")

    initial_answer = synthesize_answer(initial_docs, state["query"])
    conf_initial = score_answer(initial_answer, state["query"])

    if conf_initial >= 0.6:
        return {**state, "docs": initial_docs, "answer": initial_answer, "direct": 1}

    # 3) CoT fallback
    qs = break_into_cot_questions(state["query"])
    all_docs: List[str] = []
    

    for q in qs:
        part = retriever_tool(q, use_web=True)   # part is a "\n"-joined string
        chunk_docs = part.split("\n")
        chunk_docs = rerank_docs(chunk_docs, q, top_k=5)
        all_docs.extend(chunk_docs)                  


    # all_docs = rerank_docs(all_docs, state["query"], top_k=len(qs) * 2)
    context = "\n".join(all_docs)
    final_answer = synthesize_answer( context, state["query"])
    conf_final = score_answer(final_answer, state["query"])

    if conf_final < 0.6:
        return {**state, "docs": [], "answer": "NO_ANSWER", "direct": 3 }

    return {**state, "docs": all_docs, "answer": final_answer, "direct" : 2}



# --- Build Graph ---
graph = StateGraph(State)
graph.add_node("agent", agent_node)
graph.set_entry_point("agent")
graph.add_edge("agent", END)
app = graph.compile()


In [7]:
init_state = {"query": "What classes does UCSC Extension Silicon Valley Campus provide?",
                  "cot_questions": [],
                  "docs": [],
                  "answer": ""}
result = app.invoke(init_state)
print(result["answer"])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  docs: List[Document] = retriever.get_relevant_documents(query)


UCSC Extension Silicon Valley Campus provides UC-approved courses, professional certificate programs, skills-based intensives, and boot camps to students and professionals at every stage of their careers.


In [8]:
result["direct"]

1

In [9]:
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics import faithfulness, answer_relevancy, context_recall, context_precision

question_list = []
gen_ans_list  = []
searched_docs = []
src_ans_list  = []

def run_query(query: str) -> dict:
    init_state = {
        "query":         query,
        "cot_questions": [],
        "docs":          [],
        "answer":        "",
        "direct":        0
    }
    return app.invoke(init_state)



In [10]:
df = pd.read_csv("new_qa.csv")  # expects columns "questions" and "answers"

for idx, row in df.iterrows():
    q = row["questions"]
    a = row["answers"]            # ground-truth
    print(f"Processing #{idx+1}: {q}")

    # Always record the true answer
    src_ans_list.append(a)

    # Run your LangGraph agent
    final_state = run_query(q)

    # Skip if NO_ANSWER
    if final_state["answer"] == "NO_ANSWER":
        continue

    # Otherwise collect
    question_list.append(q)
    gen_ans_list.append(final_state["answer"])
    searched_docs.append(final_state["docs"])

Processing #1: Who can be a member of the Boating Club?
Processing #2: What classes does the boating center offer to community members?
Processing #3: When is the boating club open?
Processing #4: What boats are available to members?
Processing #5: How much does a boating membership cost?
Processing #6: How do I sign up for a boating membership?
Processing #7: How can I join the Boating Club?
Processing #8: Are there additional fees to take out the boats after I join the boating club?
Processing #9: Do the boating club rent Kayaks?
Processing #10: How can I get involved with the Boating Club?
Processing #11: Why is health insurance required at UCSC?
Processing #12: How do I get UC SHIP?
Processing #13: How do I get my UC SHIP ID card?
Processing #14: Am I eligible if I opted out of UC SHIP last year?
Processing #15: Am I eligible for UC SHIP if I've graduated? 
Processing #16: How much does UC SHIP cost?
Processing #17: Will financial aid pay for my UC SHIP?
Processing #18: Will Medi-C

In [11]:
df = pd.read_csv("new_qa.csv")
df["questions"][43]

'What classes does UCSC Extension Silicon Valley provide?'

In [None]:
df_out = pd.DataFrame({
    "question":         question_list,
    "generated_answer": gen_ans_list,
    "supporting_docs":  searched_docs
})
df_out.to_csv("HybridRAG_generated_outputsFinal.csv", index=False)


In [12]:
len(gen_ans_list), len(question_list), len(searched_docs), len(src_ans_list)

(107, 107, 107, 112)

In [13]:
gt_map = dict(zip(df["questions"], df["answers"]))

# 2) filter in the order of your actually‐answered question_list
src_ans_list = [gt_map[q] for q in question_list]

In [14]:
len(gen_ans_list), len(question_list), len(searched_docs), len(src_ans_list)

(107, 107, 107, 107)

In [15]:
ragas_dataset = Dataset.from_dict({
    "question":           question_list,
    "ground_truth":       src_ans_list,
    "answer":             gen_ans_list,
    "retrieved_contexts": searched_docs
})

In [16]:
metrics = [faithfulness, answer_relevancy, context_recall, context_precision]
run_cfg = RunConfig(
    # timeout=60,
    max_retries=10
    # max_wait=180,
    # max_workers=2
)

results = evaluate(
    dataset=ragas_dataset,
    metrics=metrics,
    run_config=run_cfg,
    raise_exceptions=False
)

# 5) Show per‐example and average scores
results_df = results.to_pandas()
print(results_df)

Evaluating: 100%|█████████████████████████████| 428/428 [05:22<00:00,  1.33it/s]


                                            user_input  \
0             Who can be a member of the Boating Club?   
1    What classes does the boating center offer to ...   
2                       When is the boating club open?   
3                 What boats are available to members?   
4             How much does a boating membership cost?   
..                                                 ...   
102  What if I don't fulfill the general education ...   
103                                    What is UC TAP?   
104              When are acceptance notices sent out?   
105  What are the Cross-Campus and Simultaneous Enr...   
106    Can I talk to an adviser during a campus visit?   

                                    retrieved_contexts  \
0    [How can I join the Boating Club?, The boating...   
1    [UC Santa Cruz Community Boating Center is a c...   
2    [Boating Club Hours, The Community Boating Cen...   
3    [Our boats:, Sailing vessels for weekend use c...   
4    [Cost, C

In [18]:
avg_scores = results_df[[
    "faithfulness","answer_relevancy","context_recall","context_precision"
]].mean()
print("Average Scores:")
print(avg_scores)

Average Scores:
faithfulness         0.881475
answer_relevancy     0.827332
context_recall       0.535358
context_precision    0.468474
dtype: float64
