In [None]:
# pip install "ragas>=0.3" datasets pandas google-generativeai langchain-community


In [None]:
import os, google.generativeai as genai
# Use the key already exported in the environment when there is one; only prompt
# when it is missing. This keeps a placeholder from overwriting a real credential
# and keeps the secret out of the saved notebook source.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass
    os.environ["GOOGLE_API_KEY"] = getpass("GOOGLE_API_KEY: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


In [None]:
from langchain_community.llms import ChatGoogleGenerativeAI
from langchain_community.embeddings import GoogleGenerativeAIEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

# Judge model wrapped for Ragas (LLM-as-judge). Temperature is pinned to 0.0 to
# keep scoring as repeatable as the backend allows.
# NOTE(review): ChatGoogleGenerativeAI / GoogleGenerativeAIEmbeddings are normally
# distributed in the `langchain-google-genai` package — confirm that the
# `langchain_community` import paths used by this cell resolve in your environment.
_judge_chat_model = ChatGoogleGenerativeAI(model="gemini-2.0-pro", temperature=0.0)
evaluator_llm = LangchainLLMWrapper(_judge_chat_model)

# Embedding model wrapped for the Ragas metrics that need embeddings.
_judge_embedding_model = GoogleGenerativeAIEmbeddings(model="text-embedding-004")
evaluator_emb = LangchainEmbeddingsWrapper(_judge_embedding_model)


In [None]:
from datasets import Dataset

def make_ragas_dataset(sources, summaries, questions=None, references=None):
    """Build a ``Dataset`` with one evaluation row per (source, summary) pair.

    Each row stores the source text as a one-element ``retrieved_contexts`` list
    and the generated summary as ``response``. The ``question`` and ``reference``
    columns are added only when the matching argument is provided.

    Args:
        sources: Iterable of source texts.
        summaries: Iterable of generated summaries, aligned with ``sources``.
        questions: Optional iterable of prompts, aligned with ``sources``.
        references: Optional iterable of gold summaries, aligned with ``sources``.

    Returns:
        The result of ``Dataset.from_list`` over the assembled rows.

    Raises:
        ValueError: If the provided inputs do not all have the same length.
    """
    columns = {"sources": list(sources), "summaries": list(summaries)}
    if questions is not None:
        columns["questions"] = list(questions)
    if references is not None:
        columns["references"] = list(references)

    # zip() would silently drop trailing items and indexing could misalign rows,
    # so reject mismatched inputs before any row is built.
    sizes = {name: len(values) for name, values in columns.items()}
    if len(set(sizes.values())) != 1:
        raise ValueError(
            f"make_ragas_dataset inputs must all have the same length, got {sizes}"
        )

    rows = []
    for i, (src, pred) in enumerate(zip(columns["sources"], columns["summaries"])):
        rec = {"retrieved_contexts": [src], "response": pred}
        if questions is not None:
            rec["question"] = columns["questions"][i]
        if references is not None:
            rec["reference"] = columns["references"][i]
        rows.append(rec)
    return Dataset.from_list(rows)


In [None]:
import pandas as pd
from ragas import evaluate
from ragas.metrics import (
    SummarizationScore,
    answer_correctness,
    answer_relevancy,
    faithfulness,
    summarization_score,
)

def run_ragas_eval(ds,
                   conciseness_coeff: float = 0.4,
                   use_correctness: bool = True,
                   use_relevancy: bool = True):
    """Score a dataset with Ragas and return the per-row results as a DataFrame.

    Faithfulness and the summarization metric are always requested. Answer
    correctness is added when ``use_correctness`` is set and the dataset has a
    ``reference`` column; answer relevancy is added when ``use_relevancy`` is set
    and the dataset has a ``question`` (or ``user_input``) column.

    Args:
        ds: Dataset exposing ``column_names`` (as built by ``make_ragas_dataset``).
        conciseness_coeff: Weight given to the conciseness term of the
            summarization metric.
        use_correctness: Request answer correctness when a reference exists.
        use_relevancy: Request answer relevancy when a question exists.

    Returns:
        ``evaluate(...).to_pandas()`` with the summarization column named
        ``summarization_score`` and with ``answer_correctness`` /
        ``answer_relevancy`` present. Metrics that were not run are filled with
        0.0, so downstream code cannot tell "skipped" apart from a zero score.
    """
    columns = set(ds.column_names)

    # The Ragas summarization metric reads the source text from
    # `reference_contexts`; mirror `retrieved_contexts` when the dataset only
    # carries the latter so the metric's column validation can succeed.
    if "reference_contexts" not in columns and "retrieved_contexts" in columns:
        ds = ds.add_column("reference_contexts", list(ds["retrieved_contexts"]))

    # `summarization_score` is a ready-made metric instance and is not callable;
    # a configured metric has to be built from the class instead.
    # NOTE(review): `length_penalty` is taken to be the switch for the conciseness
    # term — confirm against the installed ragas version.
    summary_metric = SummarizationScore(length_penalty=True, coeff=conciseness_coeff)

    metrics = [faithfulness, summary_metric]
    if use_correctness and "reference" in columns:
        metrics.append(answer_correctness)
    if use_relevancy and columns & {"question", "user_input"}:
        metrics.append(answer_relevancy)

    res = evaluate(ds, metrics=metrics, llm=evaluator_llm, embeddings=evaluator_emb)
    df = res.to_pandas()

    # Result columns are named after each metric's `name`; expose the
    # summarization result under the column name the rest of the notebook reads.
    if summary_metric.name != "summarization_score" and summary_metric.name in df.columns:
        df = df.rename(columns={summary_metric.name: "summarization_score"})

    for col in ["answer_correctness", "answer_relevancy"]:
        if col not in df.columns:
            df[col] = 0.0
    return df


In [None]:
def blended_score(df: pd.DataFrame,
                  w_faith=0.50, w_sum=0.30, w_corr=0.15, w_rel=0.05):
    """Return a copy of ``df`` with a weighted ``reliability_score`` column.

    The four metric columns are clipped to [0, 1]; a metric column that is
    absent is created as 0.0 and therefore counts against the blend. The score
    is the weighted sum of the four columns divided by the sum of the weights
    (floored at 1e-9 to avoid dividing by zero).

    The input frame is left untouched so the raw, unclipped scores remain
    available to the caller and re-running the cell gives the same result.

    Args:
        df: Per-row metric scores.
        w_faith: Weight of ``faithfulness``.
        w_sum: Weight of ``summarization_score``.
        w_corr: Weight of ``answer_correctness``.
        w_rel: Weight of ``answer_relevancy``.

    Returns:
        A new DataFrame with the clipped metric columns and ``reliability_score``.
    """
    weights = {
        "faithfulness": w_faith,
        "summarization_score": w_sum,
        "answer_correctness": w_corr,
        "answer_relevancy": w_rel,
    }
    out = df.copy()  # never write into the caller's frame
    for col in weights:
        out[col] = out[col].clip(0, 1) if col in out.columns else 0.0
    total = max(sum(weights.values()), 1e-9)
    out["reliability_score"] = sum(w * out[col] for col, w in weights.items()) / total
    return out


In [None]:
def apply_gates(df, min_faith=0.80, min_blended=0.75):
    """Return a copy of ``df`` with boolean pass/fail columns for both thresholds.

    Args:
        df: Frame containing ``faithfulness`` and ``reliability_score`` columns.
        min_faith: Minimum ``faithfulness`` for ``pass_faithfulness`` to be True.
        min_blended: Minimum ``reliability_score`` for ``pass_overall`` to be True.

    Returns:
        A new DataFrame with ``pass_faithfulness`` and ``pass_overall`` added.
        ``pass_overall`` reflects the blended threshold only; it does not also
        require the faithfulness gate. The input frame is not modified.
    """
    gated = df.copy()  # keep the caller's frame unchanged
    gated["pass_faithfulness"] = gated["faithfulness"] >= min_faith
    gated["pass_overall"] = gated["reliability_score"] >= min_blended
    return gated


In [None]:
# Evaluation inputs — replace the placeholder variables with your own texts.
sources = [source_text_1, source_text_2]
summaries = [summary_1, summary_2]  # summaries produced by the generator (Gemini 2.5)
questions = ["Summarize the financial document"] * len(sources)  # shared prompt; may be per document
references = [gold_1, gold_2]  # gold summaries; optional, omit if not available

ds = make_ragas_dataset(sources, summaries, questions, references)

# Score -> blend -> gate, expressed as one chained pipeline.
df = (
    run_ragas_eval(ds, conciseness_coeff=0.4, use_correctness=True, use_relevancy=True)
    .pipe(blended_score, w_faith=0.50, w_sum=0.30, w_corr=0.15, w_rel=0.05)
    .pipe(apply_gates, min_faith=0.80, min_blended=0.75)
)

report_columns = [
    "faithfulness",
    "summarization_score",
    "answer_correctness",
    "answer_relevancy",
    "reliability_score",
    "pass_faithfulness",
    "pass_overall",
]
print(df[report_columns])


In [None]:
import dspy

class RagasReliabilityMetric:
    """Callable metric that scores one prediction with the blended Ragas score.

    Intended for use as a DSPy metric: it accepts the optional ``trace``
    argument that DSPy optimizers pass while bootstrapping demonstrations.

    Args:
        w: Weights forwarded positionally to ``blended_score``
            (faithfulness, summarization, correctness, relevancy).
        conciseness_coeff: Forwarded to ``run_ragas_eval``.
        bootstrap_threshold: When ``trace`` is provided, the metric returns
            whether the score reaches this value, so that low-scoring
            generations are not accepted as demonstrations. Set to ``None`` to
            always return the raw float.
    """

    def __init__(self, w=(0.50,0.30,0.15,0.05), conciseness_coeff=0.4,
                 bootstrap_threshold=0.75):
        self.w = w
        self.conciseness_coeff = conciseness_coeff
        self.bootstrap_threshold = bootstrap_threshold

    def __call__(self, example, pred, trace=None):
        """Score ``pred`` against ``example``.

        Args:
            example: Mapping-like object with ``source`` and, optionally,
                ``reference`` and ``question``.
            pred: Either the summary text itself or an object exposing it as a
                ``summary`` attribute (the output field of the summarizer).
            trace: ``None`` during evaluation; any other value signals
                bootstrapping.

        Returns:
            The blended reliability score as a float, or a bool
            (score >= ``bootstrap_threshold``) when ``trace`` is provided and a
            threshold is configured.
        """
        # Ragas must judge the summary text, not the prediction container.
        summary = getattr(pred, "summary", pred)
        has_reference = "reference" in example
        ds = make_ragas_dataset(
            [example["source"]],
            [summary],
            [example.get("question","Summarize the financial document")],
            [example["reference"]] if has_reference else None,
        )
        df = run_ragas_eval(ds, conciseness_coeff=self.conciseness_coeff,
                            use_correctness=has_reference, use_relevancy=True)
        df = blended_score(df, *self.w)
        score = float(df["reliability_score"].iloc[0])
        # A bare float is truthy for any non-zero score, so return an explicit
        # pass/fail decision while demonstrations are being bootstrapped.
        if trace is not None and self.bootstrap_threshold is not None:
            return score >= self.bootstrap_threshold
        return score

# Summarizer signature; the LM that serves generation (Gemini 2.5) is configured right after it.
# NOTE(review): DSPy presumably uses the class docstring as the task instruction,
# so editing that string would change the prompt — confirm before rewording it.
class Summarize(dspy.Signature):
    """Summarize the financial document."""
    # Input field: the source text to be summarized.
    source = dspy.InputField()
    # Output field: the generated summary.
    summary = dspy.OutputField()

# Generator LM. DSPy routes requests through LiteLLM, whose provider prefix for
# Google AI Studio models is "gemini/"; adapt the model id to your environment.
# The key is passed explicitly because this notebook stores it as GOOGLE_API_KEY.
lm = dspy.LM(
    "gemini/gemini-2.5-pro",
    temperature=0.2,
    api_key=os.environ.get("GOOGLE_API_KEY"),
)
dspy.settings.configure(lm=lm)

generate_summary = dspy.Predict(Summarize)

# Optimizers call the program with `example.inputs()`, so training rows must be
# dspy.Example objects with the input fields marked; `reference` and `question`
# stay available to the metric as labels.
devset = [
    dspy.Example(
        source=source_text_1,
        reference=gold_1,
        question="Summarize the financial document",
    ).with_inputs("source")
]

metric = RagasReliabilityMetric()
# The random-search budget is set with `num_candidate_programs`.
tp = dspy.teleprompt.BootstrapFewShotWithRandomSearch(
    metric=metric,
    num_candidate_programs=10,
    max_bootstrapped_demos=6,
)
optimized_program = tp.compile(generate_summary, trainset=devset)
