In [None]:
! pip install langchainhub langsmith openai

In [1]:
import getpass
import os

# Ask for the LangSmith API key only when the environment does not already
# provide one, so an existing valid key is never overwritten by a placeholder
# and no secret has to be written into the notebook itself.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangSmith API key: ")

# Summarization with criteria and pairwise evaluators

Here, we will walk through the evaluation workflow for summarization.

## Dataset

Here is a dataset of papers to summarize (into tweets):

https://smith.langchain.com/public/659b07af-1cab-4e18-b21a-91a69a4c3990/d

In [2]:
from langchain_community.document_loaders import ArxivLoader

# arXiv identifiers of the papers to summarize
ids = [
    "2404.14219",  # phi3
    "2404.19553",  # llama3 context extension
    "2403.19887",  # jamba
    "2402.13753",  # longRope
    "2403.04121",  # can llms reason & plan
    "2402.15809",  # action learning
    "2104.09864",  # roformer
    "1706.03762",  # attn is all you need
    "2304.02643",  # segment anything
    "2111.09883",  # swin transformer
]

# Query arXiv once per ID (at most one document each) and flatten the
# per-paper results into a single list of documents.
docs = [
    paper
    for arxiv_id in ids
    for paper in ArxivLoader(query=arxiv_id, load_max_docs=1).load()
]

In [3]:
from langsmith import Client

# Summarization inputs: the full text of every loaded paper
inputs = [d.page_content for d in docs]

client = Client()
dataset_name = "Paper_Tweet_Generator"

# Re-running this cell must not try to create the dataset a second time or
# add the same examples again, so the dataset and its examples are only
# created when no dataset with this name exists yet; otherwise it is reused.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Papers to summarize",
    )
    # One example per paper; the paper text is stored under the "text" key
    client.create_examples(
        inputs=[{"text": text} for text in inputs],
        dataset_id=dataset.id,
    )

#### Chain 

Here is the prompt for a summarization chain that turns a paper into a Tweet:

In [16]:
from langchain_openai import ChatOpenAI
from langchain_cohere import ChatCohere
from langchain_core.prompts import ChatPromptTemplate

# System message: the five requirements every generated Tweet has to meet.
system_tweet_instructions = "".join(
    (
        "You are an assistant that generates Tweets to distill / summarize",
        " an academic paper. Ensure the summary: (1) has an engaging title, ",
        " (2) provides a bullet point list of main points from the paper, ",
        " (3) utilizes emojis, (4) includes limitations of the approach, and ",
        " (5) highlights in one sentence the key point or innovation in the paper.",
    )
)

# Human message template with a single {paper} placeholder for the paper text.
human = "Here is a paper to convert into a Tweet: {paper}"

# Two-message chat prompt: the instructions first, then the paper itself.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_tweet_instructions),
        ("human", human),
    ]
)

Here we wrap the chain in functions that take a dataset example as input, one per model (OpenAI `gpt-4o` and Cohere `command-r`):

In [17]:
from langchain_core.output_parsers import StrOutputParser

def predict_tweet_openai_4o(example: dict):
    """Generate a Tweet for one dataset example using the gpt-4o chat model.

    Args:
        example: Example inputs; the paper text is read from ``example["text"]``.

    Returns:
        A dict holding the generated Tweet under the ``"answer"`` key.
    """
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
    # prompt -> chat model -> plain-string output
    chain = prompt | llm | StrOutputParser()
    return {"answer": chain.invoke({"paper": example["text"]})}

def predict_tweet_command_r(example: dict):
    """Generate a Tweet for one dataset example using Cohere's command-r model.

    Use this for summary evaluation.

    Args:
        example: Example inputs; the paper text is read from ``example["text"]``.

    Returns:
        A dict holding the generated Tweet under the ``"answer"`` key.
    """
    chat = ChatCohere(model="command-r", temperature=0)
    # prompt -> chat model -> plain-string output
    tweet_generator_cohere = prompt | chat | StrOutputParser()
    response = tweet_generator_cohere.invoke({"paper": example["text"]})
    return {"answer": response}

#### Evaluator 

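Here we use two evaluator prompts from the LangChain Hub, one grading the summary against the criteria and one grading its accuracy:

https://smith.langchain.com/hub/rlm/summary-evaluator

https://smith.langchain.com/hub/rlm/summary-accurancy-evaluator

These prompts can be forked to add any custom criteria.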

In [12]:
from langchain import hub
from langsmith.schemas import Example, Run

# Pull both grading prompts from the hub once, at cell level, so the grader
# functions defined below reuse them instead of fetching on every call.
summary_criteria_prompt = hub.pull("rlm/summary-evaluator")
summary_accuracy_prompt = hub.pull("rlm/summary-accurancy-evaluator")

def text_summary_grader(run, example) -> dict:
    """Grade a generated summary with the criteria prompt.

    Args:
        run: Run under evaluation; the summary is read from
            ``run.outputs["answer"]``.
        example: Dataset example for the run (not used by this grader).

    Returns:
        A dict with ``"key"`` set to ``"summary_engagement_score"`` and
        ``"score"`` set to the ``"Score"`` field of the grader output.
    """
    generated = run.outputs["answer"]

    # Grader chain: criteria prompt piped into gpt-4-turbo at temperature 0
    grader = summary_criteria_prompt | ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Only the "Score" field of the grader output is reported
    verdict = grader.invoke({"summary": generated})
    return {"key": "summary_engagement_score", "score": verdict["Score"]}

def text_summary_accuracy_grader(run, example) -> dict:
    """Grade a generated summary against its source paper with the accuracy prompt.

    Args:
        run: Run under evaluation; the summary is read from
            ``run.outputs["answer"]``.
        example: Dataset example; the source paper is read from
            ``example.inputs["text"]``.

    Returns:
        A dict with ``"key"`` set to ``"summary_accuracy_score"`` and
        ``"score"`` set to the ``"Score"`` field of the grader output.
    """
    # The paper is handed to the prompt wrapped as {"document": <text>}
    source = {"document": example.inputs["text"]}
    generated = run.outputs["answer"]

    # Grader chain: accuracy prompt piped into gpt-4-turbo at temperature 0
    grader = summary_accuracy_prompt | ChatOpenAI(model="gpt-4-turbo", temperature=0)

    # Only the "Score" field of the grader output is reported
    verdict = grader.invoke({"summary": generated, "input": source})
    return {"key": "summary_accuracy_score", "score": verdict["Score"]}

In [None]:
from langsmith.evaluation import evaluate

# Run the gpt-4o summarizer over the dataset and score every output with
# both summary graders.
experiment_results = evaluate(
    predict_tweet_openai_4o,
    data=dataset_name,
    experiment_prefix="summary-gpt4o",
    metadata={"variant": "paper summary tweet, gpt4o"},
    evaluators=[
        text_summary_grader,
        text_summary_accuracy_grader,
    ],
)



In [18]:
# Run the command-r summarizer over the dataset and score every output with
# both summary graders.
experiment_results = evaluate(
    predict_tweet_command_r,
    data=dataset_name,
    experiment_prefix="summary-cmdr",
    metadata={"variant": "paper summary tweet, cmdr"},
    evaluators=[
        text_summary_grader,
        text_summary_accuracy_grader,
    ],
)

View the evaluation results for experiment: 'summary-cmdr-2f7d59bd' at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/10075ef4-9b96-46ec-9f4c-cf44e8c1e6c3/compare?selectedSessions=7d46a8cf-059e-4ea4-bc7a-4d67e22800ed




0it [00:00, ?it/s]

In [19]:
from langchain import hub

from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import evaluate

def evaluate_pairwise(runs: list, example) -> dict:
    """Rank two candidate answers for the same example by engagement.

    Args:
        runs: The runs produced for one example; the first two are compared
            and their answers are read from ``outputs["answer"]``.
        example: Dataset example for the runs (not used by this evaluator).

    Returns:
        A dict with ``"key"`` set to ``"ranked_preference"`` and ``"scores"``
        mapping run id to score: 1 for the preferred run of the pair, 0 for
        the other one, and 0 for both when neither is preferred.
    """
    # Seed every run with its position; the entries of the compared pair are
    # overwritten once the grader has stated its preference.
    scores = {run.id: position for position, run in enumerate(runs)}

    # The pair of runs being compared
    run_a = runs[0]
    answer_a = run_a.outputs["answer"]
    run_b = runs[1]
    answer_b = run_b.outputs["answer"]

    # Grader chain: pairwise prompt from the hub piped into gpt-4-turbo
    llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)
    grader = hub.pull("rlm/pairwise-evaluation-tweet-summary") | llm

    verdict = grader.invoke(
        {
            "question": system_tweet_instructions,
            "answer_a": answer_a,
            "answer_b": answer_b,
        }
    )
    preference = verdict["Preference"]

    # Map the preference to the run assignment:
    # 1 -> Assistant A is preferred, 2 -> Assistant B is preferred,
    # anything else -> no winner.
    a_wins = preference == 1
    b_wins = not a_wins and preference == 2
    scores[run_a.id] = 1 if a_wins else 0
    scores[run_b.id] = 1 if b_wins else 0

    return {"key": "ranked_preference", "scores": scores}

In [20]:
from langsmith.evaluation import evaluate_comparative

# Compare two existing experiments pairwise; evaluate_pairwise receives the
# pair of runs for each example.
evaluate_comparative(
    # Replace the following array with the names or IDs of your experiments
    # (the names are printed when each evaluate(...) call starts).
    ["summary-gpt4o-d782b2aa", "summary-cmdr-2f7d59bd"],
    evaluators=[evaluate_pairwise],
)

View the pairwise evaluation results at:
https://smith.langchain.com/o/ebbaf2eb-769b-4505-aca2-d11de10372a4/datasets/10075ef4-9b96-46ec-9f4c-cf44e8c1e6c3/compare?selectedSessions=a50748c6-9d55-4463-93ef-8e129dd8485c%2C7d46a8cf-059e-4ea4-bc7a-4d67e22800ed&comparativeExperiment=5b9e56f8-24a9-41a6-8a99-8446dafac6c3




  0%|          | 0/10 [00:00<?, ?it/s]

<langsmith.evaluation._runner.ComparativeExperimentResults at 0x10ee57450>