In [None]:
# Ref: https://api.python.langchain.com/en/latest/evaluation/langchain.evaluation.criteria.eval_chain.CriteriaEvalChain.html

In [1]:
import langchain_community
from langchain.evaluation.criteria import CriteriaEvalChain
from langchain.evaluation.criteria import LabeledCriteriaEvalChain

In [9]:
# What llms do we have?
! ollama list

NAME            	ID          	SIZE  	MODIFIED   
llama2:chat     	78e26419b446	3.8 GB	4 days ago	
tinyllama:latest	2644915ede35	637 MB	8 days ago	
llama3:8b       	365c0bd3c000	4.7 GB	8 days ago	
mistral:latest  	2ae6f6dd7a3d	4.1 GB	8 days ago	
phi3:latest     	64c1188f2485	2.4 GB	8 days ago	


In [11]:
# 1.0
from langchain_community.llms import Ollama
from langchain_community.chat_models.ollama import ChatOllama

# 1.0.1

llm= ChatOllama(model = "llama3:8b",    # This is also the default
             temperature=0.9,    # Default is None (ie 0.8)
             num_predict=256      # Maximum number of tokens to predict when generating text
                                 #  (Default: 128, -1 = infinite generation, -2 = fill context)
           )


# llm = Ollama()
llm

ChatOllama(model='llama3:8b', num_predict=256, temperature=0.9)

In [24]:
criteria = {"my-custom-criterion": "Is the submission the most amazing ever?"}
evaluator = CriteriaEvalChain.from_llm(llm=llm, criteria=criteria)
evaluator.evaluate_strings(prediction="Imagine an ice cream flavor for the color aquamarine", input="Tell me an idea")


{'reasoning': 'Here\'s my step-by-step reasoning:\n\n1. Is the submission an idea?\n\t* Yes, the submission is "Imagine an ice cream flavor for the color aquamarine", which is an idea.\n2. Does the submission relate to the input ("Tell me an idea")?\n\t* Yes, the submission is a response to the request to provide an idea.\n3. Does the submission meet my-custom-criterion ("Is the submission the most amazing ever")?\n\t* This criterion is subjective and hard to evaluate objectively. However, I\'ll assume that it\'s meant to be evaluated based on creativity, originality, and potential impact.\n\t* Upon reviewing the submission, I think that "Imagine an ice cream flavor for the color aquamarine" is a creative and unique idea that might spark interesting conversations about colors and flavors. While it may not be the most amazing ever, it\'s certainly a thoughtful and imaginative response.\n\nBased on my evaluation, here\'s my answer:\n\nY',
 'value': 'Y',
 'score': 1}

In [13]:
# 1.0.1

llm= Ollama(model = "mistral",    # This is also the default
             temperature=0.9,    # Default is None (ie 0.8)
             num_predict=256      # Maximum number of tokens to predict when generating text
                                 #  (Default: 128, -1 = infinite generation, -2 = fill context)
           )


# llm = Ollama()
llm

Ollama(model='mistral', num_predict=256, temperature=0.9)

In [8]:
criteria = "correctness"
evaluator = LabeledCriteriaEvalChain.from_llm(
    llm=llm,
    criteria=criteria,
)
evaluator.evaluate_strings(
  prediction="The answer is 4",
  input="How many apples are there?",
  reference="There are 3 apples",
  )

{'reasoning': "To determine if the submission meets the Criteria, let's break down the analysis:\n\n1. Correctness: A correct answer is one that matches with the provided reference in terms of the number of apples.\n2. Reference: There are 3 apples according to the given data.\n3. Submission: The submission states there are 4 apples.\n\nComparing the Submission and the Reference, we find that they do not match - the answer in the reference is 3 while the submission claims it's 4. Therefore, based on our analysis, the answer is incorrect.\n\nY\n(Repeat the letter for clarity)",
 'value': 'Y',
 'score': 1}

In [None]:
#from langchain.evaluation import load_evaluator
#from langchain import open_ai
from langchain.evaluation.loading import load_evaluator

evaluator = load_evaluator("criteria", llm=llm, criteria="conciseness")



In [26]:
from langchain.evaluation import load_evaluator
from langchain.chat_models import ChatOllama
from langchain.llms import Ollama
from langchain.embeddings import HuggingFaceEmbeddings

In [37]:
#This is work
evaluator = load_evaluator("labeled_score_string", llm=ChatOllama(model="llama2"))
evaluator = load_evaluator("pairwise_string",  llm=Ollama(model="llama3:8b"))

This chain was only tested with GPT-4. Performance may be significantly worse with other models.


In [38]:
# https://python.langchain.com/v0.1/docs/guides/productionization/evaluation/comparison/pairwise_string/
# https://stackoverflow.com/q/78231114/3282777

evaluator.evaluate_string_pairs(
    prediction="there are three dogs",
    prediction_b="4",
    input="how many dogs are in the park?",
    reference="four",
)

To use a reference, use the LabeledPairwiseStringEvalChain (EvaluatorType.LABELED_PAIRWISE_STRING) instead.


{'reasoning': 'I will now compare and evaluate the responses provided by the two AI assistants to the user question "how many dogs are in the park?".\n\nFrom my analysis, I can see that both responses aim to directly answer the user\'s question. However, there is a significant difference between the quality of their answers.\n\nAssistant A provides a response that is brief and simplistic, stating that "there are three dogs". While this response attempts to answer the user\'s question, it lacks depth, context, and relevance. It does not provide any additional information or insights that might be helpful to the user. The response also relies heavily on guesswork, as there is no evidence provided to support the claim.\n\nOn the other hand, Assistant B provides a direct and accurate response stating "4". This answer is straightforward, concise, and relevant to the question. It does not attempt to provide additional information or context, but it accurately answers the user\'s query.\n\nIn

In [None]:
# This is equivalent to loading using the enum
from langchain.evaluation import EvaluatorType

evaluator = load_evaluator(EvaluatorType.CRITERIA, criteria="conciseness")

In [3]:
from langchain.evaluation import Criteria
list(Criteria)

[<Criteria.CONCISENESS: 'conciseness'>,
 <Criteria.RELEVANCE: 'relevance'>,
 <Criteria.CORRECTNESS: 'correctness'>,
 <Criteria.COHERENCE: 'coherence'>,
 <Criteria.HARMFULNESS: 'harmfulness'>,
 <Criteria.MALICIOUSNESS: 'maliciousness'>,
 <Criteria.HELPFULNESS: 'helpfulness'>,
 <Criteria.CONTROVERSIALITY: 'controversiality'>,
 <Criteria.MISOGYNY: 'misogyny'>,
 <Criteria.CRIMINALITY: 'criminality'>,
 <Criteria.INSENSITIVITY: 'insensitivity'>,
 <Criteria.DEPTH: 'depth'>,
 <Criteria.CREATIVITY: 'creativity'>,
 <Criteria.DETAIL: 'detail'>]

In [6]:
from langchain_community.llms import Ollama
from langchain_community.chat_models.ollama import ChatOllama

In [7]:
from langchain.evaluation import load_evaluator

#evaluator = load_evaluator("pairwise_string",  llm=Ollama(model="llama3:8b"))

evaluator = load_evaluator("criteria", criteria="conciseness", llm=Ollama(model="llama3:8b"))
eval_result = evaluator.evaluate_strings(
    prediction="""Joe Biden is an American politician 
    who is the 46th and current president of the United States. 
    Born in Scranton, Pennsylvania on November 20, 1942, 
    Biden moved with his family to Delaware in 1953. 
    He graduated from the University of Delaware 
    before earning his law degree from Syracuse University. 
    He was elected to the New Castle County Council in 1970 
    and to the U.S. Senate in 1972.""",
    input="Who is the president of United States?",
)
print(eval_result)

{'reasoning': 'Let\'s assess the submission based on the conciseness criterion.\n\nStep 1: Review the input prompt.\nThe input prompt is "Who is the president of United States?"\n\nStep 2: Evaluate the submission\'s relevance to the input prompt.\nThe submission mentions Joe Biden, an American politician who is the 46th and current president of the United States. This directly answers the input prompt.\n\nStep 3: Assess the submission\'s conciseness.\nThe submission provides a detailed biography of Joe Biden, including his birthplace, date, education, and political career. While it does mention that he is the current president of the United States, which is relevant to the input prompt, the overall length of the submission exceeds what one would typically consider concise.\n\nConclusion: The submission does not meet the conciseness criterion.\n\nY', 'value': 'Y', 'score': 1}


In [8]:
evaluator = load_evaluator("labeled_criteria", criteria="correctness" , llm=Ollama(model="llama3:8b"))

eval_result = evaluator.evaluate_strings(
    input="Is there any river on the moon?",
    prediction="There is no evidence of river on the Moon",
    reference="""In a hypothetical future, lunar scientists discovered 
    an astonishing phenomenon—a subterranean river 
    beneath the Moon's surface""",
)

In [9]:
eval_result

{'reasoning': 'Step-by-step reasoning for each criterion:\n\n1. Correctness:\n\t* The submission states "There is no evidence of river on the Moon."\n\t* This statement is accurate and factual based on current scientific knowledge.\n\t* However, the reference provided suggests that in a hypothetical future scenario, scientists discovered a subterranean river beneath the Moon\'s surface.\n\t* Since this discovery is not part of our current understanding, the submission is correct within the scope of current knowledge.\n\nReasoning: Y\n\n2. Correctness:\n\t* The same reasoning as above applies; the submission is accurate and factual based on current scientific knowledge.\n\t* However, it does not account for hypothetical future discoveries or scenarios that might contradict its claim.\n\t* In this sense, the submission is correct in stating that there is no evidence of river on the Moon.\n\nReasoning: Y\n\nBased on my reasoning, I conclude that the submission meets all criteria. Therefor

In [14]:
from langchain.evaluation import EvaluatorType

custom_criteria = {
    "numeric": "Does the output contain numeric information?",
    "mathematical": "Does the output contain mathematical information?"
}
prompt = "Tell me a joke"

output = """
Why did the mathematician break up with his girlfriend?

Because she had too many "irrational" issues!
"""

llm= ChatOllama(model = "llama3:8b",    # This is also the default
             temperature=0.9,    # Default is None (ie 0.8)
             num_predict= 1000      # Maximum number of tokens to predict when generating text
                                 #  (Default: 128, -1 = infinite generation, -2 = fill context)
           )



eval_chain = load_evaluator(
    EvaluatorType.CRITERIA,
    criteria=custom_criteria,
    llm=llm
)
eval_result = eval_chain.evaluate_strings(prediction = output, input = prompt)
print("===================== Multi-criteria evaluation =====================")
print(eval_result)

{'reasoning': 'Here\'s my step-by-step reasoning for each criterion:\n\n**Numeric: Does the output contain numeric information?**\n\nI\'ll start by examining the submission: "Why did the mathematician break up with his girlfriend?\n\nBecause she had too many \'irrational\' issues!"\n\nAt first glance, I don\'t see any numerical values or quantities mentioned in the submission. The word "irrational" is a mathematical term, but it\'s not referring to a specific number or quantity. Therefore, I conclude that this criterion is NOT MET.\n\n**Mathematical: Does the output contain mathematical information?**\n\nNow, let\'s look at the same submission again. This time, I\'m focusing on whether the output contains any mathematical concepts or terms. The word "irrational" does appear in the submission, which is a mathematical term referring to numbers that cannot be expressed as a finite decimal or fraction. So, yes, this criterion IS MET.\n\n**Conclusion:**\n\nBased on my reasoning, I conclude 

In [19]:
# No llm is needed here
from langchain.evaluation import ExactMatchStringEvaluator
exact_match_evaluator = ExactMatchStringEvaluator()
exact_match_evaluator = ExactMatchStringEvaluator(ignore_case=True)
exact_match_evaluator.evaluate_strings(
    prediction="Data Science",
    reference="My Data science",
)

{'score': 0}

In [30]:
from langchain.chat_models import ChatOpenAI
from langchain.evaluation import load_evaluator
accuracy_criteria = {
    "accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference."""
}
evaluator = load_evaluator(
    "labeled_score_string",
    criteria=accuracy_criteria,
    llm= Ollama(model="llama3:8b"),
)

In [31]:
eval_result = evaluator.evaluate_strings(
    prediction="You can find them in the dresser's third drawer.",
    reference="The socks are in the third drawer in the dresser",
    input="Where are my socks?",
)

print(eval_result['reasoning'])

Evaluation:

The assistant's response is attempting to answer the user's question about the location of their socks. The assistant provides a specific location, stating that the socks are in the third drawer of the dresser, which aligns with the ground truth provided.

Rating: [[7]]


Explanation:
The assistant's answer has moderate relevance and accuracy, as it correctly identifies the dresser as the location where the user can find their socks. However, the minor error is that the assistant does not mention that they have been told specifically that the drawer is the third one in the dresser, which could lead to some ambiguity if the dresser had multiple drawers. Nonetheless, the answer provides a helpful and accurate direction for the user to find their socks.


In [23]:
! pip install rapidfuzz -- quiet

[33mDEPRECATION: Loading egg at /home/ashok/anaconda3/envs/langchain/lib/python3.11/site-packages/huggingface_hub-0.23.2-py3.8.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0mCollecting rapidfuzz
  Downloading rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.9.3


In [24]:
# Does not need llm
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("string_distance")
evaluator.evaluate_strings(
    prediction="Senior Data Scientist",
    reference="Data Scientist",
)

{'score': 0.23015873015873023}

In [25]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator("labeled_pairwise_string", llm = Ollama(model="llama3:8b"))
evaluator.evaluate_string_pairs(
    prediction="there are 5 days",
    prediction_b="7",
    input="how many days in a week?",
    reference="Seven",
)

{'reasoning': "I will now evaluate the responses provided by the two AI assistants, focusing on the criteria mentioned: helpfulness, relevance, accuracy, depth, and creativity.\n\nAssistant A's response is brief and straightforward. However, it is not entirely accurate, as there are seven days in a week (Monday to Sunday). The response lacks depth and does not provide any additional information or insights.\n\nOn the other hand, Assistant B's response is extremely concise and directly answers the user's question. It is accurate, stating that there are indeed 7 days in a week. While it may lack creativity and depth, its simplicity and correctness make it a more helpful and relevant answer.\n\nAfter comparing both responses, I conclude that Assistant B's answer better meets the criteria of helpfulness, relevance, accuracy, and depth. Therefore, my final verdict is:\n\n[[B]]",
 'value': 'B',
 'score': 0}

In [None]:
# Needs open api key
from langchain.evaluation import load_evaluator
evaluator = load_evaluator("pairwise_embedding_distance",  llm = Ollama(model="llama3:8b") )
evaluator.evaluate_string_pairs(
    prediction="Rajasthan is hot in June", prediction_b="Rajasthan is warm in June."
)

In [None]:
# Needs open ai key
evaluator = load_evaluator("embedding_distance", llm = Ollama(model="llama3:8b"))
evaluator.evaluate_strings(prediction="Total Profit is 04.25 Cr", 
reference="Total return is 4.25 Cr")