### Importing required modules

In [2]:
from dotenv import load_dotenv
from langchain.prompts.chat import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
import pandas as pd

# SET OPENAI_API_KEY in .env
load_dotenv()

True

### Using Prompt + LLM

In [3]:
class Rater:
  '''A Rating Class to rate conversations between human and AI.

  Conversations are queued with add_conversation, scored through an OpenAI
  function-calling chain by get_ratings, and exported by make_csv.
  '''

  # One entry per marker, in the order used for the function schema and the CSV columns:
  # (key prefix, description of "<prefix> rating", description of "<prefix> desc").
  _MARKER_SPECS = (
    ("compliance",
     "Rate on a scale of 1-10 the response of AI, 1 being least compliant, 10 being most compliant",
     "Justify the rating of compliance"),
    ("helpful",
     "Rate on a scale of 1-10 the response of AI, 1 being least helpful, 10 being most helpful",
     "Justify the rating of helpfulness"),
    ("relevance",
     "Rate on a scale of 1-10 the response of AI, 1 being least relevant to the human prompt, 10 being most relevant",
     "Justify the rating of relevance to the human prompt"),
    ("efficient",
     "Rate on a scale of 1-10 how fast and straightforward the AI could get the answers, 1 being inefficient and taking corrections, 10 being very efiicient and direct",
     "Justify how efficient the AI was in its responses"),
    ("human-reaction",
     "Rate on a scale of 1-10 how positively the human responded to the AI response, 1 being very negative, and 10 being very positive",
     "How well did the human react to the AI response"),
  )

  def __init__(self, compliance_marker=True, helpful_marker=True, relevance_marker=True, efficient_marker=True, reaction_marker=True) -> None:
    '''Build the rating chain for the selected markers.

    compliance_marker : Compliance to the instructions

    helpful_marker : Degree of helpfulness

    relevance_marker : Relevance to the human prompt

    efficient_marker : Efficiency towards the answer wanted

    reaction_marker : Human response to the answer

    All of the markers are factored in at default, if marked False will not be factored in

    Raises ValueError if every marker is disabled.
    '''

    flags = (compliance_marker, helpful_marker, relevance_marker, efficient_marker, reaction_marker)

    # Error in case none of the markers are True.
    # ValueError subclasses Exception, so callers catching Exception still work.
    if not any(flags):
      raise ValueError("No Markers Selected for Rating!")

    self.conversations = []
    # Created up front so make_csv can be called before get_ratings.
    self.ratingres = []
    self.totalres = []
    # Key prefixes of the enabled markers; make_csv uses them to pick its columns.
    self._enabled_markers = [spec[0] for spec, enabled in zip(self._MARKER_SPECS, flags) if enabled]

    system = "You are an unbiased evaluator. Assess the conversations between Human and AI and rate the responses given by AI on a scale on 1 to 10 based on helpfulness, relevance, compliance, and how the human responded"
    human = "{text}"

    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])
    model = ChatOpenAI()

    properties, required = self._build_schema(self._enabled_markers)
    functions = [
      {
        "name": "rating",
        "description": "The assesment of conversation between human and AI",
        "parameters": {
          "type": "object",
          "properties": properties,
          "required": required
        }
      }
    ]

    # Chain binding the prompt model and output parser to be called for every conversation
    self.chain = prompt | model.bind(function_call={"name": "rating"}, functions=functions) | JsonOutputFunctionsParser()

  @classmethod
  def _build_schema(cls, enabled_markers):
    '''Return (properties, required) for the "rating" function schema.

    enabled_markers : key prefixes (e.g. "helpful") of the markers to include

    Every enabled marker contributes a numeric "<prefix> rating" field and a
    string "<prefix> desc" field; both are listed as required.
    '''

    properties = {}
    required = []
    for prefix, rating_text, desc_text in cls._MARKER_SPECS:
      if prefix not in enabled_markers:
        continue
      rating_key = f"{prefix} rating"
      desc_key = f"{prefix} desc"
      properties[rating_key] = {"type": "number", "description": rating_text}
      # The justification is free text, so it is a string for every marker.
      properties[desc_key] = {"type": "string", "description": desc_text}
      required.extend([rating_key, desc_key])
    return properties, required

  def add_conversation(self, text: str):
    'Add a new Conversation to the list'

    self.conversations.append(text)

  def get_ratings(self, verbose=True):
    '''Get the ratings of all the Conversations added.

    verbose : get descriptions for the ratings

    Every call re-invokes the chain for every stored conversation. Returns a
    list with one dict per conversation: the full chain output when verbose
    is True, otherwise only the keys ending in "rating".
    '''

    # Collect into locals first so a failing invoke does not leave half-filled results behind.
    ratingres = []
    totalres = []
    for conversation in self.conversations:
      totalrating = self.chain.invoke({"text": conversation})
      ratingres.append({k: v for k, v in totalrating.items() if k.endswith("rating")})
      totalres.append(totalrating)
    self.ratingres = ratingres
    self.totalres = totalres
    return self.totalres if verbose else self.ratingres

  def make_csv(self, filename: str):
    '''Make a CSV file output of all the individual file ratings.

    filename : path of the CSV file to write

    Writes one row per conversation with a column per enabled marker and an
    'Aggregate' column holding the mean of those marker columns. Ratings are
    (re)computed first when they do not cover every stored conversation.
    '''

    # Conversations added after the last get_ratings call would otherwise
    # give the DataFrame columns of different lengths.
    if len(self.ratingres) != len(self.conversations):
      self.get_ratings(verbose=False)

    data = {'Conversation': list(self.conversations)}
    rating_columns = []
    for prefix in self._enabled_markers:
      # "human-reaction" -> "Human-reaction", matching the established column names.
      column = prefix.capitalize()
      data[column] = [rating[f"{prefix} rating"] for rating in self.ratingres]
      rating_columns.append(column)

    frame = pd.DataFrame.from_dict(data)
    # Average over the enabled markers only, not a fixed five.
    frame['Aggregate'] = sum(frame[column] for column in rating_columns) / len(rating_columns)
    frame.to_csv(filename)

In [4]:
# Read the conversations to be rated into a DataFrame
datasheet = pd.read_csv(filepath_or_buffer="conversations.csv")

In [None]:
# Placeholder for rating a single pasted conversation instead of the datasheet
text = "\nPASTE SAMPLE CONVERSATION HERE\n"

In [6]:
# Build a rater with every marker left at its default (enabled)
Chatrater = Rater()

# Queue each entry of the 'Conversations' column, in row order
for conversation_text in datasheet['Conversations']:
  Chatrater.add_conversation(conversation_text)

# UNCOMMENT IF USING A SINGLE TEXT
# Chatrater.add_conversation(text)

# Rate every queued conversation and show only the numeric ratings
print(Chatrater.get_ratings(verbose=False))

# Write the ratings out as a CSV file
Chatrater.make_csv('conversation2.csv')

[{'compliance rating': 10, 'helpful rating': 10, 'relevance rating': 10, 'efficient rating': 8, 'human-reaction rating': 10}, {'compliance rating': 9, 'helpful rating': 8, 'relevance rating': 10, 'efficient rating': 8, 'human-reaction rating': 8}, {'compliance rating': 9, 'helpful rating': 8, 'relevance rating': 9, 'efficient rating': 8, 'human-reaction rating': 10}, {'compliance rating': 9, 'helpful rating': 9, 'relevance rating': 9, 'efficient rating': 8, 'human-reaction rating': 8}, {'compliance rating': 9, 'helpful rating': 8, 'relevance rating': 9, 'efficient rating': 7, 'human-reaction rating': 7}, {'compliance rating': 9, 'helpful rating': 9, 'relevance rating': 10, 'efficient rating': 10, 'human-reaction rating': 8}, {'compliance rating': 9, 'helpful rating': 9, 'relevance rating': 10, 'efficient rating': 8, 'human-reaction rating': 9}]


### Using Langchain Evaluators

In [8]:
from langchain.evaluation import load_evaluator

In [15]:
# Scoring rubric for the evaluator, one entry per criterion:
# helpful, relevance, efficient, reaction, compliance.
# Each value is the rubric text for that criterion; its exact wording and
# whitespace are passed to load_evaluator as-is, so do not reformat the strings.

criteria = {
  "helpful": 
    """Score 1: The answer is unhelpful and provides no assistance.
        Score 3: The answer offers limited help but lacks depth or detail.
        Score 5: The answer is moderately helpful, providing relevant information.
        Score 7: The answer is quite helpful, offering comprehensive information.
        Score 10: The answer is extremely helpful, addressing all needs effectively.""",
    "relevance": 
    """Score 1: The answer is entirely irrelevant to the human prompt.
        Score 3: The answer is loosely related but misses the main point.
        Score 5: The answer is somewhat relevant but may have minor tangents.
        Score 7: The answer is highly relevant, closely tied to the human prompt.
        Score 10: The answer is perfectly relevant, directly addressing the prompt.""",
    "efficient": """
        Score 1: The answer is inefficient and fails to provide the desired information.
        Score 3: The answer is somewhat efficient but lacks clarity or conciseness.
        Score 5: The answer is reasonably efficient, delivering the needed information.
        Score 7: The answer is quite efficient, presenting information clearly.
        Score 10: The answer is extremely efficient, delivering a concise response.""",
    "reaction": """
        Score 1: The human response to the answer is negative or frustrated.
        Score 3: The human response is neutral or somewhat dissatisfied.
        Score 5: The human response is generally positive but not enthusiastic.
        Score 7: The human response is positive and satisfied with the answer.
        Score 10: The human response is highly positive and appreciative of the answer.""",
    "compliance": """
        Score 1: The answer completely misses the user's instructions.
        Score 3: The answer partially addresses the user's instructions but lacks depth.
        Score 5: The answer aligns with the user's prompt but may have minor gaps.
        Score 7: The answer closely aligns with the user's prompt and is comprehensive.
        Score 10: The answer perfectly fulfills the user's intent with a flawless answer."""
}

# Loading the "score_string" evaluator with the criteria above.
# NOTE(review): the recorded output of this cell warns that the chain was only
# tested with GPT-4; ChatOpenAI() is created here without naming a model, so
# confirm which model is actually used before trusting the scores.
evaluator = load_evaluator(
    "score_string", 
    criteria=criteria, 
    llm=ChatOpenAI(),
)

This chain was only tested with GPT-4. Performance may be significantly worse with other models.


In [16]:
# Score one conversation turn by turn and average the per-turn scores.
conversation = datasheet['Conversations'][2]
scores = 0
scored_pairs = 0  # number of reply pairs that actually produced a score
reply_pairs = conversation.split("\nUSER")
rating = {}

# Splitting the conversation into Human - AI reply pairs and aggregating the results
for reply_pair in reply_pairs:
  reply_pair_split = reply_pair.replace("USER", "").split("ASSISTANT")

  # A fragment without an ASSISTANT marker has no AI reply to evaluate;
  # skip it instead of failing on reply_pair_split[1].
  if len(reply_pair_split) < 2:
    continue

  try:
    evaluate = evaluator.evaluate_strings(
      prediction=reply_pair_split[1],
      input=reply_pair_split[0])

  # *** LOOK AT *** GIVING ERROR IF LLM MARKS RESPONSE AS 10
  except ValueError as e:
    # Report the failure and leave this pair out of the average.
    print(e)
    continue

  rating[reply_pair] = evaluate
  scores += evaluate['score']
  scored_pairs += 1

# Average over the pairs that were scored; skipped or failed pairs added
# nothing to `scores`, so counting them would drag the average down.
# None means no pair could be scored at all.
rating['final_score'] = scores/scored_pairs if scored_pairs else None

# storing all reply pairs and their ratings as well as the aggregated final score into rating
rating

{"USER\nhow do you write a formal email in 50 words\nASSISTANT\nSubject: Formal Email Request\n\nDear [Recipient's Name],\n\nI hope this email finds you well. I am writing to request your guidance regarding [specific issue]. Your expertise and insight would be greatly appreciated. Kindly let me know your availability to discuss this matter further.\n\nThank you for your attention to this matter.\n\nSincerely,\n[Your Name]": {'reasoning': 'This response is quite helpful as it provides a clear structure and format for writing a formal email in a concise manner. It offers relevant information by addressing the main components of a formal email, such as the subject, salutation, body, and closing. The response is also efficient, delivering the necessary information in a concise manner. It is likely to receive a positive reaction from the user as it fulfills the instructions by providing a formal email example within the requested word limit.\n\nRating: [[8]]',
  'score': 8},
 '\ni dont know