# LLM JUDGE EVALUATION SYSTEM

In [29]:
import os 
import tempfile

import chromadb
import streamlit as st

from langchain_chroma import Chroma
from pypdf import PdfReader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import CrossEncoder
from streamlit.runtime.uploaded_file_manager import UploadedFile
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage
from langchain_ollama import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser

import pandas as pd

In [30]:
# Load the evaluator (judge) LLM.
# temperature=0 asks the model for its most likely output so that re-running the
# notebook grades the same answer as consistently as the backend allows; with the
# default sampling settings the awarded marks can vary between runs.
evaluator_llm = ChatOllama(model="gemma2:9b", temperature=0)

In [43]:
# Prompt template for the evaluator LLM. The {question}, {candidate_answer},
# {marks} and {criteria} placeholders are filled in by get_evaluations, which
# passes this string to ChatPromptTemplate.from_template. The trailing section
# asks the model to reply with "[EVALUATED MARKS]" and "[REASONING]" fields.
system_prompt = """
You are an evaluator for question and answers.

You are given a: 

(1) question, 
(2) a candidate answer,
(3) the marks allocated for the question if correctly evaluated, and
(4) a criteria to evaluate the candidate answer.

The criteria is a list of conditions that the candidate answer must satisfy to be awarded full marks.
Each line will denote an extra mark that can be awarded.

You need to evaluate the candidate answer based on the criteria.

[QUESTION]: {question}

[CANDIDATE ANSWER]: {candidate_answer}

[TOTAL MARKS]: {marks}

[CRITERIA]: {criteria}


You must reply in the following format:

[EVALUATED MARKS]: (answer here)

[REASONING]: (answer here)
"""

In [32]:
def get_evaluations(question: str, marks: int, candidate_answer: str, criteria: str):
    """Ask the evaluator LLM to grade a single candidate answer.

    The module-level ``system_prompt`` template is filled with the given
    values, piped through ``evaluator_llm`` and parsed with
    ``StrOutputParser``.

    Args:
        question: Text of the question being graded.
        marks: Marks allocated to the question.
        candidate_answer: The answer to evaluate.
        criteria: Marking criteria text for the question.

    Returns:
        The parsed output of the prompt -> LLM -> parser chain.
    """
    prompt_inputs = {
        "question": question,
        "candidate_answer": candidate_answer,
        "marks": marks,
        "criteria": criteria,
    }

    # Compose prompt -> model -> plain-string parser into one runnable.
    grading_chain = (
        ChatPromptTemplate.from_template(system_prompt)
        | evaluator_llm
        | StrOutputParser()
    )

    return grading_chain.invoke(prompt_inputs)


## Prepare input file for evaluator

In [51]:
# Load the combined questions/answers sheet, using its "ID" column as the index.
combined_df = pd.read_csv(
    filepath_or_buffer="combined_df.csv",
    index_col="ID",
)

In [52]:
# Preview the first two rows to confirm the expected columns were loaded.
combined_df.head(n=2)

Unnamed: 0_level_0,QUESTION,ANSWER,MARKS,CRITERIA,Model1_GA,Model2_GA,Model1a_GA,Model2a_GA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Define the term ‘adaptation’,Adaptation are the inherited favourable charac...,1,1 mark = correctly definition of adaptation,"The term ""adaptation"" refers to a trait or beh...",Adaptation refers to the process by which an o...,An adaptation is a characteristic that an orga...,An adaptation is a characteristic that an orga...
2,Distinguish between sexual reproduction and as...,"Sexual reproduction, such as meiosis, is the p...",4,1 mark = Define sexual reproduction\r\n\r\n1 m...,Sexual reproduction involves the combination o...,Sexual reproduction involves the combination o...,Reproduction ensures the continuity of a speci...,Reproduction ensures the continuity of a speci...


In [37]:
# Pull the per-question reference fields out of the frame as plain Python lists.
questions, reference_answers, reference_marks, criteria = (
    combined_df[column].values.tolist()
    for column in ("QUESTION", "ANSWER", "MARKS", "CRITERIA")
)

# Generated answers, one list per model variant, in the same row order.
model1_answers, model2_answers, model1a_answers, model2a_answers = (
    combined_df[column].values.tolist()
    for column in ("Model1_GA", "Model2_GA", "Model1a_GA", "Model2a_GA")
)

Get evaluations for model 1

In [46]:
# Grade every Model 1 answer against its question, mark allocation and criteria.
model1_evaluation = [
    get_evaluations(question, allocated_marks, answer, rubric)
    for question, allocated_marks, answer, rubric in zip(
        questions, reference_marks, model1_answers, criteria
    )
]

Get evaluations for model 2

In [48]:
# Grade every Model 2 answer against its question, mark allocation and criteria.
model2_evaluation = [
    get_evaluations(question, allocated_marks, answer, rubric)
    for question, allocated_marks, answer, rubric in zip(
        questions, reference_marks, model2_answers, criteria
    )
]

Get evaluations for model 1a

In [47]:
# Grade every Model 1a answer against its question, mark allocation and criteria.
model1a_evaluation = [
    get_evaluations(question, allocated_marks, answer, rubric)
    for question, allocated_marks, answer, rubric in zip(
        questions, reference_marks, model1a_answers, criteria
    )
]

Get evaluations for model 2a

In [50]:
# Grade every Model 2a answer against its question, mark allocation and criteria.
model2a_evaluation = [
    get_evaluations(question, allocated_marks, answer, rubric)
    for question, allocated_marks, answer, rubric in zip(
        questions, reference_marks, model2a_answers, criteria
    )
]

In [53]:
# 1-based row identifiers, one per question.
index_list = list(range(1, len(questions) + 1))

In [55]:
# One row per question: the reference data, followed by each model's answer
# paired with the evaluator's raw response for that answer.
evaluation_columns = {
    "ID": index_list,
    "QUESTION": questions,
    "REFERENCE_ANSWER": reference_answers,
    "REFERENCE_MARKS": reference_marks,
    "CRITERIA": criteria,
    "MODEL1_ANSWER": model1_answers,
    "MODEL1_EVALUATION": model1_evaluation,
    "MODEL2_ANSWER": model2_answers,
    "MODEL2_EVALUATION": model2_evaluation,
    "MODEL1a_ANSWER": model1a_answers,
    "MODEL1a_EVALUATION": model1a_evaluation,
    "MODEL2a_ANSWER": model2a_answers,
    "MODEL2a_EVALUATION": model2a_evaluation,
}
evaluated_response_df = pd.DataFrame(evaluation_columns)

In [57]:
# Use the "ID" column as the row index of the results frame.
evaluated_response_df = evaluated_response_df.set_index("ID")

In [58]:
# "ID" now lives in the index (see the set_index call above), so the index has
# to be written out; index=False would silently drop the ID column from the CSV.
evaluated_response_df.to_csv("evaluated_response_df.csv", index=True)