In [None]:
# %pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph


# !pip install -qU "langchain[openai]"
# !pip install -qU langchain-pinecone
# !pip install -qU langchain-mongodb
# !pip install beautifulsoup4
# !pip install pyarrow fastparquet
# !pip install ace_tools
# !pip install langsmith
# %pip install --upgrade --quiet  google-ai-generativelanguage==0.6.1
# %pip install --upgrade --quiet  langchain-google-genai 



In [10]:
import pandas as pd
import re
# import ace_tools as tools
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain import hub
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
import langsmith
import getpass
import os
import json
from pydantic.v1 import BaseModel
from langchain.chat_models import init_chat_model
import getpass
import os



In [2]:



if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")


# llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

# embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")



In [3]:
#  load csv to dataframe
# df = pd.read_csv(r"C:\Users\hayde\University of South Australia\USO_Capstone Projects 2025 (SP1 SP3) - Group A - Group A\Assessment 1\Training Data\processed_dataset2_train_data_top100.csv")
# df.head()

# Define the GitHub raw CSV URL
csv_url = "https://github.com/haydenkerr/INFT3039-Capstone1-GroupA-25/raw/refs/heads/main/datasets/processed_dataset2_train_data.csv"
# Load the CSV data
df = pd.read_csv(csv_url)



df.columns


Index(['prompt', 'essay', 'evaluation', 'band', 'cleaned_evaluation',
       'Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar',
       'Overall Band Score', 'word_count', 'sentence_count',
       'avg_sentence_length'],
      dtype='object')

In [4]:

# 'prompt', 'essay', 'evaluation', 'band', 'cleaned_evaluation','Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar','Overall Band Score', 'word_count', 'sentence_count','avg_sentence_length'
df = df[['prompt', 'essay', 'band', 'cleaned_evaluation','Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar','Overall Band Score']]  

df.rename(columns={'prompt':'question'}, inplace=True)



In [27]:


# Convert each row to a Document object from the dataframe
# if rows > 10 , then break for testing
docs = []
max_rows = 5000
processed_rows = 0
for _, row in df.iterrows():
    if processed_rows >= max_rows:
        break
    docs.append(
        Document(
            page_content=f"question: {row['question']}\nessay: {row['essay']}\nband: {row['band']} \ncleaned_evaluation: {row['cleaned_evaluation']}\nTask Achievement: {row['Task Achievement']}\nCoherence: {row['Coherence']}\nLexical Resource: {row['Lexical Resource']}\nGrammar: {row['Grammar']}\nOverall Band Score: {row['Overall Band Score']}"
        )
    )
    processed_rows += 1


# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Initialize embeddings
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

# Initialize In-Memory Vector Store
vector_store = InMemoryVectorStore(embeddings)
vector_store.add_documents(all_splits)

# Load LLM and prompt
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
prompt = hub.pull("rlm/rag-prompt")

# Define State for LLM workflow
class State(TypedDict):
    question: str
    essay: str
    context: List[Document]
    graded_response: str

# Retrieval function
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}

# Grading function
def grade(state: State):
    example_texts = "\n\n".join(doc.page_content for doc in state["context"])
    user_input = f"New question: {state['question']}\nNew Essay: {state['essay']}"
    
    messages = [
        {"role": "system", "content": 
            """You are an IELTS examiner. Score the given essay based on the 0-9 IELTS band scale. 
            The output should be a json object with the following keys:
            'question','essay','overall score', 'overall feedback','task achievement score', 'task achievement feedback',
            'coherence score', 'coherence feedback', 'lexical resource score', 'lexical resource feedback',
            'grammar score', 'grammar feedback'."""},
        {"role": "user", "content": f"Here are some example graded essays:\n{example_texts}\n\nNow, evaluate this new essay:\n{user_input}"}
    ]
    
    response = llm.invoke(messages)
    return {"graded_response": response.content}

# Build Graph
graph_builder = StateGraph(State).add_sequence([retrieve, grade])

graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()



In [28]:
# load test data set
# Define the GitHub raw CSV URL


# Define the GitHub raw CSV URL
csv_url_test = "https://github.com/haydenkerr/INFT3039-Capstone1-GroupA-25/raw/refs/heads/main/datasets/processed_dataset2_test_data.csv"
# Load the CSV data
df_test = pd.read_csv(csv_url_test)




df_test = df_test[['prompt', 'essay', 'band', 'cleaned_evaluation','Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar','Overall Band Score']]  

df_test.rename(columns={'prompt':'question'}, inplace=True)

df_test.columns



Index(['question', 'essay', 'band', 'cleaned_evaluation', 'Task Achievement',
       'Coherence', 'Lexical Resource', 'Grammar', 'Overall Band Score'],
      dtype='object')

In [29]:
df_test.head()

Unnamed: 0,question,essay,band,cleaned_evaluation,Task Achievement,Coherence,Lexical Resource,Grammar,Overall Band Score
0,Interv...,To agr...,6.5\n\...,Task A...,6.5,7.0,6.5,6.5,6.5
1,The in...,It is ...,8\n\n\...,Task A...,8.5,8.0,7.5,7.0,8.0
2,The in...,It is ...,8.5\n\...,Task A...,8.5,8.0,7.5,7.0,8.5
3,The in...,The pr...,<4\n\n...,Task A...,3.0,2.0,2.5,2.0,2.5
4,The in...,In rec...,4\n\n\...,Task A...,3.5,4.0,3.5,3.5,4.0


In [8]:
# Example test case
question_id = 26
# word wrap the text output below  
pd.set_option('display.max_colwidth',10 )


print("Question: "+df_test['question'][question_id])

print("Essay: "+df_test['essay'][question_id])
print("Overall Score: "+str(df_test["Overall Band Score"][question_id]))


new_question = df_test['question'][question_id]
new_essay = df_test['essay'][question_id]
print("---------")


# Run LLM grading
result = graph.invoke({"question": new_question, "essay": new_essay})
# print(result)
print("---------")
# print("Predicted Band Score:", result["Predicted score"])


Question: Some people believe that they should be able to keep all the money they earn, and should not have to pay tax to the state. To what extent do you agree or disagree?
Essay: It is argued by many people that all earnings should be used for their personal use rather than paying taxes to the state. I disagree with this statement as these taxes bring many advantages to society and the country.

On the one hand, a huge number of individuals believe that their earned amount must be used for their own benefits, such as living expenses, utility bills, school fees, cars, and others. Also, they believe to invest in long-term or short-term plans for saving purposes as a result, they use that amount to improve their living standards. However, the masses avoid paying income tax regularly. For instance, according to a recent survey conducted by the government of Pakistan, which reveals that around 60% of the population did not pay the income tax in 2019.

On the other hand, the government tak

In [45]:

# result_json = json.loads(result["graded_response"])
# result_json["overall score"]

# the json object returned from the grading function is stored between  ```json{ and }``` 
# so we need to remove the first 5 characters and the last character to get the json object
result_json = json.loads(result["graded_response"][8:-3])


result_json["overall score"]

JSONDecodeError: Expecting property name enclosed in double quotes: line 14 column 1 (char 3604)

In [32]:
# for each row in the test data set, run the LLM grading
# Run LLM grading for 100 rows
rag_results = []
processed_rows = 0
for _, row in df_test.iterrows():
    if processed_rows >= 100:
        break
    result = graph.invoke({"question": row['question'], "essay": row['essay']})
    # add the results to the rag_results dataframe
    # print("Question: "+row['question']+ "\n")
    # print("Essay: "+row['essay']+ "\n")
    # print("Overall Score: "+str(row["Overall Band Score"])+ "\n")
    # print("Predicted Band Score:", result["overall score"])
    print("---------")
    # add row as list of dictionaries   
    result_json = json.loads(result["graded_response"][8:-3])
    new_row = (row['question'],row['essay'],float(row["Overall Band Score"]),result_json["overall score"],float(row["Overall Band Score"])/float(result_json["overall score"]))
    rag_results.append(new_row)  
    processed_rows += 1

# convert list of dictionaries to dataframe
rag_results = pd.DataFrame(rag_results, columns = ['question','essay','Overall Band Score','Predicted Band Score','variation'])


rag_results.head()

---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------


JSONDecodeError: Expecting property name enclosed in double quotes: line 14 column 1 (char 3604)

In [None]:
rag_results = pd.DataFrame(rag_results, columns = ['question','essay','Overall Band Score','Predicted Band Score','variation'])


rag_results.shape

In [54]:
# create accuracy score for continuos variables with sklearn
from sklearn.metrics import mean_squared_error
from math import sqrt

rms = sqrt(mean_squared_error(rag_results['Overall Band Score'], rag_results['Predicted Band Score']))
print(rms)
print(rag_results['variation'].mean())


1.3268069440075547
0.9588757307507306
