In [None]:
# %pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph


# !pip install -qU "langchain[openai]"
# !pip install -qU langchain-pinecone
# !pip install -qU langchain-mongodb
# !pip install beautifulsoup4
# !pip install pyarrow fastparquet
# !pip install ace_tools
# !pip install langsmith



This notebook produces an inmemory vector database using the Open AI api and langchain openai embeddings.

When run for 9500 entries, the Open AI usage calcultor provided a cost of $0.90 a run. 

The 

In [61]:
import pandas as pd
import re
# import ace_tools as tools
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
import langsmith
import getpass
import os
import json
from pydantic.v1 import BaseModel
from langchain.chat_models import init_chat_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:


os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LANGSMITH_API_KEY")

In [3]:
#  load csv to dataframe
# df = pd.read_csv(r"C:\Users\hayde\University of South Australia\USO_Capstone Projects 2025 (SP1 SP3) - Group A - Group A\Assessment 1\Training Data\processed_dataset2_train_data_top100.csv")
# df.head()

# Define the GitHub raw CSV URL
csv_url = "https://github.com/haydenkerr/INFT3039-Capstone1-GroupA-25/raw/refs/heads/main/datasets/processed_dataset2_train_data.csv"
# Load the CSV data
df = pd.read_csv(csv_url)



df.columns


Index(['prompt', 'essay', 'evaluation', 'band', 'cleaned_evaluation',
       'Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar',
       'Overall Band Score', 'word_count', 'sentence_count',
       'avg_sentence_length'],
      dtype='object')

In [4]:

# 'prompt', 'essay', 'evaluation', 'band', 'cleaned_evaluation','Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar','Overall Band Score', 'word_count', 'sentence_count','avg_sentence_length'
df = df[['prompt', 'essay', 'band', 'cleaned_evaluation','Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar','Overall Band Score']]  

df.rename(columns={'prompt':'question'}, inplace=True)



In [41]:


# Convert each row to a Document object from the dataframe
# if rows > 10 , then break for testing
docs = []
max_rows = 5000
processed_rows = 0
for _, row in df.iterrows():
    if processed_rows >= max_rows:
        break
    docs.append(
        Document(
            page_content=f"question: {row['question']}\nessay: {row['essay']}\nband: {row['band']} \ncleaned_evaluation: {row['cleaned_evaluation']}\nTask Achievement: {row['Task Achievement']}\nCoherence: {row['Coherence']}\nLexical Resource: {row['Lexical Resource']}\nGrammar: {row['Grammar']}\nOverall Band Score: {row['Overall Band Score']}"
        )
    )
    processed_rows += 1

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Initialize embeddings
embeddings = OpenAIEmbeddings()

# Initialize In-Memory Vector Store
vector_store = InMemoryVectorStore(embeddings)
vector_store.add_documents(all_splits)

# Load LLM and prompt
llm = ChatOpenAI(model_name="gpt-4")
prompt = hub.pull("rlm/rag-prompt")

# Define State for LLM workflow
class State(TypedDict):
    question: str
    essay: str
    context: List[Document]
    graded_response: str

# Retrieval function
def retrieve(state: State):
    retrieved_docs = vector_store.similarity_search(state["question"])
    return {"context": retrieved_docs}

# Grading function
def grade(state: State):
    example_texts = "\n\n".join(doc.page_content for doc in state["context"])
    user_input = f"New question: {state['question']}\nNew Essay: {state['essay']}"
    
    messages = [
        {"role": "system", "content": 
            """You are an IELTS examiner. Score the given essay based on the 0-9 IELTS band scale. 
            The output should be a json object with the following keys:
            'question','essay','overall score', 'overall feedback','task achievement score', 'task achievement feedback',
            'coherence score', 'coherence feedback', 'lexical resource score', 'lexical resource feedback',
            'grammar score', 'grammar feedback'."""},
        {"role": "user", "content": f"Here are some example graded essays:\n{example_texts}\n\nNow, evaluate this new essay:\n{user_input}"}
    ]
    
    response = llm.invoke(messages)
    return {"graded_response": response.content}

# Build Graph
graph_builder = StateGraph(State).add_sequence([retrieve, grade])

graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()


In [6]:
# load test data set
# Define the GitHub raw CSV URL


# Define the GitHub raw CSV URL
csv_url_test = "https://github.com/haydenkerr/INFT3039-Capstone1-GroupA-25/raw/refs/heads/main/datasets/processed_dataset2_test_data.csv"
# Load the CSV data
df_test = pd.read_csv(csv_url_test)




df_test = df_test[['prompt', 'essay', 'band', 'cleaned_evaluation','Task Achievement', 'Coherence', 'Lexical Resource', 'Grammar','Overall Band Score']]  

df_test.rename(columns={'prompt':'question'}, inplace=True)

df_test.columns



Index(['question', 'essay', 'band', 'cleaned_evaluation', 'Task Achievement',
       'Coherence', 'Lexical Resource', 'Grammar', 'Overall Band Score'],
      dtype='object')

In [None]:
df_test.head()

In [42]:
# Example test case
question_id = 10
# word wrap the text output below  
pd.set_option('display.max_colwidth',10 )


print("Question: "+df_test['question'][question_id])

print("Essay: "+df_test['essay'][question_id])
print("Overall Score: "+str(df_test["Overall Band Score"][question_id]))


new_question = df_test['question'][question_id]
new_essay = df_test['essay'][question_id]
print("---------")


# Run LLM grading
result = graph.invoke({"question": new_question, "essay": new_essay})
# print(result)
print("---------")
# print("Predicted Band Score:", result["Predicted score"])

Question: Children find it difficult to concentrate on or pay attention to school. What are the reasons? How can we solve this problem?
Essay: It is true that many children find it difficult to focus on the course at school. There are many reasons behind this phenomenon, so we need to take a comprehensive approach to mitigate it.

On the one hand, several factors contribute to this aissue. One of the reasons is the development of technology such as the applications of smartphones. With smartphones, many students are addicted to the virtual world brought about by mobile games and gain a sense of achievement

, which cannot be found in schools. The influence of the environment is another main reason. For example, if the juveniles around them behave badly in schools like talking to others or sleeping in classes without being punished by teachers, which may have a bad impact on them and they may imitate this behaviour, leading to their lack of focus on their school work.

However, many mea

In [43]:
# result["context"][1].page_content.split("\n")
# convert to json

result_json = json.loads(result["graded_response"])
result_json["overall score"]




8.0

In [None]:
# for each row in the test data set, run the LLM grading
# Run LLM grading for 100 rows
rag_results = []
processed_rows = 0
for _, row in df_test.iterrows():
    if processed_rows >= 50:
        break
    result = graph.invoke({"question": row['question'], "essay": row['essay']})
    # add the results to the rag_results dataframe
    # print("Question: "+row['question']+ "\n")
    # print("Essay: "+row['essay']+ "\n")
    # print("Overall Score: "+str(row["Overall Band Score"])+ "\n")
    # print("Predicted Band Score:", result["overall score"])
    print("---------")
    # add row as list of dictionaries   
    result_json = json.loads(result["graded_response"]) 
    # show accuracy of the model by comparing the predicted score to the actual score
    
    new_row = (row['question'],row['essay'],float(row["Overall Band Score"]),result_json["overall score"],float(row["Overall Band Score"])/result_json["overall score"])
    rag_results.append(new_row)  
    processed_rows += 1




---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------
---------


TypeError: unsupported operand type(s) for /: 'float' and 'str'

In [60]:
# convert list of dictionaries to dataframe
rag_results = pd.DataFrame(rag_results, columns = ['question','essay','Overall Band Score','Predicted Band Score','variation'])

rag_results.shape

(16, 5)

In [62]:
# create accuracy score for continuos variables with sklearn


rms = sqrt(mean_squared_error(rag_results['Overall Band Score'], rag_results['Predicted Band Score']))
print("rms")
print(rms)
print("r2Square")
y_pred = rag_results['Predicted Band Score']
y_true = rag_results['Overall Band Score']
openai_r2_score = r2_score(y_true, y_pred)
print(openai_r2_score)


# save to excel file
rag_results.to_excel("openai_rag_results.xlsx")


rms
2.3351927115336757
r2Square
-1.2969971205265325
