In [60]:
#Import for model and responses
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser

#Import for PDF text extraction and splitting
from langchain_community.document_loaders import PyMuPDFLoader
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter

#Import for vector database and embeddings
import os
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from uuid import uuid4
from langchain_core.documents import Document

#Import for keyword extraction
import yake

#Import for relative current working directory
import os
import sys

#Import for BERTScore calculation
from transformers import BertTokenizer, BertModel
from bert_score import BERTScorer

#Imports for dataset calculation
import pandas as pd


In [61]:
#Initialising LLM 
llm = ChatOllama(model="llama3")

#Initialising the Embedding model
embedding_model = OllamaEmbeddings(model='nomic-embed-text')

#Initialising the keyword extraction model
kw_extractor = yake.KeywordExtractor()
language = "en"

#Initialising Current Working Directory
app_dir = r"C:\Users\KD\Desktop\Git\hsc-llm"

In [62]:
#Function that loads or creates a vector store
def create_or_load_vector_store(embeddings, store_name):
    persistent_directory = os.path.join(app_dir, store_name)
    if not os.path.exists(persistent_directory):
        print("creating vector store")
        vector_store = Chroma.from_documents(embeddings, persist_directory=persistent_directory)
    else:
        print("loading existing vector store")
        vector_store = Chroma(
        persist_directory=persistent_directory,
        embedding_function=embeddings
        )
    return vector_store

vector_store = create_or_load_vector_store(embedding_model, "chroma_db")

loading existing vector store


In [63]:
def get_response(query, context):
    qa_template = """
    You are an assistant for question-answering tasks.

    Use the following documents that is retrieved from the database is relevant \
    use it to provide a complete and concise response to the user's query. \
    Do not mention references, sources, or citations in your response

    If the documents provided are not relevant to the question, use your own knowledge to answer.

    Limit your answer to 3-4 sentences.

    User question: {user_questions}

    Documents: {documents}

    Answer:
    """

    qa_prompt = ChatPromptTemplate.from_template(qa_template)

    llm = ChatOllama(model="llama3")

    chain = qa_prompt | llm | StrOutputParser()

    return chain.invoke({
        "user_questions" : query,
        "documents" : context
    })


In [64]:
def rewrite_query(query: str):

    query_rewriting_str = """
    You are an expert at reformulating questions. \
    Your reformulated questions are understandable for high students and teachers.
    The question you reformulate will begin and end with with ’**’. 
    
    Question: 
    {question} 

    Only reply using a complete sentence and only give the answer in the following format:
    **Question**"""

    query_rewriting_prompt = ChatPromptTemplate.from_template(query_rewriting_str)

    chain = query_rewriting_prompt | llm | StrOutputParser()

    response = chain.invoke({
        "question" : query
    })

    print(response)

    return response 

In [65]:
def get_keywords(rewritten_query):
    keywords = kw_extractor.extract_keywords(rewritten_query)
    for kw in keywords:
        print(kw)
    return keywords

In [66]:
def perform_retrieval(vector_store, query, k=5):
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k})
    results = retriever.invoke(query)
    return results

In [67]:
#Imports for dataset calculation
import pandas as pd

goldenset = pd.read_csv(r"C:\Users\KD\Desktop\Git\hsc-llm\ground_truth.csv", index_col=0)

In [68]:
goldenset.head()

Unnamed: 0_level_0,QUESTION,ANSWER
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Define 'asexual reproduction',Asexual reproduction is a type of reproductive...
2,Identify the sort of cell division that is inv...,The cell division in asexual reproduction is m...
3,Outline the ideal environmental conditions for...,Stable and uniform environments with a good su...
4,List the key events in binary fission for bact...,The key events that happen in binary fission f...
5,What is the difference between a somatic cell ...,Somatic cells are all the diploid cells in the...


In [69]:
questions = goldenset['QUESTION'].values.tolist()
reference_answers = goldenset['ANSWER'].values.tolist()

### Generating AI responses (NO AI QUERY REWRITING) MODEL 1

In [70]:
generated_answers = [] 
for question in questions:
    response = get_response(question, perform_retrieval(vector_store, question))
    generated_answers.append(response)

In [71]:
len(generated_answers)

162

#### Computing BERTSCORE FOR MODEL 1

In [72]:
scorer = BERTScorer(model_type='bert-base-uncased')

In [73]:
precision = []
recall = []
f1 = []
for x in range(len(generated_answers)):
    P, R, F1 = scorer.score([generated_answers[x]], [reference_answers[x]])
    precision.append(P.item())
    recall.append(R.item())   
    f1.append(F1.item())

In [74]:
precision = [round(p, 3) for p in precision]
recall = [round(r, 3) for r in recall]
f1 = [round(f, 3) for f in f1]

In [75]:
for i in range(len(precision)):
    print(f"Question {i+1} - Precision: {precision[i]} Recall: {recall[i]} F1: {f1[i]}")

Question 1 - Precision: 0.635 Recall: 0.709 F1: 0.67
Question 2 - Precision: 0.547 Recall: 0.689 F1: 0.61
Question 3 - Precision: 0.621 Recall: 0.62 F1: 0.62
Question 4 - Precision: 0.643 Recall: 0.647 F1: 0.645
Question 5 - Precision: 0.618 Recall: 0.666 F1: 0.641
Question 6 - Precision: 0.54 Recall: 0.552 F1: 0.546
Question 7 - Precision: 0.671 Recall: 0.603 F1: 0.635
Question 8 - Precision: 0.53 Recall: 0.569 F1: 0.549
Question 9 - Precision: 0.666 Recall: 0.663 F1: 0.665
Question 10 - Precision: 0.751 Recall: 0.665 F1: 0.705
Question 11 - Precision: 0.607 Recall: 0.693 F1: 0.647
Question 12 - Precision: 0.57 Recall: 0.596 F1: 0.583
Question 13 - Precision: 0.617 Recall: 0.618 F1: 0.617
Question 14 - Precision: 0.559 Recall: 0.658 F1: 0.604
Question 15 - Precision: 0.546 Recall: 0.693 F1: 0.611
Question 16 - Precision: 0.504 Recall: 0.569 F1: 0.535
Question 17 - Precision: 0.449 Recall: 0.656 F1: 0.533
Question 18 - Precision: 0.595 Recall: 0.595 F1: 0.595
Question 19 - Precision: 0

In [76]:
goldenset = pd.read_csv(r"C:\Users\KD\Desktop\Git\hsc-llm\ground_truth.csv", index_col=0)

In [77]:
goldenset

Unnamed: 0_level_0,QUESTION,ANSWER
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Define 'asexual reproduction',Asexual reproduction is a type of reproductive...
2,Identify the sort of cell division that is inv...,The cell division in asexual reproduction is m...
3,Outline the ideal environmental conditions for...,Stable and uniform environments with a good su...
4,List the key events in binary fission for bact...,The key events that happen in binary fission f...
5,What is the difference between a somatic cell ...,Somatic cells are all the diploid cells in the...
...,...,...
158,Explain how the eye detects different colours.,"There are three types of cone cells, each with..."
159,Discuss the value of having binocular vision.,Binocular vision requires two eyes quite close...
160,Arrange the following structures of the excret...,"The largest is kidney, then nephron, glomerulu..."
161,Name the two hormones responsible for regulati...,ADH (antidiuretic hormone)—responds to water l...


In [78]:
index_list = []
for i in range(1, len(goldenset)+1):
    index_list.append(i)

In [79]:
len(index_list)

162

In [80]:
generated_col = pd.DataFrame({'ID':index_list,'Generated Answer': generated_answers, 'Precision': precision, 'Recall': recall, 'F1': f1})

In [81]:
generated_col.set_index('ID', inplace=True)

In [82]:
generated_col

Unnamed: 0_level_0,Generated Answer,Precision,Recall,F1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Asexual reproduction is the process by which a...,0.635,0.709,0.670
2,The type of cell division involved in asexual ...,0.547,0.689,0.610
3,The ideal environmental conditions for asexual...,0.621,0.620,0.620
4,Here are the key events in binary fission for ...,0.643,0.647,0.645
5,A somatic cell is a type of cell that is not r...,0.618,0.666,0.641
...,...,...,...,...
158,The eye detects different colors through the c...,0.641,0.668,0.654
159,Having binocular vision is crucial for humans ...,0.672,0.580,0.623
160,Here is the answer:\n\nThe structures of the e...,0.629,0.747,0.683
161,The two hormones responsible for regulating sa...,0.571,0.705,0.631


#### Creating Pandas Dataframe to store calculations in CSV

In [83]:
goldenset = pd.concat([goldenset, generated_col], axis=1)

In [84]:
goldenset

Unnamed: 0_level_0,QUESTION,ANSWER,Generated Answer,Precision,Recall,F1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Define 'asexual reproduction',Asexual reproduction is a type of reproductive...,Asexual reproduction is the process by which a...,0.635,0.709,0.670
2,Identify the sort of cell division that is inv...,The cell division in asexual reproduction is m...,The type of cell division involved in asexual ...,0.547,0.689,0.610
3,Outline the ideal environmental conditions for...,Stable and uniform environments with a good su...,The ideal environmental conditions for asexual...,0.621,0.620,0.620
4,List the key events in binary fission for bact...,The key events that happen in binary fission f...,Here are the key events in binary fission for ...,0.643,0.647,0.645
5,What is the difference between a somatic cell ...,Somatic cells are all the diploid cells in the...,A somatic cell is a type of cell that is not r...,0.618,0.666,0.641
...,...,...,...,...,...,...
158,Explain how the eye detects different colours.,"There are three types of cone cells, each with...",The eye detects different colors through the c...,0.641,0.668,0.654
159,Discuss the value of having binocular vision.,Binocular vision requires two eyes quite close...,Having binocular vision is crucial for humans ...,0.672,0.580,0.623
160,Arrange the following structures of the excret...,"The largest is kidney, then nephron, glomerulu...",Here is the answer:\n\nThe structures of the e...,0.629,0.747,0.683
161,Name the two hormones responsible for regulati...,ADH (antidiuretic hormone)—responds to water l...,The two hormones responsible for regulating sa...,0.571,0.705,0.631


In [85]:
#exporting the dataframe to a csv file
goldenset.to_csv(r'C:\Users\KD\Desktop\Git\hsc-llm\berteval-model1.csv')

### Generating AI responses (with AI QUERY REWRITING) MODEL 2

In [86]:
generated_answers = [] 
for question in questions:
    reworded_query = rewrite_query(question)        
    test_response = get_response(reworded_query, perform_retrieval(vector_store, reworded_query))
    generated_answers.append(test_response)

**What is meant by asexual reproduction, which refers to the process by which some living organisms, such as bacteria, plants, or single-celled organisms, produce offspring that are genetically identical to themselves without the fusion of gametes or meiosis?**
**What type of cell division process allows an organism to produce offspring genetically identical to itself without the fusion of gametes, often seen in processes like budding or spore formation?**
**What factors create an optimal environment that facilitates the successful asexual reproduction of certain organisms, such as budding or fragmentation, and what specific characteristics of this environment contribute to their reproductive success?**
**What are the crucial steps involved in the process of binary fission in bacteria, where one cell divides into two daughter cells with identical genetic material?**
**What is the key distinction between cells that make up most of our body, known as somatic cells, which are diploid and 

In [87]:
len(generated_answers)

162

#### Computing BERTSCORE FOR MODEL 2

In [88]:
scorer = BERTScorer(model_type='bert-base-uncased')

In [89]:
precision = []
recall = []
f1 = []
for x in range(len(generated_answers)):
    P, R, F1 = scorer.score([generated_answers[x]], [reference_answers[x]])
    precision.append(P.item())
    recall.append(R.item())   
    f1.append(F1.item())

In [90]:
precision = [round(p, 3) for p in precision]
recall = [round(r, 3) for r in recall]
f1 = [round(f, 3) for f in f1]

In [91]:
for i in range(len(precision)):
    print(f"Question {i+1} - Precision: {precision[i]} Recall: {recall[i]} F1: {f1[i]}")

Question 1 - Precision: 0.624 Recall: 0.741 F1: 0.678
Question 2 - Precision: 0.512 Recall: 0.595 F1: 0.55
Question 3 - Precision: 0.603 Recall: 0.637 F1: 0.619
Question 4 - Precision: 0.687 Recall: 0.721 F1: 0.703
Question 5 - Precision: 0.582 Recall: 0.635 F1: 0.607
Question 6 - Precision: 0.585 Recall: 0.652 F1: 0.617
Question 7 - Precision: 0.689 Recall: 0.609 F1: 0.647
Question 8 - Precision: 0.583 Recall: 0.612 F1: 0.597
Question 9 - Precision: 0.676 Recall: 0.729 F1: 0.701
Question 10 - Precision: 0.647 Recall: 0.687 F1: 0.666
Question 11 - Precision: 0.523 Recall: 0.619 F1: 0.567
Question 12 - Precision: 0.577 Recall: 0.573 F1: 0.575
Question 13 - Precision: 0.594 Recall: 0.599 F1: 0.597
Question 14 - Precision: 0.563 Recall: 0.613 F1: 0.587
Question 15 - Precision: 0.538 Recall: 0.67 F1: 0.597
Question 16 - Precision: 0.503 Recall: 0.505 F1: 0.504
Question 17 - Precision: 0.541 Recall: 0.695 F1: 0.609
Question 18 - Precision: 0.518 Recall: 0.562 F1: 0.539
Question 19 - Precisi

In [92]:
index_list = []
for i in range(1, len(goldenset)+1):
    index_list.append(i)

In [93]:
len(index_list)

162

In [94]:
goldenset = pd.read_csv(r"C:\Users\KD\Desktop\Git\hsc-llm\ground_truth.csv", index_col=0)

In [95]:
generated_col = pd.DataFrame({'ID':index_list,'Generated Answer': generated_answers, 'Precision': precision, 'Recall': recall, 'F1': f1})

In [96]:
generated_col.set_index('ID', inplace=True)

In [97]:
generated_col

Unnamed: 0_level_0,Generated Answer,Precision,Recall,F1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Asexual reproduction refers to the process by ...,0.624,0.741,0.678
2,The type of cell division process that allows ...,0.512,0.595,0.550
3,For certain organisms that undergo asexual rep...,0.603,0.637,0.619
4,The crucial steps involved in the process of b...,0.687,0.721,0.703
5,"The key distinction between somatic cells, whi...",0.582,0.635,0.607
...,...,...,...,...
158,"Photoreceptors in the retina, specifically con...",0.641,0.692,0.665
159,The unique combination of monocular and binocu...,0.577,0.540,0.558
160,The correct structures of the excretory system...,0.426,0.521,0.468
161,The two hormones that play a crucial role in c...,0.513,0.705,0.594


#### Creating Pandas Dataframe to store calculations in CSV

In [98]:
goldenset = pd.concat([goldenset, generated_col], axis=1)

In [99]:
goldenset

Unnamed: 0_level_0,QUESTION,ANSWER,Generated Answer,Precision,Recall,F1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Define 'asexual reproduction',Asexual reproduction is a type of reproductive...,Asexual reproduction refers to the process by ...,0.624,0.741,0.678
2,Identify the sort of cell division that is inv...,The cell division in asexual reproduction is m...,The type of cell division process that allows ...,0.512,0.595,0.550
3,Outline the ideal environmental conditions for...,Stable and uniform environments with a good su...,For certain organisms that undergo asexual rep...,0.603,0.637,0.619
4,List the key events in binary fission for bact...,The key events that happen in binary fission f...,The crucial steps involved in the process of b...,0.687,0.721,0.703
5,What is the difference between a somatic cell ...,Somatic cells are all the diploid cells in the...,"The key distinction between somatic cells, whi...",0.582,0.635,0.607
...,...,...,...,...,...,...
158,Explain how the eye detects different colours.,"There are three types of cone cells, each with...","Photoreceptors in the retina, specifically con...",0.641,0.692,0.665
159,Discuss the value of having binocular vision.,Binocular vision requires two eyes quite close...,The unique combination of monocular and binocu...,0.577,0.540,0.558
160,Arrange the following structures of the excret...,"The largest is kidney, then nephron, glomerulu...",The correct structures of the excretory system...,0.426,0.521,0.468
161,Name the two hormones responsible for regulati...,ADH (antidiuretic hormone)—responds to water l...,The two hormones that play a crucial role in c...,0.513,0.705,0.594


In [100]:
#exporting the dataframe to a csv file
goldenset.to_csv(r'C:\Users\KD\Desktop\Git\hsc-llm\berteval-model2.csv')

### Generating AI responses (with AI QUERY REWRITING AND KEYWORD EXTRACTION) MODEL 3

In [115]:
generated_answers = [] 
for question in questions:
    reworded_query = rewrite_query(question)
    kws = get_keywords(reworded_query)
    test_response = get_response(reworded_query, perform_retrieval(vector_store, kws[0]))
    generated_answers.append(test_response)

**What is meant by "asexual reproduction" in biology, and how does it differ from sexual reproduction in terms of the role of gametes and genetic variation?**
('genetic variation', 0.019943439662486344)
('asexual reproduction', 0.028850471065862877)
('differ from sexual', 0.033100028790236186)
('role of gametes', 0.033100028790236186)
('gametes and genetic', 0.033100028790236186)
('reproduction in terms', 0.10620858116989315)
('asexual', 0.10923981997294611)
('biology', 0.10923981997294611)
('variation', 0.10923981997294611)
('reproduction', 0.12834816870701238)
('meant', 0.17899586521379354)
('differ', 0.17899586521379354)
('sexual', 0.17899586521379354)
('terms', 0.17899586521379354)
('role', 0.17899586521379354)
('gametes', 0.17899586521379354)
('genetic', 0.17899586521379354)
**What type of cell division process occurs in organisms that reproduce without gametes, resulting in genetically identical offspring?**
('genetically identical offspring', 0.0025258761156662004)
('cell divisi

In [116]:
len(generated_answers)

162

#### Computing BERTSCORE FOR MODEL 3

In [117]:
scorer = BERTScorer(model_type='bert-base-uncased')

In [118]:
precision = []
recall = []
f1 = []
for x in range(len(generated_answers)):
    P, R, F1 = scorer.score([generated_answers[x]], [reference_answers[x]])
    precision.append(P.item())
    recall.append(R.item())   
    f1.append(F1.item())

In [119]:
precision = [round(p, 3) for p in precision]
recall = [round(r, 3) for r in recall]
f1 = [round(f, 3) for f in f1]

In [120]:
for i in range(len(precision)):
    print(f"Question {i+1} - Precision: {precision[i]} Recall: {recall[i]} F1: {f1[i]}")

Question 1 - Precision: 0.645 Recall: 0.804 F1: 0.716
Question 2 - Precision: 0.561 Recall: 0.626 F1: 0.592
Question 3 - Precision: 0.65 Recall: 0.647 F1: 0.649
Question 4 - Precision: 0.598 Recall: 0.671 F1: 0.633
Question 5 - Precision: 0.57 Recall: 0.656 F1: 0.61
Question 6 - Precision: 0.608 Recall: 0.608 F1: 0.608
Question 7 - Precision: 0.658 Recall: 0.573 F1: 0.613
Question 8 - Precision: 0.644 Recall: 0.636 F1: 0.64
Question 9 - Precision: 0.637 Recall: 0.63 F1: 0.633
Question 10 - Precision: 0.678 Recall: 0.658 F1: 0.668
Question 11 - Precision: 0.546 Recall: 0.597 F1: 0.57
Question 12 - Precision: 0.609 Recall: 0.563 F1: 0.585
Question 13 - Precision: 0.636 Recall: 0.649 F1: 0.642
Question 14 - Precision: 0.521 Recall: 0.492 F1: 0.506
Question 15 - Precision: 0.528 Recall: 0.638 F1: 0.578
Question 16 - Precision: 0.506 Recall: 0.53 F1: 0.517
Question 17 - Precision: 0.477 Recall: 0.633 F1: 0.544
Question 18 - Precision: 0.562 Recall: 0.579 F1: 0.571
Question 19 - Precision: 0

In [121]:
index_list = []
for i in range(1, len(goldenset)+1):
    index_list.append(i)

In [122]:
len(index_list)

162

In [123]:
len(generated_answers)

162

In [124]:
goldenset = pd.read_csv(r"C:\Users\KD\Desktop\Git\hsc-llm\ground_truth.csv", index_col=0)

In [125]:
generated_col = pd.DataFrame({'ID':index_list,'Generated Answer': generated_answers, 'Precision': precision, 'Recall': recall, 'F1': f1})

In [126]:
generated_col.set_index('ID', inplace=True)

In [127]:
generated_col

Unnamed: 0_level_0,Generated Answer,Precision,Recall,F1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,"Asexual reproduction, also known as agamogenes...",0.645,0.804,0.716
2,The type of cell division process that occurs ...,0.561,0.626,0.592
3,To create an optimal environment for efficient...,0.650,0.647,0.649
4,The critical stages or milestones that occur d...,0.598,0.671,0.633
5,The primary distinction between a somatic cell...,0.570,0.656,0.610
...,...,...,...,...
158,The human eye detects and differentiates betwe...,0.642,0.634,0.638
159,The specific advantages of possessing binocula...,0.604,0.581,0.592
160,"The excretory system, also known as the urinar...",0.390,0.619,0.479
161,The two hormones that play a crucial role in c...,0.513,0.712,0.596


#### Creating Pandas Dataframe to store calculations in CSV

In [128]:
goldenset = pd.concat([goldenset, generated_col], axis=1)

In [129]:
goldenset

Unnamed: 0_level_0,QUESTION,ANSWER,Generated Answer,Precision,Recall,F1
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Define 'asexual reproduction',Asexual reproduction is a type of reproductive...,"Asexual reproduction, also known as agamogenes...",0.645,0.804,0.716
2,Identify the sort of cell division that is inv...,The cell division in asexual reproduction is m...,The type of cell division process that occurs ...,0.561,0.626,0.592
3,Outline the ideal environmental conditions for...,Stable and uniform environments with a good su...,To create an optimal environment for efficient...,0.650,0.647,0.649
4,List the key events in binary fission for bact...,The key events that happen in binary fission f...,The critical stages or milestones that occur d...,0.598,0.671,0.633
5,What is the difference between a somatic cell ...,Somatic cells are all the diploid cells in the...,The primary distinction between a somatic cell...,0.570,0.656,0.610
...,...,...,...,...,...,...
158,Explain how the eye detects different colours.,"There are three types of cone cells, each with...",The human eye detects and differentiates betwe...,0.642,0.634,0.638
159,Discuss the value of having binocular vision.,Binocular vision requires two eyes quite close...,The specific advantages of possessing binocula...,0.604,0.581,0.592
160,Arrange the following structures of the excret...,"The largest is kidney, then nephron, glomerulu...","The excretory system, also known as the urinar...",0.390,0.619,0.479
161,Name the two hormones responsible for regulati...,ADH (antidiuretic hormone)—responds to water l...,The two hormones that play a crucial role in c...,0.513,0.712,0.596


In [130]:
#exporting the dataframe to a csv file
goldenset.to_csv(r'C:\Users\KD\Desktop\Git\hsc-llm\berteval-model3.csv')