# **Retrieval Augmented Generation (RAG) with LLMs**

## **Section A. Experimenting with Vector Store Query Design**

In [206]:
import json, os, io, re, requests, fitz
import requests
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.schema import TextNode
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI

# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
import os, io, json, transformers, pinecone, fitz, pypdf, faiss, sqlite3, langchain_community, langchain, openai, math, time 
from transformers import pipeline
import pandas as pd
import numpy as np
from io import StringIO
from dotenv import load_dotenv
from operator import itemgetter

from langchain import document_loaders, embeddings
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from pinecone import Pinecone, ServerlessSpec, Pinecone         # vector store

from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader
)
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.schema import TextNode
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI

from sentence_transformers import SentenceTransformer

# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


In [207]:
load_dotenv()

#Anyscale Endpoint API Key:
## https://www.anyscale.com/endpoints
access_endpoint_api_key =os.getenv('access_endpoint_api_key') 

# OpenAI API Key:
openai = os.getenv('OPENAI_API_KEY')

# Pinecone API Key:
pinecone_api_key =os.getenv('PINECONE_API_KEY')
environment =os.getenv('PINECONE_ENV')

HF_TOKEN = os.getenv('HF_TOKEN')

# configure Pinecone client
pc = Pinecone(api_key=pinecone_api_key)

In [209]:
doc = fitz.open("./data/CMU Student Handbook 2023-24.pdf")

### **Choose a method to chunk the text data:**

- [Semantic chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker)

- [Recursive chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)

- [Character chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/character_text_splitter)

- [Token chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/split_by_token)

##### Choose a type of chunker (From langchain):
1. Semantic chunking
2. Token chunking

#### **Chunker Choices** (Run only one of the two following cells)

In [161]:
# Chunker choice #1: Semantic chunking

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

text_splitter = SemanticChunker(OpenAIEmbeddings(), breakpoint_threshold_type="standard_deviation")

text_chunks = []
doc_idxs = []

for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_splitter.create_documents([page_text])
    cur_text_chunks = [cur_chunk.page_content for cur_chunk in cur_text_chunks]
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

print(text_chunks)

nodes = []

for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)




In [210]:
# Chunker choice #2: Token chunking
from langchain_text_splitters import NLTKTextSplitter

text_splitter = NLTKTextSplitter(chunk_size=1000)

text_chunks = []
doc_idxs = []

for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_splitter.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

print(text_chunks)

nodes = []

for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
        text=text_chunk,
    )
    src_doc_idx = doc_idxs[idx]
    src_page = doc[src_doc_idx]
    nodes.append(node)

Created a chunk of size 1015, which is longer than the specified 1000
Created a chunk of size 1200, which is longer than the specified 1000
Created a chunk of size 1305, which is longer than the specified 1000
Created a chunk of size 1034, which is longer than the specified 1000




### **Create the vector store using chosen similarity metrics:**

In [211]:
use_serverless = os.environ.get("USE_SERVERLESS", "False").lower() == "true"

if use_serverless:
    spec = pinecone.ServerlessSpec(cloud='aws', region='us-west-2')
else:
    spec = pinecone.PodSpec(environment=environment)

# Name our Pinecone Index:
index_name = "hw02"

# If a Pinecone index of the same name already exists, delete it:
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

### **choose a similarity metric to use for the vector store:** cosine similarity

In [212]:
# Name our Pinecone Index:
index_name = "hw02"

# define similarity and additional parameters for the vector store index:

dimensions = 1536 #768              # the dimensions of the index need to align with the LLM we are using for the RAG system. For example, if using openAI then dimenion = 1536. If using Llama2, then dimension = 384.

# "dotproduct" is one similarity metric we can for the vector store index. We can use different distance metrics to measure the similarity between vector embeddings and user queries. This is where we define what similarity metric we are going to use for the vector store.
# "cosine" is another similarity metric we can use for the vector store index.
# "euclidean" is another similarity metric we can use for the vector store index.

pc.create_index(
    name=index_name, 
    dimension=dimensions, 
    metric="cosine",          # we can use different distance metrics to measure the similarity between vector embeddings and user queries. this is where we define what similarity metric we are going to use for the vector store.
    spec=spec
)

# wait for index to be ready before connecting
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

for index in pc.list_indexes():
    print(index['name'])


pc.describe_index("hw02")


pc_index = pc.Index(index_name)  # create an index to use in the vector store


vector_store = PineconeVectorStore(pinecone_index=pc_index)    # this function creates a vector store where we will add and store embeddings

hw02


In [213]:

pc_index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [214]:
llm = OpenAI(model="gpt-3.5-turbo")

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:00<00:00,  6.35it/s]
100%|██████████| 906/906 [04:28<00:00,  3.37it/s]


### ***choose an embedding model to use for the vector store:** small text embedding

#### **OpenAI Embeddings**

In [215]:
model_ada="text-embedding-ada-002"
small_txt_embedmodel_="text-embedding-3-small"


In [216]:
embed_model = OpenAIEmbedding(model="text-embedding-3-small", openai_api_key=openai)

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

### **load the embeddings into the vector store (e.g. create a vector store):**

In [217]:
vector_store.add(nodes)

Upserted vectors: 100%|██████████| 906/906 [00:04<00:00, 197.44it/s]


['a5926d99-c89b-4972-aa1e-dc9d5911b1ce',
 'bc1ee046-9d63-4007-acbd-0f4a760751eb',
 '6ce89012-1312-4538-b7f1-a47f8164e2f3',
 'ede19be9-f4a9-4509-9d2e-517a993c4664',
 'bf247eaf-4404-4ee9-b0bd-0591627949ca',
 'a7cc6aed-44b4-477a-8ea5-45c732e4bb54',
 '38eab5ff-b53b-4e70-b87f-1ea5d5e8dd21',
 '3bce30d2-53e3-457a-a9d0-4576fa982b5d',
 'e206f66e-c29f-4bc8-a853-8146339c86aa',
 '358a8677-ce1d-4a99-afac-0fb8470409a1',
 '5fc07c26-125d-424d-96bd-13e06f925ab1',
 'd50feb14-05a0-4ec5-b603-a26e60d24ef4',
 '0bbdff56-263a-4b1c-a58b-3117d759e256',
 '02b2809c-36ac-43a1-ad43-dea0523e12b7',
 '2bccf625-a10f-41ec-b79e-5537f208583f',
 '2d74eeb7-a51c-4c26-bb40-f947f98f16b8',
 '4a970ed5-e40c-464f-adfa-e7e31d16c484',
 'b6a86b7d-8003-400b-93f5-c67aa8cae529',
 '8c6818e1-4e44-48c5-a560-58a2e53f3c24',
 'fca91fca-03d5-4a8c-bce5-11647a89d9e5',
 'b7456e62-fd9b-4202-88da-132a36860636',
 '6265a972-7011-4124-8c80-141a08b4bf5f',
 'e0115ace-b6ac-42cf-bd49-93fa636a50c2',
 'be907948-26a1-4c9c-9adf-e166c2b100be',
 'b9b20fa2-f290-

In [219]:

pc_index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.005,
 'namespaces': {'': {'vector_count': 500}},
 'total_vector_count': 500}

In [220]:
print(nodes[0].metadata)

{'document_title': '"The Carnegie Mellon University Student Handbook: Navigating Rights, Responsibilities, and Resources for Academic Success"', 'questions_this_excerpt_can_answer': '1. What is the title of the document that provides information on navigating rights, responsibilities, and resources for academic success at Carnegie Mellon University for the academic year 2023-2024?\n2. What specific information is included in the Carnegie Mellon University Student Handbook that can help students understand their rights and responsibilities as members of the university community?\n3. How does the Student Handbook at Carnegie Mellon University aim to support students in achieving academic success and accessing resources on campus?'}


In [221]:
print(nodes[0])

Node ID: a5926d99-c89b-4972-aa1e-dc9d5911b1ce
Text: 1        The Word: Student Handbook  2023-2024


### **Retrieve Content from the Vector Store**

In [222]:
from openai import OpenAI
client = OpenAI()

#### **Query the vector store using these queries**

**Instruction: set the 'k' parameter to 5**

Query 1: What is the policy statement for the academic integrity policy?

Query 2: What is the policy violation definition for cheating?

Query 3: What is the policy statement for improper or illegal communications?

Query 4: What are CMU’s quiet hours?

Query 5: Where are pets allowed on CMU?


### ***query the vector store with the 5 queries above :***

In [223]:
# query the vector store with the 5 queries above (don't forget to record the responses in your homework submission!):

def query_vector_store(query, k):
    res = client.embeddings.create(
        input=[query],
        model= embed_model_3_small
    )

    # retrieve from Pinecone
    xq = res.data[0].embedding #res['data'][0]['embedding']

    # get relevant contexts (including the questions)
    res2 = pc_index.query(vector=xq, top_k=k, include_metadata=True)
    content = json.loads(json.dumps(res2.to_dict(), indent=4))
    match_id = []
    for match in content['matches']:
        match_id.append(match['id'])
    for node in nodes:
        if node.id_ in match_id:
            print(node)

In [190]:
# Query 1

query = (
    "What is the policy statement for the academic integrity policy?"
)

query_vector_store(query, 5)

Node ID: 7aa06d48-c86f-4861-860c-b2d9339abb3e
Text: 11      classroom, they should strive to advance the art of
teaching. One of the primary goals should  be to instill in their
students a desire to learn and an enthusiasm for the subject matter at
hand. The faculty as a whole also has the major responsibility for
establishing and  maintaining curricula which meet the standards and
fulfill the e...
Node ID: 62a7f471-2ed8-494d-8df0-0fed6c6529ba
Text: 13      conducting their learning in a similarly honest and
committed fashion-by avoiding plagiarism,  cheating or taking credit
for work not their own-and thus contributing to a campus  atmosphere
which expects and supports academic integrity. Practice of the Mission
of Academic Integrity  Academic integrity refers to the implicit
commitment th...
Node ID: a32ed50f-62e0-42fc-ad4c-28017b19c6bd
Text: The university expects its members to be leaders in matters
concerning integrity, not only  here, but in the larger society we
serve. Standards

In [191]:
# Query 2

query = (
    "What is the policy violation definition for cheating?"
)

query_vector_store(query, 5)

Node ID: 5dd1152a-7a30-410e-a10f-88ead7d31a32
Text: 17      fairness and exemplary behavior represent the
expectations for ethical behavior for all  members of the Carnegie
Mellon community. Policy Statement  In any manner of presentation, it
is the responsibility of each student to produce her/his  own original
academic work. Collaboration or assistance on academic work to be
graded  is not perm...
Node ID: afef51b5-911e-427c-af8e-308f1cf466aa
Text: 18      6. Falsification of academic credentials. Plagiarism is
defined as the use of work or concepts contributed by other
individuals  without proper attribution or citation. Unique ideas or
materials taken from another  source for either written or oral use
must be fully acknowledged in academic work to be  graded. Examples of
sources expecte...
Node ID: 5c9e1031-18ca-44e0-a355-8f70e18b0c2f
Text: 116      Academic disciplinary actions are outcomes imposed when
any student violates the University  Policy on Academic Integrity
including c

In [192]:
# Query 3

query = (
    "What is the policy statement for improper or illegal communications?"
)

query_vector_store(query, 5)

Node ID: 8235f22e-1050-4471-b0a0-cb33584f64c7
Text: 20      Patrick McCue  Accessibility  Specialist  412-268-7537
pmccue@andrew.cmu.edu   Ethan Obstarczyk  Senior Disability  Resources
Program  Administrator  412-268-5940  ethano@andrew.cmu.edu  Jill
Ramsey  Testing Center  Proctor  412-268-5940  jillrams@andrew.cmu.edu
Change of Address  A student is required to report all changes in
address...
Node ID: 52a15a33-6a4b-4a14-b747-48ed270a9b47
Text: 22      Carnegie Mellon, should immediately notify the
Information Security Office at iso- ir@andrew.cmu.edu. Maintain the
Security and Confidentiality of your Account  Users assume personal
responsibility for the use made of their computer accounts. This
responsibility begins with selecting a secure password, and involves
maintaining the  conf...
Node ID: 4969511a-5d6a-43dc-99f9-816caecae18a
Text: 30      • Using another person’s computer account, userID,
files, or data without appropriate  permission, as described in the
previous bullet (e.

In [193]:
# Query 4

query = (
    "What are CMU's quiet hours?"
)

query_vector_store(query, 5)

Node ID: c1dbf187-8ba9-4820-893a-b7bb7d74a7e0
Text: 112      Housing, Meal Plan & Fee Adjustments  Housing charges
are adjusted daily, beginning on check-in day and ending on the last
day  of final exams for the semester. Holiday breaks are included. The
Winter Break period is  not included. Meal plan charges are adjusted
weekly. Dine Express and Campus Express are assessed  based upon
actual use...
Node ID: b3be81b0-0eec-409e-8829-827c47374803
Text: 113      As with any policy, there must be a means of making
exceptions. Any academic or artistic  activities which you feel must
be scheduled between 4:50 p.m. and 6:50 p.m. must be  cleared with the
University Registrar. These requests must be in writing either as a
memo  or through email to CMURegistrar@andrew.cmu.edu. All requests
must inclu...
Node ID: 16e46d34-5a25-46b6-b201-71460b1142e0
Text: 12. Carnegie Mellon reserves the right to establish other
reasonable expectations or  restrictions at any time to ensure the
health or safety 

In [194]:
# Query 5

query = (
    "Where are pets allowed on CMU?"
)

query_vector_store(query, 5)

Node ID: 36a76f65-8e97-4ecf-a498-b3d0622f374b
Text: 181      Outdoor Poster Policy for Residential Areas  No signs,
posters or banners of any size shall be hung outside or around any
residential  building, except upon special request. This applies to
all residence halls, apartment  buildings and houses as well as
fraternities and sororities. Special consideration may be  given, on
request, for gr...
Node ID: ee4fdc35-490b-4554-b35d-0f257d7645d7
Text: 182      1. Any emotional support animal or service animal must
be healthy and free of  communicable disease. If appropriate for the
animal, documentation of all  veterinarian recommended vaccination
must be provided to the Office of Disability  Resources. New proof of
vaccination must be provided before the previous proof of  vaccination
expire...
Node ID: fe6336fe-49de-427e-b3d1-851ccf7cdcc7
Text: 183      encouraged to establish a plan to care for the animal
in the event the student is  unable to do so (e.g. due to illness or
emergency)

The student handbook is a long PDF file, so we thought cosine similarity would be the most appropriate metric. This is because cosine similarity measures the similarity between two vectors regardless of their magnitudes, focusing only on the direction – this is effective for files of large length such as in this case, and makes it robust to differences in document lengths. Cosine similarity metric is also commonly used in NLP tasks such as information retrieval. Since this common use case is similar to our own of reading and answering queries based on a text, we thought it would be a good fit.

In comparison, dot product uses both magnitude and direction of vectors as opposed to only using direction. This means that the result can be skewed by irrelevant details. For instance, in a situation like this, where we are working with text-based data, the construction of an NLP model where vectors may represent frequencies of words can skew results. Dot product directly multiplies the frequency of each word in the vectors, potentially amplifying the importance of frequent words and leading to skewed results. Meanwhile, Euclidean distance is also not as intuitive for interpreting similarity as cosine similarity. While smaller distances generally indicate greater similarity, the absolute values are not directly comparable across different pairs of passages, making it challenging to assess similarity comprehensively.


Results or information retrieved from the vector store in response to each of the queries you submitted to the vector store.  

	Response 1 (to Query 1):

	Node ID: 7aa06d48-c86f-4861-860c-b2d9339abb3e
	Text: 11      classroom, they should strive to advance the art of
	teaching. One of the primary goals should  be to instill in their
	students a desire to learn and an enthusiasm for the subject matter at
	hand. The faculty as a whole also has the major responsibility for
	establishing and  maintaining curricula which meet the standards and
	fulfill the e...
	
	Node ID: 62a7f471-2ed8-494d-8df0-0fed6c6529ba
	Text: 13      conducting their learning in a similarly honest and
	committed fashion-by avoiding plagiarism,  cheating or taking credit
	for work not their own-and thus contributing to a campus  atmosphere
	which expects and supports academic integrity. Practice of the Mission
	of Academic Integrity  Academic integrity refers to the implicit
	commitment th...
	
	Node ID: a32ed50f-62e0-42fc-ad4c-28017b19c6bd
	Text: The university expects its members to be leaders in matters
	concerning integrity, not only  here, but in the larger society we
	serve. Standards for Academic & Creative Life (Academic Policies)
	This section of The Word serves to highlight many policies and
	procedures related to academics  at Carnegie Mellon University. You
	should also become fam...
	
	Node ID: 5dd1152a-7a30-410e-a10f-88ead7d31a32
	Text: 17      fairness and exemplary behavior represent the
	expectations for ethical behavior for all  members of the Carnegie
	Mellon community. Policy Statement  In any manner of presentation, it
	is the responsibility of each student to produce her/his  own original
	academic work. Collaboration or assistance on academic work to be
	graded  is not perm...
	
	Node ID: 5c9e1031-18ca-44e0-a355-8f70e18b0c2f
	Text: 116      Academic disciplinary actions are outcomes imposed when
	any student violates the University  Policy on Academic Integrity
	including cheating, plagiarism and unauthorized assistance. Statute of
	Limitations  There is no statute of limitations that precludes course
	instructors from acting on the discovery  of alleged violations either
	duri...


	Response 2 (to Query 2): 

	Node ID: 5dd1152a-7a30-410e-a10f-88ead7d31a32
	Text: 17      fairness and exemplary behavior represent the
	expectations for ethical behavior for all  members of the Carnegie
	Mellon community. Policy Statement  In any manner of presentation, it
	is the responsibility of each student to produce her/his  own original
	academic work. Collaboration or assistance on academic work to be
	graded  is not perm...

	Node ID: afef51b5-911e-427c-af8e-308f1cf466aa
	Text: 18      6. Falsification of academic credentials. Plagiarism is
	defined as the use of work or concepts contributed by other
	individuals  without proper attribution or citation. Unique ideas or
	materials taken from another  source for either written or oral use
	must be fully acknowledged in academic work to be  graded. Examples of
	sources expecte...

	Node ID: 5c9e1031-18ca-44e0-a355-8f70e18b0c2f
	Text: 116      Academic disciplinary actions are outcomes imposed when
	any student violates the University  Policy on Academic Integrity
	including cheating, plagiarism and unauthorized assistance. Statute of
	Limitations  There is no statute of limitations that precludes course
	instructors from acting on the discovery  of alleged violations either
	duri...

	Node ID: 6b8f1e80-3625-4dd1-b9c6-92643e8e8ad3
	Text: 252      hours of learning them. Relevant information includes
	(to the extent known by the  Responsible Employee):   a. Name (or
	names) of the Complainant(s);  b. Name (or names) of Respondent(s);
	c. Details about the nature of the alleged Prohibited Conduct; and  d.
	Any information available about location(s), date(s), and/or time(s).
	A full l...

	Node ID: 997a4e3d-2db8-49e6-b537-f77ebd884453
	Text: 271      • Trespassing on a roof, parapet, and/or other non-
	public areas  • Unauthorized possession or use of keys or access cards
	• Unauthorized tapping into or altering university utility lines  •
	Vandalism  • Violation of the Carnegie Mellon University Housing
	Services policies  • Willful destruction of, disabling, or damaging
	computer facil...


	Response 3 (to Query 3):

	Node ID: 8235f22e-1050-4471-b0a0-cb33584f64c7
	Text: 20      Patrick McCue  Accessibility  Specialist  412-268-7537
	pmccue@andrew.cmu.edu   Ethan Obstarczyk  Senior Disability  Resources
	Program  Administrator  412-268-5940  ethano@andrew.cmu.edu  Jill
	Ramsey  Testing Center  Proctor  412-268-5940  jillrams@andrew.cmu.edu
	Change of Address  A student is required to report all changes in
	address...

	Node ID: 52a15a33-6a4b-4a14-b747-48ed270a9b47
	Text: 22      Carnegie Mellon, should immediately notify the
	Information Security Office at iso- ir@andrew.cmu.edu. Maintain the
	Security and Confidentiality of your Account  Users assume personal
	responsibility for the use made of their computer accounts. This
	responsibility begins with selecting a secure password, and involves
	maintaining the  conf...

	Node ID: 4969511a-5d6a-43dc-99f9-816caecae18a
	Text: 30      • Using another person’s computer account, userID,
	files, or data without appropriate  permission, as described in the
	previous bullet (e.g. using an account found “logged in”  on a cluster
	machine). • Deleting or tampering with another user’s files or with
	information stored by another  user on any information-bearing medium
	(disk, tape...

	Node ID: 0f37692c-50d8-4f0e-b457-fa36270d691e
	Text: 31      • Reselling of services based on the university network,
	such as web hosting, mailing  services or the selling of shell
	accounts. • Running a proxy server which results in inappropriate or
	unauthorized access to  university materials to non-university
	members. • Advertising commercial businesses or ventures on Web pages
	hosted by Carnegi...

	Node ID: 9e989017-9662-45d7-9147-4e71d657a084
	Text: 188      Wires and Utility Lines  No wire may be strung outside
	any living unit or any alterations made to any university  residential
	building without the express permission of Housing Services. This
	includes  satellite dishes, radio and television aerials. In addition,
	no student or student organization may alter or tap into university
	utility...


	Response 4 (to Query 4):

	Node ID: c1dbf187-8ba9-4820-893a-b7bb7d74a7e0
	Text: 112      Housing, Meal Plan & Fee Adjustments  Housing charges
	are adjusted daily, beginning on check-in day and ending on the last
	day  of final exams for the semester. Holiday breaks are included. The
	Winter Break period is  not included. Meal plan charges are adjusted
	weekly. Dine Express and Campus Express are assessed  based upon
	actual use...

	Node ID: b3be81b0-0eec-409e-8829-827c47374803
	Text: 113      As with any policy, there must be a means of making
	exceptions. Any academic or artistic  activities which you feel must
	be scheduled between 4:50 p.m. and 6:50 p.m. must be  cleared with the
	University Registrar. These requests must be in writing either as a
	memo  or through email to CMURegistrar@andrew.cmu.edu. All requests
	must inclu...

	Node ID: 16e46d34-5a25-46b6-b201-71460b1142e0
	Text: 12. Carnegie Mellon reserves the right to establish other
	reasonable expectations or  restrictions at any time to ensure the
	health or safety of the community. Individuals with concerns or
	complaints regarding animals in residence facilities should  contact a
	Resident Assistant or University Police. Personal Property and
	Insurance  Students are ...

	Node ID: 251f51ad-7ad8-4fd5-8870-3fe1026f69f6
	Text: 184      in effect until the night before classes resume.
	Courtesy hours exist at all other times,  during which the right of
	residents to live in an environment free from bothersome noise  should
	be respected. "Quiet" is defined as being unable to hear any noise at
	a distance of 10 feet from a room  with a closed door. During
	"courtesy" hours, ...

	Node ID: d0d4b66e-54d9-4a4e-8766-d18997282925
	Text: 192      Student Affairs following a determination that the
	arrangement is no longer warranted or  necessary. A student
	considering request of an NCA or NCO may discuss the situation with a
	Housefellow, a College Liaison, the Office of the Dean of Student
	Affairs, the Office for  Institutional Equity and Title IX, the Office
	of Community Standa...


	Response 5 (to Query 5): 

	Node ID: 36a76f65-8e97-4ecf-a498-b3d0622f374b
	Text: 181      Outdoor Poster Policy for Residential Areas  No signs,
	posters or banners of any size shall be hung outside or around any
	residential  building, except upon special request. This applies to
	all residence halls, apartment  buildings and houses as well as
	fraternities and sororities. Special consideration may be  given, on
	request, for gr...

	Node ID: ee4fdc35-490b-4554-b35d-0f257d7645d7
	Text: 182      1. Any emotional support animal or service animal must
	be healthy and free of  communicable disease. If appropriate for the
	animal, documentation of all  veterinarian recommended vaccination
	must be provided to the Office of Disability  Resources. New proof of
	vaccination must be provided before the previous proof of  vaccination
	expire...

	Node ID: fe6336fe-49de-427e-b3d1-851ccf7cdcc7
	Text: 183      encouraged to establish a plan to care for the animal
	in the event the student is  unable to do so (e.g. due to illness or
	emergency).

	Node ID: 16e46d34-5a25-46b6-b201-71460b1142e0
	Text: 12. Carnegie Mellon reserves the right to establish other
	reasonable expectations or  restrictions at any time to ensure the
	health or safety of the community. Individuals with concerns or
	complaints regarding animals in residence facilities should  contact a
	Resident Assistant or University Police. Personal Property and
	Insurance  Students are ...
	
	Node ID: ba8a89e5-9d1a-4f42-b80c-45aee29a88fb
	Text: 197      Bicycles or other wheeled vehicles impounded may be
	recovered only upon proof of  ownership and after required fees are
	paid. Owners are responsible for all costs involved  in removal and
	storage of impounded items. Unclaimed wheeled vehicles shall be held
	for a minimum of six months by Transportation  Services, at which time
	the owner ...


We expected that the queries submitted to the vector store would be generally successful in retrieving relevant information, however in reality this varied based on the provided responses. 
The responses related to academic integrity, including the policy statement and definitions of cheating and plagiarism, were clear and directly relevant. These queries were successful because of the structured and explicit nature of these policies within the university's documentation. On the other hand, the queries regarding CMU's quiet hours and where pets are allowed did not retrieve as direct information and were not successful. This might be due to these topics being more specific and possibly covered in less detail or scattered across different sections of the handbook. 


## **Section B. Experimenting with Vector Store Embeddings & Query Parameters**
Choosing 1 of the 5 queries provided above, and experiment with submitting the query to the vector store by changing the search parameters in the following manner:


*   A) Baseline query, e.g. query, k=5.

*   B) Query, parameter k = 10

*   C) Query, parameter k = 50

*   D) Query, parameter k = 75

*   E) Query, parameter k = 100

*   F) Query, parameter k = 500


In [224]:
query = ("What are CMU's quiet hours?")

In [225]:
# k = 5
query_vector_store(query, 5)

Node ID: b0fa1aa3-a37f-416f-9c9d-d44834bf799c
Text: Members of the floor also have the right to determine community
standards for the floor.  Roommates have equal voice in determining
the private activities  of the room, and one roommate's rights must
not infringe upon another roommate's  rights.  Public Lounge
Facilities  At the beginning of each academic year, a layout of each
lounge area and ...
Node ID: 13734c0e-ed7d-4783-a116-2d38ecbb5dcf
Text: Quiet Hours  Quiet hours for all residential areas are 8 pm - 8
am on Sunday through Thursday nights,  and Midnight - 9 am on Friday
and Saturday nights.  During finals week or reading days,  24-hour
quiet hours will be in effect.  For long vacation weekends, weekend
hours will be
Node ID: 2a9fb2e9-e69a-4ccd-8e51-57b0a59419dd
Text: 184      in effect until the night before classes resume.
Courtesy hours exist at all other times,  during which the right of
residents to live in an environment free from bothersome noise  should
be respected.  

In [226]:
# k = 10
query_vector_store(query, 10)

Node ID: 4b9967d5-747a-4d5a-85b9-953517e2d0d7
Text: Library Policies  Conduct  • Silence cell phones.  Use cell
phones in appropriate areas only.  • Food is allowed in the Maggie
Murph Café and adjacent areas only.  • Students may sign up to use
group study rooms in the Sorrells Engineering & Science  Library and
Hunt Library.  Reserve rooms online.  • Informal group study is
permitted in all pub...
Node ID: fba41415-f164-4a35-9aab-d5f0d9c09f75
Text: Undergraduate Course Meeting Policy  Policy Statement  No
undergraduate classes, exams, academic, or artistic activities
(including: extra help  session, rehearsals, ROTC drill, make-up
exams, etc.)  are scheduled on weekdays between  4:50 p.m. and 6:50
p.m.  Extra class time beyond those regularly scheduled must take
place either before 4:50 p...
Node ID: 6528ab5d-b448-49f0-8e9e-eed1ba9a3dbb
Text: Since  we are all concerned about the quality of life at the
university, this time must be held for  the students.  In planning the
academic co

In [227]:
# k = 50
query_vector_store(query, 50)

Node ID: d35bdbb8-90e3-4c1e-87d5-ca9d3e56b3e5
Text: 20      Patrick McCue  Accessibility  Specialist  412-268-7537
pmccue@andrew.cmu.edu   Ethan Obstarczyk  Senior Disability  Resources
Program  Administrator  412-268-5940  ethano@andrew.cmu.edu  Jill
Ramsey  Testing Center  Proctor  412-268-5940  jillrams@andrew.cmu.edu
Change of Address  A student is required to report all changes in
address...
Node ID: 5f3bd64e-3f55-42c5-96ab-1e2d133e2d2a
Text: 33      access to external sites or removes network access for
internal sites, the purpose of the  action is to maintain the security
and reliability of the computer systems and networks  rather than to
punish an individual or a site, or to restrict the free expression of
ideas.  Conduct of Classes (Undergraduate Students)  Students are
expected...
Node ID: 4b05809b-4265-42df-83dd-99954c8417af
Text: 43      Policies  I. In-term Examinations  1.  All in-term
examinations should be given during the regularly scheduled class
time.  However, if t

In [228]:
# k = 75
query_vector_store(query, 75)

Node ID: d35bdbb8-90e3-4c1e-87d5-ca9d3e56b3e5
Text: 20      Patrick McCue  Accessibility  Specialist  412-268-7537
pmccue@andrew.cmu.edu   Ethan Obstarczyk  Senior Disability  Resources
Program  Administrator  412-268-5940  ethano@andrew.cmu.edu  Jill
Ramsey  Testing Center  Proctor  412-268-5940  jillrams@andrew.cmu.edu
Change of Address  A student is required to report all changes in
address...
Node ID: eeb01ade-3723-4da6-9637-0adcc5c32759
Text: 23      policy.  For further guidelines, see also the university
policy on Separation of Individual’s  and Institution’s Interests.
Responsible Sharing of Resources   Where a resource such as memory,
CPU time or access to network resources belongs to  the whole
community collectively, it must be shared.  It is unacceptable to make
such excessiv...
Node ID: 5bc42b8e-b925-4b56-9cef-d1a6d928cf8d
Text: In cases of questionable personal use of resources, you may
contact it- help@andrew.cmu.edu to determine whether a particular
activity is permissi

In [229]:
# k = 100
query_vector_store(query, 100)

Node ID: d35bdbb8-90e3-4c1e-87d5-ca9d3e56b3e5
Text: 20      Patrick McCue  Accessibility  Specialist  412-268-7537
pmccue@andrew.cmu.edu   Ethan Obstarczyk  Senior Disability  Resources
Program  Administrator  412-268-5940  ethano@andrew.cmu.edu  Jill
Ramsey  Testing Center  Proctor  412-268-5940  jillrams@andrew.cmu.edu
Change of Address  A student is required to report all changes in
address...
Node ID: eeb01ade-3723-4da6-9637-0adcc5c32759
Text: 23      policy.  For further guidelines, see also the university
policy on Separation of Individual’s  and Institution’s Interests.
Responsible Sharing of Resources   Where a resource such as memory,
CPU time or access to network resources belongs to  the whole
community collectively, it must be shared.  It is unacceptable to make
such excessiv...
Node ID: 5bc42b8e-b925-4b56-9cef-d1a6d928cf8d
Text: In cases of questionable personal use of resources, you may
contact it- help@andrew.cmu.edu to determine whether a particular
activity is permissi

In [230]:
# k = 500
query_vector_store(query, 500)

Node ID: ede19be9-f4a9-4509-9d2e-517a993c4664
Text: 3      Index    Carnegie Mellon Vision and Mission .............
...................................................................  4
Carnegie Mellon Code .................................................
......................................................  5  Academic
Policies
..................................................................
Node ID: 38eab5ff-b53b-4e70-b87f-1ea5d5e8dd21
Text: 5      Carnegie Mellon Code    Students at Carnegie Mellon,
because they are members of an academic community dedicated  to the
achievement of excellence, are expected to meet the highest standards
of personal,  ethical and moral conduct possible.  These standards
require personal integrity, a commitment  to honesty without
compromise, as well a...
Node ID: e206f66e-c29f-4bc8-a853-8146339c86aa
Text: 6                            Academic Policies
Node ID: 358a8677-ce1d-4a99-afac-0fb8470409a1
Text: 7      Academic Policies  Educational Goals

We chose the “What are CMU’s quiet hours” query (Query 4) because it has a more definitive answer in terms of having exact quiet hours depending on the day compared to the other 4 queries. Hence, we believed that it would be easier to experiment with it because testing the result becomes easier.

**Responses to the 6 queries you submitted to the vector store, as described in B.1 above.**

	Response 1 (to Query A):

	Node ID: c1dbf187-8ba9-4820-893a-b7bb7d74a7e0
	Text: 112      Housing, Meal Plan & Fee Adjustments  Housing charges
	are adjusted daily, beginning on check-in day and ending on the last
	day  of final exams for the semester. Holiday breaks are included. The
	Winter Break period is  not included. Meal plan charges are adjusted
	weekly. Dine Express and Campus Express are assessed  based upon
	actual use...

	Node ID: b3be81b0-0eec-409e-8829-827c47374803
	Text: 113      As with any policy, there must be a means of making
	exceptions. Any academic or artistic  activities which you feel must
	be scheduled between 4:50 p.m. and 6:50 p.m. must be  cleared with the
	University Registrar. These requests must be in writing either as a
	memo  or through email to CMURegistrar@andrew.cmu.edu. All requests
	must inclu...

	Node ID: 16e46d34-5a25-46b6-b201-71460b1142e0
	Text: 12. Carnegie Mellon reserves the right to establish other
	reasonable expectations or  restrictions at any time to ensure the
	health or safety of the community. Individuals with concerns or
	complaints regarding animals in residence facilities should  contact a
	Resident Assistant or University Police. Personal Property and
	Insurance  Students are ...

	Node ID: 251f51ad-7ad8-4fd5-8870-3fe1026f69f6
	Text: 184      in effect until the night before classes resume.
	Courtesy hours exist at all other times,  during which the right of
	residents to live in an environment free from bothersome noise  should
	be respected. "Quiet" is defined as being unable to hear any noise at
	a distance of 10 feet from a room  with a closed door. During
	"courtesy" hours, ...

	Node ID: d0d4b66e-54d9-4a4e-8766-d18997282925
	Text: 192      Student Affairs following a determination that the
	arrangement is no longer warranted or  necessary. A student
	considering request of an NCA or NCO may discuss the situation with a
	Housefellow, a College Liaison, the Office of the Dean of Student
	Affairs, the Office for  Institutional Equity and Title IX, the Office
	of Community Standa...
	

	Response 2 (to Query B): 

	Node ID: 04652146-d336-49a4-982d-39201e458bf8
	Text: 75      adopted by the university trustees. Once adopted,
	amendments will become binding on  new faculty, administration, and
	staff when hired, on existing faculty and staff when they  sign new
	employment contracts, and on graduate and undergraduate students when
	admitted. Other university personnel, including tenured faculty, and
	current staff...

	Node ID: ca781f31-7936-46cd-8826-719136a9c8b6
	Text: 108      suspended, only the president (or the president's
	designated representative) has the  authority to modify operations
	and/or close the university and to specify those persons or  group of
	persons who are to refrain from coming to the Pittsburgh campus and/or
	who  are free to leave work or class early. Staff members who provide
	essential ...

	Node ID: c1dbf187-8ba9-4820-893a-b7bb7d74a7e0
	Text: 112      Housing, Meal Plan & Fee Adjustments  Housing charges
	are adjusted daily, beginning on check-in day and ending on the last
	day  of final exams for the semester. Holiday breaks are included. The
	Winter Break period is  not included. Meal plan charges are adjusted
	weekly. Dine Express and Campus Express are assessed  based upon
	actual use...

	Node ID: b3be81b0-0eec-409e-8829-827c47374803
	Text: 113      As with any policy, there must be a means of making
	exceptions. Any academic or artistic  activities which you feel must
	be scheduled between 4:50 p.m. and 6:50 p.m. must be  cleared with the
	University Registrar. These requests must be in writing either as a
	memo  or through email to CMURegistrar@andrew.cmu.edu. All requests
	must inclu...

	Node ID: aafd25e6-e0f0-4408-accd-45ca6b68c4e9
	Text: 175      Bicycles  Bicycles are to be "walked" inside any
	residential building. Bicycle racks are provided in  residential areas
	where appropriate. No bicycles shall be parked or stored in such a
	manner as to block entrance/exit from any building, to impede the
	normal movement of  wheelchair users or others with special mobility
	needs, and/or t...

	Node ID: 679bda26-6856-45ce-b712-c220e83cef74
	Text: 178      not Underwriters Laboratory approved. If you are
	uncertain about whether or not an item  is prohibited, please contact
	Housing Services prior to purchasing. Electric kettles and standalone
	rice cookers are approved for use in residential kitchen  spaces only
	and are not able to be used in student bedrooms or living areas that
	are not  d...

	Node ID: 16e46d34-5a25-46b6-b201-71460b1142e0
	Text: 12. Carnegie Mellon reserves the right to establish other
	reasonable expectations or  restrictions at any time to ensure the
	health or safety of the community. Individuals with concerns or
	complaints regarding animals in residence facilities should  contact a
	Resident Assistant or University Police. Personal Property and
	Insurance  Students are ...

	Node ID: 251f51ad-7ad8-4fd5-8870-3fe1026f69f6
	Text: 184      in effect until the night before classes resume.
	Courtesy hours exist at all other times,  during which the right of
	residents to live in an environment free from bothersome noise  should
	be respected. "Quiet" is defined as being unable to hear any noise at
	a distance of 10 feet from a room  with a closed door. During
	"courtesy" hours, ...

	Node ID: 5774fc1b-27f4-4ecb-9087-367adc71630d
	Text: 185      • Stereos, televisions and radios will be permitted but
	must be used only in accordance  with the general policy. • Use of
	musical instruments will not be permitted. • Social events and private
	gatherings will be permitted; however, they must be very  small and
	are subject to the above standards. It is hoped that this designation
	of a q...
	
	Node ID: d0d4b66e-54d9-4a4e-8766-d18997282925
	Text: 192      Student Affairs following a determination that the
	arrangement is no longer warranted or  necessary. A student
	considering request of an NCA or NCO may discuss the situation with a
	Housefellow, a College Liaison, the Office of the Dean of Student
	Affairs, the Office for  Institutional Equity and Title IX, the Office
	of Community Standa...



Responses to the 6 queries you submitted to the vector store, as described in B.2 above. 

	Response 1 (to Query A): 

	Node ID: b0fa1aa3-a37f-416f-9c9d-d44834bf799c
	Text: Members of the floor also have the right to determine community
	standards for the floor.  Roommates have equal voice in determining
	the private activities  of the room, and one roommate's rights must
	not infringe upon another roommate's  rights.  Public Lounge
	Facilities  At the beginning of each academic year, a layout of each
	lounge area and ...

	Node ID: 13734c0e-ed7d-4783-a116-2d38ecbb5dcf
	Text: Quiet Hours  Quiet hours for all residential areas are 8 pm - 8
	am on Sunday through Thursday nights,  and Midnight - 9 am on Friday
	and Saturday nights.  During finals week or reading days,  24-hour
	quiet hours will be in effect.  For long vacation weekends, weekend
	hours will be

	Node ID: 2a9fb2e9-e69a-4ccd-8e51-57b0a59419dd
	Text: 184      in effect until the night before classes resume.
	Courtesy hours exist at all other times,  during which the right of
	residents to live in an environment free from bothersome noise  should
	be respected.  "Quiet" is defined as being unable to hear any noise at
	a distance of 10 feet from a room  with a closed door.  During
	"courtesy" hour...

	Node ID: bc4603fd-6fdf-4aed-a93f-f5b5e4759503
	Text: Quiet Living Areas  Welch and designated first-year housing
	areas offer quiet living, a particularly desirable  alternative to
	traditional on-campus housing for undergraduate students.  This
	designation provides an assurance to all residents in these buildings,
	and to all residents  in the surrounding area, that resident students
	will act with ...
	
	Node ID: 2c6d7eba-1105-4008-b8d7-b2abb69e527b
	Text: Noise  Members of the university community have a right to work,
	study, practice and live in an  environment free from disruptive
	noise.  Acknowledging our desire to foster a vibrant campus  culture,
	which will necessarily involve some noise associated with campus
	activities, the  following expectations are intended to guide
	thoughtful planning ...


	Response 2 (to Query B):

	Node ID: 4b9967d5-747a-4d5a-85b9-953517e2d0d7
	Text: Library Policies  Conduct  • Silence cell phones.  Use cell
	phones in appropriate areas only.  • Food is allowed in the Maggie
	Murph Café and adjacent areas only.  • Students may sign up to use
	group study rooms in the Sorrells Engineering & Science  Library and
	Hunt Library.  Reserve rooms online.  • Informal group study is
	permitted in all pub...

	Node ID: fba41415-f164-4a35-9aab-d5f0d9c09f75
	Text: Undergraduate Course Meeting Policy  Policy Statement  No
	undergraduate classes, exams, academic, or artistic activities
	(including: extra help  session, rehearsals, ROTC drill, make-up
	exams, etc.)  are scheduled on weekdays between  4:50 p.m. and 6:50
	p.m.  Extra class time beyond those regularly scheduled must take
	place either before 4:50 p...

	Node ID: 6528ab5d-b448-49f0-8e9e-eed1ba9a3dbb
	Text: Since  we are all concerned about the quality of life at the
	university, this time must be held for  the students.  In planning the
	academic course schedule, the University Registrar's Office will
	review all  courses to ensure that no academic or artistic courses be
	scheduled in this period.  In  addition, any requests to schedule
	additional or ...

	Node ID: b0fa1aa3-a37f-416f-9c9d-d44834bf799c
	Text: Members of the floor also have the right to determine community
	standards for the floor.  Roommates have equal voice in determining
	the private activities  of the room, and one roommate's rights must
	not infringe upon another roommate's  rights.  Public Lounge
	Facilities  At the beginning of each academic year, a layout of each
	lounge area and ...

	Node ID: 13734c0e-ed7d-4783-a116-2d38ecbb5dcf
	Text: Quiet Hours  Quiet hours for all residential areas are 8 pm - 8
	am on Sunday through Thursday nights,  and Midnight - 9 am on Friday
	and Saturday nights.  During finals week or reading days,  24-hour
	quiet hours will be in effect.  For long vacation weekends, weekend
	hours will be

	Node ID: 2a9fb2e9-e69a-4ccd-8e51-57b0a59419dd
	Text: 184      in effect until the night before classes resume.
	Courtesy hours exist at all other times,  during which the right of
	residents to live in an environment free from bothersome noise  should
	be respected.  "Quiet" is defined as being unable to hear any noise at
	a distance of 10 feet from a room  with a closed door.  During
	"courtesy" hour...

	Node ID: 53d0f973-ec34-4b8d-a1ba-e49dc51ce3a4
	Text: The established quiet hours stated above are the minimums for
	every residential area.  If  you or anyone in your living unit
	believes that quiet hours should be extended, contact  your Resident
	Assistant (RA) or Student Dormitory Council representative.  That
	person  has the authority to conduct a vote of the people living in
	your area and, pend...

	Node ID: bc4603fd-6fdf-4aed-a93f-f5b5e4759503
	Text: Quiet Living Areas  Welch and designated first-year housing
	areas offer quiet living, a particularly desirable  alternative to
	traditional on-campus housing for undergraduate students.  This
	designation provides an assurance to all residents in these buildings,
	and to all residents  in the surrounding area, that resident students
	will act with ...

	Node ID: c4a59bee-8559-4793-b034-ad463d116f13
	Text: 185      • Stereos, televisions and radios will be permitted but
	must be used only in accordance  with the general policy.  • Use of
	musical instruments will not be permitted.  • Social events and
	private gatherings will be permitted; however, they must be very
	small and are subject to the above standards.  It is hoped that this
	designation of ...

	Node ID: 2c6d7eba-1105-4008-b8d7-b2abb69e527b
	Text: Noise  Members of the university community have a right to work,
	study, practice and live in an  environment free from disruptive
	noise.  Acknowledging our desire to foster a vibrant campus  culture,
	which will necessarily involve some noise associated with campus
	activities, the  following expectations are intended to guide
	thoughtful planning ...

Based on the responses to the queries provided, it seems the 'k' parameter set to 10 retrieved the highest quality/most accurate result. This is because with a 'k' value of 10, the responses contained detailed and relevant information from the CMU Student Handbook, focusing on CMU's quiet hours. This parameter strikes a balance between breadth and depth - a smaller 'k' value might not capture enough information to fully answer the query, while a larger 'k' might retrieve too much information, making it harder to identify the most relevant details. The 'k' of 10 provided a focused yet comprehensive overview, making it the best choice for these queries. This suggests that for detailed documents or specific queries, a moderate 'k' value efficiently balances comprehensiveness and relevance.

In this case, we used token-based text chunking instead of semantic-based text chunking and achieved better results for the same query when using token-based chunking. This can be seen from the responses as it includes the exact time for quiet hours on specific days fetched from the CMU Handbook. Based on the responses to the queries provided, it seems the 'k' parameter set to 5 and 10 retrieved the highest quality/most accurate result. For queries that are very specific or require detailed information, a higher 'k' value might be beneficial because it retrieves more candidate responses, increasing the chance of finding accurate and relevant information. On the other hand, for broader queries or when seeking a general overview, a lower 'k' value might suffice and even be preferable, as it focuses on the most relevant results without overwhelming the user with too much or too specific information.

Since the CMU Student Handbook covers a wide range of topics in detail, a 'k' parameter like 10 that strikes a balance between specificity and comprehensiveness might be most effective overall. 

In [448]:
def rag_llm(model, q, k):
    import openai
    access_endpoint_api_key=os.getenv('access_endpoint_api_key')
    f=get_sys_message(q, k)

    client = openai.OpenAI(
            base_url = "https://api.endpoints.anyscale.com/v1",
            api_key=access_endpoint_api_key
            )

    query= "Summarize the resume; include the candidate's occupation title, software and technology skills, and education."

    # Define the messages
    messages = [
        {"role": "system", "content": f},
        {"role": "user", "content": query}]

    # Call the completions endpoint (e.g. the LLM API)
    response = client.chat.completions.create(
        model= model, 
        messages=messages,
    )
    responseContent = response.choices[0].message.content
    responseContentStr = str(responseContent)
    return responseContentStr, responseContent, f #, print(f"MODEL: {model}--RESUME SUMMARY: \n\n{responseContent}") 


### **Call LLM API**

In [483]:
model_01 = "mistralai/Mixtral-8x7B-Instruct-v0.1"
model_02 = "google/gemma-7b-it"
model_03="meta-llama/Llama-2-7b-chat-hf"

In [485]:
# MistralAI Model:
textA, __, original_messageA =rag_llm(model=model_01, q="Which resume has the most software skills listed?", k=100)



In [486]:
# Google Gemma Model:
textB, __, original_messageB=rag_llm(model=model_02, q="Which resume has the most software skills listed?", k=100)

In [487]:
# Llama2 model:
textC, __, original_messageC=rag_llm(model=model_03, q="Which resume has the most software skills listed?", k=10)

#### Evaluating LLM Responses

In [463]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compare_text_similarity(text_a, text_b, text_c):
    """
    Compares the similarity between three texts using TF-IDF vectors and cosine similarity.
    
    Parameters:
    - text_a (str): Text A
    - text_b (str): Text B
    - text_c (str): Text C
    
    Returns:
    - A dictionary with similarity scores between Text A & Text B, A & C, and B & C.
    """
    # Initialize the vectorizer and transform texts into TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([text_a, text_b, text_c])
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Similarity between Text A and B, A and C, and then B and C
    similarity_scores = {
        "A_B": similarity_matrix[0, 1],
        "A_C": similarity_matrix[0, 2],
        "B_C": similarity_matrix[1, 2]
    }

    return similarity_scores

def compare_text_similarity_response2context(om, model_response):
    """
    Compares the similarity between three texts using TF-IDF vectors and cosine similarity.
    
    Parameters:
    - text_a (str): Text A
    - text_b (str): Text B
    - text_c (str): Text C
    
    Returns:
    - A dictionary with similarity scores between Text A & Text B, A & C, and B & C.
    """
    # Initialize the vectorizer and transform texts into TF-IDF vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([om, model_response])
    
    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    # Similarity between Text A and B, A and C, and then B and C
    similarity_scores = {
        "A_B": similarity_matrix[0, 1],
        #"A_C": similarity_matrix[0, 2],
        #"B_C": similarity_matrix[1, 2]
    }

    return similarity_scores

'\nif __name__ == "__main__":\n    text_a = "This is an example sentence for text A."\n    text_b = "Text B\'s sentence is slightly different from text A."\n    text_c = "Text C is here with another unique sentence."\n\n    similarities = compare_text_similarity(text_a, text_b, text_c)\n    print("Similarity Scores:")\n    for pair, score in similarities.items():\n        print(f"{pair}: {score:.4f}")\n'

In [458]:
similarities = compare_text_similarity(text_a=textA, text_b=textB, text_c=textC)
#similarities
print("LLM Response Similarity Scores:")
for pair, score in similarities.items():
    print(f"{pair}: {score:.4f}")

LLM Response Similarity Scores:
A_B: 0.5716
A_C: 0.5783
B_C: 0.4375


In [472]:
similaritiesOM = compare_text_similarity(text_a=original_messageA, text_b=original_messageB, text_c=original_messageC)
#similaritiesOM
print("System Prompt/Context Similarity 2  System Prompt/Context Similarity Scores:")
for pair, score in similaritiesOM.items():
    print(f"{pair}: {score:.4f}")

System Prompt/Context Similarity 2  System Prompt/Context Similarity Scores:
A_B: 1.0000
A_C: 0.3726
B_C: 0.3726


In [471]:
similaritiesOM_LLM_A =compare_text_similarity_response2context(om=original_messageA, model_response=textA)
similaritiesOM_LLM_B =compare_text_similarity_response2context(om=original_messageB, model_response=textB)
similaritiesOM_LLM_C =compare_text_similarity_response2context(om=original_messageC, model_response=textC)

print("System Prompt/Context to LLM Response \n Similarity Scores \n Model 1:")
for pair, score in similaritiesOM_LLM_A.items():
    print(f"{pair}: {score:.4f}")
print("\n")
print("System Prompt/Context to LLM Response\n Similarity Scores \n Model 2:")
for pair, score in similaritiesOM_LLM_B.items():
    print(f"{pair}: {score:.4f}")
print("\n")
print("System Prompt/Context to LLM Response\n Similarity Scores \n Model 3:")
for pair, score in similaritiesOM_LLM_C.items():
    print(f"{pair}: {score:.4f}")

System Prompt/Context to LLM Response 
 Similarity Scores 
 Model 1:
A_B: 0.8864


System Prompt/Context to LLM Response
 Similarity Scores 
 Model 2:
A_B: 0.6651


System Prompt/Context to LLM Response
 Similarity Scores 
 Model 3:
A_B: 0.4878


## **Misc.**

### **Demonstration of RAG software packages**

In the previous section, we build a FAISS vector store from scratch. In practice, there are already-built software packages we can use to build RAG systems.

A few popular software packages for RAG include:
- langchain
- llama-index

Below, these software tools are demonstrated.

In [488]:
#!pip install langchainhub
from langchain_openai import OpenAI
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [489]:
loader = PyPDFLoader("/Users/skingsle/94844_A4_Generative_AI_Lab/data/Kingsley_CV_february_2024_sei.pdf")
pages = loader.load_and_split()

In [490]:

embeddings_model = langchain_community.embeddings.openai.OpenAIEmbeddings(openai_api_key=openai_key)

vectorstore = FAISS.from_documents(pages, embeddings_model) 

In [491]:
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)


In [492]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [494]:
rag_chain.invoke("Summarize the resume")

"Sara Kingsley's resume includes teaching experience at Carnegie Mellon University, professional service as a reviewer for various conferences, honors and awards, hobbies, peer-reviewed workshop papers, policy papers, working papers, invited talks and presentations, academic appointments, expertise and focus, education, industry and government experience, skills, and peer-reviewed conference and journal publications. She is a researcher with expertise in building machine learning tools, has worked for Microsoft and the U.S. Department of Labor, and is pursuing a PhD in Human-Computer Interaction at Carnegie Mellon University."