# Multi vector retrieval (and inverse HyDE)

### Imports

In [3]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever

from langchain.storage import InMemoryByteStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough

### Config

In [4]:
# OpenAI chat model identifier handed to ChatOpenAI by the chains in this notebook.
model_name = "gpt-3.5-turbo-0125"

### Load Texts and split them into chunks

In [5]:
# Lyrics files to index, one song per file.
lyric_paths = (
    "./data/lyrics/anti_hero.txt",
    "./data/lyrics/bejewled.txt",
    "./data/lyrics/lavender_haze.txt",
    "./data/lyrics/maroon.txt",
    "./data/lyrics/snow_on_the_beach.txt",
)
loaders = [TextLoader(path, encoding='utf-8') for path in lyric_paths]

# Split the loaded texts into chunks of at most 3000 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000)
docs = text_splitter.split_documents(
    [page for loader in loaders for page in loader.load()]
)

### Build chain for summarization and summarize texts

In [6]:
# Summarization chain: Document -> prompt -> chat model -> plain string.
summery_chain = (
    # Only the raw text of each Document is fed into the prompt.
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document. Do not include the title. Do not mention the Document.\n\n{doc}")
    # Use the model pinned in the config cell, like the other chains in this
    # notebook, instead of silently falling back to the library default.
    | ChatOpenAI(max_retries=0, model=model_name)
    | StrOutputParser()
)

# One summary per chunk; max_concurrency caps the number of parallel calls.
summaries = summery_chain.batch(docs, {"max_concurrency": 5})

for number, summary in enumerate(summaries, start=1):
    print(f"Document {number}:\n{summary}")

Document 1:
The lyrics of the song describe a person who feels like they are the problem and struggles with issues like depression, narcissism, and feeling like an outsider. They express a sense of self-awareness and acknowledgement of their flaws, and the chorus emphasizes the idea of always being the anti-hero that others root for. The song touches on themes of self-reflection, inner turmoil, and the complexities of human nature.
Document 2:
The document is a song about a person realizing they have been too kind and taken advantage of in a relationship. The lyrics describe reclaiming their power and self-worth, shining brightly like jewels in a room. The person expresses their desire for the best in a relationship and emphasizes their worth and ability to shine. The song ends with a message of empowerment and self-confidence.
Document 3:
The lyrics of Lavender Haze describe a desire to escape scrutiny and societal expectations by staying in a dreamy, surreal state of mind represented

### Build chain for hypothetical questions

In [7]:
# JSON schema of the function arguments: an object with one required
# "questions" field holding an array of strings.
_questions_parameters = {
    "type": "object",
    "properties": {
        "questions": {"type": "array", "items": {"type": "string"}},
    },
    "required": ["questions"],
}

# Function specifications bound to the chat model when generating questions.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=_questions_parameters,
    )
]

# Question-generation chain: Document -> prompt -> chat model forced to call
# the "hypothetical_questions" function -> list of question strings.
chain = (
    # Only the raw text of each Document is fed into the prompt.
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template(
        # Spelling of the instructions fixed ("guidance", "lyrics", "and/or")
        # so the model is not asked to interpret misspelled key terms.
        """Generate a list of exactly 3 hypothetical questions that a person,
        who seeks emotional guidance would ask that could be answered by this song's lyrics and/or meaning.
        Do not mention the song or the lyrics in these questions.
        Do not add any counter to these questions.:\n\n{doc}"""
    )
    | ChatOpenAI(max_retries=0, model=model_name).bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    # Extract the "questions" argument from the function call.
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

# For a quick single-document check, run: chain.invoke(docs[4])

# One list of questions per chunk; max_concurrency caps the parallel calls.
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

for number, questions in enumerate(hypothetical_questions, start=1):
    print(f"Document {number}:\n{questions}")


Document 1:
['What are the internal struggles that come with feeling like an outsider among others?', 'How does one navigate the complexity of self-awareness and self-destruction?', "In what ways can one address the conflict between being perceived as a 'problem' and seeking emotional growth?"]
Document 2:
['How can I maintain my sense of self-worth and confidence in a relationship where I feel taken for granted?', "What steps can I take to prioritize my own happiness and fulfillment without depending on someone else's validation?", 'How can I gracefully navigate a situation where I feel undervalued and overlooked, while still maintaining my inner strength and sparkle?']
Document 3:
['What emotions arise when faced with scrutiny and expectations from others?', 'How do you navigate between societal norms and your own desires in relationships?', 'In what ways can love provide a sanctuary from external pressures and judgment?']
Document 4:
['What memories do you hold on to that leave a sc

### Create collection and init retriever

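The retriever is empty at the start. In the following cells its vector store is filled with three kinds of entries: summaries, hypothetical questions, and the text chunks themselves. Each entry carries the ID of the chunk it was derived from. At query time the vector store finds the entries that are most relevant to the query, and their IDs are then used to fetch the original texts from the document store.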

In [8]:
# Metadata key under which each vector-store entry records its parent's ID.
id_key = "doc_id"

# One random UUID per chunk in `docs`.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Vector store for the embedded entries.
db = Chroma(collection_name="summaries_v3", embedding_function=OpenAIEmbeddings())

# The storage layer for the parent documents
store = InMemoryByteStore()

retriever = MultiVectorRetriever(vectorstore=db, byte_store=store, id_key=id_key)

# create documents for the summaries, each tagged with its parent's ID
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

### Add documents and summaries to the retriever

In [9]:
# Embed the summaries into the vector store.
retriever.vectorstore.add_documents(summary_docs)

# Store every chunk in the docstore under its generated ID.
id_doc_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(id_doc_pairs)

### Add Questions to the retriever

In [10]:
# One Document per generated question, tagged with the ID of the chunk the
# question was generated from.
question_docs = [
    Document(page_content=question, metadata={id_key: doc_id})
    for doc_id, question_list in zip(doc_ids, hypothetical_questions)
    for question in question_list
]
retriever.vectorstore.add_documents(question_docs)

['61a63086-e5ee-11ee-be1b-c89402f94a42',
 '61a63087-e5ee-11ee-9f47-c89402f94a42',
 '61a63088-e5ee-11ee-a2a6-c89402f94a42',
 '61a63089-e5ee-11ee-ba82-c89402f94a42',
 '61a6308a-e5ee-11ee-a462-c89402f94a42',
 '61a6308b-e5ee-11ee-83ac-c89402f94a42',
 '61a6308c-e5ee-11ee-a0ec-c89402f94a42',
 '61a6308d-e5ee-11ee-a495-c89402f94a42',
 '61a6308e-e5ee-11ee-88b7-c89402f94a42',
 '61a6308f-e5ee-11ee-aba8-c89402f94a42',
 '61a63090-e5ee-11ee-8300-c89402f94a42',
 '61a63091-e5ee-11ee-93d6-c89402f94a42',
 '61a63092-e5ee-11ee-b773-c89402f94a42',
 '61a63093-e5ee-11ee-9a56-c89402f94a42',
 '61a63094-e5ee-11ee-b51e-c89402f94a42']

### Add original documents to the retriever

First, add each chunk's document ID to its metadata. The chunks are embedded too, so they need the ID to be mapped back to their entry in the document store.

In [11]:
# Tag every chunk with its own ID before embedding it.
for doc_id, doc in zip(doc_ids, docs):
    doc.metadata[id_key] = doc_id

retriever.vectorstore.add_documents(docs)

['6359540e-e5ee-11ee-b631-c89402f94a42',
 '63595648-e5ee-11ee-90c4-c89402f94a42',
 '63595649-e5ee-11ee-9023-c89402f94a42',
 '6359564a-e5ee-11ee-8da6-c89402f94a42',
 '6359564b-e5ee-11ee-9a2d-c89402f94a42']

## Tests

In [34]:
# Active test query. Uncomment one of the alternatives below to try another;
# the trailing comment on a query appears to name the song it should match.
query = (
    "Song about importance of self-worth and independence "
    "in a relationship."
)  # bejewled
# query = "What can i do to make things right?" # bejewled
# query = "I am the one at fault." # anti hero
# query = "Everybody expects too mutch of me. I'm tired of it. I need to be free. What should I do?" # bejewled
# query = "One day we are dancing and being happy, the next day we are fighting and crying. What is wrong with us?" # maroon
# query = "I feel like my mind is hazy. I can't think straight. What should I do?" # lavender haze
# query = "Someone splashed wine on my t-shirt. Should i confront this person?" # maroon
# query = "Can i get free tickets to the concert?"
# query = "I unexpectedly found a beatiful stone on the beach. Shoud I keep it?"


#### Direct Query

In [31]:
# Search the vector store directly and show the closest embedded entry.
sub_docs = db.similarity_search(query)

best_match = sub_docs[0]
print(best_match.page_content)

How can I maintain my sense of self-worth and confidence in a relationship where I feel taken for granted?


In [32]:
# Run the same query through the multi-vector retriever and show the first
# document it returns.
retrieved_docs = retriever.invoke(query)

first_result = retrieved_docs[0]
print(first_result.page_content)

Title: Bejewled
[Verse 1]
Baby love, I think I've been a little too kind
Didn't notice you walkin' all over my peace of mind
In the shoes I gave you as a present
Puttin' someone first only works when you're in their top five
And by the way, I'm goin' out tonight

[Chorus]
Best believe I'm still bejeweled
When I walk in the room
I can still make the whole place shimmer
And when I meet the band
They ask, "Do you have a man?"
I could still say, "I don't remember"
Familiarity breeds contempt
Don't put mе in the basement
Whеn I want the penthouse of your heart
Diamonds in my eyes
I polish up real, I polish up real nice

[Post-Chorus]
Nice

[Verse 2]
Baby boy, I think I've been too good of a girl (Too good of a girl)
Did all the extra credit, then got graded on a curve
I think it's time to teach some lessons
I made you my world (Huh), have you heard? (Huh)
I can reclaim the land
And I miss you (Miss you), but I miss sparklin' (Nice)

[Chorus]
Best believe I'm still bejeweled
When I walk in t

### RAG

In [197]:
from langchain.globals import set_debug

# Switch LangChain's global debug flag off; pass True instead to enable it
# while troubleshooting the chain below.
set_debug(False)

In [35]:
def format_docs(documents) -> str:
    """Join the text of the retrieved documents into one context string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string,
            such as the list of Documents returned by the retriever.

    Returns:
        The page contents separated by blank lines.
    """
    return "\n\n".join(document.page_content for document in documents)


# Prompt for the answer step; {context} receives the retrieved lyrics and
# {question} the user's query. Spelling of "guidance" fixed.
template = """You are Taylor Swift.
A person, who seeks emotional guidance asks you for help.
Tell this person exactly what he or she needs to do to resolve his/her issues.
Do mention your song's title and that listening to it will help the person.
Use a passage from the song to support your advice.
Answer the Question only using the context you are provided with.:

{context}

[Question]: 
{question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(model=model_name)

chain = (
    {
        # Without format_docs the prompt would receive the repr of a list of
        # Document objects (metadata, escaped newlines) instead of plain text.
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke(query)

'Based on the lyrics you provided, I recommend listening to my song "Bejeweled." In this song, I talk about reclaiming your worth and not putting yourself in a lower position in a relationship. The lyrics emphasize the importance of self-love and standing up for yourself. \n\nOne passage from the song that supports this advice is: \n"Familiarity breeds contempt\nDon\'t put me in the basement\nWhen I want the penthouse of your heart\nDiamonds in my eyes\nI polish up real, I polish up real nice"\n\nBy listening to "Bejeweled," you can find the strength to prioritize your own worth and not settle for less than you deserve in a relationship. Remember that you are precious and valuable, and don\'t be afraid to shine bright like a diamond.'