# Multi vector retrieval (and inverse HyDE)

### Imports

In [135]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever

from langchain.storage import InMemoryByteStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough

### Config

In [136]:
# OpenAI chat model identifier referenced when constructing ChatOpenAI below.
model_name = "gpt-3.5-turbo-0125"

### Load Texts and split them into chunks

In [153]:
# One loader per lyrics file; every file is read as UTF-8.
loaders = [
    TextLoader(path, encoding='utf-8')
    for path in (
        "./data/lyrics/anti_hero.txt",
        "./data/lyrics/bejewled.txt",
        "./data/lyrics/lavender_haze.txt",
        "./data/lyrics/maroon.txt",
        "./data/lyrics/snow_on_the_beach.txt",
    )
]

# Load each file in order and flatten the per-loader results into one list.
docs = [doc for loader in loaders for doc in loader.load()]

# Re-split the loaded documents with chunk_size=3000; `docs` is rebound to
# the resulting chunks, which is what every later cell works with.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000)
docs = text_splitter.split_documents(docs)

### Build chain for summarization and summarize texts

In [154]:
# Summarization chain: Document -> prompt -> chat model -> plain string.
# The chat model is pinned to `model_name` so that this chain uses the same
# configured model as the other chains instead of the library default.
# max_retries=0 means a failed API call surfaces immediately.
summery_chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document. Do not include the title. Do not mention the Document.\n\n{doc}")
    | ChatOpenAI(max_retries=0, model=model_name)
    | StrOutputParser()
)

# Summarize all chunks, running at most 5 requests concurrently.
# `summaries[i]` corresponds to `docs[i]`.
summaries = summery_chain.batch(docs, {"max_concurrency": 5})

for position, summary in enumerate(summaries, start=1):
    print(f"Document {position}:\n{summary}")

Document 1:
The document describes feelings of inadequacy, self-awareness of one's faults, and a sense of being an outsider. The speaker struggles with depression, narcissism, and the fear of being abandoned. The lyrics also touch on themes of family dynamics and the fear of being seen as a villain. The overall tone is introspective and self-critical.
Document 2:
The lyrics of the song talk about a girl who feels like she has been too kind and taken for granted in a relationship. She wants to reclaim her power and shine like a diamond. The chorus emphasizes her confidence and ability to make a room shimmer. The bridge mentions feeling sad but being told her aura is like moonstone, symbolizing hope. The song ends with the girl asserting her worth and determination to shine despite challenges.
Document 3:
The lyrics of the song describe feelings of being misunderstood and under scrutiny, with a desire to stay in a peaceful and dreamy state represented by the "lavender haze." The singer r

### Build chain for hypothetical questions

In [156]:
# OpenAI function-calling schema: a single function whose only (required)
# argument, "questions", is an array of strings.
functions = [
    dict(
        name="hypothetical_questions",
        description="Generate hypothetical questions",
        parameters=dict(
            type="object",
            properties=dict(
                questions=dict(
                    type="array",
                    items=dict(type="string"),
                ),
            ),
            required=["questions"],
        ),
    )
]

# Hypothetical-question chain: Document -> prompt -> chat model forced to
# call the `hypothetical_questions` function -> list of question strings.
#
# The prompt is assembled from adjacent string literals so that no source
# indentation leaks into the text sent to the model, and the misspelled
# instructions ("guidence", "lysics") are corrected.
chain = (
    {"doc": lambda x: x.page_content}
    | ChatPromptTemplate.from_template(
        "Generate a list of exactly 3 hypothetical questions that a person "
        "who seeks emotional guidance would ask and that could be answered "
        "by this song's lyrics and/or meaning.\n"
        "Do not mention the song or the lyrics in these questions.\n"
        "Do not add any counter to these questions.\n\n"
        "{doc}"
    )
    | ChatOpenAI(max_retries=0, model=model_name).bind(
        functions=functions, function_call={"name": "hypothetical_questions"}
    )
    # Extract the "questions" array from the function-call arguments.
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

# Generate questions for all chunks, at most 5 requests at a time.
# `hypothetical_questions[i]` is the list of questions for `docs[i]`.
hypothetical_questions = chain.batch(docs, {"max_concurrency": 5})

for position, questions in enumerate(hypothetical_questions, start=1):
    print(f"Document {position}:\n{questions}")


Document 1:
['What does it feel like to grapple with self-destructive tendencies while seeking approval from others?', 'How does it impact relationships when one struggles with their own identity and self-worth?', "Can you relate to the internal conflict of being perceived as the 'anti-hero' in your own story?"]
Document 2:
['How can I maintain my sense of self-worth and confidence when I feel like others are taking advantage of me?', 'What steps can I take to set boundaries and prioritize my own happiness in relationships?', 'How can I embrace my own uniqueness and shine bright without seeking validation from others?']
Document 3:
['What do you do when you feel like people are constantly scrutinizing you and your choices?', 'Have you ever felt pressured to fit into societal expectations regarding relationships and marriage?', 'How do you navigate the feeling of wanting to escape from judgment and criticism in order to find peace and clarity in your own thoughts?']
Document 4:
["What e

### Create collection and init retriever

The retriever is empty at the start. Its vector store is then filled with the chunks of the texts, which it indexes. At query time, the retriever first finds the indexed entries that are relevant to the query and then uses their ids to retrieve the original texts.

In [157]:
# Vector store holding the embedded entries that are searched at query time.
db = Chroma(collection_name="summaries_v2", embedding_function=OpenAIEmbeddings())

# Storage layer for the parent documents, plus the metadata key under which
# each embedded entry records the id of its parent.
store = InMemoryByteStore()
id_key = "doc_id"

retriever = MultiVectorRetriever(vectorstore=db, byte_store=store, id_key=id_key)

# One random id per chunk; doc_ids[i] identifies docs[i].
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Wrap every summary in a Document that points back to its parent chunk.
summary_docs = [
    Document(page_content=summary, metadata={id_key: parent_id})
    for parent_id, summary in zip(doc_ids, summaries)
]

### Add documents and summaries to the retriever

In [158]:
# Index the summaries for similarity search ...
retriever.vectorstore.add_documents(summary_docs)
# ... and store each parent chunk under its id as (id, document) pairs.
retriever.docstore.mset([*zip(doc_ids, docs)])

### Add Questions to the retriever

In [159]:
# Flatten the per-chunk question lists into one list of Documents, each
# tagged with the id of the chunk it was generated from.
question_docs = [
    Document(page_content=question, metadata={id_key: parent_id})
    for parent_id, questions in zip(doc_ids, hypothetical_questions)
    for question in questions
]
retriever.vectorstore.add_documents(question_docs)

['8705074d-e315-11ee-b307-c89402f94a42',
 '8705074e-e315-11ee-b252-c89402f94a42',
 '8705074f-e315-11ee-89b1-c89402f94a42',
 '87050750-e315-11ee-9d32-c89402f94a42',
 '87050751-e315-11ee-97a0-c89402f94a42',
 '87050752-e315-11ee-98b0-c89402f94a42',
 '87050753-e315-11ee-98f1-c89402f94a42',
 '87050754-e315-11ee-ba22-c89402f94a42',
 '87050755-e315-11ee-9731-c89402f94a42',
 '87050756-e315-11ee-a3c2-c89402f94a42',
 '87050757-e315-11ee-8174-c89402f94a42',
 '87050758-e315-11ee-a3a6-c89402f94a42',
 '87050759-e315-11ee-b98f-c89402f94a42',
 '8705075a-e315-11ee-81a9-c89402f94a42',
 '8705075b-e315-11ee-b888-c89402f94a42']

### Add original documents to the retriever

First, add the id of each full document as metadata to its chunk, because the chunks themselves are embedded as well and must point back to their parent document.

In [160]:
# Tag each chunk with its own parent id so that a hit on the chunk's
# embedding carries the id under `id_key`.
for chunk, parent_id in zip(docs, doc_ids):
    chunk.metadata[id_key] = parent_id

retriever.vectorstore.add_documents(docs)

['88fcf9cc-e315-11ee-b9db-c89402f94a42',
 '88fcf9cd-e315-11ee-8c9e-c89402f94a42',
 '88fcf9ce-e315-11ee-9d98-c89402f94a42',
 '88fcf9cf-e315-11ee-ac33-c89402f94a42',
 '88fcf9d0-e315-11ee-8db3-c89402f94a42']

## Tests

In [217]:
# Test query used by the cells below. Exactly one assignment is active; the
# others are alternative inputs kept commented out so they can be swapped in.
# NOTE(review): the trailing comment appears to name the song each query is
# expected to retrieve; the last two have no expectation noted — confirm.
query = "Song about importance of self-worth and independence in a relationship." # bejewled
# query = "What can i do to make things right?" # bejewled
# query = "I am the one at fault." # anti hero
# query = "Everybody expects too mutch of me. I'm tired of it. I need to be free. What should I do?" # bejewled
# query = "One day we are dancing and being happy, the next day we are fighting and crying. What is wrong with us?" # maroon
# query = "I feel like my mind is hazy. I can't think straight. What should I do?" # lavender haze
# query = "Someone splashed wine on my t-shirt. Should i confront this person?" # maroon
# query = "Can i get free tickets to the concert?"
# query = "I unexpectedly found a beatiful stone on the beach. Shoud I keep it?"


#### Direct Query

In [218]:
# Search the vector store directly: this returns the embedded entries
# themselves, not the parent documents.
sub_docs = db.similarity_search(query)

top_hit = sub_docs[0]
print(top_hit.page_content)

What does it feel like when unexpected beauty enters your life?


In [219]:
# Run the same query through the multi-vector retriever and show the text of
# the first document it returns.
retrieved_docs = retriever.invoke(query)

first_parent = retrieved_docs[0]
print(first_parent.page_content)

Title: Snow on the beach
[Verse 1]
One night, a few moons ago
I saw flecks of what could've been lights
But it might just have been you
Passing by unbeknownst to me
Life is emotionally abusive
And time can't stop me quite like you did
And my flight was awful, thanks for asking
I'm unglued, thanks to you

[Chorus]
And it's like snow at the beach
Weird, but fuckin' beautiful
Flying in a dream
Stars by the pocketful
You wanting me
Tonight feels impossible
But it's comin' down
No sound, it's all around

[Post-Chorus]
Like snow on the beach
Like snow on the beach
Like snow on the beach
Like snow, ah

[Verse 2]
This scene feels like what I once saw on a screen
I searched "aurora borealis green"
I've never seen someone lit from within
Blurring out my periphery
My smile is like I won a contest
And to hide that would be so dishonest
And it's fine to fake it 'til you make it
'Til you do, 'til it's true

[Chorus]
Now it's like snow at the beach
Weird, but fuckin' beautiful
Flying in a dream
Stars

### RAG

In [197]:
from langchain.globals import set_debug

# Explicitly switch LangChain's global debug flag off for the RAG run below;
# pass True instead to turn it on while troubleshooting.
set_debug(False)

In [220]:
def _format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's {context} slot receives the raw list of
    Document objects and is filled with their Python repr (metadata, escaped
    quotes) rather than the lyrics themselves.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Persona prompt: {context} receives the retrieved lyrics, {question} the
# user's query.
template = """You are Taylor Swift. 
A person, who seeks emotional guidance asks you for help. 
Tell this person exactly what he or she needs to do to resolve his/her issues. 
Do mention your song's title and that listening to it will help the person.
Do mention explicitly the part of the song that will help the person.
Answer the Question only using the context you are provided with.:

{context}

[Question]: 
{question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(model=model_name)

# RAG chain: the incoming query is sent to the retriever (whose results are
# flattened to plain text) and also passed through unchanged as the question.
# NOTE: this rebinds `chain`, replacing the hypothetical-question chain.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke(query)

'Listen to my song "Snow on the beach" to find the emotional guidance you seek. The part of the song that will help you with your decision is: "And it\\\'s like snow at the beach, Weird, but fuckin\\\' beautiful." This will help you reflect on the beauty and uniqueness of the stone you found and decide if you want to keep it as a special memory.'