In [16]:
import os
import openai
import tiktoken

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

In [3]:
# Collect sample data
import requests

url = "https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt"
res = requests.get(url)
with open("state_of_the_union.txt", "w") as f:
    f.write(res.text)


In [5]:
#Load text from the file
from langchain.document_loaders import TextLoader

loader = TextLoader("state_of_the_union.txt")
documents = loader.load()

In [6]:
# chank the text into smaller chunks
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

### Save the chunks to vector database
I've used the [weaviate](https://weaviate.io/developers/weaviate/quickstart#step-1-create-a-weaviate-database) DB sandbox with 14 days free trial. 

In [14]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions


auth_config = weaviate.AuthApiKey(api_key=os.environ['WEAVIATE-API-KEY'])

client = weaviate.Client(
  url=os.environ['WEAVIATE-URL'],
  auth_client_secret=auth_config
)


### Vectorise and store data

As a vector database, Weaviate make use of vector representations, also known as "embeddings", of data objects. 

An embedding is a vector (list) of floating point numbers. The distance between two vectors measures their relatedness. Small distances suggest high relatedness and large distances suggest low relatedness. 

One way to generate these vectors is to use a "vectorizer" module, which will generate a vector at import and query time. This can be a convenient method if you are using a publicly available model and want to simplify your pipeline.

Alternatively, you can supply your own, "custom", vector embeddings at import time as well as for any vector-based queries. This is useful if you have a custom model, or if you want to use a model that is not available through a Weaviate module.

To generate the vector embeddings, you can use the [OpenAIEmbeddings() model](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings), and to store them, you can use the Weaviate vector database. Vectorisation can be done by multiple ways: [Vectorizers and Rerankers](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules)

Here is the set of [10 pre-vectorized Jeopardy questions](https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny+vectors.json) in JSON format.


In [39]:
embedding = OpenAIEmbeddings()

text = "This is a test text"

query_result = embedding.embed_query(text)

query_result

  warn_deprecated(
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


[-0.0031054428427034985,
 0.01113630133311102,
 -0.00402958255378551,
 -0.011749064064443116,
 -0.0010406979861090604,
 0.010789957383080384,
 -0.010357026514219556,
 -0.00529840194152607,
 -0.009917435757902776,
 -0.02614899430434389,
 0.0203410676659396,
 0.022592305203783792,
 -0.007519667657512143,
 0.017250609802842626,
 -0.00600774142287229,
 0.01916882502821316,
 0.021260212228599012,
 -0.01559881429009609,
 0.007639555876267167,
 -0.018356247978074332,
 -0.0007051769934785911,
 -0.006414029947941704,
 -0.010969789245551653,
 0.017969940978017844,
 -0.0221260721036756,
 -0.0030421683239367425,
 0.014373288827431895,
 -0.029972103117527984,
 0.018489456903063795,
 -0.007939276888815997,
 0.010050645614214775,
 -0.019421923103280184,
 -0.0036898989568720955,
 -0.02427074212899921,
 -0.005501546204060777,
 0.0038197779381335833,
 -0.005658067063452412,
 -0.029465906967393936,
 0.018542741590646623,
 -0.016624526365276088,
 0.007506346485616436,
 0.012461734420839848,
 -0.0009666003

In [17]:
vectorstore = Weaviate.from_documents(
    client = client,    
    documents = chunks,
    embedding = OpenAIEmbeddings(),
    by_text = False
)

  warn_deprecated(
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/


In [26]:
# Test search
vectorstore.search("dream", search_type="mmr", k=5)

[Document(page_content='We can do all this while keeping lit the torch of liberty that has led generations of immigrants to this land—my forefathers and so many of yours. \n\nProvide a pathway to citizenship for Dreamers, those on temporary status, farm workers, and essential workers. \n\nRevise our laws so businesses have the workers they need and families don’t wait decades to reunite. \n\nIt’s not only the right thing to do—it’s the economically smart thing to do.', metadata={'source': 'state_of_the_union.txt'}),
 Document(page_content='Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter. \n\nBut cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n\nDanielle says Heath was a fighter to the very end. \n\nHe didn’t know how to stop fighting, and neither did she. \n\nThrough her pain she found purpose to demand we do better. \n\nTonight, Danielle—we are.', metadata={'source'

### Define retriver component

define DB as the retriever component, which fetches the additional context based on the semantic similarity between the user query and the embedded chunks

In [27]:
retriever = vectorstore.as_retriever()

### Augmentation

To augment the prompt with the additional context, you need to prepare a prompt template

In [34]:
from langchain.prompts import ChatPromptTemplate

template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Provide the answer in JSON format and include the question and context in the response.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

### Generate an text

you can build a chain for the RAG pipeline, chaining together the retriever, the prompt template and the LLM. Once the RAG chain is defined, you can invoke it

In [35]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

query = "What did the president say about Ukraine"
rag_chain.invoke(query)

  warn_deprecated(
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
/Users/morosus/Library/Caches/pypoetry/virtualenvs/rag-tunning-FybJCK1R-py3.11/lib/python3.11/site-packages/langchain_community/chat_models/openai.py:456: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instea

'{"question": "What did the president say about Ukraine?", "answer": "President Zelenskyy said in his speech to the European Parliament \'Light will win over darkness.\'", "context": "[Document(page_content=\'Groups of citizens blocking tanks with their bodies. Everyone from students to retirees teachers turned soldiers defending their homeland. \\n\\nIn this struggle as President Zelenskyy said in his speech to the European Parliament “Light will win over darkness.” The Ukrainian Ambassador to the United States is here tonight. \\n\\nLet each of us here tonight in this Chamber send an unmistakable signal to Ukraine and to the world.\', metadata={\'source\': \'state_of_the_union.txt\'})]"}'