In [1]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import OpenAI

In [2]:
chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))
openai_client = OpenAI()

**Create a collection for data injection**

Throws an error if the table already exists

In [3]:
collection = chroma_client.get_or_create_collection(name="titanic")

In [4]:
file_dir = here("data/csv/titanic.csv")
df = pd.read_csv(file_dir, nrows=100)

In [5]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
95,0,1,Mr. George B Goldschmidt,male,71.0,0,0,34.6542
96,1,1,Mr. William Bertram Greenfield,male,23.0,0,1,63.3583
97,1,2,Mrs. John T (Ada Julia Bone) Doling,female,34.0,0,1,23.0000
98,0,2,Mr. Sinai Kantor,male,34.0,1,0,26.0000


NOTE: Process in chunks if dataset is big.

In [6]:
docs = []
metadatas = []
ids = []
embeddings = []
for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    response = openai_client.embeddings.create(
        input = output_str,
        model= "text-embedding-ada-002"
    )
    embeddings.append(response.data[0].embedding)
    docs.append(output_str)
    metadatas.append({"source": "titanic"})
    ids.append(f"id{index}")

In [9]:
docs

['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22.0,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38.0,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
 'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26.0,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35.0,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35.0,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. James Moran,\nSex: male,\nAge: 27.0,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.4583,\n',
 'Su

In [11]:
print(metadatas)
print(ids)
print(len(embeddings))
print(len(embeddings[0]))

[{'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source': 'titanic'}, {'source':

In [12]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

Verify the vectorDB creation

In [13]:
print("Number of vectors in vectordb:", collection.count())

Number of vectors in vectordb: 100


### RAG

In [14]:
from openai import OpenAI

In [15]:
model_name = "gpt-3.5-turbo"
openai_client = OpenAI()

**Perform similarity search**

In [48]:
query_texts = "what's the average age of survivors"
response = openai_client.embeddings.create(
        input = query_texts,
        model= "text-embedding-ada-002",
    )
query_embeddings = response.data[0].embedding

**Load the chromaDB collection for vector search**

In [128]:
vectordb = chroma_client.get_collection(name="titanic")
vectordb.count()

100

In [135]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=1 #top_k
)
results

{'ids': [['id13']],
 'distances': [[0.4677661061286926]],
 'metadatas': [[{'source': 'titanic'}]],
 'embeddings': None,
 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. Anders Johan Andersson,\nSex: male,\nAge: 39.0,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 5,\nFare: 31.275,\n']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

Pass the results to an LLM

In [136]:
system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content": str(
        system_role
        )},
    {"role": "user", "content": prompt}
]

In [137]:
response = openai_client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=messages,
    temperature=0.0
)
response.choices[0].message.content

'The average age of survivors is not provided in the search results. Would you like me to try searching for the average age of survivors in a different database or provide you with more information on this topic?'

In [140]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=5 #top_k
)
results

{'ids': [['id13', 'id79', 'id71', 'id56', 'id80']],
 'distances': [[0.4677661061286926,
   0.4689880311489105,
   0.46990108489990234,
   0.4712609350681305,
   0.4714908003807068]],
 'metadatas': [[{'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'}]],
 'embeddings': None,
 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. Anders Johan Andersson,\nSex: male,\nAge: 39.0,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 5,\nFare: 31.275,\n',
   'Survived: 0,\nPclass: 3,\nName: Mr. Achille Waelens,\nSex: male,\nAge: 22.0,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 9.0,\n',
   'Survived: 0,\nPclass: 2,\nName: Mr. Ambrose Jr Hood,\nSex: male,\nAge: 21.0,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 73.5,\n',
   'Survived: 0,\nPclass: 3,\nName: Mr. Mansouer Novel,\nSex: male,\nAge: 28.5,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.2292,\n',
   'S

In [141]:
response = openai_client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=messages,
    temperature=0.0
)
response.choices[0].message.content

'The average age of survivors is not provided in the search results. Would you like me to try to find more information on the average age of survivors for you?'

In [144]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=10 #top_k
)
results

{'ids': [['id13',
   'id79',
   'id71',
   'id56',
   'id80',
   'id82',
   'id74',
   'id10',
   'id33',
   'id23']],
 'distances': [[0.4677661061286926,
   0.4689880311489105,
   0.46990108489990234,
   0.4712609350681305,
   0.4714908003807068,
   0.4721372723579407,
   0.4728683829307556,
   0.473883718252182,
   0.4751106798648834,
   0.4757847785949707]],
 'metadatas': [[{'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'},
   {'source': 'titanic'}]],
 'embeddings': None,
 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. Anders Johan Andersson,\nSex: male,\nAge: 39.0,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 5,\nFare: 31.275,\n',
   'Survived: 0,\nPclass: 3,\nName: Mr. Achille Waelens,\nSex: male,\nAge: 22.0,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 9.0,\n',
   'Sur

In [145]:
response = openai_client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=messages,
    temperature=0.0
)
response.choices[0].message.content

'Based on the search results, the average age of survivors is not explicitly provided. The specific information given is about a male passenger named Mr. Anders Johan Andersson who did not survive. To determine the average age of survivors, we would need more data on the ages of all survivors from the Titanic dataset.'