In [1]:
import os
from pyprojroot import here
import pandas as pd
import chromadb
import warnings
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
print(load_dotenv())

False


In [2]:
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [3]:
# Load the LLM
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    openai_api_base= os.getenv("OPENAI_API_BASE"),
    openai_api_key= os.getenv("OPENAI_API_KEY"),
    # tream=True,
    temperature=0)

In [4]:
chroma_client = chromadb.PersistentClient(path=str(here("data/chroma")))

**Create a collection for data injection**

Throws an error if the table already exists

In [5]:
# collection = chroma_client.create_collection(name="titanic_small")

# 列出所有集合的名称
existing_collections = chroma_client.list_collections()

collection_name = "titanic_small"

# 获取所有集合
existing_collections = chroma_client.list_collections()

# 提取集合名称
existing_collection_names = [collection.name for collection in existing_collections]

if collection_name in existing_collection_names:
    # 如果集合存在，获取它
    collection = chroma_client.get_collection(name=collection_name)
    print(f"Retrieved existing collection: {collection_name}")
else:
    # 如果集合不存在，创建它
    collection = chroma_client.create_collection(name=collection_name)
    print(f"Created new collection: {collection_name}")


Created new collection: titanic_small


In [6]:
file_dir = here("data/for_upload/titanic_small.csv")
df = pd.read_csv(file_dir, nrows=5)

In [7]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05


NOTE: Process in chunks if dataset is big.

In [8]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI

# 设置 OpenAI API 密钥
import os
# os.environ["OPENAI_API_KEY"] = openai_api_key  # 如果你已经设置了环境变量，则不需要这行

# 创建 OpenAIEmbeddings 实例
OpenAIEmbeddings = OpenAIEmbeddings()

# 输入文本
# text = "Your text string goes here"

# # 获取文本的嵌入
# embedding = OpenAIEmbeddings.embed_documents(text)[0]

# # 输出嵌入
# print(embedding)


  OpenAIEmbeddings = OpenAIEmbeddings()


In [9]:
docs = []
metadatas = []
ids = []
embeddings = []
for index, row in df.iterrows():
    output_str = ""
    # Treat each row as a separate chunk
    for col in df.columns:
        output_str += f"{col}: {row[col]},\n"
    response = OpenAIEmbeddings.embed_documents(output_str)[0]
    embeddings.append(response)
    docs.append(output_str)
    metadatas.append({"source": "titanic_small"})
    ids.append(f"id{index}")

In [10]:
docs

['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
 'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
 'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n',
 'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n']

In [11]:
print(metadatas)
print(ids)

[{'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}, {'source': 'titanic_small'}]
['id0', 'id1', 'id2', 'id3', 'id4']


In [12]:
embeddings[0][:10]

[0.00481596199813357,
 -0.008011599724067266,
 0.019405304710828745,
 0.020151167407290473,
 -0.02875431340786187,
 0.009464746943323867,
 -0.02289028572418294,
 -0.022427336464310145,
 -0.007825134049951834,
 -0.01372131191911114]

In [20]:
collection.add(
    documents=docs,
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids
)

Verify the vectorDB creation

In [21]:
print("Number of vectors in vectordb:", collection.count())

Number of vectors in vectordb: 5


### RAG

**Perform similarity search**

In [27]:
query_texts = "what's the average age of survivors"
response = OpenAIEmbeddings.embed_documents(query_texts)[0]
query_embeddings = response

**Load the chromaDB collection for vector search**

In [28]:
vectordb = chroma_client.get_collection(name="titanic_small")
vectordb.count()

5

In [29]:
results = vectordb.query(
    query_embeddings = query_embeddings,
    n_results=5 #top_k
)

results

{'ids': [['id0', 'id1', 'id2', 'id4', 'id3']],
 'embeddings': None,
 'documents': [['Survived: 0,\nPclass: 3,\nName: Mr. Owen Harris Braund,\nSex: male,\nAge: 22,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 7.25,\n',
   'Survived: 1,\nPclass: 1,\nName: Mrs. John Bradley (Florence Briggs Thayer) Cumings,\nSex: female,\nAge: 38,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 71.2833,\n',
   'Survived: 1,\nPclass: 3,\nName: Miss. Laina Heikkinen,\nSex: female,\nAge: 26,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 7.925,\n',
   'Survived: 0,\nPclass: 3,\nName: Mr. William Henry Allen,\nSex: male,\nAge: 35,\nSiblings/Spouses Aboard: 0,\nParents/Children Aboard: 0,\nFare: 8.05,\n',
   'Survived: 1,\nPclass: 1,\nName: Mrs. Jacques Heath (Lily May Peel) Futrelle,\nSex: female,\nAge: 35,\nSiblings/Spouses Aboard: 1,\nParents/Children Aboard: 0,\nFare: 53.1,\n']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'titanic_small'}

Pass the results to an LLM

In [30]:
from langchain_openai import ChatOpenAI

chat = ChatOpenAI(
    openai_api_base=os.environ["OPENAI_API_BASE"],
    openai_api_key=os.environ["OPENAI_API_KEY"],
    # tream=True,
    temperature=0)

system_role = "You will recieve the user's question along with the search results of that question over a database. Give the user the proper answer."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    {"role": "system", "content":system_role},
    {"role": "user", "content": prompt}
]

# Call the model with the messages
response = chat(messages)

# Print the response
print(response.content)


The average age of survivors is 31.2 years old.


**Fact check**

In [19]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35,0,0,8.05
