# Vector Store

We start with the imports and a few constants.

In [9]:
# Directory that holds the dataset files. The path is relative, so it is
# resolved against the kernel's current working directory.
DATASET_DIR = "../../datasets/bioasq/"

In [10]:
import os

# CSV file that CSVLoader reads the source documents from.
DATASET_PATH = os.path.join(DATASET_DIR, "data.csv")
# Directory handed to Chroma as `persist_directory` for the vector store.
DB_PATH = os.path.join(DATASET_DIR, "vector_db")

In [11]:
from langchain.chains import LLMChain, RetrievalQA, HypotheticalDocumentEmbedder
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings, OpenAI, ChatOpenAI
from langchain_chroma import Chroma

In [12]:
import getpass
import os

# Prompt for the key only when it is not already exported. Passing
# getpass.getpass() as the default argument of os.environ.get() would evaluate
# it eagerly, i.e. ask for input on every run and then throw the answer away
# whenever the variable is already set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

 ········


Documents are loaded, split into chunks, and encoded into dense embeddings.

In [13]:
# OpenAI embedding model; this object is passed to Chroma when the vector
# store is built, so every chunk is encoded with it.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [14]:
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Data loader: read the CSV into LangChain Documents. The sample output of
# this cell shows `source` and `row` recorded in each document's metadata.
loader = CSVLoader(file_path=DATASET_PATH, encoding='utf-8')
docs = loader.load()

# Data transformer: break long documents into overlapping chunks so each piece
# stays within the configured size before it is embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    add_start_index=True,  # adds `start_index` (offset in the source text) to metadata
    separators=["\n\n", "\n", ".", " ", ""],
)

# split_documents() takes the whole list and splits every document
# independently, so a manual per-document loop with `+=` is unnecessary.
docs_processed = text_splitter.split_documents(docs)

# Show the first chunk as a quick sanity check.
docs_processed[0]

Document(metadata={'source': '../../datasets/bioasq/data.csv', 'row': 0, 'start_index': 0}, page_content='passage: 1. The temperature function of the myeloma IgG(K) IVA, Bence-Jones protein \n(K-type) IVA and its fragments (Fab(t), Fc\'(t), VL and CL) was studied by \nthermal perturbation difference spectroscopy and circular dichroism. 2. The IgG \nand Bence-Jones protein studied were found to be capable of a fully reversible \nstructural changes at temperatures between 25 and 35 degrees C. The changes \noccurring at the higher temperature are accompanied by the screening of the \nsignificant part of exposed tyrosine residues. The transition is not accompanied \nby an appreciable change in the main IgG secondary structure-beta-pleated sheet, \naccording to the CD data. 3. It was found that the temperature-dependent changes \nof IgG occur in its Fab fragments, the changes of Bence-Jones protein occur in \nits variable part (VL domains). 4. The temperature changes in the interval 25-35 \

In [15]:
# Cost/compute intensive

%time vector_db = Chroma.from_documents(docs_processed, embeddings, persist_directory=DB_PATH)

CPU times: user 13.4 s, sys: 417 ms, total: 13.8 s
Wall time: 22.6 s
