# Setup

In [2]:
# install dependencies
!pipenv install langchain
!pipenv install sentence_transformers
!pipenv install chromadb
!pipenv install unstructured

[1mLoading .env environment variables...[0m
[1;32mInstalling langchain[0m[1;33m...[0m
[?25lResolving langchain[33m...[0m
[2K✔ Installation Succeeded
[2K[32m⠋[0m Installing langchain...
[1A[2K[1;33mPipfile.lock [0m[1;33m([0m[1;33m21ba60[0m[1;33m)[0m[1;33m out of date, updating to [0m[1;33m([0m[1;33md3d6e2[0m[1;33m)[0m[1;33m...[0m
Locking[0m [33m[packages][0m dependencies...[0m
[?25lBuilding requirements[33m...[0m
[2KResolving dependencies[33m...[0m
[2K✔ Success! Locking...
[2K[32m⠇[0m Locking...
[1A[2KLocking[0m [33m[dev-packages][0m dependencies...[0m
[?25lBuilding requirements[33m...[0m
[2KResolving dependencies[33m...[0m
[2K✔ Success! Locking...
[2K[32m⠦[0m Locking...
[1A[2K[1mUpdated Pipfile.lock (710967f062910ffce98e65863f099e0664f6382fb08b7aa2a791a6207dd3d6e2)![0m
[1mInstalling dependencies from Pipfile.lock [0m[1m([0m[1md3d6e2[0m[1m)[0m[1;33m...[0m
To activate this project's virtualenv, run [33mpipenv s

In [3]:
# load dependencies
from pathlib import Path

from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [4]:
# set params
# DATA_PATH: directory passed to load_docs() as the source of documents.
DATA_PATH = "data/html"
# CHROMA_PATH: directory passed to Chroma as persist_directory.
CHROMA_PATH = "chroma_db"
# EMBED_MODEL: model name passed to SentenceTransformerEmbeddings.
EMBED_MODEL = "all-MiniLM-L6-v2" # Chroma defaults to "sentence-transformers/all-MiniLM-L6-v2"

In [5]:
# load docs
def load_docs(directory):
    """Load the documents found under ``directory``.

    Args:
        directory: Path of the folder handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader(directory).load()``.
    """
    return DirectoryLoader(directory).load()

documents = load_docs(DATA_PATH)
# Fail fast with a clear message when nothing was loaded (e.g. a wrong DATA_PATH),
# rather than handing an empty list to the embedding/indexing cell further down.
assert documents, f"No documents were loaded from {DATA_PATH!r}"
len(documents)

[nltk_data] Downloading package punkt to /Users/steve/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/steve/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


3487

In [6]:
# Inspect the first loaded document; its repr shows page_content and metadata.
documents[0]

Document(page_content='46.708 Warranties of data.\n\nWarranties of data shall be developed and used in accordance with agency regulations.\n\nSubpart 46.7 - Warranties', metadata={'source': 'data/html/46.708.html'})

In [7]:
# define text embedding model
# The model name comes from EMBED_MODEL in the params cell.
# "BAAI/bge-small-en-v1.5" is noted here, presumably as an alternative model to try — confirm.
embedding_func = SentenceTransformerEmbeddings(model_name=EMBED_MODEL)

# See the MTEB leaderboard for other embedding models: https://huggingface.co/spaces/mteb/leaderboard

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 1.40MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 984kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 14.0MB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 2.67MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 296kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 445kB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:03<00:00, 29.8MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 111kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 388kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.69MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 2.04MB/s]
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 13.2MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.26MB/s]
modules.json: 100%|██████████| 349/349

In [8]:
# initialize Chroma db and save to disk
# Building the store embeds every document, and Chroma.from_documents adds to
# whatever collection already lives in persist_directory, so re-running this
# cell would redo the embedding work and store the same documents again.
# Reuse the persisted store when one is present; delete the CHROMA_PATH
# directory to force a rebuild (e.g. after changing the data or EMBED_MODEL).
chroma_dir = Path(CHROMA_PATH)
if chroma_dir.is_dir() and any(chroma_dir.iterdir()):
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_func)

    # print message
    print(f"Loaded existing vector store from {CHROMA_PATH}.")
else:
    db = Chroma.from_documents(
        documents=documents, embedding=embedding_func, persist_directory=CHROMA_PATH
    )

    # NOTE(review): newer chromadb releases reportedly persist automatically,
    # which would make this call redundant; kept for older versions — confirm.
    db.persist()

    # print message
    print(f"Saved {len(documents)} chunks to {CHROMA_PATH}.")

In [10]:
# query vector db
# Sample question used to spot-check retrieval; similarity_search is called
# with its default arguments.
query = "What is the purpose of the Federal Acquisition Regulations System?"
matching_docs = db.similarity_search(query)

# Bare last expression so the notebook displays the returned documents.
matching_docs

[Document(page_content='1.101 Purpose.\n\nThe Federal Acquisition Regulations System is established for the codification and publication of uniform policies and procedures for acquisition by all executive agencies. The Federal Acquisition Regulations System consists of the Federal Acquisition Regulation (FAR), which is the primary document, and agency acquisition regulations that implement or supplement the FAR. The FAR System does not include internal agency guidance of the type described in 1.301(a)(2).\n\nSubpart 1.1 - Purpose, Authority, Issuance', metadata={'source': 'data/html/1.101.html'}),
 Document(page_content='1.000 Scope of part.\n\nThis part sets forth basic policies and general information about the Federal Acquisition Regulations System including purpose, authority, applicability, issuance, arrangement, numbering, dissemination, implementation, supplementation, maintenance, administration, and deviation. subparts\xa0 1.2,1.3, and 1.4 prescribe administrative procedures f