In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from glob import glob

from functools import wraps
from time import time
from datetime import datetime

In [3]:
# for each document set (eventually)
# read CSV probably using NonStupidCSVReader
import re
import os

fname = glob('../sample-data/*.csv')[0]
# fname = "../sample-data/Tweets by @NYCFireWire - Sheet1.csv"
fname = "../sample-data/constellation-10q.csv"
project_name = re.sub(r"\W", "_", os.path.basename(fname).split(".")[0])


In [4]:
# just for lookin'
import pandas as pd
df = pd.read_csv(fname)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,cik,fn,paragraph,classification,paragraph_index,font-size,font-family,font-style,...,line-height,text-align,width,margin-bottom,margin-top,text-indent,vertical-align,color,text_len,pct_numbers
0,1825,1325,23217,filings_raw/0000023217-2024Q1-10-Q-cag20231126...,After taking into account liabilities recogniz...,body,405,10pt,"""Times New Roman""",,...,,justify,,,,27pt,,,478.0,0.0
1,2577,842,910606,filings_raw/0000910606-2024Q1-8-K-d690058d8k.html,REGENCY CENTERS CORPO...,body,74,10pt,"""Times New Roman""",italic,...,normal,left,100%,1pt,0.0,,bottom,,479.0,0.025052
2,451,1022,16918,filings_raw/0000016918-2024Q1-10-Q-stz-2023113...,Quarter 2023 and (ii) \$6.8 million of favora...,body,451,11pt,"""Calibri"", sans-serif",,...,120%,,,,,,,#5e5e5e,482.0,0.020704
3,1363,756,19617,filings_raw/0000019617-2024Q1-8-K-jpm-20240116...,JPMorgan Chase & Co. elected Mark Weinberger a...,body,22,10pt,"""Amplitude TF"", sans-serif",,...,120%,,,,,,,#000,482.0,0.029046
4,2427,2227,1090872,filings_raw/0001090872-2024Q1-8-K-a-20240105.html,"On December 20, 2023, we issued a press releas...",body,25,10pt,"""Times New Roman"", sans-serif",,...,112%,,,,,36pt,,#000,484.0,0.012397


In [5]:
from non_stupid_csv_reader import NonStupidCSVReader
#TODO: automatically guess the text column name
# TODO: let users select multiple text columns (sewing them together into one entry? or embedding them separately? idk.)
documents = NonStupidCSVReader().load_data(fname, "paragraph")

# TODO: if embedding multiple columns, this is a way to embed them together.
# bizarrely, by default, LlamaIndex embeds the metadata too.
# we don't want that. Just embed the dang'd text.
# cf. https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/usage_documents/
for document in documents:
    document.excluded_embed_metadata_keys = document.metadata.keys()

In [6]:
# just for lookin'

len(documents)

182

In [19]:
from llama_index.core.schema import TextNode, TransformComponent, NodeRelationship
from llama_index.core.node_parser import SentenceSplitter

class Sploder(TransformComponent):
    def __call__(self, nodes, max_string_token_count=50, **kwargs):
        new_nodes = []
        splitter = SentenceSplitter() # just for the token_size method
                                      # TODO just go get the token_size method
        for node in nodes:
            new_nodes.append(node)
            if splitter._token_size(node.text) > max_string_token_count: continue
            if NodeRelationship.NEXT in node.relationships:
                b_c_node = TextNode(
                                text =  node.text + \
                                        node.relationships[NodeRelationship.NEXT].metadata["original_text"], 
                                metadata=node.metadata)
                new_nodes.append(b_c_node)
            if NodeRelationship.NEXT in node.relationships and NodeRelationship.PREVIOUS in node.relationships:
                a_b_c_node = TextNode(text=node.relationships[NodeRelationship.PREVIOUS].metadata["original_text"] + \
                                       node.text + \
                                       node.relationships[NodeRelationship.NEXT].metadata["original_text"],
                                  metadata=node.metadata
                                 )
                new_nodes.append(a_b_c_node)
        return new_nodes

In [20]:
from typing import Any, Callable, List
from llama_index.core import Document
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
import openai
import os
from dotenv import load_dotenv
load_dotenv()

openai.api_key = os.environ["OPENAI_API_KEY"]

# from llama_index.core import Settings
MODEL_NAME = "text-embedding-3-small"
USE_SPLODER = True
SPLODER_MAX_SIZE = 50
# Settings.embed_model = OpenAIEmbedding(model=MODEL_NAME)

# all transformations for the pipeline except the embedding that (might) costs money


# via https://github.com/run-llama/llama_index/blob/main/llama-index-core/llama_index/core/node_parser/text/utils.py
from llama_index.core.node_parser.text.utils import split_by_sentence_tokenizer_internal
def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
    # via https://stackoverflow.com/questions/14095971/how-to-tweak-the-nltk-sentence-tokenizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
    punkt_param = PunktParameters()    
    punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'corp', 'ltd'])
    tokenizer = PunktSentenceTokenizer(punkt_param)
    return lambda text: split_by_sentence_tokenizer_internal(text, tokenizer)


text_transformations = [
        # automatically splits by sentences in a logical way
        # my sentence splitter handlers abbreviations better.
        SentenceWindowNodeParser(sentence_splitter=split_by_sentence_tokenizer()), 
    ]
if USE_SPLODER:
    text_transformations.append(
        Sploder(max_string_token_count=SPLODER_MAX_SIZE),
                                    # for reasonably short sentences
                                    # add nodes for:
                                    # - the sentence and the sentence after
                                    # - the sentence before, tjhe sentence, and the sentence after
                                    # in hopes of capturing paragraph-level meaning too (but only for short sentences)
    )

In [21]:
import tiktoken
price_estimation_pipeline = IngestionPipeline(
    transformations=text_transformations
)
preview_nodes = price_estimation_pipeline.run(documents=documents)

price_per_1M = {"text-embedding-3-small": 0.020,
                "text-embedding-3-large": 0.130}

enc = tiktoken.encoding_for_model(MODEL_NAME)

token_count = sum([len(enc.encode(node.text)) for node in preview_nodes])
estimated_price = token_count * (price_per_1M[MODEL_NAME] / 1_000_000)
print("cost estimate: ${:.2f} ({:,.0f} tokens)".format(estimated_price, token_count))

cost estimate: $0.00 (76,394 tokens)


In [22]:
# just for lookin'
metadata = {
    "../sample-data/Tweets by @NYCFireWire - Sheet1.csv":["date", "acct", "url", "addr", "notes"],
    "../sample-data/constellation-10q.csv": ["fn", "paragraph_index"]
}
METADATA_COLUMNS_TO_DISPLAY = metadata[fname]

print("N nodes: {}".format(len(preview_nodes)))
with pd.option_context("display.max_colwidth", None):
    display(
            pd.DataFrame(
                [{"text": node.text, **{k:node.metadata[k] for k in METADATA_COLUMNS_TO_DISPLAY}} for node in preview_nodes[len(preview_nodes)//2:(len(preview_nodes)//2)+10]]
            )
    )
            


N nodes: 1346


Unnamed: 0,text,fn,paragraph_index
0,"As of November 30, 2023, and August 31, 2023, the assets and liabilities of the Business were classified as held for sale and the carrying value is less than the estimated fair value less cost to sell and, thus, no adjustment to the carrying value of the disposal group is necessary. For the three months ended November 30, 2023, depreciation and amortization expense for long-lived assets are not recorded while these assets are classified as held for sale. The divestiture did not meet the criteria to be reported as discontinued operations and we continued to report the operating results for the Business in our Condensed Consolidated Statement of Operations in the DMS segment until the Closing Date.",filings_raw/0000898293-2024Q1-10-Q-d57908d10q.html,352
1,The divestiture did not meet the criteria to be reported as discontinued operations and we continued to report the operating results for the Business in our Condensed Consolidated Statement of Operations in the DMS segment until the Closing Date.,filings_raw/0000898293-2024Q1-10-Q-d57908d10q.html,352
2,"Mr. Roth, age 48, has been the Company’s Executive Vice President, National Operations and East Region President since January of 2023.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
3,"Mr. Roth, age 48, has been the Company’s Executive Vice President, National Operations and East Region President since January of 2023. Prior to this role, he served as Senior Managing Director, East Region, and Managing Director of the Northeast, Mid-Atlantic and Southeast Regions, respectively.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
4,"Prior to this role, he served as Senior Managing Director, East Region, and Managing Director of the Northeast, Mid-Atlantic and Southeast Regions, respectively.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
5,"Prior to this role, he served as Senior Managing Director, East Region, and Managing Director of the Northeast, Mid-Atlantic and Southeast Regions, respectively. Mr. Roth has also held various other leadership positions at the Company, including Senior Vice President and Senior Market Officer for the Mid-Atlantic and Northeast portfolio, and Vice President and Regional Officer.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
6,"Mr. Roth, age 48, has been the Company’s Executive Vice President, National Operations and East Region President since January of 2023. Prior to this role, he served as Senior Managing Director, East Region, and Managing Director of the Northeast, Mid-Atlantic and Southeast Regions, respectively. Mr. Roth has also held various other leadership positions at the Company, including Senior Vice President and Senior Market Officer for the Mid-Atlantic and Northeast portfolio, and Vice President and Regional Officer.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
7,"Mr. Roth has also held various other leadership positions at the Company, including Senior Vice President and Senior Market Officer for the Mid-Atlantic and Northeast portfolio, and Vice President and Regional Officer.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
8,"Mr. Roth has also held various other leadership positions at the Company, including Senior Vice President and Senior Market Officer for the Mid-Atlantic and Northeast portfolio, and Vice President and Regional Officer. Mr. Roth joined the Company as a Leasing Agent in 1997 through the Company’s acquisition of Midland Development Group, and is a graduate of the Kelley School of Business at Indiana University.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33
9,"Prior to this role, he served as Senior Managing Director, East Region, and Managing Director of the Northeast, Mid-Atlantic and Southeast Regions, respectively. Mr. Roth has also held various other leadership positions at the Company, including Senior Vice President and Senior Market Officer for the Mid-Atlantic and Northeast portfolio, and Vice President and Regional Officer. Mr. Roth joined the Company as a Leasing Agent in 1997 through the Company’s acquisition of Midland Development Group, and is a graduate of the Kelley School of Business at Indiana University.",filings_raw/0000910606-2024Q1-8-K-reg-20240101.html,33


In [23]:
# TODO: support Llama etc.

# this is just monkeypatching to look at what's going on.
# from llama_index.embeddings.openai.base import get_embeddings
# def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
#     """Get text embeddings.

#     By default, this is a wrapper around _get_text_embedding.
#     Can be overridden for batch queries.

#     """
#     client = self._get_client()
#     retry_decorator = self._create_retry_decorator()
#     print(texts)
#     @retry_decorator
#     def _retryable_get_embeddings():
#         return get_embeddings(
#             client,
#             texts,
#             engine=self._text_engine,
#             **self.additional_kwargs,
#         )

#     return _retryable_get_embeddings()

# OpenAIEmbedding._get_text_embeddings = _get_text_embeddings
embedding_step = OpenAIEmbedding(model=MODEL_NAME )


In [24]:
# run the pipeline
pipeline = IngestionPipeline(
    transformations=text_transformations + [
        embedding_step
    ]
)
start_time = datetime.now()
nodes = pipeline.run(documents=documents)
end_time = datetime.now()
duration = end_time - start_time
print("took {}s to embed {} documents".format(duration.total_seconds(), len(documents)))

took 25.059383s to embed 182 documents


In [27]:
# TODO see https://docs.llamaindex.ai/en/stable/examples/vector_stores/DuckDBDemo/
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

model_dims = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072
}

VECTOR_STORE = "chroma"

start_time = datetime.now()
if VECTOR_STORE == "chroma":
    db = chromadb.PersistentClient(path="./chroma")
    chroma_collection = db.get_or_create_collection("{}".format(project_name))
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
elif VECTOR_STORE == "duckdb":
    vector_store = DuckDBVectorStore("{}2.duckdb".format(project_name), persist_dir="../duckdb/")
elif VECTOR_STORE == "postgres":
    vector_store = PGVectorStore.from_params(
        database="meaningfully",
        host="localhost",
        # password=url.password,
        port=5432,
        user="jeremybmerrill",
        table_name=project_name,
        embed_dim=model_dims[MODEL_NAME],  # openai embedding dimension
        hnsw_kwargs={
            "hnsw_m": 16,
            "hnsw_ef_construction": 64,
            "hnsw_ef_search": 40,
            "hnsw_dist_method": "vector_cosine_ops",
        },
    )
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes, storage_context=storage_context, show_progress=True
)
end_time = datetime.now()
duration = end_time - start_time
print("took {}s to store {} nodes".format(duration.total_seconds(), len(nodes)))
#index = VectorStoreIndex.from_documents(documents)

Generating embeddings: 0it [00:00, ?it/s]

took 6.292975s to store 1346 nodes


## basic basic search

just a smoke test. see search.ipynb for more.

In [29]:
from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
)

retriever = index.as_retriever(
    similarity_top_k=10,
    embed_model=OpenAIEmbedding(model_name=MODEL_NAME)
)
result_nodes = retriever.retrieve("snow at our factory cost us a lot of money")
len(result_nodes)

10

In [30]:
# just for lookin'
result_nodes_list = []
for node_with_score in result_nodes:
    result_node_dict = node_with_score.node.dict()
    result_node_dict["score"] = node_with_score.score
    result_nodes_list.append(result_node_dict)
result_nodes_df = pd.DataFrame(result_nodes_list)
with pd.option_context('display.max_colwidth', 500):
    display(result_nodes_df[["text", "metadata", "score"]])

Unnamed: 0,text,metadata,score
0,Operating profit in the second quarter of fiscal 2023 included charges of \$7.9 million associated with a fire occurring at one of our manufacturing facilities.,"{'Unnamed: 0.1': 2039, 'Unnamed: 0': 1539, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 619, 'font-size': '10pt', 'font-family': 'Times New Roman', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 663.0, 'pct_numbers': 0.036036036036036, 'window': 'Operating prof...",0.282017
1,Operating profit in the first half of fiscal 2024 included a benefit of \$5.9 million associated with insurance proceeds from the previous fire that occurred at one of our manufacturing facilities.,"{'Unnamed: 0.1': 2046, 'Unnamed: 0': 1546, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 626, 'font-size': '10pt', 'font-family': 'Times New Roman', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 717.0, 'pct_numbers': 0.0333333333333333, 'window': 'Operating pro...",0.276567
2,"The de crease was driven by the net sales decline discussed above, impacts of input cost inflation, and unfavorable fixed cost leverage, partially offset by productivity and lower transportation costs .","{'Unnamed: 0.1': 2039, 'Unnamed: 0': 1539, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 619, 'font-size': '10pt', 'font-family': 'Times New Roman', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 663.0, 'pct_numbers': 0.036036036036036, 'window': 'Operating prof...",0.271637
3,"The de crease was driven by the net sales decline discussed above, impacts of input cost inflation, and unfavorable fixed cost leverage, partially offset by inflation driven pricing that was implemented in the prior year, productivity, lower transportation costs, and lower inventory write-offs.","{'Unnamed: 0.1': 2040, 'Unnamed: 0': 1540, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 620, 'font-size': '10pt', 'font-family': '""Times New Roman""', 'font-style': '', 'font-weight': '', 'line-height': '120%', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': 'top', 'color': '', 'text_len': 1058.0, 'pct_numbers': 0.0319849482596425, 'window': 'Ope...",0.269803
4,Operating profit in the first half of fiscal 2023 included charges of \$7.9 million associated with a fire occurring at one of our manufacturing facilities and \$5.7 million related to the impairment of businesses previously held for sale.,"{'Unnamed: 0.1': 2040, 'Unnamed: 0': 1540, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 620, 'font-size': '10pt', 'font-family': '""Times New Roman""', 'font-style': '', 'font-weight': '', 'line-height': '120%', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': 'top', 'color': '', 'text_len': 1058.0, 'pct_numbers': 0.0319849482596425, 'window': 'The...",0.269772
5,These recoveries related to an outage at our Nava Brewery due to severe winter weather events in early 2021.,"{'Unnamed: 0.1': 295, 'Unnamed: 0': 866, 'cik': 16918, 'fn': 'filings_raw/0000016918-2024Q1-10-Q-stz-20231130.html', 'classification': 'body', 'paragraph_index': 295, 'font-size': '11pt', 'font-family': '""Calibri"", sans-serif', 'font-style': '', 'font-weight': '400', 'line-height': '120%', 'text-align': '', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '36pt', 'vertical-align': '', 'color': '#5e5e5e', 'text_len': 487.0, 'pct_numbers': 0.0327868852459016, 'window': 'In De...",0.267396
6,Operati ng profit in the second quarter of fiscal 2024 included a benefit of \$2.6 million associated with insurance proceeds from the previous fire that occurred at one of our manufacturing facilities.,"{'Unnamed: 0.1': 2045, 'Unnamed: 0': 1545, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 625, 'font-size': '10pt', 'font-family': 'Times New Roman', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 538.0, 'pct_numbers': 0.0314814814814814, 'window': 'Operating pro...",0.266229
7,"Our industry continues to be impacted by commodity cost fluctuations, labor cost inflation, input cost inflation, and other global macroeconomic challenges.","{'Unnamed: 0.1': 1965, 'Unnamed: 0': 1465, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 545, 'font-size': '10pt', 'font-family': '""Times New Roman"", Times, serif', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 518.0, 'pct_numbers': 0.0154440154440154, 'window'...",0.265619
8,"The de crease was driven by the net sales decline discussed above, impacts of input cost inflation, and unfavorable fixed cost leverage, partially offset by productivity and lower transportation costs . Advertising and promotion expenses decreased \$13.4 million compared to the second quarter of fiscal 2023 .","{'Unnamed: 0.1': 2039, 'Unnamed: 0': 1539, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 619, 'font-size': '10pt', 'font-family': 'Times New Roman', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 663.0, 'pct_numbers': 0.036036036036036, 'window': 'Operating prof...",0.263388
9,"Operating profit in our Refrigerated & Frozen segment for the second quarter of fiscal 2024 reflected a decrease in gross profits of \$35.9 million compared to the second quarter of fiscal 2023. The de crease was driven by the net sales decline discussed above, impacts of input cost inflation, and unfavorable fixed cost leverage, partially offset by productivity and lower transportation costs . Advertising and promotion expenses decreased \$13.4 million compared to the second quarter of fisc...","{'Unnamed: 0.1': 2039, 'Unnamed: 0': 1539, 'cik': 23217, 'fn': 'filings_raw/0000023217-2024Q1-10-Q-cag20231126_10q.html', 'classification': 'body', 'paragraph_index': 619, 'font-size': '10pt', 'font-family': 'Times New Roman', 'font-style': '', 'font-weight': '', 'line-height': '', 'text-align': 'justify', 'width': '', 'margin-bottom': '', 'margin-top': '', 'text-indent': '27pt', 'vertical-align': '', 'color': '', 'text_len': 663.0, 'pct_numbers': 0.036036036036036, 'window': 'Operating prof...",0.263237


In [31]:
sorted([node.text for node in nodes if 'severe winter' in node.text])

['In December 2023, we recorded \\$ 37 million of business interruption and other recoveries from our insurance carrier. We are pursuing additional reimbursement from another insurance carrier, however there can be no assurance there will be any incremental recoveries. These recoveries related to an outage at our Nava Brewery due to severe winter weather events in early 2021. ',
 'These recoveries related to an outage at our Nava Brewery due to severe winter weather events in early 2021. ',
 'These recoveries related to an outage at our Nava Brewery due to severe winter weather events in early 2021. These proceeds will be included in our consolidated results of operations for the year ending February\xa029, 2024.',
 'We are pursuing additional reimbursement from another insurance carrier, however there can be no assurance there will be any incremental recoveries. These recoveries related to an outage at our Nava Brewery due to severe winter weather events in early 2021. ',
 'We are pur

# timing test

In [None]:
# TODO see https://docs.llamaindex.ai/en/stable/examples/vector_stores/DuckDBDemo/
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb
model_dims = {
    "text-embedding-3-small": 1536,
    "text-embedding-3-large": 3072
}
VECTOR_STORE = "chroma"

for i in range(10):

    start_time = datetime.now()
    if VECTOR_STORE == "chroma":
        db = chromadb.PersistentClient(path="./chroma")
        chroma_collection = db.get_or_create_collection("{}-{}".format(project_name, i))
        vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    elif VECTOR_STORE == "duckdb":
        vector_store = DuckDBVectorStore(
            "{}-i{}.duckdb".format(project_name, i),
            persist_dir="../duckdb/"
        )
    elif VECTOR_STORE == "postgres":
        vector_store = PGVectorStore.from_params(
            database="meaningfully",
            host="localhost",
            # password=url.password,
            port=5432,
            user="jeremybmerrill",
            table_name=project_name,
            embed_dim=model_dims[MODEL_NAME],  # openai embedding dimension
            hnsw_kwargs={
                "hnsw_m": 16,
                "hnsw_ef_construction": 64,
                "hnsw_ef_search": 40,
                "hnsw_dist_method": "vector_cosine_ops",
            },
        )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex(
        nodes[:(i+1)*(len(nodes)//10)], storage_context=storage_context, show_progress=True
    )
    end_time = datetime.now()
    duration = end_time - start_time
    print("took {}s to store {} nodes".format(duration.total_seconds(), len(nodes[:(i+1)*(len(nodes)//10)])))
    #index = VectorStoreIndex.from_documents(documents)

In [None]:
[(node.node_id, node.text) for node in result_nodes]