In [1]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key needed by the
# embedding/chat calls made later in the notebook — confirm.
load_dotenv()

True

In [2]:
from llama_index.core import VectorStoreIndex, StorageContext, SimpleDirectoryReader
# from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.node_parser import SentenceSplitter
import logging

# Emit INFO-level log records so the HTTP requests issued by the client
# libraries are visible in the cell outputs.
logging.basicConfig(level=logging.INFO)

Check default settings

In [3]:
from llama_index.core import Settings

# Display the text splitter currently configured on the global Settings object.
Settings.text_splitter

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7fcf10393700>, id_func=<function default_id_func at 0x7fcec9365dc0>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')

Important: This default `text_splitter` will be used to convert documents to nodes.

The same conversion also happens when documents are passed to `VectorStoreIndex.from_documents`: LlamaIndex automatically splits the documents into nodes before storing them.

### Load Documents

In [4]:
import os

# Directory containing the source files to index. It can be overridden by
# setting DATA_DIR in the environment (or in the .env file loaded at the top
# of the notebook); the fallback keeps the original local path so existing
# runs behave exactly as before.
DATA_DIR = os.environ.get("DATA_DIR", "/home/kris/dev/syenza-docs/pdfs-tests")

# Read the files found in DATA_DIR into a list of documents.
documents = SimpleDirectoryReader(DATA_DIR).load_data()

# Split the documents into nodes with a SentenceSplitter built from its
# default arguments, so the chunking can be inspected before indexing.
nodes = SentenceSplitter().get_nodes_from_documents(documents)

In [5]:
# Compare how many documents were loaded with how many nodes the splitter produced.
len(documents), len(nodes)

(28, 78)

Count tokens for documents

In [6]:
import tiktoken

# NOTE: despite its name, `embedding` holds the tiktoken "cl100k_base" encoding
# (a tokenizer), not an embedding model. The cells below only call its
# `.encode()` method to count tokens.
embedding = tiktoken.get_encoding("cl100k_base")

In [7]:
# Print the token count of each loaded document's full text, one per line.
for doc in documents:
    doc_tokens = embedding.encode(doc.text)
    print(len(doc_tokens))

1392
1323
1445
1166
1786
1718
1495
1109
1041
940
1291
886
1009
1104
1028
965
1213
961
1402
1203
780
1646
725
990
971
1629
9596
15247


In [8]:
# Print the token count of every node produced by the splitter, one per line,
# to see how the chunks compare in size with the documents they came from.
for node in nodes:
    node_tokens = embedding.encode(node.text)
    tkn_count = len(node_tokens)
    print(tkn_count)

948
628
938
563
911
726
735
624
938
957
74
924
912
240
941
742
948
332
942
292
940
924
498
886
679
443
847
283
846
378
965
954
440
961
949
637
935
433
780
813
927
725
862
268
971
930
876
967
919
811
890
969
977
958
956
975
954
975
930
785
911
859
989
987
925
978
980
873
977
903
940
844
909
910
981
805
967
919


### Qdrant Client

Connect to a Qdrant instance running on localhost.

In [9]:
from qdrant_client import QdrantClient

# Address of the locally running Qdrant instance.
qdrant_url = "http://localhost:6333"
client = QdrantClient(url=qdrant_url)

In [10]:

COLLECTION_NAME = "second_test"

# Check for previously stored vectors BEFORE touching the index. Calling
# `from_documents` again on a populated collection would re-embed every
# document and append a second copy of all points, so re-running this cell
# would silently duplicate data (and pay for the embeddings again).
already_indexed = (
    client.collection_exists(COLLECTION_NAME)
    and client.count(collection_name=COLLECTION_NAME, exact=True).count > 0
)

vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if already_indexed:
    # Reuse the vectors already stored in Qdrant. To force a fresh ingest
    # (e.g. after the source files changed), delete the collection first.
    index = VectorStoreIndex.from_vector_store(vector_store)
else:
    # First run: split, embed and upload the documents to the collection.
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
    )

INFO:httpx:HTTP Request: GET http://localhost:6333/collections/second_test/exists "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/second_test "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/second_test/index?wait=true "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/second_test/exists "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET http://localhost:6333/collections/second_test "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/second_test/points?wait=true "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: PUT http://localhost:6333/collections/second_test/points?wait=true "HTTP/1.1 200 OK"


### Query Vector Store

In [11]:
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/second_test/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [12]:
# Display the full response object: the answer text plus its source nodes.
response

Response(response='The author explained that growing up, they were involved in a variety of activities such as sports, music, and volunteering.', source_nodes=[NodeWithScore(node=TextNode(id_='b351f3a4-e72a-409f-8ac7-0d0dcf24c436', embedding=None, metadata={'file_name': 'Brazil.docx', 'file_path': '/home/kris/dev/syenza-docs/pdfs-tests/Brazil.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'file_size': 62329, 'creation_date': '2024-07-11', 'last_modified_date': '2023-11-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='92e30d14-eb00-4f49-ad88-6d1d01a27373', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_name': 'Brazil.docx', 'file_path': '/home/kris/

### Retriever

In [27]:
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

# Retriever that returns the 5 most similar nodes for a query, wrapped in a
# query engine that answers from whatever the retriever returns.
retriever = VectorIndexRetriever(index=index, similarity_top_k=5)
query_engine_rt = RetrieverQueryEngine(retriever=retriever)



Get Response

In [29]:
# Ask the question through the retriever-backed query engine and display
# the resulting response object.
growing_up_question = "What did the author do growing up?"
response = query_engine_rt.query(growing_up_question)
response

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/second_test/points/search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Response(response='The author discussed interactions with a business regarding financial transactions and international transfers.', source_nodes=[NodeWithScore(node=TextNode(id_='b351f3a4-e72a-409f-8ac7-0d0dcf24c436', embedding=None, metadata={'file_name': 'Brazil.docx', 'file_path': '/home/kris/dev/syenza-docs/pdfs-tests/Brazil.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'file_size': 62329, 'creation_date': '2024-07-11', 'last_modified_date': '2023-11-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='92e30d14-eb00-4f49-ad88-6d1d01a27373', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_name': 'Brazil.docx', 'file_path': '/home/kris/dev/syenza-do

In [30]:
# Show only the answer text of the response, without the source nodes.
response.response

'The author discussed interactions with a business regarding financial transactions and international transfers.'

Get Similar Nodes

In [22]:
# Fetch the most similar nodes straight from the retriever, whose `retrieve`
# method accepts a plain query string. The query engine's `retrieve` is
# declared for a QueryBundle and only tolerated a raw string because no node
# postprocessors are attached; the retriever returns the same nodes here.
retrieved_nodes = retriever.retrieve("What did the author do growing up?")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://localhost:6333/collections/second_test/points/search "HTTP/1.1 200 OK"


In [23]:
# Display the raw list of retrieved nodes with their scores.
retrieved_nodes

[NodeWithScore(node=TextNode(id_='b351f3a4-e72a-409f-8ac7-0d0dcf24c436', embedding=None, metadata={'file_name': 'Brazil.docx', 'file_path': '/home/kris/dev/syenza-docs/pdfs-tests/Brazil.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'file_size': 62329, 'creation_date': '2024-07-11', 'last_modified_date': '2023-11-18'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='92e30d14-eb00-4f49-ad88-6d1d01a27373', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_name': 'Brazil.docx', 'file_path': '/home/kris/dev/syenza-docs/pdfs-tests/Brazil.docx', 'file_type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'file_size': 62329, 'creation_

In [26]:
# Print each retrieved node's similarity score followed by its text.
for node in retrieved_nodes:
    print(f"{node.score} {node.text}")

0.71907055 Brazil Interview 091023

Mon, Oct 09, 2023 6:45PM • 48:38

SUMMARY KEYWORDS

valve, patients, mitral, surgery, years, call, magna, prosthesis, Brazil, brazil, biological, mitral valve surgery, perfect, practice, difference, key opinion leaders, surgeon, durability, questions, interview

SPEAKERS

Brazil, João Carapinha, Danélia Botes



Danélia Botes  00:00

Like, they were thinking we were crooks. Like they were suspect from the get go, I remember their finance guy would call me and stuff. 



João Carapinha  00:06

Yeah.



Danélia Botes  00:07

And I Googled him, he looks like he's the owner's son. 



João Carapinha  00:12

Oh, I see.



Danélia Botes  00:12

But he's like a production manager or something. So I don't know why he's the one that makes the call whether they do international transfers and stuff. But I mean, I explained to him on that phone call that it's going to be money from the US, and it's going to be international transfer and I said, you'll see it on 