# Demo 5 (LlamaIndex, vectors)

In [1]:
import os
import cassio

# Initialize CassIO with the Astra DB settings taken from the environment.
# The token and the database ID are mandatory (a missing variable raises
# KeyError); the keyspace is optional and is passed as None when
# ASTRA_DB_KEYSPACE is not set.
cassio.init(
    token=os.environ['ASTRA_DB_APPLICATION_TOKEN'],
    database_id=os.environ['ASTRA_DB_ID'],
    keyspace=os.environ.get('ASTRA_DB_KEYSPACE'),
)

In [2]:
from llama_index import VectorStoreIndex, StorageContext
from llama_index.vector_stores import CassandraVectorStore
from llama_index import VectorStoreIndex

from llama_index import ServiceContext
from llama_index import set_global_service_context

from llama_index import SimpleWebPageReader
from llama_index.node_parser import SimpleNodeParser
from llama_index.node_parser.extractors import (
    MetadataExtractor,
    MetadataFeatureExtractor,
)

from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding

In [3]:
# Vector size passed to the vector store as `embedding_dimension`.
# NOTE(review): presumably the output size of the default OpenAIEmbedding
# model -- confirm if the embedding model is ever changed.
vector_dimension = 1536

# LLM and embedding model, both created with the library defaults.
oai_llm = OpenAI()
oai_embeddings = OpenAIEmbedding()

In [4]:
# Bundle the LLM, the embedding model and a chunk size into one service
# context, then register it as the global default.
service_context = ServiceContext.from_defaults(
    chunk_size=256,
    embed_model=oai_embeddings,
    llm=oai_llm,
)
set_global_service_context(service_context)

In [5]:
# Vector store on the 'llama_news_v_store' table.
# NOTE(review): session and keyspace are passed as None; presumably the store
# then falls back to what was given to cassio.init() -- confirm.
my_v_store = CassandraVectorStore(
    session=None,
    keyspace=None,
    table='llama_news_v_store',
    embedding_dimension=vector_dimension,
    insertion_batch_size=30,
)

# Storage context wrapping the vector store above.
storage_context = StorageContext.from_defaults(vector_store=my_v_store)

#### Custom metadata

This cell constructs a custom extractor that inspects the node content to generate its metadata dict:

In [7]:
def _fold_for_matching(text):
    """Return `text` lowercased and with combining accents removed.

    Used so that plain-ASCII keywords also match accented spellings,
    e.g. "Karikó" is folded to "kariko".
    """
    import unicodedata  # local import: keeps this cell runnable on its own

    decomposed = unicodedata.normalize('NFKD', text)
    unaccented = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unaccented.lower()


class CovidMentionExtractor(MetadataFeatureExtractor):
    """Toy extractor that tags every node with two keyword-based flags.

    For each node it produces a dict with:
      - "mentions_covid": True when the node text contains "covid";
      - "mentions_recipients": True when it contains "weissman" or "kariko".

    Matching ignores case and accents, so "Karikó" counts as "kariko".
    """

    @classmethod
    def class_name(cls):
        """Return the identifier string of this extractor."""
        # Declared as a classmethod so it can be called on the class as well as
        # on an instance.
        return 'covid_mention_extractor'

    def extract(self, nodes):
        """Build one metadata dict per node, in the same order as `nodes`.

        Args:
            nodes: iterable of objects exposing a `text` string attribute.

        Returns:
            A list of dicts with the boolean keys "mentions_covid" and
            "mentions_recipients".
        """
        metadata_dicts = []
        for node in nodes:
            # normalise once per node instead of once per keyword
            folded_text = _fold_for_matching(node.text)
            metadata_dicts.append(
                {
                    # contrived metadata for demonstration purposes
                    "mentions_covid": "covid" in folded_text,
                    "mentions_recipients": (
                        "weissman" in folded_text or "kariko" in folded_text
                    ),
                }
            )
        return metadata_dicts


# Wrap the custom extractor so that it can be handed to a node parser.
md_extractor = MetadataExtractor(extractors=[CovidMentionExtractor()])

In [8]:
src_url = 'https://www.washingtonpost.com/science/2023/10/02/nobel-prize-medicine/'

# Download the page, with html_to_text enabled, as a list of documents.
documents = SimpleWebPageReader(html_to_text=True).load_data([src_url])

# Node parser with explicit chunking settings and the custom metadata
# extractor attached; it turns the documents into nodes.
node_parser = SimpleNodeParser.from_defaults(
    metadata_extractor=md_extractor,
    chunk_overlap=80,
    chunk_size=400,
)
nodes = node_parser.get_nodes_from_documents(documents)

print(f"Documents: {len(documents)} => {len(nodes)} nodes.")

Documents: 1 => 19 nodes.


In [9]:
# Create the index directly on top of the vector store defined earlier.
my_v_index = VectorStoreIndex.from_vector_store(vector_store=my_v_store)

# Add the parsed nodes (with their custom metadata) to the index.
# NOTE(review): re-running this cell presumably inserts the nodes again -- confirm.
my_v_index.insert_nodes(nodes)

## Query

### Simple query

In [10]:
# Plain similarity search: the 3 most similar nodes are used for the answer.
query_engine = my_v_index.as_query_engine(similarity_top_k=3)
# Question wording fixed ("Prize in Medicine", previously "Prize or Medicine").
q_result = query_engine.query("Who won the Nobel Prize in Medicine in 2023?")
print(q_result.response.strip())

Katalin Karikó and Drew Weissman won the Nobel Prize in Medicine in 2023.


### MMR retrieval

In [11]:
# Same question, but retrieving with the "mmr" vector-store query mode.
# NOTE(review): mmr_prefetch_k is presumably the number of candidates fetched
# before MMR narrows them down to similarity_top_k -- confirm.
query_engine_mmr = my_v_index.as_query_engine(
    similarity_top_k=3,
    vector_store_query_mode="mmr",
    vector_store_kwargs={
        "mmr_prefetch_k": 8,
    },
)
# Question wording fixed ("Prize in Medicine", previously "Prize or Medicine").
q_result_mmr = query_engine_mmr.query("Who won the Nobel Prize in Medicine in 2023?")
print(q_result_mmr.response.strip())

Katalin Karikó and Drew Weissman won the Nobel Prize in Medicine in 2023.


### Metadata filtering

We exclude the nodes that mention the recipients' _names_ (using a filter on the `mentions_recipients` metadata) and ask the same question again:

In [12]:
# Only retrieve nodes whose "mentions_recipients" metadata equals "false".
# NOTE(review): the extractor emits Python booleans while the filter uses the
# string "false"; presumably the store serialises metadata values to strings
# -- confirm.
query_engine = my_v_index.as_query_engine(
    similarity_top_k=3,
    filters=MetadataFilters(filters=[
        ExactMatchFilter(key="mentions_recipients", value="false"),
    ])
)
# Question wording fixed ("Prize in Medicine", previously "Prize or Medicine").
q_result = query_engine.query("Who won the Nobel Prize in Medicine in 2023?")
print(q_result.response.strip())

The Nobel Prize in Medicine in 2023 was awarded to scientists who laid the foundation for messenger RNA vaccines.
