In [1]:
##############################################################
## This code is for academic and educational purposes only. ##
## Event: Global Summit 2024 Maryland USA                   ##
## InterSystems Corporation 2024 (C)                        ##
## Date: June 9th 2024                                      ##
##############################################################

##### We are going to use LlamaIndex, which lets us load data from files and store it in IRIS
from llama_index import download_loader
from llama_index import SimpleDirectoryReader, StorageContext, ServiceContext
from llama_index.readers.json import JSONReader
from llama_index.indices.vector_store import VectorStoreIndex
from llama_iris import IRISVectorStore

from dotenv import load_dotenv
load_dotenv(override=True)

import os

##### Load the dataset: a JSON Lines file (one JSON object per line) of financial tweets
TWEETS_PATH = './data/financial/tweets_all.jsonl'
reader = JSONReader(is_jsonl=True)          ##### is_jsonl=True -> treat the file as JSON Lines
documents = reader.load_data(TWEETS_PATH)   ##### the loaded documents, used by the cells below


In [2]:
##### Let's see the first 5 documents (the bare slice below is what the cell displays)
documents[:5]

##### These documents were already reduced (in Step 0) to just the tweet text and a subset of the tweets.
##### NOTE(review): this comment originally said "first 100 documents", but the progress bars of the
##### indexing cell further down report 1000 nodes -- confirm the actual count.

[Document(id_='34e355d8-87fe-45c9-8541-696b7b9e72d5', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='"note": "$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT"', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='1fb16baf-29d6-45e7-99f2-fa83411fb957', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='"note": "$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3"', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='edf6051c-8107-48fb-be23-a928b25cfb58', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationship

In [3]:
##### Configuring IRIS
# Connection settings for the demo IRIS instance. Every value can be overridden
# through an environment variable (or the .env file loaded by load_dotenv in the
# first cell); the fallbacks are the demo values this notebook was written with,
# so with nothing set the resulting connection string is unchanged.
username = os.getenv('IRIS_USERNAME', 'demo')
password = os.getenv('IRIS_PASSWORD', 'demo')
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = os.getenv('IRIS_PORT', '61209')          # kept as a string: it is only interpolated below
namespace = os.getenv('IRIS_NAMESPACE', 'USER')
# Layout: iris://<user>:<password>@<host>:<port>/<namespace>
# NOTE(review): the values are interpolated verbatim. If a password ever contains
# URL-reserved characters such as '@' or '/', it presumably needs percent-encoding
# before being placed in the URL -- confirm against the driver's documentation.
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
#####

In [4]:
##### Configure the IRISVectorStore helper: which IRIS instance to reach, which table to use,
##### and how wide each embedding vector is
EMBEDDING_DIMENSION = 1536  # openai embedding dimension
iris_store_params = {
    "connection_string": CONNECTION_STRING,
    "table_name": "financial_tweets_llamaindex",
    "embed_dim": EMBEDDING_DIMENSION,
}
vector_store = IRISVectorStore.from_params(**iris_store_params)

##### Wrap the vector store in a storage context so it can be handed to the index
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [5]:
##### Finally, we connect to the IRIS instance and save our data in a vectorized format.
##### from_documents receives the loaded documents plus the storage context, so the resulting
##### index is backed by the IRIS vector store configured earlier. With show_progress=True it
##### reports two stages: "Parsing nodes" and "Generating embeddings".
## TODO: explain how embeddings work and why we're using them
##### NOTE(review): no embedding model is selected in the cells visible here, so the library default
##### is used -- presumably OpenAI, given the 1536 "openai embedding dimension" set on the vector
##### store; confirm.
##### NOTE(review): re-running this cell presumably embeds and stores the documents again -- verify
##### whether that duplicates rows in the table.
index = VectorStoreIndex.from_documents(
    documents,                              ##### These are the financial tweets we loaded up
    storage_context=storage_context,        ##### This is our connection to the vector store
    show_progress=True,                     ##### Let's see the progress as it happens
)

##### To interact with our embeddings, we take the query engine from our index
query_engine = index.as_query_engine()      ##### "as_query_engine" is a llama_index method which lets
                                            ##### us search and retrieve based on vector similarity

Parsing nodes:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
import textwrap

##### Now, let's ask a question against our vector store!
question = "Can you tell me about microsoft earnings"
response = query_engine.query(question)

##### The response is converted to a string and wrapped at 100 columns so it reads well in the cell output
wrapped_answer = textwrap.fill(str(response), width=100)
print(wrapped_answer)

I am unable to provide information on Microsoft earnings based on the context provided.
