We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key.

In [1]:
import getpass
import os
from dotenv import load_dotenv

load_dotenv(override=True)

if not os.environ.get("OPENAI_API_KEY"): 
    #os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
    pass

# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")  
# os.environ["OPENAI_API_KEY"]



In [2]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, JSONLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.fastembed import FastEmbedEmbeddings

from langchain_iris import IRISVector


In [3]:
# loader = TextLoader("../data/state_of_the_union.txt", encoding='utf-8')
# Windows only install: 
# ! pip install https://jeffreyknockel.com/jq/jq-1.4.0-cp311-cp311-win_amd64.whl
# Other platforms
# ! pip install jq
#

loader = JSONLoader(
    file_path='./data/financial/tweets_all.jsonl',
    jq_schema='.note',
    json_lines=True # TODO: tell audience what json lines are
)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
# embeddings = FastEmbedEmbeddings()

  warn_deprecated(


In [4]:
username = 'demo'
password = 'demo' 
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '61209' # '1972'
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

In [5]:
# print(os.environ.get("OPENAI_API_KEY"))
print(CONNECTION_STRING)


iris://demo:demo@localhost:61209/USER


In [6]:
COLLECTION_NAME = "financial_tweets"

db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [7]:
# If reconnecting to the database, use this:

# db = IRISVector(
#     embedding_function=embeddings,
#     dimension=1536,
#     collection_name=COLLECTION_NAME,
#     connection_string=CONNECTION_STRING,
# )

In [8]:
# To add documents to existing vector store:

db.add_documents(documents)

['82d95b80-0c9e-11ef-b397-24418cc2f064',
 '82d95b81-0c9e-11ef-81e2-24418cc2f064',
 '82d95b82-0c9e-11ef-b8e9-24418cc2f064',
 '82d95b83-0c9e-11ef-bf55-24418cc2f064',
 '82d95b84-0c9e-11ef-a13a-24418cc2f064',
 '82d95b85-0c9e-11ef-8c9c-24418cc2f064',
 '82d95b86-0c9e-11ef-97c7-24418cc2f064',
 '82d95b87-0c9e-11ef-945e-24418cc2f064',
 '82d95b88-0c9e-11ef-927e-24418cc2f064',
 '82d95b89-0c9e-11ef-92b2-24418cc2f064',
 '82d95b8a-0c9e-11ef-a175-24418cc2f064',
 '82d95b8b-0c9e-11ef-a2b1-24418cc2f064',
 '82d95b8c-0c9e-11ef-987f-24418cc2f064',
 '82d95b8d-0c9e-11ef-a7b6-24418cc2f064',
 '82d95b8e-0c9e-11ef-9f30-24418cc2f064',
 '82d95b8f-0c9e-11ef-b516-24418cc2f064',
 '82d95b90-0c9e-11ef-a625-24418cc2f064',
 '82d95b91-0c9e-11ef-8a9a-24418cc2f064',
 '82d95b92-0c9e-11ef-9b28-24418cc2f064',
 '82d95b93-0c9e-11ef-a822-24418cc2f064',
 '82d95b94-0c9e-11ef-9b6c-24418cc2f064',
 '82d95b95-0c9e-11ef-8ee7-24418cc2f064',
 '82d95b96-0c9e-11ef-9f32-24418cc2f064',
 '82d95b97-0c9e-11ef-964f-24418cc2f064',
 '82d95b98-0c9e-

In [9]:
print(f"Number of docs in vector store: {len(db.get()['ids'])}")

Number of docs in vector store: 2000


In [10]:
query = "Earning hit"
docs_with_score = db.similarity_search_with_score(query)

In [11]:
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  0.206096636122251
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.206205877698645
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.20792379246252
Hasbro upgraded on earnings growth potential from Entertainment One acquisition
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.207954574375134
Hasbro upgraded on earnings growth potential from Entertainment One acquisition
--------------------------------------------------------------------------------


In [12]:
db.add_documents([Document(page_content="foo")])
docs_with_score = db.similarity_search_with_score("foo")
docs_with_score[0]

(Document(page_content='foo'), 0.0)

In [13]:
docs_with_score

[(Document(page_content='foo'), 0.0),
 (Document(page_content='Analysis: Popeyes Chicken Sandwich Translates To Surge In Foot Traffic', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 945}),
  0.217646553582138),
 (Document(page_content='Analysis: Popeyes Chicken Sandwich Translates To Surge In Foot Traffic', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 945}),
  0.217646553582138),
 (Document(page_content='Bull camp empty on Sally Beauty', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 368}),
  0.222598389051368)]

In [14]:
retriever = db.as_retriever()
print(retriever)

tags=['IRISVector'] vectorstore=<langchain_iris.vectorstores.IRISVector object at 0x000001CD894F9550>
