We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key.

In [2]:
import getpass
import os
from dotenv import load_dotenv

load_dotenv(override=True)

if not os.environ.get("OPENAI_API_KEY"): 
    #os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
    pass

# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")  
# os.environ["OPENAI_API_KEY"]



In [3]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, JSONLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.fastembed import FastEmbedEmbeddings

from langchain_iris import IRISVector


In [4]:
# loader = TextLoader("../data/state_of_the_union.txt", encoding='utf-8')
# Windows only install: 
# ! pip install https://jeffreyknockel.com/jq/jq-1.4.0-cp311-cp311-win_amd64.whl
# Other platforms
# ! pip install jq
#

loader = JSONLoader(
    file_path='./data/financial/tweets_all.jsonl',
    jq_schema='.note',
    json_lines=True # TODO: tell audience what json lines are
)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
# embeddings = FastEmbedEmbeddings()

  warn_deprecated(


In [5]:
username = 'demo'
password = 'demo' 
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '51729' # '1972'
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

In [6]:
# print(os.environ.get("OPENAI_API_KEY"))
print(CONNECTION_STRING)


iris://demo:demo@localhost:51729/USER


In [7]:
COLLECTION_NAME = "financial_tweets"

db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [8]:
# If reconnecting to the database, use this:

# db = IRISVector(
#     embedding_function=embeddings,
#     dimension=1536,
#     collection_name=COLLECTION_NAME,
#     connection_string=CONNECTION_STRING,
# )

In [9]:
# To add documents to existing vector store:

db.add_documents(documents)

['d7dd9bca-08a4-11ef-881a-3448ed843086',
 'd7dd9bcb-08a4-11ef-ac98-3448ed843086',
 'd7dd9bcc-08a4-11ef-b883-3448ed843086',
 'd7dd9bcd-08a4-11ef-87ec-3448ed843086',
 'd7dd9bce-08a4-11ef-938d-3448ed843086',
 'd7dd9bcf-08a4-11ef-8687-3448ed843086',
 'd7dd9bd0-08a4-11ef-abd4-3448ed843086',
 'd7dd9bd1-08a4-11ef-9a3a-3448ed843086',
 'd7dd9bd2-08a4-11ef-bfeb-3448ed843086',
 'd7dd9bd3-08a4-11ef-9cba-3448ed843086',
 'd7dd9bd4-08a4-11ef-9be7-3448ed843086',
 'd7dd9bd5-08a4-11ef-8ab4-3448ed843086',
 'd7dd9bd6-08a4-11ef-9ed8-3448ed843086',
 'd7dd9bd7-08a4-11ef-b56f-3448ed843086',
 'd7dd9bd8-08a4-11ef-a40a-3448ed843086',
 'd7dd9bd9-08a4-11ef-9b9d-3448ed843086',
 'd7dd9bda-08a4-11ef-8994-3448ed843086',
 'd7dd9bdb-08a4-11ef-82f3-3448ed843086',
 'd7dd9bdc-08a4-11ef-ae0d-3448ed843086',
 'd7dd9bdd-08a4-11ef-aa00-3448ed843086',
 'd7dd9bde-08a4-11ef-9edb-3448ed843086',
 'd7dd9bdf-08a4-11ef-9ad0-3448ed843086',
 'd7dd9be0-08a4-11ef-8478-3448ed843086',
 'd7dd9be1-08a4-11ef-aa18-3448ed843086',
 'd7dd9be2-08a4-

In [10]:
print(f"Number of docs in vector store: {len(db.get()['ids'])}")

Number of docs in vector store: 2000


In [20]:
query = "Earning hit"
docs_with_score = db.similarity_search_with_score(query)

In [21]:
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  0.206096636122251
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.206096636122251
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.20792379246252
Hasbro upgraded on earnings growth potential from Entertainment One acquisition
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.207954574375134
Hasbro upgraded on earnings growth potential from Entertainment One acquisition
--------------------------------------------------------------------------------


In [13]:
db.add_documents([Document(page_content="foo")])
docs_with_score = db.similarity_search_with_score("foo")
docs_with_score[0]

(Document(page_content='foo'), 0.0)

In [22]:
docs_with_score

[(Document(page_content="L'Oreal Expects Short-Term Virus Hit", metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 832}),
  0.206096636122251),
 (Document(page_content="L'Oreal Expects Short-Term Virus Hit", metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 832}),
  0.206096636122251),
 (Document(page_content='Hasbro upgraded on earnings growth potential from Entertainment One acquisition', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 252}),
  0.20792379246252),
 (Document(page_content='Hasbro upgraded on earnings growth potential from Entertainment One acquisition', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\

In [23]:
retriever = db.as_retriever()
print(retriever)

tags=['IRISVector'] vectorstore=<langchain_iris.vectorstores.IRISVector object at 0x0000018DCDC70210>
