We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key.

In [19]:
import getpass
import os
from dotenv import load_dotenv

load_dotenv(override=True)

if not os.environ.get("OPENAI_API_KEY"): 
    #os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
    pass

# os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")  
# os.environ["OPENAI_API_KEY"]



In [20]:
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader, JSONLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.fastembed import FastEmbedEmbeddings

from langchain_iris import IRISVector


In [21]:
# loader = TextLoader("../data/state_of_the_union.txt", encoding='utf-8')
# Windows only install: 
# ! pip install https://jeffreyknockel.com/jq/jq-1.4.0-cp311-cp311-win_amd64.whl
# Other platforms
# ! pip install jq
#

loader = JSONLoader(
    file_path='./data/financial/tweets_all.jsonl',
    jq_schema='.note',
    json_lines=True # TODO: tell audience what json lines are
)
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()
# embeddings = FastEmbedEmbeddings()

In [22]:
username = 'demo'
password = 'demo' 
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = '61209' # '1972'
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

In [23]:
# print(os.environ.get("OPENAI_API_KEY"))
print(CONNECTION_STRING)


iris://demo:demo@localhost:61209/USER


In [24]:
COLLECTION_NAME = "financial_tweets"

db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [25]:
# If reconnecting to the database, use this:

# db = IRISVector(
#     embedding_function=embeddings,
#     dimension=1536,
#     collection_name=COLLECTION_NAME,
#     connection_string=CONNECTION_STRING,
# )

In [26]:
# To add documents to existing vector store:

db.add_documents(documents)

['bf3a9aaf-0c90-11ef-9f83-24418cc2f064',
 'bf3a9ab0-0c90-11ef-aefd-24418cc2f064',
 'bf3a9ab1-0c90-11ef-ab6a-24418cc2f064',
 'bf3a9ab2-0c90-11ef-86f9-24418cc2f064',
 'bf3a9ab3-0c90-11ef-97bc-24418cc2f064',
 'bf3a9ab4-0c90-11ef-b29d-24418cc2f064',
 'bf3a9ab5-0c90-11ef-b82d-24418cc2f064',
 'bf3a9ab6-0c90-11ef-b17c-24418cc2f064',
 'bf3a9ab7-0c90-11ef-9499-24418cc2f064',
 'bf3a9ab8-0c90-11ef-8dab-24418cc2f064',
 'bf3a9ab9-0c90-11ef-b64b-24418cc2f064',
 'bf3a9aba-0c90-11ef-a04e-24418cc2f064',
 'bf3a9abb-0c90-11ef-a8e7-24418cc2f064',
 'bf3a9abc-0c90-11ef-a284-24418cc2f064',
 'bf3a9abd-0c90-11ef-9d78-24418cc2f064',
 'bf3a9abe-0c90-11ef-bf01-24418cc2f064',
 'bf3a9abf-0c90-11ef-8c14-24418cc2f064',
 'bf3a9ac0-0c90-11ef-bfad-24418cc2f064',
 'bf3a9ac1-0c90-11ef-9634-24418cc2f064',
 'bf3a9ac2-0c90-11ef-8080-24418cc2f064',
 'bf3a9ac3-0c90-11ef-a5fd-24418cc2f064',
 'bf3a9ac4-0c90-11ef-bb4a-24418cc2f064',
 'bf3a9ac5-0c90-11ef-a208-24418cc2f064',
 'bf3a9ac6-0c90-11ef-b467-24418cc2f064',
 'bf3a9ac7-0c90-

In [27]:
print(f"Number of docs in vector store: {len(db.get()['ids'])}")

Number of docs in vector store: 4001


In [28]:
query = "Earning hit"
docs_with_score = db.similarity_search_with_score(query)

In [29]:
for doc, score in docs_with_score:
    print("-" * 80)
    print("Score: ", score)
    print(doc.page_content)
    print("-" * 80)

--------------------------------------------------------------------------------
Score:  0.206096636122251
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.206191386612508
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.206191386612508
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Score:  0.206191386612508
L'Oreal Expects Short-Term Virus Hit
--------------------------------------------------------------------------------


In [30]:
db.add_documents([Document(page_content="foo")])
docs_with_score = db.similarity_search_with_score("foo")
docs_with_score[0]

(Document(page_content='foo'), 0.0)

In [31]:
docs_with_score

[(Document(page_content='foo'), 0.0),
 (Document(page_content='foo'), 0.0),
 (Document(page_content='Analysis: Popeyes Chicken Sandwich Translates To Surge In Foot Traffic', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 945}),
  0.217591388484072),
 (Document(page_content='Analysis: Popeyes Chicken Sandwich Translates To Surge In Foot Traffic', metadata={'source': 'C:\\Users\\nmitchko\\Documents\\Sales\\Events\\2024\\gs24-ai-workshop\\exercise\\financial\\data\\financial\\tweets_all.jsonl', 'seq_num': 945}),
  0.217591388484072)]

In [32]:
retriever = db.as_retriever()
print(retriever)

tags=['IRISVector'] vectorstore=<langchain_iris.vectorstores.IRISVector object at 0x0000020B5E76E610>
