In [1]:
import os
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.node_parser import SentenceSplitter
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups later in the notebook can read the API credentials.
load_dotenv()

True

In [2]:
# Read service credentials/configuration from the environment.
# Any variable that is not set resolves to None (no exception is raised here).
azure_api_key, azure_endpoint, azure_api_version, open_ai_api_key = (
    os.environ.get(env_name)
    for env_name in (
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_API_VERSION",
        "OPENAI_API_KEY",
    )
)

In [3]:
# Sanity check: show which Azure OpenAI API version was picked up from the
# environment (prints "None" if AZURE_OPENAI_API_VERSION is not set).
print(azure_api_version)

2024-02-01


In [4]:
from llama_index.core import Settings

# --- Node parsers -----------------------------------------------------------
# Sentence-window parser: each node is a single sentence; the surrounding
# sentences are kept in node metadata under "window" and the sentence itself
# under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Baseline parser used for comparison: a plain sentence splitter.
text_splitter = SentenceSplitter()

# --- LLM --------------------------------------------------------------------
# Exactly one LLM client is constructed, selected by LLM_PROVIDER:
#   "openai" -> OpenAI gpt-3.5-turbo (reads OPENAI_API_KEY from the environment)
#   "azure"  -> Azure OpenAI deployment "gpt-35-turbo-1106"
LLM_PROVIDER = "openai"

if LLM_PROVIDER == "azure":
    llm = AzureOpenAI(
        deployment_name="gpt-35-turbo-1106",
        model="gpt-35-turbo",
        api_key=azure_api_key,
        azure_endpoint=azure_endpoint,
        # The constructor keyword is `api_version`; `azure_api_version` is not
        # a documented parameter of AzureOpenAI.
        api_version=azure_api_version,
        temperature=0.1,
    )
elif LLM_PROVIDER == "openai":
    llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
else:
    raise ValueError(f"Unsupported LLM_PROVIDER: {LLM_PROVIDER!r}")

# --- Embeddings -------------------------------------------------------------
# Local HuggingFace sentence-transformers model; no API key required.
# Hosted alternatives (not used here):
#   OpenAIEmbedding(model="text-embedding-3-small")
#   AzureOpenAIEmbedding(model="text-embedding-ada-002",
#                        deployment_name="text-embedding",
#                        api_key=azure_api_key,
#                        azure_endpoint=azure_endpoint,
#                        api_version=azure_api_version)
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2", max_length=512
)

# --- Global defaults --------------------------------------------------------
# Register the chosen components as llama_index-wide defaults.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

In [6]:
# Smoke-test the configured embedding model on a single sentence.
probe_sentence = "Open AI new Embeddings models is great."
embeddings = embed_model.get_text_embedding(probe_sentence)

Retrying llama_index.embeddings.openai.base.get_embedding in 0.507629071543005 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.
Retrying llama_index.embeddings.openai.base.get_embedding in 0.022862618272656565 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.
Retrying llama_index.embeddings.openai.base.get_embedding in 0.39819001757766515 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.
Retrying llama_index.embeddings.openai.base.get_embedding in 5.98160433539908 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.


KeyboardInterrupt: 

In [9]:
# Load the source PDF; the reader is given an explicit file list rather than
# a directory to scan.
pdf_reader = SimpleDirectoryReader(input_files=["data/luri_higher_topos.pdf"])
documents = pdf_reader.load_data()

We extract the set of nodes that will be stored in the `VectorStoreIndex`. This includes both the nodes produced by the sentence window parser and the "base" nodes produced by the standard sentence splitter.

In [10]:
# Chunk the same documents two ways, in this order:
#   nodes      <- sentence-window parser
#   base_nodes <- plain sentence splitter (baseline)
nodes, base_nodes = (
    parser.get_nodes_from_documents(documents)
    for parser in (node_parser, text_splitter)
)

In [17]:
# Build one vector index per chunking strategy (VectorStoreIndex is imported
# in the notebook's first cell). The embedding model is passed explicitly so
# both indexes are embedded with the notebook's `embed_model` even if the
# global Settings were changed by out-of-order cell execution.
sentence_index = VectorStoreIndex(nodes, embed_model=embed_model)
base_index = VectorStoreIndex(base_nodes, embed_model=embed_model)

Retrying llama_index.embeddings.openai.base.get_embeddings in 0.6751136004744102 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.
Retrying llama_index.embeddings.openai.base.get_embeddings in 1.838552138279843 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.
Retrying llama_index.embeddings.openai.base.get_embeddings in 3.5715455359355452 seconds as it raised NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}.


KeyboardInterrupt: 