In [None]:
%pip install llama-index
%pip install llama-index-llms
%pip install llama-index-readers
%pip install llama-index-embeddings
%pip install dotenv

## Load Credentials

In [32]:
from __future__ import print_function
import logging
import sys
import os
from dotenv import load_dotenv
import asyncio
from llama_index.core import ( Settings, VectorStoreIndex, SimpleDirectoryReader)
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core.extractors import ( SummaryExtractor, QuestionsAnsweredExtractor, TitleExtractor, KeywordExtractor)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.question_gen import LLMQuestionGenerator
from llama_index.core.question_gen.prompts import (DEFAULT_SUB_QUESTION_PROMPT_TMPL)
from llama_index.core.schema import MetadataMode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.extractors.entity import EntityExtractor
from llama_index.llms.azure_openai import AzureOpenAI

logging.getLogger().setLevel(logging.WARNING)

llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

Settings.callback_manager = callback_manager

load_dotenv('../Credentials/.env')

endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
credential = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_api_version = "2024-04-01-preview"
azure_openai_embedding_deployment = "text-embedding-ada-002"
embedding_model_name = "text-embedding-ada-002"
llm_model_name = "gpt-35-turbo-16k"
api_type = "azure"

## Load Documents

In [33]:
reader = SimpleDirectoryReader("../Data/", recursive=True, filename_as_id=True, required_exts=[".pdf", ".docx", ".xlsx", ".pptx"])

documents = []
for docs in reader.iter_data():
    documents.extend(docs)

## Node Parsing

In [34]:
llm = AzureOpenAI(
            model = llm_model_name,
            deployment_name = llm_model_name,
            api_key = credential,
            azure_endpoint = endpoint,
            api_version = azure_openai_api_version,
            api_type = api_type
        )

embed_model = AzureOpenAIEmbedding(
            model = embedding_model_name,
            deployment_name = embedding_model_name,
            api_key = credential,
            azure_endpoint = endpoint,
            api_version = azure_openai_api_version,
            api_type = api_type,
            embed_batch_size=50
        )

Settings.llm = llm
Settings.embed_model = embed_model

In [35]:
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)

In [36]:

extractors = [
    TitleExtractor(nodes=2, llm=llm),
    # QuestionsAnsweredExtractor(questions=3, llm=llm),
    # EntityExtractor(prediction_threshold=0.5),
    # SummaryExtractor(summaries=["prev", "self"], llm=llm),
    # KeywordExtractor(keywords=3, llm=llm),
]

In [37]:
transformations = [text_splitter] + extractors

In [38]:
pipeline = IngestionPipeline(transformations=transformations)

In [39]:
nodes = pipeline.run(documents=documents, num_workers=4)

100%|██████████| 429/429 [02:18<00:00,  3.10it/s]
  4%|▍         | 17/429 [00:02<00:46,  8.80it/s]Exception in thread Thread-22:
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 576, in _handle_results
    task = get()
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/connection.py", line 256, in recv
    return _ForkingPickler.loads(buf.getbuffer())
TypeError: __init__() missing 2 required keyword-only arguments: 'response' and 'body'

 82%|█

In [None]:
print("LLM sees:\n",(nodes)[9].get_content(metadata_mode=MetadataMode.LLM))

## Create Index

In [None]:
index = VectorStoreIndex(nodes)

engine = index.as_query_engine(similarity_top_k=10, llm=llm)

index.storage_context.persist(persist_dir="./data2/index")

question_gen = LLMQuestionGenerator.from_defaults(
    llm=llm,
    prompt_template_str="""
        Follow the example, but instead of giving a question, always prefix the question 
        with: 'By first identifying and quoting the most relevant sources, '. 
        """
    + DEFAULT_SUB_QUESTION_PROMPT_TMPL,
)

## Save to Persistent Storage

In case you want to load your index later, saving you from having to re-parse your documents every time

In [None]:
index.storage_context.persist(persist_dir="../Data/course113113_index")

## Create Query Engine, Ask a Question

In [None]:
query_engine_tools = [
    QueryEngineTool(
        query_engine=engine,
        metadata=ToolMetadata(
            name="course_documents",
            description="course files from IHP1",
        ),
    ),
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools,
    question_gen=question_gen,
    use_async=False
)


# query_engine = index.as_query_engine(similarity_top_k=5)

query = (
    'Give me a 3 sentence summary of Hemodynamics'
)

query_response = query_engine.query(
    query
)

print(query_response)