In [1]:
%pip install llama-index
%pip install llama-index-llms
%pip install llama-index-embeddings
%pip install dotenv

Defaulting to user installation because normal site-packages is not writeable
Collecting tqdm<5.0.0,>=4.66.1 (from llama-index-core<0.11.0,>=0.10.0->llama-index)
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.66.4-py3-none-any.whl (78 kB)
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mInstalling collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.55.1
    Uninstalling tqdm-4.55.1:
      Successfully uninstalled tqdm-4.55.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the follow

## Load Credentials

In [1]:
from __future__ import print_function
import logging
import sys
import os
from dotenv import load_dotenv
from llama_index.core import ( Settings, VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage)
from llama_index.core.callbacks import CallbackManager
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser

# Only let warnings and errors through the root logger.
logging.getLogger().setLevel(logging.WARNING)

# Read the Azure OpenAI settings from the local .env file. The path is
# relative to the notebook's working directory.
load_dotenv('../Credentials/.env')

# Root folder that data paths are appended to by plain string concatenation,
# so it must keep its trailing "/". The leading "/" makes it absolute; without
# it the path was resolved relative to the current working directory.
user_path = "/Users/sam/dev/spaghetti_dev/"

endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
credential = os.getenv("AZURE_OPENAI_API_KEY")

# os.getenv returns None for unset variables. Report which names are missing
# here (never the values) so the problem is visible before the clients are built.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_ENDPOINT", endpoint),
        ("AZURE_OPENAI_API_KEY", credential),
    )
    if not value
]
if _missing_settings:
    logging.warning(
        "Missing Azure OpenAI settings: %s", ", ".join(_missing_settings)
    )

azure_openai_api_version = "2024-04-01-preview"
# Presumably the Azure deployment that serves the embedding model; it currently
# has the same value as the model name below.
azure_openai_embedding_deployment = "text-embedding-ada-002"
embedding_model_name = "text-embedding-ada-002"
llm_model_name = "gpt-35-turbo-16k"
api_type = "azure"

## Load Documents

In [2]:
# Walk the student-guides folder recursively and pick up only PDF, Word,
# Excel and PowerPoint files.
reader = SimpleDirectoryReader(
    user_path + "MedEd-AI/Data/studentguides/",
    recursive=True,
    filename_as_id=True,
    required_exts=[".pdf", ".docx", ".xlsx", ".pptx"],
)

# Each item produced by iter_data() is itself a collection of documents;
# flatten them into a single list.
documents = [doc for batch in reader.iter_data() for doc in batch]

In [3]:
# Show the metadata of the first loaded document as a quick sanity check.
# NOTE(review): the saved output lists a file under Data/course113113/downloads,
# while the reader cell points at studentguides; re-run to refresh the output.
documents[0].metadata

{'page_label': '1',
 'file_name': '1-s2.0-S1538544221000821-main.pdf',
 'file_path': '/Users/sam/dev/spaghetti_dev/MedEd-AI/Code/../Data/course113113/downloads/1-s2.0-S1538544221000821-main.pdf',
 'file_type': 'application/pdf',
 'file_size': 719173,
 'creation_date': '2024-04-16',
 'last_modified_date': '2024-04-16'}

## Node Parsing

In [4]:
# LangChain's recursive character splitter does the chunking. The chunk size
# is deliberately tiny here, purely for demonstration.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# Wrap the splitter so it can be used as a LlamaIndex node parser, then turn
# the loaded documents into nodes.
parser = LangchainNodeParser(text_splitter)
nodes = parser.get_nodes_from_documents(documents)

## Create Index

In [5]:
# Azure OpenAI chat model. The deployment is addressed by the same name as
# the model (llm_model_name).
llm = AzureOpenAI(
    model=llm_model_name,
    deployment_name=llm_model_name,
    api_key=credential,
    azure_endpoint=endpoint,
    api_version=azure_openai_api_version,
    api_type=api_type,
)

# Azure OpenAI embedding model. The deployment name comes from its dedicated
# config constant rather than the model name, so the two can differ without
# touching this cell.
embed_model = AzureOpenAIEmbedding(
    model=embedding_model_name,
    deployment_name=azure_openai_embedding_deployment,
    api_key=credential,
    azure_endpoint=endpoint,
    api_version=azure_openai_api_version,
    api_type=api_type,
    embed_batch_size=50,
)

# Register both models as the global defaults used by LlamaIndex.
Settings.llm = llm
Settings.embed_model = embed_model

# NOTE(review): this manager is created but not attached to Settings or the
# index in this cell; confirm whether it is still needed.
callback_manager = CallbackManager()

# Build the vector index over the parsed nodes.
index = VectorStoreIndex(nodes)

print("Index created")

Index created


## Save to Persistent Storage

Persist the index to disk so you can load it later instead of re-parsing your documents every time.

In [None]:
# Write the index's storage context to ../Data/course113113_index (relative to
# the working directory) so the index can be reloaded later instead of rebuilt.
index.storage_context.persist(persist_dir="../Data/course113113_index")

## Create Query Engine, Ask a Question

In [6]:
# Query engine over the index, asked for the 5 most similar results per query.
query_engine = index.as_query_engine(similarity_top_k=5)

# Prompt asking for a 5-question quiz, with the expected JSON layout embedded
# as a template at the end of the prompt.
query = 'Please create a 5 question review quiz on the pharmacology of antithrombotics . Do not ask questions about text from a photo/graph. Do not ask any questions about textbook sections. Every question should have only one correct answer. Every question should have 4 potential answers. Every answer option should have a detailed explanation. Format the output like this:{"info":{"title": "", "description": "", "questions": [ { "question": "","type": "RADIO","options": [{"option":"","explanation":""}], "correct_answer": ""}]}}'

query_response = query_engine.query(query)
print(query_response)

{"info":{"title": "Pharmacology of Antithrombotics Quiz", "description": "This quiz will test your knowledge on the pharmacology of antithrombotics. Each question has one correct answer and four potential answer options. Read each question carefully and select the best answer.", "questions": [
{ 
  "question": "What is the main purpose of antithrombotics?",
  "type": "RADIO",
  "options": [
    {"option": "To promote thrombus formation", "explanation": "Incorrect. Antithrombotics are agents that prevent and/or reverse thrombus formation."},
    {"option": "To treat hypertension", "explanation": "Incorrect. Antithrombotics are not primarily used to treat hypertension."},
    {"option": "To prevent and/or reverse thrombus formation", "explanation": "Correct. Antithrombotics are agents that prevent and/or reverse thrombus formation."},
    {"option": "To promote blood clotting", "explanation": "Incorrect. Antithrombotics do not promote blood clotting."}
  ],
  "correct_answer": "To preven