In [None]:
%pip install llama-index
%pip install llama-index-llms
%pip install llama-index-readers
%pip install llama-index-embeddings
%pip install dotenv

## Load Credentials

In [1]:
from __future__ import print_function
import logging
import sys
import os
import pandas as pd
import glob
from dotenv import load_dotenv
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import ( Settings, VectorStoreIndex, SimpleDirectoryReader)
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
import chromadb
from llama_index.core.callbacks import CallbackManager
import datetime
import numpy as np
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter
from sqlalchemy import *

# Only surface warnings and errors from the root logger.
logging.getLogger().setLevel(logging.WARNING)

# Print a timing trace after every LlamaIndex operation (index build, query, ...).
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

Settings.callback_manager = callback_manager

# Credentials live outside the notebook so that they are never committed.
load_dotenv('../Credentials/.env')

endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
credential = os.getenv("AZURE_OPENAI_API_KEY")

# Fail here, with a clear message, instead of letting a missing value surface
# later as an opaque authentication/connection error from the Azure client.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_ENDPOINT", endpoint),
        ("AZURE_OPENAI_API_KEY", credential),
    )
    if not value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in ../Credentials/.env or in the process environment."
    )

azure_openai_api_version = "2024-04-01-preview"
azure_openai_embedding_deployment = "text-embedding-ada-002"
embedding_model_name = "text-embedding-ada-002"
llm_model_name = "gpt-35-turbo-16k"
api_type = "azure"

ModuleNotFoundError: No module named 'dotenv'

## Load ChromaDB

In [135]:
CHROMA_COLLECTION_NAME = "quickstart"

remote_db = chromadb.HttpClient(host='localhost', port=8000)

# Start from an empty collection on every run, but only delete it when it is
# actually there: delete_collection raises for an unknown collection, which
# would break the very first run against a fresh Chroma server.
# list_collections() yields Collection objects or plain names depending on the
# chromadb version, so both shapes are accepted.
_existing_collections = {
    getattr(entry, "name", entry) for entry in remote_db.list_collections()
}
if CHROMA_COLLECTION_NAME in _existing_collections:
    remote_db.delete_collection(CHROMA_COLLECTION_NAME)

chroma_collection = remote_db.get_or_create_collection(CHROMA_COLLECTION_NAME)

## Load Data, Get Metadata

In [136]:
DATA_DIR = "../Data/"

reader = SimpleDirectoryReader(
    DATA_DIR,
    recursive=True,
    filename_as_id=True,
    required_exts=[".pdf", ".docx", ".xlsx", ".pptx"],
)

pd.set_option('future.no_silent_downcasting', True)


def _load_export(pattern):
    """Read every JSON export matching ``pattern`` into a single DataFrame.

    Returns an empty DataFrame when no file matches, so lookups degrade to
    "not found" instead of raising.
    """
    frames = [pd.read_json(path) for path in sorted(glob.glob(pattern))]
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def _to_native(value):
    """Map missing values (NaN/NaT/None) to None and NumPy scalars to Python scalars."""
    if isinstance(value, (dict, list, tuple)):
        # Nested JSON values (e.g. the course "term" object) are kept as-is.
        return value
    if isinstance(value, np.generic):
        value = value.item()
    return None if pd.isna(value) else value


def _lookup(table, column, value):
    """Return the first row of ``table`` where ``column == value`` as a dict.

    Returns ``{}`` when ``value`` is None, the column is absent, or no row matches.
    """
    if value is None or column not in table.columns:
        return {}
    matches = table.loc[table[column] == value]
    if matches.empty:
        return {}
    return {key: _to_native(item) for key, item in matches.iloc[0].items()}


def _week_label(full_name):
    """Extract the week from a folder path such as 'course files/Week 4' -> '4'."""
    if not isinstance(full_name, str):
        return None
    for part in full_name.split("/"):
        if 'Week' in part:
            return part.replace('Week', '').replace(' ', '')
    return None


# Read the exports once, up front, rather than once per document. Combining all
# courses into one table also means a match found in one course is no longer
# overwritten by a miss in the next one.
file_table = _load_export(os.path.join(DATA_DIR, "course*", "course_file.json"))
folder_table = _load_export(os.path.join(DATA_DIR, "course*", "course_folder.json"))
course_table = _load_export(os.path.join(DATA_DIR, "course*", "course_course.json"))

documents = []
for docs in reader.iter_data():
    if not docs:
        # Nothing was extracted from this file; there is nothing to annotate.
        continue

    # The export lists file names with '+' where the file on disk has '_'.
    export_name = docs[0].metadata['file_name'].replace('_', '+')

    # Resolve file -> folder -> course; each step yields {} when not found.
    file_metadata = _lookup(file_table, 'filename', export_name)
    folder_metadata = _lookup(folder_table, 'id', file_metadata.get('folder_id'))
    course_metadata = _lookup(course_table, 'id', folder_metadata.get('context_id'))

    # "term" is a nested object in the course export and may be absent or null.
    term = course_metadata.get('term')
    term_name = term.get('name') if isinstance(term, dict) else None

    extra_metadata = {
        "file_id": file_metadata.get('id'),
        "folder_id": file_metadata.get('folder_id'),
        "display_name": file_metadata.get('display_name'),
        "week": _week_label(folder_metadata.get('full_name')),
        "course_id": course_metadata.get('id'),
        "course_name": course_metadata.get('name'),
        "course_code": course_metadata.get('course_code'),
        "course_term": term_name,
    }

    # Every document produced from the same file gets the same metadata.
    for doc in docs:
        doc.metadata.update(extra_metadata)

    documents.extend(docs)


In [137]:
# Spot-check the metadata attached to the first loaded document
# (raises IndexError if no documents were loaded).
documents[0].metadata

{'page_label': '1',
 'file_name': '1-s2.0-S1538544221000821-main.pdf',
 'file_path': '../Data/course113113/downloads/1-s2.0-S1538544221000821-main.pdf',
 'file_type': 'application/pdf',
 'file_size': 719173,
 'creation_date': '2024-04-16',
 'last_modified_date': '2024-04-16',
 'last_accessed_date': '2024-04-30',
 'file_id': 19434347,
 'folder_id': 3847349,
 'display_name': '1-s2.0-S1538544221000821-main.pdf',
 'week': '4',
 'course_id': 131972,
 'course_name': 'PWY 133: Integrated Human Pathophysiology II - HMS - 01/29/2024 - 03/15/2024',
 'course_code': 'PWY 133',
 'course_term': '2023-2024 Spring'}

## Node Parsing, Index Creation

In [138]:
# Configure the Azure OpenAI models, then embed the documents into Chroma.

# Connection arguments that the chat model and the embedding model share.
azure_connection = dict(
    api_key=credential,
    azure_endpoint=endpoint,
    api_version=azure_openai_api_version,
    api_type=api_type,
)

llm = AzureOpenAI(
    model=llm_model_name,
    deployment_name=llm_model_name,
    **azure_connection,
)

embed_model = AzureOpenAIEmbedding(
    model=embedding_model_name,
    deployment_name=embedding_model_name,
    embed_batch_size=50,
    **azure_connection,
)

# Register both models as the LlamaIndex defaults.
Settings.llm = llm
Settings.embed_model = embed_model

# Back the index with the Chroma collection created earlier.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embed_model,
)


**********
Trace: index_construction
    |_embedding ->  1.638826 seconds
    |_embedding ->  0.70462 seconds
    |_embedding ->  1.033021 seconds
    |_embedding ->  0.682273 seconds
    |_embedding ->  0.533042 seconds
    |_embedding ->  0.438322 seconds
    |_embedding ->  0.445663 seconds
    |_embedding ->  0.709348 seconds
    |_embedding ->  0.548527 seconds
    |_embedding ->  0.446297 seconds
    |_embedding ->  0.485594 seconds
    |_embedding ->  0.62756 seconds
    |_embedding ->  0.477588 seconds
    |_embedding ->  0.449777 seconds
    |_embedding ->  0.518441 seconds
    |_embedding ->  0.416154 seconds
    |_embedding ->  0.597191 seconds
    |_embedding ->  0.824583 seconds
    |_embedding ->  1.720888 seconds
    |_embedding ->  0.617372 seconds
    |_embedding ->  0.622606 seconds
    |_embedding ->  0.588417 seconds
    |_embedding ->  0.59923 seconds
    |_embedding ->  0.563162 seconds
    |_embedding ->  0.799817 seconds
    |_embedding ->  0.505863 seconds
    

## Save to Persistent Storage

Persist the index to disk so that it can be reloaded later without re-parsing the documents every time.

In [139]:
# Opening a PersistentClient does not save anything by itself: the index above
# was written to the remote Chroma server. Copy the stored records (ids,
# embeddings, text and metadata) into an on-disk collection so that they can be
# reloaded later without re-parsing or re-embedding the documents.
COPY_BATCH_SIZE = 500

db = chromadb.PersistentClient(path="../chroma_db")
persistent_collection = db.get_or_create_collection(chroma_collection.name)

# Page through the remote collection to keep memory use and request size bounded.
_offset = 0
while True:
    _batch = chroma_collection.get(
        include=["embeddings", "documents", "metadatas"],
        limit=COPY_BATCH_SIZE,
        offset=_offset,
    )
    if not _batch["ids"]:
        break
    # upsert makes re-running this cell safe: existing ids are overwritten.
    persistent_collection.upsert(
        ids=_batch["ids"],
        embeddings=_batch["embeddings"],
        metadatas=_batch["metadatas"],
        documents=_batch["documents"],
    )
    _offset += len(_batch["ids"])

## Create Query Engine, Ask a Question

In [150]:
# Restrict retrieval to one course and one week. Every entry must match; to
# search the whole course, drop the "week" entry.
filter_values = {
    "course_id": 131972,
    "week": '2',
}

filters = MetadataFilters(
    filters=[
        MetadataFilter(key=field, value=wanted)
        for field, wanted in filter_values.items()
    ]
)

query_engine = index.as_query_engine(filters=filters)
response = query_engine.query("Give me a multiple choice question from the context")
print(response)


**********
Trace: query
    |_query ->  1.504121 seconds
      |_retrieve ->  0.286729 seconds
        |_embedding ->  0.230248 seconds
      |_synthesize ->  1.217181 seconds
        |_templating ->  1.2e-05 seconds
        |_llm ->  1.214584 seconds
**********
Question: If lifestyle modifications alone are not sufficient, which initial anti-hypertensive medication (or class of medications) would you choose? Explain your rationale.
A) ACE inhibitors
B) Beta blockers
C) Calcium channel blockers
D) Diuretics
