In [7]:
from __future__ import print_function
import logging
import sys
import os
import pandas as pd
import glob
from dotenv import load_dotenv
import logging
import sys
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core import ( Settings, VectorStoreIndex, SimpleDirectoryReader)
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core import (StorageContext, ServiceContext)
from llama_index.core.node_parser import MarkdownNodeParser
import datetime
import numpy as np
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter
from sqlalchemy import *
import duckdb
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import QueryEngineTool
from llama_index.core.tools import ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core import SQLDatabase, Document
from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.query_engine import SQLJoinQueryEngine
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core.retrievers import VectorIndexAutoRetriever
from llama_index.core.vector_stores import MetadataInfo, VectorStoreInfo
from langchain.text_splitter import RecursiveCharacterTextSplitter
from llama_index.core.node_parser import LangchainNodeParser
from sqlalchemy import (create_engine,MetaData,Table,Column,String,Integer,select,column,)
from sqlalchemy.dialects.postgresql import (INTEGER, FLOAT, BIGINT, VARCHAR, DOUBLE_PRECISION)
from llama_index.core.query_engine import NLSQLTableQueryEngine
from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine
from llama_index.core.objects import (SQLTableNodeMapping,ObjectIndex,SQLTableSchema,)
from llama_index.core import VectorStoreIndex
from llama_index.core.query_engine import SQLAutoVectorQueryEngine
# Keep the root logger quiet: only errors are shown while the notebook runs.
logging.getLogger().setLevel(logging.ERROR)

# Register a debug handler with LlamaIndex; it is created with
# print_trace_on_end=True, which is where the "Trace: ..." outputs come from.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])
Settings.callback_manager = callback_manager

# Azure OpenAI credentials are read from a local .env file, never hard-coded.
load_dotenv('/Users/sam/dev/spaghetti_dev/MedEd-AI/Credentials/.env')
endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
credential = os.getenv("AZURE_OPENAI_API_KEY")

# Azure OpenAI deployment settings.
azure_openai_api_version = "2024-04-01-preview"
azure_openai_embedding_deployment = "text-embedding-ada-002"
embedding_model_name = "text-embedding-ada-002"
llm_model_name = "gpt-35-turbo-16k"
api_type = "azure"

# Connection arguments shared by the chat model and the embedding model.
_azure_connection = dict(
    api_key=credential,
    azure_endpoint=endpoint,
    api_version=azure_openai_api_version,
    api_type=api_type,
)

# The deployment name is the same string as the model name for both clients.
llm = AzureOpenAI(
    model=llm_model_name,
    deployment_name=llm_model_name,
    **_azure_connection,
)

embed_model = AzureOpenAIEmbedding(
    model=embedding_model_name,
    deployment_name=embedding_model_name,
    embed_batch_size=50,
    **_azure_connection,
)

# Make both models the LlamaIndex-wide defaults.
Settings.llm = llm
Settings.embed_model = embed_model


In [4]:
def create_sql_engine(
    db_url="duckdb:////Users/sam/dev/spaghetti_dev/MedEd-AI/Code/pwyRAG/meded_ai_dev.duckdb",
):
    """Build a text-to-SQL query engine over the ``graded_quizzes`` table.

    Args:
        db_url: SQLAlchemy URL of the DuckDB database to query. Defaults to
            the development database this notebook was written against.

    Returns:
        A ``SQLTableRetrieverQueryEngine`` whose ``SQLDatabase`` is limited to
        the ``graded_quizzes`` table.

    Note:
        The ``duckdb://`` URL scheme needs a SQLAlchemy DuckDB dialect to be
        installed; the saved run of this notebook failed with
        ``NoSuchModuleError: Can't load plugin: sqlalchemy.dialects:duckdb``.
    """
    table_name = "graded_quizzes"
    engine = create_engine(db_url, future=True)

    # Reflect the table once with explicit column types. This raises early if
    # the table is missing from the database.
    # NOTE(review): this MetaData is not passed to SQLDatabase below, which is
    # built from the engine alone -- confirm whether these type overrides are
    # meant to reach the schema the LLM sees.
    Table(
        table_name,
        MetaData(),
        Column("quiz_id", INTEGER),
        Column('quiz_type', VARCHAR),
        Column('quiz_title', VARCHAR),
        Column('history_id', BIGINT),
        Column('submission_id', BIGINT),
        Column('student_score', DOUBLE_PRECISION),
        Column('quiz_question_count', BIGINT),
        Column('quiz_points_possible', DOUBLE_PRECISION),
        Column('question_points_possible', DOUBLE_PRECISION),
        Column('answer_points_scored', DOUBLE_PRECISION),
        Column('attempt', BIGINT),
        Column('question_name', VARCHAR),
        Column('question_type', VARCHAR),
        Column('question_text', VARCHAR),
        Column('question_answer', VARCHAR),
        Column('student_answer', VARCHAR),
        Column('course_id', VARCHAR),
        Column('accuracy', INTEGER),
        Column('completeness', INTEGER),
        autoload_with=engine,
        extend_existing=True,
    )

    # Only graded_quizzes is exposed to the text-to-SQL layer.
    sql_database = SQLDatabase(engine, include_tables=[table_name])

    # Index the table schema so the query engine retrieves it by similarity.
    table_node_mapping = SQLTableNodeMapping(sql_database)
    table_schema = [SQLTableSchema(table_name=table_name)]
    obj_index = ObjectIndex.from_objects(
        table_schema,
        table_node_mapping,
        VectorStoreIndex,
    )

    sql_query_engine = SQLTableRetrieverQueryEngine(
        sql_database,
        obj_index.as_retriever(similarity_top_k=1),
    )

    return sql_query_engine

In [5]:
def _row_to_record(rows):
    """Collapse the matched row(s) into a plain dict with NaN and 0 mapped to None."""
    record = rows.squeeze().to_dict()
    # np.nan (not the np.NaN alias, which NumPy 2.0 removed). NaN is first
    # turned into 0 and then every 0 into None, so genuine zeros are also
    # reported as None.
    cleaned = pd.DataFrame(record, index=[0]).replace(np.nan, 0).replace(0, None)
    return cleaned.to_dict('records')[0]


def _load_course_table(data_dir, table_name, course_prefix, cache):
    """Return the ``<table_name>.json`` frame whose path contains ``course_prefix``, reading it once."""
    key = (table_name, course_prefix)
    if key not in cache:
        pattern = os.path.join(data_dir, "coursedata_*", table_name + ".json")
        matches = [path for path in glob.glob(pattern, recursive=True) if course_prefix in path]
        # IndexError here means no export path contains this prefix.
        cache[key] = pd.read_json(matches[0])
    return cache[key]


def create_query_engine(data_dir="/Users/sam/dev/spaghetti_dev/MedEd-AI/Data"):
    """Build a metadata-aware retrieval query engine over the student guides.

    Every file under ``<data_dir>/studentguides/`` is loaded, enriched with
    file, folder and course metadata looked up in the
    ``<data_dir>/coursedata_*/course_*.json`` exports, embedded into a DuckDB
    vector store, and wrapped in an auto-retriever that can filter on that
    metadata.

    Side effects: sets the root logger level to WARNING and enables the pandas
    ``future.no_silent_downcasting`` option.

    Args:
        data_dir: Directory containing ``studentguides/`` and the
            ``coursedata_*`` export folders. Defaults to the location this
            notebook was written against.

    Returns:
        A ``RetrieverQueryEngine`` backed by a ``VectorIndexAutoRetriever``.

    Raises:
        IndexError: If no ``coursedata_*`` export path contains the first six
            characters of a guide's file name.
    """
    logging.getLogger().setLevel(logging.WARNING)
    pd.set_option('future.no_silent_downcasting', True)

    reader = SimpleDirectoryReader(
        os.path.join(data_dir, "studentguides", ""),
        recursive=True,
        filename_as_id=True,
        required_exts=[".pdf", ".docx", ".xlsx", ".pptx"],
    )

    # The JSON exports are identical for every guide of the same course, so
    # each one is parsed once per prefix instead of once per file.
    table_cache = {}
    documents = []
    for docs in reader.iter_data():
        if not docs:
            # Nothing was extracted from this file; there is nothing to tag.
            continue

        # The first six characters of the file name select the export folder
        # (presumably the course id -- confirm). The text from position 7 on,
        # with '_' turned into '+', is matched against the export's `filename`.
        file_name = docs[0].metadata['file_name']
        course_prefix = file_name[0:6]

        file_df = _load_course_table(data_dir, "course_file", course_prefix, table_cache)
        file_rows = file_df.loc[file_df['filename'] == file_name[7:].replace('_', '+')]
        if file_rows.empty:
            file_metadata = {}
            folder = ''
        else:
            file_metadata = _row_to_record(file_rows)
            folder = file_metadata.get('folder_id')

        folder_df = _load_course_table(data_dir, "course_folder", course_prefix, table_cache)
        folder_rows = folder_df.loc[folder_df['id'] == folder]
        if folder_rows.empty:
            folder_metadata = {}
        else:
            folder_metadata = _row_to_record(folder_rows)
            full_name = folder_metadata['full_name']
            if 'Week' in full_name:
                # Take the first path segment mentioning "Week" and strip the
                # word and spaces from it.
                week = [part for part in full_name.split("/") if 'Week' in part][0].replace('Week', '').replace(' ', '')
                folder_metadata.update({"week": week})
            folder_metadata.update({"folder_name": full_name.split("/")[-1]})

        course_df = _load_course_table(data_dir, "course_course", course_prefix, table_cache)
        course_rows = course_df.loc[course_df['id'] == folder_metadata.get('context_id')]
        course_metadata = {} if course_rows.empty else course_rows.squeeze().to_dict()

        # `term` is only usable when it is a mapping; a missing or non-dict
        # value yields no term name instead of an AttributeError.
        term = course_metadata.get('term')
        course_term = term.get('name') if isinstance(term, dict) else None

        # Key order is kept stable: file, then folder, then course fields.
        extra_metadata = {
            "file_id": file_metadata.get('id'),
            "folder_id": file_metadata.get('folder_id'),
            "display_name": file_metadata.get('display_name'),
            "week": folder_metadata.get('week'),
            "folder_name": folder_metadata.get('folder_name'),
            "course_id": course_metadata.get('id'),
            "course_name": course_metadata.get('name'),
            "course_code": course_metadata.get('course_code'),
            "course_term": course_term,
        }
        for doc in docs:
            doc.metadata.update(extra_metadata)
        documents.extend(docs)

    parser = LangchainNodeParser(RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size=100,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    ))

    nodes = parser.get_nodes_from_documents(documents)

    # NOTE(review): `nodes` is only handed to the DuckDBVectorStore constructor;
    # the index below is built from `documents`, not from these nodes. Confirm
    # whether the 100-character chunks were meant to be what gets indexed.
    vector_store = DuckDBVectorStore(nodes=nodes)

    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    vector_index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embed_model, llm=llm
    )

    # (name, description, type) for every metadata field the auto-retriever
    # may filter on.
    metadata_fields = [
        ("page_label", "What page of the file the context is from", "string"),
        ("file_name", "The name of the file the context is from", "string"),
        ("file_path", "The file path of the context file", "string"),
        ("file_type", "The type of file", "string"),
        ("file_size", "The size of the file in bytes", "integer"),
        ("creation_date", "When the file was created", "string"),
        ("last_modified_date", "When the file was last modified", "string"),
        ("display_name", "The name of the file", "string"),
        ("week", "The week the context was administered", "string"),
        ("folder_name", "The course folder that contains the file", "string"),
        ("course_id", "The unique identifier of the course", "integer"),
        ("course_name", "The full name of the course", "string"),
        ("course_code", "The shortened name of the course", "string"),
        ("course_term", "What term the course was offered in", "string"),
    ]
    vector_store_info = VectorStoreInfo(
        content_info="Student guides to help prepare for consolidation assessments",
        metadata_info=[
            MetadataInfo(name=name, description=description, type=type_)
            for name, description, type_ in metadata_fields
        ],
    )

    vector_auto_retriever = VectorIndexAutoRetriever(
        vector_index, vector_store_info=vector_store_info
    )

    retriever_query_engine = RetrieverQueryEngine.from_args(
        vector_auto_retriever, llm=llm
    )

    return retriever_query_engine

In [6]:
# Build the text-to-SQL engine over graded_quizzes.
# NOTE(review): the saved output of this cell is a NoSuchModuleError for the
# "duckdb" SQLAlchemy dialect, so the cell did not succeed as saved.
sql_query_engine = create_sql_engine()

NoSuchModuleError: Can't load plugin: sqlalchemy.dialects:duckdb

In [9]:
# Build the retrieval engine over the student guides. The "index_construction"
# trace printed below lists one line per embedding call made while indexing.
retriever_query_engine = create_query_engine()

**********
Trace: index_construction
    |_embedding -> 1.384201 seconds
    |_embedding -> 1.072492 seconds
    |_embedding -> 1.004756 seconds
    |_embedding -> 0.812897 seconds
    |_embedding -> 0.891374 seconds
    |_embedding -> 0.98197 seconds
    |_embedding -> 0.926156 seconds
    |_embedding -> 0.91497 seconds
    |_embedding -> 0.840605 seconds
    |_embedding -> 0.798109 seconds
    |_embedding -> 0.849738 seconds
    |_embedding -> 0.851721 seconds
    |_embedding -> 0.893558 seconds
    |_embedding -> 0.928841 seconds
    |_embedding -> 0.951602 seconds
    |_embedding -> 0.82258 seconds
    |_embedding -> 0.939885 seconds
    |_embedding -> 0.768516 seconds
    |_embedding -> 0.832789 seconds
    |_embedding -> 0.583164 seconds
**********


In [None]:
# Tool 1: structured questions answered by generating SQL over graded_quizzes.
# The description is what the router LLM reads, so it lists every column of
# the table, including course_id, accuracy and completeness.
sql_tool = QueryEngineTool.from_defaults(
    query_engine=sql_query_engine,
    description=(
        "Useful for translating a natural language query into a SQL query over"
        " a table graded_quizzes, containing columns:"
        " quiz_id, quiz_type, quiz_title, history_id, submission_id,"
        " student_score, quiz_question_count, quiz_points_possible, question_points_possible,"
        " answer_points_scored, attempt, question_name, question_type, question_text,"
        " question_answer, student_answer, course_id, accuracy, and completeness"
    ),
)

# Tool 2: semantic questions answered from the embedded student guides.
vector_tool = QueryEngineTool.from_defaults(
    query_engine=retriever_query_engine,
    description="Useful for answering semantic questions about consolidation assessments, and general course-related questions like when certain material is being taught",
)

# Combined engine that is given both tools and the shared LLM.
query_engine = SQLAutoVectorQueryEngine(
    sql_tool,
    vector_tool,
    llm=llm,
)

In [1]:
# Build the combined query engine through the `engines` factory imported from
# the RAG module (presumably a project-local package -- confirm).
# NOTE(review): the saved output shows this call failing with a DuckDB
# IOException while opening "/meded_ai_dev.duckdb" (read-only file system);
# verify the database path that RAG.engines resolves.
from RAG import engines
query_engine = engines()



OperationalError: (duckdb.duckdb.IOException) IO Error: Cannot open file "/meded_ai_dev.duckdb": Read-only file system
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [4]:
# Debug check: show the value currently bound to `endpoint`.
print(endpoint)

https://azure-ai-dev.hms.edu


In [5]:
# Reflect graded_quizzes into a fresh MetaData while declaring explicit types
# for the columns listed below.
# NOTE(review): `engine` is not assigned at notebook scope in any cell shown
# here (it is only a local inside create_sql_engine), so this cell depends on
# state from elsewhere in the session.
metadata_obj = MetaData()

_graded_quiz_column_types = [
    ("quiz_id", INTEGER),
    ('quiz_type', VARCHAR),
    ('quiz_title', VARCHAR),
    ('history_id', BIGINT),
    ('submission_id', BIGINT),
    ('student_score', DOUBLE_PRECISION),
    ('quiz_question_count', BIGINT),
    ('quiz_points_possible', DOUBLE_PRECISION),
    ('question_points_possible', DOUBLE_PRECISION),
    ('answer_points_scored', DOUBLE_PRECISION),
    ('attempt', BIGINT),
    ('question_name', VARCHAR),
    ('question_type', VARCHAR),
    ('question_text', VARCHAR),
    ('question_answer', VARCHAR),
    ('student_answer', VARCHAR),
    ('course_id', VARCHAR),
    ('accuracy', INTEGER),
    ('completeness', INTEGER),
]

alter_schema = Table(
    'graded_quizzes',
    metadata_obj,
    *(Column(column_name, column_type) for column_name, column_type in _graded_quiz_column_types),
    autoload_with=engine,
    extend_existing=True,
)

  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, enums, schema)  # type: ignore[attr-defined]
  columns = self._get_columns_info(rows, domains, 

In [6]:
# Reflect graded_quizzes through an explicit connection; the connection is
# closed when the block exits.
with engine.connect() as connection:
    metadata = MetaData()
    my_table = Table("graded_quizzes", metadata, autoload_with=connection)

In [7]:

# Expose only graded_quizzes to the text-to-SQL layer.
sql_database = SQLDatabase(engine, include_tables=["graded_quizzes"])

# Index the table schema so the query engine can look it up by similarity.
table_schema = [SQLTableSchema(table_name='graded_quizzes')]
table_node_mapping = SQLTableNodeMapping(sql_database)
obj_index = ObjectIndex.from_objects(table_schema, table_node_mapping, VectorStoreIndex)

# similarity_top_k=1: a single table schema is retrieved per question.
table_retriever = obj_index.as_retriever(similarity_top_k=1)
sql_query_engine = SQLTableRetrieverQueryEngine(sql_database, table_retriever)

**********
Trace: index_construction
    |_embedding -> 0.54971 seconds
**********


In [10]:
# Which questions of one quiz had the lowest average completeness and accuracy?
question = "For course_id 110777 and quiz_id 277245, which questions did the students have the worst average completeness and average accuracy?"
response = query_engine.query(question)
print(response.response)

**********
Trace: query
    |_retrieve -> 0.30948 seconds
      |_embedding -> 0.307282 seconds
    |_templating -> 1.1e-05 seconds
    |_llm -> 2.540442 seconds
    |_synthesize -> 1.346406 seconds
      |_templating -> 1.5e-05 seconds
      |_llm -> 1.343226 seconds
**********
The students had the worst average completeness and average accuracy for the following questions in course_id 110777 and quiz_id 277245: 
1. Question 1 with an average completeness of 2.407 and an average accuracy of 2.282
2. Question 3 with an average completeness of 2.441 and an average accuracy of 2.625
3. Question 4 with an average completeness of 2.752 and an average accuracy of 2.786
4. Question 2 with an average completeness of 2.930 and an average accuracy of 2.640


In [13]:
# Which keywords from the correct answer did students most often leave out?
question = "For course_id 110777 and quiz_id 277245 Question 1, compare the student answers to the correct answer. What keywords were most frequently missed?"
response = query_engine.query(question)
print(response.response)

**********
Trace: query
    |_retrieve -> 0.288963 seconds
      |_embedding -> 0.28698 seconds
    |_templating -> 1.1e-05 seconds
    |_llm -> 1.06513 seconds
    |_synthesize -> 2.052015 seconds
      |_templating -> 1.7e-05 seconds
      |_llm -> 2.047122 seconds
**********
The most frequently missed keywords in the student answers for Question 1 of course_id 110777 and quiz_id 277245 are "Neutrophil", "lobed", "purple nucleus", "granules", "do not stain very well", "Basophil", "purple granules", "cytoplasm", "Bilobed nucleus", "hard to visualize", "Eosinophils", "bilobed nucleus", "pink granules", "cytoplasm", "multisegmented nucleus", "granules do not stain well", "stain predominantly blue", "obscure the nucleus from view", "stain predominantly pink/red", "granules in the cytoplasm", "bigger than lymphocytes", "smaller than monocytes", "multi-lobed nuclei", "double-lobed nuclei", "much more pronounced/stained granules", "appear pinkish", "granules stain blue", "granules are basop

In [None]:
# Course-catalogue style question about a specific term.
question = "What course is offered in Fall 2022?"
response = query_engine.query(question)
print(response.response)

In [None]:
# Lowest average accuracy within the CBB 1 consolidation exercise.
question = "What Question of CBB 1 Consolidation Assessment Exercise did students have the poorest average accuracy?"
response = query_engine.query(question)
print(response.response)

In [None]:
# Highest average accuracy within the CBB 1 consolidation exercise.
question = "What Question of CBB 1 Consolidation Assessment Exercise did students have the best average accuracy?"
response = query_engine.query(question)
print(response.response)

In [None]:
# Distribution of accuracy scores for a single question.
question = "What is the distribution of accuracy on Question 5 of CBB 1 Consolidation Assessment Exercise?"
response = query_engine.query(question)
print(response.response)

In [None]:
# Distribution of accuracy scores for a single question.
# NOTE(review): this cell asks the same question, word for word, as the cell
# directly before it.
question = "What is the distribution of accuracy on Question 5 of CBB 1 Consolidation Assessment Exercise?"
response = query_engine.query(question)
print(response.response)