In [1]:
## Read variables from a local .env file into the process environment
from dotenv import load_dotenv

load_dotenv()


True

In [2]:
import logging
import sys

# Route log records at INFO level and above to stdout so they appear in the
# cell output.
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
)

In [3]:
from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    KnowledgeGraphIndex,
)
from llama_index.graph_stores import SimpleGraphStore

from llama_index.llms import OpenAI
from IPython.display import Markdown, display

In [None]:
"/home/njui/kn_workspace/curriculum_taxonomy_extractor/grade_6_math_13_to_17.pdf"

In [None]:
## Load the PDF files under the interim data directory into llama index.
# `file_name` holds a directory path (note the trailing slash), not a single
# file, so it is passed as `input_dir`; `input_files` expects paths to
# individual files. `required_exts` restricts the load to PDFs, matching the
# intent of this cell.
# NOTE(review): if only one specific PDF should be indexed, point `input_files`
# at that file instead.
file_name = "../data/interim/"
documents = SimpleDirectoryReader(
    input_dir=file_name,
    required_exts=[".pdf"],
).load_data()

In [None]:
# LLM handle (gpt-3.5-turbo, temperature 0) shared by the cells below.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Bundle the LLM with a chunk size of 512 for llama index components.
service_context = ServiceContext.from_defaults(
    chunk_size=512,
    llm=llm,
)

In [None]:
from llama_index.storage.storage_context import StorageContext

# Back the storage context with a SimpleGraphStore instance.
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
)

# NOTE: can take a while!
# Build the knowledge graph index from the loaded documents, asking for at
# most two triplets per chunk.
index = KnowledgeGraphIndex.from_documents(
    documents,
    service_context=service_context,
    storage_context=storage_context,
    max_triplets_per_chunk=2,
)

In [None]:
# Query engine over the index, configured with include_text=False and the
# "tree_summarize" response mode.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    include_text=False,
)

# Ask a first question against the graph.
response = query_engine.query("How many subjects are here?")

In [None]:
# Bare expression: the notebook renders `response` as this cell's output.
response

In [None]:
#### Using Nebula Graph

import logging
import sys


from llama_index import (
    KnowledgeGraphIndex,
    ServiceContext,
    SimpleDirectoryReader,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms import OpenAI

from IPython.display import Markdown, display


# Define the LLM used for the Nebula Graph section.
# NOTE: carried over from the original demo, where text-davinci-002 did not
# have rate-limit errors at the time; this notebook uses gpt-3.5-turbo.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# Service context built with the LLM only (no explicit chunk size here).
service_context = ServiceContext.from_defaults(
    llm=llm,
)

In [None]:
from nebula3.gclient.net import ConnectionPool
from nebula3.Config import Config

# Connection Configuration
config = Config()
config.max_connection_pool_size = 10
connection_pool = ConnectionPool()

# Initialize connection pool.
# Raise instead of calling exit(): in a Jupyter kernel exit() requests a kernel
# shutdown, which throws away everything defined by the earlier cells.
if not connection_pool.init([('172.17.0.1', 9669)], config):
    raise RuntimeError("Failed to connect to Nebula Graph!")

try:
    # Creating a session
    with connection_pool.session_context('root', 'nebula') as session:
        # Create a space with VID type.
        # IF NOT EXISTS keeps this cell re-runnable once the space is there.
        print("Creating space with VID type...")
        create_space_command = (
            "CREATE SPACE IF NOT EXISTS curriculum_graph "
            "(partition_num=10, replica_factor=1, vid_type=FIXED_STRING(3000));"
        )
        result = session.execute(create_space_command)
        if not result.is_succeeded():
            raise RuntimeError(f"Failed to create space: {result.error_msg()}")

        # Validate the creation of the space
        print("Validating space creation...")
        show_spaces_command = "SHOW SPACES;"
        result = session.execute(show_spaces_command)
        if result.is_succeeded():
            print("Available Spaces: ", result)
        else:
            # Report the failure instead of finishing silently.
            print("Failed to list spaces:", result.error_msg())
finally:
    # Close the connection pool even when a statement above failed.
    connection_pool.close()


In [None]:
from nebula3.gclient.net import ConnectionPool
from nebula3.Config import Config


def _execute_or_raise(session, statement):
    """Run one nGQL statement and raise RuntimeError if it does not succeed.

    Returns the result object of the successful statement.
    """
    result = session.execute(statement)
    if not result.is_succeeded():
        raise RuntimeError(f"Statement failed: {statement} -> {result.error_msg()}")
    return result


# Connection Configuration
config = Config()
config.max_connection_pool_size = 10
connection_pool = ConnectionPool()

# Initialize connection pool.
# Raise instead of calling exit(): in a Jupyter kernel exit() requests a kernel
# shutdown, which throws away everything defined by the earlier cells.
if not connection_pool.init([('172.17.0.1', 9669)], config):
    raise RuntimeError("Failed to connect to Nebula Graph!")

use_space_command = "USE curriculum_graph;"

# Tags with properties. IF NOT EXISTS keeps the cell re-runnable.
create_tag_commands = [
    "CREATE TAG IF NOT EXISTS Subject(name string, grade string);",
    "CREATE TAG IF NOT EXISTS Strand(name string);",
    "CREATE TAG IF NOT EXISTS SubStrand(name string);",
    "CREATE TAG IF NOT EXISTS LearningOutcome(description string);",
    "CREATE TAG IF NOT EXISTS AssessmentRubrics(indicator_name string, exceeds_expectations string, meets_expectations string, approaches_expectations string, below_expectations string);",
]

# Edge types with properties.
create_edge_commands = [
    "CREATE EDGE IF NOT EXISTS SubjectToStrand(importance string, sequence_order string, time_allocation string);",
    "CREATE EDGE IF NOT EXISTS StrandToSubStrand(importance string, sequence_order string, time_allocation string);",
    "CREATE EDGE IF NOT EXISTS SubStrandToLearningOutcome(importance string, sequence_order string, time_allocation string);",
    "CREATE EDGE IF NOT EXISTS RubricsToSubStrand(relevance string, application string);",
]

show_tags_command = "SHOW TAGS;"
show_edges_command = "SHOW EDGES;"

try:
    # Creating a session
    with connection_pool.session_context('root', 'nebula') as session:
        # Select the space first; without it every CREATE below would fail.
        # NOTE(review): NebulaGraph applies CREATE SPACE asynchronously, so USE
        # may fail if this cell runs immediately after the space was created --
        # wait a few seconds and re-run in that case.
        result = _execute_or_raise(session, use_space_command)
        print("Space loaded: ", result)

        # Create the schema, stopping at the first statement that is rejected.
        for statement in create_tag_commands + create_edge_commands:
            _execute_or_raise(session, statement)

        # Validate the creation of the schema
        print("Validating schema creation...")
        tags_result = session.execute(show_tags_command)
        edges_result = session.execute(show_edges_command)
        print("Available Tags: ", tags_result)
        print("Available Edges: ", edges_result)
finally:
    # Close the connection pool even when a statement above failed.
    connection_pool.close()


In [None]:
# Nebula Graph Store Configuration
space_name = "curriculum_graph"

# Edge types, and the property-name string supplied for each (same order).
edge_types = [
    "SubjectToStrand",
    "StrandToSubStrand",
    "SubStrandToLearningOutcome",
    "RubricsToSubStrand",
]
# NOTE(review): the first three entries name a single property each, while the
# CREATE EDGE statements in this notebook declare importance, sequence_order
# and time_allocation on all three of those edges -- confirm this pairing is
# what NebulaGraphStore expects.
rel_prop_names = [
    "importance",
    "sequence_order",
    "time_allocation",
    "relevance,application",
]

# Tags, and the comma-separated property names supplied for each (same order).
tags = [
    "Subject",
    "Strand",
    "SubStrand",
    "LearningOutcome",
    "AssessmentRubrics",
]
tag_prop_names = [
    "name,grade",
    "name",
    "name",
    "description",
    "indicator_name,exceeds_expectations,meets_expectations,approaches_expectations,below_expectations",
]

# Initialize the Nebula Graph Store from the settings above.
graph_store = NebulaGraphStore(
    space_name=space_name,
    tags=tags,
    tag_prop_names=tag_prop_names,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
)

# Create a storage context backed by the Nebula graph store.
storage_context = StorageContext.from_defaults(
    graph_store=graph_store,
)


In [None]:
# ## load pdf file to llama index
# file_name = "../data/interim/GRADE 4 CURRICULUM DESIGNS- Volume 3.pdf"
# documents = SimpleDirectoryReader(input_files=[file_name]).load_data(num_workers=5)

In [None]:
## Load the processed curriculum JSON and turn it into llama index documents
from llama_index import download_loader
import json

file_name = "../data/interim/GRADE 6 CURRICULUM DESIGNS- MATHEMATICS_processed.json"

# Parse the file first; the loader below receives the parsed object, not the path.
with open(file_name, encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# Obtain the reader class by name, instantiate it and load the parsed data.
JsonDataReader = download_loader("JsonDataReader")
loader = JsonDataReader()
documents = loader.load_data(json_data)

In [None]:
def extract_triplets_from_curriculum(data):
    """Turn curriculum records into (subject, relation, object) triplets.

    Args:
        data: An iterable of curriculum records, or a single record given as a
            dict. Every record must provide 'subject', 'strand' and
            'sub_strand'. An optional 'assessment_rubrics' list holds entries
            with an 'indicator_name' and an optional 'rubrics' list whose items
            provide 'level' and 'statement'.

    Returns:
        A list of 3-tuples in input order. For each record:
        (subject, 'SubjectToStrand', strand),
        (strand, 'StrandToSubStrand', sub_strand), then for each assessment
        (sub_strand, 'RubricsToSubStrand', indicator_name) followed by one
        (indicator_name, 'has_rubric_level', '<level>: <statement>') per rubric.

    Raises:
        KeyError: If a record lacks 'subject', 'strand' or 'sub_strand', an
            assessment lacks 'indicator_name', or a rubric lacks 'level' or
            'statement'.
    """
    # A lone record would otherwise be iterated key by key.
    records = [data] if isinstance(data, dict) else data

    triplets = []
    for item in records:
        subject = item['subject']
        strand = item['strand']
        substrand = item['sub_strand']

        # Triplets for subject, strand, and substrand
        triplets.append((subject, 'SubjectToStrand', strand))
        triplets.append((strand, 'StrandToSubStrand', substrand))

        # Processing assessment rubrics; a missing or None list means "none".
        for assessment in item.get('assessment_rubrics') or []:
            indicator = assessment['indicator_name']
            triplets.append((substrand, 'RubricsToSubStrand', indicator))

            # Processing each rubric level
            for rubric in assessment.get('rubrics') or []:
                rubric_detail = f"{rubric['level']}: {rubric['statement']}"
                triplets.append((indicator, 'has_rubric_level', rubric_detail))

    return triplets


In [None]:
# Keyword options for the index build, collected in one place so the call
# itself stays short.
kg_build_options = dict(
    storage_context=storage_context,
    service_context=service_context,
    max_triplets_per_chunk=10,
    max_object_length=100000,
    include_embeddings=True,
    show_progress=True,
    # Graph layout, same values as used for the graph store.
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
    tag_prop_names=tag_prop_names,
    # Disabled: custom extractor instead of the default triplet extraction.
    # kg_triplet_extract_fn=extract_triplets_from_curriculum,
)

kg_index = KnowledgeGraphIndex.from_documents(documents, **kg_build_options)

In [None]:
from llama_index.query_engine import KnowledgeGraphQueryEngine

from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore

# Query engine over the graph held by `storage_context`; verbose=True is
# passed through to the engine.
query_engine = KnowledgeGraphQueryEngine(
    llm=llm,
    service_context=service_context,
    storage_context=storage_context,
    verbose=True,
)

In [None]:
# Question text, split across two literals only to keep the lines short.
question = (
    "Share with me the substrands that depend on knowing this from the rubrics "
    '"Correctly and consistently multiplies up to a 4-digit"'
)
response = query_engine.query(question)

# Render the answer in bold as Markdown.
answer_markdown = Markdown(f"<b>{response}</b>")
display(answer_markdown)
