In [1]:
import os
from dotenv import load_dotenv
from typing import List

from src.KnowledgeParser import KnowledgeParser
from src.utils import save_documents_as_json, save_markdown_content, save_nodes_as_json

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# API key handed to KnowledgeParser further down; None when the variable is unset.
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Root data directory: every input/output path in this notebook is joined onto it.
DATA_DIR = os.getenv("DATA_DIR_PATH")
if DATA_DIR is None:
    # Fail here with an actionable message rather than with a TypeError from
    # os.path.join(None, ...) in a later cell.
    raise RuntimeError(
        "DATA_DIR_PATH is not set; define it in the environment or in the .env file."
    )

In [8]:
# Location of the source PDF (the Form Two geography textbook) under DATA_DIR.
pdf = os.path.join(DATA_DIR, "documents/pdf/tie-geography-f2.pdf")

# Create the parser with the API key from the environment, then parse the PDF
# into a list of documents.
knowledge_parser = KnowledgeParser(api_key=LLAMA_CLOUD_API_KEY)
documents = knowledge_parser.parse_pdf_sync(pdf)

Started parsing the file under job_id ec2bb553-a847-40e8-ad5b-8c1c2cf12f90


In [9]:
# Preview the first 1000 characters of the first parsed document, followed by an ellipsis.
print(documents[0].text[:1000], '...', sep='')

## Geography for Secondary Schools

Student's Book

Two

Form

Tanzania Institute of Education
---
## Geography for Secondary Schools Student's Book Form Two

THE UNITED REPUBLIC OF TANZANIA

Ministry of Education, Science and Technology

|Textbook|August 2019|
|---|---|
|Author:|Dr. Lyabwene M. Mtahabwa|
|ISBN|978-9987-09-034-|

For Online Use Only

Tanzania Institute of Education

PROPERTY OF THE UNITED REPUBLIC OF TANZANIA GOVERNMENT

Ministry of Education, Science and Technology
---
Geography for Secondary Schools

FOR ONLINE USE ONLY

DO NOT DUPLICATE

© Tanzania Institute of Education, 2019

Published 2019

ISBN 978–9987–09–034–1

Tanzania Institute of Education

P. O. Box 35094

Dar es Salaam

Telephone: +255-22-2773005/+255-22-2771358

Fax: +255-22-2774420

Email: director.general@tie.go.tz

Website: www.tie.go.tz

All rights reserved. This book may not be reproduced, stored in any retrieval system or transmitted in any form or by any means, electronic, mechanical, photocopying

In [10]:
# Write the first parsed document out as a markdown file under DATA_DIR.
# Disabled alternative: dump all parsed documents to a single JSON file.
# save_documents_as_json(documents, os.path.join(DATA_DIR, "documents", "markdown-entire-docs.json"))
save_markdown_content(
    documents[0],
    os.path.join(DATA_DIR, "documents/markdown", "tie-geography-f2.md"),
)

In [7]:
# NOTE(review): the execution counter shows this cell ran as In [7], i.e. before the
# parsing cell (In [8]) that sits above it in the notebook. On a fresh
# "Restart & Run All" it would run after parsing instead. Consider moving it up next
# to the other imports — TODO confirm whether parse_pdf_sync depends on it.
import nest_asyncio
# Patch asyncio so that event loops can be nested.
nest_asyncio.apply()

In [8]:
# Run the parser's node_parser over the parsed documents to see what nodes it
# produces; the first two are printed further down for inspection.

nodes = knowledge_parser.node_parser.get_nodes_from_documents(documents)

6it [00:00, 8955.81it/s]
100%|██████████| 6/6 [00:03<00:00,  1.78it/s]


In [9]:
# Derive base nodes and objects from the nodes; the recursive index built later
# in this notebook takes both as input.
base_nodes, objects = knowledge_parser.node_parser.get_nodes_and_objects(nodes)

# TODO: check if this is necessary

In [10]:
from llama_index.core import VectorStoreIndex

In [11]:
# Two indexes over the same material:
#   recursive_index - built from the base nodes together with the extracted objects
#   raw_index       - built directly from the parsed documents
recursive_index = VectorStoreIndex(nodes=[*base_nodes, *objects])
raw_index = VectorStoreIndex.from_documents(documents)

In [13]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# One reranker instance (top_n=5) shared by both query engines.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Both engines use similarity_top_k=15 and the same reranker as post-processor;
# only the engine over the recursive index is created with verbose=True.
recursive_query_engine = recursive_index.as_query_engine(
    verbose=True,
    node_postprocessors=[reranker],
    similarity_top_k=15,
)
raw_query_engine = raw_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=15,
)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Show the first two nodes to inspect their structure.
print(nodes[:2])

[TextNode(id_='bf3387b4-f93e-46c8-a9e0-ed16be137cde', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='4135d9a1-4fe8-4dd1-884a-f304cada32a6', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='97c43d9d824f0c722797d4732d68a2d6118d714db0a1e60cb327a1106a8cbeb2'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='id_5ac92209-10a6-4076-bcc0-847339ca47d5_2_table_ref', node_type=<ObjectType.INDEX: '3'>, metadata={'col_schema': ''}, hash='0ade328d12f06180df2ca425461fdee2460f55a52f41e969a194b6bf99dfbd48')}, text='Check-in/out Details', start_char_idx=1, end_char_idx=21, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), IndexNode(id_='id_5ac92209-10a6-4076-bcc0-847339ca47d5_2_table_ref', embedding=None, metadata={'col_schema': ''}, excluded_embed_metadata_keys=['col_schema'], excluded_llm_metadata_keys=[], relation