
# LlamaIndex - Advanced RAG

Useful links:
- https://www.llamaindex.ai/


In [None]:
%%capture
!pip install llama-index >> null
!pip install openai >> null
!pip install pypdf >> null   # for reading PDF files
!pip install docx2txt > null # for reading MS doc files

In [None]:
import os

import logging
import sys
from pprint import pprint

# Send INFO-level logs to stdout through exactly one root handler.
# force=True (Python 3.8+) replaces whatever handlers the runtime installed
# beforehand; stacking a second StreamHandler on top of basicConfig would
# print every log record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    load_index_from_storage,
    StorageContext,
    ServiceContext,
    Document
)

# from llama_index.llms import OpenAI, Anthropic
# from openai import OpenAI
import openai

from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import SentenceWindowNodeParser, HierarchicalNodeParser, get_leaf_nodes
from llama_index.core.text_splitter import SentenceSplitter
# from llama_index.embeddings import OpenAIEmbedding, HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.core.schema import MetadataMode
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# from IPython.display import Markdown, display
# from transformers import AutoTokenizer, T5ForConditionalGeneration

# Step 0:  Authentication with Org ID and API Key

In [None]:
# Never store the API key in the notebook itself: take it from the environment
# and fall back to a hidden interactive prompt, so nothing secret is saved
# with the file. Any key that was ever pasted into a shared notebook should be
# revoked and replaced.
from getpass import getpass

openai_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
#org_ID = "xxxxxxxxxxxx" #<--- Your Organization ID

In [None]:



# Export the key through the process environment, then mirror the exported
# value onto the `openai` module attribute.
os.environ.update({"OPENAI_API_KEY": openai_key})
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Step 1:  Fetch Data and Store into local directory

In [None]:
# create local directory and retrieve file from external source
!mkdir -p 'my_data'
!wget 'https://www.gutenberg.org/cache/epub/72306/pg72306.txt' -O './my_data/teahistory.txt'
!wget 'https://www.gutenberg.org/cache/epub/11367/pg11367.txt' -O './my_data/chinahistory.txt'

--2024-03-26 02:07:27--  https://www.gutenberg.org/cache/epub/72306/pg72306.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 493829 (482K) [text/plain]
Saving to: ‘./my_data/teahistory.txt’


2024-03-26 02:07:27 (3.28 MB/s) - ‘./my_data/teahistory.txt’ saved [493829/493829]

--2024-03-26 02:07:27--  https://www.gutenberg.org/cache/epub/11367/pg11367.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977246 (954K) [text/plain]
Saving to: ‘./my_data/chinahistory.txt’


2024-03-26 02:07:28 (5.23 MB/s) - ‘./my_data/chinahistory.txt’ saved [977246/977246]



# Step 2:  Load files into "Document" objects

In [None]:
def count_lines(file_path, encoding='utf-8'):
    """Return the number of lines in a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        int: Number of lines; 0 for an empty file. A final line without a
        trailing newline is still counted.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # Stream line by line; the file is never held in memory as a whole.
        return sum(1 for _ in file)

def count_words(file_path, encoding='utf-8'):
    """Return the number of whitespace-separated words in a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        int: Number of words; 0 for an empty or whitespace-only file.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # A newline is whitespace, so no word spans two lines: summing the
        # per-line counts equals splitting the whole text, without loading
        # the entire file into memory.
        return sum(len(line.split()) for line in file)


def count_paragraphs(file_path, encoding='utf-8'):
    """Return the number of paragraphs in a text file.

    A paragraph is a run of consecutive non-blank lines. Paragraphs are
    separated by one or more blank lines, and a line containing only
    whitespace counts as blank.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        int: Number of paragraphs; 0 for an empty or whitespace-only file.
    """
    paragraph_count = 0
    in_paragraph = False
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            if line.strip():
                # Count a paragraph once, on its first non-blank line.
                if not in_paragraph:
                    paragraph_count += 1
                    in_paragraph = True
            else:
                # A blank (or whitespace-only) line ends the current paragraph.
                in_paragraph = False
    return paragraph_count




# Example usage: basic size statistics for one of the downloaded books.
# The path is relative, matching the folder the wget cell saves into, so the
# cell does not depend on Colab's /content working directory.
file_path = './my_data/chinahistory.txt'
num_lines = count_lines(file_path)
print("Number of lines in the file:", num_lines)

num_words = count_words(file_path)
print("Number of words in the file:", num_words)

num_paragraphs = count_paragraphs(file_path)
print("Number of paragraphs in the file:", num_paragraphs)


Number of lines in the file: 16714
Number of words in the file: 158193
Number of paragraphs in the file: 1354


In [None]:
 documents = SimpleDirectoryReader("./my_data/").load_data()

# Step 2B (Optional):  Inspect the documents object

In [None]:
# Inspect the documents.
# Show only the metadata and a short text preview per document: pprint-ing the
# Document objects themselves would dump the full text of every file.
print("length of doc: "+ str(len(documents)))
print("----")
for doc in documents:
    pprint(doc.metadata)
    print(doc.text[:300])
    print("----")


length of doc: 1
----
[Document(id_='50186eaf-9f1b-42c0-83d5-f1a0e6cb0777', embedding=None, metadata={'file_path': '/content/my_data/calender.txt', 'file_name': 'calender.txt', 'file_type': 'text/plain', 'file_size': 5417, 'creation_date': '2024-03-26', 'last_modified_date': '2024-03-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="\n\n1. **Marketing Strategy Review**\n   - Date: January 7, 2024\n   - Time: 3:00 PM - 4:30 PM\n   - Location: Office Boardroom\n   - Pre-read Items:\n     - SWOT Analysis of Current Marketing Efforts\n     - Competitor Analysis Report\n     - Proposed Marketing Budget for Q1 2024\n\n2. **Product Development Brainstorming**\n   - Date: January 10, 2024\n   - Time: 2:00 PM - 4:00 PM\n   - Location: Innovation Lab\n   - Pre

In [None]:
# A cell displays only its last expression, so the first of two bare
# expressions was silently discarded. Show the metadata of every loaded
# document in one list; this also avoids an IndexError when fewer than two
# files were loaded.
[doc.metadata for doc in documents]

{'file_path': '/content/my_data/teahistory.txt',
 'file_name': 'teahistory.txt',
 'file_type': 'text/plain',
 'file_size': 493829,
 'creation_date': '2024-03-25',
 'last_modified_date': '2024-02-29'}

# Step 3:  Node Parsing & Indexing (Base & Sentence Window Method)

In [None]:
# OpenAI chat model, configured with a low temperature.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Baseline parser: SentenceSplitter with all of its defaults.
base_node_parser = SentenceSplitter()

# Sentence-window parser: window size 3, with the window and the original
# text stored under the "window" / "original_text" metadata keys.
window_parser_options = {
    "window_size": 3,
    "window_metadata_key": "window",
    "original_text_metadata_key": "original_text",
}
sentence_node_parser = SentenceWindowNodeParser.from_defaults(**window_parser_options)


In [None]:
# Run the same documents through both parsers, sentence-window first:
# `nodes` holds the sentence-window nodes, `base_nodes` the SentenceSplitter ones.
nodes, base_nodes = (
    parser.get_nodes_from_documents(documents)
    for parser in (sentence_node_parser, base_node_parser)
)

In [None]:
# Print the second node produced by each parser under a small banner.
divider = "---------"
print(divider, "SENTENCE NODES", divider, sep="\n")
print(nodes[1])
print(divider, "BASE NODES", divider, sep="\n")
print(base_nodes[1])

---------
SENTENCE NODES
---------
Node ID: fa939ca2-d07c-4e21-95f2-019950176494
Text: **Marketing Strategy Review**    - Date: January 7, 2024    -
Time: 3:00 PM - 4:30 PM    - Location: Office Boardroom    - Pre-read
Items:      - SWOT Analysis of Current Marketing Efforts      -
Competitor Analysis Report      - Proposed Marketing Budget for Q1
2024  2.
---------
BASE NODES
---------
Node ID: 073a8990-5449-4472-8098-bb2dacac745d
Text: Event Title: Project Status Update      Date: January 18, 2024
Time: 3:00 PM - 4:00 PM      Location: Virtual Meeting (Microsoft
Teams)   18. Event Title: Company All-Hands Meeting      Date: January
19, 2024      Time: 10:00 AM - 11:30 AM      Location: Office
Auditorium   19. Event Title: Product Demo for Stakeholders      Date:
January 2...


In [None]:
# Show one base node (index 100, an arbitrary pick) as a plain dict so its
# fields (id_, embedding, metadata, relationships, ...) can be inspected.
dict(base_nodes[100])

{'id_': '69b770a8-68ce-4b2f-91ce-a6678cb04b39',
 'embedding': None,
 'metadata': {'file_path': 'my_data/chinahistory.txt',
  'file_name': 'chinahistory.txt',
  'file_type': 'text/plain',
  'file_size': 977274,
  'creation_date': '2023-12-13',
  'last_modified_date': '2023-12-05',
  'last_accessed_date': '2023-12-13'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='569f5583-9b5a-4ac8-9c0b-d998a0355ef3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'my_data/chinahistory.txt', 'file_name': 'chinahistory.txt', 'file_type': 'text/plain', 'file_size': 977274, 'creation_date': '2023-12-13', 'last_modified_date': '2023-12-05', 'last_accessed_date': '2023-12-13'}, has

In [None]:
# ServiceContext is deprecated in llama-index 0.10+; the LLM and embedding
# model are configured through the global `Settings` object instead. The
# nodes were already parsed above, so no node parser is registered here.
from llama_index.core import Settings

embed_model = OpenAIEmbedding(embed_batch_size=50)
Settings.llm = llm
Settings.embed_model = embed_model

# Embed the pre-parsed nodes and build one vector index per parsing strategy.
sentence_index = VectorStoreIndex(nodes, embed_model=embed_model)
base_index = VectorStoreIndex(base_nodes, embed_model=embed_model)

  ctx_sentence = ServiceContext.from_defaults(llm=llm, embed_model=OpenAIEmbedding(embed_batch_size=50), node_parser=sentence_node_parser)
  ctx_base = ServiceContext.from_defaults(llm=llm, embed_model=OpenAIEmbedding(embed_batch_size=50), node_parser=base_node_parser)


# Step 4:  Save to Persistent Storage

In [None]:
# Write each index's storage context to its own directory on disk.
for built_index, target_dir in (
    (sentence_index, "./sentence_index"),
    (base_index, "./base_index"),
):
    built_index.storage_context.persist(persist_dir=target_dir)


In [None]:
# Download to own computer for backup

!zip -r ./indexes.zip ./*_index

from google.colab import files
files.download("./indexes.zip")

updating: base_index/ (stored 0%)
updating: base_index/graph_store.json (stored 0%)
updating: base_index/index_store.json (deflated 49%)
updating: base_index/default__vector_store.json (deflated 60%)
updating: base_index/docstore.json (deflated 78%)
updating: base_index/image__vector_store.json (deflated 19%)
updating: sentence_index/ (stored 0%)
updating: sentence_index/graph_store.json (stored 0%)
updating: sentence_index/index_store.json (deflated 66%)
updating: sentence_index/default__vector_store.json (deflated 63%)
updating: sentence_index/docstore.json (deflated 94%)
updating: sentence_index/image__vector_store.json (deflated 19%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 5:  Retrieve from Storage

In [None]:
# Rebuild one StorageContext per persisted index directory
# (sentence-window index first, then the base index).
SC_retrieved_sentence, SC_retrieved_base = (
    StorageContext.from_defaults(persist_dir=index_dir)
    for index_dir in ("./sentence_index", "./base_index")
)

In [None]:
# Re-create both indexes from the storage contexts rebuilt above.
retrieved_sentence_index, retrieved_base_index = map(
    load_index_from_storage,
    (SC_retrieved_sentence, SC_retrieved_base),
)

# Step 6: Create query engine

In [None]:
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# Options shared by both engines.
shared_engine_kwargs = {"similarity_top_k": 5, "verbose": True}

# Sentence-window engine: retrieved nodes additionally pass through a
# MetadataReplacementPostProcessor keyed on "window", the metadata key the
# sentence-window parser was configured with.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
sentence_query_engine = retrieved_sentence_index.as_query_engine(
    node_postprocessors=[window_postprocessor],
    **shared_engine_kwargs,
)

# Baseline engine: same options, no post-processing.
base_query_engine = retrieved_base_index.as_query_engine(**shared_engine_kwargs)

# Step 7:  Inference

In [None]:
# Multi-step question, stored in `question` for the query cell that follows.
question = "Something happened in the United States 10 years after the first American ships sailed for China which could have made it more expensive to purchase tea. what happened that year? Try to break down your answer into steps."

In [None]:
# Put the current question to the baseline engine and print its answer.
base_response = base_query_engine.query(question)
print(base_response)

1. American ships sailed for China in 1784, with more vessels dispatched the following year, bringing back significant amounts of Tea.
2. During 1786-87, five other ships brought over 1,000,000 pounds of Tea to the United States.
3. In 1790, the earliest official record of Tea importation into the United States was documented.
4. The importation, value, and consumption of Tea in the United States increased steadily by decades since 1790.
5. In 1794, the rates of duty on Tea were increased by 75% on direct importations and 100% on all teas shipped from Europe.
6. This increase in duty rates in 1794, ten years after the first American ships sailed for China, could have made it more expensive to purchase tea in the United States.


In [None]:
# Personal question put to the baseline engine.
question = "when did I visited Rottnest Island?"
base_response = base_query_engine.query(question)
print(base_response)

There is no information provided in the context about your visit to Rottnest Island.


In [None]:
# The multi-step tea question, this time put to the sentence-window engine.
question = "Something happened in the United States 10 years after the first American ships sailed for China which could have made it more expensive to purchase tea. what happened that year? Try to break down your answer into steps."
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

1. American ships sailed for China.
2. Ten years later, something happened in the United States that could have made it more expensive to purchase tea.


In [None]:
# Personal question put to the sentence-window engine.
question = "when did I visited Rottnest Island?"
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

You visited Rottnest Island in December, 2022.


In [None]:
# Follow-up "why" question for the sentence-window engine.
question = "why did i visit  Rottnest Island?"
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

You visited Rottnest Island for a Mantel Group sponsored event for a year-end party in December 2022.


In [None]:
# testing for calendar events

In [None]:
# Calendar-style question for the sentence-window engine.
question = "how many meeting do I have next week?"
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

You have three meetings next week.


In [None]:
# Question stored in `question` for the query cell that follows.
question = "Can you list 5 most important meeting ?"

In [None]:


# Put the question set in the previous cell to the baseline engine.
base_response = base_query_engine.query(question)
print(base_response)

1. Marketing Strategy Review
2. Product Development Brainstorming
3. Quarterly Review Meeting
4. Company All-Hands Meeting
5. Industry Conference


In [None]:
# Pre-read question put to the baseline engine.
question = "do I have any pre-read for next week's meeting? if yes, can you please those"
base_response = base_query_engine.query(question)
print(base_response)

Yes, you have pre-read items for next week's meeting. The pre-read items for the meeting are as follows:
- Market Research Findings on Consumer Trends
- Customer Feedback from Beta Testing Phase
- Innovative Ideas from Previous Brainstorming Sessions


In [None]:
# Same `question` as the previous cell, now put to the sentence-window engine.
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

Yes, you have pre-read items for next week's meeting. The pre-read items for the upcoming meeting include:
- SWOT Analysis of Current Marketing Efforts
- Competitor Analysis Report
- Proposed Marketing Budget for Q1 2024


In [None]:
# "Most important meetings" question put to the sentence-window engine.
question = "Can you list 5 most important meeting ?"
sentence_response = sentence_query_engine.query(question)
print(sentence_response)

1. Quarterly Business Review on February 2, 2024
2. Sales Conference on February 6-8, 2024
3. Marketing Campaign Launch on January 15, 2024
4. Employee Training Seminar on February 3, 2024
5. Budget Planning Meeting on January 13, 2024


# Adding a new sentence to the existing text file

In [None]:
def add_sentence_to_file(file_path, new_sentence, encoding='utf-8'):
    """Append ``new_sentence`` on a new line at the end of a text file.

    The file is opened in append mode, so existing content is kept and the
    file is created if it does not exist yet.

    Args:
        file_path: Path of the file to append to.
        new_sentence: Text to add; it is preceded by a newline character.
        encoding: Text encoding used for writing. Defaults to UTF-8 so the
            bytes written do not depend on the platform's locale encoding.
    """
    with open(file_path, 'a', encoding=encoding) as file:
        file.write('\n' + new_sentence)

# File path of the existing text file.
# It has to be a file inside ./my_data/, the folder SimpleDirectoryReader
# loads from; a file anywhere else never reaches the indexes.
file_path = './my_data/chinahistory.txt'

# New sentence to add
new_sentence = "I have visited Rottnest Island on December, 2022. It was a Mantel Group sponsored event for year end party."

# Add the new sentence to the existing file
add_sentence_to_file(file_path, new_sentence)

# NOTE: the documents must be re-loaded and the indexes rebuilt before the
# query engines can see the new sentence.
print("Sentence added to the existing file.")


Sentence added to the existing file.
