# RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval

In [1]:
# NOTE: An OpenAI API key must be present in the environment for application
# initialization, even if not in use. If you're not utilizing OpenAI models,
# a placeholder string (e.g., "not_used") is sufficient.
import os

# setdefault keeps a key that is already exported in the environment instead of
# overwriting it with the placeholder below. Do not commit a real key in this cell.
os.environ.setdefault("OPENAI_API_KEY", "your-key")

In [2]:
import json

# Load the JSON document that the rest of the notebook passes around as `json_data`.
# JSON text is UTF-8 by specification, so the encoding is given explicitly rather
# than left to the platform's locale default.
with open('finQA/output_summary_metadata.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

1) **Building**: RAPTOR recursively embeds, clusters, and summarizes chunks of text to construct a tree with varying levels of summarization from the bottom up. You can create a tree from your input (in this notebook, the JSON data loaded above rather than plain text from 'sample.txt') using `RA.add_documents(json_data)`.

2) **Querying**: At inference time, the RAPTOR model retrieves information from this tree, integrating data across lengthy documents at different abstraction levels. You can perform queries on the tree with `RA.answer_question`.

### Building the tree

In [3]:
from raptor import RetrievalAugmentation, RetrievalAugmentationConfig

  from .autonotebook import tqdm as notebook_tqdm
2024-06-03 12:41:34,385 - Loading faiss.
2024-06-03 12:41:34,426 - Successfully loaded faiss.


In [4]:
from raptor import ClusterTreeBuilder, ClusterTreeConfig

In [5]:
from raptor.QAModels import GPT4oQAModel
from raptor.SummarizationModels import GPT4oSummarizationModel

# GPT-4o backed wrappers: one answers questions, the other writes summaries.
custom_qa = GPT4oQAModel()
custom_summarizer = GPT4oSummarizationModel()

# Bundle the loaded JSON and both models into the retrieval configuration.
custom_config = RetrievalAugmentationConfig(
    qa_model=custom_qa,
    summarization_model=custom_summarizer,
    json_data=json_data,
)

In [7]:
# Tree-builder configuration that reuses the loaded JSON and the summarizer above.
ctc = ClusterTreeConfig(
    summarization_model=custom_summarizer,
    json_data=json_data,
)

In [None]:
# Instantiate the cluster tree builder from the `ctc` configuration.
ctb = ClusterTreeBuilder(config=ctc)

In [None]:
# Build directly through the builder, feeding it the JSON it exposes as `json_data`.
# As the last expression of the cell, the call's return value is what gets displayed.
# NOTE(review): `RA.add_documents(json_data)` in the next cell also constructs a tree —
# confirm both builds are needed, since each presumably re-runs summarization.
ctb.build_from_json(ctb.json_data)

In [None]:
# Create the retrieval-augmentation pipeline from the custom configuration
# (GPT-4o QA and summarization models plus the loaded JSON).
RA = RetrievalAugmentation(config=custom_config)

# construct the tree
RA.add_documents(json_data)

In [None]:
# Inspect the built tree: as the cell's last expression, `layer_to_nodes` is displayed in full.
RA.tree.layer_to_nodes

### Querying from the tree

```python
question = "..."  # any question
RA.answer_question(question)
```

In [None]:
# Ask a sample question against the tree held by `RA`.
question = "What is Apple?"
answer = RA.answer_question(question=question)

# print's default separator adds a second space after "Answer: ".
print("Answer: ", answer)

In [6]:
# Location of the saved tree; later cells pass it to RetrievalAugmentation(tree=...).
SAVE_PATH = "finQA/GPT4o_full_metadata"
# Saving is left disabled here; uncomment to persist a freshly built tree:
# RA.save(SAVE_PATH)

In [65]:
# load back the tree by passing it into RetrievalAugmentation
# (this rebinds `RA`, replacing any instance built earlier in the session)

RA = RetrievalAugmentation(tree=SAVE_PATH, config=custom_config)

question = 'what is apples net revenue in 2012?'

# Query the reloaded tree and show the model's answer.
answer = RA.answer_question(question=question)
print("Answer: ", answer)

2024-06-02 22:09:53,428 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 500
            Summarization Model: <raptor.SummarizationModels.GPT4oSummarizationModel object at 0x7fa184c2e850>
            Embedding Models: {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel object at 0x7fa191738c40>}
            Cluster Embedding Model: OpenAI
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-06-02 22:09:53,429 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Sel

Answer:  Apple Inc.'s net revenue for the year 2012 was $156.5 billion. This figure represents a significant increase from the previous years, with net sales of $108.2 billion in 2011 and $65.2 billion in 2010. The substantial growth in net revenue highlights Apple's strong financial performance and market presence in the Tech & Electronics sector during that period.


In [7]:
import json

# Function to load JSON file
def load_json(filepath):
    """Read a JSON document from disk and return the parsed Python object.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the locale's default encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to save JSON file
def save_json(data, filepath):
    """Serialize ``data`` as 2-space-indented JSON and write it to ``filepath``.

    The document is serialized in memory before the destination is opened, so a
    serialization failure (for example a value that is not JSON-serializable)
    does not truncate an existing file or leave a half-written one behind.

    Args:
        data: JSON-serializable object to store.
        filepath: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
        OSError: If the destination cannot be opened for writing.
    """
    serialized = json.dumps(data, indent=2)
    # Explicit UTF-8 keeps the output independent of the locale's default encoding.
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write(serialized)

# Function to answer questions using RA model
def generate_answers(data, ra_model):
    """Ask ``ra_model`` every question in ``data`` and collect the replies.

    Args:
        data: Iterable of mappings, each providing a ``"question"`` entry.
        ra_model: Object exposing ``answer_question(question=...)``.

    Returns:
        A new list of ``{"question": ..., "answer": ...}`` dicts, one per entry
        and in input order. The input entries are not modified.
    """
    # Lazy generator: each question is looked up right before it is asked.
    prompts = (record["question"] for record in data)
    return [
        {"question": prompt, "answer": ra_model.answer_question(question=prompt)}
        for prompt in prompts
    ]

# Function to answer questions using RA model
def generate_rag_answers(data, ra_model, start_layer, num_layers=1):
    """Answer every question in ``data`` with explicit layer arguments.

    ``start_layer`` and ``num_layers`` are forwarded unchanged to
    ``ra_model.answer_question``. ``num_layers`` used to be hard-coded to 1 and
    keeps that value as its default, so existing callers behave as before.

    Args:
        data: Iterable of mappings, each providing a ``"question"`` entry.
        ra_model: Object exposing
            ``answer_question(question=..., start_layer=..., num_layers=...)``.
        start_layer: Value passed through as ``start_layer``.
        num_layers: Value passed through as ``num_layers`` (default 1).

    Returns:
        A new list of ``{"question": ..., "answer": ...}`` dicts, one per entry
        and in input order.
    """
    new_data = []
    for entry in data:
        question = entry["question"]
        model_answer = ra_model.answer_question(
            question=question,
            start_layer=start_layer,
            num_layers=num_layers,
        )
        new_data.append({"question": question, "answer": model_answer})
    return new_data

In [8]:
# Path to your original JSON file
json_file_path = 'evaluations/ground_truth_3.json'
# Path to save the new JSON file
new_json_file_path = 'evaluations/raptor_output_metadata_3.json'

# Load the JSON data
data = load_json(json_file_path)

# Reload the saved tree from SAVE_PATH so this cell does not depend on an in-memory build.
RA = RetrievalAugmentation(tree=SAVE_PATH, config=custom_config)

# One answer_question call per entry, using the model's default retrieval arguments.
new_data = generate_answers(data, RA)

# Save the new questions and answers to a new JSON file
save_json(new_data, new_json_file_path)

print(f"New JSON with model answers saved to {new_json_file_path}")



2024-06-02 22:46:04,718 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 500
            Summarization Model: <raptor.SummarizationModels.GPT4oSummarizationModel object at 0x7fba9e8989a0>
            Embedding Models: {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel object at 0x7fba887ebb20>}
            Cluster Embedding Model: OpenAI
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-06-02 22:46:04,718 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Sel

New JSON with model answers saved to evaluations/raptor_output_metadata_3.json


In [None]:
# Print the company and text of every node in one tree layer.
# The loop bound comes from the layer itself rather than a hard-coded 135, which
# raised when the layer held fewer nodes and silently skipped the rest when it held more.
LAYER_TO_INSPECT = 2
layer_nodes = RA.tree.layer_to_nodes[LAYER_TO_INSPECT]
# Index-based access is kept from the original because the container type is not
# visible here; it is only known to support integer indexing from 0.
for i in range(len(layer_nodes)):
    node = layer_nodes[i]
    print('--------------------------------------------')
    print(node.metadata.company)
    print(node.text)

In [8]:
# Path to your original JSON file
json_file_path = 'evaluations/ground_truth_3.json'
# Path to save the new JSON file
new_json_file_path = 'evaluations/rag0_output_3.json'

# Load the JSON data
data = load_json(json_file_path)

# Reload the saved tree. The restriction to the leaf layer (per the original note
# "Change to only leaf layer") is not applied here but in the call below:
# start_layer=0 is passed explicitly and num_layers=1 is applied by generate_rag_answers.
RA = RetrievalAugmentation(tree=SAVE_PATH, config=custom_config)

new_data = generate_rag_answers(data, RA, start_layer=0)

# Save the new questions and answers to a new JSON file
save_json(new_data, new_json_file_path)

print(f"New JSON with model answers saved to {new_json_file_path}")

2024-06-03 12:42:46,979 - Successfully initialized TreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Selection Mode: top_k
            Summarization Length: 500
            Summarization Model: <raptor.SummarizationModels.GPT4oSummarizationModel object at 0x7fa2be213940>
            Embedding Models: {'OpenAI': <raptor.EmbeddingModels.OpenAIEmbeddingModel object at 0x7fa299af3af0>}
            Cluster Embedding Model: OpenAI
        
        Reduction Dimension: 10
        Clustering Algorithm: RAPTOR_Clustering
        Clustering Parameters: {}
        
2024-06-03 12:42:46,980 - Successfully initialized ClusterTreeBuilder with Config 
        TreeBuilderConfig:
            Tokenizer: <Encoding 'cl100k_base'>
            Max Tokens: 100
            Num Layers: 5
            Threshold: 0.5
            Top K: 5
            Sel

New JSON with model answers saved to evaluations/rag0_output_3.json
