### Import needed libraries

In [None]:
# File and system utilities
import os
import sys
from pathlib import Path
import json  # Handle JSON files

#  Data Modeling and Typing
from typing import Optional, List, Dict, Any
from pydantic import BaseModel

#  LangChain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain, create_extraction_chain_pydantic
# from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain import hub  # Access shared LangChain assets

#  Azure / OpenAI Integration
from langchain_openai.chat_models import AzureChatOpenAI

# Local functions
from AI_chunker_v2 import AgenticChunker, create_agentic_chunker

# AI_chunker_v2 was adapted from https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/a4570f3c4883eb9b835b0ee18990e62298f518ef/tutorials/LevelsOfTextSplitting/agentic_chunker.py

In [2]:
## Paths in the project

# Base Folder:
base_folder = os.path.dirname(os.getcwd())

# Answers path
answers_path = os.path.join(base_folder, 'Answers')

# Vector_DBs path
vector_dbs_path = os.path.join(base_folder, 'Vector_DBs')

# Code path
code_path = os.path.join(base_folder, 'Code')

# AI_Prepositions path
ai_prepositions_path = os.path.join(base_folder, 'AI_Prepositions')

# Corpus path
corpus_path = os.path.join(base_folder, 'Corpus')

# Ground_Truth path
ground_truth_path = os.path.join(base_folder, 'Ground_Truth')

# RAGAS_Results path
ragas_results_path = os.path.join(base_folder, 'RAGAS_Results')

### Setup Azure API

In [None]:
########## Setup the Azure API

os.environ["AZURE_OPENAI_API_KEY"] = "YOUR_KEY" # Your Key

azure_configs = {
    "base_url": "YOUR_URL", # Your endpoint
    "model_deployment": "YOUR_MODEL", # Your model_deployment (LLM)
    "model_name": "YOUR_MODEL_NAME" # Your model_name (LLM)
}

In [None]:
# Initialize the LLM (gpt-4o) for the chunk process
llm = AzureChatOpenAI(
    openai_api_version="YOUR API_version", # Your API_version
    azure_endpoint=azure_configs["base_url"],
    azure_deployment=azure_configs["model_deployment"],
    model=azure_configs["model_name"],
    validate_base_url=False,
)

### Prepare prompt and .txt files

In [7]:
# Prompt to split texts
prompt_template = """
You are a proposition extractor. Your task is to break down the provided "Content" into a list of clear, standalone, and context-independent propositions. Each proposition must convey a single, interpretable idea that can be understood without referencing surrounding text.

Follow these detailed instructions:

1. Split complex or compound sentences into multiple simple propositions. Each proposition should express a single atomic fact or claim.
2. Preserve the original wording as much as possible. Avoid paraphrasing unless necessary to improve clarity.
3. Separate descriptive information about entities into distinct propositions. For example, if a person is identified with a profession or title, extract that information into its own proposition.
4. Resolve all pronouns and ambiguous references. Replace them with the full name or noun phrase they refer to.
5. Add minimal necessary context to decontextualize each proposition. Include names, dates, or locations when needed to make the statement understandable in isolation.
6. Output the propositions as a JSON array of strings. Each item in the array should be one proposition.

Content to analyze:
{content}

Please extract the propositions and return them as a JSON array:
"""

In [8]:
prompt = ChatPromptTemplate.from_template(prompt_template)

# Create the runnable chain
runnable = prompt | llm

In [None]:
# List to save the documents
documents = []

# Check all the .txt files in the folder
for filename in os.listdir(corpus_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(corpus_path, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            documents.append({
                "filename": filename,
                "content": content
            })

print(f"{len(documents)} documents have been uploaded.")

52 documents have been uploaded.


### Get propositions from each document + useful functions

In [59]:
## PROCESSING AND PROPOSITION EXTRACTION

def extract_propositions_from_text(content):
    """
    Extract propositions from text using the LLM
    """
    try:
        # Invoke the LLM with the content
        response = runnable.invoke({"content": content})
        
        # Get the response content
        response_content = response.content
        
        # Try to parse the JSON response
        try:
            # Look for JSON array in the response
            start_idx = response_content.find('[')
            end_idx = response_content.rfind(']') + 1
            
            if start_idx != -1 and end_idx != 0:
                json_str = response_content[start_idx:end_idx]
                propositions = json.loads(json_str)
                return propositions
            else:
                print("A valid JSON array was not found in the response")
                return []
                
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            print(f"LLM response: {response_content}")
            return []
            
    except Exception as e:
        print(f"Error in text processing: {e}")
        return []

# Process each document and extract propositions
processed_documents = []

for i, doc in enumerate(documents[2:52]):
    print(f"\n Processing document {i+1}/{len(documents)}: {doc['filename']}")
    
    # Extract propositions from the document content
    propositions = extract_propositions_from_text(doc['content'])
    
    # Store the processed document with propositions
    processed_doc = {
        "filename": doc['filename'],
        "original_content": doc['content'],
        "propositions": propositions,
        "num_propositions": len(propositions)
    }
    
    processed_documents.append(processed_doc)
    
    print(f" Retrieved  {len(propositions)} propositions from {doc['filename']}")


 Processing document 1/52: 03. NISS Registration.txt
 Retrieved  52 propositions from 03. NISS Registration.txt

 Processing document 2/52: 04. NOVA's master's programs.txt
 Retrieved  42 propositions from 04. NOVA's master's programs.txt

 Processing document 3/52: 05. NOVA's bachelor  programs.txt
 Retrieved  4 propositions from 05. NOVA's bachelor  programs.txt

 Processing document 4/52: 06. NOVA's FAQ.txt
 Retrieved  82 propositions from 06. NOVA's FAQ.txt

 Processing document 5/52: 07. AIMA Residence Permit Issued to Higher Education Students – Article 91.txt
 Retrieved  47 propositions from 07. AIMA Residence Permit Issued to Higher Education Students – Article 91.txt

 Processing document 6/52: 08. AIMA - Means of Subsistence.txt
 Retrieved  36 propositions from 08. AIMA - Means of Subsistence.txt

 Processing document 7/52: 09. VISA.txt
 Retrieved  41 propositions from 09. VISA.txt

 Processing document 8/52: Bachelor's_Degree_in_Data_Science.txt
 Retrieved  55 propositions 

In [60]:
## RESULTS SUMMARY

total_propositions = 0
for doc in processed_documents:
    print(f"\n📄 {doc['filename']}:")
    print(f"   - Retrieved propositions: {doc['num_propositions']}")
    total_propositions += doc['num_propositions']
    
    # Show first 3 propositions as example
    if doc['propositions']:
        print("   - First propositions:")
        for j, prop in enumerate(doc['propositions'][:3]):
            print(f"     {j+1}. {prop}")
        if len(doc['propositions']) > 3:
            print(f"     ... and {len(doc['propositions']) - 3} more")

print(f"\n Total number of proposals retrieved: {total_propositions}")


📄 03. NISS Registration.txt:
   - Retrieved propositions: 52
   - First propositions:
     1. The Social Security Identification Number (NISS) allows unique, precise, and accurate identification of citizens in the Social Security system at the national level.
     2. The Social Security Identification Number (NISS) is automatically allocated to national citizens when they apply for the Citizen Card.
     3. Foreign citizens and national citizens who are not required to have a Citizen Card must submit a request for a Social Security Identification Number (NISS).
     ... and 49 more

📄 04. NOVA's master's programs.txt:
   - Retrieved propositions: 42
   - First propositions:
     1. NOVA IMS offers 23 postgraduate programs.
     2. NOVA IMS offers 17 master's programs.
     3. NOVA IMS offers a Postgraduate Program in Artificial Intelligence for Business Transformation.
     ... and 39 more

📄 05. NOVA's bachelor  programs.txt:
   - Retrieved propositions: 4
   - First propositions:
  

In [None]:
## ACCESS INDIVIDUAL PROPOSITIONS
def get_propositions_by_filename(filename):
    """
    Gets the propositions of a specific file
    """
    for doc in processed_documents:
        if doc['filename'] == filename:
            return doc['propositions']
    return None

# I tested to create chunks with all the documents but, some chunks were so huge that the RAG system could not handle them.

In [65]:
# Get propositions from a specific file
if processed_documents:
    first_filename = processed_documents[0]['filename']
    props = get_propositions_by_filename(first_filename)
    print(f"\nProposals from the file'{first_filename}':")
    for i, prop in enumerate(props[:3] if props else []):
        print(f"  {i+1}. {prop}")



Proposals from the file'03. NISS Registration.txt':
  1. The Social Security Identification Number (NISS) allows unique, precise, and accurate identification of citizens in the Social Security system at the national level.
  2. The Social Security Identification Number (NISS) is automatically allocated to national citizens when they apply for the Citizen Card.
  3. Foreign citizens and national citizens who are not required to have a Citizen Card must submit a request for a Social Security Identification Number (NISS).


### Start chunking process

In [66]:
# Check processed_documents
print(f"Total documents: {len(processed_documents)}")

Total documents: 50


In [67]:
# Show some general document statistics (number of prepositions)
prop_counts = [doc['num_propositions'] for doc in processed_documents]
print(f"Statistics of proposals by document:")
print(f"   • Average: {sum(prop_counts)/len(prop_counts):.1f}")
print(f"   • Min: {min(prop_counts)}")
print(f"   • Max: {max(prop_counts)}")
print(f"   • Total: {sum(prop_counts)}")

Statistics of proposals by document:
   • Average: 59.7
   • Min: 4
   • Max: 108
   • Total: 2985


#### Setup Agentic chunker

In [68]:
ac = AgenticChunker()

In [69]:
# Start the process to iterate per each document

results_by_document = {}

for doc in processed_documents:
    # Create a chunker for the document
    doc_chunker = create_agentic_chunker()

    # Add prepositions from the current document
    doc_chunker.add_propositions(doc['propositions'])

    # Chunks after the process
    chunks_after = ac.get_chunks()

    # Save results
    results_by_document[doc['filename']] = {
        # 'document_info': doc,
        'chunks': doc_chunker.get_chunks(get_type='list_of_strings'),
        'chunks_for_embedding': doc_chunker.get_chunks_for_embedding(),
        'stats': doc_chunker.get_stats()
    }

    print(f"{doc['filename']} was succesfully processed")


Adding: 'The Social Security Identification Number (NISS) allows unique, precise, and accurate identification of citizens in the Social Security system at the national level.'
No chunks, creating a new one
Created new chunk (3c13b): Identification Systems

Adding: 'The Social Security Identification Number (NISS) is automatically allocated to national citizens when they apply for the Citizen Card.'
Chunk Found (3c13b), adding to: Identification Systems

Adding: 'Foreign citizens and national citizens who are not required to have a Citizen Card must submit a request for a Social Security Identification Number (NISS).'
Chunk Found (3c13b), adding to: Social Security Identification Systems

Adding: 'The request for a Social Security Identification Number (NISS) for these citizens is submitted via an online form called Pedido de NISS – Cidadão Estrangeiro ou Cidadão Nacional sem obrigatoriedade de ter cartão de cidadão.'
Chunk Found (3c13b), adding to: Social Security Identification & App

### Export and save results

In [None]:
def guardar_datos(datos: List[Dict[str, Any]], archivo: str = "datos.json"):
    """
    Saves the list of dictionaries in a JSON file
    
    Args:
        data: List of dictionaries with structure filename, original_content, propositions, num_propositions
        file: Name of the file to save (default: data.json)
    """
    try:
        with open(archivo, 'w', encoding='utf-8') as f:
            json.dump(datos, f, ensure_ascii=False, indent=2)
        print(f"Data successfully saved in {archivo}")
    except Exception as e:
        print(f"Error when saving: {e}")

In [None]:
# Let's store the "results_by_document" 
guardar_datos(results_by_document, os.path.join(ai_prepositions_path, "3-52_chunk_texts.json")) # change name if needed

Datos guardados exitosamente en 3-52_chunk_texts.json
