# JEPDocument Chatbot - ETL Pipeline Testing


This notebook provides a comprehensive testing environment for the ETL (Extract, Transform, Load) pipeline of the legal document chatbot system.

In [2]:
# Add src to Python path
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), 'src'))

# Import required modules
from src.etl.excel_reader import read_excel_file, validate_excel_format
from src.etl.pdf_downloader import download_pdfs
from src.etl.docling_processor import process_document
from src.etl.embedding_generator import generate_embeddings_for_chunks
from src.database.vector_store import initialize_database, VectorStore

## 1. Excel Reader Testing

In [None]:
# Test Excel reader functionality
# Replace with the path to your Excel file
excel_file_path = "data/excel/Reporte_providencias_ia_2024-10-09_06_49_57.xlsx"

try:
    # Validate Excel format
    is_valid = validate_excel_format(excel_file_path)
    print(f"Excel file format is valid: {is_valid}")

    # Read Excel file
    if is_valid:
        pdf_data = read_excel_file(excel_file_path)
        print(f"Found {len(pdf_data)} PDF links in the Excel file")
        
        # Display first few entries
        for i, pdf_info in enumerate(pdf_data[:3]):
            print(f"\nEntry {i+1}:")
            print(f"  URL: {pdf_info['url']}")
            print(f"  Filename: {pdf_info['filename']}")
            print(f"  Row Index: {pdf_info['row_index']}")
            print(f"  Metadata keys: {list(pdf_info['metadata'].keys())}")
    else:
        print("Excel file format is not valid. Please check the file.")
except FileNotFoundError:
    print(f"Excel file not found at {excel_file_path}. Please check the file path.")
except Exception as e:
    print(f"Error reading Excel file: {str(e)}")

2025-07-29 23:25:23,018 - src.etl.excel_reader - ERROR - Error validating Excel format: [Errno 2] No such file or directory: 'data/excel/document_links.xlsx'


Excel file format is valid: False
Excel file format is not valid. Please check the file.


## 2. PDF Downloader Testing

In [None]:
# Test PDF downloader functionality
if 'pdf_data' in locals():
    try:
        # Download first 3 PDFs for testing
        test_pdf_data = pdf_data[:3]
        
        download_result = download_pdfs(test_pdf_data, download_dir="data/raw")
        
        print(f"Total files processed: {download_result['total_files']}")
        print(f"Successful downloads: {len(download_result['successful_downloads'])}")
        print(f"Failed downloads: {len(download_result['failed_downloads'])}")
        
        # Display successful downloads
        for download in download_result['successful_downloads']:
            print(f"  - {download['filename']} downloaded to {download['filepath']}")
        
        # Display failed downloads
        for download in download_result['failed_downloads']:
            print(f"  - {download['filename']} failed: {download['error']}")
    except Exception as e:
        print(f"Error downloading PDFs: {str(e)}")
else:
    print("PDF data not available. Please run the Excel reader test first.")

## 3. Document Processing with Docling Testing

In [None]:
# Test Docling processor functionality
if 'download_result' in locals() and download_result['successful_downloads']:
    try:
        # Process the first successfully downloaded PDF
        first_pdf = download_result['successful_downloads'][0]
        pdf_file_path = first_pdf['filepath']
        
        print(f"Processing document: {pdf_file_path}")
        
        # Process the document
        processing_result = process_document(pdf_file_path)
        
        print(f"Processing status: {processing_result.status}")
        if processing_result.status == 'success':
            print(f"Generated {len(processing_result.chunks)} chunks")
            
            # Display first few chunks
            for i, chunk in enumerate(processing_result.chunks[:3]):
                print(f"\nChunk {i+1}:")
                print(f"  Content preview: {chunk.content[:200]}...")
                print(f"  Page numbers: {chunk.page_numbers}")
                print(f"  Section title: {chunk.section_title}")
                print(f"  Metadata: {chunk.chunk_metadata}")
        else:
            print(f"Processing failed: {processing_result.error_message}")
    except Exception as e:
        print(f"Error processing document: {str(e)}")
else:
    print("No successful downloads available. Please run the PDF downloader test first.")

## 4. Embedding Generation Testing

In [None]:
# Test embedding generation functionality
if 'processing_result' in locals() and processing_result.status == 'success':
    try:
        # Convert chunks to the format expected by embedding generator
        chunks_for_embedding = [
            {
                'content': chunk.content,
                'chunk_metadata': chunk.chunk_metadata
            }
            for chunk in processing_result.chunks
        ]
        
        print(f"Generating embeddings for {len(chunks_for_embedding)} chunks")
        
        # Generate embeddings
        chunks_with_embeddings = generate_embeddings_for_chunks(chunks_for_embedding)
        
        print(f"Generated embeddings for {len(chunks_with_embeddings)} chunks")
        
        # Display embedding information
        for i, chunk in enumerate(chunks_with_embeddings[:3]):
            print(f"\nChunk {i+1} embedding:")
            print(f"  Embedding dimension: {len(chunk['embedding'])}")
            print(f"  Embedding preview: {chunk['embedding'][:5]}...")
    except Exception as e:
        print(f"Error generating embeddings: {str(e)}")
else:
    print("No processed chunks available. Please run the document processing test first.")

## 5. Vector Store Testing

In [None]:
# Test vector store functionality
try:
    # Initialize database
    initialize_database()
    print("Database initialized successfully")
    
    # Create vector store instance
    vector_store = VectorStore()
    
    # Store document metadata
    if 'first_pdf' in locals():
        document_id = vector_store.store_document(
            filename=first_pdf['filename'],
            source_url=first_pdf['url'],
            metadata=first_pdf['metadata']
        )
        print(f"Stored document with ID: {document_id}")
        
        # Store chunks with embeddings
        if 'chunks_with_embeddings' in locals():
            vector_store.store_chunks(document_id, chunks_with_embeddings)
            print(f"Stored {len(chunks_with_embeddings)} chunks")
    
except Exception as e:
    print(f"Error testing vector store: {str(e)}")

## 6. Query Processing Testing

In [None]:
# Test query processing functionality
try:
    from src.rag.query_processor import process_query
    
    # Test query
    test_query = "What are the key points in the legal document?"
    
    print(f"Processing query: {test_query}")
    
    # Process query
    query_result = process_query(test_query)
    
    print(f"Query result:")
    print(f"  Found {query_result['chunk_count']} relevant chunks")
    print(f"  Response: {query_result['response']}")
    
    # Display chunk information
    for i, chunk in enumerate(query_result['chunks'][:3]):
        print(f"\nRelevant Chunk {i+1} (similarity: {chunk['similarity']:.3f}):")
        print(f"  Content preview: {chunk['content'][:200]}...")
        
except ImportError:
    print("Query processor module not found. Please check if it's implemented.")
except Exception as e:
    print(f"Error testing query processing: {str(e)}")

## 7. Complete ETL Pipeline Test

In [None]:
# Test the complete ETL pipeline
def run_complete_etl_pipeline(excel_file_path, max_documents=3):
    """Run the complete ETL pipeline for testing."""
    print("=== Running Complete ETL Pipeline ===")
    
    try:
        # Step 1: Read Excel file
        print("\n1. Reading Excel file...")
        pdf_data = read_excel_file(excel_file_path)
        print(f"   Found {len(pdf_data)} PDF links")
        
        # Limit to max_documents for testing
        pdf_data = pdf_data[:max_documents]
        
        # Step 2: Download PDFs
        print("\n2. Downloading PDFs...")
        download_result = download_pdfs(pdf_data, download_dir="data/raw")
        print(f"   Successfully downloaded {len(download_result['successful_downloads'])} PDFs")
        
        # Step 3: Process documents and generate embeddings
        print("\n3. Processing documents and generating embeddings...")
        vector_store = VectorStore()
        
        for i, download in enumerate(download_result['successful_downloads']):
            print(f"   Processing document {i+1}/{len(download_result['successful_downloads'])}: {download['filename']}")
            
            try:
                # Process document with Docling
                processing_result = process_document(download['filepath'])
                
                if processing_result.status == 'success':
                    print(f"     Generated {len(processing_result.chunks)} chunks")
                    
                    # Convert chunks for embedding generation
                    chunks_for_embedding = [
                        {
                            'content': chunk.content,
                            'chunk_metadata': chunk.chunk_metadata
                        }
                        for chunk in processing_result.chunks
                    ]
                    
                    # Generate embeddings
                    chunks_with_embeddings = generate_embeddings_for_chunks(chunks_for_embedding)
                    
                    # Store document metadata
                    document_id = vector_store.store_document(
                        filename=download['filename'],
                        source_url=download['url'],
                        metadata=download['metadata']
                    )
                    
                    # Store chunks with embeddings
                    vector_store.store_chunks(document_id, chunks_with_embeddings)
                    print(f"     Stored document and {len(chunks_with_embeddings)} chunks")
                else:
                    print(f"     Processing failed: {processing_result.error_message}")
            except Exception as e:
                print(f"     Error processing document: {str(e)}")
        
        print("\n=== ETL Pipeline Complete ===")
    except Exception as e:
        print(f"Error in ETL pipeline: {str(e)}")

# Run the complete ETL pipeline (uncomment to execute)
# run_complete_etl_pipeline("data/excel/document_links.xlsx", max_documents=2)