# 🚀 RAG Document Ingestion Pipeline
## From file_to_ingest Folder → Storage → PostgreSQL with pgvector

**Pipeline:**
1. Scan `file_to_ingest/` folder
2. Upload files to storage
3. Extract text from PDF (with page tracking)
4. Chunk text (with overlap)
5. Generate embeddings (OpenAI 1536-dim)
6. Insert chunks into PostgreSQL pgvector
7. Test semantic search

## 0️⃣ Setup - Database Connection & Imports

In [1]:
import os
import sys
import subprocess
import json
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import pandas as pd
import uuid
import pdfplumber
import io
import re
import numpy as np
from openai import OpenAI
from dotenv import load_dotenv

# Load .env.local explicitly (for Jupyter/local development)
load_dotenv('.env.local')

# ============================================================================
# 🔌 DATABASE CONNECTION (via docker-compose exec)
# ============================================================================

class DatabaseConnection:
    """Run SQL against the Dockerized PostgreSQL by piping it to ``psql``.

    Each query is written to the stdin of
    ``docker-compose exec -T postgres psql``, so long statements (such as
    embedding inserts) do not have to fit on a command line.
    """

    def __init__(self, working_dir: str = "."):
        # Directory in which docker-compose is executed (passed as ``cwd``).
        self.working_dir = working_dir
        # Only used for log output; the command targets the "postgres" service.
        self.container_name = "system-llm-postgres-local"
        self.user = "llm_user"
        self.db = "system_llm"

    def execute_sql(self, query: str, fetch: bool = False) -> str:
        """Execute ``query`` through psql and optionally return its output.

        Args:
            query: SQL text sent to psql on stdin.
            fetch: When true, psql runs in tuples-only mode (``-t``) and the
                stripped stdout is returned; otherwise ``""`` is returned.

        Raises:
            RuntimeError: psql exited with a non-zero status (SQL error).
            Exception: anything raised while launching the subprocess
                (for example docker-compose not being installed).
        """
        cmd = [
            "docker-compose", "-f", "docker-compose.local.yml",
            "exec", "-T", "postgres",
            "psql", "-U", self.user, "-d", self.db,
            # Without ON_ERROR_STOP psql keeps going after a failed statement
            # read from stdin and exits 0, so the returncode check below
            # would never see SQL errors.
            "-v", "ON_ERROR_STOP=1",
        ]

        if fetch:
            cmd.append("-t")

        try:
            result = subprocess.run(
                cmd,
                input=query,
                capture_output=True,
                text=True,
                encoding='utf-8',
                errors='replace',
                cwd=self.working_dir
            )
        except Exception as e:
            print(f"Database Error: {e}")
            raise

        if result.returncode != 0:
            error_msg = (result.stderr or "").strip() or "Unknown error"
            print(f"Database Error: SQL Error: {error_msg}")
            raise RuntimeError(f"SQL Error: {error_msg}")

        return result.stdout.strip() if fetch else ""

    def test_connection(self) -> bool:
        """Return True when ``SELECT version()`` succeeds with a non-empty result.

        Failures are reported on stdout and turned into ``False`` rather than
        propagated.
        """
        try:
            version = self.execute_sql("SELECT version();", fetch=True)
        except Exception as e:
            print(f"Connection failed: {e}\n")
            return False

        if not version:
            # An empty answer is still a failed check; always return a bool.
            print("Connection failed: empty response from server\n")
            return False

        db_version = version.split(',')[0].strip()
        print(f"Connected to {self.container_name}")
        print(f"Version: {db_version}\n")
        return True

# Create the shared connection object and stop the notebook early when the
# database cannot be reached.
db = DatabaseConnection(working_dir=".")
print("\n".join(("=" * 80, "DATABASE CONNECTION TEST", "=" * 80)))

if not db.test_connection():
    sys.exit(1)

print("All imports loaded successfully\n")

DATABASE CONNECTION TEST
Connected to system-llm-postgres-local
Version: PostgreSQL 15.15 on x86_64-pc-linux-musl

All imports loaded successfully



## 1️⃣ Storage Setup - Local File Storage

In [2]:
# ============================================================================
# 📦 LOCAL FILE STORAGE
# ============================================================================

class LocalFileStorage:
    """Keep uploaded PDFs as ``<file_id>.pdf`` files under one directory."""

    def __init__(self, base_path="storage/uploads"):
        # The directory (and any missing parents) is created on construction.
        root = Path(base_path)
        root.mkdir(parents=True, exist_ok=True)
        self.base_path = root

    def _pdf_path(self, file_id: str) -> Path:
        """Return the on-disk location used for ``file_id``."""
        return self.base_path / f"{file_id}.pdf"

    def put(self, file_id: str, content: bytes) -> str:
        """Write ``content`` under ``file_id`` and return ``file_id``."""
        self._pdf_path(file_id).write_bytes(content)
        return file_id

    def get(self, file_id: str) -> bytes:
        """Return the stored bytes for ``file_id``.

        Raises:
            FileNotFoundError: no file has been stored under ``file_id``.
        """
        target = self._pdf_path(file_id)
        if target.exists():
            return target.read_bytes()
        raise FileNotFoundError(f"File not found: {file_id}")

# Shared storage instance used by the upload and extraction cells.
storage = LocalFileStorage()
print(
    "‚úÖ Local storage initialized",
    f"   Path: {storage.base_path.absolute()}\n",
    sep="\n",
)

‚úÖ Local storage initialized
   Path: c:\Users\pcgsa\Downloads\system-llm\system-llm-backend\storage\uploads



## 2️⃣ Discover Files - Scan file_to_ingest Folder

In [3]:
from pathlib import Path

# ============================================================================
# 📁 CHECK & SCAN file_to_ingest FOLDER
# ============================================================================

ingest_folder = Path("file_to_ingest")

# Make sure the drop folder exists before it is scanned.
if ingest_folder.exists():
    print("üìÇ Folder file_to_ingest sudah ada")
else:
    print("üìÇ Folder file_to_ingest belum ada, membuat folder...")
    ingest_folder.mkdir(parents=True)

# Sorted so the 1-based indices shown here stay stable for the selection cell.
pdf_files = sorted(ingest_folder.glob("*.pdf"))

print("=" * 80)
print(f"üìÅ Scanning folder: {ingest_folder.absolute()}")
print("=" * 80 + "\n")

if not pdf_files:
    print("‚ö†Ô∏è  No PDF files found in file_to_ingest folder")
    print("\nüìù Please add PDF files to: file_to_ingest/")
    print("   Then run the next cells to process them.\n")
else:
    print(f"Found {len(pdf_files)} PDF file(s):\n")
    for i, file_path in enumerate(pdf_files, start=1):
        file_size = file_path.stat().st_size
        print(f"  [{i}] {file_path.name}")
        print(f"      Size: {file_size:,} bytes\n")


üìÇ Folder file_to_ingest sudah ada
üìÅ Scanning folder: c:\Users\pcgsa\Downloads\system-llm\system-llm-backend\file_to_ingest

Found 2 PDF file(s):

  [1] 4b tts-id v2.pdf
      Size: 1,870,838 bytes

  [2] 5a dialog-systems-en v2.pdf
      Size: 2,484,075 bytes



## 3️⃣ Select Files to Process

In [4]:
# ============================================================================
# 📋 SELECT FILES TO PROCESS
# ============================================================================

# MODIFY THIS: Change which files to process
# Example: [1, 2] to process first and second file
# Example: list(range(1, len(pdf_files) + 1)) to process all
file_indices = [*range(1, len(pdf_files) + 1)]  # Process ALL by default

# Paths picked for ingestion, in the order given by file_indices.
selected_files = []

if not pdf_files:
    print("‚ùå No PDF files available to select\n")
else:
    print("üìã Selected files to ingest:\n")
    for idx in file_indices:
        # Indices are 1-based; anything outside the listing is ignored.
        if not 1 <= idx <= len(pdf_files):
            continue
        file_path = pdf_files[idx - 1]
        selected_files.append(file_path)
        print(f"  ‚úÖ [{idx}] {file_path.name}")

    print(f"\n‚úÖ Total files selected: {len(selected_files)}\n")

üìã Selected files to ingest:

  ‚úÖ [1] 4b tts-id v2.pdf
  ‚úÖ [2] 5a dialog-systems-en v2.pdf

‚úÖ Total files selected: 2



## 4️⃣ Upload Files & Create DB Records

In [None]:
# ============================================================================
# 🗑️ OPTIONAL: CLEAR OLD DATA (Run this first if re-ingesting)
# ============================================================================

CLEAR_OLD_DATA = False  # WARNING!!! Only set to True if you want to clear all old chunks documents before ingesting

if not CLEAR_OLD_DATA:
    print("üí° To clear old data, set CLEAR_OLD_DATA = True at the top of this cell\n")
else:
    print("=" * 80)
    print("Clearing old data...")
    print("=" * 80 + "\n")

    try:
        # Chunks are removed before documents (presumably because
        # document_chunk.document_id references document - confirm against
        # the schema).
        delete_chunks = "DELETE FROM document_chunk;"
        db.execute_sql(delete_chunks)
        print("‚úÖ Deleted all document_chunk records")

        delete_docs = "DELETE FROM document;"
        db.execute_sql(delete_docs)
        print("‚úÖ Deleted all document records\n")
    except Exception as e:
        # A failed cleanup is reported but does not stop the notebook.
        print(f"‚ö†Ô∏è  Warning: {e}\n")

Clearing old data...

‚úÖ Deleted all document_chunk records
‚úÖ Deleted all document records



In [21]:
# ============================================================================
# 📤 UPLOAD FILES TO STORAGE & CREATE DB RECORDS
# ============================================================================

# One dict per successfully uploaded file; consumed by the extraction,
# chunking and ingestion cells.
uploaded_documents = []

if selected_files:
    print("=" * 80)
    print(f"Uploading {len(selected_files)} file(s) to storage")
    print("=" * 80 + "\n")

    # Every document row is inserted with a user_id; reuse the first user
    # found in the database.
    user_query = "SELECT id FROM \"user\" LIMIT 1;"
    user_result = db.execute_sql(user_query, fetch=True)

    if user_result and user_result.strip():
        user_id = user_result.strip()
    else:
        # Create default user if none exists
        default_user_id = str(uuid.uuid4())
        user_insert = f"""
        INSERT INTO \"user\" (id, email, password_hash, full_name, role)
        VALUES ('{default_user_id}', 'system@example.com', 'hash', 'System User', 'ADMIN');
        """
        try:
            db.execute_sql(user_insert)
            user_id = default_user_id
        except Exception as e:
            # Best-effort fallback kept from the original flow, but no longer
            # silent: the placeholder id is unlikely to match a real user.
            print(f"  WARNING: could not create default user ({e}); using placeholder user_id\n")
            user_id = "00000000-0000-0000-0000-000000000000"

    for file_idx, file_path in enumerate(selected_files, 1):
        try:
            # Read file
            file_content = file_path.read_bytes()
            file_size = len(file_content)

            # Files are stored under a random name, not the original one.
            storage_filename = str(uuid.uuid4())
            file_path_str = f"storage/uploads/{storage_filename}.pdf"

            # Upload to storage
            storage.put(storage_filename, file_content)

            # Create database record
            db_id = str(uuid.uuid4())

            # The original filename is arbitrary text: double any single
            # quote so a name like "John's notes.pdf" cannot break (or
            # inject into) the INSERT statement.
            safe_original_filename = file_path.name.replace("'", "''")

            insert_query = f"""
            INSERT INTO document (id, user_id, original_filename, filename, file_path, file_size, status, mime_type)
            VALUES ('{db_id}', '{user_id}', '{safe_original_filename}', '{storage_filename}.pdf', '{file_path_str}', {file_size}, 'UPLOADED', 'application/pdf');
            """

            db.execute_sql(insert_query)

            uploaded_documents.append({
                "index": file_idx,
                "db_id": db_id,
                "storage_filename": storage_filename,
                "original_filename": file_path.name,
                "file_size": file_size
            })

            print(f"  OK [{file_idx}] {file_path.name}")
            print(f"      Size: {file_size:,} bytes")
            print(f"      Storage ID: {storage_filename}")
            print(f"      DB ID: {db_id[:8]}...\n")

        except Exception as e:
            # One failing file must not abort the remaining uploads.
            print(f"  ERROR [{file_idx}] {file_path.name}: {e}\n")

    if uploaded_documents:
        print("=" * 80)
        print(f"SUCCESS: {len(uploaded_documents)}/{len(selected_files)} file(s) uploaded")
        print("=" * 80 + "\n")
    else:
        print("FAILED: No files uploaded\n")
else:
    print("No files selected for upload\n")

Uploading 2 file(s) to storage



  OK [1] 4b tts-id v2.pdf
      Size: 1,870,838 bytes
      Storage ID: 0e3cbf88-14d1-4cd1-99d7-2cd85903ac8e
      DB ID: 228e7029...

  OK [2] 5a dialog-systems-en v2.pdf
      Size: 2,484,075 bytes
      Storage ID: eef27987-e3ce-4fe3-b209-afa2e02f4275
      DB ID: e5ecd1a2...

SUCCESS: 2/2 file(s) uploaded



## 5️⃣ Extract Text from PDF

In [22]:
# ============================================================================
# EXTRACT TEXT FROM PDF
# ============================================================================

def extract_text_from_pdf(pdf_bytes: bytes) -> dict:
    """Return ``{page_number: text}`` for every page that yields text.

    Page numbers are 1-based. Pages whose extracted text is missing or only
    whitespace are left out. Any error is printed and then re-raised.
    """
    try:
        with pdfplumber.open(io.BytesIO(pdf_bytes)) as document:
            text_by_page = {}
            for number, page in enumerate(document.pages, start=1):
                text = page.extract_text()
                if not text or not text.strip():
                    continue
                text_by_page[number] = text
    except Exception as e:
        print(f"Error: {e}")
        raise
    return text_by_page

print("PDF extraction function loaded\n")

# Maps document db_id -> {page_number: text} for documents that yielded text.
extracted_texts = {}

if not uploaded_documents:
    print("No documents to extract\n")
else:
    print("=" * 80)
    print(f"Extracting text from {len(uploaded_documents)} document(s)")
    print("=" * 80 + "\n")

    for doc in uploaded_documents:
        db_id = doc["db_id"]
        storage_filename = doc["storage_filename"]
        filename = doc["original_filename"]

        try:
            pdf_bytes = storage.get(storage_filename)
            pages_text = extract_text_from_pdf(pdf_bytes)

            if not pages_text:
                print(f"  SKIP {filename}: No text extracted\n")
                continue

            extracted_texts[db_id] = pages_text
            total_chars = sum(map(len, pages_text.values()))
            print(f"  OK {filename}")
            print(f"     Pages: {len(pages_text)}, Chars: {total_chars:,}\n")
        except Exception as e:
            # A broken PDF is reported and the loop moves on to the next one.
            print(f"  ERROR {filename}: {e}\n")

    print("=" * 80)
    print(f"SUCCESS: Extracted {len(extracted_texts)} document(s)")
    print("=" * 80 + "\n")

PDF extraction function loaded

Extracting text from 2 document(s)



Cannot set gray non-stroke color because /'Paint18' is an invalid float value


  OK 4b tts-id v2.pdf
     Pages: 61, Chars: 19,344



Cannot set gray non-stroke color because /'P56' is an invalid float value
Cannot set gray non-stroke color because /'P66' is an invalid float value
Cannot set gray non-stroke color because /'P133' is an invalid float value
Cannot set gray non-stroke color because /'P141' is an invalid float value


  OK 5a dialog-systems-en v2.pdf
     Pages: 87, Chars: 27,813

SUCCESS: Extracted 2 document(s)



## 6️⃣ Chunk Text

In [23]:
# ============================================================================
# CHUNK TEXT - FIXED VERSION (Cross-page chunking)
# ============================================================================

def chunk_text_with_pages(
    pages_text: Dict[int, str],
    chunk_size: int = 500,
    overlap: int = 50
) -> List[Tuple[str, int, int]]:
    """
    Chunk text globally (not per-page) while tracking page numbers.

    Args:
        pages_text: Mapping of page number to that page's text.
        chunk_size: Target maximum number of words per chunk. A chunk can
            exceed it when a single sentence is longer than ``chunk_size``
            (sentences are never split) or because of the carried overlap.
        overlap: Number of trailing words of a finished chunk that are
            repeated at the start of the next one. ``0`` (or a negative
            value) disables the overlap.

    Returns:
        List of ``(chunk_content, start_page, end_page)`` tuples.

    Algorithm:
    1. Walk the pages in page order and split each page into sentences
    2. Append sentences word by word to the current chunk
    3. When the next sentence would push the chunk past ``chunk_size``,
       emit the chunk and start a new one seeded with the overlap words
    4. Track which pages contributed sentences to each chunk

    Note: the page range of a chunk covers the sentences added to it; the
    overlap words carried over from the previous chunk are not counted.
    """
    chunks_with_pages = []

    # Sentences end at '.', '!' or '?' followed by whitespace.
    sentence_boundary = re.compile(r'(?<=[.!?])\s+')

    # Convert to list of (sentence, page_number) tuples
    all_sentences = []
    for page_num in sorted(pages_text):
        for sentence in sentence_boundary.split(pages_text[page_num]):
            if sentence.strip():
                all_sentences.append((sentence, page_num))

    if not all_sentences:
        return chunks_with_pages

    # current_chunk holds individual words, so its length is the word count.
    current_chunk: List[str] = []
    current_pages = set()

    for sentence, page_num in all_sentences:
        words = sentence.split()

        # If adding this sentence exceeds chunk_size AND we have content, save chunk
        if current_chunk and len(current_chunk) + len(words) > chunk_size:
            chunks_with_pages.append(
                (' '.join(current_chunk), min(current_pages), max(current_pages))
            )

            # OVERLAP: keep the last N words as context for the next chunk.
            # The explicit guard matters: a slice of [-0:] is the whole
            # list, which would carry the entire chunk over when overlap
            # is 0.
            current_chunk = current_chunk[-overlap:] if overlap > 0 else []
            current_pages = {page_num}

        current_chunk.extend(words)
        current_pages.add(page_num)

    # Save remaining chunk
    if current_chunk:
        chunks_with_pages.append(
            (' '.join(current_chunk), min(current_pages), max(current_pages))
        )

    return chunks_with_pages

In [24]:
print("Text chunking function loaded (GLOBAL CHUNKING - spans multiple pages)\n")

# Maps document db_id -> list of (chunk_content, start_page, end_page).
chunks_by_document = {}

if not extracted_texts:
    print("No text to chunk\n")
else:
    print("=" * 80)
    print(f"Creating chunks for {len(extracted_texts)} document(s)")
    print("=" * 80 + "\n")

    for doc_id, pages_text in extracted_texts.items():
        chunks_with_pages = chunk_text_with_pages(pages_text, chunk_size=500, overlap=50)
        chunks_by_document[doc_id] = chunks_with_pages

        # Look up the upload record only to report the original filename.
        doc = next((d for d in uploaded_documents if d["db_id"] == doc_id), None)
        if not doc:
            continue

        print(f"  üìÑ {doc['original_filename']}")
        print(f"     Total Pages: {len(pages_text)}")
        print(f"     Total Chunks: {len(chunks_with_pages)}")

        # Analyze chunk distribution: a chunk is "single-page" when its
        # start and end page are the same.
        single_page = len([1 for _, start, end in chunks_with_pages if start == end])
        multi_page = len(chunks_with_pages) - single_page

        print(f"     Single-page chunks: {single_page}")
        print(f"     Multi-page chunks: {multi_page}")

        if chunks_with_pages:
            word_counts = [len(c[0].split()) for c in chunks_with_pages]
            avg_words = sum(word_counts) / len(chunks_with_pages)
            print(f"     Avg words/chunk: {avg_words:.0f}\n")

    total = sum(map(len, chunks_by_document.values()))
    print("=" * 80)
    print(f"SUCCESS: Created {total} chunks")
    print("=" * 80 + "\n")

Text chunking function loaded (GLOBAL CHUNKING - spans multiple pages)

Creating chunks for 2 document(s)

  üìÑ 4b tts-id v2.pdf
     Total Pages: 61
     Total Chunks: 7
     Single-page chunks: 0
     Multi-page chunks: 7
     Avg words/chunk: 455

  üìÑ 5a dialog-systems-en v2.pdf
     Total Pages: 87
     Total Chunks: 11
     Single-page chunks: 0
     Multi-page chunks: 11
     Avg words/chunk: 457

SUCCESS: Created 18 chunks



In [25]:
# ============================================================================
# GENERATE EMBEDDINGS (OpenAI)
# ============================================================================

# The key is read from the environment (populated from .env.local by
# load_dotenv at the top of the notebook).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

if not OPENAI_API_KEY:
    print(
        "ERROR: OPENAI_API_KEY not set in .env.local",
        "Make sure .env.local has: OPENAI_API_KEY=sk-proj-...",
        sep="\n",
    )
    sys.exit(1)

client = OpenAI(api_key=OPENAI_API_KEY)

def generate_embedding(text: str) -> List[float]:
    """Return the ``text-embedding-3-small`` vector (1536 values) for ``text``.

    Uses the module-level OpenAI ``client``; API errors propagate to the
    caller.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

print("OpenAI embedding function loaded (1536 dimensions)")
# Never echo the start of the secret: notebook output gets saved and shared.
# Showing only the last four characters is enough to tell keys apart.
print(f"Using API key: ****{OPENAI_API_KEY[-4:]}\n")

if chunks_by_document:
    print("Testing embedding generation...")

    # Smoke-test the API with (at most 1000 characters of) the first chunk
    # of the first document.
    first_doc_id = next(iter(chunks_by_document))
    first_chunk = chunks_by_document[first_doc_id][0][0]

    try:
        embedding = generate_embedding(first_chunk[:1000])
        print(f"  OK: Generated {len(embedding)}-dimensional embedding")
        print(f"      First 5 values: {embedding[:5]}\n")
    except Exception as e:
        # Without working embeddings the remaining cells cannot run.
        print(f"  ERROR: {e}\n")
        sys.exit(1)
else:
    print("No chunks for embedding test\n")

OpenAI embedding function loaded (1536 dimensions)
Using API key: sk-proj-pH9p6un0rQCs...

Testing embedding generation...
  OK: Generated 1536-dimensional embedding
      First 5 values: [-0.03277304023504257, 0.051569681614637375, 0.02663818560540676, -0.023709138855338097, -0.023732202127575874]



## 8️⃣ Insert Chunks to PostgreSQL

In [26]:
# ============================================================================
# INSERT CHUNKS TO POSTGRESQL
# ============================================================================

def insert_chunks_to_db(document_id: str, chunks_with_pages: List[Tuple[str, int, int]]):
    """
    Embed every chunk of a document and store it in ``document_chunk``.

    The document row is marked PROCESSING before the first insert and
    PROCESSED (with ``processed_at``) after the last one. Chunks are
    inserted one statement at a time.

    For each chunk:
    - page_number stores the START page
    - chunk_metadata stores {"start_page": X, "end_page": Y}

    Any error is printed and re-raised; in that case the status is left
    at PROCESSING.
    """
    total = len(chunks_with_pages)

    try:
        # Update status to PROCESSING
        db.execute_sql(f"UPDATE document SET status = 'PROCESSING' WHERE id = '{document_id}';")
        print(f"  Status: PROCESSING")

        print(f"  Inserting {total} chunks...")

        for idx, (chunk_content, start_page, end_page) in enumerate(chunks_with_pages):
            vector = generate_embedding(chunk_content)
            page_range = {"start_page": start_page, "end_page": end_page}

            # Values are embedded in a SQL string literal, so single quotes
            # are doubled in each of them.
            safe_content, safe_embedding, safe_metadata = (
                value.replace("'", "''")
                for value in (chunk_content, json.dumps(vector), json.dumps(page_range))
            )

            chunk_id = str(uuid.uuid4())
            insert_query = f"""
            INSERT INTO document_chunk
            (id, document_id, chunk_index, content, page_number, embedding, chunk_metadata, created_at)
            VALUES
            ('{chunk_id}', '{document_id}', {idx}, '{safe_content}', {start_page}, '{safe_embedding}', '{safe_metadata}'::jsonb, now());
            """

            db.execute_sql(insert_query)

            # Report progress after every tenth chunk.
            done = idx + 1
            if done % 10 == 0:
                print(f"     Progress: {done}/{total}")

        # Update status to PROCESSED
        db.execute_sql(
            f"UPDATE document SET status = 'PROCESSED', processed_at = now() WHERE id = '{document_id}';"
        )
        print(f"  Status: PROCESSED")

    except Exception as e:
        print(f"  ERROR: {e}")
        raise

print("Insert function loaded (handles multi-page chunks)\n")

# Ingest every uploaded document that has chunks; a failure on one document
# is reported and the loop continues with the next.
if not (uploaded_documents and chunks_by_document):
    print("Missing prerequisites\n")
else:
    print("=" * 80)
    print(f"Starting ingestion for {len(uploaded_documents)} document(s)")
    print("=" * 80 + "\n")

    for doc_idx, doc in enumerate(uploaded_documents, start=1):
        document_id = doc["db_id"]
        filename = doc["original_filename"]

        if document_id in chunks_by_document:
            try:
                chunks_with_pages = chunks_by_document[document_id]
                print(f"[{doc_idx}/{len(uploaded_documents)}] {filename}")
                insert_chunks_to_db(document_id, chunks_with_pages)
                print()
            except Exception as e:
                print(f"FAILED: {e}\n")
        else:
            print(f"Skipping {filename} - no chunks\n")

    print("=" * 80)
    print("INGESTION COMPLETE!")
    print("=" * 80 + "\n")

Insert function loaded (handles multi-page chunks)

Starting ingestion for 2 document(s)

[1/2] 4b tts-id v2.pdf
  Status: PROCESSING
  Inserting 7 chunks...
  Status: PROCESSED

[2/2] 5a dialog-systems-en v2.pdf
  Status: PROCESSING
  Inserting 11 chunks...
     Progress: 10/11
  Status: PROCESSED

INGESTION COMPLETE!



## 9️⃣ Verification - Check Database

In [27]:
# ============================================================================
# VERIFICATION - Check Database
# ============================================================================

print("=" * 80)
print("VERIFICATION - Database Contents")
print("=" * 80 + "\n")

# Total chunks (anything that is not a plain number is reported as 0).
count_result = db.execute_sql("SELECT COUNT(*) FROM document_chunk;", fetch=True).strip()
total_chunks = int(count_result) if count_result.isdigit() else 0
print(f"Total chunks in database: {total_chunks}\n")

# Per-document status with the number of chunks attached to it.
# NOTE(review): the recorded output of this cell shows "No documents found"
# even though 18 chunks exist, which suggests this query returned nothing on
# stdout - confirm that document.uploaded_at exists and that SQL errors are
# surfaced by execute_sql.
status_query = """
SELECT d.original_filename, d.status, COUNT(dc.id) as chunk_count
FROM document d LEFT JOIN document_chunk dc ON d.id = dc.document_id
GROUP BY d.id, d.original_filename, d.status
ORDER BY d.uploaded_at DESC;
"""
result = db.execute_sql(status_query, fetch=True)

print("Document Status:")
if not result:
    print("  No documents found\n")
else:
    # psql prints one row per line with '|' between the columns.
    for line in result.split('\n'):
        if '|' not in line:
            continue
        parts = line.split('|')
        if len(parts) < 3:
            continue
        filename = parts[0].strip()
        status = parts[1].strip()
        chunk_count = parts[2].strip()
        print(f"  {filename}")
        print(f"    Status: {status}, Chunks: {chunk_count}\n")

print("=" * 80 + "\n")

VERIFICATION - Database Contents

Total chunks in database: 18

Document Status:
  No documents found




## 🔟 Semantic Search Test

In [28]:
# ============================================================================
# SEMANTIC SEARCH TEST
# ============================================================================

def semantic_search(query_text: str, top_k: int = 5) -> list:
    """Rank stored chunks by cosine similarity to ``query_text``.

    The query is embedded, up to 100 chunk rows are fetched through psql,
    and the similarity is computed client-side with NumPy.

    Args:
        query_text: Free-text query to embed.
        top_k: Maximum number of results to return.

    Returns:
        Up to ``top_k`` tuples ``(content[:300], filename, page_number,
        similarity)`` sorted by descending similarity. An empty list is
        returned (after printing the error) when the search itself fails.
    """
    try:
        # Generate query embedding
        query_embedding = np.array(generate_embedding(query_text))
        query_norm = np.linalg.norm(query_embedding)

        # Get chunks from database
        # NOTE(review): LIMIT without ORDER BY means only an arbitrary 100
        # chunks are ever scored - confirm this is acceptable for larger
        # corpora.
        search_query = """
        SELECT dc.content, d.original_filename, dc.page_number, dc.embedding
        FROM document_chunk dc
        JOIN document d ON dc.document_id = d.id
        LIMIT 100;
        """
        result_text = db.execute_sql(search_query, fetch=True)

        similarities = []
        skipped = 0
        for line in (result_text or "").split('\n'):
            if '|' not in line:
                continue

            # Split from the right: the chunk content is free text and may
            # itself contain '|', while page number and embedding never do.
            parts = line.rsplit('|', 3)
            if len(parts) < 4:
                continue
            content, filename, page_field, embedding_json = (p.strip() for p in parts)

            try:
                chunk_embedding = np.array(json.loads(embedding_json), dtype=float)
                similarity = np.dot(query_embedding, chunk_embedding) / (
                    # The epsilon avoids a division by zero for all-zero vectors.
                    query_norm * np.linalg.norm(chunk_embedding) + 1e-10
                )
            except (ValueError, TypeError):
                # Unparseable embedding or mismatched dimensions: skip the
                # row, but make the loss visible below.
                skipped += 1
                continue

            page_num = int(page_field) if page_field.isdigit() else 0
            similarities.append((content[:300], filename, page_num, float(similarity)))

        if skipped:
            print(f"Warning: skipped {skipped} row(s) with an unusable embedding")

        similarities.sort(key=lambda x: x[3], reverse=True)
        return similarities[:top_k]

    except Exception as e:
        print(f"Error: {e}")
        return []

# Run one sample query and print the best matches.
print("=" * 80)
print("SEMANTIC SEARCH TEST")
print("=" * 80 + "\n")

QUERY = "'Mengembangkan sistem text-to-speech berbasis ML Formalisasi Data evaluasi tugas Melatih Data pelatihan model Mengevalua Terapkan dan si model pantau'"

print(f"Query: '{QUERY}'\n")

results = semantic_search(QUERY, top_k=5)

if not results:
    print("No results found\n")
else:
    print(f"Found {len(results)} results:\n")
    for i, (content, filename, page, similarity) in enumerate(results, start=1):
        print(f"  [{i}] {filename} (page {page})")
        print(f"      Similarity: {similarity:.4f}")
        print(f"      Content: {content[:150]}...\n")

print("=" * 80)

SEMANTIC SEARCH TEST

Query: ''Mengembangkan sistem text-to-speech berbasis ML Formalisasi Data evaluasi tugas Melatih Data pelatihan model Mengevalua Terapkan dan si model pantau''

Found 5 results:

  [1] 4b tts-id v2.pdf (page 1)
      Similarity: 0.6858
      Content: Sintesis Bentuk Gelombang Pemrosesan Bahasa Lisan Fakultas Ilmu Komputer Universitas Indonesia Semester Gasal 2024/2025 Referensi ‚ñ™ TTS Waveform Synth...

  [2] 4b tts-id v2.pdf (page 11)
      Similarity: 0.5994
      Content: Ekspresi 1 2 3 4 5 o Seberapa baik intonasi sesuai dengan substansi ucapan? A/B testing ‚Ä¢ Menggunakan pilihan yang bersumber dari banyak orang untuk m...

  [3] 4b tts-id v2.pdf (page 57)
      Similarity: 0.5588
      Content: unit (tidak seperti sintesis diphone) tidak dapat mengubah penekanan. ‚óã Seleksi unit memberikan hasil yang bagus (tetapi mungkin tidak sepenuhnya bena...

  [4] 4b tts-id v2.pdf (page 27)
      Similarity: 0.5305
      Content: aksen Hasilkan: ‚óè Waveform 26 F0 Ge