In [1]:
"""
PROJECT: NeuralTranscript: Semantic Search & Q&A for YouTube Content
MODULE: 02_SEMANTIC_CHUNKING
-------------------------------------------------------------------------
DESCRIPTION:
This module transforms raw transcript text into semantically meaningful 
chunks. By adding source metadata to each chunk, we enable the RAG system 
to provide citations and structured context to the LLM (Gemini/Groq).

AUTHOR: Engr. Inam Ullah Khan
Master's Student in Data Science | Al-Farabi Kazakh National University
-------------------------------------------------------------------------
"""

import os
# NEW: Import from the dedicated text-splitters package
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# --- 1. CONFIGURATION ---
# YouTube video whose transcript is chunked in this notebook.
VIDEO_ID = "Gfr50f6ZBvo"
# Transcript text file for that video (see the "Run Notebook 01 first" hint
# in load_processed_transcript).
INPUT_PATH = f"data/transcripts/{VIDEO_ID}.txt"

# RAG Hyperparameters
CHUNK_SIZE = 1_000    # passed to the splitter as chunk_size
CHUNK_OVERLAP = 200   # passed to the splitter as chunk_overlap

# --- 2. CORE PROCESSING FUNCTIONS ---

def load_processed_transcript(file_path: str) -> str:
    """Read a transcript text file and return its full contents.

    Args:
        file_path: Path of the UTF-8 encoded transcript file.

    Returns:
        The entire file contents as one string.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file
            (the path is missing, or it points at a directory).
    """
    # isfile() instead of exists(): a directory at this path would pass an
    # exists() check and then fail inside open() with a less helpful error.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"❌ Transcript not found at {file_path}. Run Notebook 01 first.")
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def create_enriched_chunks(
    text: str,
    source_id: str,
    chunk_size: "int | None" = None,
    chunk_overlap: "int | None" = None,
) -> list[Document]:
    """Split a transcript into overlapping chunks tagged with source metadata.

    Args:
        text: Full transcript text to split.
        source_id: Value stored under the ``source`` metadata key of every
            chunk (the pipeline passes the YouTube video ID).
        chunk_size: Passed to the splitter as ``chunk_size``. ``None``
            (the default) uses the module-level ``CHUNK_SIZE``.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``. ``None``
            (the default) uses the module-level ``CHUNK_OVERLAP``.

    Returns:
        The ``Document`` objects produced by the splitter. Each carries the
        ``source`` and ``content_type`` metadata supplied here; because
        ``add_start_index=True`` is set, the splitter also records a
        ``start_index`` entry.
    """
    # Resolve the defaults at call time so that editing the configuration
    # constants takes effect without re-defining this function.
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap

    print(f"✂️ Initializing Recursive Splitting (Size: {size}, Overlap: {overlap})...")

    # Separators are tried in order: paragraph break, line break, sentence
    # end, space, and finally the empty string as a last resort.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", " ", ""],
        add_start_index=True,
    )

    # One input text with one metadata dict; create_documents attaches the
    # metadata to the Document objects it builds.
    enriched_docs = splitter.create_documents(
        [text],
        metadatas=[{"source": source_id, "content_type": "video_transcript"}],
    )

    return enriched_docs

# --- 3. EXECUTION PIPELINE ---

if __name__ == "__main__":
    print("--- Starting NeuralTranscript Chunking Pipeline ---")

    # Load the transcript and split it; both names are reused by later cells.
    full_text = load_processed_transcript(INPUT_PATH)
    chunked_docs = create_enriched_chunks(full_text, VIDEO_ID)

    # If the splitter returned nothing (e.g. an empty transcript), fail with
    # an explicit message instead of an IndexError on the preview below.
    if not chunked_docs:
        raise ValueError(
            f"No chunks were produced from {INPUT_PATH}; the transcript appears to be empty."
        )

    print(f"✅ Created {len(chunked_docs)} enriched chunks.")

    # Preview the first chunk: its metadata and the first 150 characters.
    sample = chunked_docs[0]
    print(f"\n--- CHUNK VALIDATION ---\nMetadata: {sample.metadata}\nPreview: {sample.page_content[:150]}...")

  from .autonotebook import tqdm as notebook_tqdm


--- Starting NeuralTranscript Chunking Pipeline ---
✂️ Initializing Recursive Splitting (Size: 1000, Overlap: 200)...
✅ Created 169 enriched chunks.

--- CHUNK VALIDATION ---
Metadata: {'source': 'Gfr50f6ZBvo', 'content_type': 'video_transcript', 'start_index': 0}
Preview: the following is a conversation with demus hasabis ceo and co-founder of deepmind a company that has published and builds some of the most incredible ...


In [2]:
import pickle

# Save the chunks so Notebook 03 can use them.
# The path is defined once so the file written and the message always agree.
CHUNKS_PATH = "data/chunked_docs.pkl"

# Make sure the output folder exists before writing.
os.makedirs(os.path.dirname(CHUNKS_PATH), exist_ok=True)

# NOTE: unpickling can execute arbitrary code, so this file should only be
# loaded from a trusted location.
with open(CHUNKS_PATH, "wb") as f:
    pickle.dump(chunked_docs, f)

print(f"✅ Chunks safely persisted to {CHUNKS_PATH}")

✅ Chunks safely persisted to data/chunked_docs.pkl


# 🏁 Summary: Step 02 | Semantic Chunking & Metadata Enrichment
In this stage of the NeuralTranscript pipeline, we successfully transformed the raw, unstructured transcript into a structured dataset optimized for high-precision retrieval.

# 🧠 Key Achievements
- Recursive Splitting Logic: Instead of arbitrary cuts, we implemented a hierarchical approach using RecursiveCharacterTextSplitter. The system prefers to split at paragraph breaks (\n\n), then line breaks (\n), then sentence ends (.), and only falls back to spaces or individual characters when a piece is still too long, so related ideas tend to stay together within a single chunk.

- Context Preservation: By applying a 200-character overlap, we created a "sliding window" effect. This ensures that the transition between chunks remains semantically fluid, preventing the loss of information that occurs when a sentence is sliced at the boundary.

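- Agentic Metadata Enrichment: Each text chunk was wrapped into a Document object and enriched with the following metadata fields:

  - source: The original YouTube Video ID for traceability.

  - content_type: Labels each chunk as `video_transcript`. (A dedicated per-chunk `chunk_id` is not assigned in this notebook; a chunk is identified by `source` plus `start_index`, which is what a future LLM can cite.)

  - start_index: Provides the exact character position from the original transcript.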

# 📊 Data Insights
Input Size: ~133,000 characters (Demis Hassabis Interview).

Output Yield: 169 semantically coherent chunks.

Efficiency: The maximum chunk length of 1,000 characters (chunks are capped at `CHUNK_SIZE`, so the actual average is somewhat lower) is intended as a "sweet spot" for modern embedding models like all-MiniLM-L6-v2, balancing information density with retrieval speed.

## Summary

- Evaluated multiple chunking strategies for long-form transcript data
- Conducted both quantitative and qualitative analysis
- Selected an optimal chunk configuration for embedding and retrieval

**Next step:** Embedding generation and similarity-based retrieval  
(`03_embedding_retrieval.ipynb`)
