In [1]:
import re
import json

def convert_md_to_json(input_file_path, output_file_path):
    """
    Convert a markdown file into a JSON list of chunks for RAG processing.

    The document is cut at every level-2 heading (a line starting with "## ").
    Each chunk runs from one such heading up to the next one, so deeper
    headings (###, ####, ...) stay inside their parent chunk. Any content
    before the first level-2 heading is dropped.

    Args:
        input_file_path: Path of the markdown file to read (decoded as UTF-8).
        output_file_path: Path of the JSON file to write (opened in 'w' mode).

    Returns:
        list[dict]: One dict per chunk with the keys
            "chunk_link"    - always the empty string,
            "chunk_heading" - heading text without the leading "## " marker,
            "text"          - the whole section, heading included, with
                              surrounding whitespace stripped.

    Raises:
        OSError: Propagated from open() when either file cannot be opened
            (for example FileNotFoundError for a missing input file).
    """
    heading_prefix = '## '

    # Read the markdown file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Cut at each newline that is immediately followed by "## ". The lookahead
    # leaves the heading itself at the start of the section that follows.
    sections = re.split(r'\n(?=## )', content)

    json_chunks = []

    for section in sections:
        # Skip empty sections
        if not section.strip():
            continue

        # The split above leaves no "\n## " inside a section, so a level-2
        # heading can only ever be the section's first line. Sections without
        # one (the preamble before the first heading) are skipped.
        if not section.startswith(heading_prefix):
            continue

        heading_line = section.split('\n', 1)[0]

        # Remove only the leading marker. str.replace would also delete any
        # later "## " inside the title (e.g. "## Using C## today").
        chunk_heading = heading_line[len(heading_prefix):].strip()

        json_chunks.append({
            "chunk_link": "",  # Empty as requested
            "chunk_heading": chunk_heading,
            "text": section.strip(),  # entire section, heading included
        })

    # Write to JSON file; ensure_ascii=False keeps non-ASCII characters readable
    with open(output_file_path, 'w', encoding='utf-8') as file:
        json.dump(json_chunks, file, indent=2, ensure_ascii=False)

    print(f"Conversion complete! Created {len(json_chunks)} chunks.")
    print(f"Output saved to: {output_file_path}")

    return json_chunks

def preview_chunks(chunks, num_chunks=3):
    """
    Print a short summary of the leading chunks as a sanity check.

    Args:
        chunks: Sequence of dicts, each providing 'chunk_heading' and 'text'.
        num_chunks: Upper bound on how many chunks to show (default 3).

    Returns:
        None. Everything is written to standard output.
    """
    shown_count = min(num_chunks, len(chunks))
    print(f"\nPreview of first {shown_count} chunks:")
    print("=" * 50)

    # Number the chunks from 1 for display.
    for number, entry in enumerate(chunks[:num_chunks], start=1):
        print(f"\nChunk {number}:")
        print(f"Heading: {entry['chunk_heading']}")
        body = entry['text']
        print(f"Text length: {len(body)} characters")
        # Only the first 200 characters are shown; "..." is always appended.
        print(f"Text preview: {body[:200]}...")
        print("-" * 30)

In [3]:
if __name__ == "__main__":
    # Example usage: convert the handbook and print a preview of the result.
    input_file = "Employee Handbook.md"  # Update this path
    output_file = "employee_handbook.json"  # Update this path

    try:
        chunks = convert_md_to_json(input_file, output_file)
        preview_chunks(chunks)

    except FileNotFoundError as e:
        # open() raises FileNotFoundError for a missing output directory as
        # well as for a missing input file, so check which path actually
        # failed before blaming the input.
        if e.filename == output_file:
            print(f"Error: Could not create the output file '{output_file}'")
            print("Please check that the output directory exists.")
        else:
            print(f"Error: Could not find the input file '{input_file}'")
            print("Please update the input_file path in the script.")
    except Exception as e:
        # Deliberate catch-all so the cell reports a short message rather
        # than a traceback.
        print(f"Error during conversion: {str(e)}")

Conversion complete! Created 78 chunks.
Output saved to: employee_handbook.json

Preview of first 3 chunks:

Chunk 1:
Heading: For our team members working outside of New York State:
Text length: 3425 characters
Text preview: ## For our team members working outside of New York State:

This Handbook is intended for use by team members in all states where Uniswap Labs has team members, but it also provides certain informatio...
------------------------------

Chunk 2:
Heading: **Uniswap Principles | Uni-code**
Text length: 2595 characters
Text preview: ## **Uniswap Principles | Uni-code**

*Our Uniswap operating principles (Unicode) articulate who we are (our values) and how we work. They are our daily guideposts for how we interact with each other,...
------------------------------

Chunk 3:
Heading: Code of Ethics {#code-of-ethics}
Text length: 1944 characters
Text preview: ## Code of Ethics {#code-of-ethics}

Uniswap Labs’ Code of Ethics is one of the ways we put our UNIcode values in