# Text Processing - 2024 SEC Filings

**Purpose:** Process all 2024 10-K/10-Q filings for RAPTOR RAG prototype

**Key Changes from Multi-Year Version:**
- Source: `data/external/10-X_C_2024.zip` (26,018 filings)
- Strategy: More data within a single timeframe (vs. samples spread across 31 years)
- Chunk size: **500 tokens only** (validated as optimal in FinGPT research)
- Benefits: Temporal consistency, better clustering, statistically robust

**Data Scope:**
- Time period: Full year 2024 (Q1-Q4)
- Total filings: 26,018
- Compressed size: 1.6 GB
- Form types: 10-K, 10-Q (and variants)

**Output:** `output/processed_2024_500tok.json` for embedding generation

---

## 1. Setup & Dependencies

In [None]:
import sys
from pathlib import Path
import re
import json
import zipfile
from collections import defaultdict

# Add project root to path.
# The root is taken to be two levels above the current working directory
# (presumably the notebook runs from notebooks/prototyping - confirm).
project_root = Path.cwd().parent.parent
sys.path.insert(0, str(project_root))

# Data locations
DATA_ZIP = project_root / 'data' / 'external' / '10-X_C_2024.zip'
OUTPUT_DIR = project_root / 'notebooks' / 'prototyping' / 'output'
# parents=True so a missing intermediate directory does not abort setup;
# exist_ok=True keeps re-running the cell harmless.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"[INFO] Data source: {DATA_ZIP}")
print(f"[INFO] Output directory: {OUTPUT_DIR}")
print(f"[INFO] Zip file exists: {DATA_ZIP.exists()}")

# Report the archive size only when the file is actually present.
if DATA_ZIP.exists():
    zip_size_mb = DATA_ZIP.stat().st_size / (1024*1024)
    print(f"[OK] Zip file size: {zip_size_mb:.2f} MB")

## 2. Inspect 2024 Data Structure

In [None]:
# List the archive members; the zip is only held open while reading the index.
with zipfile.ZipFile(DATA_ZIP, 'r') as archive:
    file_list = archive.namelist()

print(f"[OK] Total files in zip: {len(file_list):,}")
print(f"\n[INFO] First 10 files:")
for member in file_list[:10]:
    print(f"  {member}")

# Tally members whose path mentions 'QTR', keyed by the second path component
# ('unknown' when the path has no '/' separator).
quarters = defaultdict(int)
for member in file_list:
    if 'QTR' not in member:
        continue
    parts = member.split('/')
    if len(parts) > 1:
        label = parts[1]
    else:
        label = 'unknown'
    quarters[label] += 1

print(f"\n[INFO] Files by quarter:")
for label in sorted(quarters):
    print(f"  {label}: {quarters[label]:,} files")

# Keep only the .txt members (directory entries do not end in .txt)
txt_files = [member for member in file_list if member.endswith('.txt')]
print(f"\n[OK] Text files to process: {len(txt_files):,}")

## 3. Text Extraction Functions

**Reusing proven logic from multi-year prototype:**
- Extract metadata from SRAF-XML wrapper
- Clean text (remove HTML/XML tags)
- Handle format variations (1993-2024 formats)

In [None]:
# Locates the SEC header section inside the SRAF wrapper.
_SEC_HEADER_RE = re.compile(r'<SEC-Header>(.*?)</SEC-Header>', re.DOTALL | re.IGNORECASE)

# Value part shared by every field pattern. ``[ \t]*`` (rather than ``\s*``)
# keeps the match on the label's own line, so an empty field cannot swallow
# the following header line as its value.
_SRAF_VALUE = r':[ \t]*(.+?)(?:\n|$)'

# Candidate labels per metadata field, tried in order. Compiled once at
# definition time instead of being rebuilt on every call.
_SRAF_FIELD_PATTERNS = {
    field: tuple(re.compile(label + _SRAF_VALUE, re.IGNORECASE) for label in labels)
    for field, labels in {
        'COMPANY_NAME': (
            'COMPANY CONFORMED NAME',
            'CONFORMED-NAME',
            'CONFORMED NAME',
        ),
        'CIK': (
            'CENTRAL INDEX KEY',
            'CIK',
        ),
        'FORM_TYPE': (
            'FORM TYPE',
            'FORM-TYPE',
            'CONFORMED SUBMISSION TYPE',
        ),
        'FILING_DATE': (
            'FILED AS OF DATE',
            'FILED-AS-OF-DATE',
            'DATE AS OF CHANGE',
        ),
        'ACCESSION_NUMBER': (
            'ACCESSION NUMBER',
            'ACCESSION-NUMBER',
        ),
        'PERIOD_OF_REPORT': (
            'CONFORMED PERIOD OF REPORT',
            'CONFORMED-PERIOD-OF-REPORT',
        ),
    }.items()
}


def extract_sraf_metadata(content):
    """
    Extract metadata from SRAF header

    Looks for a ``<SEC-Header>...</SEC-Header>`` section (case-insensitive)
    and, for each known field, tries its label variants in order. The first
    variant that yields a non-empty value on the label's own line wins.

    Args:
        content: Raw filing text (string)

    Returns:
        dict: Metadata fields found, keyed by COMPANY_NAME, CIK, FORM_TYPE,
            FILING_DATE, ACCESSION_NUMBER and PERIOD_OF_REPORT. Fields that
            are missing or blank are omitted; the dict is empty when no
            SEC header section is present.
    """
    metadata = {}

    header_match = _SEC_HEADER_RE.search(content)
    if header_match is None:
        return metadata
    sec_header = header_match.group(1)

    for field, patterns in _SRAF_FIELD_PATTERNS.items():
        for pattern in patterns:
            match = pattern.search(sec_header)
            if match is None:
                continue
            value = match.group(1).strip()
            # A whitespace-only capture (e.g. a stray '\r') is treated as
            # "no value"; fall through to the next label variant.
            if value:
                metadata[field] = value
                break

    return metadata


def extract_clean_text(content):
    """
    Extract clean text content from SRAF-XML-wrapper

    Steps, in order:
      1. Drop the ``<Header>`` and ``<SEC-Header>`` sections with their content.
      2. Drop whole elements whose tag name contains "xbrl" (e.g.
         ``<XBRL>...</XBRL>``), including everything between the tags.
      3. Replace every remaining tag with a space.
      4. Collapse all whitespace runs, newlines included, to single spaces.

    Args:
        content: Raw filing text (string)

    Returns:
        str: Single-line text with HTML/XML tags removed
    """
    # Remove SRAF wrapper tags
    text = re.sub(r'<Header>.*?</Header>', '', content, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<SEC-Header>.*?</SEC-Header>', '', text, flags=re.DOTALL | re.IGNORECASE)

    # Remove XBRL elements. This has to run BEFORE the generic tag strip:
    # once the tags are gone there is nothing left for this pattern to match.
    # Only the element *name* is tested for "xbrl", and the closing tag must
    # repeat that name, so an attribute that merely mentions xbrl (such as a
    # namespace URL) does not cause text to be deleted.
    text = re.sub(
        r'<([\w:.-]*xbrl[\w:.-]*)(?:\s[^>]*)?(?<!/)>.*?</\1\s*>',
        ' ',
        text,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)

    # Clean up whitespace. Newlines are whitespace too, so the result has no
    # paragraph breaks left.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


# Status line confirming that this cell ran and the helpers above are defined.
print("[OK] Text extraction functions loaded")

## 4. Chunking Functions

**Strategy:**
- **500 tokens per chunk** (validated as optimal in FinGPT research)
- **50 token overlap** (10% of chunk size)
- **Contextual headers** on every chunk (company, form, date, CIK)

In [None]:
# Install tiktoken if needed
try:
    import tiktoken
except ImportError:
    print("[INFO] Installing tiktoken...")
    !pip install tiktoken
    import tiktoken

# Initialize tokenizer (cl100k_base used by GPT-3.5/4)
tokenizer = tiktoken.get_encoding("cl100k_base")

def count_tokens(text):
    """Return how many tokens the module-level tokenizer produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)


def chunk_by_tokens(text, chunk_size=500, overlap=50):
    """
    Chunk text by token count with overlap

    The text is encoded with the module-level ``tokenizer``, sliced into
    windows of at most ``chunk_size`` tokens whose starts are
    ``chunk_size - overlap`` tokens apart, and each window is decoded back
    to text. Windowing stops as soon as a window reaches the last token, so
    no trailing chunk is emitted that only repeats the previous chunk's tail.

    Args:
        text: Input text
        chunk_size: Target tokens per chunk (default: 500)
        overlap: Token overlap between chunks (default: 50)

    Returns:
        list: List of text chunks (empty when the text encodes to no tokens)

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    step = chunk_size - overlap
    if step <= 0:
        # Without this check the start index never moves forward and the
        # loop would run forever.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    tokens = tokenizer.encode(text)
    total = len(tokens)
    chunks = []

    for start in range(0, total, step):
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.decode(tokens[start:end]))
        if end == total:
            # Every token is covered; a further window would lie entirely
            # inside the overlap region of this one.
            break

    return chunks


def create_contextual_chunks(chunks, metadata):
    """
    Prefix every chunk with a one-line document header and attach metadata.

    Args:
        chunks: List of text chunks
        metadata: Document metadata dict; the keys COMPANY_NAME, FORM_TYPE,
            FILING_DATE and CIK are read, each falling back to an
            "Unknown ..." placeholder when absent

    Returns:
        list: One dict per chunk with 'chunk_id', 'text' (header + chunk)
            and a 'metadata' dict (company, form_type, filing_date, cik,
            chunk_index, total_chunks)
    """
    # Resolve the four header fields in one pass, with their placeholders.
    company, form_type, filing_date, cik = (
        metadata.get(key, placeholder)
        for key, placeholder in (
            ('COMPANY_NAME', 'Unknown Company'),
            ('FORM_TYPE', 'Unknown Form'),
            ('FILING_DATE', 'Unknown Date'),
            ('CIK', 'Unknown CIK'),
        )
    )

    # The same header is prepended to every chunk of the document.
    context_header = f"Document: {company} ({form_type}) filed {filing_date} [CIK: {cik}]\n\n"

    return [
        {
            'chunk_id': position,
            'text': context_header + body,
            'metadata': {
                'company': company,
                'form_type': form_type,
                'filing_date': filing_date,
                'cik': cik,
                'chunk_index': position,
                'total_chunks': len(chunks),
            },
        }
        for position, body in enumerate(chunks)
    ]


# Status lines for the cell. The 500/50 figures are literal text that mirrors
# the chunk_by_tokens defaults; they are not read from the function.
print("[OK] Chunking functions loaded")
print(f"[INFO] Chunk size: 500 tokens")
print(f"[INFO] Overlap: 50 tokens (10%)")

## 5. Test on Sample Filing

**Before processing all 26K files, validate on one sample**

In [None]:
# Load and test first .txt file from zip
with zipfile.ZipFile(DATA_ZIP, 'r') as z:
    # Get first .txt file
    txt_files = [f for f in z.namelist() if f.endswith('.txt')]
    if not txt_files:
        # Fail with a clear message instead of an IndexError on txt_files[0].
        raise FileNotFoundError(f"No .txt files found in {DATA_ZIP}")
    sample_file = txt_files[0]

    print(f"[INFO] Testing with: {sample_file}")

    # Read file from zip; undecodable bytes are dropped rather than raising.
    with z.open(sample_file) as f:
        raw_content = f.read().decode('utf-8', errors='ignore')

print(f"[OK] Loaded {len(raw_content):,} characters")

# Extract metadata and clean text
metadata = extract_sraf_metadata(raw_content)
clean_text = extract_clean_text(raw_content)

print(f"\n[OK] Extracted metadata:")
for key, value in metadata.items():
    print(f"  {key}: {value}")

print(f"\n[OK] Clean text length: {len(clean_text):,} characters")

# Create chunks
token_count = count_tokens(clean_text)
chunks = chunk_by_tokens(clean_text, chunk_size=500, overlap=50)
contextual_chunks = create_contextual_chunks(chunks, metadata)

print(f"\n[OK] Chunking results:")
print(f"  Total tokens: {token_count:,}")
print(f"  Total chunks: {len(chunks)}")
print(f"  Avg tokens/chunk: {token_count // len(chunks) if chunks else 0}")

# Preview first chunk. Guarded like the average above: a sample whose cleaned
# text is empty yields no chunks, and indexing [0] would raise IndexError.
if contextual_chunks:
    print(f"\n[Preview] First contextual chunk:")
    print(contextual_chunks[0]['text'][:600])
    print(f"\n[Metadata]:")
    print(json.dumps(contextual_chunks[0]['metadata'], indent=2))
else:
    print(f"\n[WARN] Sample produced no chunks (empty text after cleaning); nothing to preview")

## 6. Batch Process All 2024 Filings

**Processing strategy:**
- Read directly from zip (no need to extract to disk)
- Process in batches with progress updates
- Skip files that fail (log errors)
- Estimated time: 30-60 minutes for 26K filings

In [None]:
def process_filing(file_content, file_name, chunk_size=500, overlap=50):
    """
    Run one filing through the pipeline: metadata extraction, text cleaning,
    token chunking, and contextual-header decoration.

    Args:
        file_content: Raw file content (string)
        file_name: Name of the file
        chunk_size: Target tokens per chunk
        overlap: Token overlap

    Returns:
        dict: 'file_name', 'metadata', 'total_tokens' (token count of the
            cleaned text), 'chunk_size', 'overlap', 'num_chunks', and
            'chunks' (the contextual chunk dicts)
    """
    # Header fields and body text both come from the same raw content.
    doc_metadata = extract_sraf_metadata(file_content)
    body_text = extract_clean_text(file_content)

    # Split the body, then decorate each piece with the document context.
    pieces = chunk_by_tokens(body_text, chunk_size, overlap)
    decorated = create_contextual_chunks(pieces, doc_metadata)

    return dict(
        file_name=file_name,
        metadata=doc_metadata,
        total_tokens=count_tokens(body_text),
        chunk_size=chunk_size,
        overlap=overlap,
        num_chunks=len(pieces),
        chunks=decorated,
    )


# Status line confirming that process_filing is defined in this session.
print("[OK] Processing function ready")

In [None]:
# Run every .txt member of the archive through process_filing.
print(f"[INFO] Processing all 2024 filings...")
print(f"[INFO] Chunk size: 500 tokens with 50 token overlap\n")

# results: one dict per processed filing; errors: one {'file', 'error'} per failure.
results, errors = [], []

with zipfile.ZipFile(DATA_ZIP, 'r') as archive:
    txt_files = [name for name in archive.namelist() if name.endswith('.txt')]
    total_files = len(txt_files)

    print(f"[INFO] Total files to process: {total_files:,}\n")

    for position, member in enumerate(txt_files, start=1):
        try:
            # Read the member straight from the archive; bytes that are not
            # valid UTF-8 are dropped.
            raw_bytes = archive.read(member)
            content = raw_bytes.decode('utf-8', errors='ignore')

            results.append(process_filing(content, member, chunk_size=500, overlap=50))

            # Progress update every 1000 files
            if position % 1000 == 0:
                pct = (position / total_files) * 100
                print(f"[Progress] {position:,}/{total_files:,} ({pct:.1f}%) - Latest: {member.split('/')[-1]}")

        except Exception as exc:
            # Best effort: record the failure and carry on with the next file.
            errors.append({'file': member, 'error': str(exc)})
            if len(errors) <= 10:  # Only print first 10 errors
                print(f"[FAIL] {member}: {str(exc)}")

banner = '=' * 80
print(f"\n{banner}")
print(f"[COMPLETE] Processing finished!")
print(f"{banner}")
print(f"Successfully processed: {len(results):,} filings")
print(f"Errors encountered: {len(errors)}")
if errors:
    print(f"\nFirst few errors:")
    for failure in errors[:5]:
        print(f"  {failure['file']}: {failure['error']}")

## 7. Summary Statistics

In [None]:
import pandas as pd
import numpy as np

# Calculate statistics (sums and averages are well defined even when no
# filing was processed)
total_chunks = sum(f['num_chunks'] for f in results)
total_tokens = sum(f['total_tokens'] for f in results)
avg_chunks = total_chunks / len(results) if results else 0
avg_tokens = total_tokens / len(results) if results else 0

# Estimate storage (1536-dimensional embeddings at 4 bytes/float)
embedding_size_mb = (total_chunks * 1536 * 4) / (1024 * 1024)

# Distribution stats
chunks_per_filing = [f['num_chunks'] for f in results]
tokens_per_filing = [f['total_tokens'] for f in results]

print(f"{'='*80}")
print(f"2024 FILING PROCESSING SUMMARY")
print(f"{'='*80}")
print(f"\nDataset:")
print(f"  Total filings: {len(results):,}")
print(f"  Time period: 2024 (full year)")
print(f"  Form types: 10-K, 10-Q (and variants)")

print(f"\nChunking:")
print(f"  Chunk size: 500 tokens")
print(f"  Overlap: 50 tokens (10%)")
print(f"  Total chunks: {total_chunks:,}")
print(f"  Avg chunks/filing: {avg_chunks:.1f}")

print(f"\nToken Statistics:")
print(f"  Total tokens: {total_tokens:,}")
print(f"  Avg tokens/filing: {avg_tokens:,.0f}")
# min()/max() raise ValueError on an empty list, so these lines are guarded
# the same way the averages above are.
if tokens_per_filing:
    print(f"  Min tokens/filing: {min(tokens_per_filing):,}")
    print(f"  Max tokens/filing: {max(tokens_per_filing):,}")
    print(f"  Median tokens/filing: {np.median(tokens_per_filing):,.0f}")

print(f"\nStorage:")
print(f"  Estimated embedding size: {embedding_size_mb:,.2f} MB")
print(f"  Per-chunk embedding: 6.14 KB (1536 dims * 4 bytes)")

print(f"\nDistribution:")
if chunks_per_filing:
    for pct in (25, 50, 75, 95):
        print(f"  P{pct} chunks/filing: {np.percentile(chunks_per_filing, pct):.0f}")
else:
    # Percentiles of an empty sample are undefined; say so instead of failing.
    print(f"  (no filings processed - distribution unavailable)")

print(f"\n{'='*80}")

## 8. Export Results

In [None]:
# Write the processed filings to the output directory as indented JSON.
output_file = OUTPUT_DIR / 'processed_2024_500tok.json'

print(f"[INFO] Saving to {output_file}...")

with output_file.open('w', encoding='utf-8') as out_fh:
    json.dump(results, out_fh, indent=2)

# Size on disk, in MB (bytes / 1024 / 1024)
file_size_mb = output_file.stat().st_size / (1024*1024)

print(f"[OK] Saved: {output_file.name}")
print(f"[OK] File size: {file_size_mb:,.2f} MB")

# The error log is written only when at least one filing failed.
if errors:
    error_file = OUTPUT_DIR / 'processing_errors_2024.json'
    with error_file.open('w', encoding='utf-8') as err_fh:
        json.dump(errors, err_fh, indent=2)
    print(f"[INFO] Error log saved: {error_file.name}")

banner = '=' * 80
print(f"\n{banner}")
print(f"EXPORT COMPLETE")
print(f"{banner}")
print(f"\nOutput files:")
print(f"  - {output_file.name} ({file_size_mb:,.2f} MB)")
if errors:
    print(f"  - processing_errors_2024.json ({len(errors)} errors)")

# Pointers to the follow-up notebooks, printed one per line.
print(f"\nNext steps:")
for step in (
    "1. Generate embeddings (next notebook: 03_embedding_generation.ipynb)",
    "2. Test with both Ollama models: gpt-oss and llama3-sec",
    "3. Implement RAPTOR clustering (04_raptor_clustering.ipynb)",
    "4. Build RAG query interface (05_rag_query.ipynb)",
):
    print(step)