In [1]:
import re
import os
from collections import defaultdict

def normalize_character_name(name):
    """Merge character name variants (e.g., 'WAITER (CONTINUED)' -> 'WAITER')"""
    # Remove parentheticals and suffixes
    name = re.sub(r'\(.*?\)', '', name).strip()
    # Remove "CONT'D" or "CONTINUED" markers
    name = re.sub(r'\b(CONT\'D|CONTINUED)\b', '', name, flags=re.IGNORECASE).strip()
    # Remove voiceover markers (V.O.)
    name = re.sub(r'\bV\.O\.\b', '', name).strip()
    # Standardize to ALL CAPS (for consistency)
    return name.upper()

def clean_twin_peaks_script(raw_text):
    """Advanced cleaning with character normalization"""
    # Remove revision headers and page numbers
    text = re.sub(r'^\d+\.\n', '', raw_text, flags=re.MULTILINE)
    text = re.sub(r'\.{3}.*?YELLOW.*?\n', '', text)
    
    # Remove scene headings and technical directions
    text = re.sub(r'^(INT\.|EXT\.|FADE IN:|CUT TO:|DISSOLVE TO:|ESTABLISH\.).*$', '', 
                 text, flags=re.MULTILINE|re.IGNORECASE)
    
    # Remove parentheticals and actor directions
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    
    # Remove ALL-CAPS action lines (but preserve character names)
    text = re.sub(r'^[A-Z][A-Z\s]+\n', '', text, flags=re.MULTILINE)
    
    # Process line by line to normalize characters
    cleaned_lines = []
    current_character = None
    
    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue
            
        # Character line detection (supports "CHARACTER:" and "CHARACTER (CONTINUED):")
        char_match = re.match(r'^([A-Z][A-Z\s]+?(?:\(.*?\))?):\s*(.*)$', line)
        if char_match:
            raw_name, dialogue = char_match.groups()
            normalized_name = normalize_character_name(raw_name)
            
            # Only update character if name isn't empty after normalization
            if normalized_name:
                current_character = normalized_name
                cleaned_lines.append(f"{current_character}: {dialogue}")
        elif current_character and line:
            # Continuation of previous character's dialogue
            cleaned_lines.append(f"{current_character}: {line}")
    
    return '\n'.join(cleaned_lines)

# Batch processing
input_folder = "episode-text"
output_folder = "cleaned-text"
os.makedirs(output_folder, exist_ok=True)

character_counts = defaultdict(int)

for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):
        with open(os.path.join(input_folder, filename), 'r', encoding='utf-8', errors='replace') as f:
            raw_text = f.read()
        
        cleaned_text = clean_twin_peaks_script(raw_text)
        
        # Count characters for analysis
        for line in cleaned_text.split('\n'):
            if ':' in line:
                char = line.split(':')[0].strip()
                character_counts[char] += 1
        
        # Save cleaned file
        output_path = os.path.join(output_folder, f"norm_{filename}")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
        
        print(f"Processed: {filename}")

# Print character frequency (optional)
print("\nCharacter frequencies:")
for char, count in sorted(character_counts.items(), key=lambda x: -x[1]):
    print(f"{char}: {count}")

Processed: tp0208.txt
Processed: tp0221.txt
Processed: tp0219.txt
Processed: tp0213.txt
Processed: tp0216.txt
Processed: tp0202.txt
Processed: tp0206.txt
Processed: tp0222.txt
Processed: tp0214.txt
Processed: tp0201.txt
Processed: tp0215.txt
Processed: tp0205.txt
Processed: tp0210.txt
Processed: tp0203.txt
Processed: tp0212.txt
Processed: tp0220.txt
Processed: tp0218.txt
Processed: tp0217.txt
Processed: tp0207.txt
Processed: tp0204.txt
Processed: tp0209.txt
Processed: tp0211.txt

Character frequencies:
FADE OUT: 7298
REVISED: 3624
FADE TO BLACK: 2020
SUDDENLY: 1186
INTERCUT: 1017
REVISIONS: 621
BOARD MEMBERS: 581
INTERCUT WITH: 577
POV: 486
ITEM: 366
SECOND DRAFT: 316
FIRST DRAFT: 249
MATCH WITH: 184
MATCH CUT TO: 183
SHOCK CUT TO: 169
QUICK DISSOLVE: 74
FADE TO WHITE: 36
FLASHBACK: 13
CUT TO BLACK: 10
ANGLE: 8
FIRST DRAFT DATE: 5
SMASH CUT TO: 3
GENERAL DISTRIBUTION: 1
DRAFT DATE: 1
REVISED DRAFT: 1
