In [3]:
import re
import os

def clean_script(raw_text):
    """
    Basic cleaning for stylometric analysis:
    - Removes scene headings, technical directions, and parentheticals
    - Preserves dialogue and action descriptions
    - Normalizes whitespace
    """
    # Remove scene headings (INT./EXT.) and technical directions
    text = re.sub(r'^(INT\.|EXT\.|ESTABLISH|CUT TO:|ANGLE:|FADE\s.*?):.*$', '', 
                 raw_text, flags=re.MULTILINE|re.IGNORECASE)
    
    # Remove revision markers and page numbers
    text = re.sub(r'^REVISIONS:.*$', '', text, flags=re.MULTILINE|re.IGNORECASE)
    text = re.sub(r'^\d+\.\s*', '', text, flags=re.MULTILINE)  # Scene numbers
    
    # Remove character names and parentheticals before dialogue
    text = re.sub(r'^[A-Z][A-Z\s]+\s*\(.*?\)\n', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[A-Z][A-Z\s]+:\s*', '', text, flags=re.MULTILINE)  # Character names
    
    # Remove standalone parentheticals (e.g., (grins))
    text = re.sub(r'^\s*\(.*?\)\s*$', '', text, flags=re.MULTILINE)
    
    # Remove ALL-CAPS action lines (optional - comment out to keep)
    text = re.sub(r'^[A-Z][A-Z\s]+$', '', text, flags=re.MULTILINE)
    
    # Normalize whitespace and trim lines
    text = '\n'.join([line.strip() for line in text.split('\n') if line.strip()])
    
    # Collapse multiple newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    
    return text

# Batch Processing
input_folder = "episode-text"
output_folder = "cleaned-text"
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    if filename.endswith(".txt"):
        with open(os.path.join(input_folder, filename), 'r', encoding='utf-8', errors='replace') as f:
            raw_text = f.read()
        
        cleaned_text = clean_script(raw_text)
        
        output_path = os.path.join(output_folder, f"cleaned_{filename}")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
        
        print(f"Processed: {filename} → cleaned_{filename}")

Processed: tp0208.txt → cleaned_tp0208.txt
Processed: tp0221.txt → cleaned_tp0221.txt
Processed: tp0219.txt → cleaned_tp0219.txt
Processed: tp0213.txt → cleaned_tp0213.txt
Processed: tp0216.txt → cleaned_tp0216.txt
Processed: tp0202.txt → cleaned_tp0202.txt
Processed: tp0206.txt → cleaned_tp0206.txt
Processed: tp0222.txt → cleaned_tp0222.txt
Processed: tp0214.txt → cleaned_tp0214.txt
Processed: tp0201.txt → cleaned_tp0201.txt
Processed: tp0215.txt → cleaned_tp0215.txt
Processed: tp0205.txt → cleaned_tp0205.txt
Processed: tp0210.txt → cleaned_tp0210.txt
Processed: tp0203.txt → cleaned_tp0203.txt
Processed: tp0212.txt → cleaned_tp0212.txt
Processed: tp0220.txt → cleaned_tp0220.txt
Processed: tp0218.txt → cleaned_tp0218.txt
Processed: tp0217.txt → cleaned_tp0217.txt
Processed: tp0207.txt → cleaned_tp0207.txt
Processed: tp0204.txt → cleaned_tp0204.txt
Processed: tp0209.txt → cleaned_tp0209.txt
Processed: tp0211.txt → cleaned_tp0211.txt


In [None]:


def further_clean_script(text):

    # 1. Remove front matter (everything before first scene heading)
    front_matter_pattern = re.compile(
        r'^.*?TWIN PEAKS.*?\n'  # Title line
        r'(?:.*\n)*?'           # Variable number of lines
        r'Lynch/Frost Productions, Inc\.\n'
        r'.*?\n'                # Address lines
        r'\(\d{3}\) \d{3}-\d{4}\n'  # Phone number
        r'\n+',                 # Extra newlines
        flags=re.DOTALL|re.MULTILINE
    )
    text = front_matter_pattern.sub('', text)

    # 2. Remove all script formatting elements
    # Scene headings and technical directions
    text = re.sub(r'^(INT\.|EXT\.|ESTABLISH|CUT TO:|ANGLE:|FADE\s.*?):?.*$', 
                 '', text, flags=re.MULTILINE|re.IGNORECASE)
    
    # Comment lines and revision markers
    text = re.sub(r'^#.*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^REVISIONS?:.*$', '', text, flags=re.MULTILINE|re.IGNORECASE)
    
    # Character names and parentheticals
    text = re.sub(r'^[A-Z][A-Z\s]+\s*\(.*?\)\n', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[A-Z][A-Z\s]+:\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\(.*?\)\s*$', '', text, flags=re.MULTILINE)
    
    # Scene numbers and ALL-CAPS action lines
    text = re.sub(r'^\d+\.?\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[A-Z][A-Z\s]+$', '', text, flags=re.MULTILINE)
    
    # 3. Final cleanup
    text = '\n'.join([line.strip() for line in text.split('\n') if line.strip()])
    text = re.sub(r'\n{3,}', '\n\n', text)  # Collapse multiple newlines
    
    return text

# Batch Processing for already-cleaned scripts
input_folder = "cleaned-text"
output_folder = "cleaned-text-final"
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    if filename.startswith("cleaned_") and filename.endswith(".txt"):
        with open(os.path.join(input_folder, filename), 'r', encoding='utf-8', errors='replace') as f:
            precleaned_text = f.read()
        
        final_text = further_clean_script(precleaned_text)
        
        # Remove "cleaned_" prefix from output filename
        output_filename = filename.replace("cleaned_", "final_")
        output_path = os.path.join(output_folder, output_filename)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        
        print(f"Processed: {filename} → {output_filename}")

print("\nAll files processed successfully!")

KeyboardInterrupt: 