In [3]:
import re
import os

def clean_script(raw_text: str) -> str:
    """
    Basic cleaning for stylometric analysis.

    Runs a fixed sequence of regex substitutions over a raw script and
    returns what is left, one stripped, non-empty line per row:
    - Removes scene headings (INT. / EXT.), ESTABLISH lines and technical
      directions (CUT TO:, ANGLE:, FADE ...:), matched case-insensitively
    - Removes REVISIONS: lines and leading "12." style numbering
    - Removes "NAME:" speaker prefixes, "NAME (parenthetical)" lines,
      standalone parentheticals and ALL-CAPS lines
    - Preserves dialogue and mixed-case action descriptions
    - Strips every line and drops blank lines

    Args:
        raw_text: The full script text as a single string.

    Returns:
        The cleaned text, with the surviving lines joined by single newlines
        (an empty string if nothing survives).
    """
    heading_flags = re.MULTILINE | re.IGNORECASE

    # Scene headings and technical directions. Each alternative carries its
    # own punctuation; do not append a shared ':' after the group, otherwise
    # plain headings such as "INT. KITCHEN - DAY" stop matching.
    text = re.sub(r'^(?:INT\.|EXT\.|ESTABLISH|CUT TO:|ANGLE:|FADE\s.*?:).*$', '',
                  raw_text, flags=heading_flags)

    # Remove revision markers and page numbers
    text = re.sub(r'^REVISIONS:.*$', '', text, flags=heading_flags)
    text = re.sub(r'^\d+\.\s*', '', text, flags=re.MULTILINE)  # Scene numbers

    # Remove character names and parentheticals before dialogue.
    # These patterns are case-sensitive on purpose: only upper-case names count.
    text = re.sub(r'^[A-Z][A-Z\s]+\s*\(.*?\)\n', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[A-Z][A-Z\s]+:\s*', '', text, flags=re.MULTILINE)  # Character names

    # Remove standalone parentheticals (e.g., (grins))
    text = re.sub(r'^\s*\(.*?\)\s*$', '', text, flags=re.MULTILINE)

    # Remove ALL-CAPS action lines (optional - comment out to keep)
    text = re.sub(r'^[A-Z][A-Z\s]+$', '', text, flags=re.MULTILINE)

    # Trim every line and drop the empty ones. Because no blank line survives
    # this step, no further newline collapsing is needed afterwards.
    kept_lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(kept_lines)

# Batch Processing
# Raw transcripts are read from input_folder; cleaned copies go to output_folder.
input_folder = "episode-text"
output_folder = "cleaned-text"
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Anything that is not a .txt file is ignored.
    if not filename.endswith(".txt"):
        continue

    source_path = os.path.join(input_folder, filename)
    # Bytes that are not valid UTF-8 are substituted instead of raising.
    with open(source_path, 'r', encoding='utf-8', errors='replace') as src:
        raw_text = src.read()

    cleaned_text = clean_script(raw_text)

    # Output keeps the original name with a "cleaned_" prefix.
    output_path = os.path.join(output_folder, f"cleaned_{filename}")
    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.write(cleaned_text)

    print(f"Processed: {filename} → cleaned_{filename}")

Processed: tp0208.txt → cleaned_tp0208.txt
Processed: tp0221.txt → cleaned_tp0221.txt
Processed: tp0219.txt → cleaned_tp0219.txt
Processed: tp0213.txt → cleaned_tp0213.txt
Processed: tp0216.txt → cleaned_tp0216.txt
Processed: tp0202.txt → cleaned_tp0202.txt
Processed: tp0206.txt → cleaned_tp0206.txt
Processed: tp0222.txt → cleaned_tp0222.txt
Processed: tp0214.txt → cleaned_tp0214.txt
Processed: tp0201.txt → cleaned_tp0201.txt
Processed: tp0215.txt → cleaned_tp0215.txt
Processed: tp0205.txt → cleaned_tp0205.txt
Processed: tp0210.txt → cleaned_tp0210.txt
Processed: tp0203.txt → cleaned_tp0203.txt
Processed: tp0212.txt → cleaned_tp0212.txt
Processed: tp0220.txt → cleaned_tp0220.txt
Processed: tp0218.txt → cleaned_tp0218.txt
Processed: tp0217.txt → cleaned_tp0217.txt
Processed: tp0207.txt → cleaned_tp0207.txt
Processed: tp0204.txt → cleaned_tp0204.txt
Processed: tp0209.txt → cleaned_tp0209.txt
Processed: tp0211.txt → cleaned_tp0211.txt


In [1]:
import os
import re

def clean_script(input_file, output_file):
    """Copy input_file to output_file without INT/EXT lines, lower-cased.

    A line is dropped when it begins (at column 0, case-insensitively) with
    the whole word EXT or INT. Every other line is written in lower case,
    with its original line ending.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the UTF-8 text file to write (overwritten).
    """
    heading_start = re.compile(r'^(EXT|INT)\b', re.IGNORECASE)

    # Read everything first so the source is closed before the target opens.
    with open(input_file, 'r', encoding='utf-8') as src:
        source_rows = src.readlines()

    # Keep only non-heading rows, lower-casing each one as it is kept.
    kept_rows = [row.lower() for row in source_rows if not heading_start.match(row)]

    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.writelines(kept_rows)

def batch_process(input_dir, output_dir):
    """Process all .txt files in input_dir and save to output_dir.

    Every entry of input_dir whose name ends in '.txt' is passed to
    clean_script and written under the same filename in output_dir. One
    "Processed: <name>" line is printed per file.

    Args:
        input_dir: Directory containing the .txt files to process.
        output_dir: Directory that receives the results; created (including
            parents) if it does not exist yet.
    """
    # exist_ok=True creates the directory without a separate existence check,
    # so there is no window between checking and creating.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the
    # processing order and the printed log reproducible between runs.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.txt'):
            continue
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)
        clean_script(input_path, output_path)
        print(f"Processed: {filename}")

# Second pass over the corpus.
# The input here is the "cleaned-text" folder, not the raw transcripts;
# results are written to a separate "cleaner-text" folder.
input_directory = "cleaned-text"
output_directory = "cleaner-text"

batch_process(input_dir=input_directory, output_dir=output_directory)
print("All files cleaned and saved!")

Processed: cleaned_tp0222.txt
Processed: cleaned_tp0217.txt
Processed: cleaned_tp0204.txt
Processed: cleaned_tp0208.txt
Processed: cleaned_tp0216.txt
Processed: cleaned_tp0221.txt
Processed: cleaned_tp0207.txt
Processed: cleaned_tp0215.txt
Processed: cleaned_tp0205.txt
Processed: cleaned_tp0206.txt
Processed: cleaned_tp0211.txt
Processed: cleaned_tp0201.txt
Processed: cleaned_tp0220.txt
Processed: cleaned_tp0218.txt
Processed: cleaned_tp0209.txt
Processed: cleaned_tp0212.txt
Processed: cleaned_tp0219.txt
Processed: cleaned_tp0202.txt
Processed: cleaned_tp0213.txt
Processed: cleaned_tp0210.txt
Processed: cleaned_tp0214.txt
Processed: cleaned_tp0203.txt
All files cleaned and saved!
