In [2]:
import os
import re

# ========================
# 1. SETUP PATHS
# ========================
# Directory that is scanned for the raw episode transcripts (.txt files).
input_folder = "episode-text"

# Directory that receives one cleaned script per input file.
output_folder = "cleaned-scripts"


# ========================
# 2. CLEANING FUNCTIONS
# ========================
def clean_script(text):
    """Remove all non-dialogue elements from screenplay text.

    The following are stripped, in this order:

    1. Parentheticals such as "(sighs)" or "(V.O.)" (single-line only).
    2. Scene headings: lines starting with "INT.", "EXT." or "...".
    3. Lines made up solely of uppercase letters and whitespace (directions,
       and also speaker-name cues such as "COOPER").
    4. Transitions: lines starting with "CUT TO", "FADE" or "DISSOLVE"
       (case-insensitive).
    5. Page numbers: lines holding only digits with an optional trailing dot.

    Line-anchored rules tolerate leading spaces/tabs so that indented
    screenplay layouts are handled the same way as flush-left ones.

    Args:
        text: Raw screenplay text.

    Returns:
        The remaining text with runs of blank lines collapsed to a single
        blank line and leading/trailing whitespace removed.
    """
    # Parentheticals go first: otherwise a cue like "COOPER (V.O.)" is not
    # recognised as an all-caps line and would survive as a bare "COOPER ".
    text = re.sub(r'\(.*?\)', '', text)
    # Remove scene headings (e.g., "INT. GREAT NORTHERN - NIGHT")
    text = re.sub(r'^[ \t]*(INT\.|EXT\.|\.{3}).*$', '', text, flags=re.MULTILINE)
    # Remove all-caps lines (camera/action directions and speaker cues)
    text = re.sub(r'^[ \t]*[A-Z][A-Z\s]+$', '', text, flags=re.MULTILINE)
    # Remove transitions (e.g., "CUT TO:", "FADE OUT.")
    # NOTE(review): being case-insensitive, this also drops any dialogue line
    # that begins with "Fade", "Cut to" or "Dissolve" - confirm that is wanted.
    text = re.sub(
        r'^[ \t]*(CUT TO|FADE|DISSOLVE).*$',
        '',
        text,
        flags=re.MULTILINE | re.IGNORECASE,
    )
    # Remove page numbers (e.g., "2." alone on a line)
    text = re.sub(r'^[ \t]*\d+\.?\s*$', '', text, flags=re.MULTILINE)
    # Collapse multiple newlines left behind by the removals above
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()

# ========================
# 3. PROCESS ALL FILES
# ========================
# Make sure the destination exists: open(..., 'w') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort so that files are
# processed (and logged) deterministically.
for filename in sorted(os.listdir(input_folder)):
    # Only plain-text transcripts are processed; everything else is skipped.
    if not filename.endswith(".txt"):
        continue

    # Read raw text
    with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as f:
        raw_text = f.read()

    # Clean text
    cleaned_text = clean_script(raw_text)

    # Save cleaned version under the same name with a "cleaned_" prefix
    output_path = os.path.join(output_folder, f"cleaned_{filename}")
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    print(f"Cleaned: {filename} → {output_path}")

print(f"\nAll cleaned scripts saved to: {output_folder}/")

Cleaned: tp0208.txt → cleaned-scripts/cleaned_tp0208.txt
Cleaned: tp0221.txt → cleaned-scripts/cleaned_tp0221.txt
Cleaned: tp0219.txt → cleaned-scripts/cleaned_tp0219.txt
Cleaned: tp0213.txt → cleaned-scripts/cleaned_tp0213.txt
Cleaned: tp0216.txt → cleaned-scripts/cleaned_tp0216.txt
Cleaned: tp0202.txt → cleaned-scripts/cleaned_tp0202.txt
Cleaned: tp0206.txt → cleaned-scripts/cleaned_tp0206.txt
Cleaned: tp0222.txt → cleaned-scripts/cleaned_tp0222.txt
Cleaned: tp0214.txt → cleaned-scripts/cleaned_tp0214.txt
Cleaned: tp0201.txt → cleaned-scripts/cleaned_tp0201.txt
Cleaned: tp0215.txt → cleaned-scripts/cleaned_tp0215.txt
Cleaned: tp0205.txt → cleaned-scripts/cleaned_tp0205.txt
Cleaned: tp0210.txt → cleaned-scripts/cleaned_tp0210.txt
Cleaned: tp0203.txt → cleaned-scripts/cleaned_tp0203.txt
Cleaned: tp0212.txt → cleaned-scripts/cleaned_tp0212.txt
Cleaned: tp0220.txt → cleaned-scripts/cleaned_tp0220.txt
Cleaned: tp0218.txt → cleaned-scripts/cleaned_tp0218.txt
Cleaned: tp0217.txt → cleaned-s