In [3]:
import re

# Locations of the raw Toy Story script and of the cleaned copy to be written
input_file = '/content/TOY STORY.txt'
output_file = '/content/TOY STORY-cleaned.txt'

# Define a function to clean the dataset
def clean_toy_story_dataset(input_path, output_path):
    """
    Collapse a script into one "SPEAKER: dialogue" entry per output line.

    A speaker line is a line that, once stripped, consists only of capital
    letters, whitespace and periods and ends with a colon (e.g. "WOODY:" or
    "MR. POTATO HEAD:"). The first non-blank line after a speaker line becomes
    that speaker's dialogue; further non-speaker lines are appended to the most
    recent entry, separated by a single space.

    Blank lines are ignored. Text that appears before the first entry is
    dropped, and a speaker line that is never followed by dialogue produces
    no entry.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to write (overwritten).
    """
    # Periods are allowed so that names such as "MR. POTATO HEAD:" count as speakers
    speaker_pattern = re.compile(r'^[A-Z][A-Z\s.]*:$')

    cleaned_lines = []
    current_speaker = None

    with open(input_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()  # Remove leading and trailing whitespace

            # A blank line carries no dialogue; skipping it keeps it from being
            # consumed as the pending speaker's (empty) line or appended as a
            # stray space to the previous entry.
            if not line:
                continue

            if speaker_pattern.match(line):
                current_speaker = line
            elif current_speaker:
                # First dialogue line for the pending speaker starts a new entry
                cleaned_lines.append(f"{current_speaker} {line}")
                current_speaker = None
            elif cleaned_lines:
                # Continuation of the previous entry
                cleaned_lines[-1] += f" {line}"

    # Write the cleaned dataset to a new file, one entry per line
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(entry + '\n' for entry in cleaned_lines)

    print(f"Cleaned dataset saved to {output_path}")

# Clean the raw script and write the result to disk
clean_toy_story_dataset(input_path=input_file, output_path=output_file)

Cleaned dataset saved to /content/TOY STORY-cleaned.txt


In [7]:
import re

# Known character names, split into main, side and background sets
MAIN_CHARACTERS = {
    'ANDY',
    'WOODY',
    'BUZZ LIGHTYEAR',
    'BO PEEP',
    'MR. POTATO HEAD',
    'REX',
    'SLINKY',
    'HAMM',
}

# NOTE(review): 'SARGENT' may be a misspelling of 'SERGEANT'/'SARGE' — confirm
# against the spelling actually used in the script text.
SIDE_CHARACTERS = {
    'SARGENT',
    'LENNY',
    'SCUD',
    'SID',
    'HANNAH',
    'RC',
    'SHARK',
    'BABYFACE',
    'MIKE',
    'ROBOT',
    'ALIENS',
    'THE GREEN ARMY MEN',
    'WOUNDED SOLDIER',
    'MUTANT TOYS',
}

BACKGROUND_CHARACTERS = {
    'MRS. DAVIS',
    'FRIENDS',
    'PIZZA PLANET ATTENDANT',
    'PIZZA PLANET CUSTOMERS',
}

# Union of the three groups: every name treated as a character
ALL_CHARACTERS = MAIN_CHARACTERS | SIDE_CHARACTERS | BACKGROUND_CHARACTERS

def is_character_name(word):
    """
    Return True when `word` names a known character.

    Any parenthesised note (shortest "(...)" spans) is removed and the
    remainder is stripped of surrounding whitespace before it is looked up
    in ALL_CHARACTERS. The lookup is case-sensitive.
    """
    without_notes = re.sub(r'\(.*?\)', '', word)
    candidate = without_notes.strip()
    return candidate in ALL_CHARACTERS

def process_script(input_text, characters=None):
    """
    Label speakers with a colon and separate dialogues with a blank line.

    Processing happens in two passes:

    1. Every run of capital letters, periods, spaces and tabs that (after
       trimming trailing whitespace) is a known character name gets a colon
       appended directly after the name. Names that are already followed by
       a colon are left alone, so already-labelled text is not double-labelled.
    2. Every dialogue — a "NAME:" label plus the text up to the next label on
       the same line or the end of the line — is terminated by exactly one
       blank line. Trailing spaces and any existing line breaks after the
       dialogue are replaced rather than added to, so running the function on
       its own output leaves the text unchanged.

    Args:
        input_text: The script text to process.
        characters: Optional iterable of character names to recognise. When
            omitted, names are checked with is_character_name (which looks
            them up in ALL_CHARACTERS).

    Returns:
        The processed script text.
    """
    if characters is None:
        is_name = is_character_name
    else:
        known_names = frozenset(characters)

        def is_name(candidate):
            return candidate in known_names

    # Matches an (optionally space-separated) colon right after a name
    existing_colon = re.compile(r'[ \t]*:')

    def replace_character_names(match):
        run = match.group(0)
        # The run may end with the whitespace that separates the name from
        # the dialogue; the colon has to go before it, not after it.
        name = run.rstrip()
        if not is_name(name):
            return run
        if existing_colon.match(match.string, match.end()):
            return run  # already labelled; avoid producing "NAME::"
        return name + ':' + run[len(name):]

    # First, add colons to character names. Only horizontal whitespace is
    # allowed inside a name so a run can never span two lines.
    modified_text = re.sub(r'\b[A-Z][A-Z. \t]+\b', replace_character_names, input_text)

    # Then put a blank line after every dialogue. A dialogue ends at the next
    # speaker label on the same line, at the end of its line (existing line
    # breaks are consumed), or at the end of the text.
    speaker = r'[A-Z][A-Z. \t]+:'
    dialogue = re.compile(
        rf'({speaker}(?:\s*\(.*?\))?.*?)'
        rf'(?:(?={speaker})|(?:[ \t]*\n)+|[ \t]*\Z)'
    )
    modified_text = dialogue.sub(r'\1\n\n', modified_text)

    return modified_text

def process_file(input_filepath, output_filepath):
    """
    Process the entire input file and save the modified text.

    The input is read as UTF-8, passed through process_script, and the result
    is written as UTF-8 to the output path (overwriting it).

    Failures to open, read, decode or write the files are reported with a
    printed message and the function returns normally. Any other exception
    (for example a bug inside process_script) is allowed to propagate so it
    is not mistaken for a file problem.

    Args:
        input_filepath: Path of the text file to read.
        output_filepath: Path of the text file to write.
    """
    try:
        # Read input file
        with open(input_filepath, 'r', encoding='utf-8') as file:
            input_text = file.read()

        # Process text
        modified_text = process_script(input_text)

        # Save modified text
        with open(output_filepath, 'w', encoding='utf-8') as file:
            file.write(modified_text)

        print(f"Successfully processed file. Output saved to {output_filepath}")

    except (OSError, UnicodeError) as e:
        # Only I/O and encoding problems are expected here
        print(f"An error occurred: {e}")

# Colab paths: read the cleaned script, write the re-labelled copy
input_filepath = '/content/TOY STORY-cleaned.txt'
output_filepath = '/content/TOY STORY-new.txt'

# Run the processing step on those paths
process_file(input_filepath=input_filepath, output_filepath=output_filepath)

Successfully processed file. Output saved to /content/TOY STORY-new.txt
