In [6]:
import pandas as pd
import unicodedata

# Typographic characters and their plain-ASCII stand-ins. The keys are written
# as \u escapes on purpose: literal curly quotes are easily "straightened" by
# editors and exporters, which turns the mapping into a silent no-op.
_ASCII_PUNCTUATION = str.maketrans({
    '\u201c': '"',    # left double quotation mark
    '\u201d': '"',    # right double quotation mark
    '\u2018': "'",    # left single quotation mark
    '\u2019': "'",    # right single quotation mark / apostrophe
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
    '\u2026': '...',  # horizontal ellipsis
    '\u00ae': '(R)',  # registered trademark sign
})


def clean_text(text):
    """Return *text* reduced to plain ASCII.

    Curly quotes, dashes, the ellipsis and the registered-trademark sign are
    first swapped for ASCII equivalents. The result is then NFD-normalized so
    accented letters split into a base letter plus combining marks, and every
    remaining non-ASCII code point (including those marks) is discarded.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example a float NaN or a number) is returned unchanged.

    Returns:
        The ASCII-only string, or the original object when *text* is not a
        string.
    """
    if not isinstance(text, str):
        return text

    # Must run before the ASCII encode below, which would otherwise drop
    # these characters instead of substituting them.
    text = text.translate(_ASCII_PUNCTUATION)

    # Decompose accented characters, then keep only the ASCII part.
    normalized = unicodedata.normalize('NFD', text)
    return normalized.encode('ascii', 'ignore').decode('ascii')

# Load the raw title catalogue from disk.
print("Reading CSV file...")
df = pd.read_csv('disney_plus_titles.csv')

# Run clean_text over each column pandas reports as object dtype;
# columns with any other dtype are left untouched.
print("Cleaning data...")
object_columns = [label for label in df.columns if df[label].dtype == 'object']
for column in object_columns:
    df[column] = df[column].apply(clean_text)

# Swap every remaining missing value for an empty string.
df = df.fillna('')

# Write the cleaned frame out without the index column.
print("Saving cleaned file...")
df.to_csv('disney_plus_titles_cleaned.csv', index=False)

print("Done! Cleaned file saved as 'disney_plus_titles_cleaned.csv'")
print(f"Processed {len(df)} rows")

Reading CSV file...
Cleaning data...
Saving cleaned file...
Done! Cleaned file saved as 'disney_plus_titles_cleaned.csv'
Processed 1450 rows
