In [17]:
import pandas as pd
import re
import os

def clean_star_trek_script(csv_df, output_path='Cleaned_Star_Trek_Script.csv'):
    """Clean a parsed script table and return the cleaned copy.

    Rows are kept only when they have a ``Dialogue`` value and a real speaker
    in ``Character`` (missing speakers and the literal name ``'Unknown'`` are
    both dropped). The surviving dialogue is then normalised: bracketed text
    is removed, every character other than ASCII letters, digits, whitespace
    and ``. , ! ? ' "`` is stripped, and runs of whitespace are collapsed to a
    single space.

    Parameters
    ----------
    csv_df : pandas.DataFrame
        Input table; must contain ``Character`` and ``Dialogue`` columns.
        It is not modified.
    output_path : str or path-like or None, optional
        Where to write the cleaned table as CSV (without the index). Defaults
        to ``'Cleaned_Star_Trek_Script.csv'``, resolved against the current
        working directory. Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping their original index labels.

    Raises
    ------
    KeyError
        If ``csv_df`` lacks the ``Dialogue`` or ``Character`` column.
    """
    # Step 1: Remove rows where Dialogue is missing
    df_cleaned = csv_df.dropna(subset=['Dialogue']).copy()

    # Steps 2-3: Drop rows without a speaking character. Such rows are assumed
    # to hold metadata rather than dialogue. A speaker literally named
    # 'Unknown' is dropped as well, matching the previous fill-then-filter logic.
    speaker = df_cleaned['Character']
    has_speaker = speaker.notna() & (speaker != 'Unknown')
    # Explicit copy so the column assignment below never targets a view.
    df_cleaned = df_cleaned[has_speaker].copy()

    # Step 4: Define a text-cleaning function
    # DOTALL lets a bracketed aside that spans a line break be removed whole;
    # without it only the brackets would be stripped and the aside would stay.
    bracketed = re.compile(r'\[.*?\]', flags=re.DOTALL)
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,!?\'\"]')
    whitespace_run = re.compile(r'\s+')

    def clean_text(text):
        text = bracketed.sub('', text)  # Remove text within brackets
        text = disallowed.sub('', text)  # Remove special characters except basic punctuation
        return whitespace_run.sub(' ', text).strip()  # Remove extra spaces

    # Apply the cleaning function to the Dialogue column
    df_cleaned['Dialogue'] = df_cleaned['Dialogue'].astype(str).apply(clean_text)

    # Optional: Save the cleaned data to a new CSV file
    if output_path is not None:
        df_cleaned.to_csv(output_path, index=False)

    # Display the first few rows to verify the changes
    print(df_cleaned.head())
    return df_cleaned

# Define the folder path
# NOTE: despite the ".csv" suffix this path is handed to os.listdir below,
# so it has to be a directory containing the script CSV files.
folder_path = "/home/sagemaker-user/trekBERT/Movies.csv"
save_cleaned = True  # Set to False to skip writing the per-file cleaned copies
cleaned_prefix = "cleaned_"  # Prefix of the outputs this loop writes into folder_path

# Process files
# Sorted so the processing order is the same on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    # Only regular files with a .csv name are inputs; a directory can carry a
    # .csv suffix too (folder_path itself does).
    if not filename.endswith('.csv') or not os.path.isfile(file_path):
        continue

    # Cleaned copies are saved next to the inputs, so skip them; otherwise
    # re-running this cell would clean its own outputs and create
    # cleaned_cleaned_*.csv files.
    if filename.startswith(cleaned_prefix):
        continue

    print(f'Processing file: {filename}')

    # Read the CSV file into a DataFrame
    csv_df = pd.read_csv(file_path)

    # Clean the scripts
    cleaned_df = clean_star_trek_script(csv_df)

    # Save cleaned dfs
    if save_cleaned:
        cleaned_file_path = os.path.join(folder_path, f"{cleaned_prefix}{filename}")
        cleaned_df.to_csv(cleaned_file_path, index=False)
        print(f"Saved cleaned file to: {cleaned_file_path}")

print("All files processed.")

Processing file: Star_Trek_Final_Frontier_Script.csv
    Scene_ID    Character                                           Dialogue  \
0        0.0  Star Trek V                                 The Final Frontier   
8        0.0        SYBOK  I thought weapons were forbidden on this plane...   
9        0.0        J'ONN                                   It's all I have.   
10       0.0        SYBOK                               Your pain runs deep.   
11       0.0        J'ONN                       What do you know of my pain?   

   Scene_Description                                           Metadata  
0                NaN                                                NaN  
8                NaN                     "The Planet of Galactic Peace"  
9                NaN  (out of a dust storm a horseman approaches an ...  
10               NaN                                                NaN  
11               NaN                                                NaN  
Saved cleaned file to:

In [10]:
# Shell escape: print the current working directory.
!pwd

/home/sagemaker-user/trekBERT
