This notebook cleans the data further, converting the answers to questions Q1, Q2, Q4 and Q5 into word labels that can be used as bag-of-words features in our Naive Bayes classification model.

In [None]:
import pandas as pd
import re
import argparse

# Lookup table from a (Title Cased) movie/show title to a comma-separated
# genre string.  Insertion order matters: categorize_movie_genre() walks the
# table in order when it falls back to partial matching.
#
# Each title appears exactly once.  The previous literal repeated several keys
# (Parasite, The Big Lebowski, The Dictator, Rush Hour, Breaking Bad,
# The Grinch, Ratatouille); Python silently keeps the first position and the
# last value for a repeated key, so the entries below reproduce exactly the
# mapping that was in effect at runtime.
movie_genres = {
    "Cloudy With A Chance Of Meatballs": "Animation,Comedy",
    "None": "Unknown",
    "Action Movie": "Action",
    "Mamma Mia": "Musical,Comedy",
    "Dragon": "Animation",
    "Rick And Morty": "Animation,Comedy",
    "Home Alone": "Comedy,Family",
    "The Big Lebowski": "Comedy,Crime",
    "Spider-Man": "Superhero,Action",
    "Goodfellas": "Crime,Drama",
    "Dead Silence": "Horror,Thriller",
    "La La Land": "Musical,Drama,Romance",
    "Harry Potter": "Fantasy,Adventure",
    "Transformer": "Action,Sci-Fi",
    "Teenage Mutant Ninja Turtles": "Animation,Action",
    "High School Musical": "Musical,Comedy,Family",
    "Despicable Me": "Animation,Comedy",
    "Toy Story": "Animation,Adventure",
    "The Godfather": "Crime,Drama",
    "Fast And Furious": "Action,Thriller",
    "The Garfield Movie": "Animation,Comedy",
    "Ratatouille": "Animation,Comedy",
    "Five Nights At Freddies": "Horror",
    "Back To The Future": "Sci-Fi,Adventure",
    "Mystic Pizza": "Comedy,Drama,Romance",
    "Iron Man": "Superhero,Action",
    "Life Of Pi": "Adventure,Drama",
    "Avengers": "Superhero,Action",
    "King Kong": "Action,Adventure,Drama",
    "Whiplash": "Drama,Music",
    "Inside Out": "Animation,Drama",
    "Superbad": "Comedy",
    "Stranger Things": "Sci-Fi,Horror,Drama",
    "Air Bud": "Family,Comedy",
    "Deadpool": "Action,Comedy",
    "A Quiet Place": "Horror,Thriller",
    "Finding Nemo": "Animation,Adventure",
    "My Cousin Vinny": "Comedy,Crime",
    "Eat Pray Love": "Drama,Romance",
    "Pulp Fiction": "Crime,Thriller",
    "Star Wars": "Sci-Fi,Adventure",
    "Batman": "Superhero,Action",
    "Shrek": "Animation,Comedy",
    "Scooby Doo": "Animation,Adventure",
    "Princess Diaries": "Comedy,Family,Romance",
    "Wall-E": "Animation,Sci-Fi",
    "The Hangover": "Comedy",
    "Breaking Bad": "Crime,Drama",
    "Interstellar": "Sci-Fi,Adventure,Drama",
    "Rush Hour": "Action,Comedy",
    "The Truman Show": "Drama,Sci-Fi",
    "Futurama": "Animation,Comedy,Sci-Fi",
    "Godfather": "Crime,Drama",
    # NOTE(review): the earlier duplicate entries for this title said
    # "Comedy,Political"; the last duplicate ("Comedy") is the value that
    # actually took effect, so it is kept here.  Confirm which is intended.
    "The Dictator": "Comedy",
    "Borat": "Comedy,Political",
    "Mission: Impossible": "Action,Thriller",
    "Avengers: Endgame": "Superhero,Action",
    "Titanic": "Drama,Romance",
    "Dangal": "Drama,Sports",
    "Kung Fu Panda": "Animation,Action",
    "The Mummy": "Action,Adventure",
    "The Invisible Guest": "Thriller,Mystery",
    "Squid Game": "Drama,Thriller",
    "Parasite": "Thriller,Drama",
    "Blade Runner": "Sci-Fi,Thriller",
    "Spider-Man: Into The Spider-Verse": "Animation,Action",
    "Everything Everywhere All At Once": "Sci-Fi,Action,Adventure",
    "Barbie": "Comedy,Family",
    "Jurassic Park": "Adventure,Sci-Fi",
    "Ponyo": "Animation,Fantasy",
    "My Neighbor Totoro": "Animation,Family",
    "Kill Bill": "Action,Thriller",
    "Jiro Dreams Of Sushi": "Documentary",
    "Naruto": "Animation,Action",
    "Frozen": "Animation,Adventure,Family",
    "Shawshank Redemption": "Drama",
    "Mad Max": "Action,Sci-Fi",
    "The Lion King": "Animation,Adventure,Drama",
    "Your Name": "Animation,Romance,Fantasy",
    "Memoirs Of A Geisha": "Drama,Romance",
    "Godzilla": "Action,Sci-Fi",
    "Shazam": "Superhero,Action",
    "The Grinch": "Animation,Comedy,Family",
    "Zootopia": "Animation,Adventure,Comedy",
    "The Godfather Part II": "Crime,Drama",
    "The Social Network": "Drama",
    "The Big Sick": "Comedy,Drama,Romance",
    "Die Hard": "Action,Thriller",
    "Taxi Driver": "Crime,Drama,Thriller",
    "Fast & Furious": "Action,Thriller",
    "The Karate Kid": "Drama,Family",
    "John Wick": "Action,Thriller",
    "Bladerunner": "Sci-Fi,Thriller",
    "Gone Girl": "Thriller,Drama",
    "Inception": "Sci-Fi,Thriller",
    "The Breakfast Club": "Comedy,Drama",
    "The Lego Movie": "Animation,Adventure,Comedy",
    "Spider-Man: Far From Home": "Superhero,Action",
    "Space Jam": "Animation,Comedy",
    "Spongebob": "Animation,Comedy",
    "Toy Story 4": "Animation,Adventure",
    "Green Book": "Drama",
    "Madagascar": "Animation,Adventure,Comedy",
    "The Mario Movie": "Animation,Adventure",
    "The Road To Fallujah": "Documentary",
    "Kingdom Of Heaven": "Action,Adventure,Drama",
    "The Boys": "Superhero,Action",
    "Wicked": "Musical,Fantasy",
    "Eternal Sunshine Of The Spotless Mind": "Drama,Romance,Sci-Fi",
    "Star Wars: The Last Jedi": "Sci-Fi,Action",
    "The Lego Movie 2": "Animation,Comedy,Adventure",
    "Dune": "Sci-Fi,Adventure",
}


# Function to categorize complexity (Q1)
def categorize_complexity(value):
    """Translate a 1-5 complexity rating (Q1) into a word label.

    Args:
        value: Anything ``int()`` accepts (int, float, numeric string).
            Floats are truncated toward zero by ``int()``, so 3.9 counts as 3.

    Returns:
        One of "very simple", "simple", "moderate", "complex",
        "very complex"; or "unknown" when the value cannot be converted to
        an integer or falls outside 1-5.
    """
    labels = {
        1: "very simple",
        2: "simple",
        3: "moderate",
        4: "complex",
        5: "very complex",
    }
    try:
        rating = int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN or non-numeric text; TypeError: None and other
        # non-numeric objects; OverflowError: +/-infinity, which int() rejects
        # with a different exception type than NaN.
        return "unknown"
    return labels.get(rating, "unknown")

# Function to categorize number of ingredients (Q2)
def categorize_ingredients(value):
    """Translate an ingredient-count answer (Q2) into a word label.

    The answer is converted to text and searched for numbers:

    * If it contains a hyphenated range such as "6-8" or "5 - 6 ingredients",
      the midpoint of the first such range is used.
    * Otherwise the first run of digits anywhere in the text is used, so a
      hyphen that is not part of a numeric range (e.g. "semi-sweet, 3") no
      longer prevents the number from being found.

    Args:
        value: The raw answer (number, string, NaN, None, ...).

    Returns:
        "very few" (<= 3), "few" (<= 6), "moderate" (<= 10), "many" (<= 20),
        "very many" (> 20), or "unknown" when no digits are present.
    """
    try:
        text = str(value).lower()

        # Only treat a hyphen as a range separator when it sits between two
        # numbers; splitting on every "-" broke answers with trailing words
        # or hyphenated words.
        range_match = re.search(r'(\d+)\s*-\s*(\d+)', text)
        if range_match:
            low, high = int(range_match.group(1)), int(range_match.group(2))
            count = (low + high) / 2
        else:
            number_match = re.search(r'(\d+)', text)
            if number_match is None:
                return "unknown"
            count = int(number_match.group(1))
    except (ValueError, TypeError):
        return "unknown"

    # Upper bounds are inclusive and checked in ascending order.
    for upper_bound, label in (
        (3, "very few"),
        (6, "few"),
        (10, "moderate"),
        (20, "many"),
    ):
        if count <= upper_bound:
            return label
    return "very many"

# Function to categorize price (Q4)
def categorize_price(value):
    """Translate a price answer (Q4) into a word label.

    The first integer or decimal number found in the text form of ``value``
    decides the bucket: "very cheap" (<= 5), "cheap" (<= 10),
    "moderate" (<= 15), "expensive" (<= 25) or "very expensive" (> 25).
    Missing values ("nan", "none", blank) and answers without any digits
    yield "unknown".
    """
    # Inclusive upper bounds, checked from the lowest bucket upwards.
    buckets = (
        (5, "very cheap"),
        (10, "cheap"),
        (15, "moderate"),
        (25, "expensive"),
    )
    try:
        text = str(value).lower()
        if text in ('nan', 'none') or not text.strip():
            return "unknown"

        # Pull out the first number, with an optional decimal part.
        found = re.search(r'(\d+(?:\.\d+)?)', text)
        if found is None:
            return "unknown"

        amount = float(found.group(1))
        for ceiling, label in buckets:
            if amount <= ceiling:
                return label
        return "very expensive"
    except (ValueError, TypeError):
        return "unknown"

# Function to categorize movies (Q5) into genres
def categorize_movie_genre(movie_title, genre_map=None):
    """Translate a movie-title answer (Q5) into a genre string.

    Lookup order:

    1. Missing answers (None, blank, "none"/"nan" in any letter case) give
       "no_movie".
    2. An exact, case-sensitive match on the whitespace-stripped title.
    3. Known titles that occur inside the answer; the longest one wins, so
       "Spider-Man: Into The Spider-Verse 2" resolves to the Spider-Verse
       entry rather than to whichever shorter key ("Spider-Man") happens to
       come first in the table.
    4. The first known title (in table order) that contains the answer as a
       fragment, e.g. "Lego Movie" -> "The Lego Movie".
    5. Otherwise "other".

    Args:
        movie_title: The raw answer (string, NaN, None, ...).
        genre_map: Optional mapping of title -> genre string.  Defaults to
            the module-level ``movie_genres`` table.

    Returns:
        The genre string from the mapping, "no_movie", or "other".
    """
    if genre_map is None:
        genre_map = movie_genres

    if movie_title is None:
        return "no_movie"

    # Normalize: text form with surrounding whitespace removed.
    title = str(movie_title).strip()
    if not title or title.lower() in ("none", "nan"):
        return "no_movie"

    # Exact match.
    if title in genre_map:
        return genre_map[title]

    # Known titles embedded in the answer: prefer the most specific (longest)
    # key; max() keeps the earliest key on ties.
    embedded = [known for known in genre_map if known in title]
    if embedded:
        return genre_map[max(embedded, key=len)]

    # The answer is only a fragment of a known title: first hit in table order.
    for known, genre in genre_map.items():
        if title in known:
            return genre

    return "other"

# Main function to process the data
def _find_question_column(columns, tag):
    """Return the first column label containing ``tag`` not followed by a digit.

    The digit guard keeps "Q1" from matching headers such as "Q10 ...".
    Labels are compared via ``str()`` so non-string labels cannot raise.
    Returns None when no column matches.
    """
    pattern = re.compile(re.escape(tag) + r'(?!\d)')
    return next((col for col in columns if pattern.search(str(col))), None)


def clean_data(input_file, output_file, delimiter='\t'):
    """Read a delimited file, replace the Q1/Q2/Q4/Q5 answers with word
    labels, and write the result as CSV.

    The Q1, Q2, Q4 and Q5 columns are located by tag within the header text
    and overwritten in place with the output of categorize_complexity,
    categorize_ingredients, categorize_price and categorize_movie_genre
    respectively.  All other columns are written back unchanged.

    Args:
        input_file: Path of the file to read.
        output_file: Path of the CSV to write (pandas' default comma
            separator, no index column), regardless of ``delimiter``.
        delimiter: Field separator of ``input_file``; tab by default.

    Returns:
        None.  Progress and errors are reported with ``print``; no exception
        is propagated to the caller, and nothing is written when a required
        column is missing or reading fails.
    """
    try:
        # Read the input file
        print(f"Reading data from {input_file}...")
        df = pd.read_csv(input_file, delimiter=delimiter)

        # Pair each question tag with the function that relabels its column.
        transforms = (
            ('Q1', categorize_complexity),
            ('Q2', categorize_ingredients),
            ('Q4', categorize_price),
            ('Q5', categorize_movie_genre),
        )
        targets = [
            (_find_question_column(df.columns, tag), transform)
            for tag, transform in transforms
        ]

        if any(column is None for column, _ in targets):
            print("Error: Could not find all required question columns (Q1, Q2, Q4, Q5)")
            return

        print("Transforming data...")
        for column, transform in targets:
            df[column] = df[column].apply(transform)

        # Save the processed data
        print(f"Saving cleaned data to {output_file}...")
        df.to_csv(output_file, index=False)
        print(f"Data cleaning completed successfully! File saved as {output_file}")

    except FileNotFoundError:
        print(f"Error: The file {input_file} was not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file {input_file} is empty.")
    except pd.errors.ParserError:
        print(f"Error: The file {input_file} could not be parsed. Check the delimiter.")
    except Exception as e:
        # Last-resort boundary for interactive use: report instead of raising.
        print(f"An unexpected error occurred: {str(e)}")

# Colab-only: attach Google Drive so the data files below are reachable.
from google.colab import drive

drive.mount('/content/drive')

# The combined file is comma-separated, so clean_data's tab default is
# overridden explicitly.
clean_data(
    input_file="/content/drive/MyDrive/csc311/combined_data.csv",
    output_file="/content/drive/MyDrive/csc311/worded_data.csv",
    delimiter=",",
)

Mounted at /content/drive
Reading data from /content/drive/MyDrive/csc311/combined_data.csv...
Transforming data...
Saving cleaned data to /content/drive/MyDrive/csc311/worded_data.csv...
Data cleaning completed successfully! File saved as /content/drive/MyDrive/csc311/worded_data.csv
