<a href="https://colab.research.google.com/github/jshanmu1/INST326-Group-Project/blob/main/Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#clean reviews
def clean_review(review):
    """Clean a list of movie reviews by dropping missing entries and duplicates.

    Only non-empty strings are kept. Every other entry (``None``, ``""``,
    ``0``, empty lists/dicts, or any other non-string value) is treated as
    missing or invalid and skipped. Duplicate review strings are removed,
    keeping the first occurrence, so the original order is preserved.

    Args:
        review (list): Raw movie reviews, possibly containing mixed data types.

    Returns:
        list: The unique, non-empty review strings in first-seen order.

    Raises:
        TypeError: If 'review' is not a list.

    Example:
        >>> movie_reviews = ["Great movie!", None, "So-so.", "Great movie!", ""]
        >>> clean_review(movie_reviews)
        ['Great movie!', 'So-so.']
    """
    if not isinstance(review, list):
        raise TypeError("Input must be a list.")

    # Check the type first instead of testing `entry in (None, "", 0, [], {})`:
    # that membership test calls `==` on arbitrary objects, which raises for
    # values whose comparison is not a plain bool (e.g. NumPy arrays or
    # pandas.NA). Non-strings are discarded anyway, so the type check alone
    # is sufficient.
    valid = (entry for entry in review if isinstance(entry, str) and entry)

    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    return list(dict.fromkeys(valid))

In [None]:
#summarize_plot
def summarize_plot(plot, max_length=100):
    """Summarize a movie plot by truncating it to a maximum length.

    A plot that already fits within 'max_length' characters is returned
    unchanged. A longer plot is cut and suffixed with an ellipsis ("...") so
    that the result is never longer than 'max_length'. When 'max_length' is
    smaller than the ellipsis itself (1 or 2), the result is just the part of
    the ellipsis that fits.

    Args:
        plot (str): The original movie plot.
        max_length (int): The maximum length of the summarized plot.
            Default is 100 characters.

    Returns:
        str: The summarized movie plot, at most 'max_length' characters long.

    Raises:
        TypeError: If 'plot' is not a string or 'max_length' is not an integer.
        ValueError: If 'max_length' is less than or equal to 0.

    Example:
        >>> original_plot = "In a world where technology has advanced beyond imagination, a young hero rises to challenge the status quo and bring balance to society."
        >>> summarize_plot(original_plot, max_length=50)
        'In a world where technology has advanced beyond...'
    """
    if not isinstance(plot, str):
        raise TypeError("Plot must be a string.")
    if not isinstance(max_length, int):
        raise TypeError("Max length must be an integer.")
    if max_length <= 0:
        raise ValueError("Max length must be greater than 0.")

    if len(plot) <= max_length:
        return plot

    ellipsis = "..."
    # Clamp at zero: for max_length < 3 a negative slice end would keep almost
    # the whole plot and produce a result longer than the requested limit.
    keep = max(max_length - len(ellipsis), 0)
    # Clip the ellipsis as well so the limit holds even for max_length 1 or 2.
    return plot[:keep] + ellipsis[:max_length]

In [None]:
#average_rating
def average_rating(ratings):
    """Calculate the average of the valid numeric ratings in a list.

    Entries that are not usable ratings are ignored: non-numeric values
    (``None``, strings, ...), booleans, and non-finite floats (NaN, infinity).

    Args:
        ratings (list): Ratings (int or float), possibly mixed with invalid
            entries.

    Returns:
        float: The average rating, or 0.0 if there are no valid ratings.

    Raises:
        TypeError: If 'ratings' is not a list.

    Example:
        >>> movie_ratings = [4.5, 3.0, 5.0, None, "bad", 4.0]
        >>> average_rating(movie_ratings)
        4.125
    """
    # Imported locally so the function stays self-contained in its own cell.
    import math

    if not isinstance(ratings, list):
        raise TypeError("Input must be a list.")

    valid = [
        rating
        for rating in ratings
        # bool is a subclass of int; True/False are flags, not ratings.
        if not isinstance(rating, bool)
        and (
            isinstance(rating, int)
            # A single NaN or infinity would otherwise poison the average.
            or (isinstance(rating, float) and math.isfinite(rating))
        )
    ]

    if not valid:
        return 0.0
    return sum(valid) / len(valid)

In [None]:
#is_positive
def is_positive(review):
    """Report whether a movie review contains any positive keyword.

    The review is lower-cased and each predefined keyword is looked up as a
    plain substring, so a keyword embedded in a longer word also counts
    (for example, "goodbye" contains "good").

    Args:
        review (str): The movie review text.

    Returns:
        bool: True if at least one positive keyword occurs in the review,
        False otherwise.

    Raises:
        TypeError: If 'review' is not a string.
    """
    if not isinstance(review, str):
        raise TypeError("Review must be a string.")

    keywords = (
        "good",
        "great",
        "excellent",
        "amazing",
        "fantastic",
        "love",
        "wonderful",
        "best",
        "awesome",
        "positive",
    )
    lowered = review.lower()
    return any(word in lowered for word in keywords)

In [None]:
class DataClean:
    """Represents a dataset cleaning tool for movie reviews and ratings.

    This class provides methods to clean review data, calculate average ratings,
    and generate summaries. It integrates multiple data-cleaning operations into
    one cohesive object. Results of the most recent `clean_reviews()` and
    `average_rating()` calls are remembered and used by `summary()` and `str()`.

    Example:
        >>> data = {
        ...     "reviews": ["Great movie!", None, "So-so.", "Great movie!", ""],
        ...     "ratings": [4.5, 3.0, 5.0, None, "bad", 4.0]
        ... }
        >>> cleaner = DataClean(data)
        >>> cleaner.clean_reviews()
        ['Great movie!', 'So-so.']
        >>> cleaner.average_rating()
        4.125
        >>> print(cleaner)
        Dataset with 2 cleaned reviews and average rating 4.12
    """

    def __init__(self, data: dict):
        """Initialize the DataClean instance.

        Args:
            data (dict): Dictionary containing at least 'reviews' and/or 'ratings' keys.

        Raises:
            TypeError: If data is not a dictionary.
            ValueError: If neither 'reviews' nor 'ratings' is present.
        """
        if not isinstance(data, dict):
            raise TypeError("Data must be provided as a dictionary.")
        if "reviews" not in data and "ratings" not in data:
            raise ValueError("Data must contain at least 'reviews' or 'ratings' keys.")

        self._data = data
        # Both stay None until the corresponding method has been called.
        self._cleaned_reviews = None
        self._average_rating = None

    # ----- Properties -----
    @property
    def data(self) -> dict:
        """dict: Get a copy of the dataset.

        List values are copied as well, so appending to or removing from the
        returned lists does not alter the data held by this instance.
        """
        # A plain dict.copy() is shallow: the returned dict would share its
        # 'reviews'/'ratings' lists with self._data.
        return {
            key: list(value) if isinstance(value, list) else value
            for key, value in self._data.items()
        }

    @property
    def cleaned_reviews(self):
        """list | None: Returns the most recently cleaned reviews (if available)."""
        return self._cleaned_reviews

    @property
    def avg_rating(self):
        """float | None: Returns the last computed average rating."""
        return self._average_rating

    # ----- Methods -----
    def clean_reviews(self) -> list:
        """Cleans the review data by removing empty, duplicate, or invalid entries.

        Only non-empty strings are kept; duplicates are dropped, keeping the
        first occurrence. The result is also stored for `cleaned_reviews`,
        `summary()` and `str()`.

        Returns:
            list: A cleaned list of reviews.

        Raises:
            TypeError: If 'reviews' in data is not a list.
        """
        reviews = self._data.get("reviews", [])
        if not isinstance(reviews, list):
            raise TypeError("Expected 'reviews' to be a list.")

        # Type-check first instead of `rev in (None, "", 0, [], {})`: that
        # membership test calls `==` on arbitrary objects and raises for
        # values whose comparison is not a plain bool (e.g. NumPy arrays).
        valid = (rev for rev in reviews if isinstance(rev, str) and rev)

        # dict.fromkeys keeps the first occurrence of each key, in order.
        self._cleaned_reviews = list(dict.fromkeys(valid))
        return self._cleaned_reviews

    def average_rating(self) -> float:
        """Calculates the average rating from numeric values in the dataset.

        Non-numeric entries, booleans, and non-finite floats (NaN, infinity)
        are ignored. The result is also stored for `avg_rating`, `summary()`
        and `str()`.

        Returns:
            float: The average rating, or 0.0 if no valid ratings exist.

        Raises:
            TypeError: If 'ratings' in data is not a list.
        """
        # Imported locally so the class stays self-contained in its own cell.
        import math

        ratings = self._data.get("ratings", [])
        if not isinstance(ratings, list):
            raise TypeError("Expected 'ratings' to be a list.")

        valid = [
            rating
            for rating in ratings
            # bool is a subclass of int; True/False are flags, not ratings.
            if not isinstance(rating, bool)
            and (
                isinstance(rating, int)
                # A single NaN or infinity would otherwise poison the average.
                or (isinstance(rating, float) and math.isfinite(rating))
            )
        ]

        self._average_rating = sum(valid) / len(valid) if valid else 0.0
        return self._average_rating

    def summary(self) -> str:
        """Generates a formatted summary of the cleaned data.

        Uses the stored results of the last `clean_reviews()` and
        `average_rating()` calls; it does not trigger them. Without those
        calls the review count is 0 and the average is shown as "N/A".

        Returns:
            str: Summary report including number of reviews and average rating.
        """
        reviews = self._cleaned_reviews or []
        avg = self._average_rating if self._average_rating is not None else "N/A"
        return f"Cleaned {len(reviews)} reviews. Average rating: {avg}"

    # ----- String Representations -----
    def __str__(self) -> str:
        """Return a one-line description using the stored cleaning results."""
        reviews_count = len(self._cleaned_reviews) if self._cleaned_reviews else 0
        avg = f"{self._average_rating:.2f}" if self._average_rating is not None else "N/A"
        return f"Dataset with {reviews_count} cleaned reviews and average rating {avg}"

    def __repr__(self) -> str:
        """Return a constructor-style representation showing the raw data."""
        return f"DataClean(data={self._data!r})"
