<a href="https://colab.research.google.com/github/jshanmu1/INST326-Group-Project/blob/main/Project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#clean reviews
def clean_review(review):
    """Clean a list of movie reviews by dropping unusable entries and duplicates.

    The list is walked once and only non-blank strings are kept, in order of
    first occurrence:

    1. Anything that is not a string (None, 0, [], {}, numbers, ...) is dropped.
    2. Empty or whitespace-only strings are dropped.
    3. Exact duplicate strings are dropped after their first occurrence.

    Kept reviews are returned unmodified (no stripping or case folding), so
    "Great movie!" and "great movie!" count as two different reviews.

    Args:
        review (list): Raw movie reviews, possibly mixed with other data types.

    Returns:
        list: The cleaned list of review strings.

    Raises:
        TypeError: If 'review' is not a list.

    Example:
        movie_reviews = ["Great movie!", None, "So-so.", "Great movie!", ""]
        clean_review(movie_reviews)
        # Output: ['Great movie!', 'So-so.']
    """
    if not isinstance(review, list):
        raise TypeError("Input must be a list.")

    cleaned_reviews = []
    seen_reviews = set()

    for rev in review:
        # Only text can be a review; this also covers None, 0, [] and {}.
        if not isinstance(rev, str):
            continue
        # Blank or whitespace-only text carries no review content.
        if not rev.strip():
            continue
        # Keep the first occurrence only.
        if rev in seen_reviews:
            continue
        cleaned_reviews.append(rev)
        seen_reviews.add(rev)

    return cleaned_reviews

In [None]:
#summarize_plot
def summarize_plot(plot, max_length=100):
    """Summarize a movie plot by truncating it to a maximum length.

    A plot that already fits within 'max_length' is returned unchanged.
    A longer plot is cut so that the result, including a trailing ellipsis
    ("..."), is exactly 'max_length' characters long. When 'max_length' is
    too small to hold the ellipsis at all (1 or 2), the plot is simply cut
    to 'max_length' characters so the result never exceeds the limit.

    Args:
        plot (str): The original movie plot.
        max_length (int): The maximum length of the summarized plot.
            Default is 100 characters.

    Returns:
        str: The summarized movie plot, never longer than 'max_length'.

    Raises:
        TypeError: If 'plot' is not a string or 'max_length' is not an integer.
        ValueError: If 'max_length' is less than or equal to 0.

    Example:
        original_plot = "In a world where technology has advanced beyond imagination, a young hero rises to challenge the status quo and bring balance to society."
        summarized_plot = summarize_plot(original_plot, max_length=50)
        print(summarized_plot)
        # Output: "In a world where technology has advanced beyond..."
    """
    if not isinstance(plot, str):
        raise TypeError("Plot must be a string.")
    if not isinstance(max_length, int):
        raise TypeError("Max length must be an integer.")
    if max_length <= 0:
        raise ValueError("Max length must be greater than 0.")

    if len(plot) <= max_length:
        return plot

    ellipsis = "..."
    # With fewer characters than the ellipsis itself, `max_length - 3` would
    # be negative and slice from the end of the string, producing a result
    # longer than the limit. Fall back to a plain cut instead.
    if max_length < len(ellipsis):
        return plot[:max_length]
    return plot[:max_length - len(ellipsis)] + ellipsis

In [None]:
#average_rating
def average_rating(ratings):
    """Calculate the average rating from a list of ratings.

    Only int and float entries are averaged. Every other value (None,
    strings, containers, ...) is ignored. Booleans are ignored too: bool is
    a subclass of int in Python, so without an explicit check True/False
    would silently be averaged in as 1/0.

    Args:
        ratings (list): A list of numerical ratings (int or float), possibly
            mixed with invalid entries.

    Returns:
        float: The average rating, or 0.0 if there are no valid ratings.

    Raises:
        TypeError: If 'ratings' is not a list.

    Example:
        movie_ratings = [4.5, 3.0, 5.0, None, "bad", 4.0]
        avg_rating = average_rating(movie_ratings)
        print(avg_rating)
        # Output: 4.125
    """
    if not isinstance(ratings, list):
        raise TypeError("Input must be a list.")

    valid_ratings = [
        rating
        for rating in ratings
        if isinstance(rating, (int, float)) and not isinstance(rating, bool)
    ]

    if not valid_ratings:
        return 0.0
    return sum(valid_ratings) / len(valid_ratings)

In [None]:
#is_positive
def is_positive(review):
    """Report whether a movie review contains any positive keyword.

    The review is lower-cased and searched for each keyword as a plain
    substring, so a keyword embedded in a longer word also counts
    (e.g. "loved" contains "love", "goodbye" contains "good").

    Args:
        review (str): The movie review text.

    Returns:
        bool: True if at least one positive keyword occurs in the review,
        False otherwise.

    Raises:
        TypeError: If 'review' is not a string.
    """
    if not isinstance(review, str):
        raise TypeError("Review must be a string.")

    positive_terms = (
        "good",
        "great",
        "excellent",
        "amazing",
        "fantastic",
        "love",
        "wonderful",
        "best",
        "awesome",
        "positive",
    )
    lowered = review.lower()
    return any(term in lowered for term in positive_terms)

In [None]:
import csv 
def _parse_int(value):
    """Return 'value' as an int, or None when it cannot be read as a number."""
    try:
        return int(value)
    except (TypeError, ValueError):
        pass
    # Accept float-formatted integers such as "2015.0" or "7.0".
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return None


def load_db(path, min_year=2010, max_year=2025, min_votes=1):
    """Load and filter TMDB movies from a CSV file.

    Each row is read as a dictionary keyed by the CSV header. A row is kept
    only when its release year lies within [min_year, max_year] and its
    vote count is at least 'min_votes'.

    The release year is taken from the "release_year" column when it holds a
    usable number; otherwise from the first four characters of the
    "release_date" column when they are digits. Rows with no usable year are
    dropped. A missing or unreadable "vote_count" is treated as 0.

    Args:
        path (str): Path to the CSV file.
        min_year (int): Earliest release year to keep. Default is 2010.
        max_year (int): Latest release year to keep. Default is 2025.
        min_votes (int): Minimum vote count to keep. Default is 1.

    Returns:
        list: The filtered rows as dictionaries, in file order.

    Raises:
        TypeError: If path is not a string.
        FileNotFoundError: If the file does not exist.
    """
    if not isinstance(path, str):
        raise TypeError("path must be a string")

    rows = []
    # "utf-8-sig" drops a leading byte-order mark if the file has one, so the
    # first header name is not polluted by it.
    with open(path, "r", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            year = _parse_int(row.get("release_year"))
            if year is None:
                # Short rows yield None for missing columns; normalize to "".
                release_date = row.get("release_date") or ""
                if len(release_date) >= 4 and release_date[:4].isdigit():
                    year = _parse_int(release_date[:4])

            votes = _parse_int(row.get("vote_count"))
            if votes is None:
                votes = 0

            if year is not None and min_year <= year <= max_year and votes >= min_votes:
                rows.append(row)

    return rows

In [None]:

#Second function
def fetch_tmdb_movie_reviews(title, movie_rows):
    """
    Create a reviews list from the CSV rows that match a movie title.

    Matching is case-insensitive and by substring: a row matches when the
    stripped, lower-cased 'title' occurs anywhere in the row's "title"
    (or "original_title" when "title" is empty). An empty or whitespace-only
    'title' matches nothing, since an empty string is a substring of every
    title and would otherwise return every row.

    Uses the movie overview as the review text and vote_average as the rating.

    Args:
        title (str): Movie title (or part of it) to match.
        movie_rows (list): Rows from load_db(), each a dictionary. Entries
            that are not dictionaries are skipped.

    Returns:
        List of dicts: {"author": "TMDB users", "content": <overview>, "author_details": {"rating": <float or None>}}

    Raises:
        TypeError: If the inputs are invalid types.
    """
    if not isinstance(title, str):
        raise TypeError("title must be a string")
    if not isinstance(movie_rows, list):
        raise TypeError("movie_rows must be a list")

    query = title.strip().lower()
    if not query:
        return []

    found = []
    for row in movie_rows:
        if not isinstance(row, dict):
            continue

        row_title = (row.get("title") or row.get("original_title") or "").strip()
        if not row_title:
            continue
        if query not in row_title.lower():
            continue

        overview = (row.get("overview") or "").strip()

        rating = None
        vote_average = row.get("vote_average")
        if vote_average not in (None, ""):
            try:
                rating = float(vote_average)
            except (TypeError, ValueError):
                # Unreadable rating: keep the review, leave the rating unset.
                rating = None

        found.append({
            "author": "TMDB users",
            "content": overview,
            "author_details": {"rating": rating},
        })

    return found


In [None]:
#Third function 
def normalize_tmdb_reviews(reviews):
    """
    Convert review dicts into list form.

    Input:  [{"author": , "content": , "author_details": {"rating": X}}]
    Output: [["TMDB users", "some text", X],]

    Entries that are not dictionaries are skipped. A missing "author"
    defaults to "TMDB users". Content is stripped of surrounding whitespace;
    missing or None content becomes "" and non-string content is converted
    with str(). The rating is None when "author_details" is not a dictionary
    or has no "rating" key.

    Args:
        reviews: list of review dicts.

    Returns:
        List of [author, content, rating]

    Raises:
        TypeError: If reviews is not a list.
    """
    if not isinstance(reviews, list):
        raise TypeError("reviews must be a type of list")

    clean = []
    for item in reviews:
        if not isinstance(item, dict):
            continue

        author = item.get("author", "TMDB users")

        # dict.get only applies its default when the key is absent, so a
        # present-but-None content must be handled explicitly before .strip().
        content = item.get("content")
        if content is None:
            content = ""
        elif not isinstance(content, str):
            content = str(content)
        content = content.strip()

        details = item.get("author_details", {})
        rating = details.get("rating") if isinstance(details, dict) else None

        clean.append([author, content, rating])

    return clean



In [None]:
#Fourth function 
def export_reviews_to_csv(reviews, filename):
    """
    Write reviews to a CSV file under an "Author", "Content", "Rating" header.

    Only entries that are lists or tuples with at least three items are
    written; their first three items become the row. Any other entry is
    skipped without error. The file is created or overwritten, encoded as
    UTF-8.

    Args:
        reviews (list): Rows of [author, content, rating].
        filename (str): Output CSV path.

    Returns:
        None

    Raises:
        TypeError: If inputs are wrong types.
        ValueError: If reviews is empty.
    """
    if not isinstance(reviews, list):
        raise TypeError("reviews must be in a list")
    if not isinstance(filename, str):
        raise TypeError("filename must be a string")
    if not reviews:
        raise ValueError("no reviews to export")

    header = ["Author", "Content", "Rating"]
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        csv_writer.writerows(
            [entry[0], entry[1], entry[2]]
            for entry in reviews
            if isinstance(entry, (list, tuple)) and len(entry) >= 3
        )