<h1><b>Data Extraction and Data Transformation</b> </h1>

In [None]:
# Use 'pip' to install the 'beautifulsoup4' package
# This package includes the 'bs4' library for web scraping with BeautifulSoup
!pip install bs4

In [None]:
# Use 'pip' to install the 'pandas' library
# Pandas is a powerful data manipulation and analysis library in Python
!pip install pandas

In [None]:
# Use 'pip' to install the 'tabulate' library
# Tabulate is used for formatting and displaying tabular data
!pip install tabulate

In [4]:
# Import urllib.request under the alias 'urllib2' (the Python 2 module name) so the fetch code can call 'urllib2.urlopen'
import urllib.request as urllib2

# Import BeautifulSoup for parsing HTML content
from bs4 import BeautifulSoup

# Import the 'tabulate' function from the 'tabulate' library
from tabulate import tabulate

# Import the Pandas library for data manipulation and analysis
import pandas as pd

In [6]:
# Send an HTTP GET request to the URL and read the HTML content of the response:
# - The 'with' statement closes the HTTP connection as soon as the body has been read,
#   even if reading raises an exception
# - 'timeout' (in seconds) stops the notebook from hanging indefinitely if the server never answers
# - 'response' stays bound after the block (now closed); 'html_doc' holds the raw page bytes
with urllib2.urlopen('https://www.rottentomatoes.com/browse/movies_at_home/', timeout=30) as response:
    html_doc = response.read()

In [7]:
# Parse the downloaded HTML into a navigable BeautifulSoup tree:
# - 'markup' is the raw page content read from the HTTP response
# - 'features' selects the parser; 'html.parser' is the one bundled with Python
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [None]:
# Preview the beginning of the parsed HTML, re-indented by 'prettify()':
# - 'print' renders newlines as real line breaks; a bare expression would show the
#   string's repr with '\n' escapes, which defeats the purpose of prettifying
# - Only the first 2,000 characters are shown so the cell output stays readable
print(soup.prettify()[:2000])

In [10]:
# Attributes that identify a movie caption element in the HTML document
# (defined once so the 'div' and 'a' searches below cannot drift apart)
caption_attrs = {"data-track": "scores", "data-qa": "discovery-media-list-item-caption"}

# Initialize an empty list 'movie_data' to store one dictionary per movie
movie_data = []

# Loop through every caption element, searching both 'div' and 'a' tags:
# all matching 'div' elements come first, followed by all matching 'a' elements
for movie in soup.find_all('div', attrs=caption_attrs) + soup.find_all('a', attrs=caption_attrs):

    # Extract the movie's name:
    # - Locate the 'span' with "data-qa" set to "discovery-media-list-item-title"
    # - 'find' returns None when nothing matches, so a caption without a title is
    #   skipped instead of raising AttributeError and aborting the whole scrape
    title_tag = movie.find('span', attrs={"data-qa": "discovery-media-list-item-title"})
    if title_tag is None:
        continue
    movie_name = title_tag.text.strip()

    # Extract both scores from the 'score-pairs' element (looked up once):
    # - 'audiencescore' attribute -> 'Audience Score'
    # - 'criticsscore' attribute  -> 'Tomatometer Rating'
    # - If the element itself is missing, both scores are treated as unavailable
    scores_tag = movie.find('score-pairs')
    audience_score = scores_tag.get('audiencescore') if scores_tag is not None else None
    tomatometer = scores_tag.get('criticsscore') if scores_tag is not None else None

    # Append a dictionary representing the movie to 'movie_data':
    # - A missing or empty score is replaced by a hyphen ('-') placeholder
    movie_data.append({
        'Movie Name': movie_name,
        'Audience Score': audience_score or '-',
        'Tomatometer Rating': tomatometer or '-'
    })

In [None]:
# Display the collected list of movie dictionaries to sanity-check the extraction
movie_data

<h1><b>Data Presentation</b> </h1>

In [14]:
# Create a DataFrame 'movie_df' from the 'movie_data' list:
# - 'pd.DataFrame()' builds a tabular data structure with one row per movie dictionary
# - 'columns' pins the three-column layout and its order explicitly, so the frame keeps
#   these columns even when 'movie_data' is empty (the table cell aligns exactly three columns)
movie_df = pd.DataFrame(movie_data, columns=['Movie Name', 'Audience Score', 'Tomatometer Rating'])

In [17]:
# Render 'movie_df' as a text table and keep the result in 'movie_data_table'.
# Arguments passed to 'tabulate':
# - movie_df: the DataFrame holding the movie information
# - headers='keys': take the column headers from the DataFrame's column names
# - tablefmt='fancy_grid': draw the table in the fancy grid style
# - showindex=False: leave the DataFrame index out of the table
# - colalign: left-align the first column, right-align the two score columns
movie_data_table = tabulate(
    movie_df,
    headers='keys',
    tablefmt='fancy_grid',
    showindex=False,
    colalign=("left", "right", "right"),
)

In [None]:
# Print the 'movie_data_table':
# - 'print' is used (rather than a bare expression) so the table's line breaks are rendered
#   instead of being shown as escape sequences in the string's repr
# - This is the final step of the notebook: the user sees the organized movie data
print(movie_data_table)