In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# TMdb movie URL 
tmdb_movies_url = 'https://www.themoviedb.org/movie'

In [6]:
# The movie page is downloaded using 'requests`
response = requests.get(tmdb_movies_url)
# Check if the request was successful 
response.status_code=200

In [7]:
page_contents = response.text
page_contents[:500]

'<!DOCTYPE html>\n<html lang="en" class="no-js">\n  <head>\n    <title>Request Error (403) - The Movie Database (TMDb)</title>\n    <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n    <meta http-equiv="cleartype" content="on">\n    <meta charset="utf-8">\n    <meta name="robots" content="noindex">\n    <meta name="mobile-web-app-capable" content="yes">\n    <meta name="apple-mobile-web-app-capable" content="yes">\n    <meta name="HandheldFriendly" content="True">\n    <meta name="MobileOptimized" c'

In [8]:
with open ('tmdb_movie.html', 'w') as f:
    f.write(page_contents)

In [9]:
doc = BeautifulSoup(page_contents, 'html.parser')

In [10]:
def get_movies_page():
    """
    Function to download a web page using `requests` and check the status code to validate
    if the call was successful. 
    """
    movies_url = 'https://www.themoviedb.org/movie'
    # Access the webpage using `requests`
    response = requests.get(movies_url)
    # Check if the request was successful
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(movies_url))
    # Parse the `response' text using BeautifulSoup
    movies_doc = BeautifulSoup(response.text, 'html.parser')
    return movies_doc

In [21]:
def get_movies_info(doc):
    """
    Function to get the movie informations - 
    release date, genre, runtime and director.
    """
    div1_tags = doc.find('div', class_ = 'facts')
    release_date = div1_tags.text.split()[1]
    genre = div1_tags.text.split()[3:-2]
    runtime = div1_tags.text.split()[-2:]
    
    div2_tags = doc.find_all('div', {'class':'scroller_wrap should_fade is_fading'})
    director = div2_tags[0].text.strip().partition("\n")[0]
    
    return release_date, genre, runtime, director

In [11]:
movies_names_tags = doc.find_all('h2')[4:]  #Exclude the first 4 lines
names = []
for h2 in movies_names_tags:
    names.append(h2.a.text.strip())
print(names)

[]


In [12]:
links = []
for h2 in movies_names_tags:
    links.append(h2.a['href'])
print(links)

[]


In [13]:
def get_movies_names(doc):
    """
    Function to extract the movie names from HTML source code using BeautifulSoup.
    """
    movies_names_tags = doc.find_all('h2')[4:]  #Exclude the first 4 lines
    movies_names = []
    # Loop through the page get all the movie names from the page
    for h2 in movies_names_tags:
        movies_names.append(h2.a.text.strip())
    return movies_names

In [14]:
# Get the popular movie list from the webpage using the BeautifulSoup object `doc`. 
get_movies_names(doc)

[]

In [15]:
def get_movies_rating(doc):
    """
    Function to extract the movie user rating from HTML source code using the BeautifulSoup. 
    """
    desc_selector = 'user_score_chart'
    movies_rating_tags = doc.find_all('div', {'class': desc_selector})
    movies_rating = []
    # Loop through the webpage to get the ratings of all the movies in the page
    for tag in movies_rating_tags:
        movies_rating.append(tag.attrs['data-percent'])
    return movies_rating

In [16]:
# Get the ratings of each movies in the webpage using the BeautifulSoup object `doc`. 
get_movies_rating(doc)

[]

In [17]:
def get_movies_urls(doc):
    """
    Function to extract the movie links from HTML source code using BeautifulSoup. 
    """
    movies_urls = []
    base_url = 'https://www.themoviedb.org'
    movies_names_tags = doc.find_all('h2')[4:]  #Exclude the first 4 lines
    # Loop through the webpage to get the URL of each movie
    for tag in movies_names_tags:
        movies_urls.append(base_url + tag.a['href'])
    return movies_urls

In [18]:
# Get the URLS of each movies in the webpage using the BeautifulSoup object `doc`. 
get_movies_urls(doc)

[]

In [19]:
# Let's read a movie page
def get_detailed_movie_page(movies_url):
    """
    Function to read the HTML source code using BeautifulSoup.
    """
    # Download the page
    response = requests.get(movies_url)
    # Check successful response
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(movies_url))
    # Parse using Beautiful soup
    movies_doc = BeautifulSoup(response.text, 'html.parser')
    return movies_doc

In [20]:
# Call the `get_movies_info` for movie `Below Zero`.
get_movies_info(doc2)

NameError: name 'get_movies_info' is not defined

In [22]:
# Call the `get_movies_info` for movie `Godzilla vs. Kong`.
get_movies_info(doc1) 

NameError: name 'doc1' is not defined

In [23]:
# Find the `div` tag under `facts` class to get the release date, genre and runtime 
div_tags = doc2.find('div', class_ = 'facts')

release_date = div_tags.text.split()[1]
genre = div_tags.text.split()[3:-2]
runtime = div_tags.text.split()[-2:]

# Print and validate the result is correct
print(release_date, genre, runtime)

NameError: name 'doc2' is not defined

In [24]:
def get_all_movies_details(urls):
    """
    Function to get lists of movie information as lists from all the pages. 
    """
    genres = []
    release_dates = []
    runtimes = []
    directors = []
    
    # Loop through all the urls of the the movies 
    for url in urls:
        movie_doc = get_movies_page(url)
        # get_movies_info returns release_date, genre, runtime, director.
        release_date, genre, runtime, director = get_movies_info(movie_doc)
        # Convert the genre list to string on ` `. 
        genres.append(" ".join(genre))
        release_dates.append(release_date)
        runtimes.append(" ".join(runtime))
        directors.append(director)
        
    return genres, release_dates, runtimes, directors

In [27]:
def scrape_movies():
    """
    Function to download web page using `requests` and
    to extract the HTML source code using BeautifulSoup.
    """
    # Let's get the popular movies listing from the TMdb website
    page_count = 1 # Initializing the movie page count to 1
    # Define lists for all the movie attributes
    all_names = []
    all_ratings = []
    all_genres = []
    all_release_dates = []
    all_runtimes = []
    all_directors = []
    all_urls = []
    
    while page_count < 8: # Looping for 8 pages of the TMdb web page
        movies_url = "https://www.themoviedb.org/movie?page=%d" %(page_count)
        # Access the webpage using `requests`
        response = requests.get(movies_url)
        response.status_code=200
        # Check if the request was successful
        if response.status_code != 200:
            raise Exception('Failed to load page {}'.format(movies_url))
        # Parse the `response' text using BeautifulSoup
        doc = BeautifulSoup(response.text, 'html.parser')
        
        urls = get_movies_urls(doc)
        genres, release_dates, runtimes, directors = get_all_movies_details(urls)
        
        # Append each movie attribute to respective lists
        all_names += get_movies_names(doc)
        all_ratings += get_movies_rating(doc)
        all_genres += genres
        all_release_dates += release_dates
        all_runtimes += runtimes
        all_directors += directors
        all_urls += urls 
        page_count += 1

        # Defining a dictionary to store the movie informations
    movies_dict = {
        'name': all_names,
        'rating': all_ratings,
        'genre': all_genres,
        'release_date': all_release_dates,
        'runtime': all_runtimes,
        'director': all_directors,
        'url': all_urls
    }
    return pd.DataFrame(movies_dict)

In [28]:
# Invoke the scrape_movies functionality 
movies_df = scrape_movies()
movies_df.head() # View the first few rows of the output

Unnamed: 0,name,rating,genre,release_date,runtime,director,url
