In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Function to read HTML from a URL or string
def read_html(source):
    if source.startswith('http'):
        r = requests.get(source)
        return BeautifulSoup(r.content, 'html.parser')
    else:
        return BeautifulSoup(source, 'html.parser')

# Read HTML from a URL
html = read_html("http://rvest.tidyverse.org/")
print(html)

# Read HTML from a string
html = read_html("""
  <p>This is a paragraph</p>
  <ul>
    <li>This is a bulleted list</li>
  </ul>
""")
print(html)

html = read_html("""
  <h1>This is a heading</h1>
  <p id='first'>This is a paragraph</p>
  <p class='important'>This is an important paragraph</p>
""")

# Select elements by tag, class, and id
p_elements = html.select("p")
important_elements = html.select(".important")
first_element = html.select("#first")

# Get the first p element
first_p_element = html.select_one("p")

# Parsing a more complex HTML
html = read_html("""
  <ul>
    <li><b>C-3PO</b> is a <i>droid</i> that weighs <span class='weight'>167 kg</span></li>
    <li><b>R4-P17</b> is a <i>droid</i></li>
    <li><b>R2-D2</b> is a <i>droid</i> that weighs <span class='weight'>96 kg</span></li>
    <li><b>Yoda</b> weighs <span class='weight'>66 kg</span></li>
  </ul>
""")

# Extract character names and weights
characters = html.select("li")
for character in characters:
    name = character.find("b").get_text() if character.find("b") else ''
    weight = character.find(class_='weight').get_text() if character.find(class_='weight') else ''
    print(f"{name}: {weight}")

# Working with hyperlinks
html = read_html("""
  <p><a href='https://en.wikipedia.org/wiki/Cat'>cats</a></p>
  <p><a href='https://en.wikipedia.org/wiki/Dog'>dogs</a></p>
""")

# Extracting the href attribute of the first anchor element within p elements
hrefs = [p.select_one('a')['href'] for p in html.select('p')]
print(hrefs)

# Parsing a table from HTML and converting to DataFrame
html = read_html("""
  <table class='mytable'>
    <tr><th>x</th>   <th>y</th></tr>
    <tr><td>1.5</td> <td>2.7</td></tr>
    <tr><td>4.9</td> <td>1.3</td></tr>
    <tr><td>7.2</td> <td>8.1</td></tr>
  </table>
""")

# Convert the table to DataFrame
table = pd.read_html(str(html.select_one('.mytable')))[0]
print(table)

# Fetching and parsing a specific section from a web page
url = "https://rvest.tidyverse.org/articles/starwars.html"
html = read_html(url)

section = html.select("section")
for sec in section:
    h2_text = sec.find('h2').get_text() if sec.find('h2') else ''
    director = sec.find(class_='director').get_text() if sec.find(class_='director') else ''
    print(f"Section: {h2_text}, Director: {director}")

# Parsing IMDb Top Rated Movies
url = "https://web.archive.org/web/20220201012049/https://www.imdb.com/chart/top/"
html = read_html(url)

# Extract the table
table = pd.read_html(str(html.select_one('table')))[0]

# Processing the table to extract rank, title, and year
# This part will need custom processing depending on the table structure
# Here's a simplified version
table['rank_title_year'] = table['Rank & Title'].str.extract(r'(\d+\.\s+.*\s\(\d+\))')[0]
table[['Rank', 'Title', 'Year']] = table['rank_title_year'].str.extract(r'(\d+)\.\s+(.*)\s\((\d+)\)')

# Drop the temporary 'rank_title_year' column
table = table.drop(columns=['rank_title_year'])

# Show the processed table
print(table)


<!DOCTYPE html>

<!-- Generated by pkgdown: do not edit by hand --><html lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<meta content="Wrappers around the xml2 and httr packages to make it
    easy to download, then manipulate, HTML and XML." name="description"/>
<title>Easily Harvest (Scrape) Web Pages • rvest</title>
<!-- favicons --><link href="favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180" type="image/png"/>
<link href="apple-touch-icon-120x120.png" rel="apple-touch-icon" sizes="120x120" type="image/png"/>
<link href="apple-touch-icon-76x76.png" rel="apple-touch-icon" sizes="76x76" type="image/png"/>
<link href="apple

In [6]:
# Select specific columns and rename them
ratings = table[['Rank & Title', 'IMDb Rating']].rename(
    columns={'Rank & Title': 'rank_title_year', 'IMDb Rating': 'rating'}
)

# Clean up the 'rank_title_year' column
ratings['rank_title_year'] = ratings['rank_title_year'].str.replace("\n +", " ", regex=True)

# Separate 'rank_title_year' into 'rank', 'title', and 'year' columns
ratings[['Rank', 'Title', 'Year']] = ratings['rank_title_year'].str.extract(r'(\d+)\.\s+(.*)\s\((\d+)\)')

# Fetch HTML content for extracting title ratings using BeautifulSoup
url = "https://web.archive.org/web/20220201012049/https://www.imdb.com/chart/top/"
response = requests.get(url)
html = BeautifulSoup(response.content, 'html.parser')

# Extract the title ratings
title_ratings = [tag['title'] for tag in html.select("td strong")[:5]]  # Adjust the index as needed

# Add a new column for the extracted ratings
ratings['title_ratings'] = pd.Series(title_ratings)

# Separate 'title_ratings' into 'rating' and 'number_of_ratings'
ratings[['Rating', 'Number']] = ratings['title_ratings'].str.extract(r'([0-9.]+) based on ([0-9,]+) user ratings')

# Parse 'Number' into a numeric type, removing commas
# ratings['Number'] = ratings['Number'].str.replace(',', '').astype(int)
# Convert 'Number' to numeric, coercing errors to NaN, then fill NaNs with 0
ratings['Number'] = pd.to_numeric(ratings['Number'].str.replace(',', ''), errors='coerce').fillna(0).astype(int)

# Display the final DataFrame
print(ratings)

                         rank_title_year  rating Rank  \
0    1.  The Shawshank Redemption (1994)     9.2    1   
1               2.  The Godfather (1972)     9.1    2   
2      3.  The Godfather: Part II (1974)     9.0    3   
3             4.  The Dark Knight (2008)     9.0    4   
4                5.  12 Angry Men (1957)     8.9    5   
..                                   ...     ...  ...   
245   246.  The Battle of Algiers (1966)     8.0  246   
246       247.  Nights of Cabiria (1957)     8.0  247   
247   248.  Miracle in Cell No. 7 (2019)     8.0  248   
248           249.  Andrei Rublev (1966)     8.0  249   
249      250.  The Princess Bride (1987)     8.0  250   

                        Title  Year                        title_ratings  \
0    The Shawshank Redemption  1994  9.2 based on 2,536,415 user ratings   
1               The Godfather  1972  9.1 based on 1,745,675 user ratings   
2      The Godfather: Part II  1974  9.0 based on 1,211,032 user ratings   
3          