In [1]:
!pip install bs4 requests

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25l[?25hdone
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1270 sha256=f8bfb76b6b395f666fad57d983816f040d3e5878157de8ef2f936401a0546369
  Stored in directory: /root/.cache/pip/wheels/73/2b/cb/099980278a0c9a3e57ff1a89875ec07bfa0b6fcbebb9a8cad3
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [2]:
# Import the necessary libraries
from bs4 import BeautifulSoup

# Define the HTML string
html = """<html><head><title>My Simple HTML Page</title></head><body><div class="my-class"><span>This is some text in a span tag</span><p>This is some text in a paragraph tag</p><p>And yet again another paragraph</p></div></body></html>"""

In [3]:
# Parse the HTML with Beautiful Soup
# 'html.parser' is the parser bundled with Python, so no extra install is needed.
soup = BeautifulSoup(html, 'html.parser')

In [4]:
print(soup.prettify())

<html>
 <head>
  <title>
   My Simple HTML Page
  </title>
 </head>
 <body>
  <div class="my-class">
   <span>
    This is some text in a span tag
   </span>
   <p>
    This is some text in a paragraph tag
   </p>
   <p>
    And yet again another paragraph
   </p>
  </div>
 </body>
</html>


In [5]:
# Example 1: Find the first occurrence of a tag
# find() returns a single Tag; the sample HTML contains exactly one <span>.
first_span = soup.find('span')
print(first_span)

<span>This is some text in a span tag</span>


In [9]:
# Example 2: Find all occurrences of a tag
# find_all() collects every match; the sample HTML has two <p> tags,
# and they come back in the order they appear in the document.
all_p = soup.find_all('p')
print(all_p)

[<p>This is some text in a paragraph tag</p>, <p>And yet again another paragraph</p>]


In [12]:
len(all_p)

2

In [None]:
# Example 3: Navigate to the parent tag
# .parent is the element that directly encloses the <span>:
# here the <div class="my-class">.
parent_div = first_span.parent
print(parent_div)



<div class="my-class"><span>This is some text in a span tag</span><p>This is some text in a paragraph tag</p><p>And yet again another paragraph</p></div>


In [None]:
# Example 4: Navigate to the next sibling tag
# Gotcha: this yields the first <p> only because the sample HTML has no
# whitespace between tags. In pretty-printed HTML, .next_sibling would be the
# whitespace text node instead; find_next_sibling() skips straight to the next tag.
next_sibling = first_span.next_sibling
print(next_sibling)

<p>This is some text in a paragraph tag</p>


In [None]:
# Example 5: Navigate to the previous sibling tag
# Stepping back from the first <p> lands on the <span> we started from.
previous_sibling = next_sibling.previous_sibling
print(previous_sibling)



<span>This is some text in a span tag</span>


In [None]:
# Example 6: Get the tag name
# .name is the element's tag name as a plain string ('span' here).
tag_name = first_span.name
print(tag_name)



span


In [None]:
# Example 7: Get the tag attributes
# Attributes are read with dict-style indexing. `class` can hold several
# values, so it comes back as a list (['my-class']) rather than a string.
# Run this before Example 10, which replaces the value with a plain string.
class_name = parent_div['class']
print(class_name)



['my-class']


In [None]:
# Example 8: Get the text content of a tag
# .text gives the text inside the tag with the markup stripped away.
span_text = first_span.text
print(span_text)



This is some text in a span tag


In [None]:
# Example 9: Get the text content of multiple tags
# get_text() on the <div> concatenates the text of every nested tag with
# nothing in between (note "span tagThis is" in the output). Pass a separator,
# e.g. get_text(' '), to keep the pieces apart.
div_text = parent_div.get_text()
print(div_text)



This is some text in a span tagThis is some text in a paragraph tagAnd yet again another paragraph


In [None]:
# Example 10: Modify the HTML
# Assigning to an attribute edits the parsed tree in place: parent_div is part
# of `soup`, so `soup` now carries the new class too. The value is stored as the
# plain string 'new-class', not a list, so re-running Example 7 afterwards
# prints something different.
parent_div['class'] = 'new-class'
print(parent_div)

<div class="new-class"><span>This is some text in a span tag</span><p>This is some text in a paragraph tag</p><p>And yet again another paragraph</p></div>


In [None]:
# Example 11: Walk through every <p> inside <body>, numbering them from 1
for position, paragraph in enumerate(soup.body.find_all('p'), start=1):
  print(f"{position}. tag: {paragraph.name}, text:", paragraph.get_text())

1. tag: p, text: This is some text in a paragraph tag
2. tag: p, text: And yet again another paragraph


# Wikipedia

Real-world example!

We will scrape the main page of Wikipedia. In doing this, we will:

1. Get the featured article content
2. Get all the `did you know` content along with some additional data
3. Do the same as 2. but for the `in the news` section
4. Repeat this process for the `on this day` section.

In [13]:
from urllib.parse import urljoin

import requests as re

In [14]:
# Get the main page content
# Note: `re` is this notebook's alias for `requests`, not the stdlib regex module.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of silently parsing
# an error page.
req = re.get("https://en.wikipedia.org/wiki/Main_Page", timeout=10)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

In [16]:
# Get the featured article content
# A CSS selector matches on both classes regardless of their order or of any
# extra classes, whereas find(class_="MainPageBG mp-box") only matches that
# exact attribute string.
featured_box = soup.select_one(".MainPageBG.mp-box")
featured_paragraph = featured_box.p if featured_box is not None else None
if featured_paragraph is None:
  # Fail with a readable message rather than an AttributeError on None
  raise ValueError("Could not locate the featured article paragraph on the main page")
featured_article_text = featured_paragraph.get_text()


In [20]:
# Get all "did you know" facts
did_you_know = soup.find(id="mp-dyk").find_all("li")

# For each fact, build a dict that stores its text and its first link
did_you_know_list = []
for li in did_you_know:
  anchor = li.find("a", href=True)
  if anchor is None:
    # An entry without a link has no article to follow up on
    continue
  did_you_know_list.append({"text": li.get_text(), "link": anchor["href"]})

# For each link, fetch the linked article and keep its first paragraph
for dyk in did_you_know_list:
  # URL suffix: '/wiki/foo_bar'. urljoin avoids the doubled slash that plain
  # string concatenation produced and also copes with absolute links.
  url = urljoin("https://en.wikipedia.org/", dyk['link'])
  response = re.get(url, timeout=10)
  response.raise_for_status()
  # Name the parser explicitly: otherwise Beautiful Soup guesses one (and warns),
  # so results could differ between machines.
  intro_text_soup = BeautifulSoup(response.content, "html.parser")
  # Take the first paragraph that actually contains text instead of a
  # hard-coded index, which breaks on pages with fewer or differently
  # ordered <p> tags.
  dyk['intro'] = next(
      (p.get_text() for p in intro_text_soup.find_all("p") if p.get_text(strip=True)),
      "",
  )

In [23]:
did_you_know_list

[{'text': '... that precursors to the killer toy include ventriloquist dummies such as Otto (pictured) in the 1929 film The Great Gabbo?',
  'link': '/wiki/Killer_toy',
  'intro': 'A killer toy is a stock character in horror fiction. They include toys, such as dolls and ventriloquist dummies, that come to life and seek to kill or otherwise carry out violence. The killer toy subverts the associations of childhood with innocence and lack of agency while invoking the uncanny nature of a lifelike toy. Killer toy fiction often invokes ideas of companionship and the corruption of children, sometimes taking place in dysfunctional or single parent homes. They have historically been associated with occultism and spirit possession, though artificial intelligence became more common in later works.\n'},
 {'text': '... that Armenian-Turkish soprano Sibil Pektorosoğlu released her first album after singing in a church choir for almost twenty years?',
  'link': '/wiki/Armenians_in_Turkey',
  'intro':

In [24]:
# let's do it again for the in the news section
in_the_news = soup.find(id="mp-itn").find_all("li")

in_the_news_list = []
for itn in in_the_news:
  text = itn.get_text()
  anchor = itn.find("a", href=True)
  if anchor is None:
    # An entry without a link has no article to follow up on
    continue
  link = anchor["href"]

  # notice that this part is different from above
  # which do you think makes the most sense performance-wise?
  # urljoin avoids the doubled slash ('.org//wiki/...') of plain concatenation.
  url = urljoin("https://en.wikipedia.org/", link)
  response = re.get(url, timeout=10)
  response.raise_for_status()
  # Name the parser explicitly so results do not depend on what is installed.
  intro_text_soup = BeautifulSoup(response.content, "html.parser")
  # First paragraph that actually contains text, rather than a hard-coded index.
  intro = next(
      (p.get_text() for p in intro_text_soup.find_all("p") if p.get_text(strip=True)),
      "",
  )
  # The intro is stored only in the result dict. Assigning itn['intro'] would
  # write it into the <li> tag as an HTML attribute and mutate the parsed page.
  in_the_news_list.append({
      "text": text,
      "link": link,
      "intro": intro
  })


In [25]:
in_the_news_list[0]

{'text': 'The European Space Agency launches the Jupiter Icy Moons Explorer (JUICE) to study Ganymede, Europa and  Callisto (trajectory pictured).',
 'link': '/wiki/European_Space_Agency',
 'intro': 'The European Space Agency[a] is an intergovernmental organisation of 22 member states[7] dedicated to the exploration of space. Established in 1975 and headquartered in Paris, ESA has a worldwide staff of about 2,200 in 2018[8] and an annual budget of about €4.9\xa0billion in 2023.[4]\n'}

In [None]:
# Your turn! Do it for the "On this day" section!
# Take into account that the first item might not be in a <li> tag


## NY Times

Exercise:

1. Get all article titles from the main page `https://www.nytimes.com/international/`
2. For each, get:
   - title
   - summary of the article
   - reading time
   - link to it

What would be the follow-up to this?


In [26]:
# Download the NY Times international front page.
# The timeout prevents an indefinite hang; raise_for_status() surfaces HTTP
# errors instead of letting an error page be parsed as if it were the site.
req = re.get("https://www.nytimes.com/international/", timeout=10)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")


In [27]:
# Collect (title, summary, reading_time, link) for every story on the page.
# A field that cannot be found is stored as "". Explicit None checks replace
# the bare `except:` blocks, which also swallowed unrelated errors (typos,
# KeyboardInterrupt, ...).
# NOTE(review): "summary-class" and "css-1esztn" look like generated class
# names and may change between site deployments -- confirm against the live page.
story_wrappers = soup.find_all("section", class_="story-wrapper")
stories = []
for story in story_wrappers:
  title_tag = story.find(class_="indicate-hover")
  title = title_tag.get_text() if title_tag is not None else ""

  summary_tag = story.find("p", class_="summary-class")
  summary = summary_tag.get_text() if summary_tag is not None else ""

  reading_time_tag = story.find("p", class_="css-1esztn")
  reading_time = reading_time_tag.get_text() if reading_time_tag is not None else ""

  # First <a> in the story; "" when there is none or it has no href
  link_tag = story.find("a")
  link = link_tag.get("href", "") if link_tag is not None else ""

  stories.append((title, summary, reading_time, link))


In [35]:
# Collect (link, title, description) for every story section on the page.
# A field that cannot be found is stored as "". Explicit None checks replace
# the nested bare `except:` blocks so real errors are no longer hidden.
parsed_sections = []
for section in soup.find_all("section", class_="story-wrapper"):
  # get the url of the article ("" when there is no <a> or it has no href)
  link_tag = section.find("a")
  link = link_tag.get("href", "") if link_tag is not None else ""

  # get the title of the article: prefer <h3>, fall back to <h4>
  heading = section.find("h3")
  if heading is None:
    heading = section.find("h4")
  title = heading.get_text() if heading is not None else ""

  # get the description (first <p> in the section)
  paragraph = section.find("p")
  description = paragraph.get_text() if paragraph is not None else ""

  # Append to the list my parsed section
  parsed_sections.append((link, title, description))

In [43]:
# Collect (href, link text) for every hyperlink on the page.
# href=True restricts the search to anchors that carry an href attribute;
# without it, a_tag["href"] raises KeyError on the first anchor that lacks one.
a_tags = soup.find_all("a", href=True)
urls = [(a_tag["href"], a_tag.get_text()) for a_tag in a_tags]