In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the BBC homepage. The timeout keeps this cell from hanging forever
# if the server never answers.
response = requests.get("https://www.bbc.com/", timeout=30)
# Stop here on a 4xx/5xx reply instead of parsing an error page as if it
# were the homepage.
response.raise_for_status()
# Parse the HTML so it can be searched by tag name, class, or CSS selector.
doc = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Get all h3 tags with the class media__title.
# find_all takes the tag name first; class_ (with the trailing underscore)
# filters by CSS class, because `class` is a reserved word in Python.
doc.find_all('h3', class_='media__title')

[<h3 class="media__title">
 <a class="media__link" href="/sport/football/51198762" rev="hero1|headline">
                                                                     England lose shootout in Euro 2020 final                                                            </a>
 </h3>,
 <h3 class="media__title">
 <a class="media__link" href="/news/world-europe-57800151" rev="hero2|headline">
                                                                     Italy fans ecstatic after Euro 2020 triumph                                                            </a>
 </h3>,
 <h3 class="media__title">
 <a class="media__link" href="/sport/football/57800221" rev="hero3|headline">
                                                                     Final loss incredibly painful - Southgate                                                            </a>
 </h3>,
 <h3 class="media__title">
 <a class="media__link" href="/news/science-environment-57797297" rev="hero4|headline">
                 

In [4]:
# select() takes a CSS selector instead of a tag name plus keyword filters:
#   .media__title      means "something with the class of media__title"
#   h3                 means "something with the tag name of h3"
#   h3.media__title    means "something with the tag name of h3 AND the class of media__title"
#   h3.media__title a  means "a link inside of (an h3 tag with the class of media__title)"
# Only the class is used here, so any tag carrying media__title matches.
doc.select('.media__title')

[<h3 class="media__title">
 <a class="media__link" href="/sport/football/51198762" rev="hero1|headline">
                                                                     England lose shootout in Euro 2020 final                                                            </a>
 </h3>,
 <h3 class="media__title">
 <a class="media__link" href="/news/world-europe-57800151" rev="hero2|headline">
                                                                     Italy fans ecstatic after Euro 2020 triumph                                                            </a>
 </h3>,
 <h3 class="media__title">
 <a class="media__link" href="/sport/football/57800221" rev="hero3|headline">
                                                                     Final loss incredibly painful - Southgate                                                            </a>
 </h3>,
 <h3 class="media__title">
 <a class="media__link" href="/news/science-environment-57797297" rev="hero4|headline">
                 

In [5]:
# Collect every element carrying the media__title class,
# then print each one's text with the surrounding whitespace removed
titles = doc.select('.media__title')
for heading in titles:
    headline_text = heading.text.strip()
    print(headline_text)

England lose shootout in Euro 2020 final
Italy fans ecstatic after Euro 2020 triumph
Final loss incredibly painful - Southgate
Billionaire Branson rockets to the edge of space
Inside the Miami building collapse recovery operation
‘They shot me, but they didn’t know I’m unbreakable’
US team to visit Haiti after president's killing
Wildfires rage as heatwave sweeps western US
Djokovic should win 25 Slams - McEnroe
FA condemns racist abuse of England players
Small number of fans broke into Wembley - police
What were Neanderthal children like?
The workers who can't discuss pay
The UK town that inspired the Olympics
Five stars for Verhoeven's Benedetta
Fruity dessert recipes for summer
News quiz: What was a policeman doing with Taylor Swift?
Why getting things wrong is good for science
Tired of working from home? Put the office on wheels
The European nation keeping us young
Branson's journey to the edge of space (and back)
Branson's journey to the edge of space (and...
The sacred forest for

In [6]:
# Get everything with the class of media__tag (the section label of a story)
# and then loop through them each and print out the text
tags = doc.select('.media__tag')
for tag in tags:
    print(tag.text.strip())

Football
Europe
Football
Science & Environment
US & Canada
Asia
Latin America & Caribbean
US & Canada
Tennis
Football
UK
Future
Worklife
Travel
Culture
FOOD
World
Ideas
Business
Travel
Science & Environment
Science & Environment
Asia
Derby
UK
Technology
US & Canada
Middle East
Middle East
England
US & Canada
UK
Video
Europe
England
Entertainment & Arts
Technology
Science & Environment
Asia
Science & Environment
Business
Entertainment & Arts
Now Showing
Business
In Pictures
In Pictures
Africa
In Pictures
In Pictures


In [7]:
# How many title elements were matched
len(titles)

49

In [8]:
# How many tag elements were matched (compare with the number of titles)
len(tags)

49

In [9]:
import pandas as pd

# Pair the two lists up by position, one column per list.
# Both lists hold the matched page elements themselves (not their text), and
# nothing here checks that the i-th title and the i-th tag belong to the same story.
paired_columns = {}
paired_columns['title'] = titles
paired_columns['tag'] = tags
pd.DataFrame(paired_columns)

Unnamed: 0,title,tag
0,"[\n, [\n ...",[Football]
1,"[\n, [\n ...",[Europe]
2,"[\n, [\n ...",[Football]
3,"[\n, [\n ...",[Science & Environment]
4,"[\n, [\n ...",[US & Canada]
5,"[\n, [\n ...",[Asia]
6,"[\n, [\n ...",[Latin America & Caribbean]
7,"[\n, [\n ...",[US & Canada]
8,"[\n, [\n ...",[Tennis]
9,"[\n, [\n ...",[Football]


In [10]:
# Count the summaries the same way as the titles and tags.
# In the saved run this count is much smaller than the number of titles,
# so summaries cannot simply be paired with titles by position.
summaries = doc.select('.media__summary')
len(summaries)

16

In [11]:
# Find everything with the class of media-list__item.
# Each one of these is a single story, and becomes one row of data.
stories = doc.select('.media-list__item')

# Starting off without ANY rows
rows = []

for story in stories:
    print("----")
    # Starting off knowing NONE of the columns for this story. A key is only
    # added when its element is found, so stories can end up with different keys.
    row = {}

    # Every lookup below searches INSIDE OF THIS STORY, not the whole page.
    # select_one returns None when nothing matches, so each result is checked
    # explicitly instead of hiding every possible error behind a bare except.
    headline = story.select_one('h3')
    if headline is not None:
        row['title'] = headline.text.strip()
    else:
        print("Couldn't find a title")

    # The comma in the selector means OR: a media__link OR a reel__link
    link = story.select_one('.media__link, .reel__link')
    if link is not None and link.has_attr('href'):
        row['href'] = link['href']
    else:
        print("Couldn't find a link")

    tag_element = story.select_one('.media__tag')
    if tag_element is not None:
        row['tag'] = tag_element.text.strip()
    else:
        print("Couldn't find a tag!")

    summary_element = story.select_one('.media__summary')
    if summary_element is not None:
        row['summary'] = summary_element.text.strip()
    else:
        print("Couldn't find a summary")

    print(row)
    # When we're done adding info to our row, we add it into our list of rows
    rows.append(row)

----
{'title': 'England lose shootout in Euro 2020 final', 'href': '/sport/football/51198762', 'tag': 'Football', 'summary': "England's hopes of ending a 55-year wait for a major trophy are crushed in heartbreaking fashion as they lose on penalties to Italy in the final of Euro 2020 at Wembley."}
----
Couldn't find a summary
{'title': 'Italy fans ecstatic after Euro 2020 triumph', 'href': '/news/world-europe-57800151', 'tag': 'Europe'}
----
Couldn't find a summary
{'title': 'Final loss incredibly painful - Southgate', 'href': '/sport/football/57800221', 'tag': 'Football'}
----
Couldn't find a summary
{'title': 'Billionaire Branson rockets to the edge of space', 'href': '/news/science-environment-57797297', 'tag': 'Science & Environment'}
----
Couldn't find a summary
{'title': 'Inside the Miami building collapse recovery operation', 'href': '/news/world-us-canada-57795441', 'tag': 'US & Canada'}
----
{'title': '‘They shot me, but they didn’t know I’m unbreakable’', 'href': '/news/world-

In [12]:
# Show the collected list of row dictionaries, one per story
rows

[{'title': 'England lose shootout in Euro 2020 final',
  'href': '/sport/football/51198762',
  'tag': 'Football',
  'summary': "England's hopes of ending a 55-year wait for a major trophy are crushed in heartbreaking fashion as they lose on penalties to Italy in the final of Euro 2020 at Wembley."},
 {'title': 'Italy fans ecstatic after Euro 2020 triumph',
  'href': '/news/world-europe-57800151',
  'tag': 'Europe'},
 {'title': 'Final loss incredibly painful - Southgate',
  'href': '/sport/football/57800221',
  'tag': 'Football'},
 {'title': 'Billionaire Branson rockets to the edge of space',
  'href': '/news/science-environment-57797297',
  'tag': 'Science & Environment'},
 {'title': 'Inside the Miami building collapse recovery operation',
  'href': '/news/world-us-canada-57795441',
  'tag': 'US & Canada'},
 {'title': '‘They shot me, but they didn’t know I’m unbreakable’',
  'href': '/news/world-asia-57779841',
  'tag': 'Asia',
  'summary': 'Afghan woman Shakila Zareen had to have 22 o

In [13]:
import pandas as pd

# One DataFrame row per dictionary in rows; the dictionary keys become the columns.
# Stories that never got a 'summary' key show up with an empty summary cell.
df = pd.DataFrame(rows)
df

Unnamed: 0,title,href,tag,summary
0,England lose shootout in Euro 2020 final,/sport/football/51198762,Football,England's hopes of ending a 55-year wait for a...
1,Italy fans ecstatic after Euro 2020 triumph,/news/world-europe-57800151,Europe,
2,Final loss incredibly painful - Southgate,/sport/football/57800221,Football,
3,Billionaire Branson rockets to the edge of space,/news/science-environment-57797297,Science & Environment,
4,Inside the Miami building collapse recovery op...,/news/world-us-canada-57795441,US & Canada,
5,"‘They shot me, but they didn’t know I’m unbrea...",/news/world-asia-57779841,Asia,Afghan woman Shakila Zareen had to have 22 ope...
6,US team to visit Haiti after president's killing,/news/world-latin-america-57800152,Latin America & Caribbean,It will assess the security situation after la...
7,Wildfires rage as heatwave sweeps western US,/news/world-us-canada-57794263,US & Canada,Tributes are paid to two firefighters who died...
8,Djokovic should win 25 Slams - McEnroe,/sport/tennis/57768307,Tennis,Novak Djokovic not going on to win 25 Grand Sl...
9,FA condemns racist abuse of England players,/sport/football/57800431,Football,The FA condemns racist abuse aimed at England ...


In [14]:
# Save the table to a CSV file; index=False leaves the row numbers out of the file
df.to_csv("bbc-headlines.csv", index=False)