In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

In [2]:
# What we are trying to do here, is see if we can scrape a news article
# If we can, then we will try to scrape the website and do this for a few news articles
url = 'https://www.bbc.com/news/world-60555472'


In [3]:
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en-GB">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title data-rh="true">
   How many Ukrainian refugees are there and where have they gone? - BBC News
  </title>
  <meta content="The UN says more than 13 million people have fled their homes since the Russian invasion." data-rh="true" name="description"/>
  <meta content="#FFFFFF" data-rh="true" name="theme-color"/>
  <meta content="https://www.facebook.com/bbcnews" data-rh="true" property="article:author"/>
  <meta content="100004154058350" data-rh="true" property="fb:admins"/>
  <meta content="1609039196070050" data-rh="true" property="fb:app_id"/>
  <meta content="1143803202301544,317278538359186,1392506827668140,742734325867560,185246968166196,156060587793370,137920769558355,193435954068976,21263239760,156400551056385,929399697073756,154344434967,228735667216,80758950658,260212261199,294662213128,1086451581439054,2833481216820

In [5]:
soup.title.string

'How many Ukrainian refugees are there and where have they gone? - BBC News'

In [6]:
# Text of  article
ps = soup.find_all('p')
pattern = r'>[\w,\'\s.!?()]+<'
regex = re.compile(pattern)
texts = regex.findall(str(ps))

In [7]:
texts

[">More than 13 million people have fled their homes since Russia's invasion of Ukraine, the United Nations (UN) says. <",
 '>, <',
 '>More than five million have left for neighbouring countries, while eight million people are thought to be displaced inside Ukraine itself. <',
 '>, <',
 '>, <',
 '>In updated figures, the UN says that, <',
 '>as of 16 June<',
 '>, <',
 '>Others have moved on to other destinations, especially those who crossed into Poland, Hungary and Slovakia. These nations <',
 '>have open borders with other EU countries<',
 '>. <',
 '>, <',
 '>The UN says there are more than 780,000 Ukrainians in Germany, 373,965 in the Czech Republic and 129,623 in Italy. <',
 '>, <',
 '>, <',
 '>President Vladimir Putin says his forces <',
 '>helped evacuate 140,000 civilians from Mariupol<',
 '> and insists no one was forced to go to Russia. However, <',
 '>volunteer groups<',
 '> say they have helped hundreds of Ukrainians leave Russia.<',
 '>, <',
 '>The EU has granted Ukrainians

In [8]:
# Cleaning the ends of each string
clean_texts = []
for text in texts:
    cleaned = re.sub('[><]+', '', text)
    if len(cleaned) > 4:
        # There were some of the texts that looked like this '> ,<'
        clean_texts.append(cleaned.strip())

In [9]:
# There are some references to other articles at the bottom
# but if they are the same amount for every article, 
# we can just cut the last 12 strings off of the list.
clean_texts

["More than 13 million people have fled their homes since Russia's invasion of Ukraine, the United Nations (UN) says.",
 'More than five million have left for neighbouring countries, while eight million people are thought to be displaced inside Ukraine itself.',
 'In updated figures, the UN says that,',
 'as of 16 June',
 'Others have moved on to other destinations, especially those who crossed into Poland, Hungary and Slovakia. These nations',
 'have open borders with other EU countries',
 'The UN says there are more than 780,000 Ukrainians in Germany, 373,965 in the Czech Republic and 129,623 in Italy.',
 'President Vladimir Putin says his forces',
 'helped evacuate 140,000 civilians from Mariupol',
 'and insists no one was forced to go to Russia. However,',
 'volunteer groups',
 'say they have helped hundreds of Ukrainians leave Russia.',
 'The EU has granted Ukrainians the right to stay and work throughout its 27 member nations for up to three years.',
 "Refugees are housed in rece

In [10]:
t = soup.find_all('time')
pattern = r'datetime=\"[0-9\-]+'
regex = re.compile(pattern)
publish_date = regex.findall(str(t))

In [11]:
publish_date[0].replace('datetime="', '')

'2022-06-21'

In [12]:
# These are the headers for the article
h = soup.find_all('h2')

In [13]:
pattern = r'>[\w\s]+\?{1}<'
regex = re.compile(pattern)
headers = regex.findall(str(h))

In [14]:
clean_headers = []
for header in headers:
    header = header.strip("<").strip(">")
    clean_headers.append(header)

In [15]:
clean_headers

['Where are refugees going?',
 'What help are countries offering refugees?',
 'How many people are returning to Ukraine?',
 'Where are people fleeing inside Ukraine?',
 'What is the UK doing to help Ukrainian refugees?']

In [16]:
# Now lets try to scrape multiple articles and get their specific urls 

url = 'https://www.bbc.com/news'

In [17]:
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

In [18]:
front_page = soup.find_all('a')

In [19]:
pattern = r'href=\"[\w\-\/]+[\-\/]+[0-9]+\">'
regex = re.compile(pattern)
links = regex.findall(str(front_page))

In [20]:
pattern = r'[\w\'\/\-\,\s]+</h3'
regex = re.compile(pattern)
titles = regex.findall(str(front_page))

In [21]:
# Since these don't match up, we'll just get all of the titles from the links that we scrape
# Some of these aren't articles, but we will deal with this later
print(len(titles))
print(len(links))

47
79


In [22]:
# Since there are 77 links, lets make a function to scrape these

def article_scraper(url):

    # Retrieving Url
    r = requests.get((url))
    soup = BeautifulSoup(r.text, 'html.parser')

    # Title
    title = soup.title.string

    # The p's have all of the text of the article
    p = soup.find_all('p')
    pattern = r'>[\w,\'\s.!?()]+<'
    regex = re.compile(pattern)
    texts = regex.findall(str(p))

    # Cleaning Text
    clean_texts = ''
    for text in texts:
        cleaned = re.sub('[><]+', '', text)
        if len(cleaned) > 4:
            clean_texts+= cleaned.strip() + ' '
    
    # These are the headers for the article
    h = soup.find_all('h2')
    pattern = r'>[\w\s]+\?{1}<'
    regex = re.compile(pattern)
    headers = regex.findall(str(h))

    # Cleaning Headers
    clean_headers = []
    for header in headers:
        header = header.strip("<").strip(">")
        clean_headers.append(header)
    
    # Date Published
    t = soup.find_all('time')
    pattern = r'datetime=\"[0-9\-]+'
    regex = re.compile(pattern)
    publish_date = regex.findall(str(t))

    # Clean Date
    date = publish_date[0].replace('datetime="', '')
    if len(date) < 3:
        date = date + 'hours ago'
    
    return({'title':title, 'text':clean_texts.strip(), 'headers':clean_headers, 'url':url, 'publish_date': date})

In [23]:
# Cleaning links to process through the function

urls = []
for link in links:
    cleaned = link.replace('href="', '').replace('">', '')
    urls.append('https://www.bbc.com' + cleaned.strip())

In [24]:
# Build an empty dataframe

df = pd.DataFrame(columns=['title', 'text', 'headers', 'url', 'publish_date'])

In [25]:
for url in urls:
    contents = article_scraper(url)
    df.loc[len(df)] = contents
    # Don't want to get my IP address banned
    time.sleep(1)

In [26]:
df = df.drop_duplicates(subset=['url'])

In [27]:
# This dataframe may change day to day, since these are all the front page articles
df

Unnamed: 0,title,text,headers,url,publish_date
0,War in Ukraine - BBC News,The International Energy Agency warns that Rus...,[],https://www.bbc.com/news/world-60525350,2
1,Climate - BBC News,Newsbeat has been finding out about the indivi...,[],https://www.bbc.com/news/science-environment-5...,17
2,One-minute World News - BBC News,This video can not be played Watch the latest ...,[],https://www.bbc.com/news/av/10462520,2022-06-22
6,New Economy - BBC News,The UK economy cannot afford to have more of i...,[],https://www.bbc.com/news/business-45489065,8
7,New Tech Economy - BBC News,As a Google engineer says his firm's top chatb...,[],https://www.bbc.com/news/business-15521824,6
8,Entrepreneurship - BBC News,Meet the female entrepreneurs aged over 50 who...,[],https://www.bbc.com/news/business-22434141,13
9,Technology of Business - BBC News,"Engineers are developing mobile, floating nucl...",[],https://www.bbc.com/news/business-11428889,1
10,CEO Secrets - BBC News,Liz Jackson MBE started her career as an entre...,[],https://www.bbc.com/news/business-33712313,17
11,"Afghan earthquake: At least 1,000 people kille...",BBC News This video can not be played A powerf...,[],https://www.bbc.com/news/world-asia-61890804,2022-06-22
13,Afghanistan earthquake: 'Every street you go y...,By Thom Poole BBC News This is how one Afghan ...,[],https://www.bbc.com/news/world-asia-61900222,2022-06-22


In [28]:
videos = []
for i, row in df.iterrows():
    if row.text.startswith('This video can not be played'):
        videos.append(i)

In [29]:
# Since video articles are a bit different, I wanted to offer this in a different data frame
video_articles_df = pd.DataFrame(columns=['title', 'text', 'headers', 'url', 'publish_date'])
video_links = []
for number in videos:
    row = df.iloc[number]
    video_links.append(row.url)
    video_articles_df.loc[len(video_articles_df)] = row
video_links

['https://www.bbc.com/news/av/10462520',
 'https://www.bbc.com/news/world-61898437',
 'https://www.bbc.com/news/world-us-canada-61889593',
 'https://www.bbc.com/news/world-africa-61802498',
 'https://www.bbc.com/news/world-asia-61890802',
 'https://www.bbc.com/news/entertainment-arts-61894118',
 'https://www.bbc.com/news/world-us-canada-61838411',
 'https://www.bbc.com/news/world-europe-61863172',
 'https://www.bbc.com/news/61836019',
 'https://www.bbc.com/news/world-asia-india-61877422',
 'https://www.bbc.com/news/world-us-canada-61890557']

In [30]:
video_articles_df

Unnamed: 0,title,text,headers,url,publish_date
0,One-minute World News - BBC News,This video can not be played Watch the latest ...,[],https://www.bbc.com/news/av/10462520,2022-06-22
1,Fears for US woman's life as abortion denied i...,"By Sara Monetta BBC News For the past week, th...",[],https://www.bbc.com/news/world-61898437,2022-06-22
2,Capitol riot hearing: Vote workers detail deat...,This video can not be played Election official...,[],https://www.bbc.com/news/world-us-canada-61889593,2022-06-22
3,How Russia has outflanked Ukraine in Africa - ...,"By Paul Melly Africa Programme, Chatham House,...",[],https://www.bbc.com/news/world-africa-61802498,2022-06-21
4,Monkeypox: First Singapore case recorded in Br...,By Frances Mao BBC News Singapore has recorded...,[],https://www.bbc.com/news/world-asia-61890802,2022-06-22
5,Kate Bush on finding new fame with Stranger Th...,This video can not be played Kate Bush on find...,[],https://www.bbc.com/news/entertainment-arts-61...,2022-06-22
6,Dog turns on stove and starts house fire - BBC...,This video can not be played A fire that cause...,[],https://www.bbc.com/news/world-us-canada-61838411,2022-06-17
7,French elections: What now for opposition left...,By Paul Kirby In Paris They were the big winne...,[],https://www.bbc.com/news/world-europe-61863172,2022-06-20
8,Thailand cannabis: From a war on drugs to weed...,Other countries in the region have followed th...,[],https://www.bbc.com/news/61836019,2022-06-20
9,"Ankur Warikoo, Rachana Ranade: The YouTubers I...",By Nikhil Inamdar and Ayushi Shah BBC News He...,[],https://www.bbc.com/news/world-asia-india-6187...,2022-06-21


In [31]:
home_pages = []
for i, row in df.iterrows():
    if row.text.endswith('Email us at Send an SMS or MMS to Follow Have Your Say on Twitter Why you can trust BBC News') == True:
        home_pages.append(i)

In [32]:
# We will also be creating another dataframe for homepages, 
# in case we would want to scrape again on another type of field 
# like science, the world, technology
home_page_df = pd.DataFrame(columns=['title', 'text', 'headers', 'url', 'publish_date'])
home_links = []
for number in home_pages:
    row = df.iloc[number]
    home_links.append(row.url)
    home_page_df.loc[len(home_page_df)] = row
home_links

['https://www.bbc.com/news/world-60525350',
 'https://www.bbc.com/news/science-environment-56837908',
 'https://www.bbc.com/news/business-11428889',
 'https://www.bbc.com/news/business-33712313',
 'https://www.bbc.com/news/world-asia-61890804',
 'https://www.bbc.com/news/world-asia-61900222',
 'https://www.bbc.com/news/world-middle-east-61893747']

In [33]:
home_page_df

Unnamed: 0,title,text,headers,url,publish_date
0,War in Ukraine - BBC News,The International Energy Agency warns that Rus...,[],https://www.bbc.com/news/world-60525350,2
1,Climate - BBC News,Newsbeat has been finding out about the indivi...,[],https://www.bbc.com/news/science-environment-5...,17
2,Technology of Business - BBC News,"Engineers are developing mobile, floating nucl...",[],https://www.bbc.com/news/business-11428889,1
3,CEO Secrets - BBC News,Liz Jackson MBE started her career as an entre...,[],https://www.bbc.com/news/business-33712313,17
4,"Afghan earthquake: At least 1,000 people kille...",BBC News This video can not be played A powerf...,[],https://www.bbc.com/news/world-asia-61890804,2022-06-22
5,Afghanistan earthquake: 'Every street you go y...,By Thom Poole BBC News This is how one Afghan ...,[],https://www.bbc.com/news/world-asia-61900222,2022-06-22
6,Afghanistan earthquake: Injured carried to res...,This video can not be played A government resc...,[],https://www.bbc.com/news/world-middle-east-618...,2022-06-22


In [34]:
print(home_pages)
print(videos)

[0, 1, 6, 7, 8, 9, 10]
[2, 14, 20, 25, 34, 36, 38, 42, 44, 45, 46]


In [35]:
df = df.drop(home_pages)
df = df.drop(videos)

In [36]:
articles_df = df.reset_index()

In [37]:
articles_df = articles_df.drop(columns=['index'])

In [38]:
articles_df

Unnamed: 0,title,text,headers,url,publish_date
0,"Afghan earthquake: At least 1,000 people kille...",BBC News This video can not be played A powerf...,[],https://www.bbc.com/news/world-asia-61890804,2022-06-22
1,Afghanistan earthquake: 'Every street you go y...,By Thom Poole BBC News This is how one Afghan ...,[],https://www.bbc.com/news/world-asia-61900222,2022-06-22
2,History of deadly earthquakes - BBC News,Earthquakes have claimed millions of lives in ...,[],https://www.bbc.com/news/world-12717980,2022-06-22
3,President Biden urges petrol tax 'holiday' as ...,With national elections for Congress coming in...,[What is the US gas tax?],https://www.bbc.com/news/business-61899467,2022-06-22
4,Shaye Moss: 'I don't want anyone knowing my na...,A former US election worker who was singled ou...,[],https://www.bbc.com/news/world-us-canada-61889951,2022-06-21
5,Fears for US woman's life as abortion denied i...,"By Sara Monetta BBC News For the past week, th...",[],https://www.bbc.com/news/world-61898437,2022-06-22
6,Half Russian separatist force dead or wounded ...,By Paul Kirby BBC News Russian and Russian pro...,[],https://www.bbc.com/news/world-europe-61891462,2022-06-22
7,Ukraine war: No easy exit from besieged Lysych...,"By Orla Guerin BBC News, Lysychansk If you had...",[],https://www.bbc.com/news/world-europe-61895580,2022-06-22
8,White House unveils plans to reduce nicotine i...,But the measure is likely to face opposition b...,[],https://www.bbc.com/news/world-us-canada-61887753,2022-06-22
9,Uvalde shooting: Robb Elementary School to be ...,Don McLaughlin made the claim on Tuesday at a ...,[],https://www.bbc.com/news/world-us-canada-61890558,2022-06-22
