# Simple web scraping example (www.pbs.org/newshour)

In [None]:
#!pip install BeautifulSoup4

### Import the libraries

In [None]:
# to get the URL
import requests

# to parse the HTML DOM
from bs4 import BeautifulSoup

# patience is a virtue: used to pause between requests
import time

### Get the first page

In [None]:
# the url we want to scrape
url = 'https://www.pbs.org/newshour/search-results?q=%22artificial%20intelligence%22'

# get data from the url
# requests has no default timeout, so set one to avoid waiting forever on a stalled connection
res = requests.get(url, timeout=30)

# stop here on an HTTP error (4xx/5xx) instead of parsing an error page as if it were results
res.raise_for_status()

# parse to bs4
soup = BeautifulSoup(res.content, 'html.parser')

# let's get something as simple as a title
# use .get_text() to extract the text from the element(s)
title = soup.find('title').get_text()

# output the title
title

### Get list of articles

In [None]:
# every hit on the results page is an element with the class "search-result"
items = soup.find_all(class_='search-result')


def text_of(tag):
  """Return the text of a bs4 element, or '' when the element was not found."""
  return tag.get_text() if tag is not None else ''


# iterate through the items
for item in items:
  # the a element constitutes a link; its href attribute is the actual link
  # href=True only matches <a> tags that carry an href, and results without
  # one are skipped instead of crashing on None['href']
  link = item.find('a', href=True)
  if link is None:
    continue
  url = link['href']

  # .find() returns None for a missing element, so read the text through text_of()
  title = text_of(item.find(class_='search-result__title'))
  description = text_of(item.find('p')).strip()
  date = text_of(item.find(class_='search-result__date'))

  print(url)

### Pagination

In [None]:
# check the number of pages: the last pagination element holds the highest page number
# when there are no pagination elements at all, fall back to a single page
page_numbers = soup.find_all(class_='pagination__number')
total_pages = int(page_numbers[-1].get_text()) if page_numbers else 1

# create an empty list
url_list = []

# the search url without the page number; the page number is appended as the "pnb" value
search_url = 'https://www.pbs.org/newshour/search-results?q=%22artificial+intelligence%22&pnb='

for page in range(1, total_pages + 1):
  # wait a little bit between requests
  time.sleep(2)

  # get data from the url (with a timeout so a stalled connection cannot hang the loop)
  res = requests.get(search_url + str(page), timeout=30)

  # stop on an HTTP error rather than silently collecting nothing from an error page
  res.raise_for_status()

  # parse to bs4
  soup = BeautifulSoup(res.content, 'html.parser')

  # .search-result
  items = soup.find_all(class_='search-result')

  # get the urls, ignoring results that have no link
  for item in items:
    link = item.find('a', href=True)
    if link is not None:
      url_list.append(link['href'])


In [None]:
# show the article urls collected from the search result pages
url_list

### And now you can download every page individually!

In [None]:
import os

# folder the pages are saved in; create it up front so open() does not fail
# with FileNotFoundError on a fresh checkout
out_dir = 'data/PBS/'
os.makedirs(out_dir, exist_ok=True)

for url in url_list:
  # wait
  print('Retrieving', url)
  time.sleep(2)

  # get data from the url (with a timeout so one stalled download cannot hang the loop)
  res = requests.get(url, timeout=30)

  # do not store error pages on disk as if they were articles
  if not res.ok:
    print('Skipping', url, '- HTTP status', res.status_code)
    continue

  # parse to bs4
  soup = BeautifulSoup(res.content, 'html.parser')

  # turn the url path into a flat file name: drop the site prefix, replace "/" with "-"
  file = url.replace('https://www.pbs.org/newshour/', '').replace('/', '-') + '.html'

  # save to disk; write as UTF-8 explicitly so the result does not depend on
  # the platform's default encoding
  with open(out_dir + file, 'w', encoding='utf-8') as f:
    f.write(str(soup))