In [115]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from collections import defaultdict

In [114]:
# Retrieve the coronavirus news overview page from amsterdam.nl.
URL = 'https://www.amsterdam.nl/nieuws/coronavirus/nieuwsoverzicht-coronavirus/'
# Without a timeout, requests waits indefinitely on an unresponsive server.
page = requests.get(URL, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page further down.
page.raise_for_status()
text = page.text

**Beautiful Soup documentation**

https://beautiful-soup-4.readthedocs.io/en/latest/

In [106]:
# Parse the downloaded HTML with the parser from the Python standard library.
soup = BeautifulSoup(text, 'html.parser')

# The news items sit in the element with class 'tekst' inside the container
# with the id below. find() returns None when nothing matches, so check each
# step explicitly instead of failing with an AttributeError on None.
news_container = soup.find(id='PagCls_15511847')
ams_news = news_container.find(class_='tekst') if news_container is not None else None
if ams_news is None:
    raise ValueError(
        "Could not locate the news section (id='PagCls_15511847', class 'tekst'); "
        "the page layout may have changed."
    )

# Remove and unwrap container elements.
# unwrap() replaces a tag by its own contents; decompose() deletes the tag
# together with everything inside it.
for el in ams_news.find_all('div'):
    el.unwrap()
for el in ams_news.find_all(class_='visuallyhidden'):
    el.decompose()
for el in ams_news.find_all(class_='datetime'):
    el.unwrap()
# Links that are wrapped in a paragraph are lifted out of that paragraph.
for el in ams_news.find_all(class_='siteLink'):
    if el.parent.name == 'p':
        el.parent.unwrap()

# To inspect the cleaned HTML while developing, run:
# print(ams_news.prettify())

In [113]:
# Loop over all time elements in the unwrapped soup. Every <time> tag starts one
# publication; the sibling tags up to the next <time> tag are read as its content.
def parse_publication(time_tag):
    """Collect one news item from a <time> tag and the sibling tags after it.

    Parameters
    ----------
    time_tag
        A parsed <time> element carrying a 'datetime' attribute.

    Returns
    -------
    dict
        Always contains 'timestamp'. Depending on the tags encountered it may
        also contain 'title' (text of an <h3>), 'descr' (first <p>), 'descr2'
        (all further <p> texts joined by a space), and 'link_href' /
        'link_desc' (from an <a>). When a tag kind occurs more than once,
        the last <h3> and the last <a> win.

    Raises
    ------
    KeyError
        If the tag has no 'datetime' attribute.
    ValueError
        If the part of the attribute before the first '.' does not match
        '%Y-%m-%dT%H:%M:%S'.
    """
    # The attribute has the form '2020-02-28T19:22:00.0000000'. Only the part
    # before the dot is parsed, so a non-zero fractional part no longer raises
    # (sub-second precision is dropped).
    raw_stamp = time_tag.attrs['datetime']
    record = {
        'timestamp': datetime.strptime(raw_stamp.split('.', 1)[0],
                                       '%Y-%m-%dT%H:%M:%S')
    }

    for elem in time_tag.find_next_siblings():
        # The next <time> tag belongs to the following publication.
        if elem.name == 'time':
            break

        if elem.name == 'h3':
            record['title'] = elem.get_text()
        elif elem.name == 'p':
            paragraph = elem.get_text()
            if 'descr' not in record:
                # First paragraph encountered is the main text.
                record['descr'] = paragraph
            elif 'descr2' not in record:
                record['descr2'] = paragraph
            else:
                # Separate paragraphs with a space so words do not fuse together.
                record['descr2'] += ' ' + paragraph
        elif elem.name == 'a':
            # get() avoids a KeyError for anchors without an href attribute.
            record['link_href'] = elem.get('href', '')
            record['link_desc'] = elem.get_text()

    return record


collection = [parse_publication(time_tag) for time_tag in ams_news.find_all('time')]

# Build the table in one go, index it by publication time and sort oldest first.
output_df = (
    pd.DataFrame(collection)
    .set_index('timestamp')
    .sort_index()
)

output_df.head()

Unnamed: 0_level_0,title,descr,link_href,link_desc,descr2
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-02-28 19:22:00,Persconferentie,Naar aanleiding van de besmetting van een Amst...,,,De GGD Amsterdam is op basis van de landelijke...
2020-02-29 13:08:00,2 contacten testten positief voor COVID-19,Vannacht testten directe contacten van de pati...,,,De GGD Amsterdam benaderde gisteren contacten ...
2020-02-29 16:42:00,Partner en jongste kind van de besmette vrouw ...,Vandaag is bekend geworden dat de partner en h...,,,De GGD Amsterdam zet daarom het contactonderzo...
2020-03-04 08:36:00,Jongste kind patiënt Diemen heeft het coronavi...,Het jongste kind van de patient in Diemen is t...,,,
2020-03-04 15:04:00,Nieuw bevestigd geval van het coronavirus in A...,Er is een nieuwe besmetting in Amsterdam. Het ...,,,


In [116]:
# Write the news table to a CSV file in the current directory.
csv_path = './data-amsterdam-covid-nieuws.csv'
output_df.to_csv(csv_path)