In [91]:
from bs4 import BeautifulSoup
from selenium import webdriver
from chromedriver_py import binary_path
import csv, random
from time import sleep

# Listing pages (one per city) whose HTML tables hold the postal codes scraped below.
berlin_url = 'https://postal-codes.cybo.com/germany/berlin/#listcodes'
amsterdam_url = 'https://postal-codes.cybo.com/netherlands/amsterdam/#listcodes'

I use Selenium because the pages are protected against basic web scraping.

In [3]:
# Start a Chrome browser session using the chromedriver binary located by chromedriver_py.
# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 in favour of a
# `Service` object — confirm which Selenium version this notebook is pinned to.
driver = webdriver.Chrome(executable_path=binary_path)

In [5]:
def _rendered_body_html(url):
    """Open `url` in the shared Selenium driver and return the rendered <body> markup."""
    driver.get(url)
    return driver.execute_script("return document.body.innerHTML;")


# Fetch the Berlin listing first, then the Amsterdam one.
berlin_page_html = _rendered_body_html(berlin_url)
amsterdam_page_html = _rendered_body_html(amsterdam_url)

In [7]:
# Parse the rendered markup of both listing pages with Python's built-in HTML parser.
berlin_soup = BeautifulSoup(markup=berlin_page_html, features='html.parser')
amsterdam_soup = BeautifulSoup(markup=amsterdam_page_html, features='html.parser')

First, I scrape the tables containing all the postal codes of each city from `berlin_url` and `amsterdam_url`. Then I save them to disk to avoid scraping the pages again.

In [64]:
# Dump the Berlin postal-code table to a semicolon-separated CSV file.
with open('berlin_codes.csv', mode='w', newline='', encoding="utf-8") as out_file:
    writer = csv.writer(out_file, delimiter=';')

    # The postal-code listing is the third <table> on the Berlin page.
    berlin_html_table = berlin_soup.find_all('table')[2]
    for table_row in berlin_html_table.find_all('tr'):
        cleaned_cells = []
        for cell in table_row.find_all(['td', 'th']):
            # Drop the unit from the values and move it into the "Area" header instead.
            text = cell.text.replace(' km²', '')
            text = text.replace('Area', 'Area km²')
            cleaned_cells.append(text.strip())
        writer.writerow(cleaned_cells)

In [65]:
# Dump the Amsterdam postal-code table to a semicolon-separated CSV file.
with open('amsterdam_codes.csv', mode='w', newline='', encoding="utf-8") as out_file:
    writer = csv.writer(out_file, delimiter=';')

    # The postal-code listing is the second <table> on the Amsterdam page.
    amsterdam_html_table = amsterdam_soup.find_all('table')[1]
    for table_row in amsterdam_html_table.find_all('tr'):
        cleaned_cells = []
        for cell in table_row.find_all(['td', 'th']):
            # Drop the unit from the values and move it into the "Area" header instead.
            text = cell.text.replace(' km²', '')
            text = text.replace('Area', 'Area km²')
            cleaned_cells.append(text.strip())
        writer.writerow(cleaned_cells)

Then I collect the details-page link for every postal code, open each link, and scrape the details, saving them to disk to avoid scraping again. Saving is necessary because this web traffic looks very unnatural and could trigger the website's protection.

In [117]:
# Visit the details page of every Berlin postal code and store its median age,
# neighborhoods and coordinates in a semicolon-separated CSV (one line per code).
url_protocol = 'https:'  # the links in the listing are protocol-relative ("//host/...")
berlin_html_table = berlin_soup.find_all('table')[2]
with open('berlin_details.csv', mode='w', newline='', encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=';')
    csv_writer.writerow(['Postal Code', 'Median Age', 'Neighborhoods', 'Latitude', 'Longitude']) # Header

    for code_row in berlin_html_table.find_all('tr'):
        if code_row.find('th'):
            continue  # skip the header row of the listing table
        code_cell = code_row.find('td')
        # Stripped so the key matches the already-stripped codes in berlin_codes.csv.
        postal_code = code_cell.text.strip()
        details_url = url_protocol + code_cell.find('a')['href']
        print(postal_code, details_url)

        # Reset for every postal code: a details page that lacks one of the rows must
        # produce an empty field, not silently repeat the previous postal code's value.
        median_age = neighborhoods = latitude = longitude = ''

        # we need to scrape the details page of each code to get the necessary information
        driver.get(details_url)
        details_soup = BeautifulSoup(driver.execute_script("return document.body.innerHTML;"), 'html.parser')
        details_html_table = details_soup.find('table')
        if details_html_table is None:
            # Fail with the offending URL instead of an opaque AttributeError on None.
            raise RuntimeError('No details table found at {} (request blocked?)'.format(details_url))

        for detail_row in details_html_table.find_all('tr'):
            cells = [cell.text.strip() for cell in detail_row.find_all(['td', 'th'])]
            if len(cells) < 2:
                continue  # not a "label / value" row
            label, value = cells[0], cells[1]
            if label == 'Neighborhoods':
                neighborhoods = value
            elif label == 'Coordinates':
                # Value looks like "<lat>°\xa0/\xa0<lon>°".
                coords = value.replace('°', '').split(u'\xa0/\xa0')
                latitude = coords[0]
                longitude = coords[1]
            elif label == 'Median Age':
                median_age = value.replace(' years', '')

        csv_writer.writerow([postal_code, median_age, neighborhoods, latitude, longitude])
        # Flush after each code so an interrupted run keeps everything scraped so far.
        csv_file.flush()
        # sleep to avoid being blocked by website protection
        sleep(random.randint(5, 9))

10115 https://postal-codes.cybo.com/germany/10115_berlin-berlin/
10117 https://postal-codes.cybo.com/germany/10117_berlin-berlin/
10119 https://postal-codes.cybo.com/germany/10119_berlin-berlin/
10178 https://postal-codes.cybo.com/germany/10178_berlin-berlin/
10179 https://postal-codes.cybo.com/germany/10179_berlin-berlin/
10243 https://postal-codes.cybo.com/germany/10243_berlin-berlin/
10245 https://postal-codes.cybo.com/germany/10245_berlin-berlin/
10247 https://postal-codes.cybo.com/germany/10247_berlin-berlin/
10249 https://postal-codes.cybo.com/germany/10249_berlin-berlin/
10315 https://postal-codes.cybo.com/germany/10315_berlin-berlin/
10317 https://postal-codes.cybo.com/germany/10317_berlin-berlin/
10318 https://postal-codes.cybo.com/germany/10318_berlin-berlin/
10319 https://postal-codes.cybo.com/germany/10319_berlin-berlin/
10365 https://postal-codes.cybo.com/germany/10365_berlin-berlin/
10367 https://postal-codes.cybo.com/germany/10367_berlin-berlin/
10369 https://postal-code

In [120]:
# Visit the details page of every Amsterdam postal code and store its median age,
# neighborhoods and coordinates in a semicolon-separated CSV (one line per code).
url_protocol = 'https:'  # prepended to the hrefs taken from the listing table
amsterdam_html_table = amsterdam_soup.find_all('table')[1]
with open('amsterdam_details.csv', mode='w', newline='', encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file, delimiter=';')
    csv_writer.writerow(['Postal Code', 'Median Age', 'Neighborhoods', 'Latitude', 'Longitude']) # Header

    for code_row in amsterdam_html_table.find_all('tr'):
        if code_row.find('th'):
            continue  # skip the header row of the listing table
        code_cell = code_row.find('td')
        # Stripped so the key matches the already-stripped codes in amsterdam_codes.csv.
        postal_code = code_cell.text.strip()
        details_url = url_protocol + code_cell.find('a')['href']

        # Reset for every postal code: a details page that lacks one of the rows must
        # produce an empty field, not silently repeat the previous postal code's value.
        median_age = neighborhoods = latitude = longitude = ''

        # we need to scrape the details page of each code to get the necessary information
        driver.get(details_url)
        details_soup = BeautifulSoup(driver.execute_script("return document.body.innerHTML;"), 'html.parser')
        details_html_table = details_soup.find('table')
        if details_html_table is None:
            # Fail with the offending URL instead of an opaque AttributeError on None.
            raise RuntimeError('No details table found at {} (request blocked?)'.format(details_url))

        for detail_row in details_html_table.find_all('tr'):
            cells = [cell.text.strip() for cell in detail_row.find_all(['td', 'th'])]
            if len(cells) < 2:
                continue  # not a "label / value" row
            label, value = cells[0], cells[1]
            if label == 'Neighborhoods':
                neighborhoods = value
            elif label == 'Coordinates':
                # Value looks like "<lat>°\xa0/\xa0<lon>°".
                coords = value.replace('°', '').split(u'\xa0/\xa0')
                latitude = coords[0]
                longitude = coords[1]
            elif label == 'Median Age':
                median_age = value.replace(' years', '')

        csv_writer.writerow([postal_code, median_age, neighborhoods, latitude, longitude])
        # Flush after each code so an interrupted run keeps everything scraped so far.
        csv_file.flush()
        # sleep to avoid being blocked by website protection
        sleep(random.randint(9, 13))

In [121]:
# End the whole WebDriver session: quit() closes every window and stops the
# chromedriver process, whereas close() only closes the current window.
driver.quit()