In [2]:
import requests, bs4
import time
import pandas as pd

# Seconds to pause after each school portrait request.
DELAY_BETWEEN_PAGES = 2

# Both entry points live under the same directory of the school register.
_DIRECTORY_ROOT = 'https://www.bildung.berlin.de/Schulverzeichnis'
BASE_URL = _DIRECTORY_ROOT + '/SchulListe.aspx'
BASE_PORTRAIT = _DIRECTORY_ROOT + '/Schulportrait.aspx'

# Example URLs (as of 2022):
#   list page:     https://www.bildung.berlin.de/Schulverzeichnis/SchulListe.aspx?BezNr=06
#   portrait page: https://www.bildung.berlin.de/Schulverzeichnis/Schulportrait.aspx?IDSchulzweig=%2023251

# Two-digit district number (used as the BezNr query value) -> district name.
bezirk_dict = {
    '01': 'Mitte',
    '02': 'Friedrichshain-Kreuzberg',
    '03': 'Pankow',
    '04': 'Charlottenburg-Wilmersdorf',
    '05': 'Spandau',
    '06': 'Steglitz-Zehlendorf',
    '07': 'Tempelhof-Schöneberg',
    '08': 'Neukölln',
    '09': 'Treptow-Köpenick',
    '10': 'Marzahn-Hellersdorf',
    '11': 'Lichtenberg',
    '12': 'Reinickendorf',
}

In [15]:
def get_bezirk_page(bezirk_nummer, timeout=30):
    """Download and parse the school list page of one district.

    Parameters
    ----------
    bezirk_nummer : str
        District number that is appended as the ``BezNr`` query value.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the notebook indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    bezirk_url = BASE_URL + '?BezNr=' + bezirk_nummer
    res = requests.get(bezirk_url, timeout=timeout)
    res.raise_for_status()
    return bs4.BeautifulSoup(res.text, 'html.parser')

def get_school_ids(soup_area):
    """Collect the ``IDSchulzweig`` values of all school links on a list page.

    The value is read from each anchor's ``href`` attribute rather than from
    the rendered tag text, so the result does not depend on the attribute
    order inside the tag or on the ID having a fixed number of characters.

    Parameters
    ----------
    soup_area : bs4.BeautifulSoup
        Parsed list page; only ``select('a')`` and ``get('href')`` on the
        returned tags are used.

    Returns
    -------
    list of str
        IDs in document order (duplicates are kept), with the leading blank
        or ``%20`` removed.
    """
    marker = 'IDSchulzweig='
    school_ids = []
    for link in soup_area.select('a'):
        href = link.get('href') or ''
        if marker not in href:
            continue
        # Take the query value up to the next parameter, then drop the blank
        # (literal or URL-encoded) that precedes the ID.
        raw_value = href.split(marker, 1)[1].split('&', 1)[0]
        school_id = raw_value.replace('%20', '').strip()
        if school_id:
            school_ids.append(school_id)
    return school_ids

def get_school_info(school_id, timeout=30):
    """Download one school portrait page and extract its contact details.

    Parameters
    ----------
    school_id : str
        Value appended to the ``IDSchulzweig=%20`` query of the portrait URL.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot block the
        scrape indefinitely.

    Returns
    -------
    dict
        Keys ``schulname``, ``schulart``, ``strasse``, ``ort``, ``tel``,
        ``email`` and ``leitung`` (in that order) mapped to the stripped text
        of the matching element. A field whose element is not present on the
        page is returned as an empty string instead of raising.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout``.
    """
    # (result key, tag name, suffix of the element id) for every field.
    fields = (
        ('schulname', 'span', 'lblSchulname'),
        ('schulart', 'span', 'lblSchulart'),
        ('strasse', 'span', 'lblStrasse'),
        ('ort', 'span', 'lblOrt'),
        ('tel', 'span', 'lblTelefon'),
        ('email', 'a', 'HLinkEMail'),
        ('leitung', 'span', 'lblLeitung'),
    )
    id_prefix = 'ContentPlaceHolderMenuListe_'

    school_url = BASE_PORTRAIT + '?IDSchulzweig=%20' + school_id
    res_school = requests.get(school_url, timeout=timeout)
    res_school.raise_for_status()
    soup_school = bs4.BeautifulSoup(res_school.text, 'html.parser')
    general = soup_school.find('div', {'id': 'divAllgemein'})

    result = dict()
    for key, tag_name, id_suffix in fields:
        element = None
        if general is not None:
            element = general.find(tag_name, {'id': id_prefix + id_suffix})
        # find() returns None for a missing element; fall back to '' so one
        # incomplete portrait does not abort the whole scrape.
        result[key] = element.getText().strip() if element is not None else ''
    return result

def clean_up_data(school_info):
    """Turn the scraped school records into a de-duplicated contact table.

    Steps:
      1. strip ``%09``, blanks and ``%20`` from the ``email`` field,
      2. split multi-address fields on ``;`` and keep the first address as
         ``email1`` (the second, if any, as ``email2``),
      3. drop rows whose ``email`` contains no ``@``,
      4. drop rows whose ``email1`` already occurred in an earlier row.

    Parameters
    ----------
    school_info : list of dict
        Records with at least the keys ``schulart``, ``schulname``,
        ``strasse``, ``ort``, ``tel``, ``email`` and ``leitung``.

    Returns
    -------
    pandas.DataFrame
        Columns ``schulart, schulname, strasse, ort, tel, email1, leitung``.
        An empty input yields an empty frame with these columns.
    """
    COLS = ['schulart', 'schulname', 'strasse', 'ort', 'tel', 'email1', 'leitung']

    result = pd.DataFrame(school_info)
    if result.empty or 'email' not in result.columns:
        # Nothing was scraped: return an empty table instead of failing on
        # the missing 'email' column.
        return pd.DataFrame(columns=COLS)

    email = result['email'].fillna('').astype(str)
    # Literal (non-regex) replacements, in the same order as before.
    for junk in ('%09', ' ', '%20'):
        email = email.str.replace(junk, '', regex=False)
    result = result.assign(email=email)

    # expand=True yields one column per address part; reindex pins this to
    # exactly two columns whether the data contains one, two or more parts.
    parts = email.str.split(';', expand=True).reindex(columns=[0, 1])
    result['email1'] = parts[0]
    result['email2'] = parts[1]

    result = result[result['email'].str.contains('@', regex=False)]
    result = result[~result['email1'].duplicated()]

    return result[COLS]


In [24]:
# Scrape every district in turn and write one CSV file per district into the
# current working directory.
for bezirk_nummer, bezirk_name in bezirk_dict.items():
    # if bezirk_nummer != '01': continue #uncomment if only one Bezirk is required
    print('Scraping :', bezirk_name)
    soup_area = get_bezirk_page(bezirk_nummer)
    school_ids = get_school_ids(soup_area)
    school_info = list()
    for n, school_id in enumerate(school_ids, start=1):
        try:
            school_info.append(get_school_info(school_id))
        except requests.RequestException as err:
            # One unreachable portrait page must not discard the schools that
            # were already scraped for this district.
            print('Skipping school', school_id, ':', err)
        # Pause after every request, successful or not, to go easy on the server.
        time.sleep(DELAY_BETWEEN_PAGES)
        if n % 25 == 0:
            print(n)  # progress marker every 25 schools
    if not school_info:
        # No records to clean up or export for this district.
        print('No schools scraped for', bezirk_name)
        continue
    result = clean_up_data(school_info)
    result.to_csv(bezirk_nummer + '_' + bezirk_name + '.csv')



Scraping : Reinickendorf
25
50
75
100
