# Scraping the official Bundestag website

In [1]:
# import libraries
import urllib.request
import requests
import re
import pickle
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
import time
import random
import pandas as pd
import numpy as np
import os

# see https://stackoverflow.com/questions/2134706/hitting-maximum-recursion-depth-using-pickle-cpickle/2135176#2135176
import sys
sys.setrecursionlimit(100000)

In [2]:
# Switch into the data directory so that all files written below end up there.
# The relative path only resolves when the session starts in the source file's
# directory (the default when running from Jupyter Notebook).
data_dir = '../../data/web_scraping'
start_dir = os.path.abspath(os.getcwd())  # working directory before the change
os.chdir(data_dir)
# display the working directory that is now active
os.path.abspath(os.getcwd())

'C:\\Users\\Simon\\OneDrive\\Uni\\LMU\\SS 2020\\Statistisches Consulting\\Bundestag-MP-Analyse\\data\\web_scraping'

### Setup

In [5]:
# set up selenium web driver
chdriver_path = r'C:\Users\Simon\chromedriver.exe' # download chromedriver, save locally
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
# pass the configured options to the driver; if they are not handed over,
# none of the arguments added above has any effect on the browser session
driver = webdriver.Chrome(chdriver_path, options=options)

In [6]:
# open the MP overview page in the selenium-controlled browser
website = "https://www.bundestag.de/abgeordnete"
driver.get(website)

# switch the overview to list view by clicking the list button
list_button = driver.find_element_by_class_name('icon-list-bullet')
click_chain = webdriver.ActionChains(driver)
click_chain.move_to_element(list_button)
click_chain.click(list_button)
click_chain.perform()
# pause for a random 5-10 seconds so the list view can finish loading
time.sleep(random.randint(5, 10))

In [8]:
# number of MP entries in the list view (includes dropouts and successors)
mp_list_holder = driver.find_element_by_class_name('bt-list-holder')
len(mp_list_holder.find_elements_by_tag_name('li'))

### Scraping

In [9]:
# build the base dictionary from the list view:
# key = position of the MP in the list, value = [name, party]

abg_dict = {}
list_items = driver.find_element_by_class_name('bt-list-holder').find_elements_by_tag_name('li')
for position, item in enumerate(list_items):
    # name (h3) and party (p) sit in the same text container of a list entry
    text_box = item.find_element_by_class_name('bt-teaser-person-text')
    mp_name = text_box.find_element_by_tag_name('h3').text
    mp_party = text_box.find_element_by_tag_name('p').text
    abg_dict[position] = [mp_name, mp_party]
abg_count = len(abg_dict)  # total number of MPs collected

In [13]:
# for each MP, retrieve all information (except name and party) from individual page
# note that timeout errors might occur when waiting for a site to load
# in that case, simply rerun this loop: MPs that were already completed are skipped,
# so their detail fields are not appended a second time
for abg in abg_dict:

    # an entry holds only [name, partei] until its six detail fields are appended
    # in one step at the end of an iteration; anything longer is already complete
    if len(abg_dict[abg]) > 2:
        continue

    # (re-)load list view (for all iterations)
    driver.get(website)
    element = driver.find_element_by_class_name('icon-list-bullet')
    webdriver.ActionChains(driver).move_to_element(element).click(element).perform() # click to change to list view
    time.sleep(random.randint(8, 10)) # wait for list view to load

    # the dictionary key is the MP's position in the list
    driver.find_element_by_class_name('bt-list-holder').find_elements_by_tag_name('li')[abg].click() # click to open individual page
    time.sleep(random.randint(8, 10)) # wait for page to load

    soup = BeautifulSoup(driver.page_source, 'lxml') # convert page to soup

    # extract election mode (wahlart), state (bundesland), and electoral district (wahlkreis)
    wahlart = soup.find_all('h4')[4].text
    wahl_box = soup.find('div', attrs={'class': 'col-xs-12 col-sm-6 bt-standard-content'})
    bundesland = wahl_box.h5.text
    # the district is read from a link; fall back to "n.a." when there is none
    wahlkreis = wahl_box.a.text if wahl_box.a is not None else "n.a."

    # extract committee memberships (ausschuesse)
    # all collapsible sections of the page, looked up once and addressed by fixed position
    sections = soup.find_all(class_='bt-collapse-padding-bottom')

    ## keys: position within committee (e.g., ordentliches mitglied, stellvertretendes mitglied)
    keys = [heading.text.strip() for heading in sections[4].find_all('h5')]

    ## values: the actual committees (one list of titles per <ul>)
    values = [[a['title'] for a in ul.find_all('a')] for ul in sections[4].find_all('ul')]

    ## in case of memberships in other (non-Bundestag) bodies
    if len(soup.find_all('h4', string='Mitgliedschaften in sonstigen Gremien')) == 1:
        keys += [f'{heading.text.strip()} (sonstige Gremien)' for heading in sections[5].find_all('h5')]
        values += [[a['title'] for a in ul.find_all('a')] for ul in sections[5].find_all('ul')]

    # NOTE(review): when a 'Funktion' heading exists, every membership list is left
    # empty -- presumably because that extra section shifts the fixed section indices
    # used above; confirm against an affected MP page
    ausschuesse = {}
    if len(soup.find_all('h4', string='Funktion')) == 0:
        for i, key in enumerate(keys):
            ausschuesse[key] = values[i]
    else:
        for key in keys:
            ausschuesse[key] = []

    # extract social media accounts (soziale medien): link title -> URL
    social_media = {}
    if len(soup.find_all('h5', string='Profile im Internet')) == 1:
        for link in soup.find_all(class_='bt-linkliste')[0].find_all('a'):
            social_media[link['title']] = link.get('href')

    # extract biography (biografie): text of all paragraphs in the first collapsible
    # section, with non-breaking spaces replaced by regular spaces
    biografie = ''.join(
        p.text.strip('\n').replace(u'\xa0', u' ') for p in sections[0].find_all('p')
    )

    # append all detail fields together so an entry is never left half-filled
    abg_dict[abg].extend([wahlart, bundesland, wahlkreis, ausschuesse, social_media, biografie])
    

### Saving output

In [81]:
# write the raw MP dictionary to 'abg_dict.pickle' in the current working directory
with open('abg_dict.pickle', 'wb') as pickle_file:
    pickle.dump(
        abg_dict,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [6]:
# turn the dictionary into a dataframe with one row per MP and name the columns
# (the dictionary keys become columns first, hence the transpose)
column_names = [
    'Name', 'Partei', 'Wahlart', 'Bundesland',
    'Wahlkreis', 'Ausschuesse', 'Soziale Medien', 'Biografie',
]
df = pd.DataFrame(abg_dict).T
df.columns = column_names

In [7]:
# split Wahlkreis into number and name
# the raw value is "<label>: <name>", where the label is presumably "Wahlkreis <number>";
# MPs without an electoral district carry the placeholder "n.a."
def _wahlkreis_nr(raw):
    """Return the district number as int, or "" for the "n.a." placeholder."""
    if raw == "n.a.":
        return ""
    label = raw.split(':', 1)[0].strip()
    # remove the literal prefix; str.strip('Wahlkreis') would instead strip any of
    # those characters from both ends and only works by accident
    if label.startswith('Wahlkreis'):
        label = label[len('Wahlkreis'):]
    return int(label)  # int() tolerates the remaining surrounding whitespace


def _wahlkreis_name(raw):
    """Return the district name, or "" for the "n.a." placeholder."""
    if raw == "n.a.":
        return ""
    # split only at the first colon so names that contain a colon stay intact
    return raw.split(':', 1)[1].strip(' ')


# the number must be derived first, because the next line overwrites 'Wahlkreis'
df['Wahlkreis-Nr.'] = df['Wahlkreis'].apply(_wahlkreis_nr)
df['Wahlkreis'] = df['Wahlkreis'].apply(_wahlkreis_name)

In [8]:
# bring the columns into their final order (district number right after district name)
columns_titles = [
    'Name',
    'Partei',
    'Wahlart',
    'Bundesland',
    'Wahlkreis',
    'Wahlkreis-Nr.',
    'Ausschuesse',
    'Soziale Medien',
    'Biografie',
]
df = df.reindex(columns=columns_titles)

In [6]:
# saving final dataframe
# the dataframe built in the cells above is named `df`; no `abg_df` is defined there,
# so dumping that name would raise a NameError
with open('abg_df.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)