# Scraping the official Bundestag website

### Prerequisites

In [1]:
# Libraries

import pandas as pd # data wrangling
import numpy as np # math operations
import math # math operations
import os # directories
import time # system time
import random # random number generation
import pickle # object serialization (used to save the scraped dataframe)
import re # regular expressions
import unidecode # transliteration of Unicode text to ASCII

import urllib.request # scraping
import requests # scraping
from bs4 import BeautifulSoup # scraping
import ctypes # interface to C (used for the Windows message box after scraping)
import tweepy # Twitter API

import sys # interpreter settings (recursion limit, see below)
# Raise Python's maximum recursion depth to 100000.
# NOTE(review): the reason for this is not visible in this notebook - confirm it is still needed.
sys.setrecursionlimit(100000)

import selenium # chrome driver
from selenium import webdriver # chrome driver
import selenium.common.exceptions as selexcept # exception handling

### Setup

The _chromedriver_ web driver must be downloaded and saved locally before running the code below (https://chromedriver.chromium.org/downloads).
Important: make sure to download the build for the operating system in use, and that the chromedriver version matches the version of the installed Chrome browser (an error is raised if they do not match).

In [4]:
# Set up selenium web driver

# Location of the locally stored chromedriver executable
chdriver_path = r'../1_input/chromedriver.exe'

# Browser options: ignore certificate errors, use an incognito session and
# run without a visible browser window
options = webdriver.ChromeOptions()
for flag in ('--ignore-certificate-errors', '--incognito', '--headless'):
    options.add_argument(flag)

# The options must be handed to the driver explicitly; previously they were
# built but never passed, so none of the flags above took effect.
# NOTE(review): with '--headless' now active no browser window is shown -
# drop that flag above if a visible browser is wanted.
driver = webdriver.Chrome(chdriver_path, options=options)

In [5]:
# Open the MP overview page of the Bundestag website

website = "https://www.bundestag.de/abgeordnete"
driver.get(website)

# Locate the "List" button, move the pointer onto it and click it to switch to list view

element = driver.find_element_by_class_name('icon-list-bullet')
list_view_click = webdriver.ActionChains(driver).move_to_element(element).click(element)
list_view_click.perform()

# Pause for a random 15-20 seconds so that the list view can finish loading

load_pause = random.randint(15, 20)
time.sleep(load_pause)

In [4]:
# Number of MP entries in the list view (dropouts and successors are listed as well)

mp_list = driver.find_element_by_class_name('bt-list-holder')
mp_entries = mp_list.find_elements_by_tag_name('li')
len(mp_entries)

745

### Scraping

In [5]:
# All <li> entries of the list view, one per MP

mp_items = driver.find_element_by_class_name('bt-list-holder').find_elements_by_tag_name('li')

# Build one record per MP: the name is read from the <h3>, the party from the <p>
# inside the entry's teaser text element

abg_records = []
for link in mp_items:
    teaser_text = link.find_element_by_class_name('bt-teaser-person-text')
    abg_records.append({
        'name': teaser_text.find_element_by_tag_name('h3').text,
        'party': teaser_text.find_element_by_tag_name('p').text,
    })

# Turn the records into a pandas dataframe

abg_df = pd.DataFrame(abg_records)

In [6]:
# Split "last name, first name" at the first ", " into two separate columns

name_parts = abg_df['name'].str.split(", ", n=1, expand=True)

# Add the two name columns and discard the combined one

abg_df = abg_df.assign(last_name=name_parts[0], first_name=name_parts[1]).drop(columns=['name'])

# Move the name columns (currently the last two) to the front

remaining_cols = abg_df.columns[:-2].tolist()
abg_df = abg_df.reindex(columns=['last_name', 'first_name'] + remaining_cols)

In [7]:
# Append empty columns for the information that is scraped from the individual pages below

new_cols = ['bundesland', 'wahlkreis_name', 'wahlkreis_nr', 'wahlkreis', 'twitter']
abg_df = abg_df.reindex(columns=[*abg_df.columns, *new_cols])

# Inspect

abg_df.head()

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter
0,Abercron,Dr. Michael von,CDU/CSU,,,,,
1,Achelwilm,Doris,Die Linke,,,,,
2,Aggelidis,Grigorios,FDP,,,,,
3,Akbulut,Gökay,Die Linke,,,,,
4,Albani,Stephan,CDU/CSU,,,,,


In [8]:
# Index labels of all MPs whose 'bundesland' has not been filled in yet

not_scraped = abg_df['bundesland'].isnull()
abg_range = abg_df.index[not_scraped]
abg_range

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            735, 736, 737, 738, 739, 740, 741, 742, 743, 744],
           dtype='int64', length=745)

In [9]:
# Visit every MP's individual page and fill in the placeholder columns of abg_df.
#
# Page loads occasionally fail (IndexError, AttributeError or NoSuchElementException).
# Reassigning `abg_range` inside a running `for` loop does not change what that loop
# iterates over, so MPs whose page failed to load used to be skipped silently. Here
# the set of MPs that still lack data is recomputed before every pass, and passes are
# repeated until nothing is missing or MAX_PASSES is reached.

MAX_PASSES = 5  # upper bound on passes, so a permanently failing page cannot loop forever
DETAIL_CLASS = 'col-xs-12 col-sm-6 bt-standard-content'  # div holding state and district

for scrape_pass in range(MAX_PASSES):

    # MPs without a 'bundesland' value have not been scraped successfully yet
    abg_range = abg_df.index[abg_df['bundesland'].isnull()]
    if len(abg_range) == 0:
        break

    for abg in abg_range:

        try:

            # (Re-)load the overview page and click to change to list view

            driver.get(website)
            element = driver.find_element_by_class_name('icon-list-bullet')
            webdriver.ActionChains(driver).move_to_element(element).click(element).perform()

            # Wait for list view to load

            time.sleep(random.randint(3, 5))

            # Click the list entry at position `abg` to open the individual page

            driver.find_element_by_class_name('bt-list-holder')\
                .find_elements_by_tag_name('li')[abg].click()

            # Wait for page to load

            time.sleep(random.randint(3, 5))

            # Convert page to soup

            soup = BeautifulSoup(driver.page_source, 'lxml')

            # Extract state (bundesland) and electoral district (wahlkreis).
            # The container is looked up once; if it is missing, `details` is None
            # and the attribute access raises AttributeError (handled below).

            details = soup.find('div', attrs={'class': DETAIL_CLASS})
            bundesland = details.h5.text
            wahlkreis = details.a.text if details.a is not None else "n.a."

            # Split wahlkreis ("Wahlkreis 007: Pinneberg") in name and ID.
            # Only the first colon separates ID and name, and the ID is taken as
            # the first run of digits instead of relying on character stripping.

            if wahlkreis != "n.a.":
                label, _, district = wahlkreis.partition(':')
                wahlkreis_name = district.strip(' ')
                nr_match = re.search(r'\d+', label)
                wahlkreis_nr = int(nr_match.group()) if nr_match else ""
            else:
                wahlkreis_name = ""
                wahlkreis_nr = ""

            # Extract social media accounts (link title -> link target)

            social_media = {}

            if len(soup.find_all('h5', string='Profile im Internet')) == 1:
                for link in soup.find_all(class_='bt-linkliste')[0].find_all('a'):
                    # .get() avoids an uncaught KeyError for anchors without a title
                    social_media[link.get('title')] = link.get('href')

            # Write the results only after everything has been extracted, so a
            # failed page never leaves a half-filled row behind

            abg_df.loc[abg, 'bundesland'] = bundesland
            abg_df.loc[abg, 'wahlkreis'] = wahlkreis
            abg_df.loc[abg, 'wahlkreis_name'] = wahlkreis_name
            abg_df.loc[abg, 'wahlkreis_nr'] = wahlkreis_nr
            abg_df.loc[abg, 'twitter'] = social_media.get('Twitter', "")

            if abg%20 == 0:
                print('Data for MP %s successfully retrieved' %abg)

        # These errors occur if a page fails to load: leave the row empty so that
        # the MP is picked up again in the next pass

        except (IndexError, AttributeError, selexcept.NoSuchElementException):
            continue

# Report the outcome, including how many MPs could not be scraped

remaining = int(abg_df['bundesland'].isnull().sum())
if remaining == 0:
    message = "MP data successfully scraped"
else:
    message = "Scraping stopped with %s MPs still missing" % remaining

# ctypes.windll only exists on Windows; fall back to a plain print elsewhere

if hasattr(ctypes, 'windll'):
    ctypes.windll.user32.MessageBoxW(0, message, "Progress Report")
else:
    print(message)

Data for MP 0 successfully retrieved


KeyboardInterrupt: 

In [10]:
# Inspect the first 15 rows of the dataframe
abg_df.iloc[:15]

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter,username
0,Abercron,Dr. Michael von,CDU/CSU,Schleswig-Holstein,Pinneberg,7.0,Wahlkreis 007: Pinneberg,,https://twitter.com/mvabercron/
1,Achelwilm,Doris,Die Linke,,,,,,
2,Aggelidis,Grigorios,FDP,Niedersachsen,Hannover-Land I,43.0,Wahlkreis 043: Hannover-Land I,,https://twitter.com/aggelidis_fdp?lang=de
3,Akbulut,Gökay,Die Linke,Baden-Württemberg,Mannheim,275.0,Wahlkreis 275: Mannheim,,https://twitter.com/akbulutgokay?lang=de
4,Albani,Stephan,CDU/CSU,,,,,,
5,Alt,Renata,FDP,,,,,,
6,Altenkamp,Norbert,CDU/CSU,Hessen,Main-Taunus,181.0,Wahlkreis 181: Main-Taunus,,
7,Altmaier,Peter,CDU/CSU,,,,,,
8,Amthor,Philipp,CDU/CSU,Mecklenburg-Vorpommern,Mecklenburgische Seenplatte I – Vorpommern-Gre...,16.0,Wahlkreis 016: Mecklenburgische Seenplatte I –...,,
9,Amtsberg,Luise,Bündnis 90/Die Grünen,,,,,,


### Saving output

In [23]:
# Write the dataframe to disk as a pickle (highest available protocol) ...

with open('../3_output/abg_df.pickle', mode='wb') as pickle_file:
    pickle.dump(abg_df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

# ... and as a CSV file without the index column, encoded as UTF-8 with BOM

abg_df.to_csv('../3_output/abg_df.csv', encoding='utf-8-sig', index=False)