# Scraping the official Bundestag website

### Prerequisites

In [1]:
# Libraries

import urllib.request
import requests
import re
import pickle
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
import time
import random
import pandas as pd
import numpy as np
import os
import ctypes

# see https://stackoverflow.com/questions/2134706/hitting-maximum-recursion-depth-using-pickle-cpickle/2135176#2135176
import sys
sys.setrecursionlimit(100000)

In [2]:
# Define user path (comment out the irrelevant one) and set working directory

my_path = 'C:\\Users\\wimme\\Documents\\1_uni\\1_master\\consulting\\projects\\consulting\\1_scraping'
# my_path = 'Asmiks path'
os.chdir(my_path)
os.path.abspath(os.getcwd())

'C:\\Users\\wimme\\Documents\\1_uni\\1_master\\consulting\\projects\\consulting\\1_scraping'

### Setup

Web driver _chromedriver_ must be downloaded and saved locally prior to execution of the following coede.

In [3]:
# Set up selenium web driver

chdriver_path = r'C:\\Users\\wimme\\Documents\\1_uni\\1_master\\consulting\\projects\\consulting\\1_scraping\\input\\chromedriver.exe' 
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(chdriver_path)

In [4]:
# Initialize selenium web driver

website = "https://www.bundestag.de/abgeordnete"
driver.get(website)
# Switch to list view - first, find "List" button
element = driver.find_element_by_class_name('icon-list-bullet') 
# Click button
webdriver.ActionChains(driver).move_to_element(element).click(element).perform()
# Wait for list view to load
time.sleep(random.randint(15, 20))

In [5]:
# Count how many MPs are listed on the website (includes dropouts and successors)

len(driver.find_element_by_class_name('bt-list-holder').find_elements_by_tag_name('li'))

738

### Scraping

In [16]:
# Set up a dataframe with MP name and party only (from list)

abg_df = []

for link in driver.find_element_by_class_name('bt-list-holder').find_elements_by_tag_name('li'):
    
    abg_df.append(
        {'name': link.find_element_by_class_name('bt-teaser-person-text').find_element_by_tag_name('h3').text,
         'party': link.find_element_by_class_name('bt-teaser-person-text').find_element_by_tag_name('p').text,
        }
    )
    
abg_df = pd.DataFrame(abg_df)

In [17]:
# Separate names in first and last names

name_concat = abg_df['name'].str.split(", ", n = 1, expand = True) 
abg_df['last_name'] = name_concat[0] 
abg_df['first_name'] = name_concat[1] 
abg_df.drop(columns = ['name'], inplace = True) 
abg_df = abg_df.reindex(
    columns = ['last_name', 'first_name'] + list(abg_df.columns[:-2]))

In [18]:
abg_df = abg_df.reindex(columns = abg_df.columns.tolist() + 
                        ['bundesland', 'wahlkreis_name', 'wahlkreis_nr', 'wahlkreis', 'twitter'])

In [19]:
abg_df.head()

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter
0,Abercron,Dr. Michael von,CDU/CSU,,,,,
1,Achelwilm,Doris,Die Linke,,,,,
2,Aggelidis,Grigorios,FDP,,,,,
3,Akbulut,Gökay,Die Linke,,,,,
4,Albani,Stephan,CDU/CSU,,,,,


In [10]:
# Create range to loop over

abg_range = abg_df.index[abg_df['bundesland'].isnull()]
abg_range

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            728, 729, 730, 731, 732, 733, 734, 735, 736, 737],
           dtype='int64', length=738)

In [20]:
for abg in abg_range:
    
    try:
        
        # (Re-)load list view (for all iterations)
        driver.get(website)
        element = driver.find_element_by_class_name('icon-list-bullet')
        # Click to change to list view
        webdriver.ActionChains(driver).move_to_element(element).click(element).perform() 
        # Wait for list view to load
        time.sleep(random.randint(3, 5)) 
        # Click to open individual page
        driver.find_element_by_class_name('bt-list-holder').find_elements_by_tag_name('li')[abg].click() 
        # Wait for page to load
        time.sleep(random.randint(3, 5)) 
        
        # Convert page to soup
        soup = BeautifulSoup(driver.page_source, 'lxml') 

        # Extract state (bundesland) and electoral district (wahlkreis)
        
        bundesland = soup.find(
            'div', attrs = {'class': 'col-xs-12 col-sm-6 bt-standard-content'}).h5.text
        wahlkreis = soup.find(
            'div', attrs = {'class': 'col-xs-12 col-sm-6 bt-standard-content'}).a.text \
            if soup.find(
            'div', attrs = {'class': 'col-xs-12 col-sm-6 bt-standard-content'}).a is not None else "n.a."

        # Split wahlkreis in name and ID
        
        wahlkreis_name = wahlkreis.split(':')[1].strip(' ') if wahlkreis not in ["n.a.", None] else ""
        wahlkreis_nr = int(
            wahlkreis.split(':')[0].strip('Wahlkreis').strip('')) if wahlkreis not in ["n.a.", None] else ""
        
        # Extract social media account
        
        social_media = {}

        if len(soup.find_all('h5', string = 'Profile im Internet')) == 1:
            for link in soup.find_all(class_ = 'bt-linkliste')[0].find_all('a'):
                social_media[link['title']] = link.get('href')

        abg_df.loc[abg, 'bundesland'] = bundesland
        abg_df.loc[abg, 'wahlkreis'] = wahlkreis
        abg_df.loc[abg, 'wahlkreis_name'] = wahlkreis_name
        abg_df.loc[abg, 'wahlkreis_nr'] = wahlkreis_nr
        abg_df.loc[abg, 'twitter'] = social_media['Twitter'] if 'Twitter' in social_media else ""
        
        if abg%20 == 0:
            print('Data for MP %s successfully retrieved' %abg)
        
    # In case of IndexError or AttributeError, which occurs if page fails to load, try again
    
    except (IndexError, AttributeError):
        abg_range = abg_df.index[abg_df['bundesland'].isnull()]
        
ctypes.windll.user32.MessageBoxW(0, "MP data successfully scraped", "Progress Report")   

Data for MP 0 successfully retrieved
Data for MP 20 successfully retrieved
Data for MP 40 successfully retrieved
Data for MP 60 successfully retrieved
Data for MP 80 successfully retrieved
Data for MP 100 successfully retrieved
Data for MP 120 successfully retrieved
Data for MP 140 successfully retrieved
Data for MP 160 successfully retrieved
Data for MP 180 successfully retrieved
Data for MP 200 successfully retrieved
Data for MP 220 successfully retrieved
Data for MP 240 successfully retrieved
Data for MP 260 successfully retrieved
Data for MP 280 successfully retrieved
Data for MP 300 successfully retrieved
Data for MP 320 successfully retrieved
Data for MP 340 successfully retrieved
Data for MP 360 successfully retrieved
Data for MP 380 successfully retrieved
Data for MP 400 successfully retrieved
Data for MP 420 successfully retrieved
Data for MP 440 successfully retrieved
Data for MP 460 successfully retrieved
Data for MP 480 successfully retrieved
Data for MP 500 successfully re

0

In [22]:
abg_df.head(15)

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter
0,Abercron,Dr. Michael von,CDU/CSU,\n\n\n\n\n\n\n Schleswig-Holstein\n\n,Pinneberg,7.0,Wahlkreis 007: Pinneberg,https://twitter.com/mvabercron/
1,Achelwilm,Doris,Die Linke,Bremen,,,n.a.,https://twitter.com/DorisAchelwilm
2,Aggelidis,Grigorios,FDP,\n\n\n\n\n\n\n Niedersachsen\n\n,Hannover-Land I,43.0,Wahlkreis 043: Hannover-Land I,https://twitter.com/aggelidis_fdp?lang=de
3,Akbulut,Gökay,Die Linke,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Mannheim,275.0,Wahlkreis 275: Mannheim,https://twitter.com/akbulutgokay?lang=de
4,Albani,Stephan,CDU/CSU,\n\n\n\n\n\n\n Niedersachsen\n\n,Oldenburg – Ammerland,27.0,Wahlkreis 027: Oldenburg – Ammerland,
5,Alt,Renata,FDP,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Nürtingen,262.0,Wahlkreis 262: Nürtingen,
6,Altenkamp,Norbert,CDU/CSU,\n\n\n\n\n\n\n Hessen\n\n,Main-Taunus,181.0,Wahlkreis 181: Main-Taunus,
7,Altmaier,Peter,CDU/CSU,\n\n\n\n\n\n\n Saarland\n\n,Saarlouis,297.0,Wahlkreis 297: Saarlouis,https://twitter.com/peteraltmaier
8,Amthor,Philipp,CDU/CSU,\n\n\n\n\n\n\n Mecklenburg-Vorpommern\n\n,Mecklenburgische Seenplatte I – Vorpommern-Gre...,16.0,Wahlkreis 016: Mecklenburgische Seenplatte I –...,
9,Amtsberg,Luise,Bündnis 90/Die Grünen,\n\n\n\n\n\n\n Schleswig-Holstein\n\n,Kiel,5.0,Wahlkreis 005: Kiel,


### Saving output

In [23]:
# Save final dataframe

with open('output/abg_df.pickle', 'wb') as handle:
    pickle.dump(abg_df, handle, protocol = pickle.HIGHEST_PROTOCOL)
    
abg_df.to_csv('output/abg_df.csv', index = False, encoding = 'utf-8-sig')