# Scraping Twitter accounts from party websites

Note: The AfD does not list its MPs' Twitter accounts on the official party website, so these accounts had to be collected manually; they are loaded from `AFD.csv` further below.

In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import pickle
import numpy as np
import unidecode
import os

In [2]:
# set up working directory
# NOTE: the relative path below is resolved against the current working directory,
# so this cell only works when started from the notebook's own source directory.
os.path.abspath(os.getcwd()) # initial working directory (should be equal to source file directory if using Jupyter Notebook)
os.chdir('../../data/web_scraping') # change to directory where all data files are stored
# check working directory (last expression of the cell, so its value is echoed below)
os.path.abspath(os.getcwd())

'C:\\Users\\Simon\\OneDrive\\Uni\\LMU\\SS 2020\\Statistisches Consulting\\Bundestag-MP-Analyse\\data\\web_scraping'

### Setup

In [3]:
# download the party overview pages and parse them into soups
# fdp = "https://www.fdpbt.de/fraktion/abgeordnete"
# source_fdp = requests.get(fdp).text
# soup_fdp = BeautifulSoup(source_fdp, 'html.parser')

# overview pages of the parties
cdu = "https://www.cducsu.de/hier-stellt-die-cducsu-bundestagsfraktion-ihre-abgeordneten-vor"
spd = "https://www.spdfraktion.de/abgeordnete/alle?wp=19&view=list&old=19"
gruene = "https://www.gruene-bundestag.de/abgeordnete"

# raw HTML of each overview page
source_cdu, source_spd, source_gruene = (
    requests.get(url).text for url in (cdu, spd, gruene)
)

# parsed HTML of each overview page
soup_cdu, soup_spd, soup_gruene = (
    BeautifulSoup(html, 'html.parser')
    for html in (source_cdu, source_spd, source_gruene)
)

# Die Linke: Twitter accounts are only available on the individual MP pages,
# which are reached via overview pages binned by last name
linke_base = "https://www.linksfraktion.de/fraktion/abgeordnete/"
letters = [['a', 'e'], ['f', 'j'], ['k', 'o'], ['p', 't'], ['u', 'z']] # first and last letter of each bin
linke_name_bins = [f'{linke_base}{first}-bis-{last}/' for first, last in letters]

In [9]:
# for each party, pick the soup nodes (or links) that represent the individual MPs

# all_abg_fdp = soup_fdp.find(class_ = 'person-list').find_all(class_ = 'person-item-wrapper')
all_abg_cdu = soup_cdu.find_all(class_ = 'teaser delegates')
all_abg_spd = soup_spd.find_all(class_ = 'views-row')

# Bündnis 90/Die Grünen: turn the hrefs of the teaser anchors into absolute profile URLs
urlbase_gruene = 'https://www.gruene-bundestag.de'
extensions_gruene = soup_gruene.find_all('a', class_="abgeordneteTeaser__wrapper")
all_abg_gruene = [urlbase_gruene + str(anchor['href']) for anchor in extensions_gruene]
    
# Die Linke: collect the URL of every individual MP page from the name-bin overview pages
all_abg_linke = []
linke_path_prefix = '/fraktion/abgeordnete/'  # path part that linke_base already ends with
for name_bin in linke_name_bins:
    source = requests.get(name_bin).text
    soup = BeautifulSoup(source, 'html.parser')
    for abg in soup.find_all('div', attrs = {'class': 'col-xs-12 col-sm-12 col-md-6 col-lg-6'}):
        href = abg.find('h2').find('a')['href']
        # str.lstrip() removes any leading characters contained in its argument
        # (it treats the argument as a character set, not as a prefix), so it could
        # also eat the first letters of the profile slug. Cut off the exact prefix instead.
        if href.startswith(linke_path_prefix):
            extension = href[len(linke_path_prefix):]
        else:
            extension = href.lstrip('/')
        all_abg_linke.append(linke_base + extension)

### Scraping

In [10]:
# collects one {'Partei', 'Name', 'Twitter'} record per MP across all parties
twitter_list = []

# # FDP
# for abg in all_abg_fdp:
#     name_field = abg.find(class_ = 'person-name')
#     funktion = name_field.find('span').text.strip()
#     name = name_field.text.strip('\n').strip().rstrip(funktion).strip('\n').strip()
#     twitter = abg.find('a', attrs = {'class': 'tw'}, href = True)
#     twitter_list.append(
#         {
#         'Partei': "FDP",
#         'Name': name,
#         'Twitter': twitter['href'] if twitter is not None else ""
#             }
#         )

# CDU/CSU: the Twitter link (if any) sits inside an element with class 'twitter'
for delegate in all_abg_cdu:
    twitter_box = delegate.find(class_ = 'twitter')
    display_name = delegate.find('h2').find('span').text.strip(' ')
    if twitter_box is None:
        twitter_url = ""  # MP has no Twitter account listed
    else:
        twitter_url = twitter_box.find('a', href = True)['href']
    twitter_list.append({'Partei': "CDU/CSU", 'Name': display_name, 'Twitter': twitter_url})

# SPD: the Twitter link (if any) is the element with class 'ico_twitter'
for row in all_abg_spd:
    twitter_link = row.find(class_ = 'ico_twitter')
    record = {'Partei': "SPD", 'Name': row.find('h3').find('a').get_text().strip(' ')}
    record['Twitter'] = "" if twitter_link is None else twitter_link['href']  # "" = no account listed
    twitter_list.append(record)

# Bündnis 90/Die Grünen: every MP has an individual page; the Twitter link is one of
# the links inside the 'weitereInfoTeaser' elements
for abg in all_abg_gruene:
    abg_source = requests.get(abg).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    twitter = ""  # stays empty if no link containing "twitter" is found
    for teaser in abg_soup.find_all(class_="weitereInfoTeaser"):
        for anchor in teaser.find_all('a', href = True):
            # single pass over the links; as before, the last link containing
            # "twitter" wins (no need to rescan all previously seen links)
            if "twitter" in anchor['href']:
                twitter = anchor['href']

    twitter_list.append(
        {
        'Partei': "Bündnis 90/Die Grünen",
        'Name': abg_soup.find('h1').text,
        'Twitter': twitter
            }
        )

# Die Linke: open every individual MP page and look for the anchor whose text contains 'Twitter-Profil'
for profile_url in all_abg_linke:
    profile_html = requests.get(profile_url).text
    profile_soup = BeautifulSoup(profile_html, 'html.parser')
    twitter_link = profile_soup.find('a', text = re.compile('Twitter-Profil'))
    record = {'Partei': "Die Linke", 'Name': profile_soup.find('h1').text.strip(' ')}
    record['Twitter'] = "" if twitter_link is None else twitter_link['href']  # "" = no account listed
    twitter_list.append(record)

### Adding AfD, saving output

In [10]:
# load the manually collected Twitter accounts of AfD party members
# and bring them into the (Partei, Name, Twitter) column layout
columns_titles = ['Partei', 'Name', 'Twitter']
afd_df = pd.read_csv('AFD.csv', delimiter = ';', encoding = "ISO-8859-1")
# positional rename - assumes the CSV stores name, party and Twitter account in this order
afd_df.columns = ['Name', 'Partei', 'Twitter']
afd_df = afd_df[columns_titles]

In [880]:
# convert the scraped records into a dataframe
twitter_df = pd.DataFrame(twitter_list)
# append AfD Twitter accounts
# (pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0; as with append's defaults, the original row labels are kept)
twitter_df = pd.concat([twitter_df, afd_df])

In [882]:
# # saving twitter_df
# with open('twitter_df.pickle', 'wb') as handle:
#     pickle.dump(twitter_df, handle, protocol=pickle.HIGHEST_PROTOCOL)
# # load back in
# with open('twitter_df.pickle', 'rb') as handle:
#     twitter_df = pickle.load(handle)

### Merging Twitter accounts df into the principal MP df 

In [13]:
# load the principal MP dataframe that the Twitter accounts are merged into
# NOTE: unpickling can execute arbitrary code - only load pickle files created by this project
with open('abg_df.pickle', mode='rb') as pickle_file:
    df = pickle.load(pickle_file)

In [4]:
# prepare MP names from Twitter df for name-based matching (helper function)
def name_prep_twitter(name):
    """Normalise a scraped MP name to the form used as merge key.

    Bracketed additions ("(...)" / "[...]") are removed, leading dotted
    titles such as "Prof." or "Dr." are stripped, the remainder is passed
    through ``unidecode`` and runs of spaces are collapsed. If more than two
    words remain, only the first name and the last word are kept.

    Parameters
    ----------
    name : str
        Name as scraped, in "<titles> <first name> ... <last name>" order.

    Returns
    -------
    str
        The normalised name, e.g. 'Prof. Dr. med. Axel Gehrke' -> 'Axel Gehrke'.
    """
    # raw string: '\(' and '\[' are invalid escape sequences in a normal string literal
    interim = re.sub(r"[\(\[].*?[\)\]]", "", name).strip(' ')
    # strip leading titles one at a time until none is left, instead of a fixed
    # number of passes, so any number of stacked titles is handled
    while True:
        stripped = re.sub(r'^\w{1,6}\. ?', '', interim)
        if stripped == interim:
            break
        interim = stripped
    interim = unidecode.unidecode(interim).strip(' ')
    interim = re.sub(' +', ' ', interim)
    tokens = interim.split()
    if len(tokens) <= 2:
        return interim
    # a leftover title fragment in front (e.g. ending in '.') is skipped
    if tokens[0].endswith(('.', 'med', 'forest')):
        first_name = tokens[1]
    else:
        first_name = tokens[0]
    return first_name + ' ' + tokens[-1]

In [7]:
# test: stacked academic titles should be removed, leaving first and last name only
name_prep_twitter('Prof. Dr. med. Axel Gehrke')

'Axel Gehrke'

In [1095]:
# derive the normalised merge key for every MP name in the Twitter df
twitter_matching_names = twitter_df['Name'].apply(name_prep_twitter)
twitter_df['Name_matching'] = twitter_matching_names

In [1096]:
# prepare MP names from MP df for name-based matching (helper function)
def name_prep(name):
    """Normalise an MP name given as "<last name>, <titles> <first name>".

    The comma-separated parts are reversed so the first name precedes the
    last name. Bracketed additions ("(...)" / "[...]") are removed, leading
    dotted titles such as "Prof." or "Dr." are stripped, the remainder is
    passed through ``unidecode`` and runs of spaces are collapsed. If more
    than two words remain, only the first name and the last word are kept.

    Parameters
    ----------
    name : str
        Name in "<last name>, <titles> <first name>" order.

    Returns
    -------
    str
        The normalised name, e.g. 'Abercron, Dr. Michael von' -> 'Michael Abercron'.
    """
    reordered = ' '.join(name.split(',')[::-1])  # placing first name before last name
    # raw string: '\(' and '\[' are invalid escape sequences in a normal string literal
    interim = re.sub(r"[\(\[].*?[\)\]]", "", reordered).strip(' ')
    # strip leading titles one at a time until none is left, instead of a fixed
    # number of passes, so any number of stacked titles is handled
    while True:
        stripped = re.sub(r'^\w{1,6}\. ?', '', interim)
        if stripped == interim:
            break
        interim = stripped
    interim = unidecode.unidecode(interim).strip(' ')
    interim = re.sub(' +', ' ', interim)
    tokens = interim.split()
    if len(tokens) <= 2:
        return interim
    # a leftover title fragment in front (e.g. ending in '.') is skipped
    if tokens[0].endswith(('.', 'med', 'forest')):
        first_name = tokens[1]
    else:
        first_name = tokens[0]
    return first_name + ' ' + tokens[-1]

In [1097]:
# derive the normalised merge key for every MP name in the MP df
mp_matching_names = df['Name'].apply(name_prep)
df['Name_matching'] = mp_matching_names

In [1098]:
# left-join the Twitter df onto the MP df via the normalised name;
# clashing column names coming from the Twitter df get the suffix '_right'
df = pd.merge(
    df,
    twitter_df,
    how = 'left',
    on = 'Name_matching',
    suffixes = ('', '_right'),
)

In [1076]:
# in case non-matches occur: identify them
# non_matches = []
# for i in range(len(df)):
#     if (df['Name_matching'][i] != df['Name_matching'][i]):
#         non_matches.append(i)

# print(non_matches)

In [1099]:
# filling up Twitter-column with values from Soziale Medien-dictionary (helper function)
def get_twitter_from_dict(x):
    """Return the 'Twitter' entry of a social-media dictionary.

    Parameters
    ----------
    x : dict or any
        Cell value of the 'Soziale Medien' column.

    Returns
    -------
    The value stored under the key 'Twitter', or None if the key is absent
    or ``x`` is not a dictionary (e.g. a missing value such as None/NaN).
    """
    if isinstance(x, dict):
        return x.get('Twitter')
    # missing cells (None / NaN) would otherwise raise a TypeError on the `in` test
    return None

In [1100]:
# filling up Twitter-column with values from Soziale Medien-dictionary:
# a value found in the dictionary takes precedence over the scraped one
df['Twitter_dict'] = df['Soziale Medien'].apply(get_twitter_from_dict)
df['Twitter'] = np.where(df['Twitter_dict'].notnull(), df['Twitter_dict'], df['Twitter'])

# dropping columns used for merging only
columns_to_keep = ['Name', 'Partei', 'Wahlart', 'Bundesland', 'Wahlkreis', 'Wahlkreis-Nr.', 'Ausschuesse', 'Soziale Medien', 'Biografie', 'Twitter']
# the axis is selected by keyword (`columns=`): passing it positionally was
# deprecated in pandas 1.3 and is no longer accepted since pandas 2.0
df.drop(columns=df.columns.difference(columns_to_keep), inplace=True)

### Converting Twitter URL to Twitter username

In [1101]:
# URL to username (helper function)
def get_username(url):
    """Extract the Twitter username from a profile URL or a bare handle.

    Parameters
    ----------
    url : str
        Profile URL (e.g. 'https://twitter.com/user?lang=de'), a scheme-less
        URL (e.g. 'twitter.com/user') or a bare handle (e.g. 'user', '@user').

    Returns
    -------
    str
        The username without query string and without a leading '@'. An empty
        string is returned if the URL contains no username part.
    """
    cleaned = url.strip()
    if cleaned.startswith('http'):
        # 'https://twitter.com/<user>/...' -> ['https:', '', 'twitter.com', '<user>', ...]
        segments = cleaned.split('/')
        # a URL without a path (e.g. 'https://twitter.com') has no username segment
        candidate = segments[3] if len(segments) > 3 else ''
    elif 'twitter.com/' in cleaned:
        # scheme-less URL: the username is the first path segment after the domain
        candidate = cleaned.split('twitter.com/', 1)[1].split('/')[0]
    else:
        candidate = cleaned
    return candidate.split('?')[0].lstrip('@')

In [1102]:
# boolean mask of the rows that have a (non-null) Twitter entry
mask1 = df['Twitter'].notnull()

In [1103]:
# URL to username
# get_username is only applied to the rows selected by mask1; the result is aligned
# on the index when assigned back, so all other rows end up as missing values
df['Twitter'] = df['Twitter'][mask1].apply(get_username)

In [1106]:
df.head(50)

Unnamed: 0,Name,Partei,Wahlart,Bundesland,Wahlkreis,Ausschuesse,Soziale Medien,Biografie,Twitter
0,"Abercron, Dr. Michael von",CDU/CSU,Direkt gewählt,Schleswig-Holstein,Wahlkreis 007: Pinneberg,{'Ordentliches Mitglied': ['Ausschuss für Ernä...,{'von-abercron.de/': 'http://www.von-abercron....,Geboren am 17. November 1952 in Ehlers...,mvabercron
1,"Achelwilm, Doris",Die Linke,Gewählt über Landesliste,Bremen,n.a.,{'Ordentliches Mitglied': ['Ausschuss für Fami...,{'doris-achelwilm.de': 'http://www.doris-achel...,Geboren am 30. November 1976 in Thuine...,DorisAchelwilm
2,"Aggelidis, Grigorios",FDP,Gewählt über Landesliste,Niedersachsen,Wahlkreis 043: Hannover-Land I,{'Ordentliches Mitglied': ['Kuratorium der Bun...,{'grigorios-aggelidis.de': 'http://www.grigori...,Geboren am 19. August 1965 in Hannover...,aggelidis_fdp
3,"Akbulut, Gökay",Die Linke,Gewählt über Landesliste,Baden-Württemberg,Wahlkreis 275: Mannheim,"{'Ordentliches Mitglied': ['Schriftführer/in',...",{'goekay-akbulut.de': 'https://goekay-akbulut....,Geboren 1982 in Pinarbasi/ Türkei; ledig.Juni ...,akbulutgokay
4,"Albani, Stephan",CDU/CSU,Gewählt über Landesliste,Niedersachsen,Wahlkreis 027: Oldenburg – Ammerland,{'Ordentliches Mitglied': ['Ausschuss für Bild...,{'stephan-albani.de': 'http://www.stephan-alba...,Geboren am 3. Juni 1968 in Göttingen; verheira...,
5,"Alt, Renata",FDP,Gewählt über Landesliste,Baden-Württemberg,Wahlkreis 262: Nürtingen,{'Obfrau': ['Unterausschuss für Zivile Krisenp...,"{'renata-alt.de': 'http://www.renata-alt.de', ...",Geboren am 27. August 1965 in Skalica;...,RenataAlt_MdB
6,"Altenkamp, Norbert",CDU/CSU,Direkt gewählt,Hessen,Wahlkreis 181: Main-Taunus,"{'Ordentliches Mitglied': ['Schriftführer/in',...",{'norbert-altenkamp.de': 'http://www.norbert-a...,Geboren am 27. Juli 1972 in Marl/Westf...,
7,"Altmaier, Peter",CDU/CSU,Direkt gewählt,Saarland,Wahlkreis 297: Saarlouis,{'Bundesminister für Wirtschaft und Energie': []},{'peteraltmaier.de': 'http://www.peteraltmaier...,"Geboren am 18. Juni 1958 in Ensdorf, Saar; kat...",peteraltmaier
8,"Amthor, Philipp",CDU/CSU,Direkt gewählt,Mecklenburg-Vorpommern,Wahlkreis 016: Mecklenburgische Seenplatte I –...,{'Ordentliches Mitglied': ['Ausschuss für Inne...,{'philipp-amthor.de': 'http://www.philipp-amth...,Geboren am 10. November 1992 in Uecker...,
9,"Amtsberg, Luise",Bündnis 90/Die Grünen,Gewählt über Landesliste,Schleswig-Holstein,Wahlkreis 005: Kiel,{'Ordentliches Mitglied': ['Ausschuss für Inne...,{'luise-amtsberg.de': 'http://www.luise-amtsbe...,Geboren am 17. Oktober 1984 in Greifswald.2004...,


In [1092]:
# save combined df (writes to the same 'abg_df.pickle' file the MP df was loaded from)
with open('abg_df.pickle', mode='wb') as pickle_file:
    pickle.dump(df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)