# Scraping Twitter accounts from party websites

Note: The AfD does not list Twitter accounts on its official party website, so those accounts must be collected manually.

### Prerequisites

In [1]:
# Libraries

from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import pickle
import numpy as np
import unidecode
import os
import ctypes

In [2]:
# Define user path and set working directory.
# The location can be overridden through the SCRAPING_DIR environment variable,
# so the notebook can run on another machine without editing this cell. When the
# variable is unset, the original hard-coded checkout path is used.

my_path = os.environ.get(
    'SCRAPING_DIR',
    'C:\\Users\\wimme\\Documents\\1_uni\\1_master\\consulting\\projects\\consulting\\1_scraping'
)
# my_path = 'Asmiks path'
os.chdir(my_path)
os.path.abspath(os.getcwd())

'C:\\Users\\wimme\\Documents\\1_uni\\1_master\\consulting\\projects\\consulting\\1_scraping'

### Setup

In [3]:
# Access party websites and convert into soups

def _fetch_soup(url):
    """Download `url` and return a tuple (raw HTML text, parsed BeautifulSoup)."""
    # Without a timeout, requests waits indefinitely on an unresponsive server
    source = requests.get(url, timeout = 30).text
    return source, BeautifulSoup(source, 'html.parser')

fdp = "https://www.fdpbt.de/fraktion/abgeordnete"
source_fdp, soup_fdp = _fetch_soup(fdp)

cdu = "https://www.cducsu.de/hier-stellt-die-cducsu-bundestagsfraktion-ihre-abgeordneten-vor"
source_cdu, soup_cdu = _fetch_soup(cdu)

spd = "https://www.spdfraktion.de/abgeordnete/alle?wp=19&view=list&old=19"
source_spd, soup_spd = _fetch_soup(spd)

gruene = "https://www.gruene-bundestag.de/abgeordnete"
source_gruene, soup_gruene = _fetch_soup(gruene)

# For Die Linke, one needs to extract Twitter accounts from each individual MP website
linke_base = "https://www.linksfraktion.de/fraktion/abgeordnete/"
# Website contains bins of MPs, according to last name
letters = [['a', 'e'], ['f', 'j'], ['k', 'o'], ['p', 't'], ['u', 'z']]
# One overview URL per bin, e.g. <linke_base>a-bis-e/
linke_name_bins = [f'{linke_base}{first}-bis-{last}/' for first, last in letters]

In [7]:
# For each party, find appropriate parent node in soup

# FDP is currently not scraped; the selector is kept (commented out) for reference
# all_abg_fdp = soup_fdp.find(class_ = 'person-list').find_all(class_ = 'person-item-wrapper')

all_abg_cdu = soup_cdu.find_all(class_ = 'teaser delegates')

all_abg_spd = soup_spd.find_all(class_ = 'views-row')

# Gruene: the overview page only links to the individual MP pages,
# so build the absolute URL of each MP page here
extensions_gruene = soup_gruene.find_all('a', class_ = "abgeordneteTeaser__wrapper")
urlbase_gruene = 'https://www.gruene-bundestag.de'
all_abg_gruene = [urlbase_gruene + str(a['href']) for a in extensions_gruene]

# Linke: walk through the name bins and collect the URL of every MP page
linke_path_prefix = '/fraktion/abgeordnete/'
all_abg_linke = []

for name_bin in linke_name_bins:

    # Timeout so that an unresponsive server cannot stall the notebook forever
    source = requests.get(name_bin, timeout = 30).text
    soup = BeautifulSoup(source, 'html.parser')

    for abg in soup.find_all('div', attrs = {'class': 'col-xs-12 col-sm-12 col-md-6 col-lg-6'}):
        href = abg.find('h2').find('a')['href']
        # str.lstrip() removes any leading characters from the given *set*, not a
        # prefix, so it could also eat the first letters of the MP-specific part.
        # Remove the path prefix explicitly instead.
        if href.startswith(linke_path_prefix):
            href = href[len(linke_path_prefix):]
        all_abg_linke.append(linke_base + href)

### Scraping

In [8]:
# Scrape accounts from soups

# Collects one dict (party, name, twitter_ext) per MP across all parties
twitter_list = []

# CDU/CSU: name and Twitter link are both contained in the overview teaser

for abg in all_abg_cdu:

    mp_name = abg.find('h2').find('span').text.strip(' ')
    twitter_node = abg.find(class_ = 'twitter')

    # MPs without a Twitter element get an empty string
    if twitter_node is None:
        account_url = ""
    else:
        account_url = twitter_node.find('a', href = True)['href']

    twitter_list.append({'party': "CDU/CSU", 'name': mp_name, 'twitter_ext': account_url})

# FDP

# for abg in all_abg_fdp:
    
#     name_field = abg.find(class_ = 'person-name')
#     funktion = name_field.find('span').text.strip()
#     name = name_field.text.strip('\n').strip().rstrip(funktion).strip('\n').strip()
#     twitter = abg.find('a', attrs = {'class': 'tw'}, href = True)
    
#     twitter_list.append(
#         {
#             'party': "FDP",
#             'name': name,
#             'twitter_ext': twitter['href'] if twitter is not None else ""
#         }
#     )

# Gruene: every MP has an individual page that has to be downloaded and parsed

for abg in all_abg_gruene:

    # Timeout so that a single unresponsive page cannot stall the whole loop
    abg_source = requests.get(abg, timeout = 30).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    twitter = ""

    # Inspect each link in the "weitereInfoTeaser" blocks exactly once.
    # As before, the last link containing "twitter" wins; the previous version
    # re-scanned the complete list of collected links for every new link.
    for teaser in abg_soup.find_all(class_ = "weitereInfoTeaser"):

        for link in teaser.find_all('a', href = True):

            if "twitter" in link['href']:
                twitter = link['href']

    twitter_list.append(
        {
            'party': "Bündnis 90/Die Grünen",
            # Strip surrounding spaces, consistent with the other parties
            'name': abg_soup.find('h1').text.strip(' '),
            'twitter_ext': twitter
        }
    )
    
# Linke: every MP has an individual page; the account is the link labelled 'Twitter-Profil'

for abg in all_abg_linke:

    # Timeout so that a single unresponsive page cannot stall the whole loop
    abg_source = requests.get(abg, timeout = 30).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    twitter = abg_soup.find('a', text = re.compile('Twitter-Profil'))

    twitter_list.append(
        {
            'party': "Die Linke",
            'name': abg_soup.find('h1').text.strip(' '),
            # Empty string when the page has no Twitter profile link
            'twitter_ext': twitter['href'] if twitter is not None else ""
        }
    )

# SPD: name and Twitter link are both contained in the list row of the overview page

for abg in all_abg_spd:

    mp_name = abg.find('h3').find('a').get_text().strip(' ')
    twitter_node = abg.find(class_ = 'ico_twitter')

    # Rows without a Twitter icon get an empty string
    if twitter_node is None:
        account_url = ""
    else:
        account_url = twitter_node['href']

    twitter_list.append({'party': "SPD", 'name': mp_name, 'twitter_ext': account_url})
    
# Convert to data frame

twitter_df = pd.DataFrame(twitter_list)

# Completion notice. ctypes.windll only exists on Windows, so fall back to a
# plain print elsewhere instead of raising AttributeError.
if hasattr(ctypes, 'windll'):
    # MessageBoxW takes four arguments (hWnd, text, caption, uType); 0 is MB_OK
    ctypes.windll.user32.MessageBoxW(0, "Twitter accounts successfully scraped", "Progress Report", 0)
else:
    print("Twitter accounts successfully scraped")

1

In [9]:
# Preview the first rows of the scraped accounts
twitter_df.head()

Unnamed: 0,party,name,twitter_ext
0,CDU/CSU,Dr. Michael von Abercron,https://twitter.com/mvabercron
1,CDU/CSU,Stephan Albani,
2,CDU/CSU,Norbert Altenkamp,
3,CDU/CSU,Peter Altmaier,http://twitter.com/peteraltmaier
4,CDU/CSU,Philipp Amthor,


In [10]:
# Add list of manually collected account names for AfD

afd_df = pd.read_csv('input/afd_twitter_accounts.csv', encoding = "ISO-8859-1", delimiter = ';')
afd_df.columns = ['name', 'party', 'twitter_ext']
# Bring the columns into the same order as twitter_df
columns_titles = ['party', 'name', 'twitter_ext']
afd_df = afd_df.reindex(columns = columns_titles)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
# AfD rows from an earlier run of this cell are dropped first, so re-running the
# cell does not add the manual accounts a second time.
scraped_only = twitter_df[twitter_df['party'] != 'AfD']
twitter_df = pd.concat([scraped_only, afd_df])

In [11]:
# Display the combined table (scraped parties plus the AfD accounts added above)
twitter_df

Unnamed: 0,party,name,twitter_ext
0,CDU/CSU,Dr. Michael von Abercron,https://twitter.com/mvabercron
1,CDU/CSU,Stephan Albani,
2,CDU/CSU,Norbert Altenkamp,
3,CDU/CSU,Peter Altmaier,http://twitter.com/peteraltmaier
4,CDU/CSU,Philipp Amthor,
...,...,...,...
84,AfD,Prof. Dr. Harald Weyel,h_weyel
85,AfD,Wolfgang Wiehle,WolfgangWiehle
86,AfD,Dr. Heiko Wildberg,Heiko_Wildberg
87,AfD,Dr. Christian Wirth,ChrWirthMdB


In [12]:
# Show the first 50 CDU/CSU rows
is_cdu_csu = twitter_df['party'] == 'CDU/CSU'
twitter_df[is_cdu_csu].head(50)

Unnamed: 0,party,name,twitter_ext
0,CDU/CSU,Dr. Michael von Abercron,https://twitter.com/mvabercron
1,CDU/CSU,Stephan Albani,
2,CDU/CSU,Norbert Altenkamp,
3,CDU/CSU,Peter Altmaier,http://twitter.com/peteraltmaier
4,CDU/CSU,Philipp Amthor,
5,CDU/CSU,Artur Auernhammer,
6,CDU/CSU,Peter Aumer,https://twitter.com/PeterAumer
7,CDU/CSU,Dorothee Bär,http://twitter.com/DoroBaer
8,CDU/CSU,Thomas Bareiß,https://twitter.com/Thomas_Bareiss
9,CDU/CSU,Norbert Barthle,


### Save output

In [13]:
# Write twitter_df to disk ...

twitter_pickle_path = 'output/twitter_df.pickle'

with open(twitter_pickle_path, 'wb') as out_file:
    pickle.dump(twitter_df, out_file, protocol = pickle.HIGHEST_PROTOCOL)

# ... and read it back in from the same file

with open(twitter_pickle_path, 'rb') as in_file:
    twitter_df = pickle.load(in_file)

### Merging accounts data into principal MP data 

In [14]:
# Load the MP data frame from its pickle file

with open('output/abg_df.pickle', 'rb') as abg_file:
    abg_df = pickle.load(abg_file)

In [15]:
# Prepare MP names from Twitter df for name-based matching

def name_prep_twitter(name):
    """Reduce a display name from a party website to a matching key.

    Steps:
      1. remove anything in round or square brackets,
      2. strip up to five leading abbreviations of 1-6 word characters
         followed by a dot (e.g. "Dr. ", "Prof. "),
      3. transliterate to ASCII with unidecode and collapse repeated spaces,
      4. if more than two tokens remain, keep only the first name and the
         last token, so "Dr. Michael von Abercron" becomes "Michael Abercron".

    Parameters
    ----------
    name : str
        Name as scraped from the party website or read from the AfD list.

    Returns
    -------
    str
        "<first name> <last token>" when more than two tokens remain,
        otherwise the cleaned name unchanged.
    """

    # Raw string: "\(" in a normal string literal is an invalid escape sequence
    interim = re.sub(r"[\(\[].*?[\)\]]", "", name).strip(' ')

    # Same substitution applied five times, so stacked titles are all removed
    for _ in range(5):
        interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)

    interim = unidecode.unidecode(interim).strip(' ')
    interim = re.sub(' +', ' ', interim)

    tokens = interim.split()

    if len(tokens) <= 2:
        return interim

    # A first token ending in '.', 'med' or 'forest' is skipped
    # (presumably a leftover title fragment - verify against the data)
    if tokens[0].endswith(('.', 'med', 'forest')):
        first_name = tokens[1]
    else:
        first_name = tokens[0]

    # The former "von" branch sat after the return statement and could never
    # run, so it has been removed
    return first_name + ' ' + tokens[-1]

In [16]:
# Derive the matching key for every row of the Twitter df

twitter_df['name_matching'] = twitter_df['name'].map(name_prep_twitter)

In [17]:
# Prepare MP names from MP df for name-based matching (helper function)

def name_prep(name):
    """Clean one name component (first or last name) of the MP df.

    Strips up to eight leading abbreviations of 1-6 word characters followed
    by a dot, transliterates the rest to ASCII and collapses repeated spaces.
    If more than one whitespace-separated token remains, only the first token
    is returned; otherwise the cleaned string is returned as is.
    """

    # Strip titles (same substitution applied eight times)
    interim = name
    for _ in range(8):
        interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)

    interim = re.sub(' +', ' ', unidecode.unidecode(interim).strip(' '))

    tokens = interim.split()
    return tokens[0] if len(tokens) > 1 else interim

In [18]:
# Build the matching key of the MP df from cleaned first and last name.
# NOTE(review): name_prep keeps only the first whitespace-separated token, so a
# multi-word last name is reduced to its first word - confirm this is intended.

first_part = abg_df['first_name'].apply(name_prep)
last_part = abg_df['last_name'].apply(name_prep)
abg_df['name_matching'] = first_part + ' ' + last_part

In [20]:
# Left-join the scraped accounts onto the MP df via the matching key

abg_twitter_df = abg_df.merge(
    twitter_df[['name_matching', 'twitter_ext']],
    on = 'name_matching',
    how = 'left'
)

# In both account columns, turn empty strings into NaN
for account_col in ['twitter', 'twitter_ext']:
    abg_twitter_df[account_col] = np.where(
        abg_twitter_df[account_col] != '',
        abg_twitter_df[account_col],
        np.nan
    )

In [21]:
# Show the first 30 CDU/CSU rows of the merged table
is_cdu_csu = abg_twitter_df['party'] == 'CDU/CSU'
abg_twitter_df[is_cdu_csu].head(30)

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter,name_matching,twitter_ext
0,Abercron,Dr. Michael von,CDU/CSU,\n\n\n\n\n\n\n Schleswig-Holstein\n\n,Pinneberg,7,Wahlkreis 007: Pinneberg,https://twitter.com/mvabercron/,Michael Abercron,https://twitter.com/mvabercron
4,Albani,Stephan,CDU/CSU,\n\n\n\n\n\n\n Niedersachsen\n\n,Oldenburg – Ammerland,27,Wahlkreis 027: Oldenburg – Ammerland,,Stephan Albani,
6,Altenkamp,Norbert,CDU/CSU,\n\n\n\n\n\n\n Hessen\n\n,Main-Taunus,181,Wahlkreis 181: Main-Taunus,,Norbert Altenkamp,
7,Altmaier,Peter,CDU/CSU,\n\n\n\n\n\n\n Saarland\n\n,Saarlouis,297,Wahlkreis 297: Saarlouis,https://twitter.com/peteraltmaier,Peter Altmaier,http://twitter.com/peteraltmaier
8,Amthor,Philipp,CDU/CSU,\n\n\n\n\n\n\n Mecklenburg-Vorpommern\n\n,Mecklenburgische Seenplatte I – Vorpommern-Gre...,16,Wahlkreis 016: Mecklenburgische Seenplatte I –...,,Philipp Amthor,
14,Auernhammer,Artur,CDU/CSU,\n\n\n\n\n\n\n Bayern\n\n,Ansbach,241,Wahlkreis 241: Ansbach,,Artur Auernhammer,
15,Aumer,Peter,CDU/CSU,\n\n\n\n\n\n\n Bayern\n\n,Regensburg,233,Wahlkreis 233: Regensburg,,Peter Aumer,https://twitter.com/PeterAumer
19,Bär,Dorothee,CDU/CSU,\n\n\n\n\n\n\n Bayern\n\n,Bad Kissingen,248,Wahlkreis 248: Bad Kissingen,,Dorothee Bar,http://twitter.com/DoroBaer
23,Bareiß,Thomas,CDU/CSU,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Zollernalb – Sigmaringen,295,Wahlkreis 295: Zollernalb – Sigmaringen,,Thomas Bareiss,https://twitter.com/Thomas_Bareiss
27,Barthle,Norbert,CDU/CSU,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Backnang – Schwäbisch Gmünd,269,Wahlkreis 269: Backnang – Schwäbisch Gmünd,,Norbert Barthle,


In [22]:
# Use the account scraped from the party website where there is one;
# otherwise keep the value already present in the 'twitter' column

has_scraped_account = abg_twitter_df['twitter_ext'].notnull()
abg_twitter_df['twitter'] = np.where(
    has_scraped_account,
    abg_twitter_df['twitter_ext'],
    abg_twitter_df['twitter']
)

# The helper column is no longer needed after imputation
abg_twitter_df = abg_twitter_df.drop(columns = 'twitter_ext')

In [23]:
# Preview the first 30 rows after imputation
abg_twitter_df.head(30)

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter,name_matching
0,Abercron,Dr. Michael von,CDU/CSU,\n\n\n\n\n\n\n Schleswig-Holstein\n\n,Pinneberg,7.0,Wahlkreis 007: Pinneberg,https://twitter.com/mvabercron,Michael Abercron
1,Achelwilm,Doris,Die Linke,Bremen,,,n.a.,https://twitter.com/doris_achelwilm,Doris Achelwilm
2,Aggelidis,Grigorios,FDP,\n\n\n\n\n\n\n Niedersachsen\n\n,Hannover-Land I,43.0,Wahlkreis 043: Hannover-Land I,https://twitter.com/aggelidis_fdp?lang=de,Grigorios Aggelidis
3,Akbulut,Gökay,Die Linke,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Mannheim,275.0,Wahlkreis 275: Mannheim,https://twitter.com/akbulutgokay,Gokay Akbulut
4,Albani,Stephan,CDU/CSU,\n\n\n\n\n\n\n Niedersachsen\n\n,Oldenburg – Ammerland,27.0,Wahlkreis 027: Oldenburg – Ammerland,,Stephan Albani
5,Alt,Renata,FDP,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Nürtingen,262.0,Wahlkreis 262: Nürtingen,,Renata Alt
6,Altenkamp,Norbert,CDU/CSU,\n\n\n\n\n\n\n Hessen\n\n,Main-Taunus,181.0,Wahlkreis 181: Main-Taunus,,Norbert Altenkamp
7,Altmaier,Peter,CDU/CSU,\n\n\n\n\n\n\n Saarland\n\n,Saarlouis,297.0,Wahlkreis 297: Saarlouis,http://twitter.com/peteraltmaier,Peter Altmaier
8,Amthor,Philipp,CDU/CSU,\n\n\n\n\n\n\n Mecklenburg-Vorpommern\n\n,Mecklenburgische Seenplatte I – Vorpommern-Gre...,16.0,Wahlkreis 016: Mecklenburgische Seenplatte I –...,,Philipp Amthor
9,Amtsberg,Luise,Bündnis 90/Die Grünen,\n\n\n\n\n\n\n Schleswig-Holstein\n\n,Kiel,5.0,Wahlkreis 005: Kiel,,Luise Amtsberg


### Converting Twitter URL to Twitter username

In [24]:
# URL to username (helper function)

def get_username(url):
    """Extract the Twitter username from a profile URL or a bare handle.

    Parameters
    ----------
    url : str
        Either a profile URL starting with 'http'
        (e.g. 'https://twitter.com/name/?lang=de') or an account name.

    Returns
    -------
    str
        The username without query string, surrounding whitespace or a
        leading '@'. A URL without any path segment gives an empty string
        (it previously raised IndexError).
    """

    cleaned = url.strip()

    if cleaned.startswith('http'):
        # Segments after scheme and host; skip empty segments and the '#!'
        # marker of legacy hashbang URLs (https://twitter.com/#!/name)
        segments = [part for part in cleaned.split('/')[3:] if part not in ('', '#!')]
        handle = segments[0] if segments else ''

    else:
        handle = cleaned

    # Drop a query string such as '?lang=de' and a leading '@'
    return handle.split('?')[0].lstrip('@')

In [25]:
# Convert every non-missing entry of the 'twitter' column to a username;
# rows without an account are left out of the result and stay missing

mask1 = abg_twitter_df['twitter'].notnull()
usernames = abg_twitter_df.loc[mask1, 'twitter'].apply(get_username)
abg_twitter_df['twitter'] = usernames

In [23]:
# Preview the first 50 rows after the URL-to-username conversion
abg_twitter_df.head(50)

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis,wahlkreis_nr,twitter,name_matching
0,Abercron,Dr. Michael von,CDU/CSU,Schleswig-Holstein,Pinneberg,7.0,mvabercron,Michael Abercron
1,Achelwilm,Doris,Die Linke,,,,doris_achelwilm,Doris Achelwilm
2,Aggelidis,Grigorios,FDP,Niedersachsen,Hannover-Land I,43.0,aggelidis_fdp,Grigorios Aggelidis
3,Akbulut,Gökay,Die Linke,Baden-Württemberg,Mannheim,275.0,akbulutgokay,Gokay Akbulut
4,Albani,Stephan,CDU/CSU,Niedersachsen,Oldenburg – Ammerland,27.0,,Stephan Albani
5,Alt,Renata,FDP,,,,,Renata Alt
6,Altenkamp,Norbert,CDU/CSU,Hessen,Main-Taunus,181.0,,Norbert Altenkamp
7,Altmaier,Peter,CDU/CSU,Saarland,Saarlouis,297.0,peteraltmaier,Peter Altmaier
8,Amthor,Philipp,CDU/CSU,Mecklenburg-Vorpommern,Mecklenburgische Seenplatte I – Vorpommern-Gre...,16.0,,Philipp Amthor
9,Amtsberg,Luise,Bündnis 90/Die Grünen,,,,,Luise Amtsberg


In [26]:
# Number of rows with a non-missing 'twitter' entry
abg_twitter_df['twitter'].notnull().sum()

472

### Saving

In [27]:
# Persist the merged table, both as pickle and as CSV

with open('output/abg_twitter_df.pickle', 'wb') as pickle_file:
    pickle.dump(abg_twitter_df, pickle_file, protocol = pickle.HIGHEST_PROTOCOL)

abg_twitter_df.to_csv('output/abg_twitter_df.csv', encoding = 'utf-8-sig', index = False)