# Scraping Twitter accounts from party websites

Note: For the AfD, Twitter accounts are not listed on the official party website and therefore have to be collected manually.

### Prerequisites

In [6]:
# Libraries

import pandas as pd # data wrangling
import numpy as np # math operations
import math # math operations
import os # directories
import time # system time
import random # random number generation
import pickle # data compression
import re # regular expressions
import unidecode # regular expressions

import urllib.request # scraping
import requests # scraping
from bs4 import BeautifulSoup # scraping
import ctypes # interface to C
import tweepy # twitter 

import sys # system limit (preventing infinite running)
sys.setrecursionlimit(100000)

import selenium # chrome driver
from selenium import webdriver # chrome driver
import selenium.common.exceptions as selexcept # exception handling

### Setup

In [2]:
# Access party websites and convert into soups

REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely unless a timeout is given


def fetch_html(url, timeout=REQUEST_TIMEOUT):
    """Download `url` and return the response body as text.

    Raises requests.HTTPError on a 4xx/5xx answer, so an error page is never
    silently parsed as if it were the list of MPs.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


fdp = "https://www.fdpbt.de/fraktion/abgeordnete"
source_fdp = fetch_html(fdp)
soup_fdp = BeautifulSoup(source_fdp, 'html.parser')

cdu = "https://www.cducsu.de/hier-stellt-die-cducsu-bundestagsfraktion-ihre-abgeordneten-vor"
source_cdu = fetch_html(cdu)
soup_cdu = BeautifulSoup(source_cdu, 'html.parser')

spd = "https://www.spdfraktion.de/abgeordnete/alle?wp=19&view=list&old=19"
source_spd = fetch_html(spd)
soup_spd = BeautifulSoup(source_spd, 'html.parser')

gruene = "https://www.gruene-bundestag.de/abgeordnete"
source_gruene = fetch_html(gruene)
soup_gruene = BeautifulSoup(source_gruene, 'html.parser')

# For Die Linke, one needs to extract Twitter accounts from each individual MP website

linke_base = "https://www.linksfraktion.de/fraktion/abgeordnete/"

# Website contains bins of MPs, according to last name;
# each bin URL has the form <linke_base><first letter>-bis-<last letter>/

letters = [['a', 'e'], ['f', 'j'], ['k', 'o'], ['p', 't'], ['u', 'z']]
linke_name_bins = [f'{linke_base}{first}-bis-{last}/' for first, last in letters]

In [7]:
# For each party, find appropriate parent node in soup

# CDU/CSU and SPD: one node per MP on the overview page
all_abg_cdu = soup_cdu.find_all(class_='teaser delegates')

all_abg_spd = soup_spd.find_all(class_='views-row')

# Gruene: the overview page only yields links to the individual MP pages
extensions_gruene = soup_gruene.find_all('a', class_="abgeordneteTeaser__wrapper")
urlbase_gruene = 'https://www.gruene-bundestag.de'
all_abg_gruene = [urlbase_gruene + str(a['href']) for a in extensions_gruene]

# Die Linke: collect the link to every MP page from each last-name bin
linke_path_prefix = '/fraktion/abgeordnete/'
all_abg_linke = []

for name_bin in linke_name_bins:

    # Time out instead of hanging, and fail loudly instead of parsing an error page
    response = requests.get(name_bin, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    for abg in soup.find_all('div', attrs={'class': 'col-xs-12 col-sm-12 col-md-6 col-lg-6'}):
        href = abg.find('h2').find('a')['href']

        if href.startswith('http'):
            # Already an absolute URL; nothing to prepend
            all_abg_linke.append(href)
            continue

        # Remove the path prefix as a whole. str.lstrip() takes a *set of characters*,
        # so lstrip('/fraktion/abgeordnete/') would also eat the leading letters of
        # any following segment that starts with one of those characters.
        if href.startswith(linke_path_prefix):
            href = href[len(linke_path_prefix):]

        all_abg_linke.append(linke_base + href)

### Scraping

In [8]:
# Scrape accounts from soups

page_timeout = 30  # seconds; without it a stalled server blocks the loop indefinitely

twitter_list = []

# CDU/CSU: name and account link are both on the overview page

for abg in all_abg_cdu:

    twitter = abg.find(class_='twitter')

    twitter_list.append({
        'party': "CDU/CSU",
        'name': abg.find('h2').find('span').text.strip(' '),
        'twitter_ext': twitter.find('a', href=True)['href'] if twitter is not None else ""
    })

# Gruene: each MP page has to be downloaded individually

for abg in all_abg_gruene:

    abg_source = requests.get(abg, timeout=page_timeout).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')

    # All links inside the "weitereInfoTeaser" elements that mention twitter.
    # As before, the last such link wins; "" if there is none.
    twitter_links = [
        anchor['href']
        for teaser in abg_soup.find_all(class_="weitereInfoTeaser")
        for anchor in teaser.find_all('a', href=True)
        if "twitter" in anchor['href']
    ]
    twitter = twitter_links[-1] if twitter_links else ""

    twitter_list.append({
        'party': "Bündnis 90/Die Grünen",
        'name': abg_soup.find('h1').text,
        'twitter_ext': twitter
    })

# Linke: each MP page has to be downloaded individually

for abg in all_abg_linke:

    abg_source = requests.get(abg, timeout=page_timeout).text
    abg_soup = BeautifulSoup(abg_source, 'html.parser')
    twitter = abg_soup.find('a', text=re.compile('Twitter-Profil'))

    twitter_list.append({
        'party': "Die Linke",
        'name': abg_soup.find('h1').text.strip(' '),
        'twitter_ext': twitter['href'] if twitter is not None else ""
    })

# SPD: name and account link are both on the overview page

for abg in all_abg_spd:

    twitter = abg.find(class_='ico_twitter')

    twitter_list.append({
        'party': "SPD",
        'name': abg.find('h3').find('a').get_text().strip(' '),
        'twitter_ext': twitter['href'] if twitter is not None else ""
    })

# Convert to data frame

twitter_df = pd.DataFrame(twitter_list)

# Progress report. ctypes.windll only exists on Windows, so fall back to print()
# elsewhere instead of raising AttributeError after the scrape has finished.
if hasattr(ctypes, 'windll'):
    # MessageBoxW takes four arguments (hWnd, lpText, lpCaption, uType); 0 = MB_OK
    ctypes.windll.user32.MessageBoxW(0, "Twitter accounts successfully scraped", "Progress Report", 0)
else:
    print("Twitter accounts successfully scraped")

0

In [9]:
# Preview the first rows of the scraped accounts table
twitter_df.head()

Unnamed: 0,party,name,twitter_ext
0,Bündnis 90/Die Grünen,Luise Amtsberg,
1,Bündnis 90/Die Grünen,Lisa Badum,https://twitter.com/badulrichmartha
2,Bündnis 90/Die Grünen,Annalena Baerbock,https://twitter.com/ABaerbock
3,Bündnis 90/Die Grünen,Margarete Bause,https://twitter.com/MargareteBause
4,Bündnis 90/Die Grünen,Dr. Danyal Bayaz,https://twitter.com/derdanyal


In [10]:
# Add list of manually collected account names for AfD

afd_df = pd.read_csv('../1_input/afd_twitter_accounts.csv', encoding="ISO-8859-1", delimiter=';')

# The three CSV columns are relabelled by position, then put into twitter_df's column order
afd_df.columns = ['name', 'party', 'twitter_ext']
columns_titles = ['party', 'name', 'twitter_ext']
afd_df = afd_df.reindex(columns=columns_titles)

# DataFrame.append was removed in pandas 2.0; pd.concat with default arguments
# gives the same result (row labels of both frames are kept as they are).
# NOTE: this cell is not idempotent - running it twice adds the AfD rows twice.
twitter_df = pd.concat([twitter_df, afd_df])

In [11]:
# Display the combined accounts table (scraped parties plus the AfD rows)
twitter_df

Unnamed: 0,party,name,twitter_ext
0,Bündnis 90/Die Grünen,Luise Amtsberg,
1,Bündnis 90/Die Grünen,Lisa Badum,https://twitter.com/badulrichmartha
2,Bündnis 90/Die Grünen,Annalena Baerbock,https://twitter.com/ABaerbock
3,Bündnis 90/Die Grünen,Margarete Bause,https://twitter.com/MargareteBause
4,Bündnis 90/Die Grünen,Dr. Danyal Bayaz,https://twitter.com/derdanyal
...,...,...,...
84,AfD,Prof. Dr. Harald Weyel,h_weyel
85,AfD,Wolfgang Wiehle,WolfgangWiehle
86,AfD,Dr. Heiko Wildberg,Heiko_Wildberg
87,AfD,Dr. Christian Wirth,ChrWirthMdB


### Save output

In [13]:
# Write twitter_df to disk as a pickle ...

twitter_pickle_path = '../3_output/twitter_df.pickle'

with open(twitter_pickle_path, 'wb') as out_file:
    pickle.dump(twitter_df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# ... and read the same file back into twitter_df

with open(twitter_pickle_path, 'rb') as in_file:
    twitter_df = pickle.load(in_file)

### Merging accounts data into principal MP data 

In [12]:
# Read the pickled MP data frame (abg_df) from the output folder

abg_pickle_path = '../3_output/abg_df.pickle'

with open(abg_pickle_path, 'rb') as in_file:
    abg_df = pickle.load(in_file)

In [13]:
# Normalise MP names for name-based matching (helper used for both data frames)

def name_prep(name, twitter=True):
    """Reduce an MP name to a normalised string that can serve as a merge key.

    Normalisation steps, in order:
      1. remove anything enclosed in round or square brackets,
      2. remove up to five leading dotted abbreviations of at most six
         word characters each (e.g. "Prof. Dr. "),
      3. transliterate to ASCII with unidecode,
      4. collapse runs of spaces into a single space.

    Parameters
    ----------
    name : str
        Raw name string.
    twitter : bool, default True
        True  - `name` is a full name. Names with more than two tokens are
                reduced to "<first token> <last token>"; if the first token
                ends with '.', 'med' or 'forest', the second token is used
                as first name instead. Shorter names are returned as
                normalised.
        False - `name` is a single name column (first or last name). Only
                the first token is kept.

    Returns
    -------
    str
        The normalised name.
    """
    # Raw string: "\(" in a normal string literal is an invalid escape sequence
    interim = re.sub(r"[\(\[].*?[\)\]]", "", name).strip(' ')

    # Strip stacked titles one at a time, at most five of them
    for _ in range(5):
        interim = re.sub(r'(^\w{1,6}\. ?)', r'', interim)

    interim = unidecode.unidecode(interim).strip(' ')
    interim = re.sub(' +', ' ', interim)
    tokens = interim.split()

    if not twitter:
        return tokens[0] if len(tokens) > 1 else interim

    if len(tokens) <= 2:
        return interim

    # Skip a leftover title token that the regex above did not catch
    if tokens[0].endswith(('.', 'med', 'forest')):
        first_name = tokens[1]
    else:
        first_name = tokens[0]

    return first_name + ' ' + tokens[-1]

In [14]:
# Derive the matching key for every name in the Twitter df

twitter_df['name_matching'] = twitter_df['name'].map(name_prep)

In [15]:
# Build the matching key for the MP df from its separate first/last name columns

first_names = abg_df['first_name'].apply(name_prep, twitter=False)
last_names = abg_df['last_name'].apply(name_prep, twitter=False)
abg_df['name_matching'] = first_names + ' ' + last_names

In [21]:
# Left-join the scraped accounts onto the MP df via the matching key.
# NOTE(review): the join uses the name key only; a key that occurs more than
# once in twitter_df duplicates the corresponding MP row.

account_columns = twitter_df[['name_matching', 'twitter_ext']]
abg_twitter_df = abg_df.merge(account_columns, how='left', on='name_matching')

# Turn empty strings into NaN in both account columns

for column in ('twitter', 'twitter_ext'):
    values = abg_twitter_df[column]
    abg_twitter_df[column] = values.where(values != '', np.nan)

In [22]:
# Use the scraped account (twitter_ext) where there is one; otherwise keep the
# value already present in the 'twitter' column of the MP df

scraped = abg_twitter_df['twitter_ext']
abg_twitter_df['twitter'] = scraped.where(scraped.notnull(), abg_twitter_df['twitter'])

# The helper column is no longer needed once both sources are combined
abg_twitter_df = abg_twitter_df.drop(columns='twitter_ext')

### Converting Twitter URL to Twitter username

In [23]:
# URL to username (helper function)

def get_username(url):
    """Extract the Twitter username from a profile URL or a bare handle.

    Parameters
    ----------
    url : str
        Either a full URL starting with "http" (the username is taken from
        the first path segment after the host) or a bare account name.

    Returns
    -------
    str
        The username without query string, surrounding whitespace or a
        leading '@'. An empty string is returned for an http(s) URL that has
        no path segment after the host (previously an IndexError).
    """
    cleaned = url.strip()

    if cleaned.startswith('http'):
        # 'https://host/user/...'.split('/') -> ['https:', '', 'host', 'user', ...]
        parts = cleaned.split('/')
        handle = parts[3] if len(parts) > 3 else ''
    else:
        handle = cleaned

    return handle.split('?')[0].lstrip('@')

In [24]:
# Convert every non-missing account entry to a plain username; rows without an
# account are left out of the conversion and end up as NaN

has_account = abg_twitter_df['twitter'].notnull()
usernames = abg_twitter_df['twitter'][has_account].apply(get_username)
abg_twitter_df['twitter'] = usernames

In [26]:
# Preview the first rows of the merged MP / Twitter table
abg_twitter_df.head()

Unnamed: 0,last_name,first_name,party,bundesland,wahlkreis_name,wahlkreis_nr,wahlkreis,twitter,name_matching,twitter_ext
0,Abercron,Dr. Michael von,CDU/CSU,\n\n\n\n\n\n\n Schleswig-Holstein\n\n,Pinneberg,7.0,Wahlkreis 007: Pinneberg,mvabercron,Michael Abercron,
1,Achelwilm,Doris,Die Linke,Bremen,,,n.a.,DorisAchelwilm,Doris Achelwilm,https://twitter.com/doris_achelwilm
2,Aggelidis,Grigorios,FDP,\n\n\n\n\n\n\n Niedersachsen\n\n,Hannover-Land I,43.0,Wahlkreis 043: Hannover-Land I,aggelidis_fdp,Grigorios Aggelidis,
3,Akbulut,Gökay,Die Linke,\n\n\n\n\n\n\n Baden-Württemberg\n\n,Mannheim,275.0,Wahlkreis 275: Mannheim,akbulutgokay,Gokay Akbulut,https://twitter.com/akbulutgokay
4,Albani,Stephan,CDU/CSU,\n\n\n\n\n\n\n Niedersachsen\n\n,Oldenburg – Ammerland,27.0,Wahlkreis 027: Oldenburg – Ammerland,,Stephan Albani,


### Saving

In [27]:
# Store the merged table twice: as a pickle and as a CSV file

merged_pickle_path = '../3_output/abg_twitter_df.pickle'
merged_csv_path = '../3_output/abg_twitter_df.csv'

with open(merged_pickle_path, 'wb') as out_file:
    pickle.dump(abg_twitter_df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# No index column in the CSV; 'utf-8-sig' writes a byte-order mark at the start
abg_twitter_df.to_csv(merged_csv_path, index=False, encoding='utf-8-sig')