# Scraping MPs' tweets

### Prerequisites

In [2]:
# Libraries

import pandas as pd # data wrangling
import numpy as np # math operations
import math # math operations
import os # directories
import time # system time
import random # random number generation
import pickle # data compression
import re # regular expressions
import unidecode # regular expressions

import urllib.request # scraping
import requests # scraping
from bs4 import BeautifulSoup # scraping
import ctypes # interface to C
import tweepy # twitter 

import sys # system limit (preventing infinite running)
sys.setrecursionlimit(100000)

import selenium # chrome driver
from selenium import webdriver # chrome driver
import selenium.common.exceptions as selexcept # exception handling

### Preparation

In [3]:
# Import Bundestag data
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project.

with open('../3_output/abg_twitter_df.pickle', 'rb') as handle:
    abg_twitter_df = pickle.load(handle)

# Select name and username for each member and store in table called twitter_account.
# Series.rename returns a new Series here, so the column held by abg_twitter_df
# is not modified as a side effect.

names = abg_twitter_df['name_matching']
twitter_usernames = abg_twitter_df['twitter'].rename('username')
twitter_account = pd.concat([names, twitter_usernames], axis=1)

# Drop usernames that are nan or empty (i.e. parliamentarians with no account).
# notna() alone keeps '' and whitespace-only entries, so those are excluded explicitly.

mask = (
    twitter_account['username'].notna()
    & twitter_account['username'].astype(str).str.strip().ne('')
)
twitter_account = twitter_account[mask].reset_index(drop=True)

### Scraping

In [10]:
# Function to download tweets for a specific user with Tweepy

def download_tweets_tweepy_mod(username, client=None):
    """Download the original (non-retweet) tweets of one Twitter user.

    The timeline is requested in pages of up to 200 tweets through
    ``user_timeline`` until an empty page comes back. Every page after the
    first is requested with ``max_id`` set to one below the id of the oldest
    tweet collected so far, so that pages do not overlap.

    Parameters
    ----------
    username : str
        Screen name of the account to download.
    client : object, optional
        Object providing ``user_timeline(screen_name=..., count=...,
        tweet_mode=..., max_id=...)``. When omitted, the notebook-level
        ``api`` object is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``username``, ``available``, ``created_at``, ``full_text``,
        ``retweet_count``, ``favorite_count``, ``followers_count`` and
        ``location``. On success there is one row per non-retweet tweet
        (possibly zero rows) and ``available`` is True. If the download fails
        or the timeline is empty, a single all-NaN row with ``available`` set
        to False is returned instead.

    Raises
    ------
    NameError
        If ``client`` is omitted and no notebook-level ``api`` exists. This is
        a setup error and is deliberately not reported as a per-user failure.
    """

    # Resolve the client outside the try block so a missing `api` is not
    # mistaken for an unavailable account.

    if client is None:
        client = api

    # Specify relevant columns

    colnames = [
        'created_at',
        'full_text',
        'retweet_count',
        'favorite_count',
        'followers_count',
        'location'
    ]

    # Helper that builds the one-row placeholder used when no data is available

    def unavailable_frame():
        frame = pd.DataFrame(np.nan, index=[0], columns=colnames)
        frame.insert(0, 'available', False)
        return frame

    try:

        # Page through the timeline (200 is the maximum allowed count)

        alltweets = []
        request_args = {
            'screen_name': username,
            'count': 200,
            'tweet_mode': 'extended'
        }

        while True:
            new_tweets = client.user_timeline(**request_args)
            if len(new_tweets) == 0:
                break
            alltweets.extend(new_tweets)

            # All subsequent requests use the max_id param to prevent duplicates

            request_args['max_id'] = alltweets[-1].id - 1

        if alltweets:

            # Read the needed fields straight from the tweet objects

            records = [
                {
                    'created_at': tweet.created_at,
                    'full_text': tweet.full_text,
                    'retweet_count': tweet.retweet_count,
                    'favorite_count': tweet.favorite_count,
                    'followers_count': tweet.author.followers_count,
                    'location': tweet.author.location
                }
                for tweet in alltweets
            ]
            outtweets = pd.DataFrame(records, columns=colnames)

            # A tweet is a retweet when it carries a retweeted_status. The
            # attribute is missing on ordinary tweets, so it is looked up with
            # a default instead of being read as a DataFrame column (which
            # raised KeyError for timelines without any retweet).

            is_retweet = np.array(
                [getattr(tweet, 'retweeted_status', None) is not None
                 for tweet in alltweets],
                dtype=bool)
            outtweets = outtweets[~is_retweet]

            # Add boolean column for availability

            outtweets.insert(0, 'available', True)

        else:

            # Empty timeline: same placeholder row as for a failed download

            print('No tweets found for user %s' % username)
            outtweets = unavailable_frame()

    except Exception as err:

        # Best effort: one failing account must not abort the whole scrape.
        # KeyboardInterrupt/SystemExit are no longer swallowed, and the reason
        # is reported so real bugs are distinguishable from API errors.

        print('Data for user %s cannot be downloaded (%s: %s)'
              % (username, type(err).__name__, err))
        outtweets = unavailable_frame()

    # Add column with username

    outtweets.insert(0, 'username', username)
    return(outtweets)


In [11]:
# Empty DataFrame bound to tweepy_df; the scraping cell further down assigns the downloaded tweets to this name
tweepy_df = pd.DataFrame()

In [13]:
# Twitter API credentials are read from environment variables so that no
# secret is stored in the notebook. Credentials that were ever committed in
# plain text must be treated as compromised and regenerated.

credential_env_vars = {
    'consumer_key': 'TWITTER_CONSUMER_KEY',
    'consumer_secret': 'TWITTER_CONSUMER_SECRET',
    'access_token_key': 'TWITTER_ACCESS_TOKEN',
    'access_token_secret': 'TWITTER_ACCESS_TOKEN_SECRET'}

# Fail early with a readable message instead of a bare KeyError

missing_env_vars = [var for var in credential_env_vars.values() if not os.environ.get(var)]
if missing_env_vars:
    raise RuntimeError(
        'Missing Twitter API credentials; set the environment variable(s): '
        + ', '.join(missing_env_vars))

my_keys = {key: os.environ[var] for key, var in credential_env_vars.items()}

# Set up access to API

auth = tweepy.OAuthHandler(my_keys['consumer_key'], my_keys['consumer_secret'])
auth.set_access_token(my_keys['access_token_key'], my_keys['access_token_secret'])

# wait_on_rate_limit=True is passed so the client waits when the rate limit is hit

api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# Download most recent tweets using tweepy (at most 3200 tweets per user)
# The per-user frames are collected in a list and concatenated once: calling
# pd.concat inside the loop copies all previous rows on every iteration.
# Rebuilding tweepy_df from scratch also makes the cell safe to re-run.

user_frames = []
for username in twitter_account['username']:
    user_frames.append(download_tweets_tweepy_mod(username))

if user_frames:
    tweepy_df = pd.concat(user_frames)
else:
    # pd.concat raises on an empty list; keep a frame that can still be merged
    tweepy_df = pd.DataFrame(columns=['username'])

tweepy_df = twitter_account.merge(tweepy_df, on='username')

# Completion notice. ctypes.windll only exists on Windows, so fall back to
# print elsewhere instead of failing after the whole scrape has finished.
# MessageBoxW takes four arguments (owner window, text, caption, type); 0 = MB_OK.

if hasattr(ctypes, 'windll'):
    ctypes.windll.user32.MessageBoxW(0, "Twitter data successfully scraped", "Progress Report", 0)
else:
    print("Twitter data successfully scraped")

Data for user doris_achelwilm cannot be downloaded
Data for user akbulutgokay cannot be downloaded
Data for user Manfredbehrens cannot be downloaded
Data for user MWBirkwald cannot be downloaded
Data for user BystronAfD cannot be downloaded


In [1]:
# Preview the first 50 rows of tweepy_df.
# NOTE(review): the stored output of this cell is a NameError, i.e. it was last
# executed in a kernel where tweepy_df had not been defined yet - re-run the
# scraping cell first.
tweepy_df.head(50)

NameError: name 'tweepy_df' is not defined

### Saving

In [11]:
# Draw a 10% random sample of tweepy_df (without replacement, which is the
# pandas default) so methods can be tried on smaller data; the fixed
# random_state makes the sample reproducible

tweepy_df_subset = tweepy_df.sample(frac=0.1, random_state=1)

In [8]:
# Write the complete scraped data set to disk as a pickle file

tweepy_df_path = '../3_output/tweepy_df.pickle'
with open(tweepy_df_path, mode='wb') as handle:
    pickle.dump(tweepy_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
# Write the random subset to disk as a pickle file

tweepy_df_subset_path = '../3_output/tweepy_df_subset_no_retweets.pickle'
with open(tweepy_df_subset_path, mode='wb') as handle:
    pickle.dump(tweepy_df_subset, handle, protocol=pickle.HIGHEST_PROTOCOL)