# Vox Populi

We'll start by creating a JSON document with information about MPs on Twitter, retrieved from https://www.mpsontwitter.co.uk/list. The list contains some errors; these are fixed by the `user_id_correction` function, which loads corrections from a file.

In [4]:
import requests
import json

from bs4 import BeautifulSoup

def create_user_list(filename='mps.json', errata='user-errata.txt'):
    """
    Retrieve the list of MPs on Twitter and store it in a json file.

    The page https://www.mpsontwitter.co.uk/list is downloaded and its
    table is read row by row. The json document written to [filename]
    maps each MP's full name to a dictionary with the keys "party",
    "screen_name" and "url".

    filename: path of the json file to write.
    errata:   path of a corrections file that is handed on to
              user_id_correction together with [filename], or None to
              skip the correction step.

    Raises requests.HTTPError if the server answers with an error status.
    """

    url = "https://www.mpsontwitter.co.uk/list"
    data = requests.get(url)
    # Stop here with a clear HTTP error rather than failing further down
    # while trying to pick a table out of an error page.
    data.raise_for_status()

    html = BeautifulSoup(data.text, 'html.parser')
    # NOTE(review): the table is picked by position ([1]); the `id` keyword
    # does not look like part of the CSS selector -- confirm it has any
    # effect.
    table = html.select("tbody", id='mp_wrapper')[1]

    mp_dict = {}
    for line in table.select('tr'):
        # The MP's name is the text of the third cell of the row.
        name = line.select('td')[2].get_text().strip()
        # The party is taken from the CSS classes of the first cell.
        party = ' '.join(line.td['class'])
        # Drop the first character of the link text (presumably the '@'
        # in front of the handle -- verify against the page).
        twitter_id = line.a.get_text()[1:]

        mp_dict[name] = {
            "party": party,
            "screen_name": twitter_id,
            "url": "https://twitter.com/" + twitter_id
        }

    with open(filename, 'w') as f:
        json.dump(mp_dict, f)

    if errata is not None:
        user_id_correction(erratafile=errata, user_file=filename)

    print(f'MPs on Twitter stored in {filename}')
    
    
def user_id_correction(erratafile='user-errata.txt',
                       user_file='mps.json'):
    """
    Fix wrong screen names in user_file, using the pairs in erratafile.

    erratafile: text file with one correction per line, written as
                wrong-id=right-id. Blank lines, lines without a '=' and
                lines with an empty wrong-id are ignored. Whitespace
                around either id is dropped.
    user_file:  json file mapping a name to a dictionary holding at
                least the keys "screen_name" and "url". The file is
                rewritten in place with the corrected entries.

    Every applied correction is printed (old and new entry).
    """
    # Create dictionary with errata[wrong-id] = right-id
    errata = {}
    with open(erratafile, 'r') as f:
        for line in f:
            fields = line.rstrip('\n').split('=')

            # Blank or malformed line: nothing to correct.
            if len(fields) < 2:
                continue

            wrong_id = fields[0].strip()
            right_id = fields[1].strip()

            # An empty wrong-id would make the slice below ([:-0]) wipe
            # the whole url, so such lines are skipped.
            if not wrong_id:
                continue

            errata[wrong_id] = right_id

    # Load dictionary to be corrected
    with open(user_file, 'r') as f:
        user_dict = json.load(f)

    # Correct the entries in the user-file
    for wrong_id, right_id in errata.items():
        for info in user_dict.values():
            if info["screen_name"] != wrong_id:
                continue

            print()
            print('Correction')
            print('Old: ', info)
            # Swap the trailing wrong id of the url for the right one.
            info["url"] = info["url"][:-len(wrong_id)] + right_id
            info["screen_name"] = right_id
            print('New: ', info)
            print()

    # Save corrected dictionary to json.
    with open(user_file, 'w') as f:
        json.dump(user_dict, f)
    
# Build the MP list with the default file names: the result is written to
# 'mps.json' and corrected with the entries in 'user-errata.txt'.
create_user_list()


Correction
Old:  {'party': 'Conservative', 'screen_name': 'daviddaguidmp', 'url': 'https://twitter.com/daviddaguidmp'}
New:  {'party': 'Conservative', 'screen_name': 'davidduguidmp', 'url': 'https://twitter.com/davidduguidmp'}


Correction
Old:  {'party': 'Conservative', 'screen_name': 'eorgeFreemanMP', 'url': 'https://twitter.com/eorgeFreemanMP'}
New:  {'party': 'Conservative', 'screen_name': 'GeorgeFreemanMP', 'url': 'https://twitter.com/GeorgeFreemanMP'}


Correction
Old:  {'party': 'Conservative', 'screen_name': 'SirRogerGaleMP', 'url': 'https://twitter.com/SirRogerGaleMP'}
New:  {'party': 'Conservative', 'screen_name': 'SirRogerGale', 'url': 'https://twitter.com/SirRogerGale'}

MPs on Twitter stored in mps.json


Based on this list of MPs, we will load their tweets into a corpus using the tweepy module. This requires the API keys, which you should have stored in the `credentials.txt` file.

In [53]:
import os
import sys
import tweepy

class CorpusCreator:
    """
    Class for creating corpus structure and loading tweets
    using tweepy.

    Structure:
    -> corpus

    ---> party1
    -----> user11
    -----> user12
    -----> user13

    ---> party2
    -----> user21
    -----> user22
    .
    .
    .
    """

    def __init__(self, user_dict=None, rel_path='./',
                 rate_limit_wait=True):
        """
        Set up the API object, the user dictionary and the corpus root.

        user_dict:       dictionary of users, the path of a json file
                         holding such a dictionary, or None to read
                         'mps.json'.
        rel_path:        prefix that is prepended as-is to 'corpus/' to
                         form the corpus root directory.
        rate_limit_wait: handed to tweepy.API as both wait_on_rate_limit
                         and wait_on_rate_limit_notify.
        """
        # Get an API-object authorized from 'credentials.txt'
        auth = get_tweet_auth()
        self.api = tweepy.API(auth,
                              wait_on_rate_limit=rate_limit_wait,
                              wait_on_rate_limit_notify=rate_limit_wait)

        self.root = rel_path + 'corpus/'

        # Load mp-file from directory
        assert user_dict is None or isinstance(user_dict, (str, dict)), "User_dict wrong format"
        if user_dict is None:
            user_dict = 'mps.json'

        if isinstance(user_dict, str):
            with open(user_dict) as f:
                self.users = json.load(f)
        else:
            self.users = user_dict

        # Create root filesystem. Only an already existing directory is
        # reported as such; any other failure (e.g. missing permissions)
        # is raised instead of being mislabelled.
        root_existed = os.path.isdir(self.root)
        os.makedirs(self.root, exist_ok=True)
        if root_existed:
            print('Directory "corpus" already exists.')
        else:
            print('Directory "corpus" created.')
        print()

    def load_tweets(self, max_items=10000, user=None):
        """
        For all users in self.users, get up to [max_items] tweets and
        save them to <root>/<party>/<name>.jsonl, one json object per
        line. The timeline is requested with tweet_mode="extended".

        If tweepy reports an error for a user, the error is printed and
        that user's file is removed again.

        user: accepted for backward compatibility; it is currently not
              used to restrict the download.
        """
        for name, info in self.users.items():
            party_dir = self.root + info['party'].lower().replace(' ', '_')
            os.makedirs(party_dir, exist_ok=True)

            tweet_file = party_dir + '/' + name.lower().replace(' ', '') + '.jsonl'

            print(f'Reading tweets from {name}')
            try:
                curs = tweepy.Cursor(self.api.user_timeline,
                                     screen_name=info['screen_name'],
                                     count=200,
                                     tweet_mode="extended"
                                     ).items(max_items)

                with open(tweet_file, 'w') as f:
                    for status in curs:
                        json_dump_line(status._json, f)

            except tweepy.TweepError as exc:
                print(exc)
                # Drop the partial file; it may not exist if the error
                # was raised before the file could be opened.
                try:
                    os.remove(tweet_file)
                except FileNotFoundError:
                    pass

                
def get_tweet_auth(auth_file='credentials.txt'):
    """
    Get tweepy oauth object given a credentials-file, formatted
    as a nltk.twitter-creds file.

    From every line the text after the last '=' is taken as a key/token;
    lines that yield an empty value are skipped. The first two values
    are passed to tweepy.OAuthHandler and the remaining ones to
    set_access_token, in file order.
    """
    keys = []

    # Open credentials-file
    with open(auth_file, 'r') as f:
        for line in f:
            # Read only key/token, without surrounding whitespace
            token = line.split('=')[-1].strip()

            # Add token to keys-list (compare by value, not by identity)
            if token:
                keys.append(token)

    auth = tweepy.OAuthHandler(*keys[:2])
    auth.set_access_token(*keys[2:])
    return auth


def json_dump_line(json_object, file_object):
    """
    Write json_object to file_object as one line of a json lines file:
    the serialized object followed by a single newline character.
    """
    json.dump(json_object, file_object)
    # A bare print emits just the line terminator.
    print(file=file_object)


def rm_empty_json_in_path(path):
    """
    Browses recursively through the directory [path] and removes every
    file whose name ends in '.jsonl' and whose size is zero bytes.

    Only the file name is inspected, so files of other types are left
    alone even when one of their parent directories has '.jsonl' in its
    name. The visited directories and removed files are printed.

    Raises AssertionError if [path] is not a directory.
    """
    assert os.path.isdir(path), "[path] not a valid directory"

    # Ensure directories are given with ending '/' for recursion
    if not path.endswith('/'):
        path += '/'

    print('Browsing "' + path + '"')

    for entry in os.listdir(path):
        filepath = path + entry
        if os.path.isfile(filepath) and entry.endswith('.jsonl'):
            try:
                if os.path.getsize(filepath) == 0:
                    print('Removing ' + filepath)
                    os.remove(filepath)

            # Shouldn't happen, but just to make sure.
            except OSError as e:
                print(e)

        elif os.path.isdir(filepath):
            # Browse one dir deeper
            rm_empty_json_in_path(filepath + '/')
    
    
    
    
# Ask how many tweets to fetch per user. A reply that is not an integer
# makes int() raise ValueError before the check below is reached.
number_of_tweets = int(input("Number of tweets per user:"))

# Ensure that the argument is a positive integer.
assert number_of_tweets > 0, "Number of tweets must be a positive integer"

# Create the MP list only if 'mps.json' is not already present, then run
# the corpus creator on it.
if not os.path.isfile('mps.json'):
    create_user_list()

corpus_creator = CorpusCreator(user_dict='mps.json')
corpus_creator.load_tweets(max_items=number_of_tweets)
# Clean out the empty tweet files left behind in the corpus directory.
rm_empty_json_in_path('corpus/')

Some of the generated files are empty for various reasons, e.g. MPs who have created a Twitter account but have yet to post anything. These files should be removed to prevent errors when loading the corpus.

Now that the tweets have finished downloading, we need a way to read them. We will build a class inheriting from the nltk TwitterCorpusReader. However, since Twitter extended the maximum tweet length, we have to write our own `strings()` method that reads the full tweets, instead of text ending in '...' when a tweet goes beyond the original character limit. Furthermore, we will write the tweets to different dataframes for easier use later.

In [54]:
import json
import pandas as pd

from nltk.tokenize import TweetTokenizer
from nltk.corpus.reader import TwitterCorpusReader



class MPTweetCorpusReader(TwitterCorpusReader):
    """
    Corpus reader created specifically for ease of use in text clustering
    of the British Member of Parliament tweets.

    The fileids are expected to look like <party>/<user>.jsonl, and a
    file 'mps.json' is read from one level above the corpus root.
    """

    def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(),
                 encoding='utf-8', create_df=False):
        """
        root, fileids, word_tokenizer, encoding:
            handed on unchanged to TwitterCorpusReader.
        create_df:
            if True, build the per-tweet dataframe from the corpus files;
            otherwise try to load it from <root>tweet_df.pkl. If loading
            fails with an OSError, self.df is set to None.
        """
        TwitterCorpusReader.__init__(self, root, fileids, word_tokenizer,
                                     encoding)

        # First path component is the party, the file stem is the user.
        self.parties = list({fileid.split('/')[0] for fileid in self.fileids()})
        self.users = [fileid.split('/')[1].split('.')[0] for fileid in self.fileids()]

        self.num_tweets = len(self.strings())
        self.num_parties = len(self.parties)
        self.num_users = len(self.users)

        with open(self.root + '../mps.json') as f:
            self._mp_dict = json.load(f)

        self.df_savepath = self.root + 'tweet_df.pkl'

        if create_df:
            print('Building tweet dataframe.')
            self._build_dataframe()
            print('Tweet dataframe built.')
            print()

        else:
            try:
                print("Loading tweet dataframe.")
                self.df = pd.read_pickle(self.df_savepath)
                print("Tweet dataframe loaded.")
                print()

            except OSError as exc:
                self.df = None

                # The exception has to be converted explicitly; adding it
                # to a str would raise a TypeError of its own.
                print('OSError: ' + str(exc))
                print("No dataframe created/loaded.")
                print()

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s), read from
        the "full_text" field. Documents without that field are skipped.
        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        tweets = []
        for jsono in self.docs(fileids):
            try:
                text = jsono["full_text"]
            except KeyError:
                continue

            if isinstance(text, bytes):
                text = text.decode(self.encoding)
            tweets.append(text)
        return tweets

    def _build_dataframe(self):
        """
        Build self.df with one row per tweet and the columns
        'user', 'party', 'userid' and 'text'.

        Users from mps.json whose tweet file cannot be read (OSError)
        are skipped.
        """
        # Rows are collected in a list and turned into a dataframe once;
        # growing a dataframe one row at a time copies it over and over.
        rows = []
        for user, info in self._mp_dict.items():
            # Get filepath for users tweet
            user_fileids = (info['party'].lower().replace(' ', '_') + '/'
                            + user.lower().replace(' ', '') + '.jsonl')

            try:
                user_tweets = self.strings(user_fileids)
            except OSError:
                # File doesn't exist, probably due to locked twitter profile.
                continue

            rows.extend([user, info['party'], info['screen_name'], text]
                        for text in user_tweets)

        self.df = pd.DataFrame(rows, columns=['user', 'party', 'userid', 'text'])

    def to_dataframe(self, samples='tweet', savename=None):
        """
        samples = string: {'tweet', 'user', 'party'}

        For samples == 'tweet' the per-tweet dataframe stored in self.df
        is returned as it is (savename is not used in this case).

        For 'user' or 'party' a new dataframe is returned in which all
        tweets belonging to a single user/party are concatenated into one
        string, separated by spaces, with the user/party as index. If
        savename is given, that dataframe is pickled to <root><savename>.
        """
        assert samples in ("tweet", "user", "party"), "Invalid argument [samples]:" + str(samples)

        # Base dataframe.
        if samples == "tweet":
            return self.df

        # Collect the tweets of every user in a single pass over self.df.
        texts_by_user = {}
        for user, text in zip(self.df['user'], self.df['text']):
            texts_by_user.setdefault(user, []).append(text)

        user_rows = []
        for user, info in self._mp_dict.items():
            # Concatenate all tweets from user into one string.
            tweets = ' '.join(texts_by_user.get(user, []))
            if tweets != '':
                user_rows.append([user, info['party'], tweets])

        df_by_user = pd.DataFrame(user_rows, columns=['user', 'party', 'text'])
        # Let name of mp/user be index of dataframe
        df_by_user.set_index('user', inplace=True)

        if samples == "user":
            result = df_by_user
        else:
            # Concatenate tweets from all users in party into single string,
            # keeping the parties in order of first appearance.
            texts_by_party = {}
            for party, text in zip(df_by_user['party'], df_by_user['text']):
                texts_by_party.setdefault(party, []).append(text)

            party_rows = []
            for party, texts in texts_by_party.items():
                tweets = ' '.join(texts)
                if tweets != '':
                    party_rows.append([party, tweets])

            result = pd.DataFrame(party_rows, columns=['party', 'text'])
            result.set_index('party', inplace=True)

        if savename is not None:
            result.to_pickle(self.root + savename)
        return result


In [49]:
# Read every .jsonl file below ./corpus/ and build the per-tweet dataframe
# from the files (create_df=True) instead of loading a pickled one.
corpus = MPTweetCorpusReader(root='./corpus/', fileids='.*.jsonl', create_df=True)
# Pickle the per-tweet dataframe.
corpus.df.to_pickle('corpus/tweet_df.pkl')
# Per-user and per-party dataframes; the second argument is the file name
# they are pickled under, relative to the corpus root.
df_user = corpus.to_dataframe('user', 'user_df.pkl')
df_party = corpus.to_dataframe('party', 'party_df.pkl')

Building tweet dataframe.
Tweet dataframe built.

