In [1]:
# Load IPython's autoreload extension and select mode 2, which re-imports
# modules automatically before code is executed, so edits to local .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os

import pandas as pd

from nltk.corpus.reader import TwitterCorpusReader
from nltk.tokenize import TweetTokenizer



class MPTweetCorpusReader(TwitterCorpusReader):
    """
    Corpus reader created specifically for ease of use in text clustering of
    the British Member of Parliament tweets.

    On top of ``TwitterCorpusReader`` it reads ``mps.json`` from the corpus
    root and exposes the tweets as pandas dataframes at tweet, user or party
    granularity (see ``to_dataframe``).

    Attributes set by ``__init__``:
        parties      -- sorted, de-duplicated first path component of every fileid
        users        -- file stem of the second path component, one entry per fileid
        num_tweets   -- number of tweet strings in the corpus
        num_parties  -- len(parties)
        num_users    -- len(users)
        df_savepath  -- path of the pickled per-tweet dataframe ('tweet_df.pkl' under root)
        df           -- per-tweet dataframe (columns: user, party, userid, text),
                        or None when it could not be loaded from df_savepath
    """

    # NOTE: the default tokenizer instance is created once, when the class body
    # is executed, and is shared by every reader that does not pass its own.
    def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(),
                 encoding='utf-8', create_df=False):
        """
        Set up the reader and build or load the per-tweet dataframe.

        root, fileids, word_tokenizer, encoding:
            forwarded unchanged to ``TwitterCorpusReader.__init__``.
        create_df:
            if True, build ``self.df`` from the corpus files; otherwise try to
            load it from ``self.df_savepath`` and fall back to ``None`` (with a
            printed message) when that raises ``OSError``.
        """
        TwitterCorpusReader.__init__(self, root, fileids, word_tokenizer,
                                     encoding)

        # Fileids are split as '<party>/<user>.<ext>'. Sorting the party set
        # makes the attribute order reproducible between runs.
        self.parties = sorted({fileid.split('/')[0] for fileid in self.fileids()})
        self.users = [fileid.split('/')[1].split('.')[0] for fileid in self.fileids()]

        self.num_tweets = len(self.strings())
        self.num_parties = len(self.parties)
        self.num_users = len(self.users)

        # Metadata keyed by MP name; the code below reads the 'party' and
        # 'screen_name' entries of each value.
        with open(self._path_in_root('mps.json'), encoding='utf-8') as f:
            self._mp_dict = json.load(f)

        self.df_savepath = self._path_in_root('tweet_df.pkl')

        if create_df:
            print('Building tweet dataframe.')
            self._build_dataframe()
            print('Tweet dataframe built.')
            print()

        else:
            try:
                print("Loading tweet dataframe.")
                # NOTE(security): unpickling can execute arbitrary code; only
                # load a tweet_df.pkl that comes from a trusted source.
                self.df = pd.read_pickle(self.df_savepath)
                print("Tweet dataframe loaded.")
                print()

            except OSError as exc:
                self.df = None

                # str() is required: concatenating the exception object itself
                # to a string raises TypeError.
                print('OSError: ' + str(exc))
                print("No dataframe created/loaded.")
                print()

    def _path_in_root(self, name):
        """Return the path of ``name`` inside the corpus root, whether or not
        the root was given with a trailing path separator."""
        return os.path.join(self.root, name)

    def _build_dataframe(self):
        """
        Build ``self.df`` with one row per tweet (columns: user, party,
        userid, text) for every MP in ``mps.json`` whose tweet file can be read.
        """
        rows = []
        for user, info in self._mp_dict.items():
            # Get the fileid of the user's tweets: '<party>/<name>.jsonl',
            # lower-cased, with spaces in the party name replaced by '_' and
            # spaces in the MP name removed.
            user_fileid = (info['party'].lower().replace(' ', '_') + '/'
                           + user.lower().replace(' ', '') + '.jsonl')

            try:
                # Read through this reader itself, not a notebook-level
                # variable, so the method also works during __init__.
                for string in self.strings(user_fileid):
                    rows.append((user, info['party'], info['screen_name'], string))

            except OSError:
                # File doesn't exist, probably due to a locked twitter profile.
                continue

        # Build the frame once; appending row by row is quadratic.
        self.df = pd.DataFrame(rows, columns=['user', 'party', 'userid', 'text'])

    def to_dataframe(self, samples='tweet', savename=None):
        """
        Return the tweets as a dataframe at the requested granularity.

        samples = string: {'tweet', 'user', 'party'}
            'tweet' -- return ``self.df`` as is (one row per tweet; may be
                       None if it was never built/loaded).
            'user'  -- one row per MP, indexed by MP name, with columns
                       'party' and 'text' (all of the MP's tweets joined by
                       single spaces). MPs without any tweet text are omitted.
            'party' -- one row per party, indexed by party name, with column
                       'text' (the per-MP texts of that party joined by spaces).
        savename = optional file name; when given, the 'user' or 'party'
            dataframe is also pickled to that name inside the corpus root.
            It is not used when samples is 'tweet'.

        Raises AssertionError for an unknown ``samples`` value and
        RuntimeError when 'user'/'party' is requested but no tweet dataframe
        is available.
        """
        assert samples in ("tweet", "user", "party"), "Invalid argument [samples]:" + str(samples)

        # Base dataframe.
        if samples == "tweet":
            return self.df

        if self.df is None:
            raise RuntimeError("No tweet dataframe available; create the "
                               "reader with create_df=True first.")

        # Collect every user's tweets in a single pass over the tweet frame
        # instead of filtering the whole frame once per MP.
        texts_by_user = {}
        for user, text in zip(self.df['user'], self.df['text']):
            texts_by_user.setdefault(user, []).append(text)

        # Concatenate all tweets from each MP into one string, following the
        # order of mps.json.
        user_rows = []
        for user, info in self._mp_dict.items():
            tweets = ' '.join(texts_by_user.get(user, []))
            if tweets != '':
                user_rows.append((user, info['party'], tweets))

        # Let the name of the MP/user be the index of the dataframe.
        df_by_user = pd.DataFrame(
            user_rows, columns=['user', 'party', 'text']).set_index('user')

        if samples == "user":
            if savename is not None:
                df_by_user.to_pickle(self._path_in_root(savename))
            return df_by_user

        # samples == "party": concatenate the texts of all users in a party,
        # keeping parties in order of first appearance.
        texts_by_party = {}
        for party, text in zip(df_by_user['party'], df_by_user['text']):
            texts_by_party.setdefault(party, []).append(text)

        party_rows = []
        for party, texts in texts_by_party.items():
            tweets = ' '.join(texts)
            if tweets != '':
                party_rows.append((party, tweets))

        df_by_party = pd.DataFrame(
            party_rows, columns=['party', 'text']).set_index('party')

        if savename is not None:
            df_by_party.to_pickle(self._path_in_root(savename))
        return df_by_party


In [3]:
# fileids is a regular expression over file paths relative to root; the dot
# before "jsonl" is escaped so that only names ending in ".jsonl" match.
corpus = MPTweetCorpusReader(root='./corpus/', fileids=r'.*\.jsonl', create_df=False)

# Aggregate the tweets per MP and per party; each result is also pickled
# under the corpus root with the given file name.
df_user = corpus.to_dataframe(samples='user', savename='user_df.pkl')
df_party = corpus.to_dataframe(samples='party', savename='party_df.pkl')

Loading tweet dataframe.
Tweet dataframe loaded.



In [4]:
# Per-tweet dataframe held by the reader (columns: user, party, userid, text).
corpus.df

Unnamed: 0,user,party,userid,text
0,Adam Afriyie,Conservative,AdamAfriyie,I welcome this great news for our military per...
1,Adam Afriyie,Conservative,AdamAfriyie,Delighted our Prime Minister has secured a new...
2,Adam Afriyie,Conservative,AdamAfriyie,It was privilege to attend &amp; celebrate the...
3,Adam Afriyie,Conservative,AdamAfriyie,"With HS2 under review, it is now time to revie..."
4,Adam Afriyie,Conservative,AdamAfriyie,Loving all the #Farm24 support today. It’s tha...
...,...,...,...,...
2893,Mike Gapes,The Independent Group,MikeGapes,A lesson for Johnson? https://t.co/BArhF1HUZr
2894,Mike Gapes,The Independent Group,MikeGapes,"RT @BBCNewsnight: ""He has got great qualities...."
2895,Mike Gapes,The Independent Group,MikeGapes,You should have gone to specsavers https://t.c...
2896,Mike Gapes,The Independent Group,MikeGapes,The Independent Group for Change is a party re...


In [5]:
# One row per MP: indexed by MP name, with the party and all of that MP's
# tweets joined into a single text.
df_user

Unnamed: 0_level_0,party,text
user,Unnamed: 1_level_1,Unnamed: 2_level_1
Adam Afriyie,Conservative,I welcome this great news for our military per...
Alan Duncan,Conservative,Strategists be careful. If the Government thin...
Alan Mak,Conservative,RT @MagicCarmel: Thank you so much - great job...
Alberto Costa,Conservative,RT @BlabyDC: URGENT DOG APPEAL: We urgently ne...
Alec Shelbrooke,Conservative,RT @elashton: At least it's nice and quiet tod...
...,...,...
Ann Coffey,The Independent Group,RT @mrchrisjohn: Now I see what @Jacob_Rees_Mo...
Anna Soubry,The Independent Group,What a disgraceful #nasty @Conservatives party...
Chris Leslie,The Independent Group,Beginning to look like all these @sajidjavid s...
Joan Ryan,The Independent Group,RT @Siobhain_MP: The former Prime Minister’s d...


In [6]:
# One row per party: indexed by party name, with the texts of all its MPs
# joined into a single text.
df_party

Unnamed: 0_level_0,text
party,Unnamed: 1_level_1
Conservative,I welcome this great news for our military per...
Democratic Unionist Party,At @CommonsEFRA obtaining evidence on labour c...
Green Party,Next hurdle cleared! Second reading passed 3...
Independent,RT @rjd_crowther: There are not 3 million EU c...
Labour,"At #PMQs today, I asked Boris Johnson how much..."
Liberal Democrat,A great maiden speech by @DoddsJane in today's...
Plaid Cymru,RT @JonathanPlaid: Exactly ⬇️⬇️⬇️ https://t.co...
Scottish National Party,Another quarter billion pounds casually thrown...
Sinn Fein,Saddened to see the Jimmy Gralton monument has...
The Independent Group,RT @mrchrisjohn: Now I see what @Jacob_Rees_Mo...
