In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import os
import tweepy
import json

from utils import json_dump_line, get_tweet_auth

class CorpusCreator:
    """
    Build an on-disk tweet corpus for a set of Twitter users using tweepy.

    Structure written by ``load_tweets``:
    -> corpus/
    ----> <name1>.jsonl
    ----> <name2>.jsonl
    .
    .

    ``<name>`` is the key of the user dictionary, lower-cased and with
    spaces removed. ``json_dump_line`` is called once per fetched tweet
    with the tweet's raw JSON payload and the open file.
    """

    def __init__(self, user_dict=None, rel_path='./',
                 rate_limit_wait=True):
        """
        Create the API client, resolve the user dictionary and make sure
        the corpus directory exists.

        Parameters
        ----------
        user_dict : dict, optional
            Mapping of display name -> info dict; each info dict must hold
            a 'screen_name' entry. When None, the mapping is read from
            'mps.json' in the current working directory.
        rel_path : str
            Directory under which the 'corpus' folder is created. A
            trailing path separator is optional.
        rate_limit_wait : bool
            Forwarded to tweepy as both ``wait_on_rate_limit`` and
            ``wait_on_rate_limit_notify``.

        Raises
        ------
        OSError
            If the corpus directory cannot be created for any reason other
            than it already existing.
        """
        # Get an API-object authorized from 'credentials.txt'
        auth = get_tweet_auth()
        self.api = tweepy.API(auth,
                              wait_on_rate_limit=rate_limit_wait,
                              wait_on_rate_limit_notify=rate_limit_wait)

        # Joining with a trailing '' keeps the separator at the end of the
        # root (so `self.root + name` still works) while no longer requiring
        # rel_path itself to end in a separator.
        self.root = os.path.join(rel_path, 'corpus', '')

        # Load mp-file from directory
        if user_dict is None:
            with open('mps.json', encoding='utf-8') as f:
                self.users = json.load(f)
        else:
            self.users = user_dict

        # Create root filesystem. Only "already exists" is treated as
        # benign; other failures (e.g. permissions) propagate instead of
        # being misreported.
        try:
            os.makedirs(self.root)
            print('Directory "corpus" created.')
        except FileExistsError:
            print('Directory "corpus" already exists.')
        print()

    def load_tweets(self, max_items=10000, user=None):
        """
        Fetch up to ``max_items`` timeline tweets per user and write them
        to one '<name>.jsonl' file per user inside the corpus directory.

        Parameters
        ----------
        max_items : int
            Upper bound on the number of tweets requested per user.
        user : str, optional
            When given, only the entry whose dictionary key or
            'screen_name' equals this value is processed. When None
            (default), every entry in ``self.users`` is processed.

        Notes
        -----
        A ``tweepy.TweepError`` for one user is printed and the loop
        continues with the next user. The output file is opened before the
        first tweet arrives, so a failed user may leave an empty or
        partial file behind.
        """
        for name, info in self.users.items():
            screen_name = info['screen_name']
            if user is not None and user not in (name, screen_name):
                continue
            try:
                print(f'Reading tweets from {name}')
                curs = tweepy.Cursor(self.api.user_timeline,
                                     screen_name=screen_name).items(max_items)

                filename = name.lower().replace(' ', '') + '.jsonl'
                target = os.path.join(self.root, filename)
                with open(target, 'w', encoding='utf-8') as f:
                    for status in curs:
                        # status._json is the raw API payload for the tweet
                        json_dump_line(status._json, f)
            except tweepy.TweepError as exc:
                print(exc)

In [11]:
# Driver cell: refresh the user list, then download tweets into the corpus.
from utils import create_user_list

# NOTE(review): presumably (re)generates 'mps.json' -- the recorded output of
# this cell prints "MPs on Twitter stored in mps.json"; confirm in utils.
create_user_list()

# No user_dict is passed, so the constructor reads 'mps.json' from the
# working directory and roots the corpus at './corpus/'.
corpus_creator = CorpusCreator()
# Request at most 100 timeline items per user.
corpus_creator.load_tweets(max_items=100)


Correction
Old:  {'party': 'Conservative', 'screen_name': 'daviddaguidmp', 'url': 'https://twitter.com/daviddaguidmp'}
New:  {'party': 'Conservative', 'screen_name': 'davidduguidmp', 'url': 'https://twitter.com/davidduguidmp'}


Correction
Old:  {'party': 'Conservative', 'screen_name': 'eorgeFreemanMP', 'url': 'https://twitter.com/eorgeFreemanMP'}
New:  {'party': 'Conservative', 'screen_name': 'GeorgeFreemanMP', 'url': 'https://twitter.com/GeorgeFreemanMP'}


Correction
Old:  {'party': 'Conservative', 'screen_name': 'SirRogerGaleMP', 'url': 'https://twitter.com/SirRogerGaleMP'}
New:  {'party': 'Conservative', 'screen_name': 'SirRogerGale', 'url': 'https://twitter.com/SirRogerGale'}

MPs on Twitter stored in mps.json
Directory "corpus" already exists.
Reading tweets from Adam Afriyie
Reading tweets from Alan Duncan
Reading tweets from Alan Mak
Reading tweets from Alberto Costa


Rate limit reached. Sleeping for: 282


KeyboardInterrupt: 