In [None]:
%load_ext autoreload
%autoreload 2

In [23]:
import os
import tweepy
import json

from utils import json_dump_line, get_tweet_auth

class CorpusCreator:
    """
    Build an on-disk corpus of tweets downloaded with tweepy.

    One JSON-lines file is written per user, grouped into one
    directory per party:

    -> corpus

    ----> party_name1
    ------> username1.jsonl
    ------> username2.jsonl

    ----> party_name2
    ------> username3.jsonl
    .
    .

    Directory names are the lower-cased ``party`` value with spaces
    replaced by underscores; file names are the lower-cased key of the
    user dict with spaces removed.
    """

    def __init__(self, user_dict=None, rel_path='./',
                 rate_limit_wait=True):
        """
        Set up the API client, the user table and the corpus root.

        Parameters
        ----------
        user_dict : dict or None
            Mapping of display name -> info dict. ``load_tweets`` reads
            the ``'party'`` and ``'screen_name'`` keys of each info
            dict. When None, the mapping is read from ``mps.json`` in
            the current working directory.
        rel_path : str
            Directory in which the ``corpus/`` folder is created.
        rate_limit_wait : bool
            Forwarded to tweepy as both ``wait_on_rate_limit`` and
            ``wait_on_rate_limit_notify``.
        """
        # Get an API-object authorized from 'credentials.txt'
        # (get_tweet_auth comes from utils; its internals are not
        # visible here).
        auth = get_tweet_auth()
        # NOTE(review): `wait_on_rate_limit_notify` (and
        # `tweepy.TweepError` used below) belong to the tweepy 3.x API;
        # confirm the pinned tweepy version before upgrading.
        self.api = tweepy.API(auth,
                              wait_on_rate_limit=rate_limit_wait,
                              wait_on_rate_limit_notify=rate_limit_wait)

        # Make sure the base directory ends with a separator so that
        # e.g. rel_path='data' yields 'data/corpus/' rather than
        # 'datacorpus/'. An empty rel_path still means "current dir".
        if rel_path and not rel_path.endswith(('/', os.sep)):
            rel_path = rel_path + '/'
        # self.root keeps its trailing '/', which load_tweets relies on.
        self.root = rel_path + 'corpus/'

        # Load mp-file from directory
        if user_dict is None:
            with open('mps.json', encoding='utf-8') as f:
                self.users = json.load(f)
        else:
            self.users = user_dict

        # Create root filesystem. Only "already exists" is treated as
        # benign; any other OSError (permissions, bad path, ...) now
        # propagates instead of being misreported.
        try:
            os.makedirs(self.root)
        except FileExistsError:
            print('Directory "corpus" already exists.')
        else:
            print('Directory "corpus" created.')
        print()

    def load_tweets(self, max_items=10000, user=None):
        """
        Download up to ``max_items`` tweets for every user in
        ``self.users`` and write them to one ``.jsonl`` file per user.

        Parameters
        ----------
        max_items : int
            Upper bound on the number of tweets fetched per user.
        user
            Currently unused; kept so existing callers that pass it
            keep working. All users in ``self.users`` are processed.

        If tweepy raises ``TweepError`` for a user, the error is
        printed, that user's (possibly partial) file is removed, and
        processing continues with the next user.
        """
        for name, info in self.users.items():
            party_dir = self.root + info['party'].lower().replace(' ', '_')
            os.makedirs(party_dir, exist_ok=True)

            outfile = party_dir + '/' + name.lower().replace(' ', '') + '.jsonl'

            print(f'Reading tweets from {name}')
            try:
                # count is the page size requested per API call;
                # items(max_items) caps the total yielded by the cursor.
                timeline = tweepy.Cursor(self.api.user_timeline,
                                         screen_name=info['screen_name'],
                                         count=200).items(max_items)

                with open(outfile, 'w', encoding='utf-8') as f:
                    for status in timeline:
                        # presumably writes one JSON object per line;
                        # json_dump_line is defined in utils.
                        json_dump_line(status._json, f)

            except tweepy.TweepError as exc:
                print(exc)
                # Drop the incomplete file; it may not exist if the
                # error was raised before the file was opened.
                try:
                    os.remove(outfile)
                except FileNotFoundError:
                    pass

In [24]:
# corpus_creator = CorpusCreator()
# corpus_creator.load_tweets(max_items=5)

Directory "corpus" already exists.

Reading tweets from Adam Afriyie
Reading tweets from Alan Duncan
Reading tweets from Alan Mak
Reading tweets from Alberto Costa
Reading tweets from Alec Shelbrooke
Reading tweets from Alex Burghart
Reading tweets from Alex Chalk
Reading tweets from Alistair Burt
Reading tweets from Alok Sharma
Reading tweets from Alun Cairns
Reading tweets from Amanda Milling
Reading tweets from Amber Rudd
Reading tweets from Andrea Jenkyns
Reading tweets from Andrea Leadsom
Reading tweets from Andrew Bowie
Reading tweets from Andrew Bridgen
Reading tweets from Andrew Jones
Reading tweets from Andrew Lewer
Reading tweets from Andrew Rosindell
Reading tweets from Andrew Selous
Reading tweets from Andrew Stephenson
Reading tweets from Anne Marie Morris
Reading tweets from Anne Milton
Reading tweets from Anne-Marie Trevelyan
Reading tweets from Antoinette Sandbach
Reading tweets from Ben Bradley
Reading tweets from Ben Wallace
Reading tweets from Bernard Jenkin
Reading 

Reading tweets from Stephen Barclay
Reading tweets from Stephen Crabb
Reading tweets from Stephen Hammond
Reading tweets from Stephen Kerr
Reading tweets from Stephen McPartland
Reading tweets from Stephen Metcalfe
Reading tweets from Steve Brine
Reading tweets from Steve Double
Reading tweets from Steven Baker
Reading tweets from Stuart Andrew
Reading tweets from Suella Fernandes
Reading tweets from Theresa May
Reading tweets from Tim Loughton
Reading tweets from Tobias Ellwood
Reading tweets from Tom Pursglove
Reading tweets from Tom Tugendhat
Reading tweets from Tracey Crouch
Reading tweets from Vicky Ford
Reading tweets from Victoria Prentis
Reading tweets from Wendy Morton
Reading tweets from Will Quince
Reading tweets from William Cash
Reading tweets from William Wragg
Reading tweets from Zac Goldsmith
Reading tweets from David Simpson
Reading tweets from Emma Little-Pengelly
Reading tweets from Gavin Robinson
Reading tweets from Jeffrey M. Donaldson
Reading tweets from Jim Shann

Reading tweets from Ruth George
Reading tweets from Ruth Jones
Reading tweets from Ruth Smeeth
Reading tweets from Sandy Martin
Reading tweets from Sarah Champion
Reading tweets from Sarah Jones
Reading tweets from Seema Malhotra
Reading tweets from Shabana Mahmood
Reading tweets from Sharon Hodgson
Reading tweets from Siobhain McDonagh
Reading tweets from Stella Creasy
Reading tweets from Stephanie Peacock
Reading tweets from Stephen Doughty
Reading tweets from Stephen Hepburn
Reading tweets from Stephen Kinnock
Reading tweets from Stephen Morgan
Reading tweets from Stephen Timms
Reading tweets from Stephen Twigg
Reading tweets from Steve McCabe
Reading tweets from Steve Reed
Reading tweets from Sue Hayman
Reading tweets from Susan Elan Jones
Reading tweets from Tanmanjit Singh Dhesi
Reading tweets from Teresa Pearce
Reading tweets from Thangam Debbonaire
Reading tweets from Thelma Walker
Reading tweets from Toby Perkins
Reading tweets from Tom Watson
Reading tweets from Tonia Antonia