In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
# Build an authenticated tweepy client that waits (and reports it) when the
# rate limit is reached.
auth = get_tweet_auth()
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

# Screen name of the account used for the sample download.
user = 'AdamAfriyie'

In [19]:
# Iterate over at most five statuses from ``user``'s timeline, requested in
# extended tweet mode.
curs = tweepy.Cursor(
    api.user_timeline,
    screen_name=user,
    count=200,
    tweet_mode='extended',
).items(5)

# Dump the raw JSON payload of each status to ``test.jsonl``
# (presumably one JSON object per line -- see ``json_dump_line``).
with open('test.jsonl', 'w') as f:
    for status in curs:
        tweet = status._json
        json_dump_line(tweet, f)

In [29]:
# Natural Language Toolkit: Twitter Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
"""

import json
import os

from six import string_types

from nltk.tokenize import TweetTokenizer

from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer
from nltk.corpus.reader.api import CorpusReader


class TwitterCorpusReader(CorpusReader):
    r"""
    Reader for corpora that consist of Tweets represented as a list of
    line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader('/path/to/twitter-files', r'.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, r'.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))
    """

    # The corpus view class used by this reader.
    CorpusView = StreamBackedCorpusView

    def __init__(self, root, fileids=None, word_tokenizer=None, encoding="utf8"):
        """
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words. When omitted
            (or ``None``) a new ``TweetTokenizer`` is created for this reader.
        :param encoding: The encoding of the corpus files, forwarded to
            ``CorpusReader``.
        :raises ValueError: if a corpus file that is not inside a zip archive
            is empty.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty. Files that
        # live inside a zip archive are not checked.
        for path in self.abspaths(self._fileids):
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        # Build the default tokenizer per instance instead of sharing one
        # object created at class-definition time.
        if word_tokenizer is None:
            word_tokenizer = TweetTokenizer()
        self._word_tokenizer = word_tokenizer

    def docs(self, fileids=None):
        """
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        """
        return concat(
            [
                self.CorpusView(path, self._read_tweets, encoding=enc)
                for (path, enc, _fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def strings(self, fileids=None):
        """
        Returns only the text content of Tweets in the file(s).

        The ``full_text`` field (present when Tweets were fetched with
        ``tweet_mode='extended'``) is preferred; Tweets that only carry the
        ``text`` field are returned using that field. Objects with neither
        field are skipped.

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        """
        tweets = []
        for jsono in self.docs(fileids):
            text = jsono.get("full_text")
            if text is None:
                text = jsono.get("text")
            if text is None:
                # Not a Tweet with textual content; nothing to return for it.
                continue
            if isinstance(text, bytes):
                # ``json.loads`` produces ``str`` on Python 3, so this is only
                # a guard. ``CorpusReader.encoding`` is a method and cannot be
                # handed to ``bytes.decode``, hence the explicit codec.
                text = text.decode("utf8")
            tweets.append(text)
        return tweets

    def tokenized(self, fileids=None):
        """
        :return: the given file(s) as a list of the text content of Tweets,
            each as a list of words, screen names, hashtags, URLs and
            punctuation symbols.
        :rtype: list(list(str))
        """
        tokenizer = self._word_tokenizer
        return [tokenizer.tokenize(t) for t in self.strings(fileids)]

    def raw(self, fileids=None):
        """
        Return the corpora in their raw form.

        :param fileids: a single fileid, a list of fileids, or ``None`` for
            every file in the corpus.
        :return: the concatenated contents of the given file(s).
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]

        contents = []
        for f in fileids:
            stream = self.open(f)
            # Close every stream deterministically instead of leaking it
            # until garbage collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def _read_tweets(self, stream):
        """
        Block reader: read up to 10 lines from ``stream`` and deserialise them.

        Assumes that each non-blank line in ``stream`` is a JSON-serialised
        object. Blank lines are consumed and ignored.

        :return: the Tweets parsed from this block (possibly empty).
        :rtype: list(dict)
        """
        tweets = []
        for _ in range(10):
            line = stream.readline()
            if not line:
                # End of file.
                break
            if not line.strip():
                # e.g. a trailing newline at the end of the file.
                continue
            tweets.append(json.loads(line))
        return tweets

In [30]:
# from nltk.corpus.reader import TwitterCorpusReader
# The fileids argument is a regexp: escape the dot (raw string) so that only
# names ending in ".jsonl" are selected.
corpus = TwitterCorpusReader(root='./testcorpus/', fileids=r'.*\.jsonl')

In [31]:
# Show which files under the corpus root were picked up by the reader.
corpus.fileids()

['test.jsonl']

In [33]:
# Tokenize the text of every tweet in the corpus (one token list per tweet).
corpus.tokenized()

[['After',
  'a',
  'quiet',
  'evening',
  'at',
  'The',
  'Barley',
  'Mow',
  'at',
  'Englefield',
  'Green',
  ',',
  'the',
  'weather',
  '’',
  's',
  'looking',
  'good',
  'today',
  'and',
  'I',
  '’',
  'm',
  'doing',
  'some',
  'casework',
  'with',
  'an',
  'active',
  'day',
  'across',
  'the',
  'constituency',
  '.',
  '@bmenglefield',
  '@WindsorTories'],
 ['Good',
  'news',
  'for',
  'the',
  'UK',
  '.',
  'As',
  'the',
  'PM',
  '’',
  's',
  'Trade',
  'Envoy',
  'to',
  'Ghana',
  '&',
  'Guinea',
  ',',
  'I',
  '’',
  'm',
  'very',
  'much',
  'aware',
  'that',
  'ship',
  'building',
  'can',
  'be',
  'a',
  'great',
  'export',
  'too',
  '!',
  'https://t.co/FJQZ0VYlmC'],
 ['The',
  'green',
  'belt',
  'helps',
  'make',
  'our',
  'constituency',
  'a',
  'beautiful',
  'place',
  'to',
  'live',
  '.',
  'So',
  'I',
  '’',
  'm',
  'delighted',
  'to',
  'say',
  'that',
  'today',
  'we',
  '’',
  've',
  'secured',
  '£',
  '50k',
  'from',


In [2]:
import os
import tweepy
import json

from utils import json_dump_line, get_tweet_auth

class CorpusCreator:
    """
    Class for creating corpus structure and loading tweets
    using tweepy.

    Structure (one line-delimited JSON file per user, grouped by party):
    -> corpus

    ----> party1
    ------> username1.jsonl
    ------> username2.jsonl

    ----> party2
    ------> username3.jsonl
    ------> username4.jsonl
    .
    .

    Directory names are the lower-cased party with spaces replaced by
    underscores; file names are the lower-cased user name with spaces removed.
    """

    def __init__(self, user_dict=None, rel_path='./',
                 rate_limit_wait=True):
        """
        :param user_dict: Mapping of user name to an info dict providing at
            least the keys ``'party'`` and ``'screen_name'``. When ``None``,
            the mapping is loaded from ``mps.json`` in the working directory.
        :param rel_path: Directory in which the ``corpus/`` root is created.
        :param rate_limit_wait: Passed to tweepy as both
            ``wait_on_rate_limit`` and ``wait_on_rate_limit_notify``.
        """
        # Get an API-object authorized from 'credentials.txt'
        auth = get_tweet_auth()
        self.api = tweepy.API(auth,
                              wait_on_rate_limit=rate_limit_wait,
                              wait_on_rate_limit_notify=rate_limit_wait)

        # os.path.join keeps the path valid even when ``rel_path`` has no
        # trailing separator (plain concatenation gave e.g. 'datacorpus/').
        self.root = os.path.join(rel_path, 'corpus/')

        # Load mp-file from directory
        if user_dict is None:
            with open('mps.json', encoding='utf-8') as f:
                self.users = json.load(f)
        else:
            self.users = user_dict

        # Create root filesystem. Only "already exists" is expected; any
        # other OSError (e.g. permissions) is allowed to propagate instead of
        # being reported as an existing directory.
        try:
            os.makedirs(self.root)
        except FileExistsError:
            print('Directory "corpus" already exists.')
        else:
            print('Directory "corpus" created.')
        print()

    def load_tweets(self, max_items=10000, user=None):
        """
        For all users in self.users, get [max_items] tweets and
        save them to one ``.jsonl`` file per user.

        Tweets are requested with ``tweet_mode='extended'`` so that the
        stored JSON carries the ``full_text`` field. If tweepy reports an
        error for a user, the error is printed, that user's (partial) file
        is removed and processing continues with the next user.

        :param max_items: Maximum number of tweets to fetch per user.
        :param user: Currently unused; kept so existing calls keep working.
        """
        for name, info in self.users.items():
            party_dir = os.path.join(
                self.root, info['party'].lower().replace(' ', '_'))
            os.makedirs(party_dir, exist_ok=True)

            filepath = party_dir + '/' + name.lower().replace(' ', '') + '.jsonl'

            print(f'Reading tweets from {name}')
            screen_name = info['screen_name']
            try:
                curs = tweepy.Cursor(self.api.user_timeline,
                                     screen_name=screen_name,
                                     count=200,
                                     tweet_mode='extended',
                                     ).items(max_items)

                with open(filepath, 'w', encoding='utf-8') as f:
                    for status in curs:
                        json_dump_line(status._json, f)

            except tweepy.TweepError as exc:
                print(exc)
                # Drop the incomplete file; tolerate it not existing so the
                # cleanup itself cannot abort the whole run.
                try:
                    os.remove(filepath)
                except FileNotFoundError:
                    pass

In [24]:
# corpus_creator = CorpusCreator()
# corpus_creator.load_tweets(max_items=5)

Directory "corpus" already exists.

Reading tweets from Adam Afriyie
Reading tweets from Alan Duncan
Reading tweets from Alan Mak
Reading tweets from Alberto Costa
Reading tweets from Alec Shelbrooke
Reading tweets from Alex Burghart
Reading tweets from Alex Chalk
Reading tweets from Alistair Burt
Reading tweets from Alok Sharma
Reading tweets from Alun Cairns
Reading tweets from Amanda Milling
Reading tweets from Amber Rudd
Reading tweets from Andrea Jenkyns
Reading tweets from Andrea Leadsom
Reading tweets from Andrew Bowie
Reading tweets from Andrew Bridgen
Reading tweets from Andrew Jones
Reading tweets from Andrew Lewer
Reading tweets from Andrew Rosindell
Reading tweets from Andrew Selous
Reading tweets from Andrew Stephenson
Reading tweets from Anne Marie Morris
Reading tweets from Anne Milton
Reading tweets from Anne-Marie Trevelyan
Reading tweets from Antoinette Sandbach
Reading tweets from Ben Bradley
Reading tweets from Ben Wallace
Reading tweets from Bernard Jenkin
Reading 

Reading tweets from Stephen Barclay
Reading tweets from Stephen Crabb
Reading tweets from Stephen Hammond
Reading tweets from Stephen Kerr
Reading tweets from Stephen McPartland
Reading tweets from Stephen Metcalfe
Reading tweets from Steve Brine
Reading tweets from Steve Double
Reading tweets from Steven Baker
Reading tweets from Stuart Andrew
Reading tweets from Suella Fernandes
Reading tweets from Theresa May
Reading tweets from Tim Loughton
Reading tweets from Tobias Ellwood
Reading tweets from Tom Pursglove
Reading tweets from Tom Tugendhat
Reading tweets from Tracey Crouch
Reading tweets from Vicky Ford
Reading tweets from Victoria Prentis
Reading tweets from Wendy Morton
Reading tweets from Will Quince
Reading tweets from William Cash
Reading tweets from William Wragg
Reading tweets from Zac Goldsmith
Reading tweets from David Simpson
Reading tweets from Emma Little-Pengelly
Reading tweets from Gavin Robinson
Reading tweets from Jeffrey M. Donaldson
Reading tweets from Jim Shann

Reading tweets from Ruth George
Reading tweets from Ruth Jones
Reading tweets from Ruth Smeeth
Reading tweets from Sandy Martin
Reading tweets from Sarah Champion
Reading tweets from Sarah Jones
Reading tweets from Seema Malhotra
Reading tweets from Shabana Mahmood
Reading tweets from Sharon Hodgson
Reading tweets from Siobhain McDonagh
Reading tweets from Stella Creasy
Reading tweets from Stephanie Peacock
Reading tweets from Stephen Doughty
Reading tweets from Stephen Hepburn
Reading tweets from Stephen Kinnock
Reading tweets from Stephen Morgan
Reading tweets from Stephen Timms
Reading tweets from Stephen Twigg
Reading tweets from Steve McCabe
Reading tweets from Steve Reed
Reading tweets from Sue Hayman
Reading tweets from Susan Elan Jones
Reading tweets from Tanmanjit Singh Dhesi
Reading tweets from Teresa Pearce
Reading tweets from Thangam Debbonaire
Reading tweets from Thelma Walker
Reading tweets from Toby Perkins
Reading tweets from Tom Watson
Reading tweets from Tonia Antonia