In [1]:
import datetime
import json
import os
import time

from tqdm import tqdm
from tweepy import TweepyException

from data_utils import DATA_PATH
from twitter_utils import get_client

In [None]:
# Gather every follower id that appears in this year's follower-list dump.
# Each line is parsed as a JSON object; only its values (collections of ids)
# are used, the keys are ignored.
year = 2016
ids = set()
follower_lists_path = os.path.join('data', f'follower_lists_{year}.jsonl')
with open(follower_lists_path) as f:
    for line in f:
        for follower_ids in json.loads(line).values():
            ids.update(follower_ids)

# Keep one (id, username, tweet_count) tuple per rehydrated account whose
# integer id was seen in the follower lists above.
accounts = []
with open('data/followers_rehydrated.jsonl') as f:
    for line in f:
        record = json.loads(line)
        if int(record['id']) not in ids:
            continue
        accounts.append((record['id'], record['username'], record["public_metrics"]["tweet_count"]))

In [None]:
# Restrict to accounts with more than 100 tweets (index 2 of each tuple is
# the tweet count); a set also drops exact duplicate tuples.
accounts_100plus = {entry for entry in accounts if entry[2] > 100}

In [None]:
def _build_queries(usernames, max_len=1024, suffix=' -is:retweet'):
    """Pack `from:<username>` clauses into as few search queries as possible.

    Each query has the form ``(from:a OR from:b OR ...) -is:retweet`` and is at
    most `max_len` characters long. The parentheses are required: in Twitter
    search syntax AND binds tighter than OR, so without them the retweet
    filter would only apply to a single account in the group.

    Args:
        usernames: iterable of account usernames (strings).
        max_len: maximum number of characters allowed in one query.
        suffix: text appended after the parenthesised OR-group.

    Returns:
        A list of query strings; empty when `usernames` is empty.
    """
    joiner = ' OR '
    # Characters every query pays regardless of how many clauses it holds.
    overhead = len('()') + len(suffix)

    built = []
    clauses = []
    body_len = 0  # length of joiner.join(clauses)
    for name in usernames:
        clause = f'from:{name}'
        extra = len(clause) + (len(joiner) if clauses else 0)
        # Flush the current group when this clause would push it over the
        # limit. A single clause that is too long on its own is still emitted
        # alone rather than looping forever.
        if clauses and overhead + body_len + extra > max_len:
            built.append(f'({joiner.join(clauses)}){suffix}')
            clauses = []
            body_len = 0
            extra = len(clause)
        clauses.append(clause)
        body_len += extra
    if clauses:
        # The final, partially filled group never carries a dangling " OR ".
        built.append(f'({joiner.join(clauses)}){suffix}')
    return built


# Build from a sorted copy of the usernames so that `accounts_100plus` is left
# intact (the cell can be re-run) and the batching is deterministic.
queries = _build_queries(sorted({account[1] for account in accounts_100plus}))

In [None]:
# Collection window: `window` days on either side of election day
# (8 November 2016). timedelta arithmetic keeps the dates valid for any
# window size, whereas adding to the day-of-month fails once it leaves 1..30.
# The datetimes are naive (no tzinfo).
# NOTE(review): tweepy appears to serialise naive datetimes as UTC - confirm.
window = 5
election_day = datetime.datetime(2016, 11, 8, 0, 0, 0)
start_date = election_day - datetime.timedelta(days=window)
end_date = election_day + datetime.timedelta(days=window)


In [None]:
# API client used for the full-archive search calls (`search_all_tweets`).
# NOTE(review): presumably a configured tweepy.Client; credentials and rate-limit
# handling live in twitter_utils.get_client - confirm there.
client = get_client()


In [None]:
# `errors` records queries whose request failed; `out_path` is the JSONL file
# (one tweet per line) that the collected tweets are appended to.
errors = []
out_path = os.path.join(DATA_PATH, f'election_tweets_{year}.jsonl')

In [None]:
# Page through the full-archive search for every query and append each tweet
# to `out_path` as one JSON object per line. The file is opened in append mode,
# so re-running the cell adds to (and may duplicate) earlier output.
with open(out_path, 'a+') as f:
    for query in tqdm(queries, desc='processing queries'):
        done = False
        next_token = None  # None requests the first page
        while not done:
            try:
                res = client.search_all_tweets(
                    query=query,
                    start_time=start_date,
                    end_time=end_date,
                    next_token=next_token,
                    max_results=500,
                    place_fields=['id', 'full_name', 'country', 'geo', 'name', 'place_type'],
                    # missing edit controls and other fields that require user auth;
                    # 'context_annotations' requires <100 results
                    tweet_fields=[
                        'id', 'created_at', 'geo', 'public_metrics', 'text',
                        'edit_history_tweet_ids', 'attachments', 'author_id',
                        'conversation_id', 'entities', 'in_reply_to_user_id', 'lang',
                        'possibly_sensitive', 'referenced_tweets', 'reply_settings',
                        'source', 'withheld',
                    ],
                )
                # `res.data` is None (not an empty list) when a page has no
                # tweets; iterating it directly would raise TypeError, which
                # the handler below does not catch.
                for tweet in res.data or []:
                    f.write(json.dumps(tweet.data, sort_keys=True) + '\n')
                next_token = res.meta.get('next_token', None)
                if not next_token:
                    done = True

            except TweepyException as e:
                # Best effort: log the failure, remember the whole query
                # (append, not extend, which would add it character by
                # character), back off for a minute, then move on to the next
                # query. Remaining pages of this query are not fetched.
                print(e)
                errors.append(query)
                time.sleep(60)
                done = True