In [1]:
import datetime
import json
import os
import time

from tqdm import tqdm
from tweepy import TweepyException

from data_utils import DATA_PATH
from twitter_utils import get_client

In [2]:
ids = set()
year = 2016
with open(os.path.join('data', f'follower_lists_{year}.jsonl')) as f:
    for l in f:
        for _, ids_ in json.loads(l).items():
            ids.update(ids_)

accounts = list() #id, username, tweet_count
with open('data/followers_rehydrated.jsonl') as f:
    for l in f:
        account = json.loads(l)
        if int(account['id']) in ids:
            accounts.append((account['id'],account['username'],account["public_metrics"]["tweet_count"]))

In [3]:
accounts_100plus = set(filter(lambda x:x[2]>100,accounts))

In [4]:
queries = list()

while(len(accounts_100plus)):
    query = '-is:retweet '
    while len(query)<=1024:
        if not len(accounts_100plus):
            break
        next_user= accounts_100plus.pop()
        next_username=f"from:{next_user[1]} OR "
        if (len(query)+len(next_username)>1024):
            query = query[:-len(" OR ")]
            accounts_100plus.add(next_user)
            break
        else:
            query+=next_username
    queries.append(query)

In [5]:
window = 5
start_date = datetime.datetime(2016, 11, 8-window, 0, 0, 0)
end_date =   datetime.datetime(2016, 11, 8+window, 0, 0, 0)


In [6]:
client = get_client()


In [7]:
out_path = os.path.join(DATA_PATH, f'election_tweets_{year}.jsonl')
errors = list()

In [8]:
with open(out_path, 'a+') as f:
    for query in tqdm(queries, desc='processing queries'):
        done = False
        next_token = None
        while not done:
            try:
                res = client.search_all_tweets(query=query, end_time=end_date, next_token=next_token, start_time=start_date, max_results=500, \
                                               place_fields = ['id','full_name', 'country', 'geo', 'name', 'place_type'],
                                             tweet_fields = ['id', 'created_at', 'geo', 'public_metrics', 'text', 'edit_history_tweet_ids', 'attachments', 'author_id',  'conversation_id', 'entities', 'in_reply_to_user_id', 'lang', 'possibly_sensitive', 'referenced_tweets', 'reply_settings', 'source', 'withheld'], # missing edit controls and  other fields that require user auth, 'context_annotations' requires <100 results
                            media_fields=None,  \
                            poll_fields=None, since_id=None, sort_order=None, \
                            until_id=None, \
                            user_fields=None)
                for tweet in res.data:
                    f.write(json.dumps(tweet.data, sort_keys=True)+'\n')
                next_token = res.meta.get('next_token', None)
                if not next_token:
                    done=True

            except TweepyException as e:
                print(e)
                errors.extend(query)
                time.sleep(60)
                done=True

processing queries:   0%|          | 14/47119 [01:48<119:18:16,  9.12s/it]Rate limit exceeded. Sleeping for 488 seconds.
processing queries:   0%|          | 20/47119 [10:38<396:00:00, 30.27s/it]  Rate limit exceeded. Sleeping for 859 seconds.
processing queries:   0%|          | 33/47119 [27:55<253:58:15, 19.42s/it]  Rate limit exceeded. Sleeping for 724 seconds.
processing queries:   0%|          | 45/47119 [42:06<146:39:44, 11.22s/it]  Rate limit exceeded. Sleeping for 774 seconds.
processing queries:   0%|          | 54/47119 [56:06<266:15:55, 20.37s/it]  Rate limit exceeded. Sleeping for 835 seconds.
processing queries:   0%|          | 99/47119 [1:16:05<104:57:58,  8.04s/it]  Rate limit exceeded. Sleeping for 538 seconds.
processing queries:   0%|          | 180/47119 [1:35:07<80:30:44,  6.17s/it]   Rate limit exceeded. Sleeping for 297 seconds.
processing queries:   0%|          | 192/47119 [1:41:41<138:24:27, 10.62s/it] 

503 Service Unavailable
Service Unavailable


processing queries:   0%|          | 200/47119 [1:43:46<110:47:15,  8.50s/it]Rate limit exceeded. Sleeping for 680 seconds.
processing queries:   0%|          | 214/47119 [1:56:39<101:08:30,  7.76s/it]  Rate limit exceeded. Sleeping for 810 seconds.
processing queries:   0%|          | 224/47119 [2:11:29<253:49:21, 19.49s/it]  Rate limit exceeded. Sleeping for 822 seconds.
processing queries:   1%|          | 237/47119 [2:26:34<167:31:24, 12.86s/it]  Rate limit exceeded. Sleeping for 819 seconds.
processing queries:   1%|          | 241/47119 [2:40:32<1188:58:46, 91.31s/it] Rate limit exceeded. Sleeping for 882 seconds.
processing queries:   1%|          | 244/47119 [2:55:37<2152:53:33, 165.34s/it]Rate limit exceeded. Sleeping for 880 seconds.
processing queries:   1%|          | 271/47119 [3:13:18<114:13:25,  8.78s/it]  Rate limit exceeded. Sleeping for 720 seconds.
processing queries:   1%|          | 279/47119 [3:25:49<284:35:24, 21.87s/it]  Rate limit exceeded. Sleeping for 870 sec

503 Service Unavailable
Service Unavailable


processing queries:   1%|          | 327/47119 [4:14:12<157:13:49, 12.10s/it]  

503 Service Unavailable
Service Unavailable


processing queries:   1%|          | 335/47119 [4:16:40<107:29:48,  8.27s/it]Rate limit exceeded. Sleeping for 524 seconds.
processing queries:   1%|          | 344/47119 [4:26:26<214:01:10, 16.47s/it]  Rate limit exceeded. Sleeping for 840 seconds.
processing queries:   1%|          | 365/47119 [4:42:53<86:20:14,  6.65s/it]   Rate limit exceeded. Sleeping for 754 seconds.
processing queries:   1%|          | 377/47119 [4:56:45<131:13:04, 10.11s/it]  Rate limit exceeded. Sleeping for 824 seconds.
processing queries:   1%|          | 384/47119 [5:11:16<450:10:33, 34.68s/it]  

503 Service Unavailable
Service Unavailable


processing queries:   1%|          | 398/47119 [5:13:45<71:15:55,  5.49s/it] Rate limit exceeded. Sleeping for 706 seconds.
processing queries:   1%|          | 399/47119 [5:25:38<2828:50:01, 217.98s/it]

503 Service Unavailable
Service Unavailable


processing queries:   1%|          | 402/47119 [5:26:48<1112:50:06, 85.75s/it] Rate limit exceeded. Sleeping for 824 seconds.
processing queries:   1%|          | 406/47119 [5:41:04<1461:49:03, 112.66s/it]Rate limit exceeded. Sleeping for 870 seconds.
processing queries:   1%|          | 432/47119 [5:58:19<70:53:26,  5.47s/it]   Rate limit exceeded. Sleeping for 737 seconds.
processing queries:   1%|          | 444/47119 [6:12:20<158:18:07, 12.21s/it]  Rate limit exceeded. Sleeping for 798 seconds.
processing queries:   1%|          | 459/47119 [6:28:01<176:15:50, 13.60s/it]  

503 Service Unavailable
Service Unavailable


processing queries:   1%|          | 481/47119 [6:32:08<50:01:47,  3.86s/it] Rate limit exceeded. Sleeping for 511 seconds.
processing queries:   1%|          | 486/47119 [6:41:28<578:26:15, 44.65s/it]  Rate limit exceeded. Sleeping for 853 seconds.
processing queries:   1%|          | 502/47119 [6:58:52<224:01:37, 17.30s/it]  Rate limit exceeded. Sleeping for 712 seconds.
processing queries:   1%|          | 511/47119 [7:11:47<276:08:53, 21.33s/it]  Rate limit exceeded. Sleeping for 838 seconds.
processing queries:   1%|          | 544/47119 [7:29:50<153:56:38, 11.90s/it]  Rate limit exceeded. Sleeping for 656 seconds.
processing queries:   1%|          | 586/47119 [7:45:20<103:35:38,  8.01s/it]  Rate limit exceeded. Sleeping for 627 seconds.
processing queries:   1%|▏         | 597/47119 [7:57:01<147:00:26, 11.38s/it]  Rate limit exceeded. Sleeping for 827 seconds.
processing queries:   1%|▏         | 610/47119 [8:12:28<164:00:12, 12.69s/it]  Rate limit exceeded. Sleeping for 802 sec

KeyboardInterrupt: 