In [1]:
import json

from tqdm import tqdm
import tweepy
from dotenv import load_dotenv
import os

from tweepy import TweepyException

load_dotenv()  # take environment variables from .env.

True

In [2]:
DATA_PATH = 'data'
YEARS = [2016, 2020]

In [3]:
for year in YEARS:
    with open(os.path.join(DATA_PATH, f'polls_{year}_labeled.json')) as f:
        polls = json.load(f)
    author_ids = list(map(lambda x: x['meta']['author_id'], polls))
    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json'), 'w+') as f:
        json.dump(author_ids, f)

In [4]:
auth = tweepy.OAuth2BearerHandler(
    bearer_token= os.environ['BEARER'])
api = tweepy.API(auth, wait_on_rate_limit=True)

In [7]:
exception_users = dict()

In [8]:

for year in YEARS:
    out_path = os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl')
    existing_users = set()
    if os.path.exists(out_path):
        with open(out_path) as f:
            existing_users.update(k for l in f for k in json.loads(l))
            print(f'found {len(existing_users)} scraped users')

    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json')) as f:
        author_ids = [id for id in json.load(f) if id not in existing_users]

    with open(os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl'), 'a+', encoding='utf8') as f:
        for author_id in tqdm(author_ids, desc='checking followers of poll authors'):
            follower_ids = []
            try:
                for page in tweepy.Cursor(api.get_follower_ids, user_id = author_id, count = 5000).pages():
                    follower_ids.extend(page)
                f.write(json.dumps({author_id:follower_ids}) + '\n')
            except TweepyException as e:
                print(e)
                print(author_id)
                exception_users[id]=e


found 59 scraped users


checking followers of poll authors:   0%|          | 0/50 [00:00<?, ?it/s]Rate limit reached. Sleeping for: 702
checking followers of poll authors:   2%|▏         | 1/50 [11:43<9:34:29, 703.46s/it]

401 Unauthorized
Not authorized.
349240397


checking followers of poll authors:   4%|▍         | 2/50 [11:43<3:51:51, 289.81s/it]

401 Unauthorized
Not authorized.
155939890


Rate limit reached. Sleeping for: 894
checking followers of poll authors:  24%|██▍       | 12/50 [26:48<09:56, 15.68s/it]  Rate limit reached. Sleeping for: 895
checking followers of poll authors:  36%|███▌      | 18/50 [41:47<25:15, 47.36s/it]   Rate limit reached. Sleeping for: 894
checking followers of poll authors:  40%|████      | 20/50 [56:50<1:46:29, 212.98s/it]Rate limit reached. Sleeping for: 895
checking followers of poll authors:  46%|████▌     | 23/50 [1:11:48<1:32:19, 205.15s/it]Rate limit reached. Sleeping for: 895
checking followers of poll authors:  54%|█████▍    | 27/50 [1:26:52<54:39, 142.59s/it]  Rate limit reached. Sleeping for: 895
checking followers of poll authors:  72%|███████▏  | 36/50 [1:41:55<05:04, 21.73s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  84%|████████▍ | 42/50 [1:56:55<06:26, 48.27s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  90%|█████████ | 45/50 [2:11:59<12:29, 149.83s/it]R

404 Not Found
34 - Sorry, that page does not exist.
1273811445269106688


checking followers of poll authors:  27%|██▋       | 159/580 [2:30:15<14:25,  2.06s/it]  Rate limit reached. Sleeping for: 897
checking followers of poll authors:  30%|███       | 174/580 [2:45:16<14:04,  2.08s/it]    Rate limit reached. Sleeping for: 897
checking followers of poll authors:  33%|███▎      | 189/580 [3:00:18<13:29,  2.07s/it]    Rate limit reached. Sleeping for: 896
checking followers of poll authors:  35%|███▌      | 204/580 [3:15:19<12:59,  2.07s/it]    Rate limit reached. Sleeping for: 896
checking followers of poll authors:  36%|███▌      | 209/580 [3:30:17<6:42:58, 65.17s/it]  Rate limit reached. Sleeping for: 895
checking followers of poll authors:  37%|███▋      | 217/580 [3:45:22<2:38:58, 26.28s/it]  Rate limit reached. Sleeping for: 895
checking followers of poll authors:  40%|████      | 232/580 [4:00:21<12:35,  2.17s/it]    Rate limit reached. Sleeping for: 897
checking followers of poll authors:  43%|████▎     | 247/580 [4:15:23<11:24,  2.06s/it]    Rate lim

404 Not Found
34 - Sorry, that page does not exist.
1273811445269106688


checking followers of poll authors:  73%|███████▎  | 424/580 [21:16:41<05:29,  2.11s/it]Rate limit reached. Sleeping for: 896
checking followers of poll authors:  76%|███████▌  | 439/580 [21:31:42<04:53,  2.08s/it]    Rate limit reached. Sleeping for: 896
checking followers of poll authors:  78%|███████▊  | 451/580 [21:46:43<12:12,  5.68s/it]    Rate limit reached. Sleeping for: 896
checking followers of poll authors:  80%|███████▉  | 462/580 [22:01:45<16:07,  8.20s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  82%|████████▏ | 475/580 [22:16:45<07:04,  4.04s/it]   Rate limit reached. Sleeping for: 896
checking followers of poll authors:  83%|████████▎ | 484/580 [22:31:45<25:30, 15.94s/it]   

404 Not Found
34 - Sorry, that page does not exist.
1216703337317457921


checking followers of poll authors:  84%|████████▍ | 489/580 [22:31:46<04:20,  2.86s/it]Rate limit reached. Sleeping for: 897
checking followers of poll authors:  87%|████████▋ | 502/580 [22:46:48<05:23,  4.15s/it]   Rate limit reached. Sleeping for: 896
checking followers of poll authors:  89%|████████▉ | 517/580 [23:01:50<02:12,  2.10s/it]   Rate limit reached. Sleeping for: 896
checking followers of poll authors:  91%|█████████ | 529/580 [23:16:52<05:00,  5.90s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  93%|█████████▎| 539/580 [23:31:53<07:44, 11.34s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  95%|█████████▌| 553/580 [23:46:52<01:19,  2.94s/it]   Rate limit reached. Sleeping for: 897
checking followers of poll authors:  98%|█████████▊| 566/580 [24:01:55<00:55,  4.00s/it]   Rate limit reached. Sleeping for: 896
checking followers of poll authors: 100%|██████████| 580/580 [24:16:55<00:00, 150.72s/it]


In [None]:
# client = tweepy.Client(bearer_token=os.environ['BEARER'],
#                        wait_on_rate_limit=True)
# author_id = author_ids[0]
# user_fields = ['id']
# response = client.get_users_followers(id=author_id, user_fields=user_fields, max_results=1000)
# one_user = response.data[0]
# one_user.data
# next_token = response.meta['next_token']