In [1]:
import json

from tqdm import tqdm
import tweepy
from dotenv import load_dotenv
import os

from tweepy import TweepyException

load_dotenv()  # load environment variables from .env

True

In [2]:
DATA_PATH = 'data'
YEARS = [2016, 2020]

In [16]:
for year in YEARS:
    with open(os.path.join(DATA_PATH, f'polls-{year}.json')) as f:
        polls = json.load(f)
    author_ids = [poll['meta']['author_id'] for poll in filter(lambda x: x['type']=='twitter', polls)]
    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json'), 'w+') as f:
        json.dump(author_ids, f)

In [17]:
auth = tweepy.OAuth2BearerHandler(
    bearer_token= os.environ['BEARER'])
api = tweepy.API(auth, wait_on_rate_limit=True)

In [18]:
exception_users = dict()

In [None]:

for year in YEARS:
    out_path = os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl')
    existing_users = set()
    if os.path.exists(out_path):
        with open(out_path) as f:
            existing_users.update(k for l in f for k in json.loads(l))
            print(f'found {len(existing_users)} scraped users')

    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json')) as f:
        author_ids = [id for id in json.load(f) if id not in existing_users]

    with open(os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl'), 'a+', encoding='utf8') as f:
        for author_id in tqdm(author_ids, desc='checking followers of poll authors'):
            follower_ids = []
            try:
                for page in tweepy.Cursor(api.get_follower_ids, user_id = author_id, count = 5000).pages():
                    follower_ids.extend(page)
                f.write(json.dumps({author_id:follower_ids}) + '\n')
            except TweepyException as e:
                print(e)
                print(author_id)
                exception_users[id]=e


found 95 scraped users


checking followers of poll authors:   0%|          | 1/292 [00:00<01:04,  4.50it/s]

404 Not Found
34 - Sorry, that page does not exist.
2515437133


checking followers of poll authors:   2%|▏         | 5/292 [00:02<02:33,  1.87it/s]Rate limit reached. Sleeping for: 895
checking followers of poll authors:   3%|▎         | 10/292 [15:03<5:15:00, 67.02s/it] 

401 Unauthorized
Not authorized.
349240397


checking followers of poll authors:   4%|▍         | 11/292 [15:03<3:38:07, 46.58s/it]

401 Unauthorized
Not authorized.
155939890


checking followers of poll authors:   5%|▌         | 15/292 [15:04<51:38, 11.19s/it]  

401 Unauthorized
Not authorized.
871921008


checking followers of poll authors:   6%|▌         | 18/292 [15:05<18:09,  3.98s/it]Rate limit reached. Sleeping for: 896
checking followers of poll authors:  10%|█         | 30/292 [30:06<24:41,  5.65s/it]    Rate limit reached. Sleeping for: 895
checking followers of poll authors:  13%|█▎        | 38/292 [45:08<1:37:42, 23.08s/it]  Rate limit reached. Sleeping for: 895
checking followers of poll authors:  14%|█▍        | 42/292 [1:00:05<6:48:02, 97.93s/it]  Rate limit reached. Sleeping for: 894
checking followers of poll authors:  15%|█▌        | 44/292 [1:15:10<16:24:35, 238.21s/it]Rate limit reached. Sleeping for: 894
checking followers of poll authors:  18%|█▊        | 53/292 [1:30:10<1:42:01, 25.61s/it]  Rate limit reached. Sleeping for: 896
checking followers of poll authors:  19%|█▉        | 56/292 [1:45:09<9:14:22, 140.94s/it] 

401 Unauthorized
Not authorized.
215886325


checking followers of poll authors:  20%|█▉        | 58/292 [1:45:09<4:29:48, 69.18s/it] Rate limit reached. Sleeping for: 895
checking followers of poll authors:  22%|██▏       | 64/292 [2:00:12<3:24:35, 53.84s/it]  Rate limit reached. Sleeping for: 894
checking followers of poll authors:  23%|██▎       | 68/292 [2:15:16<6:35:51, 106.03s/it] Rate limit reached. Sleeping for: 894
checking followers of poll authors:  25%|██▌       | 74/292 [2:30:16<3:32:37, 58.52s/it]  Rate limit reached. Sleeping for: 895
checking followers of poll authors:  29%|██▉       | 84/292 [2:45:18<44:20, 12.79s/it]    Rate limit reached. Sleeping for: 895
checking followers of poll authors:  30%|██▉       | 87/292 [3:00:17<7:47:29, 136.83s/it] Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 894
check

404 Not Found
34 - Sorry, that page does not exist.
742938016495378433


Rate limit reached. Sleeping for: 895
checking followers of poll authors:  36%|███▌      | 105/292 [5:45:30<9:58:44, 192.11s/it] Rate limit reached. Sleeping for: 894
checking followers of poll authors:  37%|███▋      | 107/292 [6:00:30<14:33:41, 283.36s/it]Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 894
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 892
Rate limit reached. Sleeping for: 894
checking followers of poll authors:  39%|███▊      | 113/292 [7:30:42<15:15:24, 306.84s/it] Rate limit reached. Sleeping for: 895
checking followers of poll authors:  42%|████▏     | 122/292 [7:45:43<1:21:31, 28.77s/it]  Rate limit reached. Sleeping for: 896
checking followers of poll authors:  44%|████▍     | 129/292 [8:00:42<1:33:09, 34.29s/it]  Rate limit reached. Sleeping for: 895
Rate limit reached. Sleeping for: 894
checking followers of poll authors:  47%|████▋     | 136/292 [8:30:47<2:53:25, 66.70s/it

404 Not Found
34 - Sorry, that page does not exist.
3224102715


checking followers of poll authors:  49%|████▊     | 142/292 [8:45:49<2:13:36, 53.45s/it]Rate limit reached. Sleeping for: 896
checking followers of poll authors:  50%|████▉     | 145/292 [9:00:47<6:08:25, 150.37s/it] 

401 Unauthorized
Not authorized.
206061601


checking followers of poll authors:  51%|█████     | 148/292 [9:00:48<2:04:08, 51.72s/it] 

401 Unauthorized
Not authorized.
390502765


checking followers of poll authors:  52%|█████▏    | 151/292 [9:00:48<42:06, 17.92s/it]  

401 Unauthorized
Not authorized.
414845225


checking followers of poll authors:  54%|█████▍    | 157/292 [9:00:50<05:14,  2.33s/it]Rate limit reached. Sleeping for: 897
Rate limit reached. Sleeping for: 895
checking followers of poll authors:  55%|█████▌    | 161/292 [9:30:55<6:47:04, 186.44s/it] Rate limit reached. Sleeping for: 895
checking followers of poll authors:  58%|█████▊    | 169/292 [9:45:54<1:08:02, 33.19s/it]  Rate limit reached. Sleeping for: 895
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 894
checking followers of poll authors:  62%|██████▏   | 181/292 [11:16:04<1:00:47, 32.86s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  63%|██████▎   | 183/292 [11:31:02<6:12:14, 204.91s/it]Rate limit reached. Sleeping for: 894
checking followers of poll authors:  65%|██████▍   | 189/292 [11:46:07<2:00:00, 69.91s/it]  Rate limit reached. Sleeping for: 894

404 Not Found
34 - Sorry, that page does not exist.
261306457


checking followers of poll authors:  70%|███████   | 205/292 [12:16:07<25:13, 17.39s/it]   Rate limit reached. Sleeping for: 895
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 893
Rate limit reached. Sleeping for: 893
checking followers of poll authors:  71%|███████   | 208/292 [13:16:14<12:30:42, 536.23s/it] 

404 Not Found
34 - Sorry, that page does not exist.
2669804190


Rate limit reached. Sleeping for: 894
checking followers of poll authors:  73%|███████▎  | 214/292 [13:31:14<2:22:17, 109.45s/it] Rate limit reached. Sleeping for: 895
checking followers of poll authors:  75%|███████▌  | 219/292 [13:46:12<1:41:14, 83.21s/it] 

401 Unauthorized
Not authorized.
728059399


checking followers of poll authors:  78%|███████▊  | 227/292 [13:46:14<05:30,  5.09s/it]  Rate limit reached. Sleeping for: 896
checking followers of poll authors:  80%|████████  | 234/292 [14:01:16<31:26, 32.53s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  83%|████████▎ | 243/292 [14:16:15<13:57, 17.10s/it]   

401 Unauthorized
Not authorized.
206061601


checking followers of poll authors:  84%|████████▎ | 244/292 [14:16:16<09:37, 12.04s/it]

401 Unauthorized
Not authorized.
728059399


checking followers of poll authors:  85%|████████▍ | 248/292 [14:16:17<02:16,  3.10s/it]Rate limit reached. Sleeping for: 896
checking followers of poll authors:  86%|████████▌ | 251/292 [14:31:16<1:31:06, 133.33s/it]

404 Not Found
34 - Sorry, that page does not exist.
752989213851877377


checking followers of poll authors:  88%|████████▊ | 256/292 [14:31:18<13:36, 22.67s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  89%|████████▉ | 260/292 [14:46:21<52:38, 98.71s/it]   Rate limit reached. Sleeping for: 895
checking followers of poll authors:  90%|████████▉ | 262/292 [15:01:18<1:58:20, 236.69s/it]Rate limit reached. Sleeping for: 895


In [None]:
# client = tweepy.Client(bearer_token=os.environ['BEARER'],
#                        wait_on_rate_limit=True)
# author_id = author_ids[0]
# user_fields = ['id']
# response = client.get_users_followers(id=author_id, user_fields=user_fields, max_results=1000)
# one_user = response.data[0]
# one_user.data
# next_token = response.meta['next_token']