In [1]:
import json

import pandas as pd
from tqdm import tqdm
import tweepy
from dotenv import load_dotenv
import os

from tweepy import TweepyException

load_dotenv()  # load environment variables from .env

True

In [2]:
DATA_PATH = 'data'
YEARS = [2016, 2020]

In [16]:
for year in YEARS:
    with open(os.path.join(DATA_PATH, f'polls-{year}.json')) as f:
        polls = json.load(f)
    author_ids = [poll['meta']['author_id'] for poll in filter(lambda x: x['type']=='twitter', polls)]
    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json'), 'w+') as f:
        json.dump(author_ids, f)

In [17]:
auth = tweepy.OAuth2BearerHandler(
    bearer_token= os.environ['BEARER'])
api = tweepy.API(auth, wait_on_rate_limit=True)

In [29]:
exception_users = dict()

In [30]:

for year in YEARS:
    out_path = os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl')
    existing_users = set()
    if os.path.exists(out_path):
        with open(out_path) as f:
            existing_users.update(k for l in f for k in json.loads(l))
            print(f'found {len(existing_users)} scraped users')

    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json')) as f:
        author_ids = [id for id in json.load(f) if id not in existing_users]

    with open(os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl'), 'a+', encoding='utf8') as f:
        for author_id in tqdm(author_ids, desc='checking followers of poll authors'):
            follower_ids = []
            try:
                for page in tweepy.Cursor(api.get_follower_ids, user_id = author_id, count = 5000).pages():
                    follower_ids.extend(page)
                f.write(json.dumps({author_id:follower_ids}) + '\n')
            except TweepyException as e:
                print(e)
                print(author_id)
                exception_users[author_id]=e


found 333 scraped users


checking followers of poll authors:   6%|▌         | 1/17 [00:00<00:03,  4.91it/s]

404 Not Found
34 - Sorry, that page does not exist.
2515437133


checking followers of poll authors:  18%|█▊        | 3/17 [00:00<00:02,  4.86it/s]

401 Unauthorized
Not authorized.
155939890


checking followers of poll authors:  24%|██▎       | 4/17 [00:00<00:02,  4.61it/s]

401 Unauthorized
Not authorized.
871921008


checking followers of poll authors:  29%|██▉       | 5/17 [00:01<00:02,  4.48it/s]

401 Unauthorized
Not authorized.
215886325


checking followers of poll authors:  41%|████      | 7/17 [00:01<00:02,  4.65it/s]

404 Not Found
34 - Sorry, that page does not exist.
742938016495378433
404 Not Found
34 - Sorry, that page does not exist.
3224102715


checking followers of poll authors:  47%|████▋     | 8/17 [00:01<00:01,  4.74it/s]

401 Unauthorized
Not authorized.
206061601


checking followers of poll authors:  53%|█████▎    | 9/17 [00:01<00:01,  4.62it/s]

401 Unauthorized
Not authorized.
390502765


checking followers of poll authors:  59%|█████▉    | 10/17 [00:02<00:01,  4.70it/s]

401 Unauthorized
Not authorized.
414845225


checking followers of poll authors:  71%|███████   | 12/17 [00:02<00:01,  4.68it/s]

404 Not Found
34 - Sorry, that page does not exist.
261306457
404 Not Found
34 - Sorry, that page does not exist.
2669804190


checking followers of poll authors:  76%|███████▋  | 13/17 [00:02<00:00,  4.67it/s]

401 Unauthorized
Not authorized.
728059399


checking followers of poll authors:  82%|████████▏ | 14/17 [00:03<00:00,  4.63it/s]

401 Unauthorized
Not authorized.
206061601


checking followers of poll authors:  88%|████████▊ | 15/17 [00:03<00:00,  4.64it/s]

401 Unauthorized
Not authorized.
728059399


Rate limit reached. Sleeping for: 896
checking followers of poll authors:  94%|█████████▍| 16/17 [15:00<04:30, 270.29s/it]

404 Not Found
34 - Sorry, that page does not exist.
752989213851877377


checking followers of poll authors: 100%|██████████| 17/17 [15:00<00:00, 52.99s/it] 

401 Unauthorized
Not authorized.
2381862440





found 636 scraped users


checking followers of poll authors:   2%|▏         | 1/55 [00:00<00:12,  4.21it/s]

401 Unauthorized
Not authorized.
86815264


checking followers of poll authors:   4%|▎         | 2/55 [00:00<00:12,  4.35it/s]

401 Unauthorized
Not authorized.
1594409988


checking followers of poll authors:   5%|▌         | 3/55 [00:00<00:11,  4.52it/s]

401 Unauthorized
Not authorized.
1594409988


checking followers of poll authors:   7%|▋         | 4/55 [00:00<00:11,  4.37it/s]

401 Unauthorized
Not authorized.
1594409988


checking followers of poll authors:   9%|▉         | 5/55 [00:01<00:11,  4.44it/s]

401 Unauthorized
Not authorized.
1194495690191360004


checking followers of poll authors:  11%|█         | 6/55 [00:01<00:10,  4.53it/s]

401 Unauthorized
Not authorized.
1594409988


checking followers of poll authors:  13%|█▎        | 7/55 [00:01<00:10,  4.58it/s]

404 Not Found
34 - Sorry, that page does not exist.
42841187
401 Unauthorized
Not authorized.
86815264


checking followers of poll authors:  16%|█▋        | 9/55 [00:01<00:10,  4.59it/s]

404 Not Found
34 - Sorry, that page does not exist.
209684798


checking followers of poll authors:  18%|█▊        | 10/55 [00:02<00:09,  4.53it/s]

401 Unauthorized
Not authorized.
4875048329


checking followers of poll authors:  20%|██        | 11/55 [00:02<00:09,  4.55it/s]

404 Not Found
34 - Sorry, that page does not exist.
553581707
401 Unauthorized
Not authorized.
86815264


checking followers of poll authors:  24%|██▎       | 13/55 [00:02<00:09,  4.57it/s]

404 Not Found
34 - Sorry, that page does not exist.
3140006645


Rate limit reached. Sleeping for: 895
checking followers of poll authors:  25%|██▌       | 14/55 [14:59<3:05:07, 270.92s/it]

401 Unauthorized
Not authorized.
86815264


checking followers of poll authors:  27%|██▋       | 15/55 [14:59<2:06:12, 189.32s/it]

404 Not Found
34 - Sorry, that page does not exist.
401757176


checking followers of poll authors:  31%|███       | 17/55 [14:59<58:40, 92.65s/it]   

401 Unauthorized
Not authorized.
24096463
401 Unauthorized
Not authorized.
3389073989


checking followers of poll authors:  33%|███▎      | 18/55 [15:00<40:00, 64.88s/it]

401 Unauthorized
Not authorized.
786209354


checking followers of poll authors:  35%|███▍      | 19/55 [15:00<27:16, 45.46s/it]

401 Unauthorized
Not authorized.
1089300681872175104
401 Unauthorized
Not authorized.
1037663718686031879


checking followers of poll authors:  38%|███▊      | 21/55 [15:00<12:40, 22.37s/it]

404 Not Found
34 - Sorry, that page does not exist.
4030465994


checking followers of poll authors:  40%|████      | 22/55 [15:01<08:38, 15.72s/it]

404 Not Found
34 - Sorry, that page does not exist.
1257444286934781952


checking followers of poll authors:  42%|████▏     | 23/55 [15:01<05:54, 11.08s/it]

404 Not Found
34 - Sorry, that page does not exist.
1201768582402711552


checking followers of poll authors:  44%|████▎     | 24/55 [15:01<04:02,  7.82s/it]

401 Unauthorized
Not authorized.
1077552804930224128


checking followers of poll authors:  45%|████▌     | 25/55 [15:01<02:46,  5.54s/it]

404 Not Found
34 - Sorry, that page does not exist.
1273811445269106688
404 Not Found
34 - Sorry, that page does not exist.
1109108413966049280


checking followers of poll authors:  51%|█████     | 28/55 [15:02<00:55,  2.04s/it]

401 Unauthorized
Not authorized.
2930966494
401 Unauthorized
Not authorized.
1277427101633613824


Rate limit reached. Sleeping for: 897
checking followers of poll authors:  53%|█████▎    | 29/55 [30:00<1:57:25, 270.97s/it]

401 Unauthorized
Not authorized.
30109729
404 Not Found
34 - Sorry, that page does not exist.
1302297223632482305


checking followers of poll authors:  56%|█████▋    | 31/55 [30:01<53:09, 132.88s/it]  

404 Not Found
34 - Sorry, that page does not exist.
2275031188


checking followers of poll authors:  58%|█████▊    | 32/55 [30:01<35:40, 93.09s/it] 

404 Not Found
34 - Sorry, that page does not exist.
946409530853855232


checking followers of poll authors:  60%|██████    | 33/55 [30:01<23:54, 65.23s/it]

401 Unauthorized
Not authorized.
1077552804930224128


checking followers of poll authors:  62%|██████▏   | 34/55 [30:01<16:00, 45.72s/it]

404 Not Found
34 - Sorry, that page does not exist.
1066864102406946816


checking followers of poll authors:  64%|██████▎   | 35/55 [30:02<10:41, 32.08s/it]

404 Not Found
34 - Sorry, that page does not exist.
743520822610911233


checking followers of poll authors:  65%|██████▌   | 36/55 [30:02<07:07, 22.52s/it]

401 Unauthorized
Not authorized.
32372752
404 Not Found
34 - Sorry, that page does not exist.
2268746017


checking followers of poll authors:  69%|██████▉   | 38/55 [30:02<03:09, 11.17s/it]

404 Not Found
34 - Sorry, that page does not exist.
1187154684408188928


checking followers of poll authors:  71%|███████   | 39/55 [30:03<02:06,  7.89s/it]

401 Unauthorized
Not authorized.
147733355


checking followers of poll authors:  73%|███████▎  | 40/55 [30:03<01:23,  5.59s/it]

401 Unauthorized
Not authorized.
2271320364


checking followers of poll authors:  76%|███████▋  | 42/55 [30:03<00:37,  2.87s/it]

404 Not Found
34 - Sorry, that page does not exist.
1135633178440425472
401 Unauthorized
Not authorized.
714976928323805185


checking followers of poll authors:  78%|███████▊  | 43/55 [30:04<00:24,  2.07s/it]Rate limit reached. Sleeping for: 896


401 Unauthorized
Not authorized.
1258037784474013696


checking followers of poll authors:  80%|████████  | 44/55 [45:01<49:37, 270.69s/it]

401 Unauthorized
Not authorized.
101044787
404 Not Found
34 - Sorry, that page does not exist.
4591703001


checking followers of poll authors:  84%|████████▎ | 46/55 [45:01<19:54, 132.75s/it]

401 Unauthorized
Not authorized.
1281175699580055552


checking followers of poll authors:  85%|████████▌ | 47/55 [45:02<12:23, 92.99s/it] 

404 Not Found
34 - Sorry, that page does not exist.
1273811445269106688


checking followers of poll authors:  87%|████████▋ | 48/55 [45:02<07:36, 65.15s/it]

401 Unauthorized
Not authorized.
1258037784474013696


checking followers of poll authors:  89%|████████▉ | 49/55 [45:02<04:34, 45.67s/it]

401 Unauthorized
Not authorized.
970653049596203008
404 Not Found
34 - Sorry, that page does not exist.
1005708111741628419


checking followers of poll authors:  93%|█████████▎| 51/55 [45:03<01:29, 22.49s/it]

404 Not Found
34 - Sorry, that page does not exist.
1268830393597407238


checking followers of poll authors:  95%|█████████▍| 52/55 [45:03<00:47, 15.81s/it]

404 Not Found
34 - Sorry, that page does not exist.
1216703337317457921
401 Unauthorized
Not authorized.
63998604


checking followers of poll authors:  98%|█████████▊| 54/55 [45:03<00:07,  7.85s/it]

401 Unauthorized
Not authorized.
1110075862328717312


checking followers of poll authors: 100%|██████████| 55/55 [45:03<00:00, 49.16s/it]

404 Not Found
34 - Sorry, that page does not exist.
1292939157375987712





In [31]:
len(exception_users)

60

In [41]:
with open(os.path.join(DATA_PATH, 'authors_missing.jsonl'), 'w+') as f:
    for k, v in exception_users.items():
        f.write(json.dumps({k:v.api_errors[0]})+'\n')

In [42]:
for year in YEARS:
    out_path = os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl')
    existing_users = set()
    if os.path.exists(out_path):
        with open(out_path) as f:
            existing_users.update(k for l in f for k in json.loads(l))
            print(f'found {len(existing_users)} scraped users')

    with open(os.path.join(DATA_PATH, f'polls_{year}_author_ids.json')) as f:
        author_ids = [id for id in json.load(f) if id not in existing_users]
        print(len(author_ids))
        print(len([i for i in author_ids if i not in exception_users]))

found 334 scraped users
16
0
found 636 scraped users
55
0


In [54]:
import pandas as pd

fnames = ['senators-accounts-1.csv',
'representatives-accounts-1.csv',
'congress116-house-accounts.csv',
'congress116-senate-accounts.csv',]
congress_ids = set(pd.concat([pd.read_csv(os.path.join(DATA_PATH, fname)) for fname in fnames]).Uid.unique())
len(congress_ids)

1050

In [55]:
congress_ids

{803694179079458816,
 816683274076614656,
 818554054309715969,
 817076257770835968,
 811313565760163844,
 811986281177772032,
 816157667882373120,
 2974648323,
 806583915012046854,
 818472418620608512,
 816298918468259841,
 815966620300480514,
 816652616625168388,
 798973032362606600,
 161411080,
 442824717,
 21157904,
 855240223432769538,
 829061809135030272,
 304138251,
 193441812,
 4228409359,
 1080574793630527505,
 158890005,
 1058717720,
 1129029661,
 242376736,
 42481696,
 550131748,
 385429543,
 5558312,
 2976606250,
 117501995,
 43986986,
 2933760046,
 303861808,
 404355121,
 4827594804,
 188403766,
 22044727,
 237862972,
 231510077,
 1058807868,
 218292287,
 153507902,
 158470209,
 970207298,
 3018670151,
 816303263586914304,
 721127537674489856,
 233949261,
 241207373,
 3026622545,
 1058345042,
 22812754,
 27775884,
 239548513,
 1060487274,
 233693291,
 377534571,
 435331179,
 291756142,
 3170031728,
 2973870195,
 223166587,
 231108733,
 1080307235350241280,
 8160304247785431