In [1]:
import json
import time

import pandas as pd
from tqdm import tqdm
import tweepy
from dotenv import load_dotenv
import os

from tweepy import TweepyException

from utils import divide_chunks

load_dotenv()  # take environment variables from .env.

True

In [2]:
# Profile attributes requested for every user looked up through the API.
USER_FIELDS = ['created_at','description','entities','id','location','name','pinned_tweet_id','profile_image_url','protected','public_metrics','url','username','verified','withheld']
# Default number of IDs passed to `divide_chunks` for each `get_users` request.
BATCH_SIZE=100
def get_profiles(ids, client, out_path, error_path, user_fields=USER_FIELDS, batch_size=BATCH_SIZE):
    """Look up user profiles in batches and append the results to JSON-lines files.

    ``ids`` is split with ``divide_chunks(ids, batch_size)`` and each chunk is
    sent to ``client.get_users``. Every returned user is appended to
    ``out_path`` as one JSON object per line (the user's ``.data`` payload);
    every per-user error reported in the response is appended to
    ``error_path`` the same way. Both files are opened in append mode, so
    repeated calls accumulate results.

    Args:
        ids: Sequence of user IDs to look up (must support ``len``).
        client: Object exposing ``get_users(ids=..., user_fields=...)``,
            e.g. a ``tweepy.Client``.
        out_path: Path of the JSON-lines file receiving the user profiles.
        error_path: Path of the JSON-lines file receiving the API's
            per-user error objects.
        user_fields: Profile fields requested from the API.
        batch_size: Number of IDs per request.

    Returns:
        A list with the IDs of every chunk whose request raised a
        ``TweepyException``; those IDs were not written anywhere and can be
        retried by passing the list back in.
    """
    failed_ids = list()
    # Ceiling division: a trailing partial chunk still counts as one batch.
    n_batches = (len(ids) + batch_size - 1) // batch_size

    with open(out_path, 'a+', encoding='utf8') as f, open(error_path, 'a+', encoding='utf8') as err:

        for chunk in tqdm(divide_chunks(ids, batch_size), desc='batch', total=n_batches):
            try:
                returned = client.get_users(ids=chunk, user_fields=user_fields)
            except TweepyException as e:
                # Remember the whole chunk for a later retry and back off
                # before hitting the API again.
                print(e)
                failed_ids.extend(chunk)
                time.sleep(60)
                continue
            # `data` is None (not an empty list) when none of the requested
            # users could be returned; treat that as "no users" instead of
            # crashing on iteration.
            for user in returned.data or []:
                f.write(json.dumps(user.data)+'\n')
            for error in returned.errors or []:
                err.write(json.dumps(error)+'\n')
    return failed_ids

In [3]:
# Bearer-token client; the token is read from the BEARER environment variable
# (expected to come from the .env file loaded in the first cell).
# wait_on_rate_limit=True makes tweepy wait when the rate limit is reached.
client = tweepy.Client(
    bearer_token=os.environ['BEARER'],
    wait_on_rate_limit=True,
)
# Note: with tweepy.API rather than Client, one could also pass
# retry_count=10, retry_delay=60, retry_errors=set([503])

In [4]:
DATA_PATH = 'data'   # directory holding every input and output file
YEARS = [2016, 2020]  # years for which follower lists and polls exist

# Gather the distinct IDs found in the follower lists of all years.
# Each line of a follower_lists_<year>.jsonl file is a JSON object whose
# values are collections of IDs (presumably account -> follower IDs; the keys
# are not used here).
ids = set()
for year in YEARS:
    follower_file = os.path.join(DATA_PATH, f'follower_lists_{year}.jsonl')
    with open(follower_file) as f:
        for line in f:
            for id_list in json.loads(line).values():
                ids.update(id_list)
# Downstream cells need an ordered, sliceable sequence.
ids = list(ids)

In [5]:
# Number of distinct IDs collected from the follower lists of all years.
len(ids)

12439182

In [6]:
out_path = os.path.join(DATA_PATH, 'followers_rehydrated.jsonl')
err_path = os.path.join(DATA_PATH, 'followers_missing.jsonl')

# Resume support: collect every ID already handled by a previous run, i.e.
# the 'id' of each stored profile plus the 'resource_id' of each stored
# API error. IDs are normalised to int so they compare equal to `ids`.
already_scraped = set()
for path, key in ((out_path, 'id'), (err_path, 'resource_id')):
    if not os.path.exists(path):
        continue
    with open(path) as f:
        for line in f:
            already_scraped.add(int(json.loads(line)[key]))
print(f"{len(already_scraped)} users were already scraped")

# Only the IDs that were never seen before still need a request.
ids_ = [user_id for user_id in ids if user_id not in already_scraped]
print(f"{len(ids_)} users left to scrape")


12439182 users were already scraped
0 users left to scrape


In [45]:
# Rehydrate the follower profiles that are still missing; `errors` receives
# the IDs of every batch whose request raised a TweepyException.
errors = get_profiles(ids=ids_, client=client, out_path=out_path, error_path=err_path)

batch: 100%|██████████| 6/6 [00:02<00:00,  2.58it/s]


In [46]:
# Number of IDs whose batch failed and still needs a retry.
print(len(errors))

0


In [33]:
# Retry pass over the IDs whose batch failed; results are appended to the
# same output and error files (both are opened in append mode).
other_errors = get_profiles(ids=errors, client=client, out_path=out_path, error_path=err_path)

batch: 100%|██████████| 2/2 [00:00<00:00,  3.16it/s]


In [10]:
# Collect the distinct IDs of every account found in the reply and retweet
# files. Each file is a JSON object whose values are lists of records; a
# record's 'id' (when present) is taken as the account ID.
#
# IDs are harvested file by file. Merging all files into one dict first and
# reading the IDs from the merged dict would let a later file overwrite the
# records of an earlier one whenever both share a top-level key
# (NOTE(review): likely for the reply/retweet files of the same year --
# confirm against the data), silently dropping those accounts.
replies = dict()  # merged view, kept for any other cell that still reads it
replier_ids = set()
for fname in ['reply_2016.json', 'reply_2020.json', 'retweet_2016.json', 'retweet_2020.json']:
    with open(os.path.join(DATA_PATH, fname)) as f:
        interactions = json.load(f)
    replies.update(interactions)
    for records in interactions.values():
        for record in records:
            user_id = record.get('id')
            # Records without an 'id' carry no account to look up.
            if user_id is not None:
                replier_ids.add(user_id)
repliers = list(replier_ids)
len(repliers)

8157

In [11]:
# Rehydrate the profiles of the accounts collected in `repliers`.
# Caution: this rebinds `out_path` / `err_path`, so any cell run afterwards
# that uses those names writes to the repliers files.
out_path = os.path.join(DATA_PATH, 'repliers_rehydrated.jsonl')
err_path = os.path.join(DATA_PATH, 'repliers_missing.jsonl')
follower_errors = get_profiles(
    ids=repliers,
    client=client,
    out_path=out_path,
    error_path=err_path,
)

batch: 100%|██████████| 82/82 [00:30<00:00,  2.65it/s]


In [13]:
# Number of IDs from `repliers` whose batch request failed (despite the
# variable's name, this result comes from the repliers run, not the followers).
len(follower_errors)

0

In [15]:
# Distinct author IDs of all polls whose 'type' is 'twitter', over all years.
author_ids = set()
for year in YEARS:
    polls_file = os.path.join(DATA_PATH, f'polls-{year}.json')
    with open(polls_file) as f:
        polls = json.load(f)
    for poll in polls:
        if poll['type'] == 'twitter':
            author_ids.add(poll['meta']['author_id'])
# get_profiles needs a sequence it can measure and chunk.
author_ids = list(author_ids)
print(len(author_ids))

1024


In [16]:
# Rehydrate the profiles of the poll authors.
# Caution: this rebinds `out_path` / `err_path`, so any cell run afterwards
# that uses those names writes to the author files.
out_path = os.path.join(DATA_PATH, 'authors_rehydrated.jsonl')
err_path = os.path.join(DATA_PATH, 'author_profiles_missing.jsonl')
author_errors = get_profiles(
    ids=author_ids,
    client=client,
    out_path=out_path,
    error_path=err_path,
)

batch: 100%|██████████| 11/11 [00:04<00:00,  2.51it/s]


In [17]:
# Number of author IDs whose batch request failed.
len(author_errors)

0