In [19]:
import pandas as pd
import os
import json
from collections import defaultdict


from data_utils import DATA_PATH, YEARS

In [2]:
polls= dict()
for year in YEARS:
    poll_fpath = os.path.join(DATA_PATH, 'paper_data','polls', f'polls-{year}-query1-2options-clean.json')
    with open(poll_fpath) as f:
        polls[year] = json.load(f)

In [3]:
twitter_polls = {year:list(filter(lambda poll:poll['type']=='twitter', polls_)) for year, polls_ in polls.items()}

In [4]:
len(twitter_polls[2016]), len(twitter_polls[2020])

(401, 993)

In [5]:
twitter_author_ids = {year:list(poll['meta']['author_id'] for poll in polls_) for year, polls_ in twitter_polls.items()}

In [6]:
twitter_follower_lists = dict()

fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('follower_list' in fname) and fname.endswith('jsonl')]
_follower_lists = dict()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for l in f:
            _follower_lists.update(json.loads(l))
_twitter_missing_follower_lists=dict()
for year, authors in twitter_author_ids.items():
    twitter_follower_lists[year] = dict()
    _twitter_missing_follower_lists[year] = set()
    for author in authors:
        if author not in _follower_lists:
            _twitter_missing_follower_lists[year].add(author)
        else:
            twitter_follower_lists[year][author] = [str(id_) for id_ in _follower_lists[author]]

data\follower_lists_2016.jsonl
data\follower_lists_2020.jsonl
data\follower_lists_new.jsonl
data\follower_lists_newer.jsonl


In [7]:
len(twitter_follower_lists[2016]), len(twitter_follower_lists[2020]), len(_twitter_missing_follower_lists[2016]), len(_twitter_missing_follower_lists[2020])

(334, 636, 14, 46)

In [8]:
twitter_follower_ids = {year:{follower for follower in followers} for year, follow_lists in twitter_follower_lists.items() for followers in follow_lists.values()}

In [9]:
len(twitter_follower_ids[2016]), len(twitter_follower_ids[2020])

(1400, 11283)

In [10]:
fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('missing' in fname) and fname.endswith('jsonl')]
_missing = set()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for message in  map(json.loads, f):
            _missing.add(message['value'] if 'value' in message else list(message.keys())[0])

data\authors_missing.jsonl
data\author_profiles_missing.jsonl
data\complotto_missing.jsonl
data\followers_missing.jsonl
data\newer_followees_missing.jsonl
data\newer_followers_missing.jsonl
data\new_authors_missing.jsonl
data\new_followees_missing.jsonl
data\new_followees_of_authors_missing.jsonl
data\new_followers_missing.jsonl
data\repliers_missing.jsonl


In [11]:
len(_missing)

17188

In [12]:
len([i for i in _twitter_missing_follower_lists[2016] if i not in _missing]),len([i for i in _twitter_missing_follower_lists[2020] if i not in _missing])

(0, 0)

In [13]:

twitter_follower_profiles = defaultdict(dict)
_twitter_follower_ids_to_years = defaultdict(list)
for year, followers in twitter_follower_ids.items():
    for follower in followers:
        _twitter_follower_ids_to_years[follower].append(year)

fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('follower' in fname) and ('rehydrated' in fname) and fname.endswith('jsonl')]
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for profile in map(json.loads, f):
            id_ = profile['id']
            for year in _twitter_follower_ids_to_years[id_]:
                twitter_follower_profiles[year][id_] = profile

data\followers_rehydrated.jsonl
data\newer_followers_rehydrated.jsonl
data\new_followers_rehydrated.jsonl


In [14]:
len([i for i in twitter_follower_profiles[2016] if i in _missing]),len([i for i in twitter_follower_profiles[2020] if i in _missing])

(0, 11)

In [15]:
len(twitter_follower_profiles[2016]),len(twitter_follower_profiles[2020])

(1400, 11283)

In [16]:

twitter_author_profiles = defaultdict(dict)
_twitter_author_ids_to_years = defaultdict(list)
for year, authors in twitter_author_ids.items():
    for author in authors:
        _twitter_author_ids_to_years[author].append(year)

fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('author' in fname) and ('rehydrated' in fname) and fname.endswith('jsonl')]
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for profile in map(json.loads, f):
            id_ = profile['id']
            for year in _twitter_author_ids_to_years[id_]:
                twitter_author_profiles[year][id_] = profile

data\authors_rehydrated.jsonl
data\new_followees_of_authors_rehydrated.jsonl


In [17]:
len([i for i in twitter_author_profiles[2016] if i in _missing]),len([i for i in twitter_author_profiles[2020] if i in _missing])

(5, 14)

In [18]:
len(twitter_author_profiles[2016]),len(twitter_author_profiles[2020])

(339, 649)

In [32]:
locations = pd.read_csv(os.path.join(DATA_PATH, 'locations_resolved.csv'), index_col=0)
locations = {i['location_str']:i for i in locations.to_dict(orient='records')}

In [34]:
twitter_follower_locations = {year:
    {user_id: locations[user_profile['location']] for user_id, user_profile in profiles.items() if
     ('location' in user_profile) and (user_profile['location'] in locations)} for year, profiles in
    twitter_follower_profiles.items()}


In [35]:
len(twitter_follower_locations[2016]),len(twitter_follower_locations[2020])

(788, 8046)

In [39]:
twitter_follower_ids = {year:list(ids) for year, ids in twitter_follower_ids.items()}
twitter_author_ids = {year:list(ids) for year, ids in twitter_author_ids.items()}
out_dir = os.path.join(DATA_PATH, 'paper_data', 'meta')
os.makedirs(out_dir, exist_ok=True)
for obj, fname in [(twitter_follower_ids,'twitter_follower_ids.json'),
                   (twitter_follower_lists,'twitter_follower_lists.json'),
                   (twitter_follower_locations,'twitter_follower_locations.json'),
                   (twitter_follower_profiles,'twitter_follower_profiles.json'),
                   (twitter_author_ids,'twitter_author_ids.json'),
                   (twitter_author_profiles,'twitter_author_profiles.json'),
                   ]:
    print(fname)
    with open(os.path.join(out_dir, fname), 'w+', encoding='utf8') as f:
        json.dump(obj, f)

twitter_follower_ids.json
twitter_follower_lists.json
twitter_follower_locations.json
twitter_follower_profiles.json
twitter_author_ids.json
twitter_author_profiles.json


In [38]:
twitter_follower_ids

{2016: {'813812141941805058',
  '2770973696',
  '2881019310',
  '483874106',
  '554701847',
  '2977619130',
  '159783252',
  '593111152',
  '2777102713',
  '2873823296',
  '747915047515525120',
  '379505580',
  '2705317117',
  '900458163761270784',
  '2906964189',
  '630299871',
  '754270263139766273',
  '4612639648',
  '48097862',
  '3186511175',
  '2820222901',
  '2286066776',
  '580857186',
  '150506383',
  '373847263',
  '3190770703',
  '2233237405',
  '1043366888',
  '2854738910',
  '299618258',
  '4237863073',
  '394031563',
  '301054699',
  '1654041709',
  '284035707',
  '2763853854',
  '2926851153',
  '30322836',
  '239003356',
  '2794895476',
  '770303861634596864',
  '2926948528',
  '2755974957',
  '3613887076',
  '2758567689',
  '2945833063',
  '3396035427',
  '79477626',
  '3590992574',
  '609122928',
  '2795692924',
  '627688434',
  '37914958',
  '2332138694',
  '401824910',
  '2758358589',
  '2793276436',
  '834910133478096896',
  '2729647754',
  '3082556687',
  '23598421