In [1]:
import pandas as pd
import os
import json
from collections import defaultdict


from data_utils import DATA_PATH, YEARS

In [2]:
polls= dict()
for year in YEARS:
    poll_fpath = os.path.join(DATA_PATH, 'paper_data','polls', f'polls-{year}-query1-2options-clean.json')
    with open(poll_fpath) as f:
        polls[year] = json.load(f)

In [3]:
twitter_polls = {year:list(filter(lambda poll:poll['type']=='twitter', polls_)) for year, polls_ in polls.items()}

In [4]:
len(twitter_polls[2016]), len(twitter_polls[2020])

(401, 993)

In [5]:
twitter_author_ids_and_trump_percent = {year:defaultdict(list) for year in YEARS}
for year, polls_ in twitter_polls.items():
    for poll in polls_:
        twitter_author_ids_and_trump_percent[year][poll['meta']['author_id']].append(poll['trump_perc'])


In [6]:
twitter_author_ids = {year:set(poll['meta']['author_id'] for poll in polls_) for year, polls_ in twitter_polls.items()}

In [7]:
twitter_follower_lists = dict()

fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('follower_list' in fname) and fname.endswith('jsonl')]
_follower_lists = dict()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for l in f:
            _follower_lists.update(json.loads(l))
_twitter_missing_follower_lists=dict()
for year, authors in twitter_author_ids.items():
    twitter_follower_lists[year] = dict()
    _twitter_missing_follower_lists[year] = set()
    for author in authors:
        if author not in _follower_lists:
            _twitter_missing_follower_lists[year].add(author)
        else:
            twitter_follower_lists[year][author] = [str(id_) for id_ in _follower_lists[author]]

data\follower_lists_2016.jsonl
data\follower_lists_2020.jsonl
data\follower_lists_new.jsonl
data\follower_lists_newer.jsonl


In [8]:
len(twitter_follower_lists[2016]), len(twitter_follower_lists[2020]), len(_twitter_missing_follower_lists[2016]), len(_twitter_missing_follower_lists[2020])

(334, 636, 14, 46)

In [9]:
twitter_follower_ids = {year:{follower for followers in follow_lists.values() for follower in followers} for year, follow_lists in twitter_follower_lists.items()}

In [10]:
len(twitter_follower_ids[2016]), len(twitter_follower_ids[2020])

(4508491, 8353619)

In [11]:
fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('missing' in fname) and fname.endswith('jsonl')]
_missing = set()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for message in  map(json.loads, f):
            _missing.add(message['value'] if 'value' in message else list(message.keys())[0])

data\authors_missing.jsonl
data\author_profiles_missing.jsonl
data\complotto_missing.jsonl
data\followers_missing.jsonl
data\newer_followees_missing.jsonl
data\newer_followers_missing.jsonl
data\new_authors_missing.jsonl
data\new_followees_missing.jsonl
data\new_followees_of_authors_missing.jsonl
data\new_followers_missing.jsonl
data\repliers_missing.jsonl


In [12]:
len(_missing)

17188

In [13]:
len([i for i in _twitter_missing_follower_lists[2016] if i not in _missing]),len([i for i in _twitter_missing_follower_lists[2020] if i not in _missing])

(0, 0)

In [14]:

twitter_follower_profiles = defaultdict(dict)
_twitter_follower_ids_to_years = defaultdict(list)
for year, followers in twitter_follower_ids.items():
    for follower in followers:
        _twitter_follower_ids_to_years[follower].append(year)

fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('follower' in fname) and ('rehydrated' in fname) and fname.endswith('jsonl')]
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for profile in map(json.loads, f):
            id_ = profile['id']
            for year in _twitter_follower_ids_to_years[id_]:
                twitter_follower_profiles[year][id_] = profile

data\followers_rehydrated.jsonl
data\newer_followers_rehydrated.jsonl
data\new_followers_rehydrated.jsonl


In [15]:
len([i for i in twitter_follower_profiles[2016] if i in _missing]),len([i for i in twitter_follower_profiles[2020] if i in _missing])

(496, 1215)

In [16]:
len(twitter_follower_profiles[2016]),len(twitter_follower_profiles[2020])

(4507571, 8351916)

In [17]:

twitter_author_profiles = defaultdict(dict)
_twitter_author_ids_to_years = defaultdict(list)
for year, authors in twitter_author_ids.items():
    for author in authors:
        _twitter_author_ids_to_years[author].append(year)

fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('author' in fname) and ('rehydrated' in fname) and fname.endswith('jsonl')]
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f:
        for profile in map(json.loads, f):
            id_ = profile['id']
            for year in _twitter_author_ids_to_years[id_]:
                twitter_author_profiles[year][id_] = profile

data\authors_rehydrated.jsonl
data\new_followees_of_authors_rehydrated.jsonl


In [18]:
len([i for i in twitter_author_profiles[2016] if i in _missing]),len([i for i in twitter_author_profiles[2020] if i in _missing])

(5, 14)

In [19]:
len(twitter_author_profiles[2016]),len(twitter_author_profiles[2020])

(339, 649)

In [10]:
locations = pd.read_csv(os.path.join(DATA_PATH, 'locations_resolved.csv'), index_col=0)
locations = {i['location_str']:i for i in locations.to_dict(orient='records')}

In [21]:
twitter_follower_locations = {year:
    {user_id: locations[user_profile['location']] for user_id, user_profile in profiles.items() if
     ('location' in user_profile) and (user_profile['location'] in locations)} for year, profiles in
    twitter_follower_profiles.items()}


In [22]:
len(twitter_follower_locations[2016]),len(twitter_follower_locations[2020])

(2152390, 3520349)

In [23]:
twitter_author_locations = {year:
    {user_id: locations[user_profile['location']] for user_id, user_profile in profiles.items() if
     ('location' in user_profile) and (user_profile['location'] in locations)} for year, profiles in
    twitter_author_profiles.items()}
len(twitter_author_locations[2016]),len(twitter_author_locations[2020])

(259, 437)

In [24]:
twitter_follower_ids = {year:list(ids) for year, ids in twitter_follower_ids.items()}
twitter_author_ids = {year:list(ids) for year, ids in twitter_author_ids.items()}
out_dir = os.path.join(DATA_PATH, 'paper_data', 'meta')
os.makedirs(out_dir, exist_ok=True)
for obj, fname in [(twitter_follower_ids,'twitter_follower_ids.json'),
                   (twitter_follower_lists,'twitter_follower_lists.json'),
                   (twitter_follower_locations,'twitter_follower_locations.json'),
                   (twitter_follower_profiles,'twitter_follower_profiles.json'),
                   (twitter_author_ids,'twitter_author_ids.json'),
                   (twitter_author_profiles,'twitter_author_profiles.json'),
                   (twitter_author_locations,'twitter_author_locations.json'),
                   (twitter_author_ids_and_trump_percent,'twitter_author_ids_and_trump_percent.json')
                   ]:
    print(fname)
    with open(os.path.join(out_dir, fname), 'w+', encoding='utf8') as f:
        json.dump(obj, f)

twitter_follower_ids.json
twitter_follower_lists.json
twitter_follower_locations.json
twitter_follower_profiles.json
twitter_author_ids.json
twitter_author_profiles.json
twitter_author_locations.json
twitter_author_ids_and_trump_percent.json


In [5]:
fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if('rehydrated' in fname) and fname.endswith('jsonl') and ('complotto' not in fname)]
__ids = set()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f, open(os.path.join(DATA_PATH, 'profiles.jsonl'), 'a+', encoding='utf8') as outf:
        for profile in map(json.loads, f):
            id_ = profile['id']
            if id_ not in __ids:
                outf.write(json.dumps(profile, sort_keys=True)+'\n')
                __ids.add(id_)

data\authors_rehydrated.jsonl
data\followers_rehydrated.jsonl
data\newer_followees_rehydrated.jsonl
data\newer_followers_rehydrated.jsonl
data\new_followees_of_authors_rehydrated.jsonl
data\new_followees_rehydrated.jsonl
data\new_followers_rehydrated.jsonl
data\repliers_rehydrated.jsonl


In [7]:
fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('follower_list' in fname) and fname.endswith('jsonl') and ('complotto' not in fname)]
__ids = set()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f, open(os.path.join(DATA_PATH, 'follower_lists.jsonl'), 'a+', encoding='utf8') as outf:
        for l in f:
            id_ = list(json.loads(l).keys())[0]
            if id_ not in __ids:
                outf.write(l)
                __ids.add(id_)



data\follower_lists.jsonl
data\follower_lists_2016.jsonl
data\follower_lists_2020.jsonl
data\follower_lists_new.jsonl
data\follower_lists_newer.jsonl


In [8]:
fpaths = [os.path.join(DATA_PATH, fname) for fname in os.listdir(DATA_PATH) if ('followee_list' in fname) and fname.endswith('jsonl') and ('complotto' not in fname)]
__ids = set()
for fpath in fpaths:
    print(fpath)
    with open(fpath, encoding='utf8') as f, open(os.path.join(DATA_PATH, 'followee_lists.jsonl'), 'a+', encoding='utf8') as outf:
        for l in f:
            id_ = list(json.loads(l).keys())[0]
            if id_ not in __ids:
                outf.write(l)
                __ids.add(id_)



data\followee_lists_2016.jsonl
data\followee_lists_2020.jsonl
data\followee_lists_new.jsonl
data\followee_lists_newer.jsonl
data\followee_lists_of_new_retweeters_repliers.jsonl
data\followee_lists_of_repliers.jsonl
data\followee_lists_of_repliers_2.jsonl
data\followee_lists_of_repliers_stephen.jsonl
data\followee_lists_of_retweeters_repliers.jsonl
data\followee_lists_overlapping.jsonl


In [15]:
with open(os.path.join(DATA_PATH, 'locations.json'), 'w+') as f:
    json.dump(locations, f)

In [13]:
len(locations)

1800321