In [4]:
import os
import glob
import time
import math
import json
import csv
import sys
import re
import pickle
import random
import pandas as pd
import numpy as np
import requests
import unidecode
import reverse_geocode
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import defaultdict
from collections import Counter
from datetime import datetime
from scipy import stats
from patsy import dmatrices
# from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf
from ethnicolr import pred_census_ln, pred_wiki_name
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [5]:
pd.set_option('mode.chained_assignment', None)

In [14]:
cols_keep = ['Mention Date', 'Mention Title', 'Outlet or Author', 'Mention URL', 'DOI', 'Research Output Title', 'Journal/Collection Title', 'Subjects (FoR)', 'Affiliations (GRID)', 'Publication Date', 'Altmetric Attention Score']
data_root = '/shared/0/projects/news-quotes/'

In [7]:
def get_path(filename):
    '''Resolve `filename` against the project data directory (`data_root`).'''
    full_path = os.path.join(data_root, filename)
    return full_path

def yield_one_line(filename, delimiter = '\t', quote = csv.QUOTE_NONE, encoding = None):
    '''Generator yielding the parsed rows of a delimited text file, one at a time.

    filename:  path relative to the data directory (resolved with `get_path`).
    delimiter: field separator handed to `csv.reader`.
    quote:     `csv.QUOTE_*` constant handed to `csv.reader` as `quoting`.
    encoding:  optional text encoding; `None` keeps the platform default.

    Yields each row as a list of strings.
    '''
    filepath = get_path(filename)
    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects passed to csv.reader.
    with open(filepath, 'r', newline='', encoding=encoding) as file:
        print('processing %s...' %(filepath))
        reader = csv.reader(file, delimiter=delimiter, quoting=quote)
        yield from reader

In [8]:
# clean journal title, affiliation title
# Punctuation characters that are turned into spaces when titles are normalized.
table = str.maketrans(",'&:-", " " * 5)

# Honorifics tried in front of ambiguous family names ("He", "She").
prefix = [
    'Professor', 'Prof.',
    'Doctor', 'Dr.',
    'Mr.', 'Miss', 'Ms.', 'Mrs.',
]

# Lower-cased trailing words that mark an organisation rather than a person.
name_filter = {'consortium', 'collaboration', 'collaboration*', 'editor', 'bank'}

# Hand-made corrections for family-name tokens containing stray characters.
last_name_fix = {
    'join(': 'Berle',
    '葉)': 'Ip',
    'A.)': 'Boyle',
    'M.)': 'Lenton',
    'Tian(': 'Tian',
    ')Guangping': 'Guangping',
}

In [9]:
def get_author_name(name):
    '''Split a full name into the given/family pair used to query the name API.

    Returns the string 'invalid' when the name has fewer than two
    whitespace-separated words, otherwise {'Fname': ..., 'Lname': ...}.
    The family name is always the last word. The given name is the first word,
    unless the first word is a bare initial ("C" or "C.") and the second word
    is a real word rather than another initial, in which case the second word
    is used ("C. Kirabo Jackson" -> "Kirabo").
    '''
    tokens = name.split()
    # a first and a last name are both required to query the API
    if len(tokens) < 2:
        return 'invalid'

    first, last = tokens[0], tokens[-1]
    if len(tokens) >= 3:
        second = tokens[1]
        # "C" or "C."
        first_is_initial = len(first) == 1 or (len(first) == 2 and first[1] == '.')
        # "Del" or "Al", but not "J" / "J."
        second_is_word = len(second) > 2 or (len(second) == 2 and second[1] != '.')
        if first_is_initial and second_is_word:
            first = second
    return {'Fname': first, 'Lname': last}
        
def get_year_gap(row):
    '''Whole years between a paper's publication date and a story's mention date.

    row: mapping with 'url' and 'doi' keys (e.g. a DataFrame row).
    The dates are looked up in the global `urls_date` / `dois_date` dicts and
    only their first ten characters (YYYY-MM-DD) are parsed.

    Returns the floor of (mention date - publication date) in days divided by
    365, or NaN when either date is missing or cannot be parsed.
    '''
    url, doi = row['url'], row['doi']
    try:
        mention_day = datetime.strptime(urls_date[url][:10], '%Y-%m-%d')
        publish_day = datetime.strptime(dois_date[doi][:10], '%Y-%m-%d')
    except (KeyError, TypeError, ValueError):
        # unknown url/doi, non-string date, or malformed/empty date string
        # np.nan, not np.NaN: the upper-case alias was removed in NumPy 2.0
        return np.nan
    return (mention_day - publish_day).days // 365
    
def get_journal_rank(journal):
    '''Look up a journal's value in the global `journal_impact` dict.

    The title is normalized with `norm_string` before the lookup.
    Returns NaN for the placeholder title 'unknown' and for titles that are
    not in `journal_impact`.
    '''
    if journal == 'unknown':
        # np.nan, not np.NaN: the upper-case alias was removed in NumPy 2.0
        return np.nan
    return journal_impact.get(norm_string(journal), np.nan)
        
def get_journal_cate(journal):
    '''Keep the title if it is in `top_journals` or is 'unknown'; otherwise
    collapse it into the catch-all label 'lose journal'.'''
    keep_title = journal in top_journals or journal == 'unknown'
    return journal if keep_title else 'lose journal'

def get_affi_name(affis):
    '''Translate a '|'-separated string of affiliation ids into a '|'-separated
    string of names from `affi_name`. Ids missing from `affi_name` are skipped;
    returns 'unknown' if none of the ids is known.'''
    known_names = [affi_name[one_id] for one_id in affis.split('|') if one_id in affi_name]
    if known_names:
        return '|'.join(known_names)
    return 'unknown'
    
def get_affi_cate(affis):
    '''Classify a '|'-separated string of affiliation ids by country.

    Returns 'domestic' if any id maps to 'United States' in `affi_country`,
    else 'international' if any id has a known country, else 'unknown'.
    '''
    has_foreign = False
    for one_id in affis.split('|'):
        if one_id not in affi_country:
            continue
        if affi_country[one_id] == 'United States':
            # a single U.S. affiliation is enough
            return 'domestic'
        has_foreign = True
    return 'international' if has_foreign else 'unknown'
        
def get_affi_rank(affis):
    '''Smallest `affi_rank` value among a '|'-separated string of affiliation ids.

    Ids missing from `affi_rank` are ignored; returns NaN when none is known.
    '''
    ranks = [affi_rank[one_id] for one_id in affis.split('|') if one_id in affi_rank]
    # np.nan, not np.NaN: the upper-case alias was removed in NumPy 2.0
    return min(ranks) if ranks else np.nan

def get_author_rank(aid):
    '''First entry of the author's metric list in `aid_metric`, or NaN when the
    author id is unknown.'''
    if aid not in aid_metric:
        # np.nan, not np.NaN: the upper-case alias was removed in NumPy 2.0
        return np.nan
    return aid_metric[aid][0]

def norm_string(aff_name):
    '''Lower-case a title, map the characters in `table` through it, and
    collapse every run of whitespace into a single space.'''
    words = aff_name.lower().translate(table).split()
    return ' '.join(words)
    
def get_news_length(url):
    '''Number of single-space-separated pieces in the story text of `url`
    (looked up in `urls_text`).'''
    story = urls_text[url]
    # splitting on " " always yields one more piece than there are spaces
    return story.count(" ") + 1

def get_last_name_feats(name):
    '''Return [length, frequency] features for an author's family name.

    * single-word names and organisation-like names (family name found in
      `name_filter`): [len(whole name), 0]
    * otherwise: [len(family name), `family_freq` value for the upper-cased
      family name, or 0 when it is not listed]
    '''
    parsed = get_author_name(name)
    # invalid: single-word name
    if not isinstance(parsed, dict):
        return [len(name), 0]

    family = parsed['Lname']
    # org
    if family.lower() in name_filter:
        return [len(name), 0]

    # length + census prob
    return [len(family), family_freq.get(family.upper(), 0)]

def get_author_eth_gen(name):
    '''Return [ethnicity, gender] for an author name from `nname_eth_gen`.

    * single-word names: ['unknown', 'unknown']
    * family name found in `name_filter` (e.g. "The XXX Collaboration", which
      would otherwise be sent to the API with Lname "Collaboration"):
      ['org', 'org']
    * otherwise the first '-'-separated part of the stored 'Ethnea' label and
      the stored 'Genni' value; each falls back to 'unknown' when the name is
      not stored or the value is a sentinel (ERROR/UNKNOWN/TOOSHORT, or '-').
    '''
    parsed = get_author_name(name)
    if not isinstance(parsed, dict):
        return ['unknown', 'unknown']
    if parsed['Lname'].lower() in name_filter:
        return ['org', 'org']

    ethnicity = gender = 'unknown'
    if name in nname_eth_gen:
        record = nname_eth_gen[name]
        major = record['Ethnea'].split('-')[0]
        if major not in ("ERROR", 'UNKNOWN', 'TOOSHORT'):
            ethnicity = major
        if record['Genni'] != '-':
            gender = record['Genni']
    return [ethnicity, gender]

def get_reporter_eth_gen(name):
    '''Return [ethnicity, gender] for a reporter name from `reporter_eth_gen`.

    The placeholder name 'unknown' and names that are not stored give
    ['unknown', 'unknown']. Otherwise the first '-'-separated part of the
    stored 'Ethnea' label and the stored 'Genni' value are used, each falling
    back to 'unknown' for sentinel values (ERROR/UNKNOWN/TOOSHORT, or '-').
    '''
    ethnicity = gender = 'unknown'
    if name != 'unknown' and name in reporter_eth_gen:
        record = reporter_eth_gen[name]
        major = record['Ethnea'].split('-')[0]
        if major not in ("ERROR", 'UNKNOWN', 'TOOSHORT'):
            ethnicity = major
        if record['Genni'] != '-':
            gender = record['Genni']
    return [ethnicity, gender]

def is_correspond_author(row):
    '''Tell whether the author in `row` is a corresponding author.

    Uses row['doi'] and row['author_seq_num']. Returns 'unknown' when the doi
    is not in `doi_wos_authors`, otherwise 'yes'/'no' depending on whether the
    sequence number is in that doi's 'ca_pos_list'.
    '''
    doi, seq_num = row['doi'], row['author_seq_num']
    if doi not in doi_wos_authors:
        return 'unknown'
    corresponding = doi_wos_authors[doi]['ca_pos_list']
    return 'yes' if seq_num in corresponding else 'no'

    
# note: this func treats single-word name differently (e.g., "He" vs. "Hao He")
def check_aut_mentioned(row):
    '''Decide whether an author is mentioned by name in a news story.

    row: mapping with 'url' and 'author_name' keys. The story text is read
    from `urls_text`; only its first 90% (by characters) is searched.

    Returns
      'yes'  - the name was found,
      'no'   - the name was searched for but not found,
      'drop' - the name is too ambiguous or malformed to search for:
               single-word names shorter than 3 characters, one-character
               family names, the family names "He"/"She" when no honorific
               from `prefix` precedes them, and family names that do not form
               a valid regular expression.

    Single-word names are matched as plain substrings; all other names are
    matched on the family name as a whole word.
    '''
    url, name = row['url'], row['author_name']
    text = urls_text[url]
    text = text[:int(0.9*len(text))]
    payload = get_author_name(name)

    # single-word name
    if payload == 'invalid':
        # ignore single-word name such as "L", "Do".
        if len(name) < 3:
            return 'drop'
        return 'yes' if name in text else 'no'

    family = payload['Lname']
    if len(family) == 1:
        return 'drop'

    if family == 'He' or family == 'She':
        # These collide with pronouns, so only accept "<honorific> He".
        # NOTE(review): the '.' in honorifics such as 'Dr.' is not escaped and
        # therefore matches any character.
        for pre in prefix:
            pattern = r"\b%s %s\b"%(pre, family)
            if re.search(pattern, text):
                return 'yes'
        return 'drop'

    if family in last_name_fix:
        family = last_name_fix[family]
    # NOTE(review): the family name is interpolated unescaped, so a name with
    # regex metacharacters either fails to compile (-> 'drop') or is read as a
    # pattern. Kept as-is so the set of dropped rows does not change.
    pattern = r"\b%s\b"%family
    try:
        found = re.search(pattern, text)
    except re.error:
        # e.g. an unbalanced parenthesis in the family name
        return 'drop'
    return 'yes' if found else 'no'
        
def get_y(mstring):
    '''Turn a label string into a binary outcome: 1 if it contains 'yes',
    0 if it contains 'no'; any other value is returned unchanged.

    NOTE(review): this is a substring test, so 'unknown' (which contains
    'no') maps to 0 - confirm that is intended for every column this is
    applied to.
    '''
    for marker, outcome in (('yes', 1), ('no', 0)):
        if marker in mstring:
            return outcome
    return mstring

### load news content

In [10]:
# url -> cleaned story text, read from a JSON-lines file
# (one object with 'url' and 'text' keys per line).
urls_text = {}

with open(data_root+'crawl_news/url_text_clean.json', 'r') as ofile:
    for row in ofile:
        # each line is parsed on its own
        row = json.loads(row)
        url, text = row['url'], row['text']
        # if a url occurs more than once, the last line wins
        urls_text[url] = text

In [11]:
len(urls_text)

520061

### read mentions data

In [15]:
# Collect one list per mention, restricted to the `cols_keep` fields,
# then turn the list of rows into a DataFrame.
combined_csv = []
# each row (a mention) has non-empty url (see `alt_news_mention_dump.ipynb`)
with open(data_root+"news_mentions.json", 'r') as ffile:
    for row_data in ffile:
        # JSON-lines input: one mention object per line
        row_data = json.loads(row_data)
        # raises KeyError if a mention lacks one of the `cols_keep` fields
        row = [row_data[col] for col in cols_keep]
        combined_csv.append(row)
combined_csv = pd.DataFrame(combined_csv, columns=cols_keep)

In [10]:
len(combined_csv)

4205331

In [11]:
urls = list(set(combined_csv['Mention URL'].tolist()))
dois = list(set(combined_csv['DOI'].tolist()))

In [12]:
print('num. of unique news articles: %d'%len(urls))
print('num. of unique journal papers: %d'%len(dois))
print('num. of mentions: %d'%len(combined_csv))

num. of unique news articles: 2936447
num. of unique journal papers: 787213
num. of mentions: 4205331


288 U.S. outlets with cleaned news stories

In [17]:
combined_csv = combined_csv.loc[combined_csv['Mention URL'].isin(urls_text)]
combined_csv.index = range(len(combined_csv))

In [14]:
combined_csv.head(1)

Unnamed: 0,Mention Date,Mention Title,Outlet or Author,Mention URL,DOI,Research Output Title,Journal/Collection Title,Subjects (FoR),Affiliations (GRID),Publication Date,Altmetric Attention Score
0,2019-01-02T07:00:00+00:00,‘Nanoscavengers’ could protect people from sar...,Science/AAAS,http://ct.moreover.com/?a=38139183068&p=1pl&v=...,10.1126/scitranslmed.aau7091,Nanoscavenger provides long-term prophylactic ...,Science Translational Medicine,Medical And Health Sciences; Biological Sciences,,2019-01-02T00:00:00+00:00,84.9


In [18]:
# Replace missing values with empty strings so that every column can be
# treated as text downstream. `fillna` is the direct way to do this;
# `regex=True` had no meaning for a NaN scalar.
combined_csv = combined_csv.fillna('')

In [16]:
len(combined_csv)

749029

In [17]:
for target, val in combined_csv['Outlet or Author'].value_counts().items():
    print('{:50}{:}'.format(target, val))

EurekAlert!                                       63007
Science Daily                                     44088
Technology.org                                    21655
Yahoo! News                                       20532
Health Medicinet                                  18462
Huffington Post                                   17558
Newswise                                          15694
Physician's Briefing                              15198
Health Canal                                      14436
MSN                                               13361
New York Times                                    10961
Nanowerk                                          10372
Futurity                                          10085
Medical Daily                                     9245
Vice                                              8979
Drugs.com                                         8736
Business Insider                                  8586
Scientific American                               81

The Advocate                                      294
The Fresno Bee                                    293
PM 360                                            292
Star-Telegram                                     288
Radio Acadie                                      279
Belleville News-Democrat                          276
Statesman.com                                     273
The Denver Post                                   264
News Channel                                      241
hellogiggles.com                                  231
SFGate                                            227
King 5                                            194
The Daily Meal                                    191
The New York Observer                             167
KUOW                                              157
Drug Discovery and Development                    129
Hawaii News Now                                   120
US News Health                                    101
R&D                         

In [18]:
urls = list(set(combined_csv['Mention URL'].tolist()))
dois = list(set(combined_csv['DOI'].tolist()))

In [20]:
print('num. of unique news articles: %d'%len(urls))
print('num. of unique journal papers: %d'%len(dois))
print('num. of mentions: %d'%len(combined_csv))

num. of unique news articles: 520061
num. of unique journal papers: 275403
num. of mentions: 749029


In [19]:
# url -> number of mention rows for that story in `combined_csv`
# (a defaultdict, so unseen urls read as 0).
url_mentioned_paper_cn = defaultdict(int)

# `doi` is unpacked but not used: every (url, doi) row counts once.
for url, doi in combined_csv[['Mention URL', 'DOI']].itertuples(index=False, name=None):
    url_mentioned_paper_cn[url] += 1

### WoS corresponding author

Note: Unlike MAG, which lists each author–affiliation pair as a separate entry in the author list, WoS stores all of an author's affiliations in a single field. E.g., {"_addr_no": "1 5", "_role": "author", "_seq_no": 1} means that the first author has two affiliations: "1" and "5".

In [21]:
# every doi in this dict has corresponding info.
# doi (lower-cased) -> {'aut_last_names': [...], 'ca_pos_list': [...]}
#   aut_last_names: last names placed at index `_seq_no - 1`
#   ca_pos_list:    `_seq_no` values of authors flagged `_reprint == 'Y'`
doi_wos_authors = {}

with open(data_root+'doi_wos_authors.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        doi, authors = row['doi_lowercase'], row['authors']
        # some are not of 'author' role.
        authors = [aut for aut in authors if aut['_role'] == 'author']
        ca_pos_list = []
        # NOTE(review): sized by the number of 'author'-role entries, so this
        # assumes their `_seq_no` values are 1..len(authors) - confirm.
        aut_last_names = [''] * len(authors)
        for aut in authors:
            pos = aut['_seq_no']
            # fall back to the full name when no separate last name is given
            if 'last_name' in aut:
                aut_last_names[pos-1] = aut['last_name']
            else:
                aut_last_names[pos-1] = aut['full_name']
            if '_reprint' in aut and aut['_reprint'] == 'Y':
                ca_pos_list.append(pos)
        doi_wos_authors[doi] = {'aut_last_names':aut_last_names, 'ca_pos_list': ca_pos_list}

In [51]:
doi_wos_authors['10.1002/art.24519']

{'aut_last_names': ['Bachen', 'Chesney', 'Criswell'], 'ca_pos_list': [1]}

### MAG author names

Note: we queried more dois than those papers mentioned by this US-outlets subset.

Note that if an author has multiple affiliations, MAG codes the author list like this:

[['1221368641', '79576946', '2', 'David F. Dinges'], ['2042375510', '79576946', '1', 'Mathias Basner'], ['2042375510', '2898391981', '1', 'Mathias Basner']]

In [22]:
# risk set
# doi -> list of [author_id, '|'-joined affiliation ids, seq, name, position label]
# Only first authors, last authors, solo authors and middle authors who are
# WoS corresponding authors are kept.
doi_author_list = defaultdict(list)
# doi -> number of distinct author sequence numbers
doi_num_authors = {}

with open(data_root+'dois_authors_mag.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        doi, authors = row['doi'], row['authors']
        # seq -> [author id, [affiliation ids], ASCII-folded name];
        # merges the repeated rows MAG emits for multi-affiliation authors
        uniq_authors = defaultdict(lambda: ['', [], ''])
        for aid, affi_id, seq, name in authors:
            seq = int(seq)
            uniq_authors[seq][0] = aid
            uniq_authors[seq][1].append(affi_id)
            uniq_authors[seq][2] = unidecode.unidecode(name)
        # could be [1, 2, 4, 6]; [2, 3]
        uniq_seqs = sorted(uniq_authors)
        # must be: 1, 2, ..., max
        # (papers with gaps in the sequence numbers are skipped entirely)
        if uniq_seqs == list(range(1, len(uniq_seqs)+1)):
            last_seq = len(uniq_seqs)
            doi_num_authors[doi] = last_seq
            if last_seq == 1:
                aid, affi_ids, name = uniq_authors[last_seq]
                affis = '|'.join(affi_ids)
                doi_author_list[doi].append([aid, affis, last_seq, name, 'solo_author'])
            else:
                cor_pos = []
                if doi in doi_wos_authors:
                    # this is already unique seq
                    cor_pos = doi_wos_authors[doi]['ca_pos_list']
                # iterates in first-seen order of seq, not sorted order
                for seq in uniq_authors:
                    aid, affi_ids, name = uniq_authors[seq]
                    affis = '|'.join(affi_ids)
                    if seq == 1:
                        doi_author_list[doi].append([aid, affis, seq, name, 'first_position'])
                    elif seq == last_seq:
                        doi_author_list[doi].append([aid, affis, seq, name, 'last_position'])
                    # middle authors are kept only if they are corresponding authors
                    elif seq in cor_pos:
                        doi_author_list[doi].append([aid, affis, seq, name, 'middle_position'])

In [53]:
len(doi_author_list)

268691

In [54]:
len(doi_num_authors)

268691

In [23]:
# author id -> list of integer metrics, read from a JSON-lines file
# (one object with 'aid' and 'metric' keys per line).
aid_metric = {}
with open(data_root+'aids_metric_mag.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        aid, metric = row['aid'], row['metric']
        # every entry of 'metric' is cast to int
        aid_metric[aid] = [int(num) for num in metric]

### each (url, doi, author) triplet is an observation

These are the potential data points for the regression analysis: to determine the dependent variable for an observation, one needs two pieces of information:
* story text (in `urls_text`)
* author name

In [24]:
combined_csv = combined_csv.loc[combined_csv['DOI'].isin(set(doi_author_list.keys()))]
combined_csv.index = range(len(combined_csv))

In [57]:
len(combined_csv)

674272

In [26]:
url_to_doi_edges = list(combined_csv[['Mention URL', 'DOI']].itertuples(index=False, name=None))

In [59]:
len(url_to_doi_edges)

674272

How data size changes

In [60]:
dois_date = dict(zip(combined_csv.DOI, combined_csv['Publication Date']))
urls_date = dict(zip(combined_csv['Mention URL'], combined_csv['Mention Date']))

In [61]:
dois_journal = dict(zip(combined_csv.DOI, combined_csv['Journal/Collection Title']))
urls_outlet = dict(zip(combined_csv['Mention URL'], combined_csv['Outlet or Author']))

In [62]:
doi_mention_cn = {doi: cn for doi, cn in combined_csv.DOI.value_counts().items()}

In [63]:
print('There are %d stories mentioning %d dois.'%(len(urls_text), len(dois)))
print('We found author info for %d dois in MAG, and they were mentioned by %d stories.'%(len(dois_date), len(urls_date)))

There are 520061 stories mentioning 275403 dois.
We found author info for 251630 dois in MAG, and they were mentioned by 472762 stories.


### manually categorize outlets

In [64]:
len(set(combined_csv['Outlet or Author'].tolist()))

288

In [65]:
outlet_cate = pd.read_csv(data_root+'outlet_category.csv', header=0)

In [66]:
outlet_cate = dict(zip(outlet_cate.outlet, outlet_cate.category))

In [67]:
set(outlet_cate.values())

{'General', 'PressRelease', 'SciTech'}

In [68]:
re_cates = {
    'PressRelease': 'Press Releases',
    'SciTech': 'Sci. \& Tech.',
    'General': 'General News'
}

In [21]:
for outlet, cate in outlet_cate.items():
    outlet = outlet.replace('&', '\&')
    print('%s & %s \\\\'%(outlet, re_cates[cate]))

OnMedica & Sci. \& Tech. \\
Huffington Post & General News \\
KiiiTV 3 & General News \\
Carbon Brief & Sci. \& Tech. \\
PR Newswire & Press Releases \\
Nutra Ingredients USA & Sci. \& Tech. \\
The Bellingham Herald & General News \\
CNN News & General News \\
Health Medicinet & Press Releases \\
Herald Sun & General News \\
EurekAlert! & Press Releases \\
AJMC & Press Releases \\
The University Herald & General News \\
Lincoln Journal Star & General News \\
Cardiovascular Business & Sci. \& Tech. \\
MinnPost & General News \\
CNET & Sci. \& Tech. \\
Infection Control Today & Sci. \& Tech. \\
Science 2.0 & Sci. \& Tech. \\
Lexington Herald Leader & General News \\
Statesman.com & General News \\
Nanowerk & Press Releases \\
The San Diego Union-Tribune & General News \\
The Daily Beast & General News \\
Lab Manager & Press Releases \\
SDPB Radio & General News \\
New Hampshire Public Radio & General News \\
Health Day & Press Releases \\
Rocket News & General News \\
KPBS & General News

### prepare reg data

In [28]:
# Build one row per (url, doi, author): every mention edge is expanded by the
# authors kept for that doi in `doi_author_list`.
reg_data = []
for url, doi in url_to_doi_edges:
    # `doi_author_list` is a defaultdict: looking up an unseen doi would add an
    # empty entry; `combined_csv` was filtered to its keys beforehand.
    for aid, affis, seq, name, pos in doi_author_list[doi]:
        reg_data.append([url, doi, aid, affis, seq, name, pos])

In [29]:
reg_data = pd.DataFrame(reg_data, columns=['url', 'doi', 'author_id', 'affiliation_ids', 'author_seq_num', 'author_name', 'author_pos_cate'])


In [71]:
len(reg_data)

1353498

In [31]:
del url_to_doi_edges

### get ethnicity and gender (authors)

In [155]:
# potential errors due to unknown: http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py?Fname=Daniel&Lname=Schwartz
eth_url = 'http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py?format=json'

In [156]:
# Probe the Ethnea service with a single name to inspect the response shape.
payload = {'Fname': 'C. Anna', 'Lname': 'McDowell'}
response = requests.get(eth_url, params=payload)
# Parse the body as JSON instead of eval(): eval() would execute whatever the
# (plain-http) server returns. Assumes `format=json` yields valid JSON.
response.json()

{'Genni': 'F', 'Ethnea': 'ENGLISH', 'Last': 'McDowell', 'First': 'C Anna'}

In [162]:
# Query the Ethnea service for every author name that is not cached yet.
# NOTE: `nname_eth_gen` must already exist; the cache is loaded from
# `nname_eth_gen.json` in a cell further down the notebook.
cn = 0
for name in set(reg_data.author_name):
    if name not in nname_eth_gen:
        cn += 1
        payload = get_author_name(name)
        if payload != 'invalid':
            # be polite to the server: pause after every 500 uncached names
            if cn%500 == 0:
                time.sleep(5)
            response = requests.get(eth_url, params=payload)
            try:
                # Parse as JSON rather than eval(): never execute text that
                # came over the network.
                j = response.json()
                genni = j['Genni']
                ethnea = j['Ethnea']
                nname_eth_gen[name] = {'Ethnea': ethnea, 'Genni': genni}
            except (ValueError, KeyError, TypeError):
                # body is not JSON, or lacks the expected fields: skip the name
                continue
        if cn%1000 == 0:
            print('processed %d names...'%cn)

processed 1000 names...
processed 2000 names...
processed 3000 names...
processed 4000 names...
processed 5000 names...
processed 6000 names...
processed 7000 names...
processed 8000 names...
processed 9000 names...
processed 10000 names...


In [164]:
# with open(data_root+'nname_eth_gen.json', 'w') as ofile:
#     for name, info in nname_eth_gen.items():
#         row = {'name': name, 'info': info}
#         ofile.write(json.dumps(row) + '\n')

In [32]:
nname_eth_gen = {}
with open(data_root+'nname_eth_gen.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        name, info = row['name'], row['info']
        nname_eth_gen[name] = info

In [75]:
len(nname_eth_gen)

351549

### get ethnicity and gender (reporters)

In [33]:
# normalize names
url_reporter = {}

with open(data_root+'crawl_news/url_reporter.json', 'r') as ifile:
    for row in ifile:
        row = json.loads(row)
        url, reporter = row['url'], row['reporter']
        url_reporter[url] = unidecode.unidecode(reporter)

In [77]:
len(url_reporter)

100163

In [78]:
reporter_norm_names = set(url_reporter.values())

In [79]:
len(reporter_norm_names)

13693

In [26]:
reporter_eth_gen = {}

In [206]:
# Query the Ethnea service for every normalized reporter name.
for ix, name in enumerate(reporter_norm_names):
    payload = get_author_name(name)
    if payload != 'invalid':
        # be polite to the server: pause at every 200th name
        if ix%200 == 0:
            time.sleep(5)
        response = requests.get(eth_url, params=payload)
        try:
            # Parse as JSON rather than eval(): never execute text that came
            # over the network.
            j = response.json()
            genni = j['Genni']
            ethnea = j['Ethnea']
            reporter_eth_gen[name] = {'Ethnea': ethnea, 'Genni': genni}
        except (ValueError, KeyError, TypeError):
            # body is not JSON, or lacks the expected fields: skip the name
            continue

In [212]:
# with open(data_root+'crawl_news/reporter_eth_gen.json', 'w') as ofile:
#     for name, info in reporter_eth_gen.items():
#         row = {'name': name, 'info': info}
#         ofile.write(json.dumps(row) + '\n')

In [34]:
reporter_eth_gen = {}

with open(data_root+'crawl_news/reporter_eth_gen.json', 'r') as ofile:
    for row in ofile:
        row = json.loads(row)
        name, info = row['name'], row['info']
        reporter_eth_gen[name] = info

In [35]:
len(reporter_eth_gen)

13632

### predict ethnicity using Wiki/Census

In [171]:
def get_pos_name(name, pos = 'first'):
    '''Return the given name (pos='first') or family name (pos='last') that
    `get_author_name` extracts from `name`.

    Single-word names give 'unknown'; any other `pos` value gives None.
    '''
    payload = get_author_name(name)
    if payload == 'invalid':
        return 'unknown'
    if pos == 'first':
        return payload['Fname']
    if pos == 'last':
        return payload['Lname']
    return None

In [180]:
name_df = pd.DataFrame(set(reg_data.author_name.tolist()).union(reporter_norm_names), columns=['nname'])

In [181]:
name_df['Fname'] = name_df.nname.apply(lambda name: get_pos_name(name, pos='first'))
name_df['Lname'] = name_df.nname.apply(lambda name: get_pos_name(name, pos='last'))

In [182]:
name_df.head()

Unnamed: 0,nname,Fname,Lname
0,At Bulathsinghala,At,Bulathsinghala
1,Kaushik Mukherjee,Kaushik,Mukherjee
2,Sophie Fisher,Sophie,Fisher
3,Rostyslaw W. Robak,Rostyslaw,Robak
4,Richard M. Vickery,Richard,Vickery


In [183]:
# use other server to do the next line of code.
name_df.to_csv(data_root+'name_df.csv', header=True, index=False)

In [None]:
name_df_wiki = pred_wiki_name(df=name_df, lname_col='Lname', fname_col='Fname')

In [29]:
name_df_wiki = pd.read_csv(data_root+'name_df_wiki.csv', header=0)

In [31]:
name_eth_wiki = dict(zip(name_df_wiki.nname, name_df_wiki.race))

In [32]:
# name_eth_wiki['solo'] = 'solo'
name_eth_wiki['unknown'] = 'unknown'

In [None]:
name_df_census = pred_census_ln(df=name_df, namecol='Lname', year=2010)

In [34]:
name_df_census = pd.read_csv(data_root+'name_df_census.csv', header=0)

In [35]:
name_df_census.head(1)

Unnamed: 0,nname,Fname,Lname,race,api,black,hispanic,white
0,At Bulathsinghala,At,Bulathsinghala,white,0.124314,0.02164,0.010825,0.843221


In [36]:
name_eth_census = dict(zip(name_df_census.nname, name_df_census.race))

In [37]:
# name_eth_census['solo'] = 'solo'
name_eth_census['unknown'] = 'unknown'

In [39]:
del name_df_wiki, name_df_census, name_df

### affiliation

In [40]:
# Queen's College, London -> queen s college london
# McKinsey & Company -> mckinsey company
# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
# Rank = -1000 * Ln( probability of an entity being important )
# Three lookups keyed by affiliation id, filled from the tab-separated
# `Affiliations.txt` dump.
affi_rank = {}      # id -> rank (int)
affi_country = {}   # id -> country name; only for rows that have coordinates
affi_name = {}      # id -> name taken from column 3
for line in yield_one_line('Affiliations.txt'):
    # columns used: 0 = id, 1 = rank, 3 = name, 9 = latitude, 10 = longitude
    affi_id, rank, dname, lat, lon = line[0], line[1], line[3], line[9], line[10]
    affi_rank[affi_id] = int(rank)
    affi_name[affi_id] = dname
    # rows without coordinates get no country entry
    if lat != "" and lon != "":
        lat, lon = float(lat), float(lon)
        # reverse-geocode the coordinates to get the country
        res = reverse_geocode.search([(lat, lon)])
        country = res[0]['country']
        affi_country[affi_id] = country

processing /shared/0/projects/news-quotes/Affiliations.txt...


In [41]:
len(affi_rank)

25542

In [42]:
len(affi_country)

24167

In [43]:
len(affi_name)

25542

### journals

In [44]:
# Altmetric data: JAMA: Journal of the American Medical Association
# WOS data: JAMA-JOURNAL OF THE AMERICAN MEDICAL ASSOCIATION
# normalized journal title -> value of the `jif` column of the JCR export
journal_impact = {}
for line in yield_one_line('wos_jcr.csv', delimiter=',', quote=csv.QUOTE_ALL):
    rank, title, cites, jif, eigen = line
    try:
        impact = float(jif)
    except ValueError:
        # non-numeric `jif` cell (e.g. a header row): skip the row. Only the
        # float conversion is guarded, so unrelated errors are not hidden.
        continue
    ntitle = norm_string(title)
    journal_impact[ntitle] = impact

processing /shared/0/projects/news-quotes/wos_jcr.csv...


In [45]:
len(journal_impact)

11873

In [46]:
journal_impact['jama journal of the american medical association']

51.273

### name complexity

In [38]:
# https://www2.census.gov/geo/tiger/TIGER1999/readme99.txt
# DIACRITICAL MARKS

# The Census Bureau is no longer using codes to represent diacritical marks found
# in some language names.  Beginning with this release of TIGER/Line the Census
# Bureau will be using the ISO 8859-1 character set, commonly referred to as Latin-1,
# to identify characters with diacritical marks.

# surname (as spelled in the census file) -> natural log of its `prop100k` value
family_freq = {}
for line in yield_one_line('Names_2010Census.csv', delimiter=',', quote=csv.QUOTE_ALL):
    # only the first four columns are used
    name, rank, count, prop100k = line[:4]
    # skip the header row
    if name != 'name':
        family_freq[name] = np.log(float(prop100k))

processing /shared/0/projects/news-quotes/Names_2010Census.csv...


## Prepare reg data

In [36]:
reg_data['author_last_name'] = reg_data['author_name'].apply(lambda name: name.split()[-1])
reg_data['author_last_name'] = reg_data['author_last_name'].apply(lambda name: last_name_fix[name] if name in last_name_fix else name)


In [39]:
reg_data['last_name_feats'] = reg_data['author_name'].apply(get_last_name_feats)
reg_data[['last_name_length', 'last_name_prob']] = pd.DataFrame(reg_data['last_name_feats'].values.tolist(), index = reg_data.index)

In [40]:
reg_data['author_eth_gen'] = reg_data['author_name'].apply(lambda name: get_author_eth_gen(name))
reg_data[['author_eth_ethnea', 'author_gender_ethnea']] = pd.DataFrame(reg_data['author_eth_gen'].values.tolist(), index = reg_data.index)

In [86]:
# map wiki and census ethnicity
reg_data['author_eth_wiki'] = reg_data.author_name.map(name_eth_wiki)
reg_data['author_eth_census'] = reg_data.author_name.map(name_eth_census)

In [87]:
reg_data['reporter_name'] = reg_data['url'].apply(lambda url: url_reporter[url] if url in url_reporter else 'unknown')

In [88]:
reg_data['reporter_eth_gen'] = reg_data['reporter_name'].apply(lambda name: get_reporter_eth_gen(name))
reg_data[['reporter_eth_ethnea', 'reporter_gender_ethnea']] = pd.DataFrame(reg_data['reporter_eth_gen'].values.tolist(), index = reg_data.index)

In [89]:
# map wiki and census ethnicity
reg_data['reporter_eth_wiki'] = reg_data.reporter_name.map(name_eth_wiki)
reg_data['reporter_eth_census'] = reg_data.reporter_name.map(name_eth_census)

In [90]:
reg_data = reg_data.drop(columns=['last_name_feats', 'author_eth_gen', 'reporter_eth_gen'])

### ethnicity categorization

In [41]:
eth_cat_map = {
 'ISRAELI': 'MiddleEastern',
 'ARAB': 'MiddleEastern',
 'TURKISH': 'MiddleEastern',
 'ENGLISH': 'English',
 'HISPANIC': 'SouthernEuropean',
 'ITALIAN': 'SouthernEuropean',    
 'GREEK': 'SouthernEuropean',
 'GERMAN': 'WesternNorthernEuropean',
 'NORDIC': 'WesternNorthernEuropean',
 'DUTCH': 'WesternNorthernEuropean',
 'FRENCH': 'WesternNorthernEuropean',
 'BALTIC': 'WesternNorthernEuropean',
 'HUNGARIAN': 'EasternEuropean',
 'ROMANIAN': 'EasternEuropean',
 'SLAV': 'EasternEuropean',
 'CHINESE': 'Chinese',
 'INDIAN': 'Indian',
 'AFRICAN': 'African',
 'KOREAN': 'EastAsian',
 'JAPANESE': 'EastAsian',
 'THAI': 'EastAsian',
 'VIETNAMESE': 'EastAsian',
 'INDONESIAN': 'EastAsian',
 'MONGOLIAN': 'EastAsian',
 'CARIBBEAN': 'CARIBBEAN',
 'POLYNESIAN': 'POLYNESIAN',
 'org': 'org',
 'unknown': 'unknown'}
# eastern europe: hungarian, slav, romanian
# western and northern europe: baltic, dutch, german, nordic, french
# south europe: hispanic, italian, greek

In [42]:
wiki_cat_map = {
 'Asian,GreaterEastAsian,EastAsian': 'EastAsian',
 'Asian,GreaterEastAsian,Japanese': 'EastAsian',
 'Asian,IndianSubContinent': 'Indian',
 'GreaterAfrican,Africans': 'African',
 'GreaterAfrican,Muslim': 'MiddleEastern',
 'GreaterEuropean,British': 'English',
 'GreaterEuropean,EastEuropean': 'EasternEuropean',
 'GreaterEuropean,Jewish': 'MiddleEastern',
 'GreaterEuropean,WestEuropean,French': 'WesternNorthernEuropean',
 'GreaterEuropean,WestEuropean,Germanic': 'WesternNorthernEuropean',
 'GreaterEuropean,WestEuropean,Hispanic': 'SouthernEuropean',
 'GreaterEuropean,WestEuropean,Italian': 'SouthernEuropean',
 'GreaterEuropean,WestEuropean,Nordic': 'WesternNorthernEuropean',
#  'solo': 'solo',
 'unknown': 'unknown'}

Map individual eth in `Ethnea` to higher-level categories

In [44]:
reg_data['author_eth_ethnea_broad'] = reg_data['author_eth_ethnea'].map(eth_cat_map)
reg_data['reporter_eth_ethnea_broad'] = reg_data['reporter_eth_ethnea'].map(eth_cat_map)

In [48]:
# Report this in SI - the distribution of eth before dropping data.
reg_data.author_eth_ethnea_broad.value_counts() / len(reg_data)

author_eth_ethnea_broad
English                    0.441768
WesternNorthernEuropean    0.204497
SouthernEuropean           0.104017
Chinese                    0.078783
MiddleEastern              0.048845
Indian                     0.041041
EastAsian                  0.036496
EasternEuropean            0.034164
African                    0.005327
unknown                    0.004926
CARIBBEAN                  0.000129
org                        0.000004
POLYNESIAN                 0.000001
Name: count, dtype: float64

In [55]:
# Report this in SI - the distribution of eth before dropping data.
reg_data.author_eth_ethnea_broad.value_counts()

author_eth_ethnea_broad
English                    594396
WesternNorthernEuropean    275149
SouthernEuropean           139954
Chinese                    106002
MiddleEastern               65720
Indian                      55221
EastAsian                   49105
EasternEuropean             45968
African                      7168
unknown                      6628
CARIBBEAN                     174
org                             6
POLYNESIAN                      2
Name: count, dtype: int64

In [60]:
# f_exp: share of each broad category after excluding the three rarest
# categories, renormalised over the remaining rows.
# The size of the excluded tail is computed from the data instead of being
# copied by hand from a printed value_counts() output, so it cannot go stale.
eth_counts = reg_data.author_eth_ethnea_broad.value_counts()  # sorted, most frequent first
n_rare_rows = int(eth_counts.iloc[-3:].sum())                 # rows in the 3 rarest categories
(eth_counts / (len(reg_data) - n_rare_rows)).tolist()[:-3]

[0.4418279490764589,
 0.20452445568348138,
 0.10403096384404796,
 0.07879367670375102,
 0.04885115783636646,
 0.04104701440782094,
 0.03650085370594606,
 0.03416905087373849,
 0.005328136022079653,
 0.0049267418463091435]

In [61]:
# Number of rows left once the three rarest broad-ethnicity categories are
# excluded (computed from the data rather than from hard-coded counts).
len(reg_data) - int(reg_data.author_eth_ethnea_broad.value_counts().iloc[-3:].sum())

1345311

Map individual ethnicity labels from `Wiki` to higher-level categories

In [189]:
# Collapse the fine-grained Wiki label of both the paper author and the
# news reporter into the broad categories defined by wiki_cat_map.
for broad_col, fine_col in (
    ('author_eth_wiki_broad', 'author_eth_wiki'),
    ('reporter_eth_wiki_broad', 'reporter_eth_wiki'),
):
    reg_data[broad_col] = reg_data[fine_col].map(wiki_cat_map)

In [190]:
reg_data[['doi', 'url', 'author_name', 'author_eth_ethnea', 'author_eth_wiki', 'author_eth_census']].head(5)

Unnamed: 0,doi,url,author_name,author_eth_ethnea,author_eth_wiki,author_eth_census
0,10.1096/fj.14-255240,http://www.eurekalert.org/pub_releases/2014-10...,David B. Dunger,ENGLISH,"GreaterEuropean,British",white
1,10.1096/fj.14-255240,http://www.sciencedaily.com/releases/2014/10/1...,David B. Dunger,ENGLISH,"GreaterEuropean,British",white
2,10.1016/j.neuron.2013.08.030,http://www.sciencedaily.com/releases/2013/11/1...,Roshan Cools,DUTCH,"GreaterEuropean,British",white
3,10.1016/j.neuron.2013.08.030,http://www.sciencedaily.com/releases/2013/11/1...,Hanneke E.M. den Ouden,DUTCH,"GreaterEuropean,British",white
4,10.1016/j.rhm.2016.10.003,http://healthmedicinet.com/news/why-more-and-m...,Karen Lorimer,ENGLISH,"GreaterEuropean,British",white


Add control variables (paper, author, affiliation, journal, outlet and time)

In [91]:
# Look up the paper's publication date by DOI and the story's mention date by
# URL. Plain subscripting is used, so an unknown key raises instead of
# silently producing a missing value.
reg_data['Publication Date'] = reg_data['doi'].apply(dois_date.__getitem__)
reg_data['Mention Date'] = reg_data['url'].apply(urls_date.__getitem__)

In [92]:
# Number of news stories covering each paper.
reg_data['doi_men_cn'] = reg_data['doi'].map(doi_mention_cn)
reg_data['author_rank'] = reg_data['author_id'].apply(get_author_rank)

# Name, category and rank of the author's affiliation(s).
for affi_col, affi_fn in (
    ('affiliation_name', get_affi_name),
    ('affiliation_cate', get_affi_cate),
    ('affiliation_rank', get_affi_rank),
):
    reg_data[affi_col] = reg_data['affiliation_ids'].apply(affi_fn)

# Journal title per paper; an empty title is replaced by 'unknown'.
journal_titles = reg_data['doi'].apply(dois_journal.__getitem__)
reg_data['journal_title'] = journal_titles.where(journal_titles != '', 'unknown')
reg_data['journal_impact'] = reg_data['journal_title'].apply(get_journal_rank)

# The 100 most frequent journal titles, minus the 'unknown' placeholder.
# NOTE(review): get_journal_cate presumably reads top_journals, so this set
# has to exist before the next line runs — confirm.
top_journals = {jname for jname, num in Counter(reg_data['journal_title']).most_common(100) if jname != 'unknown'}
reg_data['top_journal'] = reg_data['journal_title'].apply(get_journal_cate)
reg_data['num_authors'] = reg_data['doi'].map(doi_num_authors)

In [93]:
# Outlet name of each story, with surrounding whitespace removed because
# e.g., 'Harvard Business Review ' was changed to 'Harvard Business Review' in the outlet categorization.
reg_data['outlet'] = reg_data['url'].map(urls_outlet).apply(lambda name: name.strip())
# Outlet category, story length and number of papers mentioned by the story.
reg_data['category'] = reg_data['outlet'].map(outlet_cate)
reg_data['num_words'] = reg_data['url'].apply(get_news_length)
reg_data['num_mentioned_papers'] = reg_data['url'].map(url_mentioned_paper_cn)

In [94]:
# Year of the news story: the first four characters of its date, as an int.
reg_data['mention_year'] = [int(urls_date[url][:4]) for url in reg_data['url']]
# Mean mention year over the rows present at this point.
year_mean = reg_data['mention_year'].mean()
# Row-wise gap computed by get_year_gap (defined elsewhere in the notebook).
reg_data['gap_in_years'] = reg_data.apply(get_year_gap, axis=1)

In [95]:
# Row-wise check of whether the author is mentioned, then conversion of the
# raw result into the dependent-variable label via get_y.
mention_raw = reg_data.apply(check_aut_mentioned, axis=1)
reg_data['is_author_mentioned'] = mention_raw.apply(get_y)

In [97]:
len(reg_data)

1353498

Add abstract readability features (Flesch Reading Ease, sentences per paragraph, type-token ratio)

In [98]:
# Per-DOI readability statistics (tab-separated, first line is the header).
read_df = pd.read_csv(get_path('dois_abstract_readability_stats.tsv'), sep='\t', header=0)

In [99]:
# Keep the DOI plus the three readability measures, drop incomplete rows and
# renumber the remaining rows 0..n-1.
readability_cols = ['doi', 'FleschReadingEase', 'sentences_per_paragraph', 'type_token_ratio']
read_df = read_df[readability_cols].dropna().reset_index(drop=True)

In [100]:
len(read_df)

156679

In [101]:
# Left join on DOI: every reg_data row is kept; papers with no readability
# record get NaN in the three readability columns.
reg_data = reg_data.merge(read_df, how='left', on='doi')

In [102]:
len(reg_data)

1353498

### Get final data for regression (drop missing values)

In [105]:
np.sum(reg_data.isnull(), axis=0)

url                             0
doi                             0
author_id                       0
affiliation_ids                 0
author_seq_num                  0
author_name                     0
author_pos_cate                 0
author_last_name                0
last_name_length                0
last_name_prob                  0
author_eth_ethnea               0
author_gender_ethnea            0
author_eth_wiki                 0
author_eth_census               0
reporter_name                   0
reporter_eth_ethnea             0
reporter_gender_ethnea          0
reporter_eth_wiki               0
reporter_eth_census             0
Publication Date                0
Mention Date                    0
doi_men_cn                      0
author_rank                     0
affiliation_name                0
affiliation_cate                0
affiliation_rank           370716
journal_title                   0
journal_impact             306573
top_journal                     0
num_authors   

In [106]:
# Drop rows that are missing any of the covariates listed below, then
# renumber the rows 0..n-1.
# NOTE(review): the null summary above also reports missing journal_impact
# values, but that column is not part of this subset — presumably on purpose;
# confirm.
required_cols = ['affiliation_rank', 'gap_in_years', 'FleschReadingEase', 'sentences_per_paragraph', 'type_token_ratio']
reg_data = reg_data.dropna(subset=required_cols).reset_index(drop=True)

In [107]:
# Rows labelled 'drop' (single-character last name, or He/She with no proper
# prefix) are removed; the remaining labels are cast to float.
usable_label = reg_data['is_author_mentioned'] != 'drop'
reg_data = reg_data.loc[usable_label].astype({"is_author_mentioned": float})

In [110]:
reg_data.author_eth_ethnea.value_counts()

ENGLISH       234513
GERMAN         47767
CHINESE        43039
HISPANIC       28808
FRENCH         25232
INDIAN         21314
NORDIC         19697
ITALIAN        17992
SLAV           14830
ARAB           14099
DUTCH          13461
JAPANESE       10838
ISRAELI         9064
KOREAN          7312
GREEK           4334
TURKISH         2919
AFRICAN         2774
unknown         2549
HUNGARIAN       1731
ROMANIAN         690
THAI             657
BALTIC           174
VIETNAMESE       174
CARIBBEAN         82
INDONESIAN        81
MONGOLIAN          6
org                1
Name: author_eth_ethnea, dtype: int64

In [121]:
# Broad author-ethnicity categories with more than 500 rows, most frequent first.
broad_counts = reg_data.author_eth_ethnea_broad.value_counts()
top_eth = broad_counts[broad_counts > 500].index.tolist()

In [122]:
top_eth

['English',
 'Western&NorthernEuropean',
 'SouthernEuropean',
 'Chinese',
 'MiddleEastern',
 'Indian',
 'EastAsian',
 'EasternEuropean',
 'African',
 'unknown']

In [123]:
# Keep a row only when both the author's and the reporter's broad category
# are in top_eth.
author_in_top = reg_data['author_eth_ethnea_broad'].isin(top_eth)
reporter_in_top = reg_data['reporter_eth_ethnea_broad'].isin(top_eth)
reg_data = reg_data.loc[author_in_top & reporter_in_top]

In [124]:
reg_data.index = range(len(reg_data))

In [125]:
reg_data.author_eth_ethnea_broad.value_counts()

English                     234510
Western&NorthernEuropean    106331
SouthernEuropean             51134
Chinese                      43039
MiddleEastern                26082
Indian                       21314
EastAsian                    19068
EasternEuropean              17251
African                       2774
unknown                       2549
Name: author_eth_ethnea_broad, dtype: int64

In [126]:
reg_data.reporter_eth_ethnea_broad.value_counts()

unknown                     418187
English                      68652
Western&NorthernEuropean     13790
SouthernEuropean             10594
MiddleEastern                 3494
EasternEuropean               2924
Chinese                       2449
Indian                        2409
EastAsian                      910
African                        643
Name: reporter_eth_ethnea_broad, dtype: int64

In [127]:
# Mention year expressed as an offset from year_mean.
# NOTE(review): year_mean was computed before the missing-value and
# rare-category rows were dropped, so it is not the mean of the rows that
# remain here — confirm this is intended.
reg_data['mention_year_center'] = reg_data['mention_year'] - year_mean

In [128]:
len(reg_data)

524052

In [129]:
np.mean(reg_data.is_author_mentioned)

0.41243235404120204

In [130]:
# Mean of is_author_mentioned within each broad author-ethnicity category.
mention_by_eth = reg_data.groupby('author_eth_ethnea_broad')['is_author_mentioned']
for eth, labels in mention_by_eth:
    print(eth, labels.mean())

African 0.29776496034607064
Chinese 0.42092056042194287
EastAsian 0.4062827774281519
EasternEuropean 0.4580604022955191
English 0.40909982516737026
Indian 0.4358168340058178
MiddleEastern 0.43125527183498197
SouthernEuropean 0.406246333163844
Western&NorthernEuropean 0.4082816864319907
unknown 0.3468026677128286


Add corresponding author variables

In [131]:
# Row-wise flag computed by is_correspond_author (defined elsewhere in the notebook).
reg_data['is_corresponding'] = reg_data.apply(is_correspond_author, axis=1)

In [132]:
test = reg_data[['doi', 'author_name', 'author_pos_cate', 'is_corresponding']].drop_duplicates()

In [133]:
len(test.loc[test['author_pos_cate'] == 'first_position'])

87567

In [134]:
# note that the first/last author of a doi may be dropped due to missing data but the paper may not be dropped.
len(test.loc[test['author_pos_cate'] == 'last_position'])

84708

In [135]:
len(test.loc[test['author_pos_cate'] == 'middle_position'])

9169

In [136]:
len(test.loc[test['author_pos_cate'] == 'solo_author'])

6750

In [137]:
# One-column frame holding each DOI once (first occurrence keeps its row label).
test = reg_data[['doi']].drop_duplicates()

In [138]:
# unique papers
len(test)

100486

In [139]:
# the frac of paper that has CA in WoS, in the final regression data.
np.sum(test.doi.isin(doi_wos_authors))/len(test)

0.8598809784447585

In [140]:
# Corresponding-author position list recorded for the paper in
# doi_wos_authors; papers absent from that mapping get an empty list.
test['wos_correspond_pos'] = test['doi'].map(
    lambda doi: [] if doi not in doi_wos_authors else doi_wos_authors[doi]['ca_pos_list']
)
# Number of authors of each paper.
test['num_authors'] = test['doi'].map(doi_num_authors)

In [141]:
test.head()

Unnamed: 0,doi,wos_correspond_pos,num_authors
0,10.1096/fj.14-255240,[1],12
2,10.1016/j.neuron.2013.08.030,[1],8
4,10.1016/j.rhm.2016.10.003,[1],5
6,10.1177/0963721411408883,[2],2
10,10.1016/j.neuron.2013.08.003,[8],9


Top authors

In [149]:
# The 100 author names attached to the most distinct (doi, author_name) pairs.
papers_per_author = reg_data[['doi', 'author_name']].drop_duplicates()['author_name'].value_counts()
top_authors = papers_per_author.nlargest(100).index.tolist()

In [151]:
# 'yes' when the author's name is one of top_authors, otherwise 'no'.
# A vectorised isin replaces the per-row Python scan of the top_authors list;
# the resulting labels are the same.
reg_data['is_top_author'] = reg_data['author_name'].isin(top_authors).map({True: 'yes', False: 'no'})

Save the data at this point so that the direct-quote / indirect-mention variables can be computed from it

In [430]:
# reg_data.to_csv(data_root+"reg_data_plot.csv", index=False, header=True, encoding='utf-8')

Add quote/indirect dependent variables

In [152]:
tem_df = pd.read_csv(data_root+"crawl_news/has_author_quotes.tsv", sep='\t', header=0)

In [153]:
tem_df.columns

Index(['url', 'has_author_quote'], dtype='object')

In [154]:
len(tem_df)

524052

In [155]:
# Put the direct-quote flag in front of the existing columns.
# NOTE(review): rows are matched to tem_df through the row index only; the
# 'url' column of tem_df is never compared with reg_data['url'] — confirm
# both are in the same order.
reg_data.insert(loc=0, column='has_author_quote', value=tem_df['has_author_quote'])

In [156]:
tem_df = pd.read_csv(data_root+"crawl_news/has_indirect_quote.csv", header=0)

In [157]:
tem_df.columns

Index(['url', 'has_indirect_mention', 'mentions_author_institution'], dtype='object')

In [158]:
# Put the institution-mention flag in front of the existing columns.
# NOTE(review): rows are matched to tem_df through the row index only; neither
# the length nor the 'url' column of tem_df is checked against reg_data —
# confirm both are in the same order.
reg_data.insert(loc=0, column='mentions_author_institution', value=tem_df['mentions_author_institution'])

In [159]:
# Put the indirect-mention flag in front of the existing columns.
# NOTE(review): rows are matched to tem_df through the row index only; neither
# the length nor the 'url' column of tem_df is checked against reg_data —
# confirm both are in the same order.
reg_data.insert(loc=0, column='has_indirect_mention', value=tem_df['has_indirect_mention'])

Paper keywords

In [160]:
mag_dir = '/shared/0/datasets/mag/raw_data/'

def yield_one_line_mag(filename, delimiter=',', quoting = csv.QUOTE_ALL):
    '''a generator which produces one parsed row of a given file at a time

    filename  -- path of the delimited text file to read.
    delimiter -- field separator handed to csv.reader.
    quoting   -- csv quoting mode handed to csv.reader.

    Each row is yielded as the list of strings produced by csv.reader. A
    progress line is printed after every 10,000,000 rows.
    '''
    # newline='' is what the csv module requires of the file object, so that
    # line breaks inside quoted fields reach the parser untouched.
    # The explicit encoding stops non-ASCII field names (e.g. 'Spin-½') from
    # depending on the machine's locale.
    # NOTE(review): assumes the input files are UTF-8 — confirm.
    with open(filename, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter=delimiter, quoting=quoting)
        for count, row in enumerate(reader, start=1):
            if count % 10000000 == 0:
                print('processed %d lines...' % (count))
            yield row

def get_field_vector(doi):
    '''Return the field scores of a paper as a list aligned with top_fields.

    The list has one slot per entry of top_fields, initialised to 0. For every
    (field, score) pair stored in dois_disc for this doi, the score is written
    to the slot given by field_ix; fields absent from field_ix are ignored.
    A doi that is not in dois_disc yields the all-zero list.
    '''
    scores = [0] * len(top_fields)
    for fid, weight in dois_disc.get(doi, ()):
        slot = field_ix.get(fid)
        if slot is not None:
            scores[slot] = weight
    return scores

def clean_field_name(name):
    '''Turn a field display name into an underscore-separated column name.

    'Spin-½' is special-cased to 'Spin_half'. For any other name, each '(',
    ')', '-', space and apostrophe becomes '_' and every other character is
    kept as it is.
    '''
    if name == 'Spin-½':
        return 'Spin_half'
    to_underscore = str.maketrans("()- '", "_____")
    return name.translate(to_underscore)

In [161]:
# Field id -> (display name, level), read from the tab-separated
# FieldsOfStudy file. Every row is unpacked into exactly nine fields, so a
# malformed row raises rather than being skipped.
field_id2name_level = {
    fid: (dname, level)
    for fid, rank, nname, dname, mtype, level, pc, cc, cdate
    in yield_one_line_mag(mag_dir+'FieldsOfStudy.txt', delimiter='\t', quoting=csv.QUOTE_NONE)
}

In [163]:
# DOI -> list of [field id, score] pairs, read from a file holding one JSON
# object (with 'doi' and 'fields' keys) per line.
with open(data_root+'dois_fields_mag.json', 'r') as fields_file:
    records = (json.loads(line) for line in fields_file)
    dois_disc = {rec['doi']: rec['fields'] for rec in records}

In [164]:
dois_disc['10.1001/2012.jama.11132']

[['2780320433', 0.526666343],
 ['2779668308', 0.6762392],
 ['2780221984', 0.5918762],
 ['511355011', 0.544381559],
 ['2777391703', 0.6243656],
 ['555293320', 0.591008365],
 ['2910068830', 0.600785553],
 ['141071460', 0.387458026],
 ['2777180221', 0.6201425],
 ['71924100', 0.410550684]]

In [165]:
# Count, for every field, how many distinct papers in reg_data carry it, and
# keep the fields attached to at least 500 papers.
field_cn = defaultdict(int)

# Papers with no entry in dois_disc.
cn_not = 0
# Iterate the DOIs in order of first appearance. Looping over a set of
# strings instead makes the insertion order of field_cn, and therefore the
# order of top_fields and of the columns built from it, change from one
# kernel session to the next. The counts themselves are unaffected.
for doi in reg_data.doi.unique():
    if doi in dois_disc:
        for field, score in dois_disc[doi]:
            field_cn[field] += 1
    else:
        cn_not += 1

top_fields = [field for field, cn in field_cn.items() if cn >= 500]

In [166]:
# only 8 papers do not have fields in MAG.
cn_not

8

In [167]:
len(top_fields)

199

In [168]:
# Position of each top field inside the per-paper field vector.
field_ix = dict(zip(top_fields, range(len(top_fields))))
# Display name of each top field, in the same order as top_fields.
top_fields_colnames = [dname for dname, level in map(field_id2name_level.__getitem__, top_fields)]
# Display name -> cleaned column name.
field_name_clean_map = dict(zip(top_fields_colnames, map(clean_field_name, top_fields_colnames)))

In [169]:
'", "'.join(field_name_clean_map.values())

'Social_psychology", "Psychology", "Astronomy", "Astrophysics", "Physics", "Population", "Pedagogy", "Cohort_study", "Computer_science", "Public_relations", "Business", "Ecology", "Predation", "Biology", "Environmental_resource_management", "Biodiversity", "Geomorphology", "Geology", "Atmospheric_sciences", "Personality", "Drug", "Alternative_medicine", "Psychological_intervention", "Medicine", "Habitat", "Ecosystem", "Mood", "Cognition", "Health_care", "Endocrinology", "Disease", "Internal_medicine", "Diabetes_mellitus", "Family_medicine", "Political_science", "Biochemistry", "Molecular_biology", "Breast_cancer", "Phenotype", "Cancer_research", "Cancer", "Immune_system", "In_vivo", "Oncology", "Chemotherapy", "Ethnic_group", "Environmental_health", "Cross_sectional_study", "Logistic_regression", "Odds_ratio", "Public_health", "Relative_risk", "Risk_factor", "Stroke", "Pathology", "Phenomenon", "Geography", "Chemistry", "Botany", "Nanotechnology", "Materials_science", "Neuroscience", "

In [170]:
' + '.join(field_name_clean_map.values())

'Social_psychology + Psychology + Astronomy + Astrophysics + Physics + Population + Pedagogy + Cohort_study + Computer_science + Public_relations + Business + Ecology + Predation + Biology + Environmental_resource_management + Biodiversity + Geomorphology + Geology + Atmospheric_sciences + Personality + Drug + Alternative_medicine + Psychological_intervention + Medicine + Habitat + Ecosystem + Mood + Cognition + Health_care + Endocrinology + Disease + Internal_medicine + Diabetes_mellitus + Family_medicine + Political_science + Biochemistry + Molecular_biology + Breast_cancer + Phenotype + Cancer_research + Cancer + Immune_system + In_vivo + Oncology + Chemotherapy + Ethnic_group + Environmental_health + Cross_sectional_study + Logistic_regression + Odds_ratio + Public_health + Relative_risk + Risk_factor + Stroke + Pathology + Phenomenon + Geography + Chemistry + Botany + Nanotechnology + Materials_science + Neuroscience + Offspring + Receptor + Surgery + Demography + Socioeconomic_st

In [171]:
reg_data['field_vector'] = reg_data['doi'].apply(get_field_vector)

In [172]:
# Spread each per-paper field vector over one column per top field, keeping
# the rows aligned with reg_data through its index.
field_matrix = pd.DataFrame(reg_data['field_vector'].tolist(), index=reg_data.index)
reg_data[top_fields_colnames] = field_matrix

In [173]:
reg_data = reg_data.drop(columns=['field_vector'])

In [174]:
reg_data = reg_data.rename(columns=field_name_clean_map)

In [191]:
len(reg_data)

524052

In [192]:
# Write the full regression table (with header, without the row index).
reg_data.to_csv(get_path("reg_data.csv"), index=False, header=True, encoding='utf-8')
# To reload it later:
# reg_data = pd.read_csv(data_root+"reg_data.csv", header=0)

Save a copy for plotting purposes

In [193]:
# Same table without the per-field columns, saved for plotting.
field_columns = list(field_name_clean_map.values())
reg_data_without_fields = reg_data.drop(columns=field_columns)
reg_data_without_fields.to_csv(get_path("reg_data_plot.csv"), index=False, header=True, encoding='utf-8')
# To reload it later:
# reg_data_plot = pd.read_csv(data_root+"reg_data_plot.csv", header=0)

Show Black names (vs. Ethnea and Wiki)

In [546]:
# Distinct authors whose Census label is 'black', with their Wiki and Ethnea
# broad labels alongside.
name_label_cols = ['author_name', 'author_eth_census', 'author_eth_wiki_broad', 'author_eth_ethnea_broad']
is_census_black = reg_data['author_eth_census'] == 'black'
tem = reg_data.loc[is_census_black, name_label_cols].drop_duplicates()

In [548]:
len(tem)

892

In [33]:
# Print ten sampled rows as LaTeX table lines: name & census & ethnea & wiki.
# The sample is seeded (same seed as the African-name sample further down) so
# that re-running the cell reproduces the same table.
subdf = tem.sample(10, random_state = 10)
for name, census, ethnea, wiki in zip(subdf.author_name, subdf.author_eth_census, subdf.author_eth_ethnea_broad, subdf.author_eth_wiki_broad):
    print(name, ' & ', census, ' & ', ethnea, ' & ', wiki, '\\\\')

E. Robinson  &  black  &  English  &  English \\
Momar Ndao  &  black  &  RomanceLanguage  &  African \\
Angela F Harris  &  black  &  English  &  English \\
Daddy Mata-Mbemba  &  black  &  RomanceLanguage  &  African \\
A Bolu Ajiboye  &  black  &  African  &  African \\
Lasana T. Harris  &  black  &  English  &  English \\
John M. Harris  &  black  &  English  &  English \\
Edwin S Robinson  &  black  &  English  &  English \\
Eric A. Coleman  &  black  &  English  &  English \\
Mp Coleman  &  black  &  English  &  English \\


Show African names (vs. Census and Wiki)

In [549]:
# Distinct authors whose Ethnea broad label is 'African', with their Census
# and Wiki labels alongside.
name_label_cols = ['author_name', 'author_eth_census', 'author_eth_wiki_broad', 'author_eth_ethnea_broad']
is_ethnea_african = reg_data['author_eth_ethnea_broad'] == 'African'
tem = reg_data.loc[is_ethnea_african, name_label_cols].drop_duplicates()

In [551]:
len(tem)

908

In [25]:
# Print ten sampled rows as LaTeX table lines: name & ethnea & census & wiki.
# The columns read here are the ones tem actually has (author_name,
# author_eth_ethnea_broad, author_eth_census, author_eth_wiki_broad); the
# previous attribute names do not exist in tem and raised AttributeError.
subdf = tem.sample(10, random_state = 10)
for name, ethnea, census, wiki in zip(subdf.author_name, subdf.author_eth_ethnea_broad, subdf.author_eth_census, subdf.author_eth_wiki_broad):
    print(name, ' & ', ethnea, ' & ', census, ' & ', wiki, '\\\\')

Alana Lelo  &  African  &  white  &  RomanceLanguage \\
Samuel Lawn  &  African  &  white  &  English \\
Saka S Ajibola  &  African  &  black  &  EastAsian \\
Mosi Adesina Ifatunji  &  African  &  black  &  African \\
Sebastian Giwa  &  African  &  white  &  African \\
Olabisi Oduwole  &  African  &  white  &  African \\
Chidi N. Obasi  &  African  &  white  &  African \\
Habauka M. Kwaambwa  &  African  &  api  &  African \\
Esther E Omaiye  &  African  &  white  &  African \\
Aurel T. Tankeu  &  African  &  white  &  English \\
