In [1]:
import pandas as pd
import requests
import numpy as np
import re
from collections import Counter

In [3]:
author = pd.read_csv('../data/interim/ica_author_data.csv')
author.shape

(13603, 12)

In [5]:
author.head(2)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,1.0,"Annenberg School for Communication, University..."
1,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Sandra González-Bailón,Sandra,González-Bailón,2.0,2.0,"Annenberg School for Communication, University..."


In [6]:
firstname = list(set(author.firstName))

In [7]:
firstname = [x for x in firstname if str(x) != 'nan']
len(firstname)

2886

In [8]:
from genderize import Genderize
API_KEY = '486f33235ec473d0d2ceef6273dca789'
genderize = Genderize(
    user_agent='GenderizeDocs/0.0',
    api_key=API_KEY,
    timeout=5.0)
name_gender_results = genderize.get(firstname)

In [9]:
firstname_df = pd.DataFrame(name_gender_results)
firstname_df.columns = ['firstName', 'genderize', 'genderize_prob', 'genderize_basedon']
firstname_df.sample(5)

Unnamed: 0,firstName,genderize,genderize_prob,genderize_basedon
1640,Peng,male,0.84,1286
1809,Marjolein,female,0.99,1828
2440,Eduardo,male,0.99,43964
1936,Tomi,male,0.95,7611
2191,Elisenda,female,0.98,386


In [350]:
# predictions with low accuracy and low basedons 
low_accuracy_names_df = firstname_df[
    (firstname_df.genderize_basedon.astype(int) <= 2000) | 
    (firstname_df.genderize_prob.astype(float) <= 0.80)
]
low_accuracy_names = low_accuracy_names_df.firstName.tolist()
len(low_accuracy_names)

1727

In [11]:
basedons = np.array(firstname_df.genderize_basedon.astype(int))
np.mean(basedons)

13460.174636174635

In [12]:
np.std(basedons)

36153.385559411705

In [13]:
np.median(basedons)

1288.0

In [22]:
firstname_df.head(2)

Unnamed: 0,firstName,genderize,genderize_prob,genderize_basedon
0,Clark,male,0.98,2533
1,Phillipa,female,0.97,124


In [23]:
author.head(2)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,1.0,"Annenberg School for Communication, University..."
1,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Sandra González-Bailón,Sandra,González-Bailón,2.0,2.0,"Annenberg School for Communication, University..."


## Merging

In [24]:
authorWithGender = author.merge(firstname_df, on = "firstName", how = "left")

In [25]:
authorWithGender['genderAccuracy'] = np.where(
    authorWithGender.firstName.isin(low_accuracy_names),
    'Low',
    'High'
)

In [26]:
authorWithGender.head(2)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation,genderize,genderize_prob,genderize_basedon,genderAccuracy
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,1.0,"Annenberg School for Communication, University...",female,0.99,89728.0,High
1,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Sandra González-Bailón,Sandra,González-Bailón,2.0,2.0,"Annenberg School for Communication, University...",female,0.98,266121.0,High


In [27]:
authorWithGender[authorWithGender.genderAccuracy=='Low'].shape

(3412, 16)

## Race

In [28]:
from ethnicolr import census_ln, pred_census_ln

In [29]:
race_pred = pred_census_ln(authorWithGender, 'lastName', year=2010)

2022-07-23 15:06:23.306625: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [30]:
race_pred.head(2)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,...,affiliation,genderize,genderize_prob,genderize_basedon,genderAccuracy,api,black,hispanic,white,race
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,...,"Annenberg School for Communication, University...",female,0.99,89728.0,High,0.007762,0.066429,0.030049,0.89576,white
1,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,Journal of Communication,2022-02-16,Sandra González-Bailón,Sandra,González-Bailón,2.0,...,"Annenberg School for Communication, University...",female,0.98,266121.0,High,0.010406,0.0003,0.979183,0.010111,hispanic


In [31]:
race_pred_sub = race_pred[['api', 'black', 'hispanic', 'white']]

In [32]:
race_pred_sub.head()

Unnamed: 0,api,black,hispanic,white
0,0.007762,0.066429,0.030049,0.89576
1,0.010406,0.0003,0.979183,0.010111
2,0.007112,0.088763,0.031022,0.873102
3,0.004433,0.043421,0.024482,0.927664
4,0.101002,0.049881,0.185628,0.663489


In [33]:
race_pred_sub.iloc[1,:].nlargest(2).values

array([0.9791829 , 0.01040568], dtype=float32)

In [35]:
# axis = 1 means row-wise
raceHighest = race_pred_sub.apply(lambda row: row.nlargest(2).values[0], axis = 1)

In [36]:
raceSecondHighest = race_pred_sub.apply(lambda row: row.nlargest(2).values[-1], axis = 1)

In [37]:
raceDiff = raceHighest - raceSecondHighest

In [38]:
raceDiff[1:10]

1    0.968777
2    0.784340
3    0.884243
4    0.477861
5    0.579995
6    0.955232
7    0.220020
8    0.011876
9    0.861246
dtype: float32

In [39]:
race_pred['raceHighest'] = raceHighest
race_pred['raceSecondHighest'] = raceSecondHighest
race_pred['raceDiff'] = raceDiff

In [40]:
race_pred['racePredAccuracy'] = np.where(
    (race_pred['raceHighest'] >= 0.80) & (race_pred['raceDiff'] >= 0.30),
    'High',
    'Low'
)

In [41]:
race_pred[race_pred.racePredAccuracy == 'Low'].shape

(4666, 25)

In [43]:
race_pred.sample(10)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,...,genderAccuracy,api,black,hispanic,white,race,raceHighest,raceSecondHighest,raceDiff,racePredAccuracy
11807,10.1111/j.1083-6101.2009.01448.x,https://academic.oup.com/jcmc/article/14/3/435...,2009,Blogs of Information: How Gender Cues and Indi...,Journal of Computer-Mediated Communication,2009-04-01,Cory L. Armstrong,Cory,Armstrong,2.0,...,Low,0.00227,0.216518,0.032266,0.748946,white,0.748946,0.216518,0.532428,Low
10256,10.1111/j.1468-2885.2004.tb00315.x,https://academic.oup.com/ct/article/14/4/285/4...,2004,Exploring the Concept of Media Enjoyment: An I...,Communication Theory,2006-01-10,Mary Beth Oliver,Mary,Oliver,2.0,...,High,0.003106,0.280178,0.022664,0.694052,white,0.694052,0.280178,0.413873,Low
9727,10.1093/ct/qty018,https://academic.oup.com/ct/article/28/4/466/5...,2018,A Normative Perspective for Political Entertai...,Communication Theory,2018-08-04,Carina Weinmann,Carina,Weinmann,2.0,...,High,0.003292,0.005935,0.031681,0.959092,white,0.959092,0.031681,0.927411,High
11380,10.1111/jcc4.12045,https://academic.oup.com/jcmc/article/19/3/546...,2014,An Analysis of Language in University Students...,Journal of Computer-Mediated Communication,2014-04-01,James Hanney,James,Hanney,5.0,...,High,0.008089,0.057466,0.017947,0.916498,white,0.916498,0.057466,0.859032,High
9242,10.1111/j.1468-2958.1979.tb00649.x,https://academic.oup.com/hcr/article/5/4/355/4...,1979,Commonality Analysis: A Method for Decomposing...,Human Communication Research,2006-03-17,Robert D. McPhee,Robert,Mcphee,2.0,...,High,0.006704,0.127231,0.027354,0.838711,white,0.838711,0.127231,0.71148,High
2823,10.1111/j.1460-2466.2002.tb02540.x,https://academic.oup.com/joc/article/52/1/211/...,2002,The Framing of Feminists and Feminism in News ...,Journal of Communication,2006-01-10,Rebecca Ann Lind,Rebecca,Lind,2.0,...,High,0.008973,0.049541,0.010163,0.931323,white,0.931323,0.049541,0.881781,High
334,10.1093/joc/jqaa025,https://academic.oup.com/joc/article/70/5/721/...,2020,Instagram Inspiration: How Upward Comparison o...,Journal of Communication,2020-09-03,Sophie Börner,Sophie,Börner,4.0,...,High,0.009622,0.197992,0.024429,0.767956,white,0.767956,0.197992,0.569964,Low
3110,10.1111/j.1460-2466.1997.tb02728.x,https://academic.oup.com/joc/article/47/4/102/...,1997,Political Correctness and Academic Principles:...,Journal of Communication,2006-02-07,Hans Mathias Kepplinger,Hans,Kepplinger,1.0,...,High,0.008628,0.028638,0.018324,0.94441,white,0.94441,0.028638,0.915772,High
12953,10.1093/ccc/tcaa023,https://academic.oup.com/ccc/article/13/4/468/...,2020,Love is an Emergency Savings Fund: Suze Orman’...,"Communication, Culture and Critique",2020-10-22,Diane L Cormany,Diane,Cormany,1.0,...,High,0.007378,0.088317,0.029361,0.874943,white,0.874943,0.088317,0.786626,High
4141,10.1111/j.1460-2466.1984.tb02183.x,https://academic.oup.com/joc/article/34/3/174/...,1984,Books,Journal of Communication,2006-02-07,,,,,...,High,0.461375,0.068442,0.085889,0.384294,api,0.461375,0.384294,0.077081,Low


## Affiliations

In [44]:
import re
import random

In [64]:
affs = race_pred.affiliation
# deduplicate
affs = list(set(affs))
# remove nan
affs = [x for x in affs if str(x) != 'nan']
len(affs)

9642

In [65]:
affs = [aff.lower() for aff in affs]
# delete anything between ()
affs = [re.sub(r'\([^)]*\)', '', aff) for aff in affs]
# delete the number at the start of the string
affs = [re.sub(r"^[0-9]*\.?[0-9]+", '', aff) for aff in affs]
affs = [aff.strip() for aff in affs]

In [68]:
affs[1:5]

['alexander repenning is a research assistant professor and member of the center for lifelong learning and design at the university of colorado in boulder. he has worked in research and development at asea brown boveri, xerox parc, and hewlett packard. repenning has also been a consultant for apple computer inc. his research interests include education and computers, end-user programming, interactive learning and simulation environments, human-computer interaction, and artificial intelligence. repenning received his ph.d. in computer science and the certificate of cognitive science from the university of colorado in 1993. repenning is a member of acm  and ieee.',
 "ph.d. candidate in communication at cornell university. she received her b.a. in political science, m.l.i.s., and m.a. in speech from the university of hawaii at manoa. her research focuses on the ways that people's interpersonal and intercultural experiences shape the processing of media messages. in addition to her interes

### ROR datasets

ROR Data and build a dictionary

In [69]:
import json

In [86]:
with open('../data/raw/large/ror.json', 'r') as myfile:
    data=myfile.read()
data = json.loads(data)

In [300]:
ror_name_id_dic = {}
for i in data:
    affname = i['name'].lower()
    ror_name_id_dic[affname] = i['id']

In [301]:
ror_name_id_dic

{'australian national university': 'https://ror.org/019wvm592',
 'monash university': 'https://ror.org/02bfwt286',
 'university of queensland': 'https://ror.org/00rqy9422',
 'macquarie university': 'https://ror.org/01sf06y89',
 'unsw sydney': 'https://ror.org/03r8z3t63',
 'newcastle university': 'https://ror.org/01kj2bm70',
 'university of wollongong': 'https://ror.org/00jtmb277',
 'university of melbourne': 'https://ror.org/01ej9dk98',
 'university of tasmania': 'https://ror.org/01nfmeh72',
 'university of adelaide': 'https://ror.org/00892tw58',
 'james cook university': 'https://ror.org/04gsp2c11',
 'university of sydney': 'https://ror.org/0384j8v12',
 'flinders university': 'https://ror.org/01kpzv902',
 'rmit university': 'https://ror.org/04ttjf776',
 'la trobe university': 'https://ror.org/01rxfrp27',
 'victoria university': 'https://ror.org/01rrz9s51',
 'university of new england': 'https://ror.org/02n2ava60',
 'deakin university': 'https://ror.org/02czsnj07',
 'griffith universit

In [299]:
data[0]

{'id': 'https://ror.org/019wvm592',
 'name': 'Australian National University',
 'types': ['Education'],
 'links': ['http://www.anu.edu.au/'],
 'aliases': [],
 'acronyms': ['ANU'],
 'status': 'active',
 'wikipedia_url': 'http://en.wikipedia.org/wiki/Australian_National_University',
 'labels': [],
 'email_address': None,
 'ip_addresses': [],
 'established': 1946,
 'country': {'country_code': 'AU', 'country_name': 'Australia'},
 'relationships': [{'type': 'Related',
   'label': 'Calvary Hospital',
   'id': 'https://ror.org/041c7s516'},
  {'type': 'Related',
   'label': 'Canberra Hospital',
   'id': 'https://ror.org/04h7nbn38'},
  {'type': 'Related',
   'label': 'Goulburn Base Hospital',
   'id': 'https://ror.org/030jpqj15'},
  {'type': 'Child',
   'label': 'ARC Centre of Excellence for Transformative Meta-Optical Systems',
   'id': 'https://ror.org/05sh7tb37'},
  {'type': 'Child',
   'label': 'ARC Centre of Excellence in Plant Energy Biology',
   'id': 'https://ror.org/01a1mq059'},
  {'ty

In [331]:
target_str = ['university', 
              'school',
              'college', 
              "universität", 
              "université", 
              "inc.", 
              "company", 
              'coorporation',
              'institute',
              'center',
              'centre',
             ]

In [346]:
ror_affnames = []
for i in data:
    affname = i['name'].lower()
    if any(x in affname for x in target_str):
        ror_affnames.append(affname)
ror_affnames.remove('he university')
ror_affnames.remove('french institute for research in computer science and automation')
ror_affnames.remove('australian national university')
ror_affnames.remove('monash university')

In [196]:
len(ror_affnames)

24072

In [120]:
ror_affnames[455:460]

['university of jordan',
 'university of pecs',
 'university of jyväskylä',
 'kanazawa university',
 'kasetsart university']

I concluded that it's better to use absolute match first before I use fuzzy match

In [339]:
from rapidfuzz import process, fuzz, string_metric
from rapidfuzz.fuzz import ratio
from rapidfuzz.string_metric import levenshtein, normalized_levenshtein
random_aff = random.choice(affs)
random_aff

'l. theresa silverman is an assistant professor in the department of journalism and mass communications, new york university'

In [340]:
process.extractOne(random_aff, ror_affnames, scorer=fuzz.WRatio)

('new york university', 90.0, 1396)

In [287]:
matched_dic = {}
matched = 0
failed = 0
failed_list = []
for aff in affs:
    for x in ror_affnames:
        if x in aff:
            matched += 1
            matched_dic[aff] = x
            break

In [288]:
len(affs)

9642

In [289]:
matched

6926

In [292]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.DataFrame(matched_dic.items()).sample(10)

Unnamed: 0,0,1
32,rose k. goldsen is an associate professor of sociology and azriel bibliowicz is a doctoral candidate in sociology at cornell university,cornell university
1457,"department of communication, johannes gutenberg university of mainz , mainz, germany",johannes gutenberg university of mainz
211,"associate professor, school of nursing, university of washington. email: gdemiris@u.washington.edu",university of washington
1400,"graduate school of interdisciplinary information studies, the university of tokyo, 7-3-1 hongo, bunkyo-ku, tokyo, 113-0033, japan",university of tokyo
52,"institute for media and communication studies, university of mannheim, mannheim, 68131, germany",university of mannheim
453,school of communication &amp; information nanyang technological university,nanyang technological university
6273,"annenberg school for communication and journalism , university of southern california, los angeles, ca, usa",university of southern california
2427,london school of economics and political science,london school of economics and political science
3327,dr. jackson is professor of psychology and director of the graduate training program in social psychology at the university of kansas.he was formerly assistant program director at the university of michigan's institute for social research,university of kansas
3373,"center for the management of information, university of arizona, tucson, az, 85721, usa,",university of arizona


In [293]:
failed_list = [x for x in affs if x not in matched_dic.keys()]

In [294]:
len(failed_list)

2716

In [341]:
fuzzy_matched_dic = {}
for aff in failed_list:
    match = process.extractOne(aff, ror_affnames, scorer=fuzz.WRatio)[0]
    fuzzy_matched_dic[aff] = match

KeyboardInterrupt: 

In [348]:
pd.DataFrame(fuzzy_matched_dic.items()).sample(10)

Unnamed: 0,0,1
997,"department of marketing, university of texas, san antonio, tx 78249",monash university
1027,"john g. stewart is a consultant to the joint committee on congressional operations of the u.s. congress. this article is excerpted from a study, “congress and mass communications: an institutional perspective,” conducted for the joint committee on governmental operations by the congressional research service, library of congress",walter and eliza hall institute of medical research
782,university of colorado at boulder,university of colorado boulder
796,"department of communication, university of mainz , 55099 mainz , germany",monash university
448,gerald l. rous is an instructor of sociology at the university of wisconsin at stevens point,monash university
159,"tapio varis is a researcher at the institute of journalism and mass communication of the university of tampere, tampere, finland. this article is a brief summary of some of the findings of an ongoing research project. a more extensive presentation can be found in unesco's reports and papers on mass communication, no. 71,”television traffic: a one-way street”",walter and eliza hall institute of medical research
786,"joseph turow is assistant professor of communication, purdue university. the research for this study was funded by the purdue research foundation. the author would like to acknowledge the kind help of robert lewis shayon in getting the project off the ground and thank the many people from the tv industry who were generous with their time and their comments",walter and eliza hall institute of medical research
812,"james p. dillard is assistant professor of communication at the university of wisconsin, madison",university of wisconsin–madison
1303,joel r. davitz is an associate professor in the department of psychological foundations and services,university of queensland
725,is professor and director of graduate studies in the school of journalism and mass communication,university of queensland
