In [65]:
import pandas as pd
import requests
import numpy as np
import re
from collections import Counter
import random

In [66]:
race_pred = pd.read_csv('../data/interim/author_with_pred.csv')
race_pred.shape

(13603, 25)

In [67]:
race_pred.head(2)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation,genderize,genderize_prob,genderize_basedon,genderAccuracy,api,black,hispanic,white,race,raceHighest,raceSecondHighest,raceDiff,racePredAccuracy
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/6529454,2022,The Gender Divide in Wikipedia: Quantifying and Assessing the Impact of Two Feminist Interventions,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,1.0,"Annenberg School for Communication, University of Pennsylvania , Philadelphia, PA 19104, USA",female,0.99,89728.0,High,0.007762,0.066429,0.030049,0.89576,white,0.89576,0.066429,0.829331,High
1,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/6529454,2022,The Gender Divide in Wikipedia: Quantifying and Assessing the Impact of Two Feminist Interventions,Journal of Communication,2022-02-16,Sandra González-Bailón,Sandra,González-Bailón,2.0,2.0,"Annenberg School for Communication, University of Pennsylvania , Philadelphia, PA 19104, USA",female,0.98,266121.0,High,0.010406,0.0003,0.979183,0.010111,hispanic,0.979183,0.010406,0.968777,High


## Affiliations

In [68]:
def notNaN(num):
    return num == num

In [69]:
notNaN(1)

True

In [70]:
def process_affiliation_text(aff):
    if notNaN(aff):
        aff = aff.lower()
        # remove anything other than characters
        aff = re.sub('[^a-z ]+', ' ', aff)
        aff = ' '.join(aff.split())
        # delete anything between ()
        aff = re.sub(r'\([^)]*\)', '', aff)
        return aff
    else:
        return np.nan

In [71]:
random_aff = random.choice(race_pred.affiliation)
random_aff

'1 School of Information and Library Science and Department of Sociology, University of North Carolina at Chapel Hill, Chapel Hill, NC 27516, USA'

In [72]:
process_affiliation_text(random_aff)

'school of information and library science and department of sociology university of north carolina at chapel hill chapel hill nc usa'

In [73]:
race_pred['affProcessed'] = [process_affiliation_text(aff) for aff in race_pred.affiliation]

In [75]:
race_pred.head(1)

Unnamed: 0,doi,url,year,title,journal,datePublished,authorFullName,firstName,lastName,numberOfAuthors,authorPosition,affiliation,genderize,genderize_prob,genderize_basedon,genderAccuracy,api,black,hispanic,white,race,raceHighest,raceSecondHighest,raceDiff,racePredAccuracy,affProcessed
0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/6529454,2022,The Gender Divide in Wikipedia: Quantifying and Assessing the Impact of Two Feminist Interventions,Journal of Communication,2022-02-16,Isabelle Langrock,Isabelle,Langrock,2.0,1.0,"Annenberg School for Communication, University of Pennsylvania , Philadelphia, PA 19104, USA",female,0.99,89728.0,High,0.007762,0.066429,0.030049,0.89576,white,0.89576,0.066429,0.829331,High,annenberg school for communication university of pennsylvania philadelphia pa usa


In [76]:
affs = race_pred.affProcessed
# deduplicate
affs = list(set(affs))
# remove nan
affs = [x for x in affs if str(x) != 'nan']
len(affs)

8443

In [77]:
# affs = [aff.lower() for aff in affs]
# # delete anything between ()
# affs = [re.sub(r'\([^)]*\)', '', aff) for aff in affs]
# # delete the number at the start of the string
# affs = [re.sub(r"^[0-9]*\.?[0-9]+", '', aff) for aff in affs]
# affs = [aff.strip() for aff in affs]

In [78]:
affs[1:5]

['chairman of speech correction university of georgia athens georgia',
 'associate professor in the department of communication miami university oxford oh james b stiff is an associate professor in the department of communication arizona state university tempe az',
 'mr kauffman is the director of research at hawthorn center',
 'department of journalism media and communication university of gothenburg seminariegatan b gothenburg sweden']

### ROR datasets

ROR Data and build a dictionary

In [79]:
import json

In [80]:
with open('../data/raw/large/ror.json', 'r') as myfile:
    data=myfile.read()
data = json.loads(data)

In [81]:
ror_name_id_dic = {}
for i in data:
    affname = i['name'].lower()
    ror_name_id_dic[affname] = i['id']

In [82]:
# ror_name_id_dic

In [83]:
data[0]

{'id': 'https://ror.org/019wvm592',
 'name': 'Australian National University',
 'types': ['Education'],
 'links': ['http://www.anu.edu.au/'],
 'aliases': [],
 'acronyms': ['ANU'],
 'status': 'active',
 'wikipedia_url': 'http://en.wikipedia.org/wiki/Australian_National_University',
 'labels': [],
 'email_address': None,
 'ip_addresses': [],
 'established': 1946,
 'country': {'country_code': 'AU', 'country_name': 'Australia'},
 'relationships': [{'type': 'Related',
   'label': 'Calvary Hospital',
   'id': 'https://ror.org/041c7s516'},
  {'type': 'Related',
   'label': 'Canberra Hospital',
   'id': 'https://ror.org/04h7nbn38'},
  {'type': 'Related',
   'label': 'Goulburn Base Hospital',
   'id': 'https://ror.org/030jpqj15'},
  {'type': 'Child',
   'label': 'ARC Centre of Excellence for Transformative Meta-Optical Systems',
   'id': 'https://ror.org/05sh7tb37'},
  {'type': 'Child',
   'label': 'ARC Centre of Excellence in Plant Energy Biology',
   'id': 'https://ror.org/01a1mq059'},
  {'ty

In [84]:
target_str = ['university', 
              'school',
              'college', 
              "universität", 
              "université", 
              "inc.", 
              "company", 
              'coorporation',
              'institute',
              'center',
              'centre',
             ]

In [85]:
ror_affnames = []
for i in data:
    affname = i['name'].lower()
    if any(x in affname for x in target_str):
        ror_affnames.append(affname)
ror_affnames.remove('he university')
ror_affnames.remove('french institute for research in computer science and automation')
ror_affnames.remove('australian national university')
ror_affnames.remove('monash university')

In [86]:
len(ror_affnames)

29820

In [87]:
ror_affnames[455:460]

['national autonomous university of mexico',
 'national university of la plata',
 'heriot-watt university',
 'university of the balearic islands',
 'reykjavík university']

I concluded that it's better to use absolute match first before I use fuzzy match

In [88]:
matched_dic = {}
matched = 0
failed = 0
failed_list = []
for aff in affs:
    for x in ror_affnames:
        if x in aff:
            matched += 1
            matched_dic[aff] = x
            break

In [89]:
len(affs)

8443

In [90]:
matched

6102

In [114]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.DataFrame(matched_dic.items()).sample(10)

Unnamed: 0,0,1
2775,assistant professor of sociology at ohio state university and research fellow in the department of social relations at johns hopkins university,johns hopkins university
2484,francis balle is professor at the university of paris director of the french press institute and co editor of the cahiers de la communication idalina cappe de baillon received a master s degree from the french press institute,university of paris
3253,graduate school of culture technology korea advanced institute of science and technology daejeon republic of korea,korea advanced institute of science and technology
3504,department of communication university of washington at seattle seattle wa usa,university of washington
2640,associate professor of computer science at virginia tech where she has been since prior to that she was a research staff member and manager at ibm s t j watson research center dr rosson received her phd in from the university of texas at austin,the university of texas at austin
3537,dr stewart received his degree from michigan state university in and is a former member of its faculty now with the campbell ewald company in detroit he is responsible for research in the sciences and arts pertaining to communication,michigan state university
4424,harvard t h chan school of public health and dana farber cancer institute brookline avenue lw boston ma usa,cancer institute
4734,senior academic in information systems at murdoch before taking up a professorship at the school of management at griffith university following his ph d cambridge in his postdoctoral work he developed intelligent and collaborative decision support systems in the financial and defense sectors twice a british council visiting fellow to pr china he has over publications in the field of is emphasizing human knowledge and intelligence in organizational systems his current research involves knowledge management and internet commerce applications,griffith university
4093,judee k burgoon is professor of communication at the university of arizona,university of arizona
6057,barbara j wilson ph d university of wisconsin madison is assistant professor in the department of communication university of louisville louisville kentucky this research was supported by a grant from the college of arts and sciences university of louisville the author would like to thank mark filbert for serving as one of the experimenters and lesa mcneill for helping with the experimental materials special thanks are due to robert spencer principal of field elementary school and linda duvall director of the klondike lane kinder care learning center for allowing the study tobe conducted in their schools thanks are also due to the teachers staff and studentsat field elementary school and kinder care learning center,university of louisville


In [115]:
list(set(matched_dic.values()))

['university of new hampshire',
 'the university of texas at austin',
 'valdosta state university',
 'massey university',
 'stephens college',
 'wichita state university',
 'transylvania university',
 'the university of texas at arlington',
 'renmin university of china',
 'singapore university of technology and design',
 'mcgill university',
 'muhlenberg college',
 'acadia university',
 'university of new mexico',
 'century university',
 'loyola university new orleans',
 'dalian university of technology',
 'copenhagen business school',
 'chungnam national university',
 'eckerd college',
 'university of the west of scotland',
 'strayer university',
 'masaryk university',
 'yerevan state university',
 'university of canberra',
 'city university',
 'university of leeds',
 'tulane university',
 'american university',
 'university of tartu',
 'baruch college',
 'university of kent',
 'stillman college',
 'oakland university',
 'university of south alabama',
 'school of international relatio

In [93]:
failed_list = [x for x in affs if x not in matched_dic.keys()]

In [None]:
failed_list

In [94]:
len(failed_list)

2341

In [38]:
# fuzzy_matched_dic = {}
# for aff in failed_list:
#     match = process.extractOne(aff, ror_affnames, scorer=fuzz.WRatio)[0]
#     fuzzy_matched_dic[aff] = match

In [39]:
# pd.DataFrame(fuzzy_matched_dic.items()).sample(10)

In [95]:
failed_list_matched_dic = {}
failed_matched = 0
for aff in failed_list:
    response = requests.get('https://api.ror.org/organizations?query='+aff)
    j = response.json()
    j = j['items'][0]
    try:
        ror_matched_affname = j['name']
        failed_matched += 1
    except:
        ror_matched_affname = None
    failed_list_matched_dic[aff] = ror_matched_affname
    print(f'{failed_list.index(aff) + 1} is done')

1 is done
2 is done
3 is done
4 is done
5 is done
6 is done
7 is done
8 is done
9 is done
10 is done
11 is done
12 is done
13 is done
14 is done
15 is done
16 is done
17 is done
18 is done
19 is done
20 is done
21 is done
22 is done
23 is done
24 is done
25 is done
26 is done
27 is done
28 is done
29 is done
30 is done
31 is done
32 is done
33 is done
34 is done
35 is done
36 is done
37 is done
38 is done
39 is done
40 is done
41 is done
42 is done
43 is done
44 is done
45 is done
46 is done
47 is done
48 is done
49 is done
50 is done
51 is done
52 is done
53 is done
54 is done
55 is done
56 is done
57 is done
58 is done
59 is done
60 is done
61 is done
62 is done
63 is done
64 is done
65 is done
66 is done
67 is done
68 is done
69 is done
70 is done
71 is done
72 is done
73 is done
74 is done
75 is done
76 is done
77 is done
78 is done
79 is done
80 is done
81 is done
82 is done
83 is done
84 is done
85 is done
86 is done
87 is done
88 is done
89 is done
90 is done
91 is done
92 is do

IndexError: list index out of range

In [99]:
pd.DataFrame(failed_list_matched_dic.items()).sample(10)

Unnamed: 0,0,1
342,dr maclay is a research associate professor in the institute of communications research at the university of illinois,Karlsruhe Institute of Technology
607,department of communications university of california davis ca usa,"University of California, Davis"
106,a doctoral student in the department of communication at texas a amp m university her research interests include political and organizational rhetoric,Texas A&M University at Qatar
119,department of communication science vu university amsterdam amsterdam hv netherlands,Vrije Universiteit Amsterdam
553,communication studies university of texas at austin austin tx usa,The University of Texas at Austin
619,b s rutgers university is business development manager at doubleclick inc,"Rutgers, The State University of New Jersey"
374,department of journalism and communication research hanover university of music and drama germany,Hanover University of Music Drama and Media
164,department of life sciences communication university of wisconsin madison hiram smith hall observatory drive madison wi,University of Wisconsin–Madison
452,graduate student in the media lab s sociable media group her current work focuses on developing interfaces that examine the interaction of people and the social cues they perceive in networked electronic spaces previous work has included browsing large video banks as well as reconstructing three dimensional spaces from multiple media sources she holds an sb in electrical engineering an me in electrical engineering and computer science and an sm in media arts and sciences from mit,Beijing Jingshida Electromechanical Equipment Research Institute
184,department of computer science stoneybrook university stoneybrook ny usa,Computer Science Department


In [None]:
a_dic = {}