## Aim

The aim of this notebook is to draw a small sample of the author data (500 authors) and predict each author's gender and race, as well as the type and country of their affiliation.

In [236]:
import math
import os
import re
import string
import unicodedata
from collections import Counter

import numpy as np
import pandas as pd
import requests

In [127]:
# Load the author data CSV from the interim data folder (path is relative to this notebook).
df = pd.read_csv('../data/interim/ica_author_data_research_paper_only.csv')

In [128]:
# Preview the first row to see which columns were loaded.
df.head(1)

Unnamed: 0,doi,url,title,journal,datePublished,authorName,numberOfAuthors,authorPosition,affiliation
0,10.1093/joc/jqab054,https://academic.oup.com/joc/article/72/2/145/...,Media Systems in the Digital Age: An Empirical...,Journal of Communication,2022-01-17,Edda Humprecht,5.0,1.0,Department of Communication and Media Research...


In [129]:
# Shape of the subset with a missing authorName; the first number is how many rows will be dropped.
df[df.authorName.isnull()].shape

(197, 9)

In [130]:
# Keep only rows where authorName is non-null.
# `.copy()` makes the filtered result an independent frame, so the columns added
# to `df` further down are written to a real frame rather than to a slice of the
# original (which is what triggers pandas' SettingWithCopyWarning).
df = df[df.authorName.notnull()].copy()

In [131]:
# Full author names; the name-parsing cell below splits these into first/last name.
full_names = df.authorName

In [132]:
# Sanity check of the parsing rule: the first space-separated token of the name labelled 0.
full_names[0].split(' ')[0]

'Edda'

In [133]:
# Parse names: split each full name on single spaces once, then take the first
# token as firstName and the last token as lastName. Middle tokens are dropped,
# so a name such as "W. Thomas Anderson" yields firstName "W.".
name_tokens = [full_name.split(' ') for full_name in full_names]
df['firstName'] = [tokens[0] for tokens in name_tokens]
df['lastName'] = [tokens[-1] for tokens in name_tokens]

In [140]:
# Draw a reproducible 500-row sample (fixed seed); ignore_index renumbers the
# sampled rows 0..499 so later cells can address them by position.
df_small = df.sample(n=500, random_state=42, ignore_index=True)

In [141]:
# Preview the sample, including the derived firstName / lastName columns.
df_small.head(2)

Unnamed: 0,doi,url,title,journal,datePublished,authorName,numberOfAuthors,authorPosition,affiliation,firstName,lastName
0,10.1093/joc/jqz040,https://academic.oup.com/joc/article/70/1/13/5...,How the Comforting Process Fails: Psychologica...,Journal of Communication,2020-02-18,Kellie St.Cyr Brisini,3.0,3.0,Department of Communication Arts and Sciences ...,Kellie,Brisini
1,10.1111/j.1468-2958.2010.01395.x,https://academic.oup.com/hcr/article/37/1/107/...,Does Mother Know Best? An Actor–Partner Model ...,Human Communication Research,2011-01-01,Janice L. Krieger,4.0,1.0,"1 School of Communication, The Ohio State Univ...",Janice,Krieger


## Genderize.io

In [136]:
# Query genderize.io once per sampled first name and collect, in row order:
#   guessedGender - predicted gender label (None when the service has no guess)
#   probability   - the service's probability for that label
#   basedOn       - the `count` field returned by the service
# This loop must run before the next cell, which reads all three lists; with it
# commented out the notebook fails with NameError on a fresh kernel.
# The name is passed via `params` so requests URL-encodes it, and
# raise_for_status() turns an HTTP error into an explicit exception instead of
# a confusing KeyError on the JSON payload. Values are kept as the numbers the
# service returns, matching how the Gender API cell stores its results.
# NOTE(review): this issues one HTTP request per row - check the service quota before re-running.
guessedGender = []
probability = []
basedOn = []
for firstName in df_small['firstName']:
    response = requests.get(
        'https://api.genderize.io/',
        params={'name': firstName},
        timeout=30,
    )
    response.raise_for_status()
    j = response.json()
    guessedGender.append(j['gender'])
    probability.append(j['probability'])
    basedOn.append(j['count'])

In [142]:
# Attach the genderize.io results to the sample, one value per row in sample order.
df_small = df_small.assign(
    gender=guessedGender,
    gender_prob=probability,
    gender_basedon=basedOn,
)

In [145]:
# Preview the sample with the genderize.io columns attached.
df_small.head()

Unnamed: 0,doi,url,title,journal,datePublished,authorName,numberOfAuthors,authorPosition,affiliation,firstName,lastName,gender,gender_prob,gender_basedon
0,10.1093/joc/jqz040,https://academic.oup.com/joc/article/70/1/13/5...,How the Comforting Process Fails: Psychologica...,Journal of Communication,2020-02-18,Kellie St.Cyr Brisini,3.0,3.0,Department of Communication Arts and Sciences ...,Kellie,Brisini,female,0.97,1170
1,10.1111/j.1468-2958.2010.01395.x,https://academic.oup.com/hcr/article/37/1/107/...,Does Mother Know Best? An Actor–Partner Model ...,Human Communication Research,2011-01-01,Janice L. Krieger,4.0,1.0,"1 School of Communication, The Ohio State Univ...",Janice,Krieger,female,0.98,6277
2,10.1111/j.1753-9137.2009.01055.x,https://academic.oup.com/ccc/article/3/1/21/40...,"The Exceptional Community: On Strangers, Forei...","Communication, Culture and Critique",2010-02-22,Garnet C. Butchart,1.0,1.0,"1 Department of Communication, University of S...",Garnet,Butchart,male,0.68,171
3,10.1111/hcre.12067,https://academic.oup.com/hcr/article/42/1/71/4...,Consciousness and Self-Regulation in Mobile Co...,Human Communication Research,2016-01-01,Scott W. Campbell,4.0,3.0,"1 Communication Studies, University of Michiga...",Scott,Campbell,male,0.99,31815
4,10.1111/j.1083-6101.2011.01539.x,https://academic.oup.com/jcmc/article/16/2/271...,Two Routes Leading to Conformity Intention in ...,Journal of Computer-Mediated Communication,2011-01-01,Junghyun Kim,1.0,1.0,,Junghyun,Kim,male,0.62,257


In [146]:
# Base Google Scholar search URL; the author's name is appended as the `q` value.
# NOTE(review): `%252C` is a double-encoded comma (`%2C` with its `%` encoded again);
# presumably `as_sdt=0%2C50` was intended - confirm before relying on this parameter.
gscholar_str = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q='

In [154]:
# gscholar_str + '+'.join(df_small['authorName'][0].split(' '))

'https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Kellie+St.Cyr+Brisini'

In [156]:
# Example link for the first sampled author, built from first name + '+' + last name.
gscholar_str + df_small['firstName'][0] + '+' + df_small['lastName'][0]

'https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Kellie+Brisini'

In [157]:
# One Google Scholar search link per sampled author: "<first>+<last>" appended to the base URL.
df_small['gscholarLink'] = [
    gscholar_str + first + '+' + last
    for first, last in zip(df_small['firstName'], df_small['lastName'])
]

In [160]:
# Spot-check one of the generated links.
df_small['gscholarLink'][10]

'https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Scott+Alter'

In [161]:
# df_small.to_csv('../data/interim/500_author.csv', index=False)

## Gender API

In [163]:
# First name of the first sampled author, used below to try out the Gender API.
df_small['firstName'][0]

'Kellie'

In [164]:
# Single-name trial request against the Gender API (kept for reference, not executed).
# The API key has been redacted: credentials must not be stored in the notebook,
# and a key that has already been committed should be revoked and replaced.
# response = requests.get(
#     'https://gender-api.com/get?name='+
#     df_small['firstName'][0] +
#     '&key=<GENDER_API_KEY>')

In [165]:
# j = response.json()

{'name': 'kellie',
 'name_sanitized': 'Kellie',
 'country': '',
 'gender': 'female',
 'samples': 2598,
 'accuracy': 98,
 'duration': '15ms',
 'credits_used': 1}

In [167]:
# Query the Gender API once per sampled first name and collect, in row order:
#   guessedGender_2 - predicted gender label
#   probability_2   - the `accuracy` field returned by the service
#   basedOn_2       - the `samples` field returned by the service
# The API key is read from the GENDER_API_KEY environment variable so the secret
# never lives in the notebook; a missing variable fails fast with KeyError.
gender_api_key = os.environ['GENDER_API_KEY']

guessedGender_2 = []
probability_2 = []
basedOn_2 = []
for firstName in df_small['firstName']:
    # `params` lets requests URL-encode the name and the key.
    response = requests.get(
        'https://gender-api.com/get',
        params={'name': firstName, 'key': gender_api_key},
        timeout=30,
    )
    # Surface HTTP errors explicitly instead of failing later on a missing JSON key.
    response.raise_for_status()
    j = response.json()
    guessedGender_2.append(j['gender'])
    probability_2.append(j['accuracy'])
    basedOn_2.append(j['samples'])

In [170]:
# Attach the Gender API results to the sample, one value per row in sample order.
df_small = df_small.assign(
    gender_api=guessedGender_2,
    gender_accu=probability_2,
    gender_samples=basedOn_2,
)

In [177]:
# Some names are weird. For example, only a letter for first name.
# Shape of the subset where the genderize.io label (`gender`) differs from the
# Gender API label (`gender_api`); the first number is the count of disagreements.
# NOTE(review): rows where `gender` is missing also count as disagreements here,
# because a missing value never compares equal to a string - confirm that is intended.
df_small[df_small.gender != df_small.gender_api].shape

(30, 18)

## Race

In [178]:
from ethnicolr import census_ln, pred_census_ln

In [184]:
# Predict race from the `lastName` column with ethnicolr's census model (year 2010).
# NOTE(review): later cells attach list-valued columns to this frame by position, so it
# presumably keeps df_small's row order and row count - confirm for the installed ethnicolr version.
df_small_race_predicted = pred_census_ln(df_small, 'lastName', year=2010)

In [186]:
# Check that the prediction step kept all sampled rows and see how many columns it added.
df_small_race_predicted.shape

(500, 23)

## ROR

In [188]:
# Sampled rows with no affiliation string; these are not sent to the ROR lookup below.
df_small[df_small.affiliation.isnull()]

Unnamed: 0,doi,url,title,journal,datePublished,authorName,numberOfAuthors,authorPosition,affiliation,firstName,lastName,gender,gender_prob,gender_basedon,gscholarLink,gender_api,gender_accu,gender_samples
4,10.1111/j.1083-6101.2011.01539.x,https://academic.oup.com/jcmc/article/16/2/271...,Two Routes Leading to Conformity Intention in ...,Journal of Computer-Mediated Communication,2011-01-01,Junghyun Kim,1.0,1.0,,Junghyun,Kim,male,0.62,257,https://scholar.google.com/scholar?hl=en&as_sd...,male,72,152
68,10.1111/j.1460-2466.1973.tb00951.x,https://academic.oup.com/joc/article/23/3/315/...,Information Utilization and Personality1,Journal of Communication,2006-02-07,Jerome B. Kernan,2.0,1.0,,Jerome,Kernan,male,0.99,55481,https://scholar.google.com/scholar?hl=en&as_sd...,male,99,35619
184,10.1111/j.1460-2466.1973.tb00952.x,https://academic.oup.com/joc/article/23/3/328/...,Optimal Heterophily and Communication Effectiv...,Journal of Communication,2006-02-07,W. Thomas Anderson,2.0,2.0,,W.,Anderson,,0.0,0,https://scholar.google.com/scholar?hl=en&as_sd...,male,80,260
303,10.1111/j.1460-2466.1966.tb00041.x,https://academic.oup.com/joc/article/16/4/273/...,Nationality and Social Perception1,Journal of Communication,2006-02-07,Bryant Wedge,1.0,1.0,,Bryant,Wedge,male,0.98,815,https://scholar.google.com/scholar?hl=en&as_sd...,male,99,1643
358,10.1111/j.1460-2466.1967.tb01195.x,https://academic.oup.com/joc/article/17/4/372/...,"Some Effects of Priming, Incubation and Creati...",Journal of Communication,2006-02-07,Mehvin D. Lynch,2.0,1.0,,Mehvin,Lynch,,0.0,0,https://scholar.google.com/scholar?hl=en&as_sd...,unknown,0,0
420,10.1093/ccc/tcz022,https://academic.oup.com/ccc/article/12/3/434/...,"Corrigendum to “A Shoppable Life: Performance,...","Communication, Culture and Critique",2019-05-03,Emily Hund,2.0,1.0,,Emily,Hund,female,0.98,16379,https://scholar.google.com/scholar?hl=en&as_sd...,female,98,36641


In [190]:
# Example of a raw affiliation string as it appears in the data.
df_small['affiliation'][0]

'Department of Communication Arts and Sciences , The Pennsylvania State University, University Park, PA 16802,  USA'

In [223]:
def notNaN(num):
    """Return True when `num` holds a real value, False when it is missing.

    A value counts as missing when it is NaN (the only value that is not equal
    to itself) or None. Treating None as missing keeps callers from passing it
    on to string-processing code that would fail on it.
    """
    return num is not None and num == num

In [225]:
# Row 4 has no affiliation, so this is expected to evaluate to False.
notNaN(df_small['affiliation'][4])

False

In [237]:
def process_text(aff):
    """Normalize an affiliation string into a plain lowercase search query.

    Steps:
      1. Fold accented letters to their unaccented base letter (e.g. "ä" -> "a"),
         so names such as "Universität" stay one word instead of being split at
         the accented character by the ASCII-only filter in step 3.
      2. Lowercase.
      3. Replace every run of characters other than a-z and space with one space.
      4. Collapse repeated whitespace and strip both ends.

    The result contains only the characters a-z and single spaces.
    Letters with no decomposition (e.g. "ø") are still replaced by a space.
    """
    # NFKD splits an accented letter into base letter + combining mark(s);
    # dropping the combining marks leaves the bare base letter.
    decomposed = unicodedata.normalize('NFKD', aff)
    aff = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    aff = aff.lower()
    aff = re.sub('[^a-z ]+', ' ', aff)
    aff = ' '.join(aff.split())
    return aff

In [242]:
def _dig(record, *path):
    """Follow `path` (dict keys / list positions) into `record`; return None if any step is absent."""
    current = record
    for step in path:
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            # Only "field not there" errors are treated as missing data.
            return None
    return current


def _lookup_ror(aff):
    """Return (id, name, type, state, country_code) of the top ROR hit for `aff`.

    Returns five Nones when `aff` is missing or the search has no results.
    Raises requests.HTTPError when the API answers with an error status.
    """
    no_match = (None, None, None, None, None)
    if not notNaN(aff):
        return no_match
    response = requests.get(
        'https://api.ror.org/organizations',
        params={'query': process_text(aff)},
        timeout=30,
    )
    response.raise_for_status()
    items = response.json()['items']
    if not items:
        # An empty result list used to raise IndexError and abort the whole loop.
        return no_match
    top_hit = items[0]
    # NOTE(review): these field paths are the ones this notebook has always read;
    # confirm they match the schema version served by this endpoint, otherwise
    # every column silently comes back as None.
    return (
        _dig(top_hit, 'id'),
        _dig(top_hit, 'name'),
        _dig(top_hit, 'types', 0),
        _dig(top_hit, 'addresses', 0, 'state'),
        _dig(top_hit, 'country', 'country_code'),
    )


# Look up every sampled affiliation in ROR. The five lists stay aligned with
# df_small's rows (one entry per row, None where nothing was found) because
# they are attached to the frame by position further down.
ror_id_list = []
ror_affname_list = []
ror_afftype_list = []
us_state_list = []
ror_country_list = []
for idx, aff in enumerate(df_small['affiliation']):
    ror_id, ror_affname, ror_afftype, us_state, ror_country = _lookup_ror(aff)
    ror_id_list.append(ror_id)
    ror_affname_list.append(ror_affname)
    ror_afftype_list.append(ror_afftype)
    us_state_list.append(us_state)
    ror_country_list.append(ror_country)
    print(f'{idx} is done')

0 is done
1 is done
2 is done
3 is done
4 is done
5 is done
6 is done
7 is done
8 is done
9 is done
10 is done
11 is done
12 is done
13 is done
14 is done
15 is done
16 is done
17 is done
18 is done
19 is done
20 is done
21 is done
22 is done
23 is done
24 is done
25 is done
26 is done
27 is done
28 is done
29 is done
30 is done
31 is done
32 is done
33 is done
34 is done
35 is done
36 is done
37 is done
38 is done
39 is done
40 is done
41 is done
42 is done
43 is done
44 is done
45 is done
46 is done
47 is done
48 is done
49 is done
50 is done
51 is done
52 is done
53 is done
54 is done
55 is done
56 is done
57 is done
58 is done
59 is done
60 is done
61 is done
62 is done
63 is done
64 is done
65 is done
66 is done
67 is done
68 is done
69 is done
70 is done
71 is done
72 is done
73 is done
74 is done
75 is done
76 is done
77 is done
78 is done
79 is done
80 is done
81 is done
82 is done
83 is done
84 is done
85 is done
86 is done
87 is done
88 is done
89 is done
90 is done
91 is don

In [270]:
# Attach the ROR lookup results, one value per row in the order the affiliations were queried.
df_small_race_predicted = df_small_race_predicted.assign(
    ror_id=ror_id_list,
    ror_affname=ror_affname_list,
    ror_afftype=ror_afftype_list,
    us_state=us_state_list,
    ror_country=ror_country_list,
)

In [271]:
# Flag rows where genderize.io and the Gender API returned the same label.
genderize_label = df_small_race_predicted['gender']
gender_api_label = df_small_race_predicted['gender_api']
df_small_race_predicted['genderAgree'] = (genderize_label == gender_api_label).to_numpy()

In [272]:
# Copy the predicted race, spelling out the 'api' category as 'Asian'; other values are kept as they are.
predicted_race = df_small_race_predicted['race']
df_small_race_predicted['raceNew'] = predicted_race.mask(predicted_race == 'api', 'Asian')

In [273]:
# List the columns now available on the enriched frame before choosing which to export.
df_small_race_predicted.columns

Index(['doi', 'url', 'title', 'journal', 'datePublished', 'authorName',
       'numberOfAuthors', 'authorPosition', 'affiliation', 'firstName',
       'lastName', 'gender', 'gender_prob', 'gender_basedon', 'gscholarLink',
       'gender_api', 'gender_accu', 'gender_samples', 'api', 'black',
       'hispanic', 'white', 'race', 'ror_id', 'ror_affname', 'ror_afftype',
       'us_state', 'ror_country', 'genderAgree', 'raceNew'],
      dtype='object')

In [274]:
# Columns to export, in output order: paper info, then gender fields,
# then last name + race, then affiliation + ROR fields.
cols_to_keep = [
    'doi',
    'url',
    'title',
    'journal',
    'authorName',
    'gscholarLink',
    'firstName',
    'gender',
    'gender_prob',
    'gender_basedon',
    'genderAgree',
    'gender_api',
    'gender_accu',
    'gender_samples',
    'lastName',
    'raceNew',
    'affiliation',
    'ror_affname',
    'ror_id',  
    'ror_afftype',
    'us_state', 
    'ror_country'
]

In [275]:
# Select the export columns, in the order given by cols_to_keep.
df_small_product = df_small_race_predicted[cols_to_keep]

In [276]:
# Export-time rename: the relabelled `raceNew` column is written out as `race`.
col_renamer = {
    'raceNew': 'race'
}

In [277]:
# Apply the export column names, then write the product to the interim data
# folder without the row index.
product_for_export = df_small_product.rename(columns=col_renamer)
product_for_export.to_csv('../data/interim/df_small_product.csv', index=False)

### Data dictionary

- doi: DOI
- url: paper URL
- title: paper title
- journal: journal
- authorName: author full name
- gscholarLink: the link to searching that author's name on Google Scholar
- firstName: author first name
- gender: the gender result produced by genderize.io
- gender_prob: the probability that this guess is correct, genderize.io
- gender_basedon: the number of samples this result is based on, genderize.io
- genderAgree: whether this gender prediction is the same as that produced by gender api (in the following columns)
- gender_api: the gender prediction by gender api
- gender_accu: the accuracy of this guess as a percentage (0–100), gender api
- gender_samples: the number of samples this result is based on, gender api
- lastName: author last name
- race: race predicted from the author's last name with ethnicolr's US Census 2010 model (the `api` category is relabelled `Asian`)
- affiliation: author affiliation as shown on journal website
- ror_id: ror id for the predicted affiliation name
- ror_affname: predicted affiliation name
- ror_afftype: affiliation type
- us_state: the us state where this affiliation is located
- ror_country: the country where this affiliation is located