In [1]:
import os
import codecs
import json
import string
import re
from glob import glob
from bs4 import BeautifulSoup, NavigableString, Tag

import gender
from gender import getGenders

from fuzzywuzzy import process, fuzz

import time
import openapi_client # pip install git+https://github.com/namsor/namsor-python-sdk2.git
from openapi_client.rest import ApiException
from pprint import pprint


In [2]:
! pwd

/home/naiara/Git_repositories/geoscience-first-authorship


In [3]:
# Configure local paths

root = ! pwd
root = root[0]

print("using root directory:", root)

RAW_PAGES_DIR=root+"/pages/"
PARSED_PAGES_DIR=root+"/egu_parsed/"
GUESSED_NAMES_DIR=root+"/guessed_egu/"
NAME_GENDER_DIR=root+"/name_genders/"

# create directories if they do not exist

for d in [RAW_PAGES_DIR,
         PARSED_PAGES_DIR,
         GUESSED_NAMES_DIR,
         NAME_GENDER_DIR]:
    try:
        os.mkdir(d)
    except FileExistsError:
        pass

using root directory: /home/naiara/Git_repositories/geoscience-first-authorship


### Article class, Guessed class, and function definitions

In [4]:
# Initialize and define useful functions and
# data structures.

def is_initialed_name(name):
    first_term = name.split(" ")[0]
    if len(first_term) == 0:
        # print("first term len zero. name:", name)
        return False
    return first_term[-1] == "." and first_term[:-1].isupper() or len(first_term) == 1 or len(first_term.split("-")[0]) == 1


print("test is_initialed_name- True:", is_initialed_name("J. Smith"), ", False:", is_initialed_name("Joe Smith"),
     ", True:", is_initialed_name("J Smith"), ", True:", is_initialed_name("J-P Ampuero"))

def contains_initialed_name(names):
    for name in names:
        if is_initialed_name(name):
            return True
    return False

print("test contains_initialed_name- True", contains_initialed_name(["J. Smith", "Cat Meowins"]), ", False:", contains_initialed_name(["Joe Smith", "Cat Meowins"]))

def clean_name(name):
    name = name.strip()
    name = re.sub(r'\s', ' ', name)
    if not is_initialed_name(name):
        terms = name.split(" ")
        terms[0] = terms[0].strip(".")
        name = " ".join(terms)
    return name.strip('.')

print("test clean name- Colin. J. Cats:", clean_name("Colin. J. Cats"), "W. B. Easy:", clean_name("W. B. Easy"))

class Article:
    def __init__(self, first_author, all_names, year, month, title, journal):
        # clean the author names
        # - remove non-ascii whitespace
        # - strip bookend whitespace
        # - strip periods from first names if not an initialed name
        
        
        self.first_author = clean_name(first_author)
        self.names = [clean_name(name) for name in all_names]
        self.year = year
        self.month = month
        self.title = title
        self.journal = journal
        
        # create a unique identifier for this article
        self.id = "_".join([journal, year, month, "_".join(title.translate(str.maketrans('', '', string.punctuation)).split(" "))])
        
        # determine if article has initialed names
        self.has_initials = is_initialed_name(first_author)
    
    def last_name_set(self):
        # return a set() of all the last names
        name_set = set()
        for name in self.names:
            name_set.add(name.split(" ")[-1])
        return name_set
    
    def to_map(self):
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["has_initials"] = self.has_initials
        return m
        
        
def article_from_map(article_map):
    return Article(
        first_author=article_map["first_author"],
        all_names=article_map["all_names"],
        year=article_map["year"],
        month=article_map["month"],
        title=article_map["title"],
        journal=article_map["journal"])

test_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)

print("last name set:", test_article.last_name_set())
print("article id:", test_article.id)
print("article has initial:", test_article.has_initials)
print("article map:", test_article.to_map())


class Guessed:
    def __init__(self, primary_article, guessed_names, match_article_id):
        self.first_author = primary_article.first_author
        self.names = primary_article.names
        self.year = primary_article.year
        self.month = primary_article.month
        self.title = primary_article.title
        self.journal = primary_article.journal
        
        # create a unique identifier for this article
        self.id = "_".join([self.journal, self.year, self.month, "_".join(self.title.translate(str.maketrans('', '', string.punctuation)).split(" "))])
        
        # determine if article has initialed names
        self.has_initials = True in [is_initialed_name(n) for n in self.names]
        
        self.guessed_names = guessed_names
        self.match_article_id = match_article_id
    
    
    def last_name_set(self):
        # return a set() of all the last names
        name_set = set()
        for name in self.names:
            name_set.add(name.split(" ")[-1])
        return name_set
    
    def to_map(self):
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["has_initials"] = self.has_initials
        m["guessed_names"] = self.guessed_names
        m["match_article_id"] = self.match_article_id
        return m
        
        
def guessed_from_map(guessed_map):
    g = Guessed(
        primary_article=article_from_map(guessed_map),
        guessed_names=guessed_map["guessed_names"],
        match_article_id=guessed_map["match_article_id"])
    return g




class Gendered:
    def __init__(self, primary_article, all_genders, all_percent):
        self.first_author = primary_article.first_author
        
        try:
            self.names = primary_article.guessed_names
        except:
            self.names = primary_article.names
        
        self.year = primary_article.year
        self.month = primary_article.month
        self.title = primary_article.title
        self.journal = primary_article.journal
        self.id = primary_article.id
        
        # gendered:
        self.all_genders = all_genders
        self.all_percent = all_percent
    
#     def last_name_set(self):
#         # return a set() of all the last names
#         name_set = set()
#         for name in self.names:
#             name_set.add(name.split(" ")[-1])
#         return name_set
    
    def to_map(self):
        m = {}
        m["first_author"] = self.first_author
        m["all_names"] = self.names
        m["year"] = self.year
        m["month"] = self.month
        m["title"] = self.title
        m["journal"] = self.journal
        m["id"] = self.id
        m["all_genders"] = self.all_genders
        m["all_percent"] = self.all_percent
        return m
        
        
def gendered_from_map(dictfromjson):
    
    # is the primary article guessed authors?
    # if yes:
    # use guessed_from_map
    # if not:
    # use article_from_map
    if "guessed_names" in list(dictfromjson.keys()):
        prim_art = guessed_from_map(dictfromjson)
    else:
        prim_art = article_from_map(dictfromjson)
    
    g = Gendered(
        primary_article=prim_art,
        all_genders=dictfromjson["all_genders"],
        all_percent=dictfromjson["all_percent"])
    return g


test_primary_article = Article(
    first_author="cat",
    all_names=["A. Cat", "Dog", "Another Cat", "More Cats"],
    year="2019",
    month="02",
    title="a story of cool/cats",
    journal="GeoCatography"
)
test_match_article = Article(
    first_author="cat",
    all_names=["Arthur Cat", "Dog", "Another Cat", "More Cats"],
    year="2018",
    month="05",
    title="existence of cool/cats",
    journal="GeoCatography"
)
test_guessed = Guessed(test_primary_article, "Arthur Cat", test_match_article.id)



test_gendered_article = Gendered(
    test_primary_article,
    all_genders=["init", "male", None, "female"],
    all_percent=[0.0, 0.6, None, 0.5]
)
print(test_guessed.to_map())
print(guessed_from_map(test_guessed.to_map()).to_map() == test_guessed.to_map())

test is_initialed_name- True: True , False: False , True: True , True: True
test contains_initialed_name- True True , False: False
test clean name- Colin. J. Cats: Colin J. Cats W. B. Easy: W. B. Easy
last name set: {'Dog', 'Cats', 'Cat'}
article id: GeoCatography_2019_02_a_story_of_coolcats
article has initial: False
article map: {'first_author': 'cat', 'all_names': ['A. Cat', 'Dog', 'Another Cat', 'More Cats'], 'year': '2019', 'month': '02', 'title': 'a story of cool/cats', 'journal': 'GeoCatography', 'id': 'GeoCatography_2019_02_a_story_of_coolcats', 'has_initials': False}
{'first_author': 'cat', 'all_names': ['A. Cat', 'Dog', 'Another Cat', 'More Cats'], 'year': '2019', 'month': '02', 'title': 'a story of cool/cats', 'journal': 'GeoCatography', 'id': 'GeoCatography_2019_02_a_story_of_coolcats', 'has_initials': True, 'guessed_names': 'Arthur Cat', 'match_article_id': 'GeoCatography_2018_05_existence_of_coolcats'}
True


#### Determine how many papers have initialed authors and create a dictionary with all last names from articles and abstracts


In [5]:
# How many articles have authors with initialed names

count = 0
for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if not is_initialed_name(art.names[0]):
            continue
        #print(art.names)
        count += 1

        
print("{count} papers from a total of {total} contain initialed first author names.".format(count = count, 
                                                                                   total = len(files)))

# Create in-memory map for name guessing

# This map is generated from the articles where names are not initialed.
# The keys on the map are last names. The values are arrays of articles
# where at least one author on the article has the keyed last name.

# We create this dictionary from all abstracts. 

count_init=0
count_total = 0

last_names_to_abstracts = {}

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
            
        # considering all authors and all articles:
        for name in art.names:
            count_total += 1
            # only add if it is not initialed
            if is_initialed_name(name):
                count_init += 1
                continue
            
            last_name = name.split(" ")[-1]            
            
            if last_name not in last_names_to_abstracts.keys():
                last_names_to_abstracts[last_name] = []
            last_names_to_abstracts[last_name].append(art)
        
print("All last names, abstract map size", len(last_names_to_abstracts.keys()))
print("{count} author names are initialed from a total of {total} author names.".format(count = count_init,
                                                                                      total = count_total))

1673 papers from a total of 17452 contain initialed first author names.
All last names, abstract map size 20407
7600 author names are initialed from a total of 82715 author names.


#### Now try to guess their names by comparing to non-initialed authors

In [6]:
# Create guesses for first author names and save to files
def extract_name_guess(initial_name, possible_names):
    last_name = initial_name.split(" ")[-1]
    for pn in possible_names:
        if last_name != pn.split(" ")[-1]:
            continue  # not the name we're looking for
        if (initial_name[0] != pn[0]) and (fuzz.token_sort_ratio(initial_name,pn) < 90):
            continue  # first letters do not match TODO: There are cases where this is not correct, e.g.: Martin Mai and P. Martin Mai
        if len(initial_name.split()) > 2 and len(pn.split()) > 2:
            # both names have middle initial
            if initial_name.split()[1][0] != pn.split()[1][0]:
                # middle initial does not fit
                continue
        if len(initial_name.split("-")) > 1 and len(pn.split("-")) > 1:
            # both names have hyphen
            if initial_name.split("-")[1][0] != pn.split("-")[1][0]:
                # second part of hyphenated name does not fit
                continue
        if is_initialed_name(pn):
            continue
        return pn, True
    return "", False

# create an output text file to quickly check if any bullshit occurs
check_file = open("output_checklist_guessednames.txt", "w")


# unsupported edge cases
# Check that the same initialized name
# is not mapped to multiple different
# complete names
# J. M. Li => Jia Li
# J. M. Li => Jiaxun Li
# J. M. Li => Jingyuan Li

count = 0
count_guess = 0
unmapped_names = set()

for _, _, files in os.walk(PARSED_PAGES_DIR):
    
    for file in files:
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
        if not contains_initialed_name(art.names): # take only article that contain initialed names
            continue
                
        guessed_authors = []
        
        for author in art.names: # Loop through all author in the article
            
            if not is_initialed_name(author): # take only the authors with initialed names
                guessed_authors.append(author) # save the full name for later writing the json file
                continue
                
            last_name = author.split(" ")[-1] # take the last name

            if last_name not in last_names_to_abstracts.keys(): 
                # check if the last name is in the dictionary
                guessed_authors.append(author) # if not, we need to save it as it is
                unmapped_names.add(author) # collect all unmapped names to double-check the performance
                count += 1 
                continue
            
            # load the articles and abstracts related to the last name
        
            try:
                abstracts = last_names_to_abstracts[last_name]
            except KeyError:
                abstracts = []
                pass
            
            
            # gather all the guesses, paired with the size of overlap.
            # guesses with largest overlap will be written into file. 
            
            guesses = []
            possible_names = set()
            
    
            for article in abstracts: 
            
                overlap = article.last_name_set() & art.last_name_set() #overlap between coauthors

                if (len(art.names) > 1) and (len(overlap) < 2): 
                    name, ok = extract_name_guess(author, article.names) # not always there is an overlap and sometimes... 
                    if ok:#...there is only one person in the community with this name...
                        possible_names.add(name.split(" ")[0] + " " + name.split(" ")[-1]) #...so save the names to check this later.
                    continue  # skip articles without enough overlap. 
            
                name, ok = extract_name_guess(author, article.names)
                if not ok:
                    continue
            
                check_file.write("{}\t\t{}\n".format(name, author))
                guessed = Guessed(art, [name], article.id)
                guesses.append((len(overlap), guessed)) # saved the guessed name and the overlap
            

                    
            #simplify the set 'possible_names' due to slightly different characters (e.g., accents)
            
            tmp = list(possible_names)
            tmp2 = []

            for i in range(len(possible_names)-1):
 
                highest = process.extractOne(tmp[i],tmp[i+1:])
                if highest[1] > 85:
                    continue
                tmp2.append(tmp[i])

            if len(tmp)!=0:
                tmp2.append(tmp[-1])

            possible_names = set(tmp2)   

                
        
            if len(guesses) == 0: #if no guesses, check if there is only one author in the field with the last name
                if len(possible_names)==1:
                    check_file.write("{}\t\t{}\n".format(list(possible_names)[0], author))
                    guessed = Guessed(art, [list(possible_names)[0]], article.id)
                    guesses.append((len(overlap), guessed))
                else:
                    guessed_authors.append(author)
                    unmapped_names.add(author)
                    count += 1
                    continue

            guesses = sorted(guesses, key=lambda x: x[0], reverse=True) #This does not change guesses if we do not assign it to variable guesses
            the_guess = guesses[0][1]
            count_guess += 1
            guessed_authors.append(the_guess.guessed_names[0])
            
            
        final_guess = Guessed(art, guessed_authors, the_guess.match_article_id)    
        outfile_name = final_guess.id[:80]+".json"
        with codecs.open(GUESSED_NAMES_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(final_guess.to_map()))

check_file.close()
print('Names not guessed:',len(unmapped_names))
print('Number of authors not guessed:', count)

print('Number of authors guessed:', count_guess)


Names not guessed: 1760
Number of authors not guessed: 1962
Number of authors guessed: 5638



#### Start with the name --> gender: test genderize.io api

In [7]:
# class for gender result and test
class GenderResult(object):
    def __init__(self, name, result):
        self.name = name
        self.binary = result[0]
        self.percent = result[1]
        self.count = result[2]
        
    def to_map(self):
        m = {}
        
        m["name"] = self.name
        m["binary"] = self.binary
        m["percent"] = self.percent
        m["count"] = self.count
        return m
    

def gender_result_from_map(m):
    return GenderResult(m["name"], ( m["binary"], m["percent"], m["count"]))

name = "ChunYuan"
g = getGenders(name)[0]
gr = GenderResult(name, g)
gender_result_from_map(json.loads(json.dumps(gr.to_map()))).to_map()

{'name': 'ChunYuan', 'binary': 'female', 'percent': 0.6, 'count': 5}

#### Just read in all the previously determined and stored name --> gender pairs

In [8]:
# read the Gender API gender guesses into memory

gender_results = []

#for _, _, files in #os.walk(NAME_GENDER_DIR):
files = glob(os.path.join(NAME_GENDER_DIR, "*.json"))
for file in files:
    with codecs.open(file, "r", "utf8") as infile:
        gr = gender_result_from_map(json.loads(infile.read()))
        gender_results.append(gr)
            
        
    
# Create a map of first name to gender result
name_to_gr = {}
for gr in gender_results:
    name_to_gr[gr.name] = gr
    
print("Length of current name-gender map", len(name_to_gr))

# test
print("Colin." in name_to_gr)
print("First name in current gender map, gender: ", \
      list(name_to_gr)[0], name_to_gr[list(name_to_gr)[0]].binary)
print("Last name in current gender map, gender: ", \
      list(name_to_gr)[-1], name_to_gr[list(name_to_gr)[-1]].binary)

Length of current name-gender map 15811
False
First name in current gender map, gender:  Kusala male
Last name in current gender map, gender:  Viktória female


#### Genderize all the names
#### - collect the list of first names from PARSED and GUESSED
#### - check if it is in the list already, if not: call genderize io
#### - finally check how many in the guessed initialed names are male / female

In [9]:
# Collect list of all PARSED and GUESSED names to be gendered

first_names = set()


## TODO:  Adapt here for all authors instead for first author only. I think this is done now?? Can someone double-check it?
files_guessed = glob(os.path.join(GUESSED_NAMES_DIR, "*.json"))
files_parsed = glob(os.path.join(PARSED_PAGES_DIR, "*.json"))

for file in files_guessed:
    with codecs.open(file, "r", "utf8") as infile:
        guess = guessed_from_map(json.loads(infile.read()))
        
    for name in guess.guessed_names:
        first_name = name.split(" ")[0].capitalize() 
        if len(first_name) == 0 or is_initialed_name(name): #if it is initialed nothing to do
            print("empty or initialed name?:", name)
            continue
        first_names.add(first_name)


for file in files_parsed:
    with codecs.open(file, "r", "utf8") as infile:
        art = article_from_map(json.loads(infile.read()))
        
    for name in art.names:
        first_name = name.split(" ")[0].capitalize() 
        if len(first_name) == 0 or is_initialed_name(name):  #if it is initialed nothing to do
            print("empty or initialed name?:", name)
            continue
        first_names.add(first_name)

print("number of unique first names of all authors to genderize:", len(first_names))
print("From {} to {}.".format(list(first_names)[0],
                              list(first_names)[-1]))        

empty or initialed name?: R. Bøe
empty or initialed name?: S. Lennartz
empty or initialed name?: M. Zaiser
empty or initialed name?: Y. Mkukuhira
empty or initialed name?: S. Mitsumori
empty or initialed name?: M. Haring
empty or initialed name?: D. Wyborn
empty or initialed name?: M. Adachi
empty or initialed name?: H. Klug
empty or initialed name?: C. Daughney
empty or initialed name?: F. Verhagen
empty or initialed name?: R. Westerhoff
empty or initialed name?: N. Dudley Ward
empty or initialed name?: H. Christian Hass
empty or initialed name?: B. Heinze
empty or initialed name?: E. -W. Cheng
empty or initialed name?: B. Zavala
empty or initialed name?: J. de Souza Silva
empty or initialed name?: M. Korhan Erturac
empty or initialed name?: N. Kato
empty or initialed name?: E. Rowena Hay
empty or initialed name?: M. Galbiati
empty or initialed name?: P. Rocchini
empty or initialed name?: D. G. Taimazov
empty or initialed name?: T. I. Sharapudinov
empty or initialed name?: M. G. Magom

empty or initialed name?: B. Mei
empty or initialed name?: Z. Zou
empty or initialed name?: A. Bian
empty or initialed name?: H. Wang
empty or initialed name?: M. Allen
empty or initialed name?: W. Bai
empty or initialed name?: J. Gil Antonio
empty or initialed name?: J. Bartels
empty or initialed name?: F. Wenderoth
empty or initialed name?: T. Fritzer
empty or initialed name?: B. Huber
empty or initialed name?: J. Hefty
empty or initialed name?: L. Gerhatova
empty or initialed name?: H. Caeiro Maria
empty or initialed name?: T. Buchli
empty or initialed name?: I. Smedberg
empty or initialed name?: R. Matuła
empty or initialed name?: F. Javier Pavon Carrasco
empty or initialed name?: T. Heister
empty or initialed name?: A. P. van den Berg
empty or initialed name?: W. Bangerth
empty or initialed name?: J. J. Marigo
empty or initialed name?: G. Made Agung Nandaka
empty or initialed name?: C. Hans Nelson
empty or initialed name?: A. Nathues
empty or initialed name?: J. B. Vincent
empty o

empty or initialed name?: C. Sigarán-Loría
empty or initialed name?: R. Hack
empty or initialed name?: J. D. Nieuwenhuis
empty or initialed name?: D. Kaniewski
empty or initialed name?: K. Szkornik
empty or initialed name?: G. Verstraeten
empty or initialed name?: P. Tregoning
empty or initialed name?: A. Purcell
empty or initialed name?: F. La Vigna
empty or initialed name?: P. Teoli
empty or initialed name?: R. Mazza
empty or initialed name?: G. Capelli
empty or initialed name?: P. Martin Mai
empty or initialed name?: O. Novotny
empty or initialed name?: J. Jansky
empty or initialed name?: J. Batlo
empty or initialed name?: H. Ferreira
empty or initialed name?: G. Zoppe'
empty or initialed name?: G. Marcato
empty or initialed name?: G. Verstraeten
empty or initialed name?: A. Kettner
empty or initialed name?: M. Van Den Eeckhaut
empty or initialed name?: H. M. Modaressi
empty or initialed name?: H. M. Müller
empty or initialed name?: M. Hata
empty or initialed name?: N. Oshiman
empty

empty or initialed name?: J. D. Eccles
empty or initialed name?: R. S. White
empty or initialed name?: P. A. F. Christie
empty or initialed name?: J. Chen
empty or initialed name?: X. Yang
empty or initialed name?: C. J. Spiers
empty or initialed name?: C. Boulton
empty or initialed name?: M. Villeneuve
empty or initialed name?: C. Goodin
empty or initialed name?: T. Tiira
empty or initialed name?: T. Janik
empty or initialed name?: E. Kozlovskaya
empty or initialed name?: M. Grad
empty or initialed name?: A. Korja
empty or initialed name?: K. Komminaho
empty or initialed name?: E. Hegedüs
empty or initialed name?: C. A. Kovács
empty or initialed name?: H. Silvennoinen
empty or initialed name?: E. Brückl
empty or initialed name?: J. Plomerova
empty or initialed name?: V. Babuska
empty or initialed name?: L. Vecsey
empty or initialed name?: T. Maurin
empty or initialed name?: J. F. Stéphan
empty or initialed name?: B. Mercier de Lépinay
empty or initialed name?: D. Dhont
empty or initia

empty or initialed name?: L. W. Chen
empty or initialed name?: S. K. Wu
empty or initialed name?: W. C. Chi
empty or initialed name?: C. S. Liu
empty or initialed name?: C. T. Shyu
empty or initialed name?: Y. S. Wang
empty or initialed name?: M. D. Ballmer
empty or initialed name?: C. P. Conrad
empty or initialed name?: E. I. Smith
empty or initialed name?: H. Sato
empty or initialed name?: S. Abe
empty or initialed name?: N. Kawai
empty or initialed name?: H. Saito
empty or initialed name?: N. Kato
empty or initialed name?: T. Ishiyama
empty or initialed name?: T. Iwasaki
empty or initialed name?: E. Kurashimo
empty or initialed name?: M. Inaba
empty or initialed name?: A. Van Horne
empty or initialed name?: P. Martin Mai
empty or initialed name?: I. Jiménez-Munt
empty or initialed name?: M. Fernandez
empty or initialed name?: S. Zlotnik
empty or initialed name?: R. Ditommaso
empty or initialed name?: M. Mucciarelli
empty or initialed name?: Z. Kereselidze
empty or initialed name?: N

empty or initialed name?: L. Bertrand
empty or initialed name?: E. LeGarzic
empty or initialed name?: Y. Géraud
empty or initialed name?: M. Diraison
empty or initialed name?: A. Hoechner
empty or initialed name?: M. Ge
empty or initialed name?: A. Babeyko
empty or initialed name?: T. Nissen-Meyer
empty or initialed name?: S. Hanasoge
empty or initialed name?: L. Stehly
empty or initialed name?: P. Cupillard
empty or initialed name?: D. Albarello
empty or initialed name?: M. Mucciarelli
empty or initialed name?: J. Freymark
empty or initialed name?: J. Sippel
empty or initialed name?: M. Scheck-Wenderoth
empty or initialed name?: H. -J. Götze
empty or initialed name?: C. Reichert
empty or initialed name?: M. Dec
empty or initialed name?: M. Malinowski
empty or initialed name?: B. Nita
empty or initialed name?: E. Perchuc
empty or initialed name?: M. -A. Schröter
empty or initialed name?: C. Weimann
empty or initialed name?: H. Sturm
empty or initialed name?: M. Holschneider
empty or in

empty or initialed name?: H. Serdar Akyuz
empty or initialed name?: T. -Y. Huang
empty or initialed name?: Y. -N. Chen
empty or initialed name?: Y. Gung
empty or initialed name?: W. -T. Liang
empty or initialed name?: L. -Y. Chiao
empty or initialed name?: M. Korhan Erturac
empty or initialed name?: J. G. Rubino
empty or initialed name?: L. B. Monachesi
empty or initialed name?: L. Guarracino
empty or initialed name?: T. M. Müller
empty or initialed name?: K. Holliger
empty or initialed name?: D. Trippanera
empty or initialed name?: M. Salvatore
empty or initialed name?: M. Porreca
empty or initialed name?: J. Ruch
empty or initialed name?: A. Pimentel
empty or initialed name?: J. Pacheco
empty or initialed name?: V. Acocella
empty or initialed name?: R. De Ritis
empty or initialed name?: G. Ventura
empty or initialed name?: M. Chiappini
empty or initialed name?: C. Legendre
empty or initialed name?: T. Meier
empty or initialed name?: S. Lebedev
empty or initialed name?: W. Friederich


empty or initialed name?: D. Blagojevic
empty or initialed name?: G. Todorovic
empty or initialed name?: V. Vasilic
empty or initialed name?: M. Namık Çaǧatay
empty or initialed name?: S. Karato
empty or initialed name?: T. Olugboji
empty or initialed name?: K. Michael Cline
empty or initialed name?: B. Jolly
empty or initialed name?: L. Lonergan
empty or initialed name?: A. Whittaker
empty or initialed name?: A. Arda Ozacar
empty or initialed name?: K. J. Shou
empty or initialed name?: L. Y. Fei
empty or initialed name?: J. F. Lee
empty or initialed name?: C. Y. Wei
empty or initialed name?: C. C. Wu
empty or initialed name?: C. Y. Hong
empty or initialed name?: S. Smith
empty or initialed name?: G. Di Toro
empty or initialed name?: E. Spagnuolo
empty or initialed name?: S. Nielsen
empty or initialed name?: M. Violay
empty or initialed name?: R. Spiess
empty or initialed name?: A. Billi
empty or initialed name?: V. Cuomo
empty or initialed name?: M. Proto
empty or initialed name?: F. 

empty or initialed name?: T. Kempka
empty or initialed name?: B. Norden
empty or initialed name?: E. Tillner
empty or initialed name?: B. Nakaten
empty or initialed name?: M. Kühn
empty or initialed name?: J. Chan
empty or initialed name?: D. R. Schmitt
empty or initialed name?: G. Nieuwenhuis
empty or initialed name?: E. Poureslami Ardakani
empty or initialed name?: J. Kueck
empty or initialed name?: M. R. Abasolo
empty or initialed name?: T. W. Becker
empty or initialed name?: L. A. Alpert
empty or initialed name?: I. W. Bailey
empty or initialed name?: M. S. Miller
empty or initialed name?: H. -S. Kim
empty or initialed name?: M. Riedel
empty or initialed name?: D. -G. Yoo
empty or initialed name?: G. -Y. Kim
empty or initialed name?: B. -J. Ryu
empty or initialed name?: J. Holzhauer
empty or initialed name?: C. Bordes
empty or initialed name?: F. Oppermann
empty or initialed name?: D. Brito
empty or initialed name?: U. Yaramanci
empty or initialed name?: C. Fillon
empty or initiale

empty or initialed name?: H. Razafindrakoto
empty or initialed name?: M. Mai
empty or initialed name?: F. Ozcep
empty or initialed name?: S. Karabulut
empty or initialed name?: T. Ozcep
empty or initialed name?: I. Smedberg
empty or initialed name?: M. Uski
empty or initialed name?: T. Tiira
empty or initialed name?: K. Komminaho
empty or initialed name?: A. Korja
empty or initialed name?: R. Matuła
empty or initialed name?: A. P. Valentine
empty or initialed name?: L. M. Kalnins
empty or initialed name?: J. Trampert
empty or initialed name?: F. Javier Pavon Carrasco
empty or initialed name?: J. Ramon Arrowsmith
empty or initialed name?: T. Geenen
empty or initialed name?: T. Heister
empty or initialed name?: A. P. van den Berg
empty or initialed name?: M. Jacobs
empty or initialed name?: W. Bangerth
empty or initialed name?: R. Dietmar Müller
empty or initialed name?: Y. Capdeville
empty or initialed name?: P. Cupillard
empty or initialed name?: J. J. Marigo
empty or initialed name?: 

empty or initialed name?: J. Arnoso
empty or initialed name?: E. J. Vélez
empty or initialed name?: V. Soler
empty or initialed name?: F. G. Montesinos
empty or initialed name?: M. Benavent
empty or initialed name?: J. R. Elliott
empty or initialed name?: Z. Li
empty or initialed name?: E. K. Nissen
empty or initialed name?: J. A. Jackson
empty or initialed name?: S. Lamb
empty or initialed name?: B. Parsons
empty or initialed name?: R. A. Sloan
empty or initialed name?: A. Fuenzalida
empty or initialed name?: H. Lyon-Caen
empty or initialed name?: M. Lancieri
empty or initialed name?: A. Rietbrock
empty or initialed name?: R. Madariaga
empty or initialed name?: A. García-Jerez
empty or initialed name?: A. Jiménez
empty or initialed name?: F. Luzón
empty or initialed name?: F. J. Sánchez-Sesma
empty or initialed name?: E. Carmona
empty or initialed name?: L. Crescentini
empty or initialed name?: V. Botta
empty or initialed name?: A. Amoruso
empty or initialed name?: A. Bettini
empty or

empty or initialed name?: G. C. Feng
empty or initialed name?: S. Jónsson
empty or initialed name?: X. L. Ding
empty or initialed name?: Z. Li
empty or initialed name?: J. -X. Dessa
empty or initialed name?: M. Lelièvre
empty or initialed name?: S. Simon
empty or initialed name?: A. Deschamps
empty or initialed name?: N. Béthoux
empty or initialed name?: S. Solarino
empty or initialed name?: M. -O. Beslier
empty or initialed name?: F. Sage
empty or initialed name?: O. Bellier
empty or initialed name?: F. Courboulex
empty or initialed name?: F. Klingelhoefer
empty or initialed name?: E. Eva
empty or initialed name?: G. Ferretti
empty or initialed name?: D. Scafidi
empty or initialed name?: M. Pavan
empty or initialed name?: C. Eva
empty or initialed name?: M. Lefeldt
empty or initialed name?: E. Flueh
empty or initialed name?: J. Vlcek
empty or initialed name?: T. Fischer
empty or initialed name?: T. Tóth
empty or initialed name?: G. Wórum
empty or initialed name?: A. Nádor
empty or ini

empty or initialed name?: J. -Michael Kendal
empty or initialed name?: K. Hosseini-zad
empty or initialed name?: S. C. Stähler
empty or initialed name?: K. Sigloch
empty or initialed name?: C. Scheingraber
empty or initialed name?: X. Mao
empty or initialed name?: J. H. Li
empty or initialed name?: B. Knapmeyer-Endrun
empty or initialed name?: F. Krüger
empty or initialed name?: S. Rondenay
empty or initialed name?: F. D. Pearce
empty or initialed name?: J. -A. L. Olive
empty or initialed name?: M. D. Behn
empty or initialed name?: J. Michael Kendall
empty or initialed name?: M. Rosenau
empty or initialed name?: O. Oncken
empty or initialed name?: Z. Erdos
empty or initialed name?: R. S. Huismans
empty or initialed name?: P. van der Beek
empty or initialed name?: S. Gutjahr
empty or initialed name?: S. Buske
empty or initialed name?: O. R. Clausen
empty or initialed name?: D. L. Egholm
empty or initialed name?: R. Wesenberg
empty or initialed name?: K. Emoto
empty or initialed name?: H

empty or initialed name?: T. Yamasaki
empty or initialed name?: G. A. Houseman
empty or initialed name?: S. Allgeyer
empty or initialed name?: H. Hébert
empty or initialed name?: R. Madariaga
empty or initialed name?: F. Boudin
empty or initialed name?: P. Bernard
empty or initialed name?: Y. Tony Song
empty or initialed name?: M. Dovžak
empty or initialed name?: S. Stanič
empty or initialed name?: K. Bergant
empty or initialed name?: G. Gregorič
empty or initialed name?: J. M. Marrero
empty or initialed name?: A. Garcia
empty or initialed name?: A. Llinares
empty or initialed name?: P. Lopez
empty or initialed name?: R. Ortinz
empty or initialed name?: J. Ole Ross
empty or initialed name?: B. Grecu
empty or initialed name?: C. Neagoe
empty or initialed name?: D. Tataru
empty or initialed name?: G. Stuart
empty or initialed name?: S. Allgeyer
empty or initialed name?: C. Daubord
empty or initialed name?: H. Hébert
empty or initialed name?: A. Loevenbruck
empty or initialed name?: F. Sc

empty or initialed name?: R. Cassidy
empty or initialed name?: J. -C. Comte
empty or initialed name?: J. Nitsche
empty or initialed name?: U. Ofterdinger
empty or initialed name?: R. Flynn
empty or initialed name?: M. Lendholt
empty or initialed name?: M. Hammitzsch
empty or initialed name?: M. A. Esbri Palomares
empty or initialed name?: P. J. Tackley
empty or initialed name?: M. Halis Saka
empty or initialed name?: D. Do Couto
empty or initialed name?: C. Gorini
empty or initialed name?: L. Jolivet
empty or initialed name?: J. Letouzey
empty or initialed name?: J. Smit
empty or initialed name?: E. d'Acremont
empty or initialed name?: J. L. Auxietre
empty or initialed name?: L. Le Pourhiet
empty or initialed name?: F. Estrada
empty or initialed name?: M. Elabassi
empty or initialed name?: A. Ammar
empty or initialed name?: H. Jabour
empty or initialed name?: B. Vendeville
empty or initialed name?: A. Plaza-Faverola
empty or initialed name?: I. Pecher
empty or initialed name?: D. Klaes

empty or initialed name?: M. Didem Cambaz
empty or initialed name?: Y. Soda
empty or initialed name?: T. Morishita
empty or initialed name?: H. -R. Wenk
empty or initialed name?: T. Favale
empty or initialed name?: F. De Angelis
empty or initialed name?: L. De Filippis
empty or initialed name?: E. Zabranova
empty or initialed name?: C. Matyska
empty or initialed name?: L. Hanyk
empty or initialed name?: Y. Jia
empty or initialed name?: N. Horn
empty or initialed name?: W. Lenhardt
empty or initialed name?: J. Ramón Arrowsmith
empty or initialed name?: N. Gestermann
empty or initialed name?: T. Plenefisch
empty or initialed name?: F. Vallianatos
empty or initialed name?: G. Papadakis
empty or initialed name?: G. Michas
empty or initialed name?: P. Sammonds
empty or initialed name?: A. Yalciner
empty or initialed name?: F. Imamura
empty or initialed name?: E. Mas
empty or initialed name?: I. Necmioglu
empty or initialed name?: C. Ozer
empty or initialed name?: A. Zaytsev
empty or initial

empty or initialed name?: J. Liu
empty or initialed name?: X. Chen
empty or initialed name?: E. Pozzo di Borgo
empty or initialed name?: J. Marfaing
empty or initialed name?: G. Waysand
empty or initialed name?: W. -J. Huang
empty or initialed name?: K. M. Johnson
empty or initialed name?: E. Carmona
empty or initialed name?: J. Almendros
empty or initialed name?: R. Martín
empty or initialed name?: G. Cortés
empty or initialed name?: G. Alguacil
empty or initialed name?: J. Moreno
empty or initialed name?: B. Martín
empty or initialed name?: A. Martos
empty or initialed name?: I. Serrano
empty or initialed name?: D. Stich
empty or initialed name?: J. M. Ibáñez
empty or initialed name?: C. Baillard
empty or initialed name?: W. Crawford
empty or initialed name?: V. Ballu
empty or initialed name?: C. Hibert
empty or initialed name?: A. Gailler
empty or initialed name?: H. Hébert
empty or initialed name?: A. Loevenbruck
empty or initialed name?: B. Hernandez
empty or initialed name?: S. S

empty or initialed name?: D. Fernández-Blanco
empty or initialed name?: G. Bertotti
empty or initialed name?: P. A. Torres
empty or initialed name?: S. Meletlidis
empty or initialed name?: N. Luengo-Oroz
empty or initialed name?: D. Moure
empty or initialed name?: C. Rodero
empty or initialed name?: V. Villasante-Marcos
empty or initialed name?: R. Abella
empty or initialed name?: C. López
empty or initialed name?: M. J. Blanco
empty or initialed name?: A. Jetson Ronald
empty or initialed name?: C. G. Lai
empty or initialed name?: A. S. Papageoriou
empty or initialed name?: D. Franke
empty or initialed name?: K. Piepjohn
empty or initialed name?: C. Gaedicke
empty or initialed name?: C. Brandes
empty or initialed name?: N. Sobelev
empty or initialed name?: B. Mouly
empty or initialed name?: A. Ozgun Konca
empty or initialed name?: I. Tonguç Uysal
empty or initialed name?: Y. Lavallée
empty or initialed name?: M. J. Heap
empty or initialed name?: A. Laumann
empty or initialed name?: K. 

In [10]:
# Call the Gender API and save output for all the GUESSED names
# WARN: Makes API calls
# Does nothing if the guessed names were genderized previously

print("starting size:", len(name_to_gr))

nnames = 0
to_genderize = []
for i, name in enumerate(first_names):

    if name in name_to_gr:
        #print("known, ", name)
        continue
    else:
        print("new, ", name)
        to_genderize.append(name)

    nnames += 1
    
    if nnames == 10: # guess the gender of 10 names simultaneously
        result = getGenders(to_genderize)
    #         if len(result) > 1:
    #             print("long result:", result)
        for j, pres in enumerate(result):
            r = pres
            gr = GenderResult(to_genderize[j], r)

            # update gender name map in memory
            name_to_gr[gr.name] = gr
            print(name_to_gr[gr.name].name, name_to_gr[gr.name].binary)

            # also save for later use
            file_name = "_".join(to_genderize[j].translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
            
            with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
                outfile.write(json.dumps(gr.to_map()))
        nnames = 0
        to_genderize = []

if nnames > 0:
    result = getGenders(to_genderize)
    for j, pres in enumerate(result):
        r = pres
        gr = GenderResult(to_genderize[j], r)

        # update gender name map in memory
        name_to_gr[gr.name] = gr
        print(name_to_gr[gr.name].name, name_to_gr[gr.name].binary)

        # also save for later use
        file_name = "_".join(to_genderize[j].translate(str.maketrans('', '', string.punctuation)).split(" ")) + ".json"
        
        with codecs.open(NAME_GENDER_DIR+file_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(gr.to_map()))
    nnames = 0
    to_genderize = []       
    
print("finish size:", len(name_to_gr))

print(name_to_gr["Weitao"])

starting size: 15811
new,  Wen-fei
new,  Zi-jun
new,  Yusheng
new,  Wen-lu
new,  Gianmaria
new,  Zheng-kang
new,  Yulin
new,  Rahsan
new,  Xuemin
new,  Annemarie
Wen-fei None
Zi-jun male
Yusheng male
Wen-lu None
Gianmaria male
Zheng-kang None
Yulin female
Rahsan female
Xuemin male
Annemarie female
new,  Yifan
new,  Lijun
new,  Chunyu
new,  Rose-marie
new,  Tjaart
new,  Wen-sheng
new,  Jongwon
new,  Jean-loup
new,  Li-wei
new,  Kuoliang
Yifan male
Lijun female
Chunyu male
Rose-marie female
Tjaart male
Wen-sheng male
Jongwon male
Jean-loup male
Li-wei male
Kuoliang None
new,  Ergun
new,  Arjan
new,  Gui-bin
new,  Shun-chiang
new,  Gaozhong
new,  Shinichi
new,  Bard
new,  Tae-hee
new,  Miguel-angel
new,  Run-qiu
Ergun male
Arjan male
Gui-bin None
Shun-chiang None
Gaozhong None
Shinichi male
Bard male
Tae-hee male
Miguel-angel male
Run-qiu None
new,  Bo-hung
new,  Ya-ju
new,  Xi-bin
new,  Muayyad
new,  Kuan-chuan
new,  Wen-jie
new,  Yong-hong
new,  Shao-yang
new,  Young-soo
new,  Wei-qiang


#### Now, let's create a json database that can be used in another notebook.

In [11]:
# Go both through the guessed names and the non-initialed names.
# Find genders on the base of the name_to_gr map and save the results


AUTHOR_GENDER_DIR = root + "/author_genders_egu/"
if not os.path.exists(AUTHOR_GENDER_DIR):
    os.mkdir(AUTHOR_GENDER_DIR)

for _, _, files in os.walk(PARSED_PAGES_DIR):
    for file in files:       
        with codecs.open(PARSED_PAGES_DIR+file, "r", "utf8") as infile:
            art = article_from_map(json.loads(infile.read()))
            
        if contains_initialed_name(art.names):
            continue
            
        all_names = art.names
        all_genders = []
        all_percent = []

        for n in all_names:
            n = n.split()[0].capitalize()
            try:
                all_genders.append(name_to_gr[n].binary)
                all_percent.append(float(name_to_gr[n].percent))
            except KeyError:
                print(n)
                #raise KeyError("Check name_to_gr database, it should be complete by now!")
        art_out = art.to_map()
        art_out["all_genders"] = all_genders
        art_out["all_percent"] = all_percent
       
        # save!
        outfile_name = os.path.basename(file)
        with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(art_out))
# now do the guessed names
for _, _, files in os.walk(GUESSED_NAMES_DIR):
    for file in files:       
        with codecs.open(GUESSED_NAMES_DIR+file, "r", "utf8") as infile:
            art = guessed_from_map(json.loads(infile.read()))
            
        all_names = art.guessed_names
        all_genders = []
        all_percent = []

        for n in all_names:
            if is_initialed_name(n):
                all_genders.append("init")
                all_percent.append(None)
                #print(n)
                continue

            n = n.split()[0].capitalize()            
            
            try:
                all_genders.append(name_to_gr[n].binary)
                all_percent.append(float(name_to_gr[n].percent))
            except KeyError:
                print(n)
                #raise KeyError("Name {} is not gendered yet".format(n))
        art_out = art.to_map()
        art_out["all_genders"] = all_genders
        art_out["all_percent"] = all_percent
        # save!
        outfile_name = os.path.basename(file)
        with codecs.open(AUTHOR_GENDER_DIR+outfile_name, "w", "utf8") as outfile:
            outfile.write(json.dumps(art_out))

In [13]:

files = glob(os.path.join(AUTHOR_GENDER_DIR, "*.json"))
count = 0
count_f = 0
count_m = 0
for file in files:
    outfile_name = os.path.basename(file)
    
    #print("ATTENTION nmax set")
    with codecs.open(file, "r", "utf8") as infile:
        art = gendered_from_map(json.loads(infile.read()))

    
    # Check if any name has gender == None
    
    for i, name in enumerate(art.names):
        if art.all_genders[i] == "init":
            count += 1
            continue
        
        if art.all_genders[i] != "male" and art.all_genders[i] != "female":
            count += 1
            
        if art.all_genders[i] == "male":
            count_m += 1
            
        if art.all_genders[i] == "female":
            count_f += 1
            
print('Total of uncathegorized names:', count, 'Percentage:' ,100*count/(count + count_f + count_m))
print('Total of female author names:', count_f, 'Percentage:', 100*count_f/(count + count_f + count_m))
print('Total of male author names:', count_m, 'Percentage:', 100*count_m/(count + count_f + count_m))

Total of uncathegorized names: 4423 Percentage: 5.347276793810071
Total of female author names: 18823 Percentage: 22.756452880372365
Total of male author names: 59469 Percentage: 71.89627032581757


### Re-do guessing for those that came out as None from the namsor database (good for chinese names)


In [15]:
# Configure API key authorization: api_key
configuration = openapi_client.Configuration()
configuration.api_key['X-API-KEY'] = 'd33fd630c4c66b498360a5e23a33f3b8'

# create an instance of the API class
api_instance = openapi_client.PersonalApi(openapi_client.ApiClient(configuration))
first_name = 'Ray Y.' # str | 
last_name = 'Chuang' # str | 

try:
    # Infer the likely gender of a name.
    api_response = api_instance.gender(first_name, last_name)
    pprint(api_response)
except ApiException as e:
    print("Exception when calling PersonalApi->gender: %s\n" % e)
    

{'category': None,
 'first_name': 'Ray Y.',
 'gender_scale': -0.7749051405700533,
 'id': '500a0ec8-3a5a-4cce-a871-b9ecf4fa9647',
 'last_name': 'Chuang',
 'likely_gender': 'male',
 'probability_calibrated': 0.8874525702850267,
 'score': 12.127235462630383,
 'script': 'LATIN'}


In [16]:
# loop over author_gender
AUTHOR_ALLGENDER_DIR = root + "/author_allgenders_egu"
if not os.path.exists(AUTHOR_ALLGENDER_DIR):
    os.mkdir(AUTHOR_ALLGENDER_DIR)

files = glob(os.path.join(AUTHOR_GENDER_DIR, "*.json"))
nmax = 10000
ncall = 0
for file in files:
    outfile_name = os.path.basename(file)
    if os.path.exists(os.path.join(AUTHOR_ALLGENDER_DIR, outfile_name)):
        continue
    
    #print("ATTENTION nmax set")
    with codecs.open(file, "r", "utf8") as infile:
        art = gendered_from_map(json.loads(infile.read()))

    
    # Check if any name has gender == None
    
    for i, name in enumerate(art.names):
        if art.all_genders[i] == "init":
            continue
        
        if art.all_genders[i] == "None":
            # print(art.names, art.all_genders, art.all_percent)
            
            # call namsor
            first_name = name.split()[0] 
            last_name = name.split()[-1]
            try:
                # Infer the likely gender of a name.
                api_response = api_instance.gender(first_name, last_name)
                # pprint(api_response)
                gender = api_response.likely_gender
                prob = api_response.probability_calibrated
                
                art.all_genders[i] = gender
                art.all_percent[i] = prob

            except ApiException as e:
                print("Exception when calling PersonalApi->gender: %s\n" % e)
            #print(art.names, art.all_genders, art.all_percent)
            
            #print("=========================")
            
            ncall += 1
            print(ncall, end=",")
        
    if ncall > nmax:
        break
        
    # overwrite the file
    art_out = art.to_map()

    # save!
    with codecs.open(os.path.join(AUTHOR_ALLGENDER_DIR, outfile_name), "w", "utf8") as outfile:
        outfile.write(json.dumps(art_out))        




1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,

730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767,768,769,770,771,772,773,774,775,776,777,778,779,780,781,782,783,784,785,786,787,788,789,790,791,792,793,794,795,796,797,798,799,800,801,802,803,804,805,806,807,808,809,810,811,812,813,814,815,816,817,818,819,820,821,822,823,824,825,826,827,828,829,830,831,832,833,834,835,836,837,838,839,840,841,842,843,844,845,846,847,848,849,850,851,852,853,854,855,856,857,858,859,860,861,862,863,864,865,866,867,868,869,870,871,872,873,874,875,876,877,878,879,880,881,882,883,884,885,886,887,888,889,890,891,892,893,894,895,896,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943,944,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,970,971,972,973,974,975,976,977,978,979,

2428,2429,2430,2431,2432,2433,2434,2435,2436,2437,2438,2439,2440,2441,2442,2443,2444,2445,2446,2447,2448,2449,2450,2451,2452,2453,2454,2455,2456,2457,2458,2459,2460,2461,

In [19]:
AUTHOR_ALLGENDER_DIR = root + "/author_allgenders"

files = glob(os.path.join(AUTHOR_ALLGENDER_DIR, "*.json"))
count = 0
count_f = 0
count_m = 0
for file in files:
    outfile_name = os.path.basename(file)
    
    #print("ATTENTION nmax set")
    with codecs.open(file, "r", "utf8") as infile:
        art = gendered_from_map(json.loads(infile.read()))

    
    # Check if any name has gender == None
    
    for i, name in enumerate(art.names):

        if (i == 0) and (art.all_genders[i] == "init"):
            count += 1
            continue
        
        if (i == 0) and (art.all_genders[i] != "male" and art.all_genders[i] != "female"):
            count += 1
            
        if (i == 0) and (art.all_genders[i] == "male"):
            count_m += 1
            
        if (i == 0) and (art.all_genders[i] == "female"):
            count_f += 1
            
print('Total of uncathegorized names:', count, 'Percentage:' ,100*count/(count + count_f + count_m))
print('Total of female author names:', count_f, 'Percentage:', 100*count_f/(count + count_f + count_m))
print('Total of male author names:', count_m, 'Percentage:', 100*count_m/(count + count_f + count_m))

Total of uncathegorized names: 663 Percentage: 3.424586776859504
Total of female author names: 4694 Percentage: 24.24586776859504
Total of male author names: 14003 Percentage: 72.32954545454545
