In [None]:
from semanticscholar import SemanticScholar
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

import random
import time
import csv

from pandas import DataFrame


In [24]:
def write_csv(data, filename):
    with open(filename, 'w', encoding='UTF8') as f:
        # create the csv writer
        writer = csv.writer(f)
        # write a row to the csv file
        writer.writerow(data)

## First experiment

In [3]:
sch = SemanticScholar()
results = sch.search_paper('+"gender bias"', fields=['title','year', 'authors', 'publicationTypes', 'citationCount', 'url'])
print(f'{results.total} results.', f'First occurrence: {results[0].title}.')

2357826 results. First occurrence: Gender Bias in Machine Translation.


In [3]:
data = []
for result in results:
    for author in result['authors']:
        data.append({'Title': result['title'], 
                    'Year': result['year'],
                    'Author': author['name'],
                    'PublicationType': result['publicationTypes'], 
                    'CitationCount': result['citationCount'], 
                    'URL': result['url']})

In [5]:
results[0].keys()

dict_keys(['paperId', 'url', 'title', 'year', 'citationCount', 'publicationTypes', 'authors'])

In [12]:
len(results)

100

In [6]:
df = DataFrame(data, columns = ['Title', 'Year', 'Author', 'PublicationType', 'CitationCount', 'URL'])
df.head()

Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL
0,Gender Bias in Machine Translation,2021.0,Beatrice Savoldi,"[JournalArticle, Review]",44,https://www.semanticscholar.org/paper/2ab29510...
1,Gender Bias in Machine Translation,2021.0,Marco Gaido,"[JournalArticle, Review]",44,https://www.semanticscholar.org/paper/2ab29510...
2,Gender Bias in Machine Translation,2021.0,L. Bentivogli,"[JournalArticle, Review]",44,https://www.semanticscholar.org/paper/2ab29510...
3,Gender Bias in Machine Translation,2021.0,Matteo Negri,"[JournalArticle, Review]",44,https://www.semanticscholar.org/paper/2ab29510...
4,Gender Bias in Machine Translation,2021.0,M. Turchi,"[JournalArticle, Review]",44,https://www.semanticscholar.org/paper/2ab29510...


NameError: name 'df' is not defined

## Data 2012-2023

In [4]:
data_authors = []
df_authors = DataFrame(columns = ['Title', 'Year', 'Author', 'PublicationType', 'CitationCount', 'URL'])

sch = SemanticScholar()
for year in range(2012, 2024):
    results = sch.search_paper('+"gender bias"', 
        fields=['title','year', 'authors', 'publicationTypes', 'citationCount', 'url'], 
        year=year)
    print(len(results))
    for result in results:
        for author in result['authors']:
            data_authors.append({'Title': result['title'], 
                    'Year': result['year'],
                    'Author': author['name'],
                    'PublicationType': result['publicationTypes'], 
                    'CitationCount': result['citationCount'], 
                    'URL': result['url']})
    print(f'Since 2012 until {year} there are {len(data_authors)} results')
    

100
Since 2012 until 2012 there are 29948 results
100
Since 2012 until 2013 there are 61566 results
100
Since 2012 until 2014 there are 94458 results
100
Since 2012 until 2015 there are 127151 results
100
Since 2012 until 2016 there are 161192 results
100
Since 2012 until 2017 there are 196362 results
100
Since 2012 until 2018 there are 231544 results
100
Since 2012 until 2019 there are 266785 results
100
Since 2012 until 2020 there are 274972 results
100
Since 2012 until 2021 there are 314638 results
100
Since 2012 until 2022 there are 352906 results
100
Since 2012 until 2023 there are 396553 results


write to csv

In [18]:
# open the file in the write mode
with open('data/semanticScholar_try_+gender_bias.csv', 'w', encoding='UTF8') as f:
    # create the csv writer
    writer = csv.writer(f)
    # write a row to the csv file
    writer.writerow(data_authors)

write to a DataFrame

In [5]:
df_authors = DataFrame(data_authors, columns = ['Title', 'Year', 'Author', 'PublicationType', 'CitationCount', 'URL'])
print(df_authors['Author'].nunique())

247671


check duplicates

In [88]:
df_au_dd = df_authors.drop_duplicates(subset=["Title", "Author"], keep="first")
print(len(df_authors), len(df_au_dd))

396553 357789


drop duplicates inplace

In [130]:
df_authors.drop_duplicates(subset=["Title", "Author"], keep="first", inplace=True, ignore_index=True)

cast Year as int

In [92]:
df_authors.Year.astype('Int64')

0         2012
1         2012
2         2012
3         2012
4         2012
          ... 
396548    2023
396549    2023
396550    2023
396551    2023
396552    2023
Name: Year, Length: 357789, dtype: Int64

Show main info about the Dataframe and number of unique names

In [93]:
print(df_authors.info())
print(df_authors.isnull().sum())
df_authors.Author.nunique()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357789 entries, 0 to 396552
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Title            357789 non-null  object
 1   Year             357789 non-null  int64 
 2   Author           357789 non-null  object
 3   PublicationType  229206 non-null  object
 4   CitationCount    357789 non-null  int64 
 5   URL              357789 non-null  object
dtypes: int64(2), object(4)
memory usage: 19.1+ MB
None
Title                   0
Year                    0
Author                  0
PublicationType    128583
CitationCount           0
URL                     0
dtype: int64


247671

In [38]:
df_authors.to_csv('data/authors_2012-2023.csv')

### Sharpen the results

In [102]:
print('3 patterns', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Bb]ias|[Gg]ender(.*)[Gg]ap|[Gg]ender(.*)[dD]ifference')]))
print('Gender bias', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Bb]ias')]))
print('Gender Gap', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Gg]ap')]))
print('Gender difference', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Dd]ifference')]))


  print('3 patterns', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Bb]ias|[Gg]ender(.*)[Gg]ap|[Gg]ender(.*)[dD]ifference')]))


3 patterns 39287


  print('Gender bias', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Bb]ias')]))


Gender bias 7718


  print('Gender Gap', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Gg]ap')]))


Gender Gap 4810


  print('Gender difference', len(df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Dd]ifference')]))


Gender difference 27218


In [377]:
df_authors_re = df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Bb]ias|[Gg]ender(.*)[Gg]ap|[Gg]ender(.*)[dD]ifference')]
df_authors_re = df_authors_re.reset_index(drop=True)
#df_authors_re.to_csv('data/authors_2012-2023_regex.csv')

  df_authors_re = df_authors[df_authors.Title.str.contains('[Gg]ender(.*)[Bb]ias|[Gg]ender(.*)[Gg]ap|[Gg]ender(.*)[dD]ifference')]


In [378]:
print('number of row in the initial DF: ', len(df_authors), '\n', 'number of row in the DF after regex: ', len(df_authors_re), sep='')
print('number of unique articles before search precision:', df_authors['Title'].nunique())
print('number of unique article titles after search precision:',df_authors_re['Title'].nunique()) #.to_csv('data/regex_search.csv')
print('number of unique author names after search precision:',df_authors_re['Author'].nunique()) #.to_csv('data/regex_search.csv')

number of row in the initial DF: 357789
number of row in the DF after regex: 39287
number of unique articles before search precision: 99189
number of unique article titles after search precision: 10551
number of unique author names after search precision: 33793


### Data cleaning

#### Clean chinese characters

Search for any Chinese characters in Author names, as this is the crucial informaiton for our data.

In [320]:
df_authors_re[['Title', 'Author']][df_authors_re['Author'].str.contains(r'[\u4e00-\u9FFF]', regex=True)]

Unnamed: 0,Title,Author
4089,Gender Roles: Biases or Differences?,魏静
4179,從本土企業董事性別比例平衡與績效的關係看性別平等意涵; Corporate Board Ge...,王孟洵
7925,Gender Bias and Resistance in Edna O'Brien's C...,張崇旂
7996,Gender Bias Against Foreign Wives : Analysis o...,加藤 直子
9900,Lower Melatonin Secretion in Older Females: Ge...,Kenji 賢史 Obayashi 大林
9901,Lower Melatonin Secretion in Older Females: Ge...,Keigo 圭吾 Saeki 佐伯
9902,Lower Melatonin Secretion in Older Females: Ge...,Nobuhiro 庸浩 Tone 刀根
9903,Lower Melatonin Secretion in Older Females: Ge...,Junko 淳子 Iwamoto 岩本
9904,Lower Melatonin Secretion in Older Females: Ge...,Kimie 季美恵 Miyata 宮田
9905,Lower Melatonin Secretion in Older Females: Ge...,Yoshito 義人 Ikada 筏


Often if the author name has Chinese characters, also title would have them, but they are always accompanied with an English title, so we can remove Chinese characters and be sure, that the title will not ne NaN.

In [379]:
df_authors_re['Title'] = df_authors_re['Title'].str.replace(r'[\u4e00-\u9FFF]','', regex=True)

We will partially drop rows, where the author column only contains Chinese characters, if in this case the preliminary analysis has shown, that's it'snot possible to find out the full name of the author. For some titles this information was found and will be udated.

On the other hand we will keep the rows with mixed Latin and Chinese characters, as they provide the name information and just remove Chinese characters.

In [380]:
titles_drop = ['Deuce or advantage? : examining gender bias in online coverage of professional tennis through a mixed methodology',
    'Gender Roles: Biases or Differences?',
    'Gender Differences in Conversation',
    'Analysis of Gender Differences in Color Reaction in TV Show You Are the One',
    'Gender Bias and Stereotypes in English Example Sentences—Taking Linguistic Academic Journals as Example']
title_name_drop = ['; Corporate Board Gender Diversity and Firm Financial Performance：The Institutional Investor Bias in Taiwan',
    '王孟洵']

print(len(df_authors_re))

i_toDrop = df_authors_re[(df_authors_re.Title.isin(titles_drop)) |\
    ((df_authors_re.Title == title_name_drop[0]) & \
    (df_authors_re.Author == title_name_drop[1]))].index
df_authors_re = df_authors_re.drop(i_toDrop)

print(len(df_authors_re))

39287
39280


In [381]:
df_authors_re['Author'][df_authors_re['Author'].str.contains(r'[\u4e00-\u9FFF]', regex=True) & df_authors_re['Author'].str.contains(r'[a-zA-Z]', regex=True)] \
    = df_authors_re['Author'].str.replace(r'[\u4e00-\u9FFF]','', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_authors_re['Author'][df_authors_re['Author'].str.contains(r'[\u4e00-\u9FFF]', regex=True) & df_authors_re['Author'].str.contains(r'[a-zA-Z]', regex=True)] \


In [382]:
replace_ChinCharNames = {'張崇旂': 'Tsung-Chi Chang', #'Gender bias and resistance in Edna O’Brien’s “country girls' trilogy”' 
    '加藤 直子': 'Naoko Kato', #'Gender Bias Against Foreign Wives : Analysis of International Divorce Cases in Japan' 
    '三好 葉月': 'Hazuki Moyoshi', #'Reducing unconscious gender bias through workshop with co-speculative design'
    '加藤 朗': 'Akira Kato'} #'Reducing unconscious gender bias through workshop with co-speculative design'

df_authors_re['Author'] = df_authors_re['Author'].replace(replace_ChinCharNames, regex=True)

In [383]:
df_authors_re[['Title','Author']][df_authors_re['Author'].str.contains(r'[\u4e00-\u9FFF]', regex=True)]

Unnamed: 0,Title,Author


#### Search for Hangul (Korean characters)

In [327]:
df_authors_re[['Title','Author']][df_authors_re['Author'].str.contains(r'[\u3131-\u3163\uac00-\ud7a3]', regex=True)]

Unnamed: 0,Title,Author
154,Gender Bias in Schooling,김기승
3518,한국가족 내 부부간 성관계에 투영된 젠더 격차(Gender Gap),함인희
7894,Gender Differences in Oral Health Literacy rel...,이규은
7895,Gender Differences in Oral Health Literacy rel...,염영희
7896,Gender Differences in Oral Health Literacy rel...,김상숙
7897,Gender Differences in Oral Health Literacy rel...,한정희
7995,Cultural Linguistic Study on the Chinese Gende...,김현태
8730,A Study on Factors Influencing Elders' Suicida...,구춘영
8731,A Study on Factors Influencing Elders' Suicida...,김정순
8732,A Study on Factors Influencing Elders' Suicida...,유정옥


In [384]:
replace_KorCharNames = {'김기승': 'Seung-Gi Kim',
    'M. Milligan': 'Michael A. Milligan',
    '이규은': 'Kyu Eun lee', #Gender Differences in Oral Health Literacy
    '염영희': 'Young-Hee Yom', #Gender Differences in Oral Health Literacy
    '김상숙': 'Sang Suk Kim', #Gender Differences in Oral Health Literacy
    '한정희': 'Jung Hee Han', #Gender Differences in Oral Health Literacy
    '김현태': 'Kim Hyun Tae', #'Korean' 'Male' #'Cultural Linguistic Study on the Chinese Gender-biased Terms against Women'
    '구춘영': 'Chun Young Koo', #A Study on Factors Influencing Elders' Suicidal Ideation: Focused on Comparison of Gender Differences
    '김정순': 'Jung Soon Kim', #A Study on Factors Influencing Elders' Suicidal Ideation: Focused on Comparison of Gender Differences
    '유정옥': 'Jungok Yu', #A Study on Factors Influencing Elders' Suicidal Ideation: Focused on Comparison of Gender Differences
    '김세현': 'Sehyoun Kim'} #An Audience-response Questionnaire Survey on Gender-biased Language in Korean Subtitling}

df_authors_re['Author'] = df_authors_re['Author'].replace(replace_KorCharNames, regex=True)
i_toDrop = df_authors_re[df_authors_re.Title == '한국가족 내 부부간 성관계에 투영된 젠더 격차(Gender Gap)'].index
df_authors_re = df_authors_re.drop(i_toDrop)

len(df_authors_re)


39279

#### Check rows with names, that only contain one word

In [392]:
df_authors_re[['Title', 'Author']][~df_authors_re.Author.str.contains('\s')] #.to_csv('data/onewordnames.csv')
    #'^[\S]\.[\S].')] 
    # #& ~df_authors_re.Author.str.contains('^[a-zA-Z]\.\s[\S].*')]


Unnamed: 0,Title,Author


In [370]:
print(df_authors_re['Author'][df_authors_re.Author.isin(names_short)])

Series([], Name: Author, dtype: object)


In [393]:
names_update = {'K: Hughes': 'Karen D. Hughes',
'S. Albanesi': 'Stefania Albanesi',
'Nyfed': 'Aysegül Sahin',
'Bankole': 'Bankole Fasanya',
'Maranda': 'Maranda Mcbride',
'Pope-Ford': 'Regina Pope-Ford',
'Ntuen': 'Celestine Ntuen',
'E. Alpha': 'Ayande Alpha',
'Quinetta': 'Quinetta M. Roberson',
'Hrebickova': 'Hana Hrebickova',
#'M.': 'Gregory S. Parks',
'E. Oyeniyi': 'Akande Samson Oyeniyi',
'ZhangXiaofei': 'Zhang Xiaofei',
'GuoXitong': 'Guo Xitong',
'LaiKee-hung': 'Lai Kee-hung',
'GuoFeng': 'Guo Feng',
'LiChenlei': 'Li Chenlei',
'CohenAaron': 'Aaron Cohen',
'Ein-MorEliana': 'Eliana Einmor',
'StessmanJochanan': 'Jochanan Stessman',
'B. Rustin': 'Brett Rustin Nielsen',
'R. Foels': 'Rob Foels',
'I. GuerdjikovaAnna': 'Anna I. Guerdjikova',
'J. BlomThomas': 'Thomas J. Blom',
'MoriNicole': 'Nicole Mori',
'CasutoLeah': 'Leah Casuto',
'KeckPaul E.Jr.': 'Paul E.Jr. Keck',
'L. McElroySusan': 'Susan McElroy',
'TerranovaVictoria': 'Victoria Terranova',
'M. VandiverDonna': 'Donna Vandiver',
'GuoXitong': 'Guo Xitong',
'HanXiaocui': 'Han Xiaocui',
'ZhangXiaofei': 'Zhang Xiaofei',
'DangYuanyuan': 'Dang Yuanyuan',
'ChenChun': 'Chen Chun',
'Corrine': 'Corinne Wilsey',
'Hazel-FernandezLeslie': 'Leslie Hazel-Fernandez',
'NeroDamion': 'Damion Nero',
'MoretzChad': 'Chad Moretz',
'SlabaughLane': 'Lane Slabaugh',
'MeahYunus': 'Yunus Meah',
'BaltzJean': 'Jean Baltz',
'CostantinoMary': 'Mary Costantino',
'BouchardJonathan': 'Jonathan Bouchard',
'Omiya': 'Kazuto Omiya',
'S. Gilli': 'Sofia Gilli',
'TempletonKim': 'Kim Templeton',
'S. Javed': 'Saira Javed',
'Umm-e-Siddiqa': 'Umme Siddiqa',
'IramZehraBokharey': 'Zehra Iram Bokharey',
'J. Turabián': 'Jose Turabián',
'ra Moreno-Ruiz': 'Sandra Moreno-Ruiz',
'R. Cucho-Jove': 'Raul Cucho-Jove',
'ro Villarin-Castro': 'Alejandro Villarin-Castro',
'P. Bhattacharya': 'Prabir Bhattacharya',
'Amsterdam': 'Marja J Aartsen',
'S. Yamtinah': 'Sri Yamtinah',
'M. Masykuri': 'Mohammad Masykuri',
'A. Ashadi': 'Ashadi Ashadi',
'A. Shidiq': 'Ari Syahidul Shidiq',
'CarapinhaRené': 'René Carapinha',
'E. Choo': 'Esther Choo',
'R. DeMayo': 'Robert DeMayo',
'Warnakula':'Upuli Warnakula',
"Stephen D. O'Connell": 'Stephen D. OConnell',
"O'Connell": 'Stephen D. OConnell',
'Baloglu': 'Mustafa Baloğlu',
'AuerMichael': 'Michael Auer',
'FrauscherBirgit': 'Birgit Frauscher',
'HochleitnerMargarethe': 'Margarethe Hochleitner',
'HöglBirgit': 'Birgit Högl',
'Mzamu-Kampanje': 'Jessica Mzamu-Kampanje',
'SudomKerry': 'Kerry Sudom',
'GuérinEva': 'Eva Guérin',
'E. C. LeeJennifer': 'Jennifer E.C. Lee',
'S. Ananiadou': 'Sophia Ananiadou',
'Odbal': 'Odbal Odbal',
'Y. Hirota': 'Yusuke Hirota',
'y-hirota': 'Yuta Nakashima', 
'O. Steinlein': 'Ortrud Steinlein',
'M. Reithmair': 'Marlene Reithmair',
'ya Syunyaeva': 'Zulfiya Syunyaeva',
'E. Sattler': 'Elke C Sattler',
'Anthonius':'Anthonius de Boer',
'Denier': 'Nicole Denier',
'Gag': 'Gagandeep Singh',
'M. Abiodun':'Abiodun M. Gesinde',
'Js Oluwafunto':'Oluwafunto J. Sanu'}

df_authors_re['Author'] = df_authors_re['Author'].replace(names_update)


In [390]:
add_lines = {'Author': ['Mike Martin',
        'Daniel Zimprich', 
        'Noa Garcia', 
        'Gregory S. Parks', 
        'Rodney M. Miller', 
        'Ashadi Ashadi', 
        'Cecilia O. Alm',
        'Reeta Karra',
        'Jennifer Lyke',
        'Yong Li',
        'Jana Vasickova',
        'Dorota Groffik',
        'Sharon Luise Njet-Moi Tjin A Tsoi',
        'Andries Koster'
        ],
    'Title' : ['Gender differences in level and change in cognitive functioning : Results from the Longitudinal Aging Study',
        'Gender differences in level and change in cognitive functioning : Results from the Longitudinal Aging Study',
        'Gender and Racial Bias in VisualQuestion Answering Datasets',
        'Eighteen Million Cracks: Genders Role in the 2008 Presidential Campaign',
        'If the Shoe Fits: A Historical Exploration of Gender Bias in the U.S. Sneaker Industry',
        "Gender differences in students’ attitudes toward science: An analysis of students’ science process skill using testlet instrument",
        'Analyzing Gender Bias in Student Evaluations Acknowledgments',
        'Gender Bias in Availability of School Education in Villages - A Study of Kalisindh Thermal Power Project',
        'Gender Differences in Perceived Illness, Stress, and Coping in Undergraduates',
        'Racial/ethnic and gender differences in severity of diabetes-related complications, health care resource use, and costs in a Medicare population.',
        'Gender, Age and Body Mass Differences Influencing the Motivation for Physical Activity among Polish Youths',
        'Gender, Age and Body Mass Differences Influencing the Motivation for Physical Activity among Polish Youths',
        'Gender Differences in Motivation and Learning Preferences of Pharmacists in Lifelong Learning',
        'Gender Differences in Motivation and Learning Preferences of Pharmacists in Lifelong Learning'
]}

df_authors_re = df_authors_re.append(DataFrame(add_lines,columns=['Author', 'Title'])).reset_index(drop=True)
df_authors_re.tail()

  df_authors_re = df_authors_re.append(DataFrame(add_lines,columns=['Author', 'Title'])).reset_index(drop=True)


Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL
39211,Racial/ethnic and gender differences in severi...,,Yong Li,,,
39212,"Gender, Age and Body Mass Differences Influenc...",,Jana Vasickova,,,
39213,"Gender, Age and Body Mass Differences Influenc...",,Dorota Groffik,,,
39214,Gender Differences in Motivation and Learning ...,,Sharon Luise Njet-Moi Tjin A Tsoi,,,
39215,Gender Differences in Motivation and Learning ...,,Andries Koster,,,


In [388]:
titles_drop2 = ['Gender-biased Diagnosing', 
    'Opiorphin Secretion Pattern in Healthy Volunteers: Gender Difference andOrgan Specificity',
    "Gender Bias in Women's Health, Obstetric, and Neonatal Nursing."]

i_toDrop = df_authors_re[df_authors_re['Title'].isin(titles_drop2)].index
df_authors_re = df_authors_re.drop(i_toDrop)

In [389]:
print(len(df_authors_re))
i_noSpace = df_authors_re[~df_authors_re.Author.str.contains('\s')].index
df_authors_re = df_authors_re.drop(i_noSpace)
print(len(df_authors_re))

39268
39202


In [391]:
df_authors_re

Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL
0,Gender Bias in the Iranian High School EFL Tex...,2012.0,M. Amini,,85.0,https://www.semanticscholar.org/paper/0ea3d994...
1,Gender Bias in the Iranian High School EFL Tex...,2012.0,P. Birjandi,,85.0,https://www.semanticscholar.org/paper/0ea3d994...
2,Stag Parties Linger: Continued Gender Bias in ...,2012.0,L. Isbell,[JournalArticle],93.0,https://www.semanticscholar.org/paper/551f36ed...
3,Stag Parties Linger: Continued Gender Bias in ...,2012.0,T. Young,[JournalArticle],93.0,https://www.semanticscholar.org/paper/551f36ed...
4,Stag Parties Linger: Continued Gender Bias in ...,2012.0,A. Harcourt,[JournalArticle],93.0,https://www.semanticscholar.org/paper/551f36ed...
...,...,...,...,...,...,...
39211,Racial/ethnic and gender differences in severi...,,Yong Li,,,
39212,"Gender, Age and Body Mass Differences Influenc...",,Jana Vasickova,,,
39213,"Gender, Age and Body Mass Differences Influenc...",,Dorota Groffik,,,
39214,Gender Differences in Motivation and Learning ...,,Sharon Luise Njet-Moi Tjin A Tsoi,,,


#### Other strange cases

Rename some titles, that don't correspond fully to original paper title.

In [394]:
# 'https://www.semanticscholar.org/paper/b5612cc672817a7d65faf8c273550858b63790a2'
rename_title = {'Re; Pandit SR, Venugopal P, Keshavamurthy R, Chawla A. Challenges and gender-based differences for women in the Indian urological workforce: Results of a survey. Indian J Urol 2022;38:282-6':\
        'Challenges and gender-based differences for women in the Indian urological workforce: Results of a survey',
    'SUCCESSFUL TITLE VII CLAIM FOR IMPLICIT GENDER BIAS ? A . Implicit Bias : A Definition B . Implicit Gender Bias , Title VII , and the Clinton Campaign':\
        'Eighteen Million Cracks: Genders Role in the 2008 Presidential Campaign'}

df_authors_re['Title'] = df_authors_re.Title.replace(rename_title)

name_toDrop = df_authors_re[df_authors_re.Author == 'for the Colon Cancer Family Registry'].index
df_authors_re = df_authors_re.drop(name_toDrop)

Check the rename results

In [396]:
df_authors_re['Title'][df_authors_re['Title'].isin(rename_title.values())]

Series([], Name: Title, dtype: object)

In [397]:
df_authors_re.to_csv('data/authors_2012-2023_regex.csv')

### Find rows with full names

create two DF, with full names and with short names

In [486]:
df_authors_full = df_authors_re[~df_authors_re.Author.str.contains('^[\S]\.\s[\S].*|^[\S]\.\s[\S]\.|^[\S]\.[\S].')].reset_index(drop=True)
    #~df_authors_re.Author.str.contains('^[\S]\.\s[\S]\.*]')]
df_authors_short = df_authors_re[df_authors_re.Author.str.contains('^[\S]\.\s[\S].*|^[\S]\.\s[\S]\.|^[\S]\.[\S].')].reset_index(drop=True)
    #df_authors_re.Author.str.contains('^[\S]\.\s[\S]\.')]
print('Length of DF with full names is ', len(df_authors_full), '\n', 'Length of DF with short names is ', len(df_authors_short), sep='')

Length of DF with full names is 15516
Length of DF with short names is 23700


check if there are any authors in both df due to a mistake in regex

In [399]:
df_authors_full[df_authors_full['Author'].isin(df_authors_short['Author'])] #[df_authors.Author.str.contains('^[A-Za-z]{1}\.\s[A-Za-z-]{2,50}')]

Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL


check some problematic cases

In [400]:
print(df_authors_short['Author'][df_authors_short['Author']=='G. Ríos-Muñoz'],
    df_authors_short['Author'][df_authors_short['Author']=='A. van Mark'],
    df_authors_short['Author'][df_authors_short['Author']=='Á. M. Fidalgo'],
    df_authors_short['Author'][df_authors_short['Author']=='C. Çöteli'], 
    df_authors_short['Author'][df_authors_short['Author']=='K. Židová'], 
    df_authors_short['Author'][df_authors_short['Author']=='Á. Szabó-Morvai'], sep='\n')

37272    G. Ríos-Muñoz
Name: Author, dtype: object
34619    A. van Mark
Name: Author, dtype: object
20307    Á. M. Fidalgo
Name: Author, dtype: object
27359    C. Çöteli
Name: Author, dtype: object
3357    K. Židová
Name: Author, dtype: object
38231    Á. Szabó-Morvai
Name: Author, dtype: object


Print the info about resulting dataframes

In [487]:
print(f'In the final search there are {df_authors_re.Title.nunique()} publications from {df_authors_re.Author.nunique()} authors')
print(f'{df_authors_full.Title.nunique()} publications have {df_authors_full.Author.nunique()} full names')
print(f'{df_authors_full[df_authors_full.Title.isin(df_authors_short.Title.unique())].Title.nunique()} publications have both forms of author names')
print(f'{df_authors_short[df_authors_short.Title.isin(df_authors_full.Title.unique())].Author.nunique()} author names have to be corrected')
print(f'{df_authors_full[~df_authors_full.Title.isin(df_authors_short.Title.unique())].Title.nunique()} publications \
have {df_authors_short[~df_authors_short.Title.isin(df_authors_full.Title.unique())].Author.nunique()} authors with only short forms ')

In the final search there are 10536 publications from 33714 authors
7444 publications have 13720 full names
5056 publications have both forms of author names
12405 author names have to be corrected
2388 publications have 8598 authors with only short forms 


In [488]:
df_authors_full.Year.value_counts(sort=False)

2012.0    1190
2013.0    1106
2014.0    1183
2015.0    1283
2016.0    1377
2017.0    1510
2018.0    1506
2019.0    1580
2020.0     633
2021.0    1889
2022.0    1817
2023.0     428
Name: Year, dtype: int64

In [None]:
#df_authors_full[['First name', 'Last name', 'Middle names']] = df_authors_full.Author.str.split(expand=True)

In [489]:
def name_separator(fullName):
    name = fullName.split()
    if len(name) > 2:
        firstName = name[0]
        lastName = name[-1]
        middleName = ' '.join(name[1:-1])
        return firstName, lastName, middleName
    elif len(name) == 2:
        firstName = name[0]
        lastName = name[1]
        return firstName, lastName, None

df_authors_full[['FirstName', 'LastName', 'MiddleName']] = df_authors_full.apply(lambda x: name_separator(x.Author), axis=1, result_type="expand")

     #.str.split(expand=True)

In [None]:
# Alternative: https://stackoverflow.com/questions/38437847/pandas-split-name-column-into-first-and-last-name-if-contains-one-space
# name_df[['first_name','last_name']] = name_df['name'].loc[name_df['name'].str.split().str.len() == 2].str.split(expand=True)

In [490]:
df_authors_full[['Author','FirstName', 'LastName', 'MiddleName']][~df_authors_full['MiddleName'].isna()]

Unnamed: 0,Author,FirstName,LastName,MiddleName
0,Laura V. Zimmermann,Laura,Zimmermann,V.
1,Seyyed Ahmad Mousavi,Seyyed,Mousavi,Ahmad
4,Margaret S. Stockdale,Margaret,Stockdale,S.
6,Seyyed Ahmad Mousavi,Seyyed,Mousavi,Ahmad
7,Judith Sama Yeba,Judith,Yeba,Sama
...,...,...,...,...
15487,Anna M. Stadelman,Anna,Stadelman,M.
15505,Gregory S. Parks,Gregory,Parks,S.
15506,Rodney M. Miller,Rodney,Miller,M.
15508,Cecilia O. Alm,Cecilia,Alm,O.


In [407]:
import random

test_batch = random.sample(list(df_authors_full['Author'].unique()), 500)
test_batch

['Glenn Hardaker',
 'Gail M. Williams',
 'Roger Kingerlee',
 'Fabian Kosse',
 'Laura N MacMullin',
 'Irina Rets',
 'Emma A Gans',
 'Noah J. Kaplan',
 'Yi-Qing Yang',
 'Danika I. DesRoches',
 'Youngji Kim',
 'Yingping Liu',
 'Lukas Retzer',
 'Emma von Essen',
 'MR Chiappetta',
 "Patricia O'Rourke",
 'Shawn M. Bergman',
 'Elena Manzoni',
 'Shih-Huang Lee',
 'Giulia Antonelli',
 'Maithili S. Umate',
 'Roopa Gurm',
 'Megan Oka',
 'Jessica Greenawalt',
 'Sanjay Singh',
 'Ryuhei So',
 'Ritesh Kumar',
 'Rui Hu',
 'Mengya Yang',
 'Akira Nakayama',
 'Joel R. Anderson',
 'Maria Kanwal',
 'Zeyu Xiao',
 'Ishara Sahama',
 'Sławomir Jandziś',
 'Omar Ruvalcaba',
 'Bas van Leeuwen',
 'Jennifer Gatewood Owens',
 'Corali Bel',
 'Katerina P. Günter',
 'Lalini Ramanathan',
 'Ansh Mittal',
 'Ya-song Wu',
 'Cristiano Ciappei',
 'Christa E. Kiersch',
 'Huda Haider Al Sayed Ahmed',
 'Yoshito  Ikada ',
 'Nik Thompson',
 'Kara Kedrick',
 'Norah Alshareef',
 'Hyunsik Kang',
 'Kagari Shibazaki',
 'Ursula Kessels'

In [408]:
test_batch_info = df_authors_full[['Author', 'FirstName', 'LastName', 'Title', 'URL']].loc[df_authors_full['Author'].isin(test_batch)]
test_batch_info.to_csv('data/test_batch_full_info.csv')

### Group by title to ind out, if all authors have names in the same format

In [75]:
author_gr = df_authors_re.groupby('Title')['Author'].apply(lambda x: "{%s}" % ', '.join(x))
type(author_gr)

pandas.core.series.Series

In [None]:
#author_gr.str.contains('[A-Za-z]{1}\.\s[A-Za-z]{2,50}')

To update manually:

In [117]:
df_authors_short[df_authors_short.Title.isin(df_authors_full.Title.unique())].to_csv('data/short_names_to_fill.csv')

In [410]:
df_authors_re[df_authors_re.Title == 'Reducing unconscious gender bias through workshop with co-speculative design']

Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL
27186,Reducing unconscious gender bias through works...,2019.0,Hazuki Moyoshi,,0.0,https://www.semanticscholar.org/paper/454ff650...
27187,Reducing unconscious gender bias through works...,2019.0,Akira Kato,,0.0,https://www.semanticscholar.org/paper/454ff650...


In [None]:
7926 'Gender bias and resistance in Edna O’Brien’s “country girls' trilogy”' 'Tsung-Chi Chang'
'Gender Bias Against Foreign Wives : Analysis of International Divorce Cases in Japan' 'Naoko Kato'
'Cultural Linguistic Study on the Chinese Gender-biased Terms against Women' 'Kim Hyun Tae' 'Korean' 'Male'
'Reducing unconscious gender bias through workshop with co-speculative design' 'Hazuki Moyoshi'
'Reducing unconscious gender bias through workshop with co-speculative design' 'Akira Kato'

'Odbal' 'female'

From https://aclanthology.org/2022.ccl-1.46.pdf:
36393-36398


# Determine gender by name
## Genderize.io

In [41]:
import requests, json

def getGenders(names):
	url = ""
	cnt = 0
	if not isinstance(names,list):
		names = [names,]
	
	for name in names:
		if url == "":
			url = "name[0]=" + name
		else:
			cnt += 1
			url = url + "&name[" + str(cnt) + "]=" + name

	req = requests.get("https://api.genderize.io?" + url)
	results = json.loads(req.text)
	
	retrn = []
	for result in results:
		if result["gender"] is not None:
			retrn.append([result['name'], result["gender"], result["probability"], result["count"]])
		else:
			retrn.append([u'None',u'0.0',0.0])
	return retrn

gender_names = getGenders([name for name in df_authors_full.Author.head()])
gender_names

[('None', '0.0', 0.0),
 ('None', '0.0', 0.0),
 ('None', '0.0', 0.0),
 ('None', '0.0', 0.0),
 ('None', '0.0', 0.0)]

In [43]:
req = requests.get("https://api.genderize.io?name=pedro")
result = json.loads(req.text)
result

{'count': 618003, 'gender': 'male', 'name': 'pedro', 'probability': 1.0}

In [63]:
genders = []
for name in [name for name in df_authors_full.FirstName.head()]:
    req = requests.get("https://api.genderize.io?name=" + name) #https://api.genderize.io?name=peter
    result = json.loads(req.text)
    genders.append(result)

print(genders)

[{'count': 84, 'gender': 'male', 'name': 'Seyyed', 'probability': 0.94}, {'count': 618003, 'gender': 'male', 'name': 'Pedro', 'probability': 1.0}, {'count': 149760, 'gender': 'female', 'name': 'Helena', 'probability': 1.0}, {'count': 14226, 'gender': 'male', 'name': 'Theo', 'probability': 0.97}, {'count': 35293, 'gender': 'female', 'name': 'Shruti', 'probability': 1.0}]


## Gender API

In [68]:
url_genderapi = 'https://gender-api.com/get?name='
key_genderapi = '&key=wTotoGrvGYYVau3ktAD5LBaLDgRWTc8c5728'

genderapi = []

for name in [name for name in df_authors_full.FirstName.head()]:
    req = requests.get(url_genderapi+name+key_genderapi)
    result = json.loads(req.text)
    genderapi.append(result)

print(genderapi)

[]


In [69]:
result

{'name': 'shruti',
 'name_sanitized': 'Shruti',
 'country': '',
 'gender': 'female',
 'samples': 3562,
 'accuracy': 99,
 'duration': '14ms',
 'credits_used': 1}

# Is it possible to extract more names from initials?

In [491]:
df_authors_short[['FirstName', 'LastName', 'MiddleName']] = df_authors_short.apply(lambda x: name_separator(x.Author), axis=1, result_type="expand")

In [432]:
df_authors_short[df_authors_short['LastName'].isna()]

Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL,FirstName,LastName,MiddleName


In [464]:
def update_names_from_RG(paper_title, surname,webdriver='dr', name=None):
    time.sleep(random.randint(1,10))
    query = paper_title.replace(' ', '+') + '+' + surname
    url = f"https://www.researchgate.net/search.Search.html?query={query}&type=publication"
    #print(query, url)
    
    dr.get(url)
    soup = BeautifulSoup(dr.page_source,"lxml")

    results = soup.find_all('div', class_="nova-legacy-v-publication-item__stack nova-legacy-v-publication-item__stack--gutter-m")
        
    for result in results:
        result_title = result.find("a", class_="nova-legacy-e-link nova-legacy-e-link--color-inherit nova-legacy-e-link--theme-bare")
        result_title = result_title.text
    
        if result_title.lower() == paper_title.lower():
            #print(result_title)
            results_name = result.find_all('a', class_='nova-legacy-v-person-inline-item')
            for name in results_name:
                full_name = name.text.split(' ')
                #print(surname, full_name[-1])
                if surname.lower() == full_name[-1].lower():
                    if not full_name[0].replace('.', '').isupper():
                        if (full_name[0] not in ['Mr.', 'Dr.', 'Miss., Md.']):
                            first_name = full_name[0]
                    return first_name

In [527]:
dr = webdriver.Chrome()

In [None]:
df_authors_short['FirstName_full'].iloc[:3] = df_authors_short.apply(lambda x: update_names_from_RG(x.Title, x.LastName, dr), axis=1)

In [None]:
for i in range(11500,23700,50):
    df_authors_short_batch = df_authors_short.iloc[i:i+50]
    df_authors_short_batch['FirstName'] = df_authors_short_batch.apply(lambda x: update_names_from_RG(x.Title, x.LastName, dr), axis=1)
    df_authors_full = df_authors_full.append(df_authors_short_batch, ignore_index=True)
    print(f'{i+50} done')
    if (i + 50) % 500 == 0:
        #dr = webdriver.Chrome()
        df_authors_full[~df_authors_full.FirstName.isnull()].to_csv('data/authors_full_names_extended.csv')

In [529]:
df_authors_full[~df_authors_full.FirstName.isnull()].to_csv('data/authors_full_names_extended.csv')

In [538]:
df_authors_full[df_authors_full.FirstName.isnull()].to_csv('data/authors_short_names_toFill.csv')

Results of update and the rate of success (number of update names in proportion to all added names)

In [537]:
print('Added new full names: ',len(df_authors_full[~df_authors_full.FirstName.isnull()]) - 15500, '\n',
    'Could not update names: ', len(df_authors_full[df_authors_full.FirstName.isnull()]), '\n',
    'success rate = ', ((len(df_authors_full[~df_authors_full.FirstName.isnull()]))-15500)/23700, sep='')

Added new full names: 10082
Could not update names: 13784
success rate = 0.42540084388185656


Try some more filters

In [542]:
df_authors_extended = df_authors_full[~df_authors_full.FirstName.isnull()].copy()

There are some erroneous firstNames: 
- one letter initials with a dot (eg. 'M.'),
- initials with two letters and dash or dot in between (eg. 'M. C.' or 'M.-C.')
- a title, that was collected as Name (eg. 'Dr.', 'Mr.)

They need to be cleared, but without erasing some names, that have dots at the end (eg. 'Jennifer.')

In [549]:
df_authors_extended[(df_authors_extended.FirstName.str.contains('\.$') & df_authors_extended.FirstName.str.replace('.', '').str.isupper()) | \
    df_authors_extended.FirstName.isin(['Mr.', 'Dr.', 'Miss.', 'Md.'])]

  df_authors_extended[(df_authors_extended.FirstName.str.contains('\.$') & df_authors_extended.FirstName.str.replace('.', '').str.isupper()) | \


Unnamed: 0,Title,Year,Author,PublicationType,CitationCount,URL,FirstName,LastName,MiddleName
307,Gender differences in treatment outcomes of tu...,2012.0,M-C. Yu,"[Study, JournalArticle]",69.0,https://www.semanticscholar.org/paper/f0f1d8f9...,M-C.,Yu,
1271,Investment Behaviour of Indian Investors: Gend...,2013.0,Dr. Seema Singh,[Review],6.0,https://www.semanticscholar.org/paper/80dcf5ac...,Dr.,Singh,Seema
1612,Gender Differences in Disability-Free Life Exp...,2013.0,Md. Ismail Tareque,"[JournalArticle, Review]",36.0,https://www.semanticscholar.org/paper/5ef97ee9...,Md.,Tareque,Ismail
2303,Gender Bias Without Borders An Investigation o...,2014.0,Dr. Katherine Pieper,,41.0,https://www.semanticscholar.org/paper/f30f1f5c...,Dr.,Pieper,Katherine
2327,Gender Bias in Performance Evaluations:,2014.0,Dr. Jan Sauermann,,0.0,https://www.semanticscholar.org/paper/04dda39b...,Dr.,Sauermann,Jan
...,...,...,...,...,...,...,...,...,...
39000,Gender-specifi c differences in ascending aort...,2022.0,S. Sazonova,,0.0,https://www.semanticscholar.org/paper/e38f64f8...,S.,Sazonova,
39082,"Health-oriented leadership, gender-differences...",2023.0,H. Drexler,"[JournalArticle, Review]",0.0,https://www.semanticscholar.org/paper/b221f70c...,H.,Drexler,
39316,Gender differences in the association between ...,2023.0,T. Muhammad,,0.0,https://www.semanticscholar.org/paper/e94cc702...,T.,Muhammad,
39322,Exploring the gender gap in mobile money aware...,2023.0,C. Leigh Anderson,,0.0,https://www.semanticscholar.org/paper/cd259f56...,C.,Anderson,Leigh


In [None]:
authors_short = df_authors_full[df_authors_full.FirstName.isnull()].copy()
authors_short = authors_short.append(df_authors_extended[(df_authors_extended.FirstName.str.contains('\.$') & df_authors_extended.FirstName.str.replace('.', '').str.isupper()) | \
    df_authors_extended.FirstName.isin(['Mr.', 'Dr.', 'Miss.', 'Md.'])])
authors_short

In [554]:
names_dot = authors_short[authors_short.Title.str.contains('\.$')].copy()

In [555]:
for i in range(0,3838,50):
    df_authors_short_batch = names_dot.iloc[i:i+50]
    df_authors_short_batch['FirstName'] = df_authors_short_batch.apply(lambda x: update_names_from_RG(x.Title, x.LastName, dr), axis=1)
    df_authors_full = df_authors_full.append(df_authors_short_batch[df_authors_short_batch.notnull()], ignore_index=True)
    print(f'{i+50} done')
    if (i + 50) % 500 == 0:
        #dr = webdriver.Chrome()
        df_authors_full[~df_authors_full.FirstName.isnull()].to_csv('data/authors_full_names_extended2.csv')

#authors_short[authors_short.Title.str.contains('\.$')]

## Manual update

In [None]:
'R. Dietz': 'Richard Dietz',
'R. Pearson':  'Richard Pearson',
'M. Semak': 'Matthew R Semak',
'C. Willis': 'Courtney W Willis',
'D. Syed': 'Deeba N. Syed',
'H. Mukhtar': 'Hasan Mukhtar'

How is the relative distribution of gender among the researchers, who study gender bias?

How is the temporal dynamics of the female representation as well as the decade forecast?

Check search pattern "women + bias" (as in "bias against women") in df_authors, that are not in df_authors_re

try google scholar with SemanticScholar for author names