In [1]:
# Install the following packages in the environment:
# python3 -m pip install pandas
# python3 -m pip install seaborn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import json

import os

from read_jsondata import read_jsons

import time

In [2]:
# Define local paths

# Project root = current working directory of the kernel.
# os.getcwd() is used instead of a shell call so the cell does not depend on
# IPython shell magics or on a POSIX `pwd` command being available.
root = os.getcwd()

# Directory with the raw data. The empty last component makes os.path.join
# append a trailing path separator, i.e. root + "/author_allgenders/" on POSIX.
RAW_DIR = os.path.join(root, "author_allgenders", "")

# Only warn (do not abort) when the raw-data directory is missing.
if not os.path.exists(RAW_DIR):
    print("The directory {} does not exist.\nThere is no raw data for statistical analysis.".format(RAW_DIR))

#### READ DATAFRAME

In [19]:
# Build the working dataframe from the raw data in RAW_DIR, requesting only the
# fields used in this notebook.
# NOTE(review): read_jsons is a project helper; presumably it returns one row per
# paper -- confirm against read_jsondata.py.
df = read_jsons(
    RAW_DIR,
    columns=['journal', 'all_names', 'all_genders', 'all_percent', 'year'],
)
df

Unnamed: 0,all_names,year,journal,all_genders,all_percent
0,"[Roberto Ortega, Luis Quintanar]",2010,GRL,"[male, male]","[0.99, 0.99]"
1,"[Takashi Iidaka, Toshihiro Igarashi, Akinori H...",2017,Tectp,"[male, male, male, female, male, male]","[0.99, 1.0, 1.0, 0.5, 0.98, 0.9]"
2,[Max A. Meju],2019,GEOPHYSICS,[male],[0.98]
3,"[Magnús T. Gudmundsson, Kristín Jónsdóttir, An...",2016,Science,"[male, female, male, male, male, male, male, f...","[0.99, 0.93, 0.99, 0.99, 1.0, 1.0, 0.75, 0.93,..."
4,"[Martine Simoes, Y.‐G. Chen, Dattatraya P. Shi...",2014,JGRSolidEarth,"[female, init, male, male]","[0.98, None, 1.0, 0.99]"
...,...,...,...,...,...
19355,"[Javad Behseresht, Steven L. Bryant]",2012,E%26PSL,"[male, male]","[0.97, 0.99]"
19356,"[Junqing Liu, Jiří Zahradník]",2020,GRL,"[male, male]","[0.88, 0.99]"
19357,"[Thessa Tormann, Bogdan Enescu, Jochen Woessne...",2015,NatureGeoscience,"[female, male, male, male]","[0.98, 0.98, 1.0, 0.98]"
19358,"[Shengji Wei, Peter M. Shearer]",2017,JGRSolidEarth,"[female, male]","[0.5, 0.99]"


##### Clean data

In [20]:
# Clean some journal names.
#
# The order of the replacements matters: every mask is computed on the column as
# already modified by the previous lines, so a name containing both "Bulletin"
# and "Seismological" becomes 'BSSA' first and is then no longer matched by the
# "Seismological" rule.
#
# na=False    -> rows with a missing journal yield False instead of NaN; a mask
#                containing NaN cannot be used for boolean indexing with .loc.
# regex=False -> the patterns are plain substrings, no regex needed.

df.loc[df.journal == 'E%26PSL', 'journal'] = 'EPSL'  # '%26' is a URL-encoded '&'

df.loc[df.journal.str.contains("Bulletin", regex=False, na=False), 'journal'] = 'BSSA'

df.loc[df.journal.str.contains("Seismological", regex=False, na=False), 'journal'] = 'SRL'

# Include impact factor:

dict_IF = {'Nature': 46.486, 'Science': 41.845, 'NatureGeoscience': 16.103, 'EPSL': 4.823, 'GRL': 4.952,
           'JGRSolidEarth': 4.191, 'G3': 3.721, 'SRL': 3.131, 'Tectp': 3.048, 'SolidEarth': 2.921,
           'GEOPHYSICS': 3.093, 'GJI': 2.834, 'BSSA': 2.274, 'PEPI': 2.413}

# Journals that are not a key of dict_IF get NaN in the new column.
df['IF'] = df['journal'].map(dict_IF)
df

Unnamed: 0,all_names,year,journal,all_genders,all_percent,IF
0,"[Roberto Ortega, Luis Quintanar]",2010,GRL,"[male, male]","[0.99, 0.99]",4.952
1,"[Takashi Iidaka, Toshihiro Igarashi, Akinori H...",2017,Tectp,"[male, male, male, female, male, male]","[0.99, 1.0, 1.0, 0.5, 0.98, 0.9]",3.048
2,[Max A. Meju],2019,GEOPHYSICS,[male],[0.98],3.093
3,"[Magnús T. Gudmundsson, Kristín Jónsdóttir, An...",2016,Science,"[male, female, male, male, male, male, male, f...","[0.99, 0.93, 0.99, 0.99, 1.0, 1.0, 0.75, 0.93,...",41.845
4,"[Martine Simoes, Y.‐G. Chen, Dattatraya P. Shi...",2014,JGRSolidEarth,"[female, init, male, male]","[0.98, None, 1.0, 0.99]",4.191
...,...,...,...,...,...,...
19355,"[Javad Behseresht, Steven L. Bryant]",2012,EPSL,"[male, male]","[0.97, 0.99]",4.823
19356,"[Junqing Liu, Jiří Zahradník]",2020,GRL,"[male, male]","[0.88, 0.99]",4.952
19357,"[Thessa Tormann, Bogdan Enescu, Jochen Woessne...",2015,NatureGeoscience,"[female, male, male, male]","[0.98, 0.98, 1.0, 0.98]",16.103
19358,"[Shengji Wei, Peter M. Shearer]",2017,JGRSolidEarth,"[female, male]","[0.5, 0.99]",4.191


In [21]:
## Remove rows for papers from 2021

# Compare on the string form of the year so the filter works whether the years
# were loaded as strings ('2021') or as integers (2021). A plain
# isin(['2021']) test matches nothing on an integer column and would silently
# keep every row.
# .copy() gives an independent frame, so the column assignments further down do
# not operate on a slice of the unfiltered data.
df = df[df['year'].astype(str) != '2021'].copy()
df

Unnamed: 0,all_names,year,journal,all_genders,all_percent,IF
0,"[Roberto Ortega, Luis Quintanar]",2010,GRL,"[male, male]","[0.99, 0.99]",4.952
1,"[Takashi Iidaka, Toshihiro Igarashi, Akinori H...",2017,Tectp,"[male, male, male, female, male, male]","[0.99, 1.0, 1.0, 0.5, 0.98, 0.9]",3.048
2,[Max A. Meju],2019,GEOPHYSICS,[male],[0.98],3.093
3,"[Magnús T. Gudmundsson, Kristín Jónsdóttir, An...",2016,Science,"[male, female, male, male, male, male, male, f...","[0.99, 0.93, 0.99, 0.99, 1.0, 1.0, 0.75, 0.93,...",41.845
4,"[Martine Simoes, Y.‐G. Chen, Dattatraya P. Shi...",2014,JGRSolidEarth,"[female, init, male, male]","[0.98, None, 1.0, 0.99]",4.191
...,...,...,...,...,...,...
19355,"[Javad Behseresht, Steven L. Bryant]",2012,EPSL,"[male, male]","[0.97, 0.99]",4.823
19356,"[Junqing Liu, Jiří Zahradník]",2020,GRL,"[male, male]","[0.88, 0.99]",4.952
19357,"[Thessa Tormann, Bogdan Enescu, Jochen Woessne...",2015,NatureGeoscience,"[female, male, male, male]","[0.98, 0.98, 1.0, 0.98]",16.103
19358,"[Shengji Wei, Peter M. Shearer]",2017,JGRSolidEarth,"[female, male]","[0.5, 0.99]",4.191


##### Create new columns for statistics

In [22]:
# First and last author's name, gender and gender probability.
#
# 'all_names', 'all_genders' and 'all_percent' each hold one list per paper.
# Position 0 of these lists is the first author, position -1 the last author;
# for a single-author paper both positions refer to the same person.
for prefix, position in (('First_Author', 0), ('Last_Author', -1)):
    # `i=position` binds the current position to the lambda at definition time.
    df[prefix] = df['all_names'].apply(lambda names, i=position: names[i])
    df[prefix + '_gend'] = df['all_genders'].apply(lambda genders, i=position: genders[i])
    df[prefix + '_gendprob'] = df['all_percent'].apply(lambda probs, i=position: probs[i])

df

Unnamed: 0,all_names,year,journal,all_genders,all_percent,IF,First_Author,First_Author_gend,First_Author_gendprob,Last_Author,Last_Author_gend,Last_Author_gendprob
0,"[Roberto Ortega, Luis Quintanar]",2010,GRL,"[male, male]","[0.99, 0.99]",4.952,Roberto Ortega,male,0.99,Luis Quintanar,male,0.99
1,"[Takashi Iidaka, Toshihiro Igarashi, Akinori H...",2017,Tectp,"[male, male, male, female, male, male]","[0.99, 1.0, 1.0, 0.5, 0.98, 0.9]",3.048,Takashi Iidaka,male,0.99,Research Group Joint Seismic Observations at t...,male,0.90
2,[Max A. Meju],2019,GEOPHYSICS,[male],[0.98],3.093,Max A. Meju,male,0.98,Max A. Meju,male,0.98
3,"[Magnús T. Gudmundsson, Kristín Jónsdóttir, An...",2016,Science,"[male, female, male, male, male, male, male, f...","[0.99, 0.93, 0.99, 0.99, 1.0, 1.0, 0.75, 0.93,...",41.845,Magnús T. Gudmundsson,male,0.99,Alessandro Aiuppa,male,0.99
4,"[Martine Simoes, Y.‐G. Chen, Dattatraya P. Shi...",2014,JGRSolidEarth,"[female, init, male, male]","[0.98, None, 1.0, 0.99]",4.191,Martine Simoes,female,0.98,Ashok K. Singhvi,male,0.99
...,...,...,...,...,...,...,...,...,...,...,...,...
19355,"[Javad Behseresht, Steven L. Bryant]",2012,EPSL,"[male, male]","[0.97, 0.99]",4.823,Javad Behseresht,male,0.97,Steven L. Bryant,male,0.99
19356,"[Junqing Liu, Jiří Zahradník]",2020,GRL,"[male, male]","[0.88, 0.99]",4.952,Junqing Liu,male,0.88,Jiří Zahradník,male,0.99
19357,"[Thessa Tormann, Bogdan Enescu, Jochen Woessne...",2015,NatureGeoscience,"[female, male, male, male]","[0.98, 0.98, 1.0, 0.98]",16.103,Thessa Tormann,female,0.98,Stefan Wiemer,male,0.98
19358,"[Shengji Wei, Peter M. Shearer]",2017,JGRSolidEarth,"[female, male]","[0.5, 0.99]",4.191,Shengji Wei,female,0.50,Peter M. Shearer,male,0.99


##### Clean names just in case

In [37]:
def Clean_names(x):
    """Reduce a full name to its first and last whitespace-separated tokens.

    Middle names and initials are dropped, e.g. "Peter M. Shearer" becomes
    "Peter Shearer". Leading, trailing and repeated whitespace is ignored.

    Parameters
    ----------
    x : str
        Full author name.

    Returns
    -------
    str
        "<first token> <last token>"; the single token itself when the name
        consists of one token only (it is not duplicated); an empty string
        when the name is empty or contains only whitespace.
    """
    tokens = x.split()  # split once; str.split() without arguments discards empty tokens

    if not tokens:
        # Nothing to clean; avoids an IndexError on tokens[0].
        return ''

    if len(tokens) == 1:
        # A one-word name would otherwise come out as "Name Name".
        return tokens[0]

    return tokens[0] + ' ' + tokens[-1]


# Store the cleaned name (see Clean_names) of the first and of the last author
# in '<role>_clean' columns.
for role in ('First_Author', 'Last_Author'):
    df[role + '_clean'] = df[role].apply(Clean_names)

# The raw name columns are replaced by their cleaned versions, so remove them.
# NOTE: this modifies df in place; re-running this cell without re-running the
# cell that creates 'First_Author' / 'Last_Author' raises a KeyError.
df.drop(columns=['First_Author', 'Last_Author'], inplace=True)
df

In [44]:
### Count number of papers for each author and create dictionary

# Each dictionary maps a cleaned author name to the number of rows of df in
# which that name appears as first (respectively last) author.
dict_first = df['First_Author_clean'].value_counts().to_dict()
dict_last = df['Last_Author_clean'].value_counts().to_dict()

In [133]:
### Create new dataframes, one for first authors and another one for last authors

# Each frame holds name, gender and gender probability of the author, plus a
# 'Num_papers' column looked up from the per-author paper counts.
# .assign() returns a new frame, so df itself is left untouched.
df_first = df[['First_Author_clean', 'First_Author_gend', 'First_Author_gendprob']].assign(
    Num_papers=lambda frame: frame['First_Author_clean'].map(dict_first)
)
df_last = df[['Last_Author_clean', 'Last_Author_gend', 'Last_Author_gendprob']].assign(
    Num_papers=lambda frame: frame['Last_Author_clean'].map(dict_last)
)

In [134]:
### Drop duplicated name and sort in descending order of num_papers

# One row per author: drop_duplicates keeps the first row found for each name,
# so gender and probability come from that first occurrence.
# The index is rebuilt so that row 0 is the author with the most papers.
df_first2 = (
    df_first
    .drop_duplicates(subset='First_Author_clean')
    .sort_values(by='Num_papers', ascending=False)
    .reset_index(drop=True)
)

df_last2 = (
    df_last
    .drop_duplicates(subset='Last_Author_clean')
    .sort_values(by='Num_papers', ascending=False)
    .reset_index(drop=True)
)


In [144]:
## It is easier to have all probabilities with respect to female

# prob(female) = 1 - prob(male)
#
# '<role>_gendprob' is the probability of the gender given in '<role>_gend'.
# Rows labelled 'male' are converted to 1 - probability; every other row keeps
# its value unchanged.

# Prob first author female:

first_is_male = df_first2['First_Author_gend'] == 'male'

df_first2['First_Author_probF'] = df_first2['First_Author_gendprob']
df_first2.loc[first_is_male, 'First_Author_probF'] = \
    1 - df_first2.loc[first_is_male, 'First_Author_gendprob']

# Prob last author female:

last_is_male = df_last2['Last_Author_gend'] == 'male'

df_last2['Last_Author_probF'] = df_last2['Last_Author_gendprob']
df_last2.loc[last_is_male, 'Last_Author_probF'] = \
    1 - df_last2.loc[last_is_male, 'Last_Author_gendprob']


In [149]:
# Average female probability among the most prolific first authors.
# df_first2 is sorted by Num_papers in descending order, so its first rows are
# the authors with the most papers.
#
# .iloc selects by position, so the slice does not depend on the index labels.
# .mean() divides by the number of authors that actually have a probability:
# authors without one (NaN) are left out instead of being counted as 0, and a
# table shorter than the requested size is still averaged correctly.
probF_first = df_first2['First_Author_probF']

print('Probability female on top 30 first authors', probF_first.iloc[:30].mean())
print('Probability female on top 10 first authors', probF_first.iloc[:10].mean())


df_first2.iloc[0:30]

Probability female on top 30 first authors 0.16000012073968015
Probability female on top 10 first authors 0.255


Unnamed: 0,First_Author_clean,First_Author_gend,First_Author_gendprob,Num_papers,First_Author_probF
0,Norman Sleep,male,0.99,24,0.01
1,Gail Atkinson,female,0.94,24,0.94
2,Fred Pollitz,male,0.96,23,0.04
3,Diego Melgar,male,0.99,23,0.01
4,Thorne Lay,male,0.97,22,0.03
5,Ivan Koulakov,male,0.99,21,0.01
6,Susan Hough,female,0.98,20,0.98
7,Xin Liu,female,0.52,19,0.52
8,Yangkang Chen,male,1.0,18,0.0
9,Andreas Fichtner,male,0.99,18,0.01


In [150]:
# Average female probability among the most prolific last authors.
# df_last2 is sorted by Num_papers in descending order, so its first rows are
# the authors with the most papers.
#
# .iloc selects by position, so the slice does not depend on the index labels.
# .mean() divides by the number of authors that actually have a probability:
# authors without one (NaN) are left out instead of being counted as 0, and a
# table shorter than the requested size is still averaged correctly.
probF_last = df_last2['Last_Author_probF']

print('Probability female on top 30 last authors', probF_last.iloc[:30].mean())
print('Probability female on top 10 last authors', probF_last.iloc[:10].mean())


df_last2.iloc[0:30,:]

Probability female on top 30 last authors 0.09466666666666664
Probability female on top 10 last authors 0.246


Unnamed: 0,Last_Author_clean,Last_Author_gend,Last_Author_gendprob,Num_papers,Last_Author_probF
0,Peter Shearer,male,0.99,47,0.01
1,Gregory Beroza,male,0.99,41,0.01
2,Stefan Wiemer,male,0.98,41,0.02
3,Xiaofei Chen,male,0.57,39,0.43
4,Gail Atkinson,female,0.94,39,0.94
5,Barbara Romanowicz,female,0.98,38,0.98
6,Roland Bürgmann,male,0.99,38,0.01
7,Yehuda Ben‐Zion,male,1.0,37,0.0
8,Jean Virieux,male,0.95,34,0.05
9,Hiroo Kanamori,male,0.99,34,0.01
