In [None]:
# install the following packages in the environment:
# python3 -m pip install pandas
# python3 -m pip install seaborn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import json

import os

from read_jsondata import read_jsons

import time

## Hypotheses to be tested

#### As reference values to compare to, we will use demographics from ECS from AGU and EGU. This will give an idea of how many of the active scientists at these professional levels are represented in the peer-reviewed articles (which are a crucial factor for career advancement and perhaps the daily goal of most academics)

FIRST GLANCES AT DATA

- % of female first authors (hists?)
- % publications with all male vs. % publications with all female authors (hists?)

CO-AUTHORSHIP ANALYSES

- When 1st author is female: % of male vs. female co-authors (bars..?)
- When 1st author is female: likelihood of last author (possibly PI) to be female vs. male 
- When 1st author is male: % of female coauthors and % of male coauthors
- When the last author is female (possible PI), is there a higher % of female co-authors vs. male ones?

JOURNAL IF ANALYSES:

- Correlation between IF and female first authors: does higher IF mean fewer female first authors?




In [None]:
# Define local paths

# Resolve the working directory in pure Python. The previous `! pwd` shell
# escape is IPython-only syntax and needs a `pwd` executable on the system.
root = os.getcwd()

# Folder holding the raw data; the value (including the trailing slash) is
# built exactly as before so that read_jsons receives the same string.
RAW_DIR = root + "/author_allgenders_egu/"

# Only warn when the folder is missing; nothing is raised here.
if not os.path.exists(RAW_DIR):
    print("The directory {} does not exist.\nThere is no raw data for statistical analysis.".format(RAW_DIR))

In [None]:
# Load the raw records from RAW_DIR with the project helper read_jsons.
# NOTE(review): the cells below read the columns 'all_genders', 'all_percent'
# and 'year' from the result -- presumably one row per article; confirm
# against read_jsondata.
df = read_jsons(RAW_DIR)
df

### Create new columns in the dataframe extracting useful information from list of coauthors

In [None]:
# Author counts per article, taken from the list of gender labels.
df['Number_authors'] = df['all_genders'].apply(len)
# How many of those labels are "init" (no gender assigned to that author).
df['Number_init'] = df['all_genders'].apply(
    lambda labels: sum(1 for label in labels if label == "init"))

# First author: label and score are the first entries of the two lists.
df['First_Author_gend'] = df['all_genders'].apply(lambda labels: labels[0])
df['First_Author_perc'] = df['all_percent'].apply(lambda scores: scores[0])

# Last author: label and score are the last entries of the two lists.
df['Last_Author_gend'] = df['all_genders'].apply(lambda labels: labels[-1])
df['Last_Author_perc'] = df['all_percent'].apply(lambda scores: scores[-1])

df

In [None]:
# Keep only the articles in which no author is labelled "init".
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not operate on a slice of the unfiltered one.
df = df[df.Number_init==0].copy()
df

#### It is easier if all the probabilities are with respect to the same gender (female)

In [None]:
# prob(female) = 1 - prob(male)
# Each *_probF column starts as a copy of the score of the assigned label;
# rows whose label is 'male' are then flipped to 1 - score.

# Prob last author female:
last_is_male = df['Last_Author_gend'] == 'male'
df['Last_Author_probF'] = df['Last_Author_perc']
df.loc[last_is_male, 'Last_Author_probF'] = 1 - df.loc[last_is_male, 'Last_Author_probF']

# Prob first author female:
first_is_male = df['First_Author_gend'] == 'male'
df['First_Author_probF'] = df['First_Author_perc']
df.loc[first_is_male, 'First_Author_probF'] = 1 - df.loc[first_is_male, 'First_Author_probF']

df

## Now we can compute some interesting probabilities:

### Useful formulas:

Suppose $x_i$ refers to the article $i$ and $N$ is the total number of articles. Then, the probability of an article having a female author is:

$$p(\text{female}) = \sum_{i=1}^{N} p(\text{female}|x_i) p(x_i). $$

If we have all the probabilities with respect to the female gender, then the probability of having a male author will be:

$$p(\text{male}) = \sum_{i=1}^{N} (1 - p(\text{female}|x_i)) p(x_i). $$

$p(x_i)$ is the probability of the article $x_i$. All articles have the same probability, therefore $p(x_i) = \frac{1}{N}$. This means that the formulas above are the same as taking the average of $p(\text{female}|x_i)$ or $(1 - p(\text{female}|x_i))$, respectively.


### Let's compute some easy statistics to start

In [None]:
# Average the per-article probabilities over all articles (sum / number of rows).
n_articles = df.shape[0]
first_prob_f = df['First_Author_probF']
last_prob_f = df['Last_Author_probF']

print('Probability of having a female first author:', first_prob_f.sum()/n_articles)
print('Probability of having a male first author:', (1 - first_prob_f).sum()/n_articles)


print('Probability of having a female last author:', last_prob_f.sum()/n_articles)
print('Probability of having a male last author:', (1 - last_prob_f).sum()/n_articles)



#### Probabilities of having at least one male/female author in an article

Having at least one female author refers to any coauthor combination excluding the case in which all authors are male:

$$p(\text{at least 1 female}|x_i) = 1 - p(\text{all male}|x_i)$$

Computing probability for all male coauthors is easier. In the following, we drop the dependency on $x_i$ for clarity.

$$p(\text{all male}) = p(\text{male}_1)p(\text{male}_2|\text{male}_1)p(\text{male}_3|\text{male}_1,\text{male}_2)... = \prod_{i=1}^{n} p(\text{male}_i)$$

where n is the number of authors and the last step assumes that the gender probability of each authorship is independent of the gender of other coauthors (just to simplify the problem). 

In [None]:
#Define functions to multiply probabilities in each row

#prob at least a female author

def Prob_atleast_Fauthor(x,y):
    """Probability that at least one author of an article is female.

    x -- gender label of each author ('male', 'female'; other labels are skipped)
    y -- score of the assigned label for each author, in the same order
         (anything float() accepts)

    The product of the per-author male probabilities gives p(all male),
    treating authors as independent; the result is 1 - p(all male).
    """
    p_all_male = 1
    for pos, gender in enumerate(x):
        if gender == 'female':
            # score belongs to 'female', so the male probability is its complement
            p_all_male *= 1 - float(y[pos])
        elif gender == 'male':
            p_all_male *= float(y[pos])
    return 1 - p_all_male

#prob at least a male author

def Prob_atleast_Mauthor(x,y):
    """Probability that at least one author of an article is male.

    x -- gender label of each author ('male', 'female'; other labels are skipped)
    y -- score of the assigned label for each author, in the same order
         (anything float() accepts)

    The product of the per-author female probabilities gives p(all female),
    treating authors as independent; the result is 1 - p(all female).
    """
    p_all_female = 1
    for pos, gender in enumerate(x):
        if gender == 'female':
            p_all_female *= float(y[pos])
        elif gender == 'male':
            # score belongs to 'male', so the female probability is its complement
            p_all_female *= 1 - float(y[pos])
    return 1 - p_all_female


# Evaluate both per-article probabilities row by row and store them as columns.
df['Prob_atleast_Fauthor'] = df.apply(
    lambda article: Prob_atleast_Fauthor(article.all_genders, article.all_percent),
    axis=1)
df['Prob_atleast_Mauthor'] = df.apply(
    lambda article: Prob_atleast_Mauthor(article.all_genders, article.all_percent),
    axis=1)

df

In [None]:
# Article-level averages of the two "at least one" probabilities.
n_rows = df.shape[0]
mean_atleast_f = df['Prob_atleast_Fauthor'].sum()/n_rows
mean_atleast_m = df['Prob_atleast_Mauthor'].sum()/n_rows

print('Probability of having at least one female author in an article',
      mean_atleast_f)

print('Probability of having at least one male author in an article',
      mean_atleast_m)

print('or the opposite...')

# "all female" is the complement of "at least one male", and vice versa.
print('Probability of having all female authors in an article',
      1 - mean_atleast_m)

print('Probability of having all male authors in an article',
      1 - mean_atleast_f)

#### in between: Overall frequency of female / male authors

In [None]:
def Prob_author(x,y, kind="female", allkinds="malefemale"):
    """Sum, over the authors of one article, of the probability of being `kind`.

    x        -- gender label of each author
    y        -- score of the assigned label for each author, in the same order
                (anything float() accepts)
    kind     -- the gender whose probability is accumulated
    allkinds -- labels that count as gendered authors; either a collection of
                labels or the legacy concatenated string "malefemale"

    Authors labelled `kind` contribute their score, other gendered authors
    contribute 1 - score, and every other label is skipped. Returns 0 when no
    author is counted.
    """
    if isinstance(allkinds, str):
        # Recover the individual labels from the concatenated string so that
        # membership is an exact match. A plain `elem in "malefemale"` is a
        # substring test: "" or "fe" would pass, and None would raise TypeError.
        # "female" is looked for first because it contains "male".
        remainder = allkinds
        labels = []
        for label in ("female", "male"):
            if label in remainder:
                labels.append(label)
                remainder = remainder.replace(label, "")
    else:
        labels = list(allkinds)

    total = 0  # not named `sum`, to leave the builtin untouched
    for i, elem in enumerate(x):
        if elem not in labels:
            continue
        if elem == kind:
            total += float(y[i])
        else:
            total += 1 - float(y[i])
    return total

def count_kind(x, kind="init"):
    """Return how many entries of x are equal to kind."""
    return sum(1 for label in x if label == kind)
# How many authors in total?
n_authors_all = df.Number_authors.sum()

# Row-wise passes over the frame, each done once and reused below:
# summed probability of female / male authors, and the number of "init" labels.
sum_female = df.apply(lambda row: Prob_author(row.all_genders, row.all_percent, kind="female"), axis=1).sum()
sum_male = df.apply(lambda row: Prob_author(row.all_genders, row.all_percent, kind="male"), axis=1).sum()
n_init = df.apply(lambda row: count_kind(row.all_genders), axis=1).sum()

# Frequencies relative to ALL authors (female + male + init should add to 1).
p_female_all = sum_female / n_authors_all
p_male_all = sum_male / n_authors_all
p_init_all = n_init / n_authors_all
# Check it adds to 1
print("All probabilities sum: ", p_female_all + p_male_all + p_init_all)
print("Overall frequency of female authorship: ", 100.*p_female_all)
print("Overall frequency of male authorship: ", 100.*p_male_all)
print("Overall frequency of unidentified names: ", 100. * p_init_all)
print("=" * 100)

## correct for init values: frequencies relative to the gendered authors only
n_gendered = n_authors_all - n_init
p_female_all = sum_female / n_gendered
p_male_all = sum_male / n_gendered
print("All probabilities sum: ", p_female_all + p_male_all)
print("Overall frequency of female authorship after accounting for init: ", 100.*p_female_all)
print("Overall frequency of male authorship after accounting for init: ", 100.*p_male_all)


# so if we go by these rough probabilities then...
# (authors drawn independently with the overall frequencies above)
print("=" * 100)
print("Probability that a paper with x author(s) has at least 1 female author:")
print("x=1: p=", round(1. - p_male_all, 3), ";   x=2: p=", round(1. - p_male_all ** 2, 3),
      ";   x=3: p=", round(1. - p_male_all ** 3, 3), ";   x=10: p=", round(1. - p_male_all ** 10, 3))
print("=" * 100)
# and the reverse:
print("Probability that a paper with x author(s) has only female authors:")
print("x=1: p=", round(p_female_all, 3), ";   x=2: p=", round(p_female_all ** 2, 3),
      ";   x=3: p=", round(p_female_all ** 3, 3), ";   x=10: p=", round(p_female_all ** 10, 3))
# etc
print("=" * 100)

# Random-draw expectations for 1..20 authors. An explicit integer array keeps
# the element-wise power working whether p_male_all / p_female_all are NumPy
# scalars or plain floats (float ** range is a TypeError).
n_authors = np.arange(1, 21)
p_one_f_given_n = 1. - p_male_all ** n_authors
p_one_m_given_n = 1. - p_female_all ** n_authors

p_all_f_given_n = p_female_all ** n_authors
p_all_m_given_n = p_male_all ** n_authors


# Observed probabilities (in %), averaged separately for every author count.
# Only the 20 smallest distinct author counts present in the data are kept.
n_authors_data = df.Number_authors.unique()
n_authors_data.sort()
n_authors_data = n_authors_data[0: 20]

p_atleast_f_per_n = []
p_atleast_m_per_n = []
p_all_f_per_n = []
p_all_m_per_n = []
nr_papers_n = []
for n_auth in n_authors_data:
    # articles with exactly n_auth authors, selected once per iteration
    papers_n = df[df.Number_authors == n_auth]

    p_atleast_f_per_n.append(100*papers_n.Prob_atleast_Fauthor.mean())
    p_atleast_m_per_n.append(100*papers_n.Prob_atleast_Mauthor.mean())
    # "all female" is the complement of "at least one male", and vice versa
    p_all_f_per_n.append(100*(1. - papers_n.Prob_atleast_Mauthor).mean())
    p_all_m_per_n.append(100*(1. - papers_n.Prob_atleast_Fauthor).mean())

    nr_papers_n.append(len(papers_n))
    

# 2x2 figure: observed values ("Data") against the random-draw expectation.
plt.figure(figsize=(9, 6))

# (subplot position, observed values in %, expected fraction, panel title)
panels = [
    (221, p_atleast_f_per_n, p_one_f_given_n, "Probability at least 1 f"),
    (222, p_atleast_m_per_n, p_one_m_given_n, "Probability at least 1 m"),
    (223, p_all_f_per_n, p_all_f_given_n, "Probability all f"),
    (224, p_all_m_per_n, p_all_m_given_n, "Probability all m"),
]
tick_positions = list(range(0, 11, 2))

for position, observed, expected, title in panels:
    plt.subplot(position)
    plt.scatter(n_authors_data, observed, color="g", alpha=0.7)
    plt.scatter(n_authors, 100*expected, marker="x")
    plt.grid()
    plt.xlim(0, 10)
    plt.ylim(0., 110)
    plt.title(title)
    plt.legend(["Data", "Random draw"])
    if position in (221, 222):
        # top row: keep the tick marks but blank out their labels
        plt.xticks(tick_positions, ["" for _tick in tick_positions])
    else:
        # bottom row carries the shared x-axis label
        plt.xlabel("Nr. of authors")

plt.savefig("prob_atleast_all.png", dpi=150)
print(nr_papers_n)

In [None]:
# Bias = observed probability minus the random-draw expectation (percentage points).
#
# The observed lists hold one value per author count in n_authors_data, which
# contains only the counts that occur in the data. The expectation is therefore
# evaluated at exactly those counts. Subtracting the precomputed n = 1..20
# arrays element by element pairs the wrong values whenever a count is missing,
# and fails on a shape mismatch when fewer than 20 distinct counts exist.
n_obs = np.asarray(n_authors_data)

# (subplot position, observed values in %, expected fraction at n_obs, title)
bias_panels = [
    (221, p_atleast_f_per_n, 1. - p_male_all ** n_obs, "Bias at least 1 f"),
    (222, p_atleast_m_per_n, 1. - p_female_all ** n_obs, "Bias at least 1 m"),
    (223, p_all_f_per_n, p_female_all ** n_obs, "Bias all f"),
    (224, p_all_m_per_n, p_male_all ** n_obs, "Bias all m"),
]
tick_positions = list(range(0, 11, 2))

plt.figure(figsize=(9, 6))
for position, observed, expected, title in bias_panels:
    plt.subplot(position)
    plt.scatter(n_obs, np.asarray(observed) - 100*expected, marker="x")
    plt.grid()
    plt.xlim(0, 10)
    plt.ylim(-7.5, 7.5)
    # tick marks without labels, as in every panel of this figure
    plt.xticks(tick_positions, ["" for _tick in tick_positions])
    plt.title(title)

In [None]:
# Mean number of authors per row of df.
mean_n_authors = df['Number_authors'].sum()/df.shape[0]
print('Average number of authors for abstracts', mean_n_authors)

In [None]:
# Random-draw expectation evaluated at the mean number of authors per article.
avg_authors = df['Number_authors'].sum()/df.shape[0]

print('Expected probability of having at least one female author in an article:',
      1. - p_male_all ** avg_authors)
print('Expected probability of having at least one male author in an article:',
      1. - p_female_all ** avg_authors)

In [None]:
# Random-draw expectation for an article with exactly 4 authors.
# 1 - p_male_all**4 is the complement of "all four male", i.e. at least one
# female author; the label now says so.
print('Expected probability of having at least one female author in an article with 4 authors:',
      1. - p_male_all ** 4)
print('Expected probability of having at least one male author in an article with 4 authors:',
      1. - p_female_all ** 4)



##### important: these results may be sensitive to accounting for non-gendered "init" data points in computing the P_atleast.... We should remove them for a cleaner result. I think we could correct for them by enforcing that p_allmale and p_onefemale must sum to one (currently they do not sum exactly to one)
- With the number of papers in each category, we may consider everything up to about 10 authors a decent sample size? (for n_authors=10, n_papers = 252)
- men are overrepresented in single-author papers
- gender mixing of authors is not random: the probability of mixed authorships in the data does not reflect a random choice based on overall gender frequencies in the data. The data show more likely all-male and all-female papers compared to random choice by overall frequency. This seems to be particularly the case for all-male author groups. Boys club?
- this may put female authors at a disadvantage, because their available pool of co-authors is smaller, while men have an increased co-author pool. Female authors may be at a disadvantage with regard to forging collaborations


#### See the temporal trend of having at least one F or one M in a publication for all journals per year


In [None]:
# Sorted array of the distinct publication years.
years = np.sort(df['year'].unique())
print(years)

# For each year, average the per-article probabilities and write that yearly
# mean onto every article row of the year.
for yr in years:
    in_year = df['year'] == yr
    n_in_year = df[in_year].shape[0]
    print("Number of articles = ", n_in_year, " for year ", yr)
    df.loc[in_year, 'P_atleast_F_year'] = df.loc[in_year, 'Prob_atleast_Fauthor'].sum()/n_in_year
    df.loc[in_year, 'P_atleast_M_year'] = df.loc[in_year, 'Prob_atleast_Mauthor'].sum()/n_in_year



In [None]:
# One horizontal bar per year, x axis fixed to the probability range [0, 1].
ax = sns.barplot(y="year", x="P_atleast_F_year", data=df, order=years, palette='rainbow')
ax.set_title('Prob of at least one female all journals per year')
plt.xlim([0,1])

In [None]:
# One horizontal bar per year, x axis fixed to the probability range [0, 1].
ax = sns.barplot(y="year", x="P_atleast_M_year", data=df, order=years, palette='rainbow')
ax.set_title('Prob of at least one male all journals per year')
plt.xlim([0,1])

#### Female & male percentages per year 

In [None]:
# To discuss (kept from an earlier draft): counting labels per year with
# str.contains('male') would also match 'female', so "male" must be matched
# exactly if that approach is revived.

# first author> someone needs to double check this!
# For each year, average First/Last_Author_probF and write the yearly mean (and
# its male counterpart) onto every selected article row of that year.
for yr in years:
    in_year = (df['year'] == yr)
    # (column prefix, wording used in output/column names, rows entering the mean)
    # The last-author selection additionally drops rows labelled "init"; the
    # first-author one does not. If "init" rows are kept, the female and male
    # probabilities do not sum to 1.
    selections = [
        ("First", "first", in_year),
        ("Last", "last", in_year & (df["Last_Author_gend"] != "init")),
    ]

    print("Number of articles = ", len(df[in_year].values), " for year ", yr)

    for prefix, wording, rows in selections:
        prob_f = df.loc[rows, prefix + '_Author_probF']
        n_rows = len(df.loc[rows])
        col_f = 'P_' + wording + '_F_year'
        col_m = 'P_' + wording + '_M_year'

        df.loc[rows, col_f] = prob_f.sum() / n_rows
        df.loc[rows, col_m] = (1. - prob_f).sum() / n_rows

        # to check that the two sum to 1, print
        # df.loc[rows, col_f].iloc[0:5] + df.loc[rows, col_m].iloc[0:5]
        print("Probability that " + wording + " author is female: ", df.loc[rows, col_f].iloc[0])
        print("Probability that " + wording + " author is male: ", df.loc[rows, col_m].iloc[0])

# Save the frame with all derived columns; the file name carries today's date.
df.to_csv("analysis_output_" + time.strftime("%Y-%m-%d.csv"))

In [None]:
# The first three charts are drawn and shown explicitly, one after another.
shown_charts = [
    ("P_first_M_year", 'Prob of first author male per year'),
    ("P_first_F_year", 'Prob of first author female per year'),
    ("P_last_M_year", 'Prob of last author male per year'),
]
for column, title in shown_charts:
    sns.barplot(y="year", x=column, data=df, order=years, palette='rainbow').set_title(title)
    plt.xlim([0,1])
    plt.show()

# The last chart is left to the notebook's end-of-cell rendering.
sns.barplot(y="year", x="P_last_F_year", data=df, order=years, palette='rainbow').set_title('Prob of last author female per year')
plt.xlim([0,1])

#### We should check if the numbers above are biased or are a consequence of female/male author distribution.

We can generate synthetic data using the distribution of female/male authors.

--> The synthetics have moved to the analysis-Synthetics notebook; the cells below are kept so that nothing useful is deleted by accident

In [None]:
def Prob_author(x,y, kind="female", allkinds="malefemale"):
    """Sum, over the authors of one article, of the probability of being `kind`.

    x        -- gender label of each author
    y        -- score of the assigned label for each author, in the same order
                (anything float() accepts)
    kind     -- the gender whose probability is accumulated
    allkinds -- labels that count as gendered authors; either a collection of
                labels or the legacy concatenated string "malefemale"

    Authors labelled `kind` contribute their score, other gendered authors
    contribute 1 - score, and every other label is skipped. Returns 0 when no
    author is counted.
    """
    if isinstance(allkinds, str):
        # Recover the individual labels from the concatenated string so that
        # membership is an exact match. A plain `elem in "malefemale"` is a
        # substring test: "" or "fe" would pass, and None would raise TypeError.
        # "female" is looked for first because it contains "male".
        remainder = allkinds
        labels = []
        for label in ("female", "male"):
            if label in remainder:
                labels.append(label)
                remainder = remainder.replace(label, "")
    else:
        labels = list(allkinds)

    total = 0  # not named `sum`, to leave the builtin untouched
    for i, elem in enumerate(x):
        if elem not in labels:
            continue
        if elem == kind:
            total += float(y[i])
        else:
            total += 1 - float(y[i])
    return total


# Per-article sums of the female / male probabilities over all gendered authors.
df['Prob_Fauthor'] = df.apply(
    lambda article: Prob_author(article.all_genders, article.all_percent, "female"),
    axis=1)
df['Prob_Mauthor'] = df.apply(
    lambda article: Prob_author(article.all_genders, article.all_percent, "male"),
    axis=1)

df

In [None]:
# Sorted array of the distinct publication years.
years = np.sort(df['year'].unique())
print(years)

# For each year: summed female / male probability divided by the number of
# authors whose label is not "init"; the result is written onto every article
# row of that year.
for yr in years:
    in_year = df['year'] == yr
    print("Number of articles = ", len(df[in_year].values), " for year ", yr)
    n_year = df.loc[in_year, 'all_genders'].apply(
        lambda labels: sum(1 for label in labels if label != "init")).sum()
    print(n_year)
    df.loc[in_year, 'P_F_year'] = df.loc[in_year, 'Prob_Fauthor'].sum()/n_year
    df.loc[in_year, 'P_M_year'] = df.loc[in_year, 'Prob_Mauthor'].sum()/n_year
    

In [None]:
# One horizontal bar per year, x axis fixed to the probability range [0, 1].
ax = sns.barplot(y="year", x="P_F_year", data=df, order=years, palette='rainbow')
ax.set_title('Prob of female per year')
plt.xlim([0,1])

In [None]:
# One horizontal bar per year, x axis fixed to the probability range [0, 1].
ax = sns.barplot(y="year", x="P_M_year", data=df, order=years, palette='rainbow')
ax.set_title('Prob of male per year')
plt.xlim([0,1])

In [None]:
### Define a random sampler from the distribution of female/male authors:

elements = ['female', 'male', 'init']
n_authors_total = df['Number_authors'].sum()
p1 = df['Prob_Fauthor'].sum()/n_authors_total #prob of an author being female
p2 = df['Prob_Mauthor'].sum()/n_authors_total #prob of an author being male
# prob of an author being init: whatever is left. When p1 + p2 is ~1 (no "init"
# authors in df), floating-point round-off can make the remainder a tiny
# negative number, and np.random.choice rejects negative probabilities, so the
# value is clamped at zero.
p3 = max(0.0, 1 - p1 - p2)

probabilities = [p1, p2 , p3]
print(probabilities)
# NOTE(review): the global NumPy RNG is not seeded here, so each run draws
# different samples -- consider seeding it in a configuration cell.
np.random.choice(elements, 10, p=probabilities) # example: take 10 random samples



In [None]:
# Synthetic data: for every article draw as many random labels as it has
# authors, using the overall label probabilities defined above.
synth_genders = df['Number_authors'].apply(
    lambda n_auth: np.random.choice(elements, n_auth, p=probabilities))

dfn = pd.DataFrame({'Synth_genders': synth_genders})
dfn

In [None]:
### Now check for these synthetics the probabilities:

# prob having at least one female author

def Prob_atleast_Fauthor_synth(x):
    """Return 1 if any label in x is 'female', otherwise 0.

    For sampled (certain) labels the "at least one female" probability of an
    article is simply this indicator.
    """
    return 1 if any(label == 'female' for label in x) else 0

# Indicator per synthetic article, then its average over all articles.
dfn['Prob_atleast_Fauthor_synth'] = dfn.apply(
    lambda article: Prob_atleast_Fauthor_synth(article.Synth_genders), axis=1)

share_atleast_f = dfn['Prob_atleast_Fauthor_synth'].sum()/dfn.shape[0]
print('Probability of having at least one female author in an article',
      share_atleast_f)

# prob having at least one male author

def Prob_atleast_Mauthor_synth(x):
    """Return 1 if any label in x is 'male', otherwise 0.

    For sampled (certain) labels the "at least one male" probability of an
    article is simply this indicator.
    """
    return 1 if any(label == 'male' for label in x) else 0

# Indicator per synthetic article, then its average over all articles.
dfn['Prob_atleast_Mauthor_synth'] = dfn.apply(
    lambda article: Prob_atleast_Mauthor_synth(article.Synth_genders), axis=1)

share_atleast_m = dfn['Prob_atleast_Mauthor_synth'].sum()/dfn.shape[0]
print('Probability of having at least one male author in an article',
      share_atleast_m)


### What is the prob of first and last authorships?


# (column name, index into the sampled labels, gender tested, printed label)
# Index 0 is the first author, index -1 the last author.
synth_columns = [
    ('First_Author_Fperc_synth', 0, 'female', 'First author female:'),
    ('First_Author_Mperc_synth', 0, 'male', 'First author male:'),
    ('Last_Author_Fperc_synth', -1, 'female', 'Last author female:'),
    ('Last_Author_Mperc_synth', -1, 'male', 'Last author male:'),
]

# 1/0 indicator columns, created in the order listed above.
for column, position, gender, message in synth_columns:
    dfn[column] = dfn['Synth_genders'].apply(
        lambda labels, position=position, gender=gender: 1 if labels[position] == gender else 0)

# Share of synthetic articles for which each indicator is 1.
for column, position, gender, message in synth_columns:
    print(message, dfn[column].sum()/dfn.shape[0])

In [None]:
# Scratch check: adds two probability values that were typed in by hand.
# NOTE(review): presumably the female and male shares printed by an earlier run
# of the sampler cell -- confirm, or remove this cell.
0.2537245014898006 + 0.7214072885629155