## A notebook to generate synthetic data as a reference



In [None]:
# Install the following packages in the environment:
# python3 -m pip install pandas
# python3 -m pip install seaborn

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import json

import os

from read_jsondata import read_jsons

In [None]:
# Define local paths

# Current working directory of the kernel. Taken from the standard library
# rather than by shelling out to `pwd`, so the cell is plain Python and does
# not depend on a `pwd` command being available.
root = os.getcwd()

# Directory holding the raw per-author gender data. The string is built exactly
# as before (including the trailing slash) because it is handed to read_jsons,
# whose path handling is not visible here.
RAW_DIR = root + "/author_allgenders/"

# Best-effort warning only: the notebook keeps running so the reader sees the
# message before the loading cell fails.
if not os.path.exists(RAW_DIR):
    print("The directory {} does not exist.\nThere is no raw data for statistical analysis.".format(RAW_DIR))

In [None]:
# Load the raw data found under RAW_DIR; the result is used as a pandas
# DataFrame in the rest of the notebook.
df = read_jsons(RAW_DIR)

# Number of authors of each article = length of its `all_genders` list
df['Number_authors'] = df['all_genders'].apply(len)


### Synthetics

#### We should check whether the numbers above reflect a bias or are simply a consequence of the female/male author distribution.

We can generate synthetic data using the distribution of female/male authors.


#### 1. Probability of an author being female/male:

In [None]:
def Prob_author(x,y, kind="female", allkinds="malefemale"):
    """Accumulate, over one article's authors, the weight of gender ``kind``.

    Parameters
    ----------
    x : iterable
        Gender label of each author.
    y : indexable
        One value per author, at the same position as in ``x``; each value is
        converted with ``float``. Presumably the confidence (0..1) of the
        assigned label -- confirm against the data loader.
    kind : str
        Gender whose weight is accumulated.
    allkinds : str or container
        Labels that are taken into account, tested with ``in``. Labels for
        which ``label in allkinds`` is false are skipped.

    Returns
    -------
    int or float
        Sum over the considered authors of ``y[i]`` when the label equals
        ``kind`` and of ``1 - y[i]`` otherwise; ``0`` if none is considered.
    """
    total = 0
    for position, label in enumerate(x):
        if label in allkinds:
            weight = float(y[position])
            # A label of the other gender contributes the complementary weight
            total += weight if label == kind else 1 - weight
    return total


# Per-article weight of female authors, computed row by row with Prob_author
df['Prob_Fauthor'] = df.apply(
    lambda row: Prob_author(row.all_genders, row.all_percent, kind="female"),
    axis=1,
)

# Same computation for male authors
df['Prob_Mauthor'] = df.apply(
    lambda row: Prob_author(row.all_genders, row.all_percent, kind="male"),
    axis=1,
)
#df

#### 2. Create some synthetic genders for authors:

In [None]:
### Define a random sampler from the distribution of female/male authors:

# Categories a synthetic author can be drawn from
elements = ['female', 'male', 'init']

# p1 / p2: summed female / male weights divided by the total number of authors
p1, p2 = (
    df[weight_column].sum() / df['Number_authors'].sum()
    for weight_column in ('Prob_Fauthor', 'Prob_Mauthor')
)

# p3: remaining probability mass, assigned to the 'init' category
p3 = 1 - p1 - p2

probabilities = [p1, p2, p3]

# Example: draw 10 random samples from the distribution
np.random.choice(elements, size=10, p=probabilities)

In [None]:
## Define some useful functions:
def Prob_atleast_cond_l(x, kind, allkinds="malefemale"):
    """Tell whether any author except the last one has gender ``kind``.

    Parameters
    ----------
    x : sliceable sequence
        Gender labels of one article's authors; the last entry is left out.
    kind : str
        Gender label to look for.
    allkinds : str or container
        Labels that are taken into account, tested with ``in``; any other
        label is skipped.

    Returns
    -------
    int
        1 if at least one considered label in ``x[:-1]`` equals ``kind``,
        otherwise 0.
    """
    found = 0
    for label in x[:-1]:
        # Only labels accepted by `allkinds` can count as a match
        if label in allkinds and label == kind:
            found = 1
    return found



def Prob_atleast_cond_f(x, kind, allkinds="malefemale"):
    """Tell whether any author except the first one has gender ``kind``.

    Parameters
    ----------
    x : sliceable sequence
        Gender labels of one article's authors; the first entry is left out.
    kind : str
        Gender label to look for.
    allkinds : str or container
        Labels that are taken into account, tested with ``in``; any other
        label is skipped.

    Returns
    -------
    int
        1 if at least one considered label in ``x[1:]`` equals ``kind``,
        otherwise 0.
    """
    # Collect every co-author label that is both considered and equal to `kind`
    matches = [label for label in x[1:] if label in allkinds and label == kind]
    return 1 if matches else 0

In [None]:
# Generate synthetic genders for each article, keeping its number of authors,
# and estimate by repeated sampling how often an article has at least one
# female co-author given the gender of its first / last author.

N = 10     # number of synthetic replicates to average over
SEED = 42  # fixed seed so that re-running this cell prints the same estimates

# Local generator: reproducible draws without touching NumPy's global state.
rng = np.random.default_rng(SEED)


def _share_of_ones(flags):
    """Return the mean of a 0/1 column, or NaN when the column is empty."""
    return flags.sum() / len(flags) if len(flags) else float('nan')


# Running sums of the per-replicate shares (divided by N at the end)
pf_lf = 0
pf_lm = 0
pf_ff = 0
pf_fm = 0

for i in range(N):

    df2 = pd.DataFrame()

    # One random gender per author, drawn from the global author distribution
    df2['Synth_genders'] = df['Number_authors'].apply(lambda x: rng.choice(elements, x, p=probabilities))
    df2['Number_authors'] = df['Number_authors']

    # First and last author's synthetic gender (None for an article without
    # authors; such rows are dropped by the `Number_authors > 1` filter below)
    df2['First_Author_gend'] = df2['Synth_genders'].apply(lambda g: g[0] if len(g) else None)
    df2['Last_Author_gend'] = df2['Synth_genders'].apply(lambda g: g[-1] if len(g) else None)

    ### For example: what is the prob of at least a female coauthor if first/last is male/female??

    df2_lf = df2[(df2.Last_Author_gend == 'female') & (df2.Number_authors > 1)].copy() # only articles with last female
    df2_lm = df2[(df2.Last_Author_gend == 'male') & (df2.Number_authors > 1)].copy() # only articles with last male
    df2_ff = df2[(df2.First_Author_gend == 'female') & (df2.Number_authors > 1)].copy() # only articles with first female
    df2_fm = df2[(df2.First_Author_gend == 'male') & (df2.Number_authors > 1)].copy() # only articles with first male

    # The indicator columns are computed from the gender column alone, so the
    # function is applied to that Series instead of to whole rows; this also
    # stays well-defined when a subset happens to be empty.
    # The *_Mauthor_* columns are not read in this cell; they are kept because
    # later cells may use them.

    ### Create new columns with useful probabilities (last author condition)

    df2_lf['Prob_atleast_Fauthor_lf'] = df2_lf['Synth_genders'].apply(lambda g: Prob_atleast_cond_l(g, 'female'))
    df2_lf['Prob_atleast_Mauthor_lf'] = df2_lf['Synth_genders'].apply(lambda g: Prob_atleast_cond_l(g, 'male'))

    df2_lm['Prob_atleast_Fauthor_lm'] = df2_lm['Synth_genders'].apply(lambda g: Prob_atleast_cond_l(g, 'female'))
    df2_lm['Prob_atleast_Mauthor_lm'] = df2_lm['Synth_genders'].apply(lambda g: Prob_atleast_cond_l(g, 'male'))

    ### Create new columns with useful probabilities (first author condition)

    df2_ff['Prob_atleast_Fauthor_ff'] = df2_ff['Synth_genders'].apply(lambda g: Prob_atleast_cond_f(g, 'female'))
    df2_ff['Prob_atleast_Mauthor_ff'] = df2_ff['Synth_genders'].apply(lambda g: Prob_atleast_cond_f(g, 'male'))

    df2_fm['Prob_atleast_Fauthor_fm'] = df2_fm['Synth_genders'].apply(lambda g: Prob_atleast_cond_f(g, 'female'))
    df2_fm['Prob_atleast_Mauthor_fm'] = df2_fm['Synth_genders'].apply(lambda g: Prob_atleast_cond_f(g, 'male'))

    ### Sum probabilities to make the average later
    # An empty subset contributes NaN instead of dividing by zero, so the
    # problem shows up in the printed result.

    pf_lf += _share_of_ones(df2_lf['Prob_atleast_Fauthor_lf'])
    pf_lm += _share_of_ones(df2_lm['Prob_atleast_Fauthor_lm'])

    pf_ff += _share_of_ones(df2_ff['Prob_atleast_Fauthor_ff'])
    pf_fm += _share_of_ones(df2_fm['Prob_atleast_Fauthor_fm'])


### Averages over the N replicates

print('Probability of having at least one female author in an article with last female author', 
      pf_lf/N)
print('Probability of having at least one female author in an article with last male author', 
      pf_lm/N)


print('Probability of having at least one female author in an article with first female author', 
      pf_ff/N)
print('Probability of having at least one female author in an article with first male author', 
      pf_fm/N)