In [35]:
import pandas as pd
import numpy as np


In [74]:
# Load the author/journal table and collapse it to one row per researcher.
# Fully identical rows are removed first (repeated authors within the same
# paper); then, for every researcher_id, the first non-null journal title and
# the first non-null affiliation country code are kept.
# NOTE(review): if the CSV carries a unique per-row index column,
# drop_duplicates() cannot match any two rows -- confirm it has an effect.
df_all = (
    pd.read_csv('data_light/authors_journals.csv')
    .drop_duplicates()
    .groupby('researcher_id')[['journal.title', 'aff_country_code']]
    .first()
    .reset_index()
)

# Country codes classified as LMIC, as a NumPy array.
# NOTE(review): pandas parses the literal string "NA" as missing by default;
# if these are ISO alpha-2 codes, Namibia would be lost -- confirm.
lmics_codes = pd.read_csv('data_light/lmics_codes.csv')['Code'].values

# Function to determine LMIC and use inside apply
def is_lmic(x, lmics_codes):
    """Classify a country code as LMIC (1), non-LMIC (0) or unknown (NaN).

    Parameters
    ----------
    x : scalar
        Affiliation country code of one author; may be missing
        (``None`` or ``NaN``).
    lmics_codes : container
        Collection of LMIC country codes; only needs to support ``in``.

    Returns
    -------
    int or float
        ``np.nan`` when ``x`` is missing, ``1`` when ``x`` is contained in
        ``lmics_codes``, ``0`` otherwise.
    """
    # Missing values read from CSV are float NaN, not None, so an
    # `x == None` comparison never matches them and they would fall through
    # to 0. pd.isna covers both None and NaN.
    if pd.isna(x):
        return np.nan
    return 1 if x in lmics_codes else 0
    
# Add LMIC column: one flag per researcher, derived from the affiliation country code
df_all['LMIC'] = df_all['aff_country_code'].apply(lambda code: is_lmic(code, lmics_codes))

# Read the publications table once, loading only the columns used below
# (it feeds both the author-level and the publication-level export).
publications = pd.read_feather(
    'data/high_impact_publications.feather',
    columns=['researcher_id', 'gender', 'current_organization_id', 'pub_id', 'year'],
)

# Assign gender
gender_df = publications[['researcher_id', 'gender', 'current_organization_id']]

# Merge with the per-researcher LMIC flag. The default inner join keeps only
# researchers present in both tables. Columns are projected before
# de-duplicating so that uniqueness is judged on exactly what is exported.
# NOTE(review): a researcher whose gender or current_organization_id differs
# between rows of the feather file still yields several rows here -- confirm
# whether one row per researcher is expected.
df_final = gender_df.merge(df_all[['researcher_id', 'LMIC']], on='researcher_id')
df_final = df_final[['researcher_id', 'gender', 'LMIC', 'current_organization_id']].drop_duplicates()

df_final.to_csv('data_light/authors_info.csv', index=False)

# Assign year: one row per distinct (pub_id, year) pair
year_df = publications[['pub_id', 'year']].drop_duplicates()
year_df.to_csv('data_light/pubs_info.csv', index=False)

In [59]:
# Build the publication-author table: one row per (pub_id, researcher_id).
df = (
    pd.read_csv('data_light/authors_journals.csv')
    .drop(columns=['Unnamed: 0', 'aff_name', 'aff_city_id', 'author_name'])
    .rename(columns={'journal.title': 'journal'})
    # attach the publication year (default inner join on pub_id)
    .merge(pd.read_csv('data_light/pubs_info.csv'), on='pub_id')
    # attach gender and LMIC flag (default inner join on researcher_id)
    .merge(
        pd.read_csv('data_light/authors_info.csv').drop(columns=['current_organization_id']),
        on='researcher_id',
    )
    # collapse repeated pub/author pairs; LMIC uses max so that a single
    # LMIC-flagged row is enough to mark the author as LMIC
    .groupby(['pub_id', 'researcher_id'])
    .agg(
        journal=('journal', 'first'),
        year=('year', 'first'),
        gender=('gender', 'first'),
        aff_country_code=('aff_country_code', 'first'),
        aff_id=('aff_id', 'first'),
        LMIC=('LMIC', 'max'),
    )
    .reset_index()
)

# drop year 2023
df = df.loc[df['year'].ne(2023)]

# exclude papers with only one author (rows per pub_id == authors per paper)
authors_per_pub = df.groupby('pub_id')['pub_id'].transform('size')
df1 = df[authors_per_pub > 1]

n_authors_start = df['researcher_id'].nunique()
n_pubs_start = df['pub_id'].nunique()
n_authors_multi = df1['researcher_id'].nunique()
print(f"{n_authors_start} authors (beginning)")
print(f"{n_pubs_start} publications (beginning)")
print(f"{n_authors_start - n_authors_multi} authors dropped where there was only one author")
print(f"{n_authors_multi} authors")


# exclude papers with more than 30, 40, 50 authors
maxs = [30, 40, 50]

# paper sizes do not depend on the threshold, so compute them once
pub_sizes = df1.groupby('pub_id')['pub_id'].transform('size')

for max_ in maxs:
    df2 = df1[pub_sizes <= max_]
    n_authors_kept = df2['researcher_id'].nunique()
    print(f"{n_authors_multi - n_authors_kept} authors dropped where papers had more than {max_} authors")
    print(f"{n_authors_kept} authors")

    df2.to_csv(f'data_light/clean/max_{max_}_authors.csv', index=False)


183471 authors (beginning)
97335 publications (beginning)
7681 authors dropped where there was only one author
175790 authors
13658 authors dropped where papers had more than 30 authors
162132 authors
8529 authors dropped where papers had more than 40 authors
167261 authors
5903 authors dropped where papers had more than 50 authors
169887 authors


In [78]:
# Read the max-30/40/50-author datasets and write four imputed variants of
# each: missing gender filled with 'female' (optimistic) or 'male'
# (pessimistic), and missing LMIC filled with 1 (optimistic) or 0 (pessimistic).
#
# Each variant is built with assign(...fillna(...)) rather than
# df[col].fillna(..., inplace=True): the in-place form operates on a temporary
# column object, emits a FutureWarning in pandas 2.x and leaves the frame
# unchanged when Copy-on-Write is enabled.
#
# df_opti / df_pessi are reused for both the gender and the LMIC variants, so
# after the loop they hold the LMIC-imputed frames of the last max_ value.
maxs = [30, 40, 50]

for max_ in maxs:
    df_baseline = pd.read_csv(f'data_light/clean/max_{max_}_authors.csv')

    # replace missing gender info by female
    df_opti = df_baseline.assign(gender=df_baseline['gender'].fillna('female'))
    df_opti.to_csv(f'data_light/clean/gender_opti/max_{max_}_authors.csv', index=False)

    # replace missing gender info by male
    df_pessi = df_baseline.assign(gender=df_baseline['gender'].fillna('male'))
    df_pessi.to_csv(f'data_light/clean/gender_pessi/max_{max_}_authors.csv', index=False)

    # replace missing LMIC info by 1
    df_opti = df_baseline.assign(LMIC=df_baseline['LMIC'].fillna(1))
    df_opti.to_csv(f'data_light/clean/lmic_opti/max_{max_}_authors.csv', index=False)

    # replace missing LMIC info by 0
    df_pessi = df_baseline.assign(LMIC=df_baseline['LMIC'].fillna(0))
    df_pessi.to_csv(f'data_light/clean/lmic_pessi/max_{max_}_authors.csv', index=False)

In [33]:
# Gender distribution of the first listed row of every publication in df_opti.
# GroupBy.first() returns the first non-null gender per pub_id in row order,
# and value_counts() leaves missing values out of the proportions.
# NOTE(review): the cleaned CSVs come from a frame grouped by
# (pub_id, researcher_id), so row order within a paper presumably follows
# researcher_id rather than byline position -- confirm that this really is
# the first author.
# NOTE(review): when the cells run top to bottom, df_opti is the LMIC-imputed
# frame of the last max_ value, not a gender-imputed one -- confirm intent.
(
    df_opti
    .groupby('pub_id')['gender']
    .first()
    .value_counts(normalize=True)
)

male      0.630675
female    0.369325
Name: gender, dtype: float64

In [34]:
# Overall gender proportions across all publication-author rows of df_opti
# (value_counts skips missing values by default).
df_opti['gender'].value_counts(normalize=True)

male      0.610233
female    0.389767
Name: gender, dtype: float64