In [1]:
import time
import population_pyramids, pickle
import matplotlib.pyplot as plt
import numpy as np
import os
import json
from datetime import datetime

Set the following parameters in the configuration cell below:

- ``input_data_name``: *string* type. File name of author data. The file ``<input_data_name>.jsonl`` must be placed in the ``researcher_population_pyramids/framework/data/`` directory.
- ``base_year``: *int* type. The base year from which the population pyramid is extended for future projections (i.e., using observed data up to this year).
- ``target_past_years``: *list* type. A list of specific past years for which to create the observed population pyramids. Each year must fall within the range of your provided publication data.
- ``t_max_projection``: *int* type. The maximum year up to which population pyramid projections are generated.
- ``survival_prob_threshold``: *float* type. The threshold for the survival probability of an author's inter-publication interval, used to identify "active" authors (e.g., 0.01 was used in our manuscript).

In [2]:
# Author data file name, without extension; the data is read from
# ./data/<input_data_name>.jsonl.
input_data_name = "author_sample_lst"

# Years of interest: the observed years to build pyramids for, the year the
# projection starts from, and the last projected year.
target_past_years = [2010, 2023]
base_year = 2023
t_max_projection = 2050

# Survival-probability cut-off on inter-publication intervals, used to decide
# which authors count as "active".
survival_prob_threshold = 0.01

In [3]:
# Load the author records: one JSON object per line of
# ./data/<input_data_name>.jsonl.
f_path = './data/' + input_data_name + '.jsonl'
# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(f_path, encoding='utf-8') as f:
    # Skip blank lines so a stray empty line does not abort the whole load.
    author_sample_lst = [json.loads(line) for line in f if line.strip()]

# Tally authors by gender code (0 -> female, 1 -> male; any other code is not
# counted) and collect the year of every publication date.
female_count, male_count = 0, 0
pub_y_lst = []
for author in author_sample_lst:
    g = author["gender"]
    if g == 0:
        female_count += 1
    elif g == 1:
        male_count += 1
    # Publication dates are parsed as 'YYYY-MM-DD'; only the year is kept.
    pub_y_lst.extend(
        datetime.strptime(str(pub_d), '%Y-%m-%d').year
        for pub_d in author["pub_date_lst"]
    )

# Fail with an explicit message rather than the bare "min() arg is an empty
# sequence" error when the data holds no publication dates.
if not pub_y_lst:
    raise ValueError(f"No publication dates found in {f_path}")

t_min, t_max = min(pub_y_lst), max(pub_y_lst)

print(f"Number of authors: {female_count + male_count}, Number of female authors: {female_count}, Number of male authors: {male_count}")
print(f"Oldest publication year in the data: {t_min}, Newest publication year in the data: {t_max}")

Number of authors: 191086, Number of female authors: 81570, Number of male authors: 109516
Oldest publication year in the data: 1950, Newest publication year in the data: 2023


In [4]:
# Build the observed pyramids for every year in target_past_years and the
# projection up to t_max_projection, then print summary statistics for base_year.
researcher_population_pyramid = {}

print("Computing researcher population pyramids...")

t_s = time.time()

(female_survival_probability, male_survival_probability) = population_pyramids.calc_survival_probability_for_publication_interval(input_data_name)

# Index 0 holds the female threshold, index 1 the male one.
pub_interval_threshold = population_pyramids.calc_threshold_for_publication_interval(female_survival_probability, male_survival_probability, survival_prob_threshold)

# The thresholds are divided by 365 to report them in years.
print(f"Inter-publication threshold (in years): Female {float(pub_interval_threshold[0])/365}, Male {float(pub_interval_threshold[1])/365}")

# Observed years are stored as (female_count, male_count).
for target_year in target_past_years:
    female_productivity, male_productivity, female_count, male_count = population_pyramids.calc_researcher_population_pyramid(input_data_name, pub_interval_threshold, target_year)
    researcher_population_pyramid[target_year] = (female_count, male_count)

# The projection is stored under t_max_projection as a 6-tuple; its count
# entries are indexed by year first.
target_year = t_max_projection
future_female_count, future_male_count, female_prob, male_prob, female_newcomer_count, male_newcomer_count = population_pyramids.calc_future_researcher_population_pyramid(input_data_name, pub_interval_threshold, base_year, target_year)
researcher_population_pyramid[target_year] = (future_female_count, future_male_count, female_prob, male_prob, female_newcomer_count, male_newcomer_count)

t_e = time.time()

print("Elapsed time: ", t_e - t_s)

(female_count, male_count, female_prob, male_prob, female_newcomer_count, male_newcomer_count) = researcher_population_pyramid[t_max_projection]

# Every statistic below is computed for base_year, so the messages report
# base_year as well (not t_max, which only coincides with it when the base
# year is the newest year in the data).
total_female = np.sum(list(female_count[base_year].values()))
total_male = np.sum(list(male_count[base_year].values()))
total_active = total_female + total_male
newcomer_count = np.sum(list(female_newcomer_count.values())) + np.sum(list(male_newcomer_count.values()))
inflow_ratio = float(newcomer_count) / total_active
print(f"Number of active authors in {base_year}: {total_active}, Number of newly active authors in {base_year}: {newcomer_count}, Researcher inflow in {base_year}: {inflow_ratio}")

# Mean cumulative productivity per gender: sum of (productivity k * number of
# authors at k) divided by the number of active authors of that gender.
car_mat_f = float(np.sum([k * female_count[base_year][k] for k in female_count[base_year]])) / total_female
car_mat_m = float(np.sum([k * male_count[base_year][k] for k in male_count[base_year]])) / total_male
# Relative female-vs-male difference, as a percentage with two decimals.
car_mat_gender_gap = '{:.2f}'.format(100 * float(car_mat_f - car_mat_m) / car_mat_m)
print(f"Female mean of the cumulative productivity in {base_year}: {car_mat_f}, Male mean of the cumulative productivity in {base_year}: {car_mat_m}, Gender gap in cumulative productivity in {base_year}: {car_mat_gender_gap}")

# Optional: cache the result so the plotting cell can reload it.
# f_path = ("./data/researcher_population_pyramid.pkl")
# with open(f_path, "wb") as f:
#     pickle.dump(researcher_population_pyramid, f)

Computing researcher population pyramids...
Inter-publication threshold (in years): Female 14.676712328767124, Male 14.775342465753425
Elapsed time:  66.93786025047302
Number of active authors in 2023: 79913.0, Number of newly active authors in 2023: 7955, Researcher inflow in 2023: 0.09954575600966051
Female mean of the cumulative productivity in 2023: 9.314077713000502, Male mean of the cumulative productivity in 2023: 11.984584363210615, Gender gap in cumulative productivity in 2023: -22.28


The following files will be generated in the `researcher_population_pyramids/framework/data/` directory:

- `female_author_count.csv`: For each cumulative productivity, the number of female active authors is shown for each year (observed years in *target_past_years* and projected years from *base_year + 1* to *t_max_projection*).
- `female_inflow_<base_year>.csv`: For each cumulative productivity, the number of female newly active authors in year *base_year* is shown.
- `female_trans_prob_from_<base_year-1>_to_<base_year>.csv`: The probability that a female active author with cumulative productivity *k_1* in *base_year-1* transitions to cumulative productivity *k_2* in *base_year*.
- `male_author_count.csv`: For each cumulative productivity, the number of male active authors is shown for each year (observed years in *target_past_years* and projected years from *base_year + 1* to *t_max_projection*).
- `male_inflow_<base_year>.csv`: For each cumulative productivity, the number of male newly active authors in year *base_year* is shown.
- `male_trans_prob_from_<base_year-1>_to_<base_year>.csv`: The probability that a male active author with cumulative productivity *k_1* in *base_year-1* transitions to cumulative productivity *k_2* in *base_year*.

In [5]:
# Hand the computed pyramids and the year settings to population_pyramids.save
# (presumably this writes the CSV files listed in the notes above — confirm
# against the module).
population_pyramids.save(researcher_population_pyramid, target_past_years, base_year, t_max_projection)

In [6]:
def plot_researcher_population_pyramid(female_count, male_count, base_year, target_year, max_n=1000000, max_productivity=100):
    """Draw a female/male population pyramid and save it as a PDF under ./figs.

    Parameters
    ----------
    female_count, male_count : mapping
        Number of authors keyed by cumulative productivity; missing keys are
        treated as 0.
    base_year : int, str or None
        Year the projection is based on. Pass "" (or None) for an observed
        year; the "_based_on_<base_year>" suffix is then left out of the
        file name.
    target_year : int
        Year shown in the figure; used in the output file name.
    max_n : int, optional
        Upper limit of the (log-scaled) author-count axis.
    max_productivity : int, optional
        Largest cumulative productivity drawn; bars cover 1..max_productivity.

    Returns
    -------
    None
        The figure is written to disk and closed.
    """
    fontsize = 30
    # Note: this changes the global matplotlib font size, not just this figure.
    plt.rcParams["font.size"] = fontsize

    productivities = list(range(1, max_productivity + 1))
    yticks = np.array(range(0, max_productivity + 1, 10))

    def bar_lengths(count):
        # A zero count cannot be drawn on a log axis, so it is replaced by 0.1,
        # which lies outside the visible range [1, max_n].
        return [count.get(n, 0) if count.get(n, 0) > 0 else 0.1 for n in productivities]

    fig, ax = plt.subplots(ncols=2, figsize=(15, 10))

    # Female: left panel, x axis reversed so bars grow towards the left, with
    # the tick labels on its right edge (between the two panels).
    ax[0].barh(productivities, bar_lengths(female_count), color='darkgreen', height=0.5, label='Female', log=True)
    ax[0].yaxis.tick_right()
    ax[0].set_yticks(yticks)
    ax[0].set_xlim([max_n, 1])
    ax[0].set_xlabel('Number of authors')
    ax[0].set_title('Female')

    # Male: right panel. The bar lengths are derived from male_count alone, so
    # a productivity level with no female authors still shows its male authors.
    ax[1].barh(productivities, bar_lengths(male_count), color='orange', height=0.5, label='Male', log=True)
    ax[1].set_yticks(yticks)
    # The female panel already carries the shared tick labels.
    ax[1].set_yticklabels([])
    ax[1].set_xlim([1, max_n])
    ax[1].set_title('Male')

    # Shared y-axis caption placed between the two panels.
    fig.text(0.51, 0.5, 'Cumulative productivity', va='center', ha='center',
             rotation='vertical', fontsize=fontsize)

    plt.subplots_adjust(left=0.05, right=0.95, bottom=0.12, top=0.95, wspace=0.33)

    os.makedirs('./figs', exist_ok=True)

    out_path = './figs/researcher_population_pyramid_' + str(target_year)
    if not (base_year is None or base_year == ""):
        out_path += "_based_on_" + str(base_year)
    plt.savefig(out_path + '.pdf')
    plt.clf()
    plt.close()

    return

The following figures will be generated in the `researcher_population_pyramids/framework/figs/` directory:

- `researcher_population_pyramid_<target_past_year>.pdf`: Population pyramid for each year in *target_past_years*.
- `researcher_population_pyramid_<t_max_projection>_based_on_<base_year>.pdf`: Population pyramid for the *t_max_projection* year, predicted based on *base_year* data.

In [7]:
# Optional: reload a previously cached result instead of recomputing it.
# f_path = ("./data/researcher_population_pyramid.pkl")
# with open(f_path, mode="rb") as f:
#     researcher_population_pyramid = pickle.load(f)

# Observed pyramids: one figure per past year. The empty string is passed for
# the base-year argument directly, so the configured `base_year` is not
# overwritten for later cells or re-runs.
for target_year in target_past_years:
    (female_count, male_count) = researcher_population_pyramid[target_year]
    plot_researcher_population_pyramid(female_count, male_count, "", target_year, max_n=1000000, max_productivity=100)

# Projected pyramid: labelled with the configured `base_year` the projection
# was computed from, rather than with the newest year found in the data.
target_year = t_max_projection
(female_count, male_count, female_prob, male_prob, female_newcomer_count, male_newcomer_count) = researcher_population_pyramid[target_year]
plot_researcher_population_pyramid(female_count[target_year], male_count[target_year], base_year, target_year, max_n=1000000, max_productivity=100)