In [24]:
import math

import numpy as np
import pycountry
import pickle
import pandas as pd
import statsmodels.api as sm
from scipy import stats
import models
import gender_analysis
from sklearn.linear_model import LinearRegression
from scipy.stats import ttest_ind

In [2]:
# Parameters
orbis_test_size = 450
wgnd_test_size = 5000
random_state = 42
shuffle = True
include_middle_name = False
include_last_name = False
clearning_special_chars = True
min_ratio_train_to_test = 5
min_ratio_minority_to_majority = 0.2
min_name_samples = 500
arab_league = {
    "DZ", "BH", "KM", "DJ", "EG", "IQ", "JO", "KW", "LB", "LY", "MR", 
    "MA", "OM", "PS", "QA", "SA", "SO", "SD", "SY", "TN", "AE", "YE",
}

In [3]:
arab_countries = ["DZ", "EG", "IQ", "JO", "LB", "MA", "SA", "TN", "AE"]
comparison_countries = ["AU", "CA", "FR", "DE", "IT", "JP", "ES", "SE", "GB", "US"]

In [28]:
# Country selection
with open("./data/data_sets.pkl", 'rb') as f:
    data_by_country = pickle.load(f)

name_count_by_country = {c: (len(data_by_country[c][0]), len(data_by_country[c][1])) for c in data_by_country}

all_country_lst = [_[0] for _ in sorted(list(name_count_by_country.items()), key=lambda x: x[1], reverse=True)]
country_lst = []
n, n_ = 0, 0
arab_minority_group = set()
arab_name_samples = {}
for country in all_country_lst:
    f_name_count = len(data_by_country[country][0])
    m_name_count = len(data_by_country[country][1])
    
    if country in arab_league:
        arab_name_samples[country] = (f_name_count, m_name_count)
        
    f_test_size = orbis_test_size
    f_validation_size = orbis_test_size
    f_remaining_size = f_name_count - f_test_size - f_validation_size
    m_test_size = orbis_test_size
    m_validation_size = orbis_test_size
    m_remaining_size = m_name_count - m_test_size - m_validation_size
    
    if f_remaining_size <= m_remaining_size:
        f_train_size = f_remaining_size
        m_train_size = int(float(f_remaining_size) / min_ratio_minority_to_majority)
    else:
        f_train_size = int(float(f_remaining_size) / min_ratio_minority_to_majority)
        m_train_size = m_remaining_size
        
    #print(country, f_name_count, f_test_size + f_validation_size, m_name_count, m_test_size + m_validation_size)

    if f_train_size < f_test_size * min_ratio_train_to_test or m_train_size < m_test_size * min_ratio_train_to_test:
        if country in arab_league and f_name_count > orbis_test_size and m_name_count > orbis_test_size:
            arab_minority_group.add(country)
        continue
        
    n_ += f_name_count + m_name_count
    country_lst.append(country)

country_lst += list(arab_minority_group)

countries_with_low_auc = {'CN', 'IN', 'BD', 'KR'}
    
f_name = "./data/complementnb_best_alpha.pickle"
with open(f_name, "rb") as f:
    nb_alpha = pickle.load(f)

f_name = "./data/complementnb_prob_threshold_in_orbis.pickle"
with open(f_name, "rb") as f:
    nb_prob_threshold_orbis = pickle.load(f)

f_name = "./data/complementnb_prob_threshold_in_wgnd.pickle"
with open(f_name, "rb") as f:
    nb_prob_threshold_wgnd = pickle.load(f)

nb_prob_threshold = {}
trained_country_lst = []
passed_country_lst = []
for country in country_lst:
    
    if country in nb_prob_threshold_orbis and country in nb_prob_threshold_wgnd:
        trained_country_lst.append(country)
    
    if (country not in nb_prob_threshold_orbis or country not in nb_prob_threshold_wgnd 
            or country in countries_with_low_auc):
        continue
        
    p1 = nb_prob_threshold_orbis[country]
    p2 = nb_prob_threshold_wgnd[country]
    nb_prob_threshold[country] = max(p1, p2)
    passed_country_lst.append(country)
    
print(len(trained_country_lst))
print(trained_country_lst)
    
country_lst = list(passed_country_lst)
    
f_path = ("./data/author_gender"
          + "_middle_name_" + str(include_middle_name)
          + "_last_name_" + str(include_last_name)
          + "_special_chars_" + str(clearning_special_chars)
          + ".pkl")
with open(f_path, mode="rb") as f:
    author_gender = pickle.load(f)

f_path = ("./data/author_sample_lst"
          + "_middle_name_" + str(include_middle_name)
          + "_last_name_" + str(include_last_name)
          + "_special_chars_" + str(clearning_special_chars)
          + ".pkl")

with open(f_path, mode="rb") as f:
    (author_name_corpus, author_sample_lst) = pickle.load(f)
    
num_authors_by_country = {}
for country in country_lst:

    n_f, n_m = 0, 0
    for author_id in author_sample_lst[country]:
        g = author_gender.get(author_id, -1)
        if g == 0:
            n_f += 1
        elif g == 1:
            n_m += 1

    num_authors_by_country[country] = [n_f, n_m]
    
final_country_lst = []
for country in country_lst:
    if (country not in nb_prob_threshold_orbis or country not in nb_prob_threshold_wgnd 
            or country in countries_with_low_auc):
        continue
        
    n_f = num_authors_by_country[country][0]
    n_m = num_authors_by_country[country][1]
    
    if n_f < 1000 or n_m < 1000:
        continue
        
    final_country_lst.append(country)

country_lst = list(final_country_lst)

print(len(passed_country_lst), len(final_country_lst))
print(set(passed_country_lst) - set(final_country_lst))
print(final_country_lst)

65
['RU', 'GB', 'ZA', 'IT', 'RO', 'BG', 'NO', 'CH', 'NZ', 'RS', 'CZ', 'IN', 'CN', 'DE', 'JO', 'IE', 'US', 'FR', 'PH', 'PL', 'LB', 'AU', 'PG', 'BH', 'OM', 'DZ', 'NG', 'SE', 'ES', 'IS', 'NL', 'MD', 'EG', 'CA', 'PT', 'TR', 'KE', 'KW', 'LT', 'AT', 'DK', 'AE', 'JP', 'LS', 'ZW', 'BE', 'EE', 'IR', 'ME', 'BD', 'SA', 'LK', 'KR', 'JM', 'GH', 'AL', 'BY', 'IL', 'MA', 'BA', 'CY', 'SY', 'SO', 'IQ', 'TN']
61 58
{'LS', 'SO', 'PG'}
['RU', 'GB', 'ZA', 'IT', 'RO', 'BG', 'NO', 'CH', 'NZ', 'RS', 'CZ', 'DE', 'JO', 'IE', 'US', 'FR', 'PH', 'PL', 'LB', 'AU', 'BH', 'OM', 'DZ', 'NG', 'SE', 'ES', 'IS', 'NL', 'MD', 'EG', 'CA', 'PT', 'TR', 'KE', 'KW', 'LT', 'AT', 'DK', 'AE', 'JP', 'ZW', 'BE', 'EE', 'IR', 'ME', 'SA', 'LK', 'JM', 'GH', 'AL', 'BY', 'IL', 'MA', 'BA', 'CY', 'SY', 'IQ', 'TN']


In [5]:
f_path = ("./data/author_gender"
          + "_middle_name_" + str(include_middle_name)
          + "_last_name_" + str(include_last_name)
          + "_special_chars_" + str(clearning_special_chars)
          + ".pkl")
with open(f_path, mode="rb") as f:
    author_gender = pickle.load(f)
    
n = 0
for a_id in author_gender:
    if author_gender[a_id] in {0, 1}:
        n += 1
print(n)

15107054


In [7]:
f_path = ("./data/num_women_and_men_by_year"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")

with open(f_path, mode="rb") as f:
    num_women_and_men_by_year = pickle.load(f)
    
HAC_MAX_LAGS = 3 

print("\\begin{tabular}{lllll}")
print("\\hline")
print("Country & Slope Before & Slope After & t-statistic & p-value \\\\")
print("\\hline")

for country in ["EG", "TN"]:
    your_proportion_data = np.array([
        float(num_women_and_men_by_year[country][y][0]) / np.sum(num_women_and_men_by_year[country][y]) 
        for y in range(2000, 2024)
    ])
    
    df = pd.DataFrame({
        'year': np.arange(2000, 2024),
        'proportion': your_proportion_data
    })
    
    df_filtered = df[~df['year'].between(2010, 2013)].copy() # .copy() を追加
    
    df_before = df_filtered[df_filtered['year'] < 2010].copy()
    df_after = df_filtered[df_filtered['year'] > 2013].copy()
    
    X_b = sm.add_constant(df_before['year'])
    y_b = df_before['proportion']
    model_b = sm.OLS(y_b, X_b)
    
    results_b_hac = model_b.fit(cov_type='HAC', cov_kwds={'maxlags': HAC_MAX_LAGS})
    
    X_a = sm.add_constant(df_after['year'])
    y_a = df_after['proportion']
    model_a = sm.OLS(y_a, X_a)
    
    results_a_hac = model_a.fit(cov_type='HAC', cov_kwds={'maxlags': HAC_MAX_LAGS})
    
    slope_b = results_b_hac.params['year']
    slope_a = results_a_hac.params['year']
    
    se_b = results_b_hac.bse['year']
    se_a = results_a_hac.bse['year']
    
    slope_diff = slope_b - slope_a
    se_diff = np.sqrt(se_b**2 + se_a**2) 
    
    t_stat = slope_diff / se_diff
    
    df_combined = len(y_b) + len(y_a) - 4 
    
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=df_combined))
    
    print(" & ".join([
        str(country), 
        str('{:.4f}'.format(slope_b)), 
        str('{:.4f}'.format(slope_a)), 
        str('{:.4f}'.format(t_stat)), 
        str('{:.4f}'.format(p_value))
    ]) + " \\\\")
    
print("\\hline")

\begin{tabular}{lllll}
\hline
Country & Slope Before & Slope After & t-statistic & p-value \\
\hline
EG & 0.0048 & 0.0091 & -6.0613 & 0.0000 \\
TN & 0.0144 & 0.0088 & 3.1868 & 0.0057 \\
\hline


In [12]:
# Table 1

country_name = {
    "TN": "Tunisia",
    "SA": "Saudi Arabia",
    "EG": "Egypt",
    "MA": "Morocco",
    "JO": "Jordan",
    "AE": "United Arab Emirates",
    "LB": "Lebanon",
    "DZ": "Algeria",
    "IQ": "Iraq",
    "JP": "Japan",
    "US": "United States",
    "GB": "United Kingdom",
    "FR": "France",
    "DE": "Germany",
    "ES": "Spain",
    "IT": "Italy",
    "CA": "Canada",
    "AU": "Australia",
    "SE": "Sweden",
}

f_path = ("./data/productive_people_pyramid"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")

with open(f_path, mode="rb") as f:
    productive_people_pyramid = pickle.load(f)
    
def cagr(n1, n2, t1, t2):
    return (n2 / n1) ** (1 / (t2 - t1)) - 1
    
p_f_lst = {}
senior_p_f_lst = {}
years = [2023, 2030, 2040, 2050]
n_lst = {}
for country in sorted(["DZ", "EG", "IQ", "JO", "LB", "MA", "SA", "TN", "AE"]):
    
    n_2010 = np.sum(num_women_and_men_by_year[country][2010])
    n_2023 = np.sum(num_women_and_men_by_year[country][2023])
    rate_2010_2023 = cagr(n_2010, n_2023, 2010, 2023)
    
    base_year, target_year = 2023, 2100
    (female_count, male_count, female_prob, male_prob, female_newcomer_count, male_newcomer_count) = productive_people_pyramid[(country, base_year, target_year)]
    
    n_lst[country] = {y: np.sum(list(female_count[y].values())) + np.sum(list(male_count[y].values())) for y in range(base_year, target_year+1)}
    rate_2023_2030 = cagr(n_lst[country][2023], n_lst[country][2030], 2023, 2030)
    rate_2031_2040 = cagr(n_lst[country][2031], n_lst[country][2040], 2031, 2040)
    rate_2041_2050 = cagr(n_lst[country][2041], n_lst[country][2050], 2041, 2050)
    
    p_f_lst[country] = {}
    senior_p_f_lst[country] = {}
    for y in years:
        total_active = np.sum(list(female_count[y].values())) + np.sum(list(male_count[y].values()))
        total_female = np.sum(list(female_count[y].values()))
        total_male = np.sum(list(male_count[y].values()))
        inflow_gender_gap = float(total_female - total_male) / total_male
        female_ratio = float(total_female) / total_active
        car_mat_f = float(np.sum([k * female_count[y][k] for k in female_count[y]])) / total_female
        car_mat_m = float(np.sum([k * male_count[y][k] for k in male_count[y]])) / total_male
        car_mat_gender_gap = float(car_mat_f - car_mat_m) / car_mat_m
        
        total_senior = np.sum([female_count[y][k] for k in female_count[y] if k > 50]) + np.sum([male_count[y][k] for k in male_count[y] if k > 50])
        total_senior_female = np.sum([female_count[y][k] for k in female_count[y] if k > 50])
        
        p_f_lst[country][y] = female_ratio
        senior_p_f_lst[country][y] = float(total_senior_female) / total_senior

    print(" & ".join([
        str(country), 
        "$" + str('{:.3f}'.format(rate_2010_2023)) + "$", 
        "$" + str('{:.3f}'.format(rate_2023_2030)) + "$", 
        "$" + str('{:.3f}'.format(rate_2031_2040)) + "$", 
        "$" + str('{:.3f}'.format(rate_2041_2050)) + "$", 
        "$" + str('{:.1f}'.format(p_f_lst[country][2023] * 100)) + "$",  
        "$" + str('{:.1f}'.format(p_f_lst[country][2030] * 100)) + "$",  
        "$" + str('{:.1f}'.format(p_f_lst[country][2040] * 100)) + "$", 
        "$" + str('{:.1f}'.format(p_f_lst[country][2050] * 100)) + "$", 
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2023] * 100)) + "$",  
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2030] * 100)) + "$",  
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2040] * 100)) + "$", 
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2050] * 100)) + "$", 
    ]) + " \\\\")
    
print("\\hline")
    
n_lst = {}
for country in sorted(["AU", "CA", "FR", "DE", "IT", "JP", "ES", "SE", "GB", "US"]):
    
    n_2010 = np.sum(num_women_and_men_by_year[country][2010])
    n_2023 = np.sum(num_women_and_men_by_year[country][2023])
    rate_2010_2023 = cagr(n_2010, n_2023, 2010, 2023)
    
    base_year, target_year = 2023, 2100
    (female_count, male_count, female_prob, male_prob, female_newcomer_count, male_newcomer_count) = productive_people_pyramid[(country, base_year, target_year)]
    
    n_lst[country] = {y: np.sum(list(female_count[y].values())) + np.sum(list(male_count[y].values())) for y in range(base_year, target_year+1)}
    rate_2023_2030 = cagr(n_lst[country][2023], n_lst[country][2030], 2023, 2030)
    rate_2031_2040 = cagr(n_lst[country][2031], n_lst[country][2040], 2031, 2040)
    rate_2041_2050 = cagr(n_lst[country][2041], n_lst[country][2050], 2041, 2050)
    
    p_f_lst[country] = {}
    senior_p_f_lst[country] = {}
    for y in years:
        total_active = np.sum(list(female_count[y].values())) + np.sum(list(male_count[y].values()))
        total_female = np.sum(list(female_count[y].values()))
        total_male = np.sum(list(male_count[y].values()))
        inflow_gender_gap = float(total_female - total_male) / total_male
        female_ratio = float(total_female) / total_active
        car_mat_f = float(np.sum([k * female_count[y][k] for k in female_count[y]])) / total_female
        car_mat_m = float(np.sum([k * male_count[y][k] for k in male_count[y]])) / total_male
        car_mat_gender_gap = float(car_mat_f - car_mat_m) / car_mat_m
        
        total_senior = np.sum([female_count[y][k] for k in female_count[y] if k > 50]) + np.sum([male_count[y][k] for k in male_count[y] if k > 50])
        total_senior_female = np.sum([female_count[y][k] for k in female_count[y] if k > 50])
        
        p_f_lst[country][y] = female_ratio
        senior_p_f_lst[country][y] = float(total_senior_female) / total_senior

    print(" & ".join([
        str(country), 
        "$" + str('{:.3f}'.format(rate_2010_2023)) + "$", 
        "$" + str('{:.3f}'.format(rate_2023_2030)) + "$", 
        "$" + str('{:.3f}'.format(rate_2031_2040)) + "$", 
        "$" + str('{:.3f}'.format(rate_2041_2050)) + "$", 
        "$" + str('{:.1f}'.format(p_f_lst[country][2023] * 100)) + "$",  
        "$" + str('{:.1f}'.format(p_f_lst[country][2030] * 100)) + "$",  
        "$" + str('{:.1f}'.format(p_f_lst[country][2040] * 100)) + "$", 
        "$" + str('{:.1f}'.format(p_f_lst[country][2050] * 100)) + "$", 
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2023] * 100)) + "$",  
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2030] * 100)) + "$",  
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2040] * 100)) + "$", 
        "$" + str('{:.1f}'.format(senior_p_f_lst[country][2050] * 100)) + "$", 
    ]) + " \\\\")

AE & $0.165$ & $0.076$ & $0.040$ & $0.020$ & $40.5$ & $46.3$ & $49.4$ & $50.8$ & $17.5$ & $22.9$ & $32.3$ & $38.1$ \\
DZ & $0.133$ & $0.055$ & $0.035$ & $0.023$ & $42.3$ & $44.9$ & $46.7$ & $47.8$ & $13.5$ & $12.0$ & $14.0$ & $19.6$ \\
EG & $0.143$ & $0.071$ & $0.040$ & $0.023$ & $46.5$ & $48.5$ & $49.8$ & $50.2$ & $27.1$ & $37.2$ & $48.2$ & $53.3$ \\
IQ & $0.227$ & $0.102$ & $0.045$ & $0.018$ & $43.8$ & $47.6$ & $49.1$ & $48.9$ & $16.0$ & $25.0$ & $30.4$ & $30.7$ \\
JO & $0.136$ & $0.069$ & $0.034$ & $0.016$ & $35.4$ & $40.8$ & $43.5$ & $44.2$ & $18.1$ & $22.3$ & $27.3$ & $32.6$ \\
LB & $0.113$ & $0.056$ & $0.034$ & $0.021$ & $50.7$ & $54.7$ & $57.2$ & $58.2$ & $31.2$ & $33.6$ & $41.8$ & $49.5$ \\
MA & $0.133$ & $0.078$ & $0.041$ & $0.021$ & $38.1$ & $40.8$ & $42.2$ & $42.6$ & $21.6$ & $18.0$ & $24.8$ & $29.4$ \\
SA & $0.200$ & $0.091$ & $0.042$ & $0.020$ & $37.6$ & $43.0$ & $45.4$ & $45.9$ & $19.6$ & $29.7$ & $39.5$ & $42.9$ \\
TN & $0.081$ & $0.038$ & $0.023$ & $0.014$ & $56.2$ & $6

In [10]:
# Table S1

arab_league = [
    "DZ", "BH", "KM", "DJ", "EG", "IQ", "JO", "KW", "LB", "LY", "MR", 
    "MA", "OM", "PS", "QA", "SA", "SO", "SD", "SY", "TN", "AE", "YE",
]

arab_countries = sorted([(code, pycountry.countries.get(alpha_2=code).name) for code in sorted(arab_league)], key=lambda x: x[1])

print("\\begin{table}[htbp]")
print("\\centering")
print("\\caption{Member States of the Arab League}")
print("\\label{tab:arab-league}")
print("\\begin{tabular}{ll}")
print("\\hline")
print("Country & Country Code \\\\")
print("\\hline")

for code, name in arab_countries:
    print(f"{name} \\\\")

print("\\hline")
print("\\end{tabular}")
print("\\end{table}")

\begin{table}[htbp]
\centering
\caption{Member States of the Arab League}
\label{tab:arab-league}
\begin{tabular}{ll}
\hline
Country & Country Code \\
\hline
Algeria \\
Bahrain \\
Comoros \\
Djibouti \\
Egypt \\
Iraq \\
Jordan \\
Kuwait \\
Lebanon \\
Libya \\
Mauritania \\
Morocco \\
Oman \\
Palestine, State of \\
Qatar \\
Saudi Arabia \\
Somalia \\
Sudan \\
Syrian Arab Republic \\
Tunisia \\
United Arab Emirates \\
Yemen \\
\hline
\end{tabular}
\end{table}


In [22]:
# Table S2

new_trained_country_lst = sorted([(country, pycountry.countries.get(alpha_2=country).name) for country in trained_country_lst], key=lambda x: x[1])

n_all = 0
for (country, country_name) in new_trained_country_lst:

    f_name = ("./data/" + str(country).lower() + "_train_val_test_sets"
                  + "_middle_name_" + str(include_middle_name)
                  + "_last_name_" + str(include_last_name)
                  + "_special_chars_" + str(clearning_special_chars)
                  + ".pkl")
    with open(f_name, 'rb') as f:
        (X_train, y_train, X_val, y_val, X_test, y_test,
                         test_name_country_lst, val_name_country_lst) = pickle.load(f)
        
    n_f = int(len(y_train)) - int(np.sum(y_train))
    n_m = int(np.sum(y_train))
    if n_f <= n_m:
        p = float(n_f) / (n_m)
    else:
        p = float(n_m) / n_f

    print(f"{country_name} & " + "{:,}".format(n_f) + " & " + "{:,}".format(n_m) + "\\\\")
    
print("Number of countries", len(new_trained_country_lst))

Albania & 4,560 & 22,800\\
Algeria & 27,413 & 137,065\\
Australia & 45,789 & 104,429\\
Austria & 10,746 & 36,555\\
Bahrain & 35,066 & 45,000\\
Bangladesh & 5,515 & 22,171\\
Belarus & 4,385 & 12,505\\
Belgium & 8,741 & 32,846\\
Bosnia and Herzegovina & 2,741 & 8,511\\
Bulgaria & 609,226 & 822,719\\
Canada & 18,289 & 62,179\\
China & 90,973 & 222,418\\
Cyprus & 2,339 & 9,200\\
Czechia & 153,267 & 304,260\\
Denmark & 9,701 & 30,685\\
Egypt & 18,316 & 91,580\\
Estonia & 8,272 & 15,792\\
France & 61,820 & 170,251\\
Germany & 82,796 & 269,500\\
Ghana & 4,637 & 12,014\\
Iceland & 22,307 & 46,856\\
India & 94,220 & 305,744\\
Iran, Islamic Republic of & 7,050 & 35,250\\
Iraq & 273,599 & 1,041,777\\
Ireland & 70,759 & 149,920\\
Israel & 4,357 & 21,785\\
Italy & 2,508,071 & 5,348,406\\
Jamaica & 4,754 & 7,588\\
Japan & 9,200 & 46,000\\
Jordan & 71,967 & 359,835\\
Kenya & 16,919 & 59,825\\
Korea, Republic of & 4,781 & 23,905\\
Kuwait & 14,005 & 49,845\\
Lebanon & 55,521 & 130,120\\
Lesotho & 8,961

In [35]:
# Table S3

nb_alpha, nb_prob_threshold = models.set_cnb_params(passed_country_lst)
passed_country_lst_ = sorted([(country, pycountry.countries.get(alpha_2=country).name) for country in passed_country_lst], key=lambda x: x[1])

def format_val(val, decimals):
    if isinstance(val, float):
        rounded = round(val, decimals)
        if decimals == 3 and abs(rounded - 1.000) < 1e-8:
            return "0.999"
        if decimals == 4 and abs(rounded - 1.0000) < 1e-8:
            return "0.9999"
        return f"{rounded:.{decimals}f}"
    return val

print("\\begin{longtable}{lcc}")
print("\\caption{Country-Specific Parameters} \\\\")
print("\\label{tab:alpha-theta} \\\\")
print("\\hline")
print("Country & Value D & Value G \\\\")
print("\\hline\\endfirsthead")
print("\\hline")
print("Country & Value D & Value G \\\\")
print("\\hline\\endhead")

for (country, country_name) in passed_country_lst_:
    print(" & ".join([
        str(country_name),
        "$" + str('{:.3f}'.format(nb_alpha[country])) + "$",
        "$" + str('{:.4f}'.format(nb_prob_threshold[country])) + "$",
    ]) + "\\\\")

print("\\hline")
print("\\end{longtable}")

print(len(passed_country_lst))

\begin{longtable}{lcc}
\caption{Country-Specific Parameters} \\
\label{tab:alpha-theta} \\
\hline
Country & Value D & Value G \\
\hline\endfirsthead
\hline
Country & Value D & Value G \\
\hline\endhead
Albania & $0.100$ & $0.9000$\\
Algeria & $1.000$ & $0.9000$\\
Australia & $0.010$ & $0.9000$\\
Austria & $0.001$ & $0.9600$\\
Bahrain & $0.010$ & $0.9910$\\
Belarus & $0.001$ & $0.9990$\\
Belgium & $0.100$ & $0.9000$\\
Bosnia and Herzegovina & $0.100$ & $0.9000$\\
Bulgaria & $0.001$ & $0.9000$\\
Canada & $0.100$ & $0.9000$\\
Cyprus & $0.010$ & $0.9900$\\
Czechia & $10.000$ & $0.9000$\\
Denmark & $0.100$ & $0.9000$\\
Egypt & $0.100$ & $0.9000$\\
Estonia & $0.100$ & $0.9000$\\
France & $0.100$ & $0.9000$\\
Germany & $1.000$ & $0.9000$\\
Ghana & $0.100$ & $0.9000$\\
Iceland & $0.010$ & $0.9000$\\
Iran, Islamic Republic of & $0.100$ & $0.9400$\\
Iraq & $0.100$ & $0.9992$\\
Ireland & $0.100$ & $0.9000$\\
Israel & $0.100$ & $0.9000$\\
Italy & $0.001$ & $0.9000$\\
Jamaica & $0.100$ & $0.9000$\\

In [37]:
# Table S4

nqg_prob_threshold = models.set_nqg_param(passed_country_lst)
passed_country_lst_ = sorted([(country, pycountry.countries.get(alpha_2=country).name) for country in passed_country_lst], key=lambda x: x[1])

def format_val(val, decimals):
    if isinstance(val, float):
        rounded = round(val, decimals)
        if decimals == 3 and abs(rounded - 1.000) < 1e-8:
            return "0.999"
        if decimals == 4 and abs(rounded - 1.0000) < 1e-8:
            return "0.9999"
        return f"{rounded:.{decimals}f}"
    return val

print("\\begin{longtable}{lc}")
print("\\caption{Country-Specific Parameters} \\\\")
print("\\label{tab:alpha-theta} \\\\")
print("\\hline")
print("Country & Value D & Value G \\\\")
print("\\hline\\endfirsthead")
print("\\hline")
print("Country & Value D & Value G \\\\")
print("\\hline\\endhead")

for (country, country_name) in passed_country_lst_:
    print(" & ".join([
        str(country_name),
        "$" + str('{:.4f}'.format(nqg_prob_threshold[country])) + "$",
    ]) + "\\\\")

print("\\hline")
print("\\end{longtable}")

print(len(passed_country_lst))

{'RU': 0.9, 'GB': 0.9, 'ZA': 0.9, 'IT': 0.9, 'RO': 0.9, 'BG': 0.9, 'NO': 0.9, 'CH': 0.9, 'NZ': 0.9, 'RS': 0.9, 'CZ': 0.9, 'DE': 0.9, 'JO': 0.9, 'IE': 0.9, 'US': 0.9, 'FR': 0.9, 'PH': 0.9, 'PL': 0.9, 'LB': 0.9, 'AU': 0.9, 'PG': 0.9, 'BH': 0.9, 'OM': 0.9, 'DZ': 0.9, 'NG': 0.9, 'SE': 0.9, 'ES': 0.9, 'IS': 0.9, 'NL': 0.9, 'MD': 0.9, 'EG': 0.9, 'CA': 0.9, 'PT': 0.9, 'TR': 0.9, 'KE': 0.9, 'KW': 0.9, 'LT': 0.9, 'AT': 0.9, 'DK': 0.9, 'AE': 0.9, 'JP': 0.9, 'LS': 0.9, 'ZW': 0.9, 'BE': 0.9, 'EE': 0.9, 'IR': 0.9, 'ME': 0.9, 'SA': 0.93, 'LK': 0.9, 'JM': 0.9, 'GH': 0.9, 'AL': 0.9, 'BY': 0.9, 'IL': 0.9, 'MA': 0.9, 'BA': 0.9, 'CY': 0.9, 'SY': 0.9, 'SO': 0.9, 'IQ': 0.9, 'TN': 0.9}
\begin{longtable}{lc}
\caption{Country-Specific Parameters} \\
\label{tab:alpha-theta} \\
\hline
Country & Value D & Value G \\
\hline\endfirsthead
\hline
Country & Value D & Value G \\
\hline\endhead
Albania & $0.9000$\\
Algeria & $0.9000$\\
Australia & $0.9000$\\
Austria & $0.9000$\\
Bahrain & $0.9000$\\
Belarus & $0.9000$\

In [51]:
# Table S5

df_nb = pd.read_excel("./data/orbis_test_results_ComplementNB.xlsx")
df_nqg = pd.read_excel("./data/orbis_test_results_nqg.xlsx")

df_nb = df_nb[['country', 'ROC AUC', 'f1', 'gender assignment rate']]
df_nqg = df_nqg[['country', 'ROC AUC', 'f1', 'gender assignment rate']]
df_nb.columns = ['Country', 'NB_ROC_AUC', 'NB_F1', 'NB_GAR']
df_nqg.columns = ['Country', 'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']

merged_df = pd.merge(df_nb, df_nqg, on='Country', how='inner').dropna()

def get_country_name(alpha2):
    try:
        return pycountry.countries.get(alpha_2=alpha2).name
    except:
        return None

merged_df['CountryName'] = merged_df['Country'].apply(get_country_name)
merged_df = merged_df.dropna(subset=['CountryName']).sort_values(by='CountryName')

def adjust_almost_one(x):
    return 0.999 if np.isclose(x, 1.00000, atol=1e-3) else x

for col in ['NB_ROC_AUC', 'NB_F1', 'NB_GAR', 'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']:
    merged_df[col] = merged_df[col].apply(adjust_almost_one)

latex_code = merged_df[['CountryName', 'NB_ROC_AUC', 'NB_F1', 'NB_GAR',
                        'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']].to_latex(
    index=False,
    column_format='lcccccc',
    header=['Country', 'NB ROC AUC', 'NB F1', 'NB GAR', 'NQG ROC AUC', 'NQG F1', 'NQG GAR'],
    float_format="%.3f"
)

print(latex_code)

print("\nNumber of countries:", merged_df.shape[0])

summary_stats = merged_df[['NB_ROC_AUC', 'NB_F1', 'NB_GAR',
                           'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']].agg(['mean', 'std']).transpose()

print("\nMean and standard deviation for each indicator:")
print(summary_stats.to_string(float_format="%.3f"))

\begin{tabular}{lcccccc}
\toprule
Country & NB ROC AUC & NB F1 & NB GAR & NQG ROC AUC & NQG F1 & NQG GAR \\
\midrule
Albania & 0.984 & 0.963 & 0.907 & 0.950 & 0.963 & 0.861 \\
Algeria & 0.965 & 0.951 & 0.906 & 0.945 & 0.953 & 0.879 \\
Australia & 0.995 & 0.978 & 0.956 & 0.992 & 0.994 & 0.911 \\
Austria & 0.996 & 0.984 & 0.971 & 0.995 & 0.996 & 0.947 \\
Bahrain & 0.998 & 0.997 & 0.959 & 0.969 & 0.969 & 0.858 \\
Belarus & 0.999 & 0.999 & 0.967 & 0.997 & 0.998 & 0.978 \\
Belgium & 0.982 & 0.971 & 0.880 & 0.979 & 0.985 & 0.880 \\
Bosnia and Herzegovina & 0.986 & 0.974 & 0.908 & 0.979 & 0.978 & 0.908 \\
Bulgaria & 0.998 & 0.993 & 0.990 & 0.987 & 0.991 & 0.866 \\
Canada & 0.987 & 0.973 & 0.906 & 0.984 & 0.986 & 0.884 \\
Cyprus & 0.976 & 0.974 & 0.819 & 0.971 & 0.970 & 0.889 \\
Czechia & 0.999 & 0.995 & 0.959 & 0.999 & 0.999 & 0.931 \\
Denmark & 0.989 & 0.977 & 0.908 & 0.987 & 0.987 & 0.879 \\
Egypt & 0.988 & 0.968 & 0.939 & 0.945 & 0.944 & 0.791 \\
Estonia & 0.998 & 0.994 & 0.962 & 0.991 & 0

In [53]:
# Table S6

df_nb = pd.read_excel("./data/wgnd_test_results_ComplementNB.xlsx")
df_nqg = pd.read_excel("./data/wgnd_test_results_nqg.xlsx")

df_nb = df_nb[['country', 'ROC AUC', 'f1', 'gender assignment rate']]
df_nqg = df_nqg[['country', 'ROC AUC', 'f1', 'gender assignment rate']]
df_nb.columns = ['Country', 'NB_ROC_AUC', 'NB_F1', 'NB_GAR']
df_nqg.columns = ['Country', 'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']

merged_df = pd.merge(df_nb, df_nqg, on='Country', how='inner').dropna()

def get_country_name(alpha2):
    try:
        return pycountry.countries.get(alpha_2=alpha2).name
    except:
        return None

merged_df['CountryName'] = merged_df['Country'].apply(get_country_name)
merged_df = merged_df.dropna(subset=['CountryName']).sort_values(by='CountryName')

def adjust_almost_one(x):
    return 0.999 if np.isclose(x, 1.00000, atol=1e-3) else x

for col in ['NB_ROC_AUC', 'NB_F1', 'NB_GAR', 'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']:
    merged_df[col] = merged_df[col].apply(adjust_almost_one)

latex_code = merged_df[['CountryName', 'NB_ROC_AUC', 'NB_F1', 'NB_GAR',
                        'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']].to_latex(
    index=False,
    column_format='lcccccc',
    header=['Country', 'NB ROC AUC', 'NB F1', 'NB GAR', 'NQG ROC AUC', 'NQG F1', 'NQG GAR'],
    float_format="%.3f"
)

print(latex_code)

print("\nNumber of countries:", merged_df.shape[0])

summary_stats = merged_df[['NB_ROC_AUC', 'NB_F1', 'NB_GAR',
                           'NQG_ROC_AUC', 'NQG_F1', 'NQG_GAR']].agg(['mean', 'std']).transpose()

print("\nMean and standard deviation for each indicator:")
print(summary_stats.to_string(float_format="%.3f"))

\begin{tabular}{lcccccc}
\toprule
Country & NB ROC AUC & NB F1 & NB GAR & NQG ROC AUC & NQG F1 & NQG GAR \\
\midrule
Albania & 0.932 & 0.913 & 0.813 & 0.997 & 0.998 & 0.892 \\
Algeria & 0.872 & 0.934 & 0.418 & 0.989 & 0.997 & 0.820 \\
Australia & 0.993 & 0.978 & 0.961 & 0.996 & 0.997 & 0.906 \\
Austria & 0.916 & 0.896 & 0.824 & 0.992 & 0.993 & 0.898 \\
Bahrain & 0.903 & 0.902 & 0.659 & 0.988 & 0.995 & 0.816 \\
Belarus & 0.862 & 0.911 & 0.431 & 0.905 & 0.941 & 0.685 \\
Belgium & 0.991 & 0.984 & 0.892 & 0.997 & 0.996 & 0.909 \\
Bosnia and Herzegovina & 0.960 & 0.952 & 0.759 & 0.986 & 0.990 & 0.898 \\
Bulgaria & 0.999 & 0.999 & 0.999 & 0.999 & 0.999 & 0.904 \\
Canada & 0.993 & 0.980 & 0.945 & 0.998 & 0.997 & 0.904 \\
Cyprus & 0.817 & 0.897 & 0.294 & 0.989 & 0.988 & 0.883 \\
Czechia & 0.953 & 0.957 & 0.710 & 0.968 & 0.973 & 0.814 \\
Denmark & 0.999 & 0.999 & 0.976 & 0.999 & 0.999 & 0.934 \\
Egypt & 0.891 & 0.900 & 0.625 & 0.988 & 0.995 & 0.823 \\
Estonia & 0.981 & 0.967 & 0.889 & 0.997 & 0

In [56]:
# Table S7

def format_val(val, decimals):
    if isinstance(val, float):
        rounded = round(val, decimals)
        return f"{rounded:.{decimals}f}"
    return val

passed_country_lst_ = sorted([(country, pycountry.countries.get(alpha_2=country).name) for country in passed_country_lst], key=lambda x: x[1])

print("\\begin{longtable}{lcc}")
print("\\caption{Country-Specific Parameters} \\\\")
print("\\label{tab:alpha-theta} \\\\")
print("\\hline")
print("Country & Value D & Value G \\\\")
print("\\hline\\endfirsthead")
print("\\hline")
print("Country & Value D & Value G \\\\")
print("\\hline\\endhead")

all_n_f, all_n_m = 0, 0
p_f_lst = []
all_n = 0
author_ids = []
for (country, country_name) in passed_country_lst_:
    
    all_n += len(author_sample_lst[country])
    
    n_f, n_m = 0, 0
    for author_id in author_sample_lst[country]:
        g = author_gender.get(author_id, -1)
        if g in {0, 1}:
            author_ids.append(author_id)
        if g == 0:
            n_f += 1
        elif g == 1:
            n_m += 1
    
    p_f = float(n_f) / (n_f + n_m)
    p_f_lst.append((country_name, p_f))
    
    print(f"{country_name} & " + "{:,}".format(n_f) + " & " + "{:,}".format(n_m) + " & " + format_val(p_f*100, 1) + "\% \\\\")
    
    all_n_f += n_f
    all_n_m += n_m

print("\\hline")
print("\\end{longtable}")

print(len(passed_country_lst_), len(set(author_ids)))
print(all_n_f + all_n_m, all_n_f, all_n_m, float(all_n_f) / (all_n_m + all_n_f))
print(sorted(p_f_lst, key=lambda x: x[1])[0], sorted(p_f_lst, key=lambda x: x[1])[-1])
print(all_n)

\begin{longtable}{lcc}
\caption{Country-Specific Parameters} \\
\label{tab:alpha-theta} \\
\hline
Country & Value D & Value G \\
\hline\endfirsthead
\hline
Country & Value D & Value G \\
\hline\endhead
Albania & 3,802 & 2,644 & 59.0\% \\
Algeria & 25,220 & 30,153 & 45.5\% \\
Australia & 167,622 & 209,286 & 44.5\% \\
Austria & 46,871 & 70,258 & 40.0\% \\
Bahrain & 2,604 & 2,452 & 51.5\% \\
Belarus & 3,917 & 4,840 & 44.7\% \\
Belgium & 60,884 & 81,802 & 42.7\% \\
Bosnia and Herzegovina & 7,294 & 5,897 & 55.3\% \\
Bulgaria & 18,245 & 22,235 & 45.1\% \\
Canada & 248,601 & 288,415 & 46.3\% \\
Cyprus & 3,514 & 5,442 & 39.2\% \\
Czechia & 43,943 & 60,692 & 42.0\% \\
Denmark & 50,177 & 57,431 & 46.6\% \\
Egypt & 100,791 & 115,829 & 46.5\% \\
Estonia & 7,339 & 6,957 & 51.3\% \\
France & 293,981 & 451,893 & 39.4\% \\
Germany & 358,990 & 581,464 & 38.2\% \\
Ghana & 8,140 & 20,760 & 28.2\% \\
Iceland & 3,211 & 3,723 & 46.3\% \\
Iran, Islamic Republic of & 211,591 & 250,720 & 45.8\% \\
Iraq & 31,12

In [58]:
# Table S8
f_path = ("./data/num_women_and_men_by_year"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")

with open(f_path, mode="rb") as f:
    num_women_and_men_by_year = pickle.load(f)
    
arab_countries = ["DZ", "EG", "IQ", "JO", "LB", "MA", "SA", "TN", "AE"]
comparison_countries = ["AU", "CA", "FR", "DE", "IT", "JP", "ES", "SE", "GB", "US"]
    
betas_arab = []
betas_comp = []

start_t = 2000
end_t = 2023

country_name = {
    "TN": "Tunisia",
    "SA": "Saudi Arabia",
    "EG": "Egypt",
    "MA": "Morocco",
    "JO": "Jordan",
    "AE": "United Arab Emirates",
    "LB": "Lebanon",
    "DZ": "Algeria",
    "IQ": "Iraq",
    "JP": "Japan",
    "US": "United States",
    "GB": "United Kingdom",
    "FR": "France",
    "DE": "Germany",
    "ES": "Spain",
    "IT": "Italy",
    "CA": "Canada",
    "AU": "Australia",
    "SE": "Sweden",
}

t = range(start_t, end_t + 1)
data = {country: [float(num_women_and_men_by_year[country][y][0]+num_women_and_men_by_year[country][y][1]) for y in t] for country in arab_countries + comparison_countries}

slope_arab = {}
for (country, series) in data.items():
    X = np.array(t).reshape(-1, 1)
    y = np.log1p(series)  # log-transform if needed
    model = LinearRegression().fit(X, y)
    slope = model.coef_[0]
    score = model.score(X, y)
    
    if country in arab_countries:
        slope_arab[country] = slope

    if country in arab_countries:
        betas_arab.append((country, slope, score))
    elif country in comparison_countries:
        betas_comp.append((country, slope, score))
        
for (country, slope, score) in betas_arab:
    print(" & ".join([
        str(country_name[country]), 
        str('{:.4f}'.format(slope)), 
        str('{:.4f}'.format(score))
    ]) + " \\\\")
    
print("\\hline")

for (country, slope, score) in betas_comp:
    print(" & ".join([
        str(country_name[country]), 
        str('{:.4f}'.format(slope)), 
        str('{:.4f}'.format(score))
    ]) + " \\\\")

Algeria & 0.1537 & 0.9726 \\
Egypt & 0.1473 & 0.9938 \\
Iraq & 0.2615 & 0.9766 \\
Jordan & 0.1428 & 0.9933 \\
Lebanon & 0.1165 & 0.9839 \\
Morocco & 0.1063 & 0.9874 \\
Saudi Arabia & 0.1772 & 0.9797 \\
Tunisia & 0.1262 & 0.9254 \\
United Arab Emirates & 0.1556 & 0.9974 \\
\hline
Australia & 0.0572 & 0.9563 \\
Canada & 0.0502 & 0.9323 \\
France & 0.0418 & 0.9207 \\
Germany & 0.0519 & 0.9549 \\
Italy & 0.0576 & 0.9754 \\
Japan & 0.0192 & 0.9042 \\
Spain & 0.0696 & 0.9659 \\
Sweden & 0.0423 & 0.9521 \\
United Kingdom & 0.0473 & 0.9660 \\
United States & 0.0420 & 0.9679 \\


In [59]:
# Table S9

f_path = ("./data/num_women_and_men_by_year"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")

with open(f_path, mode="rb") as f:
    num_women_and_men_by_year = pickle.load(f)
    
arab_countries = ["DZ", "EG", "IQ", "JO", "LB", "MA", "SA", "TN", "AE"]
comparison_countries = ["AU", "CA", "FR", "DE", "IT", "JP", "ES", "SE", "GB", "US"]

betas_arab = []
betas_comp = []

start_t = 2000
end_t = 2023

t = range(start_t, end_t + 1)
data = {country: [float(num_women_and_men_by_year[country][y][0])/(num_women_and_men_by_year[country][y][0]+num_women_and_men_by_year[country][y][1]) for y in t] for country in arab_countries + comparison_countries}

slope_arab = {}
for (country, series) in data.items():
    X = np.array(t).reshape(-1, 1)
    y = np.array(series)  # log-transform if needed
    model = LinearRegression().fit(X, y)
    slope = model.coef_[0]
    score = model.score(X, y)

    if country in arab_countries:
        betas_arab.append((country, slope, score))
        slope_arab[country] = slope
    elif country in comparison_countries:
        betas_comp.append((country, slope, score))
        
    #print(country, slope, score)

for (country, slope, score) in betas_arab:
    print(" & ".join([
        str(country_name[country]), 
        str('{:.4f}'.format(slope)), 
        str('{:.4f}'.format(score))
    ]) + " \\\\")
    
print("\\hline")

for (country, slope, score) in betas_comp:
    print(" & ".join([
        str(country_name[country]), 
        str('{:.4f}'.format(slope)), 
        str('{:.4f}'.format(score))
    ]) + " \\\\")

Algeria & 0.0075 & 0.9600 \\
Egypt & 0.0086 & 0.9721 \\
Iraq & 0.0075 & 0.8484 \\
Jordan & 0.0109 & 0.9489 \\
Lebanon & 0.0088 & 0.9515 \\
Morocco & 0.0029 & 0.6889 \\
Saudi Arabia & 0.0105 & 0.9145 \\
Tunisia & 0.0139 & 0.9760 \\
United Arab Emirates & 0.0101 & 0.9734 \\
\hline
Australia & 0.0069 & 0.9834 \\
Canada & 0.0071 & 0.9951 \\
France & 0.0061 & 0.9716 \\
Germany & 0.0074 & 0.9861 \\
Italy & 0.0048 & 0.9604 \\
Japan & 0.0041 & 0.9940 \\
Spain & 0.0065 & 0.9849 \\
Sweden & 0.0060 & 0.9800 \\
United Kingdom & 0.0066 & 0.9970 \\
United States & 0.0072 & 0.9956 \\


In [60]:
# Table S10

f_path = ("./data/num_women_and_men_by_year"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")

with open(f_path, mode="rb") as f:
    num_women_and_men_by_year = pickle.load(f)
    
HAC_MAX_LAGS = 3 

print("\\begin{tabular}{lllll}")
print("\\hline")
print("Country & Slope Before & Slope After & t-statistic & p-value \\\\")
print("\\hline")

for country in ["EG", "TN"]:
    
    your_author_counts = np.array([
        np.sum(num_women_and_men_by_year[country][y]) for y in range(2000, 2024)
    ])
    
    df = pd.DataFrame({
        'year': np.arange(2000, 2024),
        'authors_or_proportion': your_author_counts 
    })
    
    df['log_authors_or_proportion'] = np.log10(df['authors_or_proportion'])
    
    df_filtered = df[~df['year'].between(2010, 2013)].copy()
    
    df_before = df_filtered[df_filtered['year'] < 2010].copy()
    df_after = df_filtered[df_filtered['year'] > 2013].copy()
    
    X_b = sm.add_constant(df_before['year'])
    y_b = df_before['log_authors_or_proportion']
    model_b = sm.OLS(y_b, X_b)
    
    results_b_hac = model_b.fit(cov_type='HAC', cov_kwds={'maxlags': HAC_MAX_LAGS})
    
    X_a = sm.add_constant(df_after['year'])
    y_a = df_after['log_authors_or_proportion']
    model_a = sm.OLS(y_a, X_a)
    
    results_a_hac = model_a.fit(cov_type='HAC', cov_kwds={'maxlags': HAC_MAX_LAGS})
    
    slope_b = results_b_hac.params['year']
    slope_a = results_a_hac.params['year']
    
    se_b = results_b_hac.bse['year']
    se_a = results_a_hac.bse['year']
    
    slope_diff = slope_b - slope_a
    se_diff = np.sqrt(se_b**2 + se_a**2) 
    
    t_stat = slope_diff / se_diff
    
    df_combined = len(y_b) + len(y_a) - 4 
    
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=df_combined))
    
    print(" & ".join([
        str(country), 
        str('{:.3f}'.format(slope_b)), 
        str('{:.3f}'.format(slope_a)), 
        str('{:.3f}'.format(t_stat)), 
        str('{:.3f}'.format(p_value))
    ]) + " \\\\")
    
print("\\hline")

\begin{tabular}{lllll}
\hline
Country & Slope Before & Slope After & t-statistic & p-value \\
\hline
EG & 0.0555 & 0.0541 & 0.4469 & 0.66095981262763414676442152995150536298751831054688 \\
TN & 0.0924 & 0.0208 & 22.0690 & 0.00000000000020872192862952942959964275360107421875 \\
\hline


In [62]:
# Table S11

f_path = ("./data/num_women_and_men_by_year"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars) # 変数名を修正
              + ".pkl")

with open(f_path, mode="rb") as f:
    num_women_and_men_by_year = pickle.load(f)
    
HAC_MAX_LAGS = 3 

print("\\begin{tabular}{lllll}")
print("\\hline")
print("Country & Slope Before & Slope After & t-statistic & p-value \\\\")
print("\\hline")

for country in ["EG", "TN"]:

    your_proportion_data = np.array([
        float(num_women_and_men_by_year[country][y][0]) / np.sum(num_women_and_men_by_year[country][y]) 
        for y in range(2000, 2024)
    ])
    
    df = pd.DataFrame({
        'year': np.arange(2000, 2024),
        'proportion': your_proportion_data
    })
    
    df_filtered = df[~df['year'].between(2010, 2013)].copy()
    
    df_before = df_filtered[df_filtered['year'] < 2010].copy()
    df_after = df_filtered[df_filtered['year'] > 2013].copy()
    
    X_b = sm.add_constant(df_before['year'])
    y_b = df_before['proportion']
    model_b = sm.OLS(y_b, X_b)
    
    results_b_hac = model_b.fit(cov_type='HAC', cov_kwds={'maxlags': HAC_MAX_LAGS})
    
    X_a = sm.add_constant(df_after['year'])
    y_a = df_after['proportion']
    model_a = sm.OLS(y_a, X_a)
    
    results_a_hac = model_a.fit(cov_type='HAC', cov_kwds={'maxlags': HAC_MAX_LAGS})
    
    slope_b = results_b_hac.params['year']
    slope_a = results_a_hac.params['year']
    
    se_b = results_b_hac.bse['year']
    se_a = results_a_hac.bse['year']
    
    slope_diff = slope_b - slope_a
    se_diff = np.sqrt(se_b**2 + se_a**2) 
    
    t_stat = slope_diff / se_diff
    
    df_combined = len(y_b) + len(y_a) - 4 
    
    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=df_combined))
    
    print(" & ".join([
        str(country), 
        str('{:.3f}'.format(slope_b)), 
        str('{:.3f}'.format(slope_a)), 
        str('{:.3f}'.format(t_stat)), 
        str('{:.3f}'.format(p_value))
    ]) + " \\\\")
    
print("\\hline")

\begin{tabular}{lllll}
\hline
Country & Slope Before & Slope After & t-statistic & p-value \\
\hline
EG & 0.005 & 0.009 & -6.061 & 0.000 \\
TN & 0.014 & 0.009 & 3.187 & 0.006 \\
\hline


In [63]:
# Table S12

f_path = ("./data/pub_interval_threshold"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")
with open(f_path, mode="rb") as f:
    pub_interval_threshold = pickle.load(f)

print("\\begin{longtable}{lcc}")
print("\\caption{} \\\\")
print("\\label{table:} \\\\")
print("\\hline")
print("Country & Female & Male \\\\")
print("\\hline\\endfirsthead")
print("\\hline")
print("Country & Female & Male \\\\")
print("\\hline\\endhead")

lst = []
for i in range(0, len(final_country_lst)):
    country = country_lst[i]
    [female_threshold, male_threshold] = pub_interval_threshold[country]
    female_threshold = str('{:.2f}'.format(float(female_threshold) / 365))
    male_threshold = str('{:.2f}'.format(float(male_threshold) / 365))
    lst.append((pycountry.countries.get(alpha_2=country).name, female_threshold, male_threshold))

for (country, female_threshold, male_threshold) in sorted(lst, key=lambda x: x[0]):
    print(f"{country} & {female_threshold} & {male_threshold} \\\\")

print("\\hline")
print("\\end{longtable}")

print(len(country_lst))

\begin{longtable}{lcc}
\caption{} \\
\label{table:} \\
\hline
Country & Female & Male \\
\hline\endfirsthead
\hline
Country & Female & Male \\
\hline\endhead
Albania & 9.04 & 10.59 \\
Algeria & 10.35 & 10.01 \\
Australia & 11.80 & 10.43 \\
Austria & 9.80 & 8.64 \\
Bahrain & 16.01 & 15.93 \\
Belarus & 12.84 & 13.01 \\
Belgium & 7.24 & 6.42 \\
Bosnia and Herzegovina & 10.33 & 10.45 \\
Bulgaria & 13.35 & 14.71 \\
Canada & 13.28 & 11.68 \\
Cyprus & 8.46 & 7.37 \\
Czechia & 8.68 & 8.59 \\
Denmark & 9.68 & 8.36 \\
Egypt & 14.68 & 14.93 \\
Estonia & 9.01 & 8.39 \\
France & 10.01 & 9.39 \\
Germany & 11.01 & 9.61 \\
Ghana & 10.57 & 10.01 \\
Iceland & 10.26 & 10.01 \\
Iran, Islamic Republic of & 7.72 & 7.01 \\
Iraq & 12.60 & 12.97 \\
Ireland & 15.69 & 13.59 \\
Israel & 11.10 & 9.96 \\
Italy & 8.76 & 7.92 \\
Jamaica & 25.86 & 24.29 \\
Japan & 12.01 & 10.47 \\
Jordan & 9.80 & 9.84 \\
Kenya & 11.26 & 11.49 \\
Kuwait & 13.26 & 13.24 \\
Lebanon & 13.07 & 12.89 \\
Lithuania & 7.01 & 7.42 \\
Moldova, R

In [64]:
# Table S13

f_path = ("./data/productive_people_pyramid"
              + "_middle_name_" + str(include_middle_name)
              + "_last_name_" + str(include_last_name)
              + "_special_chars_" + str(clearning_special_chars)
              + ".pkl")
with open(f_path, mode="rb") as f:
    productive_people_pyramid = pickle.load(f)

base_year = 2023
target_year = 2100

print("\\begin{longtable}{lcc}")
print("\\caption{} \\\\")
print("\\label{table:} \\\\")
print("\\hline")
print(" & Researcher & Gender Gap in \\\\")
print("Country & Inflow & Career Maturity \\\\")
print("\\hline\\endfirsthead")
print("\\hline")
print(" & Researcher & Gender Gap in \\\\")
print("Country & Inflow & Career Maturity \\\\")
print("\\hline\\endhead")

lst = []
for i in range(len(country_lst)):
    country = country_lst[i]
    
    (female_count, male_count, female_prob, male_prob,
     female_newcomer_count, male_newcomer_count) = productive_people_pyramid[(country, base_year, target_year)]

    total_active = np.sum(list(female_count[base_year].values())) + np.sum(list(male_count[base_year].values()))
    total_female = np.sum(list(female_count[base_year].values()))
    total_male = np.sum(list(male_count[base_year].values()))
    inflow_gender_gap = float(total_female - total_male) / total_male
    female_ratio = float(total_female) / total_active
    
    car_mat_f = float(np.sum([k * female_count[base_year][k] for k in female_count[base_year]])) / total_female
    car_mat_m = float(np.sum([k * male_count[base_year][k] for k in male_count[base_year]])) / total_male
    car_mat_gender_gap = str('{:.2f}'.format(100 * float(car_mat_f - car_mat_m) / car_mat_m))
    
    inflow_ratio = str('{:.2f}'.format(100 * float(np.sum(list(female_newcomer_count.values())) + np.sum(list(male_newcomer_count.values()))) / total_active))

    lst.append((pycountry.countries.get(alpha_2=country).name, inflow_ratio, car_mat_gender_gap))

for (country, female_threshold, male_threshold) in sorted(lst, key=lambda x: x[0]):
    print(f"{country} & ${female_threshold}\%$ & ${male_threshold}\%$ \\\\")

print("\\hline")
print("\\end{longtable}")

print(len(country_lst))

\begin{longtable}{lcc}
\caption{} \\
\label{table:} \\
\hline
 & Researcher & Gender Gap in \\
Country & Inflow & Career Maturity \\
\hline\endfirsthead
\hline
 & Researcher & Gender Gap in \\
Country & Inflow & Career Maturity \\
\hline\endhead
Albania & $8.98\%$ & $-24.02\%$ \\
Algeria & $7.97\%$ & $-39.88\%$ \\
Australia & $5.02\%$ & $-30.00\%$ \\
Austria & $6.44\%$ & $-40.30\%$ \\
Bahrain & $10.42\%$ & $-39.85\%$ \\
Belarus & $9.03\%$ & $-28.45\%$ \\
Belgium & $6.78\%$ & $-36.45\%$ \\
Bosnia and Herzegovina & $6.23\%$ & $-21.07\%$ \\
Bulgaria & $6.12\%$ & $-4.91\%$ \\
Canada & $5.30\%$ & $-32.75\%$ \\
Cyprus & $8.86\%$ & $-45.05\%$ \\
Czechia & $5.28\%$ & $-37.24\%$ \\
Denmark & $5.96\%$ & $-40.05\%$ \\
Egypt & $9.83\%$ & $-20.91\%$ \\
Estonia & $4.63\%$ & $-39.35\%$ \\
France & $5.72\%$ & $-30.46\%$ \\
Germany & $6.07\%$ & $-44.18\%$ \\
Ghana & $11.84\%$ & $-32.65\%$ \\
Iceland & $4.82\%$ & $-30.83\%$ \\
Iran, Islamic Republic of & $7.43\%$ & $-43.79\%$ \\
Iraq & $15.31\%$ & $-24.