In [1]:
import construct_openalex
import gender_analysis
import pickle
import models

In [2]:
# Parameters

# --- Split sizes -----------------------------------------------------------
# Names reserved per gender for the test split (and again for the validation
# split) in the country-selection cell.
orbis_test_size = 450
# Not referenced by the cells visible in this notebook; presumably the
# equivalent test size for the WGND data set -- TODO confirm.
wgnd_test_size = 5000

# --- Reproducibility -------------------------------------------------------
# Not referenced by the visible cells; presumably forwarded to a data splitter.
random_state = 42
shuffle = True

# --- Name pre-processing options --------------------------------------------
# These three flags are passed to the gender_analysis helpers and are also
# encoded in the file name of the cached author sample list.
include_middle_name = False
include_last_name = False
clearning_special_chars = True

# --- Country eligibility thresholds -----------------------------------------
# A gender's training set must hold at least this many times the test size.
min_ratio_train_to_test = 5
# The minority gender's remaining count is divided by this ratio to derive the
# other gender's training size.
min_ratio_minority_to_majority = 0.2
# Not referenced by the visible cells.
min_name_samples = 500

# Country codes that get the relaxed inclusion rule during country selection.
arab_league = {
    "DZ", "BH", "KM", "DJ", "EG", "IQ",
    "JO", "KW", "LB", "LY", "MR", "MA",
    "OM", "PS", "QA", "SA", "SO", "SD",
    "SY", "TN", "AE", "YE",
}

In [5]:
# Country selection
#
# For every country, `orbis_test_size` names per gender are reserved for the
# test split and the same number for the validation split. The gender with the
# smaller remainder uses that remainder as its training size; the other
# gender's training size is that remainder divided by
# `min_ratio_minority_to_majority`. A country is selected when both training
# sizes reach `min_ratio_train_to_test` times the test size. Arab League
# countries that miss this bar are still kept when each gender has more than
# `orbis_test_size` names.

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open files that were produced by this project.
with open("./data/data_sets.pkl", "rb") as f:
    data_by_country = pickle.load(f)

# country -> (len of entry 0, len of entry 1). The loop below treats entry 0 as
# the female names and entry 1 as the male names.
name_count_by_country = {
    c: (len(data_by_country[c][0]), len(data_by_country[c][1])) for c in data_by_country
}

# Countries ordered by their count tuple, largest first (stable for ties).
all_country_lst = sorted(name_count_by_country, key=name_count_by_country.get, reverse=True)

country_lst = []
# `n` is never updated in this cell; `n_` totals the names of selected countries.
n, n_ = 0, 0
arab_minority_group = set()
arab_name_samples = {}
for country in all_country_lst:
    f_name_count, m_name_count = name_count_by_country[country]

    # Record the raw counts of every Arab League country, selected or not.
    if country in arab_league:
        arab_name_samples[country] = (f_name_count, m_name_count)

    # Identical test and validation reservations for both genders.
    f_test_size = orbis_test_size
    f_validation_size = orbis_test_size
    f_remaining_size = f_name_count - f_test_size - f_validation_size
    m_test_size = orbis_test_size
    m_validation_size = orbis_test_size
    m_remaining_size = m_name_count - m_test_size - m_validation_size

    # The smaller remainder fixes the minority training size. The other size is
    # derived from it and is NOT clamped to that gender's own remainder, so it
    # is only meaningful for the eligibility check below.
    if f_remaining_size <= m_remaining_size:
        f_train_size = f_remaining_size
        m_train_size = int(float(f_remaining_size) / min_ratio_minority_to_majority)
    else:
        f_train_size = int(float(f_remaining_size) / min_ratio_minority_to_majority)
        m_train_size = m_remaining_size

    print(country, f_name_count, f_test_size + f_validation_size, m_name_count, m_test_size + m_validation_size)

    too_small = (
        f_train_size < f_test_size * min_ratio_train_to_test
        or m_train_size < m_test_size * min_ratio_train_to_test
    )
    if too_small:
        # Relaxed rule: keep Arab League countries that at least exceed one
        # test split per gender.
        if country in arab_league and f_name_count > orbis_test_size and m_name_count > orbis_test_size:
            arab_minority_group.add(country)
        continue

    n_ += f_name_count + m_name_count
    country_lst.append(country)

# Append in sorted order: iterating a set of strings directly yields an order
# that changes between interpreter sessions (hash randomization), which would
# make `country_lst` non-reproducible across kernel restarts.
country_lst += sorted(arab_minority_group)

# Countries that are always excluded by the filter below.
countries_with_low_auc = {'CN', 'IN', 'BD', 'KR'}


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; trusted files only.
    with open(path, "rb") as handle:
        return pickle.load(handle)


nb_alpha = _read_pickle("./data/complementnb_best_alpha.pickle")
nb_prob_threshold_orbis = _read_pickle("./data/complementnb_prob_threshold_in_orbis.pickle")
nb_prob_threshold_wgnd = _read_pickle("./data/complementnb_prob_threshold_in_wgnd.pickle")

# Keep only countries that have a threshold in both tables and are not in the
# exclusion set; the original order of `country_lst` is preserved.
new_country_lst = [
    country
    for country in country_lst
    if country in nb_prob_threshold_orbis
    and country in nb_prob_threshold_wgnd
    and country not in countries_with_low_auc
]

# Per country, use the larger of the two thresholds.
nb_prob_threshold = {
    country: max(nb_prob_threshold_orbis[country], nb_prob_threshold_wgnd[country])
    for country in new_country_lst
}

country_lst = list(new_country_lst)

print(len(country_lst))
print(country_lst)

RU 6809103 900 10065354 900
GB 4736468 900 7911337 900
ZA 3061224 900 3829995 900
IT 2508971 900 5349306 900
AR 1732553 900 2194551 900
UA 1307747 900 2329730 900
FI 616804 900 824248 900
RO 612856 900 949509 900
BG 610126 900 823619 900
NO 426803 900 695419 900
CH 377475 900 747701 900
NZ 315754 900 518050 900
RS 174991 900 331788 900
MY 164274 900 501751 900
CZ 154167 900 305160 900
CL 148767 900 324038 900
TH 110090 900 305600 900
IN 95120 900 306644 900
CN 91873 900 223318 900
VN 87210 900 394714 900
DE 83696 900 270400 900
JO 72867 900 409478 900
IE 71659 900 150820 900
US 70970 900 260674 900
FR 62720 900 171151 900
PH 62400 900 41599 900
PL 56743 900 134037 900
LB 56421 900 131020 900
AU 46689 900 105329 900
PG 36712 900 115424 900
BH 35966 900 45900 900
OM 34553 900 140942 900
PK 29193 900 154307 900
DZ 28313 900 295427 900
MM 27661 900 78479 900
NG 27310 900 56344 900
SK 25059 900 53108 900
SE 25025 900 63070 900
ES 23916 900 60953 900
ID 23763 900 65262 900
HK 23506 900 67050

In [None]:
#construct_openalex.construct_work_data(country_lst, start_year=1950, end_year=2023)

In [None]:
#construct_openalex.construct_affiliation_to_country()

In [None]:
#construct_openalex.construct_author_name()

In [None]:
#construct_openalex.construct_author_data(country_lst, start_year=1950, end_year=2023)

In [None]:
#gender_analysis.construct_author_sample_lst(country_lst, include_middle_name, include_last_name, clearning_special_chars)

In [None]:
# The cache file name encodes the three name pre-processing options, so each
# combination of options reads its own file.
f_path = "".join([
    "./data/author_sample_lst",
    "_middle_name_", str(include_middle_name),
    "_last_name_", str(include_last_name),
    "_special_chars_", str(clearning_special_chars),
    ".pkl",
])

# The pickle holds a 2-tuple: (author_name_corpus, author_sample_lst).
# NOTE(review): pickle.load can execute arbitrary code; trusted files only.
with open(f_path, "rb") as handle:
    author_name_corpus, author_sample_lst = pickle.load(handle)

In [None]:
# Show the lengths of the loaded name corpus and author sample list as a pair.
tuple(map(len, (author_name_corpus, author_sample_lst)))

In [None]:
# Assign a gender to each author sample, using the selected countries, the
# name pre-processing flags, and the alpha / probability-threshold tables
# loaded earlier in the notebook.
author_gender = gender_analysis.gender_assignment_via_ComplementNB(
    author_name_corpus,
    author_sample_lst,
    country_lst,
    include_middle_name,
    include_last_name,
    clearning_special_chars,
    nb_alpha,
    nb_prob_threshold,
)