In [1]:
import pandas as pd
import pickle
import setup, models
import matplotlib.pyplot as plt

# Global matplotlib styling applied to every figure drawn after this cell.
fontsize = 30
plt.rcParams.update({
    "font.size": fontsize,
    # 'font.family': "sans-serif",  (kept disabled, as before)
    # Tick direction on both axes.
    'xtick.direction': 'out',
    'ytick.direction': 'out',
    # Line widths of ticks and of the axes frame.
    'xtick.major.width': 2.5,
    'xtick.minor.width': 1,
    'ytick.major.width': 2.5,
    'ytick.minor.width': 1,
    'axes.linewidth': 2.5,
    # Tick lengths.
    'xtick.major.size': 15,
    'xtick.minor.size': 12.5,
    'ytick.major.size': 15,
    'ytick.minor.size': 12.5,
    'hatch.linewidth': 0.3,
})

# Turn on the pandas "future.no_silent_downcasting" option for the whole notebook.
pd.set_option("future.no_silent_downcasting", True)

In [2]:
# Disabled call, kept for reference.
# NOTE(review): presumably (re)builds the data that the next cell loads from
# ./data/data_sets.pkl — confirm in setup.py before re-enabling.
#setup.construct_data_sets()

In [3]:
# Load the per-country name data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a data_sets.pkl file you produced or otherwise trust.
with open("./data/data_sets.pkl", 'rb') as pkl_file:
    data_by_country = pickle.load(pkl_file)

# Per country: (size of entry [0], size of entry [1]). The cells below treat
# entry [0] as the female names and entry [1] as the male names.
name_count_by_country = {
    code: (len(name_sets[0]), len(name_sets[1]))
    for code, name_sets in data_by_country.items()
}

# Rank countries by their count tuple, largest first, and print the ranking.
lst = sorted(name_count_by_country.items(), key=lambda item: item[1], reverse=True)
for ranked_entry in lst:
    print(ranked_entry)

('RU', (6809103, 10065354))
('GB', (4736468, 7911337))
('ZA', (3061224, 3829995))
('IT', (2508971, 5349306))
('AR', (1732553, 2194551))
('UA', (1307747, 2329730))
('FI', (616804, 824248))
('RO', (612856, 949509))
('BG', (610126, 823619))
('NO', (426803, 695419))
('CH', (377475, 747701))
('NZ', (315754, 518050))
('RS', (174991, 331788))
('MY', (164274, 501751))
('CZ', (154167, 305160))
('CL', (148767, 324038))
('TH', (110090, 305600))
('IN', (95120, 306644))
('CN', (91873, 223318))
('VN', (87210, 394714))
('DE', (83696, 270400))
('JO', (72867, 409478))
('IE', (71659, 150820))
('US', (70970, 260674))
('FR', (62720, 171151))
('PH', (62400, 41599))
('PL', (56743, 134037))
('LB', (56421, 131020))
('AU', (46689, 105329))
('PG', (36712, 115424))
('BH', (35966, 45900))
('OM', (34553, 140942))
('PK', (29193, 154307))
('DZ', (28313, 295427))
('MM', (27661, 78479))
('NG', (27310, 56344))
('SK', (25059, 53108))
('SE', (25025, 63070))
('ES', (23916, 60953))
('ID', (23763, 65262))
('HK', (23506, 670

In [4]:
# Number of countries in the loaded data (one key per country).
len(data_by_country)

225

In [5]:
# Parameters
# Per-gender size of the Orbis test set; the same value is reused as the
# validation size in the cells below.
orbis_test_size = 450
# Passed to setup.construct_wgnd_val_test_data when building the WGND data.
wgnd_test_size = 5000
# Seed and shuffle flag forwarded to the setup.* helpers.
random_state = 42
shuffle = True
# Flags forwarded to the setup.* and models.* helpers.
# NOTE(review): presumably control whether middle/last names are used in
# addition to the first name — confirm in setup.py / models.py.
include_middle_name = False
include_last_name = False
# (sic) "clearning": the spelling is kept because later cells reference this
# exact name. Presumably toggles special-character cleaning — confirm in setup.py.
clearning_special_chars = True
# A country is selected only if each gender's training size is at least this
# multiple of its test size.
min_ratio_train_to_test = 5
# Minority-to-majority ratio used below to cap the majority gender's training
# size; also passed to the split-construction helper.
min_ratio_minority_to_majority = 0.2
# Passed to setup.construct_wgnd_val_test_data.
# NOTE(review): presumably the minimum number of WGND names per country — confirm.
min_name_samples = 450

In [6]:
# ISO 3166-1 alpha-2 codes of the 22 Arab League member states.
# Used below to track these countries separately: members that miss the size
# thresholds can still be kept through arab_minority_group.
arab_league = {
    "DZ", "BH", "KM", "DJ", "EG", "IQ", "JO", "KW", "LB", "LY", "MR", 
    "MA", "OM", "PS", "QA", "SA", "SO", "SD", "SY", "TN", "AE", "YE",
}
#test_country_lst = ["AE", "JP", "TN", "MA", "US"]

In [7]:
# Choose the countries that have enough names for a train/validation/test split.
#
# Per gender, orbis_test_size names are set aside for the test set and the same
# number for the validation set; whatever is left is available for training.
# A country is kept when both genders' training sizes reach
# min_ratio_train_to_test times the test size. Arab League countries that miss
# that bar but have more than orbis_test_size names per gender are collected in
# arab_minority_group and appended to country_lst after the loop.
all_country_lst = [
    code
    for code, _counts in sorted(name_count_by_country.items(), key=lambda x: x[1], reverse=True)
]
country_lst = []
n, n_ = 0, 0  # n is never updated in this cell; n_ totals the names of the selected countries
arab_minority_group = set()
arab_name_samples = {}
for country in all_country_lst:
    f_name_count = len(data_by_country[country][0])
    m_name_count = len(data_by_country[country][1])

    # Keep the raw counts of every Arab League country, selected or not.
    if country in arab_league:
        arab_name_samples[country] = (f_name_count, m_name_count)

    f_test_size = orbis_test_size
    f_validation_size = orbis_test_size
    f_remaining_size = f_name_count - f_test_size - f_validation_size
    m_test_size = orbis_test_size
    m_validation_size = orbis_test_size
    m_remaining_size = m_name_count - m_test_size - m_validation_size

    # The minority gender trains on everything it has left. The majority gender
    # is capped at minority / min_ratio_minority_to_majority and can never
    # exceed the names it actually has left. (Previously the else branch derived
    # the female cap from the female count itself, and neither cap was bounded
    # by the available names.)
    if f_remaining_size <= m_remaining_size:
        f_train_size = f_remaining_size
        m_train_size = min(m_remaining_size,
                           int(float(f_remaining_size) / min_ratio_minority_to_majority))
    else:
        f_train_size = min(f_remaining_size,
                           int(float(m_remaining_size) / min_ratio_minority_to_majority))
        m_train_size = m_remaining_size

    print(country, f_name_count, f_test_size + f_validation_size, m_name_count, m_test_size + m_validation_size)

    if f_train_size < f_test_size * min_ratio_train_to_test or m_train_size < m_test_size * min_ratio_train_to_test:
        # NOTE(review): this fallback only requires more than orbis_test_size
        # names per gender, although test + validation together reserve
        # 2 * orbis_test_size — confirm the split helper copes with that for
        # arab_minority_group.
        if country in arab_league and f_name_count > orbis_test_size and m_name_count > orbis_test_size:
            arab_minority_group.add(country)
        continue

    n_ += f_name_count + m_name_count
    country_lst.append(country)

# Append the Arab League fallback countries in the same count-ranked order as
# the rest of the list. Iterating the set directly made the tail of country_lst
# depend on string hashing, so its order could change between kernel sessions.
country_lst += [code for code in all_country_lst if code in arab_minority_group]
print(len(country_lst))
print(n_)
print(arab_minority_group)
print(arab_name_samples)

RU 6809103 900 10065354 900
GB 4736468 900 7911337 900
ZA 3061224 900 3829995 900
IT 2508971 900 5349306 900
AR 1732553 900 2194551 900
UA 1307747 900 2329730 900
FI 616804 900 824248 900
RO 612856 900 949509 900
BG 610126 900 823619 900
NO 426803 900 695419 900
CH 377475 900 747701 900
NZ 315754 900 518050 900
RS 174991 900 331788 900
MY 164274 900 501751 900
CZ 154167 900 305160 900
CL 148767 900 324038 900
TH 110090 900 305600 900
IN 95120 900 306644 900
CN 91873 900 223318 900
VN 87210 900 394714 900
DE 83696 900 270400 900
JO 72867 900 409478 900
IE 71659 900 150820 900
US 70970 900 260674 900
FR 62720 900 171151 900
PH 62400 900 41599 900
PL 56743 900 134037 900
LB 56421 900 131020 900
AU 46689 900 105329 900
PG 36712 900 115424 900
BH 35966 900 45900 900
OM 34553 900 140942 900
PK 29193 900 154307 900
DZ 28313 900 295427 900
MM 27661 900 78479 900
NG 27310 900 56344 900
SK 25059 900 53108 900
SE 25025 900 63070 900
ES 23916 900 60953 900
ID 23763 900 65262 900
HK 23506 900 67050

In [8]:
# Number of selected countries that are not Arab League members.
non_arab_selected = set(country_lst).difference(arab_league)
print(len(non_arab_selected))

70


In [9]:
# Total number of selected countries, including the Arab League fallback
# countries appended to country_lst above.
print(len(country_lst))

84


In [10]:
# Show the final country codes in list order.
print(country_lst)

['RU', 'GB', 'ZA', 'IT', 'AR', 'UA', 'FI', 'RO', 'BG', 'NO', 'CH', 'NZ', 'RS', 'MY', 'CZ', 'CL', 'TH', 'IN', 'CN', 'VN', 'DE', 'JO', 'IE', 'US', 'FR', 'PH', 'PL', 'LB', 'AU', 'PG', 'BH', 'OM', 'PK', 'DZ', 'MM', 'NG', 'SK', 'SE', 'ES', 'ID', 'HK', 'IS', 'NL', 'MD', 'EG', 'CA', 'PT', 'TR', 'KE', 'KW', 'HU', 'SG', 'LT', 'AT', 'BR', 'DK', 'AE', 'GR', 'JP', 'LS', 'ZW', 'BE', 'EE', 'IR', 'LV', 'UZ', 'ME', 'BD', 'SA', 'LK', 'KR', 'JM', 'GH', 'AL', 'BY', 'IL', 'KH', 'MA', 'BA', 'CY', 'TN', 'IQ', 'SY', 'SO']


In [11]:
# Train-and-test sets construction
# orbis_test_size is passed twice. NOTE(review): presumably once as the test
# size and once as the validation size (the printed log reports val/test sizes
# of 450 each) — confirm against the helper's signature in setup.py.
setup.construct_train_validation_test_sets_across_countries(data_by_country, country_lst, orbis_test_size,
                                                            orbis_test_size, random_state, shuffle,
                                                            include_middle_name, include_last_name,
                                                            clearning_special_chars,
                                                            min_ratio_minority_to_majority, arab_league,
                                                            arab_minority_group)

RU: Female train/val/test sizes: F:6808203/450/450 | Male train/val/test sizes: M:10064454/450/450 | Ratio of minority train size to majority train size: 0.6764602431488087
GB: Female train/val/test sizes: F:4735568/450/450 | Male train/val/test sizes: M:7910437/450/450 | Ratio of minority train size to majority train size: 0.5986480898590053
ZA: Female train/val/test sizes: F:3060324/450/450 | Male train/val/test sizes: M:3829095/450/450 | Ratio of minority train size to majority train size: 0.7992290606527129
IT: Female train/val/test sizes: F:2508071/450/450 | Male train/val/test sizes: M:5348406/450/450 | Ratio of minority train size to majority train size: 0.46893803499584735
AR: Female train/val/test sizes: F:1731653/450/450 | Male train/val/test sizes: M:2193651/450/450 | Ratio of minority train size to majority train size: 0.7893931167719933
UA: Female train/val/test sizes: F:1306847/450/450 | Male train/val/test sizes: M:2328830/450/450 | Ratio of minority train size to majori

In [11]:
# world gender name dictionary data
# Builds the WGND validation/test data for the selected countries.
# NOTE(review): the log reports fewer countries than len(country_lst);
# presumably countries below min_name_samples are dropped — confirm in setup.py.
setup.construct_wgnd_val_test_data(country_lst, wgnd_test_size, clearning_special_chars, random_state,
                                 min_name_samples)

Read wgnd name-and-gender data.
Number of countries: 65


In [13]:
# Disabled: the Multinomial Naive Bayes tuning calls below are commented out,
# so this cell does nothing when run.
# # Multinomial Naive Bayes
# models.tuning_alpha_in_orbis_data_for_MultinomialNB(country_lst, include_middle_name, include_last_name, clearning_special_chars)
# models.tuning_probability_threshold_in_orbis_data_for_MultinomialNB(country_lst, include_middle_name, include_last_name, clearning_special_chars)
# models.tuning_probability_threshold_in_wgnd_data_for_MultinomialNB(country_lst, include_middle_name, include_last_name,
#                                                                clearning_special_chars)

In [14]:
# Complement Naive Bayes
# The three tuning steps (alpha on Orbis, probability threshold on Orbis,
# probability threshold on WGND) all take the same positional arguments, so
# the argument list is built once and unpacked into each call.
cnb_tuning_args = (country_lst, include_middle_name, include_last_name, clearning_special_chars)
models.tuning_alpha_in_orbis_data_for_ComplementNB(*cnb_tuning_args)
models.tuning_probability_threshold_in_orbis_data_for_ComplementNB(*cnb_tuning_args)
models.tuning_probability_threshold_in_wgnd_data_for_ComplementNB(*cnb_tuning_args)

country RU alpha 0.001 measure {'PR AUC': 0.9999145299145299, 'ROC AUC': 0.9999111111111111, 'Threshold': 0.9999999945286575, 'N_val': 900, 'N_train': 16872657, 'Gender assignment rate': 0.9066666666666666, 'Training time in seconds': 8.954402923583984, 'Testing time in seconds': 0.7189738750457764}
country RU alpha 0.01 measure {'PR AUC': 0.9999009200283087, 'ROC AUC': 0.9998962962962963, 'Threshold': 0.9999998912098166, 'N_val': 900, 'N_train': 16872657, 'Gender assignment rate': 0.9211111111111111, 'Training time in seconds': 8.761014938354492, 'Testing time in seconds': 0.7432711124420166}
country RU alpha 0.1 measure {'PR AUC': 0.9998666374500863, 'ROC AUC': 0.9998617283950617, 'Threshold': 0.9999998687910023, 'N_val': 900, 'N_train': 16872657, 'Gender assignment rate': 0.9155555555555556, 'Training time in seconds': 8.665780782699585, 'Testing time in seconds': 0.7523329257965088}
country RU alpha 1 measure {'PR AUC': 0.9998401455236462, 'ROC AUC': 0.9998370370370371, 'Threshold'

In [15]:
# Probability-threshold tuning for the "nqg" model on the Orbis and WGND data.
# Note the WGND variant takes only the country list, unlike the Orbis variant.
models.tuning_probability_threshold_in_orbis_data_for_nqg(country_lst, include_middle_name, include_last_name,
                                                          clearning_special_chars)
models.tuning_probability_threshold_in_wgnd_data_for_nqg(country_lst)

In [11]:
# Run the ComplementNB test on the Orbis data for every selected country.
# Countries without a result are reported as "Not performed test.".
# NOTE(review): the printed columns are unlabeled; the first two numbers look
# like the tuned alpha and probability threshold — confirm in models.py.
models.test_ComplementNB_in_orbis_data(country_lst, include_middle_name, include_last_name, clearning_special_chars)

RU 0.001 0.93 0.9997217652612715 0.9997234567901235 0.9955357142857143 900 16872657 0.9955555555555555 3.1662991046905518 0.23076581954956055
GB 1.0 0.9 0.9983261424118139 0.998358024691358 0.9907940161104718 900 12646005 0.9655555555555555 2.7151830196380615 0.20600390434265137
ZA 0.01 0.9 0.9788959220899227 0.9786024691358025 0.9617346938775511 900 6889419 0.8711111111111111 2.315992832183838 0.20553898811340332
IT 0.001 0.9 0.9999901671583087 0.9999901234567902 0.9988876529477196 900 7856477 0.9988888888888889 2.263744831085205 0.21824216842651367
AR Not performed test.
UA Not performed test.
FI Not performed test.
RO 10.0 0.9 0.9673951018035862 0.9795037037037038 0.9674306393244874 900 1560565 0.9211111111111111 1.1380209922790527 0.23621702194213867
BG 0.001 0.9 0.9983814270296149 0.9978814814814815 0.9932659932659933 900 1431945 0.99 1.0337610244750977 0.20020079612731934
NO 0.1 0.9 0.9894946800636794 0.9911382716049383 0.9796610169491525 900 1120422 0.9833333333333333 1.24747490

In [12]:
# Run the ComplementNB test on the WGND data for every selected country.
# Countries without a result are reported as "Not performed test.".
# NOTE(review): the printed columns are unlabeled — confirm their meaning in models.py.
models.test_ComplementNB_in_wgnd_data(country_lst, include_middle_name, include_last_name, clearning_special_chars)

RU 0.001 0.93 0.9441962827769442 0.9392473200000001 0.9010925819436457 10000 16872657 0.8695
GB 1.0 0.9 0.9945628161238703 0.9948332200000001 0.9844266666666667 10000 12646005 0.9375
ZA 0.01 0.9 0.9771552961311375 0.9769896 0.9527811566957868 10000 6889419 0.9043
IT 0.001 0.9 0.990852783030243 0.9949959800000001 0.9900719278695168 10000 7856477 0.9871
AR Not performed test.
UA Not performed test.
FI Not performed test.
RO 10.0 0.9 0.9381486813067702 0.93835008 0.9464888588791357 10000 1560565 0.5924
BG 0.001 0.9 1.0 1.0 1.0 10000 1431945 0.9999
NO 0.1 0.9 0.9987578344770927 0.9986809 0.9916857202244855 10000 1120422 0.9622
CH 0.01 0.9 0.9976899386734099 0.9976881400000001 0.9899231248701433 10000 1123376 0.9626
NZ 0.1 0.9 0.9989512732136536 0.99892276 0.9889081103083341 10000 832004 0.9827
RS 0.001 0.97 0.9254182357803709 0.92190544 0.907626208378088 10000 504979 0.7448
MY Not performed test.
CZ 10.0 0.9 0.9542112748422745 0.95300448 0.9568831900803156 10000 457527 0.7097
CL Not perfor

In [11]:
# Run the nqg test on the Orbis data. The call prints a per-country dict
# followed by one result list per country.
# NOTE(review): the dict values look like the tuned probability thresholds — confirm in models.py.
models.test_nqg_in_orbis_data(country_lst, include_middle_name, include_last_name, clearning_special_chars)

{'RU': 0.9, 'GB': 0.9, 'ZA': 0.9, 'IT': 0.9, 'RO': 0.9, 'BG': 0.9, 'NO': 0.9, 'CH': 0.9, 'NZ': 0.9, 'RS': 0.9, 'CZ': 0.9, 'IN': 0.9, 'CN': -1.0, 'DE': 0.9, 'JO': 0.9, 'IE': 0.9, 'US': 0.9, 'FR': 0.9, 'PH': 0.9, 'PL': 0.9, 'LB': 0.9, 'AU': 0.9, 'PG': 0.9, 'BH': 0.9, 'OM': 0.9, 'DZ': 0.9, 'NG': 0.9, 'SE': 0.9, 'ES': 0.9, 'IS': 0.9, 'NL': 0.9, 'MD': 0.9, 'EG': 0.9, 'CA': 0.9, 'PT': 0.9, 'TR': 0.9, 'KE': 0.9, 'KW': 0.9, 'LT': 0.9, 'AT': 0.9, 'DK': 0.9, 'AE': 0.9, 'JP': 0.9, 'LS': 0.9, 'ZW': 0.9, 'BE': 0.9, 'EE': 0.9, 'IR': 0.9, 'ME': 0.9, 'BD': 0.9, 'SA': 0.93, 'LK': 0.9, 'KR': 0.9, 'JM': 0.9, 'GH': 0.9, 'AL': 0.9, 'BY': 0.9, 'IL': 0.9, 'MA': 0.9, 'BA': 0.9, 'CY': 0.9, 'SY': 0.9, 'TN': 0.9, 'SO': 0.9, 'IQ': 0.9}
['RU', 0.9, 0.9954068614667055, 0.9976469135802469, 0.9988262910798122, 900, 0.9466666666666667]
['GB', 0.9, 0.9936589014219199, 0.9963308641975309, 0.9975216852540273, 900, 0.8966666666666666]
['ZA', 0.9, 0.8928959892762064, 0.9193827160493827, 0.9656946826758147, 900, 0.647777777

In [12]:
# Run the nqg test on the WGND data. The call prints a per-country dict
# followed by one result list per country.
# NOTE(review): the dict values look like the tuned probability thresholds — confirm in models.py.
models.test_nqg_in_wgnd_data(country_lst)

{'RU': 0.9, 'GB': 0.9, 'ZA': 0.9, 'IT': 0.9, 'RO': 0.9, 'BG': 0.9, 'NO': 0.9, 'CH': 0.9, 'NZ': 0.9, 'RS': 0.9, 'CZ': 0.9, 'IN': 0.9, 'CN': -1.0, 'DE': 0.9, 'JO': 0.9, 'IE': 0.9, 'US': 0.9, 'FR': 0.9, 'PH': 0.9, 'PL': 0.9, 'LB': 0.9, 'AU': 0.9, 'PG': 0.9, 'BH': 0.9, 'OM': 0.9, 'DZ': 0.9, 'NG': 0.9, 'SE': 0.9, 'ES': 0.9, 'IS': 0.9, 'NL': 0.9, 'MD': 0.9, 'EG': 0.9, 'CA': 0.9, 'PT': 0.9, 'TR': 0.9, 'KE': 0.9, 'KW': 0.9, 'LT': 0.9, 'AT': 0.9, 'DK': 0.9, 'AE': 0.9, 'JP': 0.9, 'LS': 0.9, 'ZW': 0.9, 'BE': 0.9, 'EE': 0.9, 'IR': 0.9, 'ME': 0.9, 'BD': 0.9, 'SA': 0.93, 'LK': 0.9, 'KR': 0.9, 'JM': 0.9, 'GH': 0.9, 'AL': 0.9, 'BY': 0.9, 'IL': 0.9, 'MA': 0.9, 'BA': 0.9, 'CY': 0.9, 'SY': 0.9, 'TN': 0.9, 'SO': 0.9, 'IQ': 0.9}
['RU', 0.9, 0.9203298588341533, 0.92714616, 0.9584868234627373, 10000, 0.7058]
['GB', 0.9, 0.9978872743102697, 0.99780254, 0.9979557069846678, 10000, 0.8805]
['ZA', 0.9, 0.9917924080270357, 0.99078334, 0.9895004038306219, 10000, 0.8667]
['IT', 0.9, 0.9990041571613112, 0.99902216, 0