In [None]:
import os
import subprocess

import json
import pandas as pd
import numpy as np

In [None]:
# Ensure the raw data is downloaded and available.
# Be mindful that source_data is 3.3GB unzipped.
if not os.path.isdir('../source_data'):
    # check=True stops the cell at the first failing step instead of silently
    # continuing (e.g. trying to unzip an archive that was never fetched).
    subprocess.run(['osf', '-p', 'tz38q', 'fetch', 'source_data.zip'], cwd="../", check=True)
    subprocess.run(['unzip', 'source_data.zip'], cwd="../", check=True)
    # Remove the archive once extracted; os.remove avoids shelling out to `rm`.
    os.remove('../source_data.zip')

# Create the name_data output directory if necessary (no-op when it already exists).
constructed_resource_path = '../name_data/'
os.makedirs(constructed_resource_path, exist_ok=True)

In [None]:
# Every entry of ../source_data whose name contains 'NS' is treated as a source;
# sorting gives a stable processing order.
sources = sorted(filter(lambda entry: 'NS' in entry, os.listdir('../source_data')))

## 1. fine-grained name-gender data

In [None]:
# Load and combine all data sources.
# The frames are stacked positionally (through .values) and relabelled below,
# so every source CSV is assumed to share the same 9-column layout -- TODO confirm.
# keep_default_na=False keeps strings such as "NA" as text instead of NaN.
all_data = np.vstack([pd.read_csv('../source_data/'+ns+'/'+ns+'.csv',
                                  keep_default_na=False).values
                for ns in sources])

df = pd.DataFrame(all_data,columns = ['tag','ascii','ns','country','year','dtype','value','adjustment','utf'])

# Strip surrounding whitespace BEFORE filtering, so that a padded name such as
# ' a' or '  ' is judged by its real length and cannot slip past the length
# check only to become a one-character / empty name afterwards.
stripped_names = [n.strip() for n in df['ascii'].values]

# Remove problematic strings: names containing a period or shorter than two characters.
keep = np.array([len(n) >= 2 and '.' not in n for n in stripped_names], dtype=bool)

gdf = df.loc[keep].copy()
gdf['ascii'] = np.array(stripped_names, dtype=object)[keep]

# save fine-grained_name-gender_data
gdf.to_csv(constructed_resource_path+'fine-grained_name-gender_data.csv',index=False)

## 2. source-aggregated name-gender associations

In [None]:
# Accumulate, for every name and every source, the weighted 'm' and 'f' totals:
#   all_dict[name][source] == {'m': <sum>, 'f': <sum>}
# The dict is filled in row order (not seeded from a set), so the key order of
# the saved JSON is the same on every run.
all_dict = {}
row_fields = zip(gdf['tag'], gdf['ascii'], gdf['ns'], gdf['value'], gdf['adjustment'])
for tag, name, source, value, adjustment in row_fields:
    weighted = value * adjustment
    if tag == '-':
        # Untagged row: `value` is treated as the 'f' part out of 100 and the
        # remainder (100 - value) as the 'm' part.
        male, female = (100 - value) * adjustment, weighted
    elif tag == 'f':
        male, female = 0, weighted
    else:
        # Any other tag contributes to 'm' only.
        male, female = weighted, 0

    totals = all_dict.setdefault(name, {}).setdefault(source, {'m': 0, 'f': 0})
    totals['m'] += male
    totals['f'] += female

# save source-aggregated_name-gender_associations
# (the context manager guarantees the file is flushed and closed)
with open(constructed_resource_path+'source-aggregated_name-gender_associations.json','w') as out_file:
    json.dump(all_dict, out_file)

## 3. averaged name-gender estimates

In [None]:
# Sources none of whose rows carry the '-' tag; only these feed 'M_counts' later on.
tags_by_source = gdf.groupby('ns')['tag'].unique()
count_sources = [source for source, tags in tags_by_source.items() if '-' not in tags]

In [None]:
# Per name: number of sources, summed totals over the count sources, and the
# mean over sources of each source's f / (m + f) share.
# Set membership keeps the per-source check O(1) inside the nested loop.
count_source_set = set(count_sources)

all_ratios = {}
for name, per_source in all_dict.items():
    n_count = 0
    estimates = []
    for source, totals in per_source.items():
        total = totals['m'] + totals['f']
        if source in count_source_set:
            n_count += total
        # NOTE(review): no guard for total == 0; presumably every source entry
        # carries a positive total -- confirm against the data.
        estimates.append(totals['f'] / total)
    all_ratios[name] = {'N_sources': len(per_source),
                        'M_counts': n_count,
                        'AVG_estimate': np.mean(estimates)}

# save averaged_name-gender_estimates
# (the context manager guarantees the file is flushed and closed)
with open(constructed_resource_path+'averaged_name-gender_estimates.json','w') as out_file:
    json.dump(all_ratios, out_file)

### light version of above dictionary (for nqg package)

In [None]:
# Compact, name-sorted variant of all_ratios:
#   name -> [N_sources, M_counts (cast to int), AVG_estimate rounded to 3 decimals]
nqg_data = {
    name: [int(stats['N_sources']),
           int(stats['M_counts']),
           np.round(stats['AVG_estimate'], 3)]
    for name, stats in sorted(all_ratios.items(), key=lambda item: item[0])
}

# The context manager guarantees the file is flushed and closed.
with open(constructed_resource_path+'nqg_data.json','w') as out_file:
    json.dump(nqg_data, out_file)