In [None]:
import os
import subprocess

import json
import pandas as pd
import numpy as np
from collections import defaultdict
import re

In [None]:
# ensure data is downloaded and available
# makedirs(exist_ok=True) creates the folder only when it is missing, in one
# call, instead of listing the parent directory and then creating the folder
# in a separate step.
os.makedirs('../name_data', exist_ok=True)
    
def download_if_necessary(folder, file):
    """Fetch ``folder/file`` with the ``osf`` command unless it already exists.

    Paths are resolved against the parent of the current working directory:
    the presence check looks at ``../folder/file`` and the ``osf`` command
    (``osf -p tz38q fetch <remote> <local>``) is run with ``cwd="../"``.

    Args:
        folder: folder name; used both as the remote folder and as the local
            destination folder.
        file: name of the file inside ``folder``.

    Raises:
        subprocess.CalledProcessError: if the ``osf`` command exits with a
            non-zero status.
    """
    if os.path.exists(os.path.join('..', folder, file)):
        # already present locally, nothing to fetch
        return

    # check=True makes a failed fetch raise here, instead of surfacing later
    # as a missing file when the data is read.
    subprocess.run(['osf', '-p', 'tz38q', 'fetch',
                    '%s/%s'%(folder, file), '%s/%s'%(folder, file)],
                   cwd="../", check=True)

In [None]:
# load data
download_if_necessary('name_data', 'fine-grained_name-gender_data.csv')

# keep_default_na=False stops pandas from turning its default NA markers into NaN;
# 'year' is read as text because later cells compare it to the placeholder 'XXXX'.
df = pd.read_csv(
    '../name_data/fine-grained_name-gender_data.csv',
    dtype={'year': str},
    keep_default_na=False,
)

In [None]:
# add features to data

# NOTE: the new columns must be appended in the order
# count -> estimate -> decade -> ns_dtype, because the grouping cell below
# reads each row by column position.
scaled_value = df['value']*df['adjustment']
df['count'] = scaled_value
df['estimate'] = scaled_value.copy()

# rows whose dtype is 'd' or 's' carry no count; their estimate is value / 100
is_d_or_s = df['dtype'].isin(['d','s'])
df.loc[is_d_or_s,'count'] = np.nan
df.loc[is_d_or_s,'estimate'] = df['value'] / 100.

# decade label: years 1920-2009 map to the mid-point of their decade
# (1920-1929 -> '1925', ...); every other year keeps the placeholder 'XXXX'
df['decade'] = 'XXXX'
has_year = df['year'] != 'XXXX'
decade_labels = []
for year in df.loc[has_year,'year'].values:
    year_number = int(year)
    if 1920 <= year_number < 2010:
        decade_labels.append(str(1925 + 10*((year_number - 1920)//10)))
    else:
        decade_labels.append('XXXX')
df.loc[has_year,'decade'] = decade_labels

# source identifier: 'ns' left-padded with zeros to two characters, followed by 'dtype'
df['ns_dtype'] = [str(ns).zfill(2) + dtype for ns, dtype in zip(df['ns'], df['dtype'])]

## low-level grouping

In [None]:
# every distinct value of the 'ascii' column, in sorted order
all_names = list(df['ascii'].unique())
all_names.sort()

# summed 'count' column per ns_dtype source, as a plain dict
count_by_source = df.groupby('ns_dtype')['count'].sum()
source_counts = count_by_source.to_dict()

In [None]:
# group data
# name_groups[name][key][tag] -> list of estimates, where key is
# "<ns_dtype>_<column 3>_<decade>" (the original comment calls column 3 the country)
name_groups = dict((n, {}) for n in all_names)
for row in df.values:
    # columns are read by position: 0 = tag, 1 = name, 10 = estimate
    tag, name, estimate = row[0], row[1], row[10]
    key = '_'.join((row[12], row[3], row[11])) # ns_dtype, country, decade

    groups_for_name = name_groups[name]
    if key not in groups_for_name:
        # the 'm' and 'f' lists start with a 0 entry, the '-' list starts empty
        groups_for_name[key] = {'m':[0],'f':[0],'-':[]}

    groups_for_name[key][tag].append(estimate)

In [None]:
# compute estimates, counts, and popularity (percent) for group level data
# Each entry appended to name_preds[name] is [group, estimate, percent, count].

# Raw string: in a plain literal "\d" is an invalid escape sequence, which
# Python 3.12+ reports as a SyntaxWarning. The pattern itself is unchanged.
re_count_data = re.compile(r"\d{2}[ab].*")
name_preds = {n:[] for n in all_names}
for name,groups in name_groups.items():
    for group,data in groups.items():
        # the first '_'-separated field of the group key is the ns_dtype source
        source = group.split('_')[0]
        if re_count_data.match(source): # for count data
            f = sum(data['f'])
            m = sum(data['m'])
            count = f+m
            # share of the 'f' total within the combined 'f' + 'm' total
            # NOTE(review): a group whose 'f' and 'm' totals are both 0 would
            # divide by zero here; confirm the data never contains one.
            estimate = f/(count)
            # size of this group relative to the total count of its source
            percent = count/source_counts[source]
        else: # for score, dict data
            estimate = np.mean(data['-'])
            count = len(data['-'])
            percent = np.nan
        name_preds[name].append([group,estimate,percent,count])

## aggregate by country and decade

In [None]:
def make_key(i,j):
    """Pick the aggregation key for group string ``j`` according to mode ``i``.

    ``j`` is split on ``'_'``. Mode ``'c'`` returns the second field, mode
    ``'d'`` returns the third field, and any other mode returns ``'-'``
    without looking at ``j``.
    """
    if i not in ('c', 'd'):
        return '-'
    field_index = 1 if i == 'c' else 2
    return j.split('_')[field_index]


def compute_average(key, estimates_pops_counts):
    """Combine the rows of one group into a weighted estimate and totals.

    Args:
        key: not used by the computation.
        estimates_pops_counts: 2-D array whose transpose unpacks into exactly
            three sequences: estimates, pops and counts (one row per entry).

    Returns:
        A tuple ``(weighted_estimate, pops_total, counts_total)``, where each
        estimate is weighted by its share of the summed pops.
    """
    estimates, pops, counts = estimates_pops_counts.T

    pops_total = pops.sum()
    shares = pops / pops_total
    weighted_estimate = np.dot(estimates, shares)
    return weighted_estimate, pops_total, counts.sum()


def compute_estimates(preds, keys = ['c', 'd']):
    """Aggregate the count-data predictions of one name per aggregation mode.

    Args:
        preds: iterable of ``[group, estimate, percent, count]`` entries.
            Entries whose group string does not match ``re_count_data`` are
            ignored.
        keys: aggregation modes passed to ``make_key``.

    Returns:
        ``{mode: {bucket: compute_average(...) result}}``, with one bucket for
        each distinct ``make_key(mode, group)`` value.
    """
    buckets_by_mode = {}
    for mode in keys:
        buckets_by_mode[mode] = defaultdict(list)

    for pred in preds:
        row = [pred[1],pred[2],pred[3]]
        if not re_count_data.match(pred[0]):
            continue
        for mode in keys:
            buckets_by_mode[mode][make_key(mode,pred[0])].append(row)

    dic_estimates = {}
    for mode, buckets in buckets_by_mode.items():
        dic_estimates[mode] = {
            bucket: compute_average(bucket, np.array(rows))
            for bucket, rows in buckets.items()
        }
    return dic_estimates

In [None]:
# keep the aggregated estimates of every name for which at least one of the
# two aggregations ('c' or 'd') produced a group
name_estimates = {}
for n in all_names:
    estimate = compute_estimates(name_preds[n])
    if len(estimate['c']) > 0 or len(estimate['d']) > 0:
        name_estimates[n] = estimate

In [None]:
# save country and decade aggregated estimates
# makedirs(exist_ok=True) creates the output folder only when it is missing
os.makedirs('../replication_data', exist_ok=True)

# the with-block closes the file as soon as the dump finishes, so the handle
# is not left open and the JSON is fully flushed to disk
with open('../replication_data/country-decade-aggregated_estimates.json','w') as out_file:
    json.dump(name_estimates, out_file)