In [15]:
import pandas as pd, numpy as np

In [16]:
import ast

def interval_type(s):
    """Parse the string form of an interval into a ``pd.Interval``.

    The expected input looks like ``"[0, 5)"``: a square bracket marks a closed
    endpoint, anything else (normally a parenthesis) an open one.

    Parameters
    ----------
    s : str
        Interval text. Leading and trailing whitespace is ignored. Endpoints
        may be Python literals (``0``, ``1.5``) or anything ``float()``
        understands, such as ``inf`` / ``-inf``.

    Returns
    -------
    pd.Interval
        Interval over the parsed endpoints whose ``closed`` side is ``'both'``,
        ``'left'``, ``'right'`` or ``'neither'`` according to the brackets.

    Raises
    ------
    ValueError, SyntaxError
        If the text cannot be parsed into exactly two endpoints.
    """

    def parse_endpoint(token):
        # Prefer literal_eval so integer endpoints stay integers; fall back to
        # float() for tokens that literal_eval rejects (e.g. "inf").
        try:
            return ast.literal_eval(token.strip())
        except (ValueError, SyntaxError):
            return float(token)

    # Strip first: the closure checks below look at the very first and last
    # characters, so stray padding would silently turn a closed side open.
    text = s.strip()
    left_closed = text.startswith('[')
    right_closed = text.endswith(']')

    # Square brackets become parentheses so the text reads as a tuple literal.
    as_tuple = text.translate(str.maketrans({'[': '(', ']': ')'}))
    try:
        left, right = ast.literal_eval(as_tuple)
    except (ValueError, SyntaxError):
        # The whole-string parse failed; retry endpoint by endpoint so that
        # non-literal bounds such as "inf" are still accepted.
        tokens = as_tuple.strip('()').split(',')
        if len(tokens) != 2:
            raise
        left, right = (parse_endpoint(token) for token in tokens)

    closed = {
        (True, True): 'both',
        (True, False): 'left',
        (False, True): 'right',
        (False, False): 'neither',
    }[(left_closed, right_closed)]

    return pd.Interval(left, right, closed=closed)

In [18]:
def format_emigration_rates_for_file(emigration_rates, name):
    """Flatten an indexed series of emigration rates into a table ready for writing.

    Parameters
    ----------
    emigration_rates
        Series of rates whose index includes an ``age_group`` level holding
        interval objects (anything exposing ``.left`` and ``.right``).
    name
        Name given to the rate column in the returned table.

    Returns
    -------
    pd.DataFrame
        The index levels as ordinary columns, with ``age_group`` replaced by
        ``age_start`` / ``age_end`` (placed first), and any ``ST`` / ``PUMA``
        columns renamed to ``state`` / ``puma``.
    """
    age_bound_cols = ['age_start', 'age_end']

    table = emigration_rates.rename(name).reset_index()

    # One [start, end] pair per row, assigned to both bound columns at once.
    bounds = [[interval.left, interval.right] for interval in table['age_group']]
    table[age_bound_cols] = bounds

    table = table.drop(columns=['age_group'])
    table = table.rename(columns={'ST': 'state', 'PUMA': 'puma'})

    # Age bounds lead; every other column keeps its existing relative order.
    remaining_cols = [col for col in table.columns if col not in age_bound_cols]
    return table[age_bound_cols + remaining_cols]

In [19]:
# (input CSV file name, name for that file's rate column) pairs, in processing order.
files = list({
    'group_quarters_person_emigration_rates.csv': 'group_quarters_person_emigration_rate',
    'household_emigration_rates.csv': 'household_emigration_rate',
    'non_reference_person_emigration_rates.csv': 'non_reference_person_emigration_rate',
}.items())

In [None]:
# Rewrite every listed rates file into the flattened layout, saving the result
# next to the input with a "_new" suffix before the extension.
index_cols = ['age_group', 'sex', 'race_ethnicity', 'born_in_us', 'ST']
for file_name, col_name in files:
    df = pd.read_csv(file_name)
    # Age groups are stored as text such as "[0, 5)"; turn them back into intervals.
    df['age_group'] = df['age_group'].apply(interval_type)
    df = df.set_index(index_cols)['emigration_rate']

    output_name = file_name.replace('.csv', '_new.csv')
    format_emigration_rates_for_file(df, col_name).to_csv(output_name, index=False)