In [10]:
def _strip_literals(series, tokens):
    """Return *series* with each literal substring in *tokens* removed, in order."""
    for token in tokens:
        # regex=False: tokens such as '(', ')' and '.' must never be read as
        # regular-expression syntax, whatever the pandas default is.
        series = series.str.replace(token, '', regex=False)
    return series


def clean(census_data, race_data):
    """Build one row per census tract with normalized transit, income and race weights.

    Parameters
    ----------
    census_data : pandas.DataFrame
        Must contain the columns 'location', 'label', 'estimate' and 'moe'.
        After the 'Estimate' / 'Total' / '!!' / ':' tokens are stripped, a
        label is expected to look like '<transportation>$<income bracket>';
        labels that are empty, contain no '$', or start with '$' are treated
        as aggregate rows and discarded.
    race_data : pandas.DataFrame
        Must contain the columns 'location', 'label', 'concept', 'estimate'
        and 'moe'. Rows whose stripped label is empty are discarded.

    Returns
    -------
    pandas.DataFrame
        Columns 'location', 'estimate', 'transit weight', 'income weight',
        'race weight' and 'node id', one row per tract, sorted by location.

    Raises
    ------
    ValueError
        If the number of distinct income brackets differs from the number of
        hard-coded bracket values, or if race_data yields a different number
        of tracts than census_data.

    Notes
    -----
    The inputs are not modified, and the process-wide warning filters are left
    untouched: explicit copies are taken instead of silencing pandas warnings.
    """
    label_noise = ('Estimate', 'Total', '!!', ':')
    city_suffix = '; St. Louis city; Missouri'
    public_transit = 'Public transportation (excluding taxicab)'

    # ---- clean census data ----
    census_data = census_data[['location', 'label', 'estimate', 'moe']].copy()
    census_data['label'] = _strip_literals(census_data['label'], label_noise)
    census_data['location'] = _strip_literals(census_data['location'], (city_suffix,))

    # Filter out aggregate rows: grand totals (empty label), per-transportation
    # totals (no '$' in the label) and per-income totals (label starts with '$').
    all_totals = census_data[census_data['label'] == '']
    trans_totals_mask = ~census_data['label'].astype(str).str.contains('[$]')
    # NOTE(review): the [1:] here and on `no_totals` below each drop the first
    # row of their selection. Kept exactly as before; depending on the row
    # order of the input this may discard a real data row -- confirm intent.
    trans_totals = census_data.loc[trans_totals_mask][1:]
    income_totals = census_data[census_data['label'].str.startswith('$')]
    not_to_include = pd.concat([trans_totals, income_totals, all_totals])
    # Anti-join: keep only the rows that have no identical row in not_to_include.
    merged_df = pd.merge(census_data, not_to_include, how='left', indicator=True)
    no_totals = (
        merged_df[merged_df['_merge'] == 'left_only']
        .drop(columns=['_merge'])[1:]
        .copy()
    )

    # Split '<transportation>$<income>' at the first '$' only. `n` must be
    # passed by keyword: it is keyword-only in pandas >= 2.0.
    split_data = no_totals['label'].str.split('$', n=1, expand=True)
    no_totals[['transportation', 'income']] = split_data
    no_totals = no_totals.drop(columns=['label'])
    no_totals['income'] = '$' + no_totals['income']

    # Map each bracket label to a numeric value by order of first appearance.
    # Assumes the brackets first appear in ascending order -- TODO confirm.
    brackets = no_totals['income'].unique()
    bracket_values = [1, 10000, 15000, 25000, 35000, 50000, 65000, 75000]
    if len(brackets) != len(bracket_values):
        raise ValueError(
            'Expected %d distinct income brackets, found %d'
            % (len(bracket_values), len(brackets))
        )
    no_totals['avg income'] = no_totals['income'].map(dict(zip(brackets, bracket_values)))
    clean_census_data = no_totals

    # ---- clean race data ----
    race_data = race_data[['location', 'label', 'concept', 'estimate', 'moe']].copy()
    race_data['label'] = _strip_literals(race_data['label'], label_noise)
    race_data['location'] = _strip_literals(race_data['location'], (city_suffix,))
    race_data['concept'] = _strip_literals(
        race_data['concept'],
        ('MEANS OF TRANSPORTATION TO WORK ', ')', '(', 'ALONE'),
    )

    # Filter out aggregate rows and rename columns
    race_data = race_data[race_data['label'] != '']
    race_data = race_data.rename(columns={'label': 'transportation', 'concept': 'race'})

    # ---- group by tract ----
    tract_pop_series = clean_census_data.groupby('location')['estimate'].sum()
    by_tract = tract_pop_series.reset_index()
    tract_pops = tract_pop_series.to_numpy()

    # Transit weight: normalized % public transit users in tract / % in city.
    # Reindex onto every tract so a tract without public-transit rows counts
    # as 0 users instead of silently shortening the array.
    transit_rows = clean_census_data[clean_census_data['transportation'] == public_transit]
    public_transit_pops = (
        transit_rows.groupby('location')['estimate'].sum()
        .reindex(tract_pop_series.index, fill_value=0)
        .to_numpy()
    )
    public_transit_pcts = np.divide(public_transit_pops, tract_pops)
    total_transit_pct = np.divide(public_transit_pops.sum(), tract_pops.sum())
    transit_weight_unnorm = public_transit_pcts / total_transit_pct
    by_tract['transit weight'] = transit_weight_unnorm / transit_weight_unnorm.sum()

    # Income weight: each tract gets the share of income held by all *other*
    # tracts, so tracts with a lower income total receive a higher weight.
    weighted_income = clean_census_data['avg income'] * clean_census_data['estimate']
    tract_incomes = (
        weighted_income.groupby(clean_census_data['location']).sum()
        .reindex(tract_pop_series.index)
        .to_numpy()
    )
    other_tract_incomes = tract_incomes.sum() - tract_incomes
    by_tract['income weight'] = other_tract_incomes / other_tract_incomes.sum()

    # Race weight: normalized % nonwhite in tract / % nonwhite in city.
    # Tracts with no non-white rows are filled with 0 rather than dropped.
    race_totals = race_data.groupby('location')['estimate'].sum()
    nonwhite_pops = (
        race_data[race_data['race'] != 'WHITE ']
        .groupby('location')['estimate'].sum()
        .reindex(race_totals.index, fill_value=0)
        .to_numpy()
    )
    total_pops = race_totals.to_numpy()
    nonwhite_pcts = np.divide(nonwhite_pops, total_pops)
    total_pct = np.divide(nonwhite_pops.sum(), total_pops.sum())
    race_weight_unnorm = nonwhite_pcts / total_pct
    # Assigned by position: assumes race_data covers the same tracts as
    # census_data, in the same sorted order -- verify against the inputs.
    by_tract['race weight'] = race_weight_unnorm / race_weight_unnorm.sum()

    by_tract['node id'] = by_tract['location'].str.replace('Census Tract ', '', regex=False)

    return by_tract