# Merge Distribution of Household Income data into a single data file

## Imports

In [24]:
import pandas as pd
import os
import re
from functools import reduce

## Collect list of files from data directory

In [25]:
# os.listdir returns entries in arbitrary order; sort them so the tables are
# always read (and later merged) in the same, reproducible order.
files = sorted(os.listdir('data'))
files

['table_01_demographics_1979_2014.csv',
 'table_02_income_group_minimums_1979_2014.csv',
 'table_03_average_household_income_1979_2014.csv',
 'table_04_median_household_income_1979_2014.csv',
 'table_05_components_inc_before_transfers_taxes_1979_2014.csv',
 'table_06_components_means_tested_transfers_1979_2014.csv',
 'table_07_components_federal_taxes_1979_2014.csv',
 'table_08_means_tested_transfer_rates_1979_2014.csv',
 'table_09_federal_tax_rates_1979_2014.csv',
 'table_10_household_income_shares_1979_2014.csv',
 'table_11_means_tested_transfer_shares_1979_2014.csv',
 'table_12_federal_tax_shares_1979_2014.csv']

In [45]:
# Columns that identify a row in every table; they are never renamed.
key_columns = ['household_type', 'income_group', 'year']

df_list = []
for file in files:
    # Files whose names mention "minimum" or "median" are left out.
    if re.search('minimum|median', file):
        continue

    # os.path.join picks the right separator for the current OS; a
    # hard-coded backslash only works on Windows.
    df = pd.read_csv(os.path.join('data', file))

    # Add "_rate" to end of column names for rates tables
    if re.search('rates', file):
        df = df.rename(columns={
            column: column + '_rate'
            for column in df.columns
            if column not in key_columns
        })

    # Add "share_of_" to front of column names for shares tables
    if re.search('shares', file):
        df = df.rename(columns={
            column: 'share_of_' + column
            for column in df.columns
            if column not in key_columns
        })

    df_list.append(df)

In [49]:
def _merge_on_keys(left, right):
    """Join two tables on the household type / income group / year columns."""
    return pd.merge(
        left,
        right,
        on=['household_type', 'income_group', 'year'],
        copy=False,
    )


# Fold the list of tables into one frame, merging them left to right.
all_data = reduce(_merge_on_keys, df_list)

In [50]:
# Preview the first rows of the merged table.
all_data.head()

Unnamed: 0,household_type,income_group,year,num_households,num_children,num_adults,num_elderly,num_people,market_income_x,social_insurance_benefits_x,...,share_of_all_means_tested_transfers,share_of_medicaid,share_of_snap,share_of_ssi,share_of_other_transfers,share_of_all_federal_taxes,share_of_individual_income_tax,share_of_payroll_taxes,share_of_corporate_income_tax,share_of_excise_taxes
0,all_households,all_quintiles,1979,81.1,64.8,132.9,24.2,221.9,60300.0,4500.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
1,all_households,all_quintiles,1980,82.6,64.5,135.2,24.7,224.5,58100.0,4900.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
2,all_households,all_quintiles,1981,83.8,63.9,136.8,25.3,226.0,58100.0,5100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3,all_households,all_quintiles,1982,84.3,63.2,138.5,25.8,227.6,57700.0,5600.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
4,all_households,all_quintiles,1983,85.8,63.3,140.0,26.1,229.4,58300.0,5700.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [51]:
# List the merged column names; columns present in more than one table
# show up with "_x" / "_y" suffixes.
all_data.columns

Index(['household_type', 'income_group', 'year', 'num_households',
       'num_children', 'num_adults', 'num_elderly', 'num_people',
       'market_income_x', 'social_insurance_benefits_x',
       'inc_before_transfers_taxes_x', 'means_tested_transfers_x',
       'federal_taxes_x', 'inc_after_transfers_taxes',
       'inc_before_transfers_taxes_y', 'market_income_y', 'wages',
       'employee_contrib_deferred_comp', 'employer_contrib_health_ins',
       'employer_share_payroll_taxes', 'federal_unemployment_tax',
       'corp_tax_borne_by_labor', 'business_income', 'capital_gains',
       'tax_exempt_interest', 'taxable_interest', 'positive_rental_income',
       'dividends', 'corp_tax_borne_by_capital', 'other_market_income',
       'social_insurance_benefits_y', 'social_security', 'medicare',
       'unemployment_insurance', 'workers_compensation',
       'means_tested_transfers_y', 'medicaid', 'snap', 'ssi',
       'other_transfers', 'federal_taxes_y', 'individual_income_tax',
      

In [65]:
# Clean up duplicate columns
for column in all_data.columns:
    if column.endswith('_x'):
        all_data.rename(columns = {column : column[0:-2]}, inplace=True)
    if column.endswith('_y'):
        all_data.drop(column, axis=1, inplace=True)

In [66]:
# Re-check the column names after the "_x" / "_y" cleanup.
all_data.columns

Index(['household_type', 'income_group', 'year', 'num_households',
       'num_children', 'num_adults', 'num_elderly', 'num_people',
       'market_income', 'social_insurance_benefits',
       'inc_before_transfers_taxes', 'means_tested_transfers', 'federal_taxes',
       'inc_after_transfers_taxes', 'wages', 'employee_contrib_deferred_comp',
       'employer_contrib_health_ins', 'employer_share_payroll_taxes',
       'federal_unemployment_tax', 'corp_tax_borne_by_labor',
       'business_income', 'capital_gains', 'tax_exempt_interest',
       'taxable_interest', 'positive_rental_income', 'dividends',
       'corp_tax_borne_by_capital', 'other_market_income', 'social_security',
       'medicare', 'unemployment_insurance', 'workers_compensation',
       'medicaid', 'snap', 'ssi', 'other_transfers', 'individual_income_tax',
       'payroll_taxes', 'corporate_income_tax', 'excise_taxes',
       'means_tested_transfers_rate', 'medicaid_rate', 'snap_rate', 'ssi_rate',
       'other_transfer