# Merge Distribution of Household Income data into a single data file

## Imports

In [None]:
import pandas as pd
import os
import re
from functools import reduce

## Collect list of files from data directory

In [None]:
# os.listdir returns entries in arbitrary order; sorting keeps the order in which
# the tables are later read and merged (and so the resulting column order) stable
# from run to run.
files = sorted(os.listdir('data'))
files

In [None]:
# Read every table except the minimum/median ones into df_list, tagging the value
# columns of "rates" and "shares" tables so they stay distinguishable after merging.
key_columns = ('household_type', 'income_group', 'year')  # merge keys, never renamed

df_list = []
for file in files:
    # Minimum and median tables are not part of this merge
    if re.search('minimum|median', file):
        continue

    # os.path.join rather than a hard-coded backslash, so the path also resolves
    # on non-Windows systems
    df = pd.read_csv(os.path.join('data', file))

    # Add "_rate" to end of column names for rates tables
    if re.search('rates', file):
        df = df.rename(columns={name: name + '_rate'
                                for name in df.columns if name not in key_columns})

    # Add "share_of_" to front of column names for shares tables
    if re.search('shares', file):
        df = df.rename(columns={name: 'share_of_' + name
                                for name in df.columns if name not in key_columns})

    df_list.append(df)

In [None]:
# Fold the list of tables into one frame, joining each successive table to the
# running result on the three shared key columns.
all_data = reduce(
    lambda merged, table: merged.merge(table, on=['household_type', 'income_group', 'year']),
    df_list,
)

In [None]:
# Preview the first rows of the merged table
all_data.head()

In [None]:
# List the column names of the merged table
all_data.columns

In [None]:
# Clean up duplicate columns: a name ending in "_x" gets its last two characters
# stripped, and a name ending in "_y" is dropped from the table.
for name in tuple(all_data.columns):  # snapshot of the names before any change
    if name.endswith('_y'):
        all_data.drop(columns=[name], inplace=True)
    elif name.endswith('_x'):
        all_data.rename(columns={name: name[:-2]}, inplace=True)

In [None]:
# Report how many columns all_data has, then show their names
print("all_data now has ", all_data.shape[1], "columns")
print(all_data.columns)

## Merge in medians
Median values are merged only into rows where `household_type == 'all_households'` and `income_group == 'all_quintiles'`; every other row gets missing values in the median columns.

In [None]:
# Load the median household income table. The path is built with os.path.join
# rather than a hard-coded backslash so it also resolves on non-Windows systems.
medians = pd.read_csv(os.path.join('data', 'table_04_median_household_income_1979_2014.csv'))
medians.head()

In [None]:
# Prefix every column except 'year' with "median_". A single rename call
# with a complete mapping replaces one in-place rename per column and cannot
# re-rename a column that was already prefixed.
medians.rename(
    columns={name: 'median_' + name for name in medians.columns if name != 'year'},
    inplace=True,
)
medians.head()

In [None]:
# Give every medians row the same constant household_type / income_group values,
# so those two columns can be used as merge keys alongside 'year'.
for key_column, constant in (
    ('household_type', 'all_households'),
    ('income_group', 'all_quintiles'),
):
    medians[key_column] = constant

In [None]:
# Left join: keep every row of all_data and attach the median columns where
# household_type, income_group and year all match a medians row.
all_data = all_data.merge(
    medians,
    how='left',
    on=['household_type', 'income_group', 'year'],
)

In [None]:
# Report how many columns all_data has, then show their names
print("all_data now has ", all_data.shape[1], "columns")
print(all_data.columns)