# Preparing, Cleaning, and Merging the Datasets

## Converting fixed-width files to Pandas Dataframe

In [None]:
import pandas as pd
from io import StringIO
import re

### Set up functions to convert and split each .dat file (for the years 2014, 2015, 2016, 2018) into 2 separate dataframes, one for each record type
 ## (Family, Person)
 ## *The Household record will not be used in this project

# Column layouts as lists of (name, start, end) tuples. Both start out empty
# and are reassigned by parse_asec_fixed_width every time it runs.
FAMILY_LAYOUT = []
PERSON_LAYOUT = []

### Only these columns are extracted from each record type.
### The same lists are also passed on as the columns to coerce to numeric.

FAMILY_NUMERIC_COLS = [
    'FFPOS',
    'FH-SEQ',
    'FPERSONS',
    'FPOVCUT',
    'FAMLIS',
    'POVLL',
    'FTOTVAL',
    'FEARNVAL',
]
PERSON_NUMERIC_COLS = [
    'PERIDNUM',
    'PF-SEQ',
    'PH-SEQ',
    'A-AGE',
    'PEAFEVER',
    'A-HGA',
    'A-MJOCC',
    'PEARNVAL',
    'WSAL-VAL',
    'DIV-VAL',
    'RTM-VAL',
]

### Splits the data dictionary into 3 separate data dictionary files, one per record type
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
def split_dictionary_by_record(input_path, year):
    """Split a combined data dictionary into one text file per record type.

    Every line is stripped of surrounding whitespace. A line that is exactly
    'HOUSEHOLD RECORD', 'FAMILY RECORD' or 'PERSON RECORD' opens that section
    and is not stored itself. Every other line (blank ones included) is
    collected under the most recently opened section. Lines that appear
    before the first section header are discarded.

    Args:
        input_path: Path of the combined data dictionary text file.
        year: Value interpolated into the three output file names.

    Returns:
        None. Writes household_dict{year}.txt, family_dict{year}.txt and
        person_dict{year}.txt relative to the current working directory.
    """
    sections = {'HOUSEHOLD RECORD': [], 'FAMILY RECORD': [], 'PERSON RECORD': []}
    active = None

    with open(input_path, 'r') as source:
        for raw in source:
            text = raw.strip()
            if text in sections:
                # Header line: switch section, do not keep the header itself.
                active = text
                continue
            if active:
                sections[active].append(text + '\n')

    # Output path -> section header, written in the same order every time.
    targets = (
        (f'household_dict{year}.txt', 'HOUSEHOLD RECORD'),
        (f'family_dict{year}.txt', 'FAMILY RECORD'),
        (f'person_dict{year}.txt', 'PERSON RECORD'),
    )
    for out_path, header in targets:
        with open(out_path, 'w') as sink:
            sink.writelines(sections[header])

### Helper function: looks up, in the data dictionary, the record position of each column to be extracted
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
def extract_layout_from_dict(file_path, features):
    """Collect fixed-width column positions for the requested field names.

    A dictionary line is used when it starts with 'D' followed by, separated
    by whitespace, a field name (word characters and hyphens), a size and a
    1-based start position. Lines that do not start this way, and fields not
    listed in ``features``, are skipped.

    Args:
        file_path: Path of a per-record data dictionary text file.
        features: Iterable of field names to keep.

    Returns:
        A list of (name, start, end) tuples in the order the fields appear in
        the file, where start is 0-based and end is start + size.
    """
    wanted = set(features)
    entry_pattern = re.compile(r"D\s+([\w-]+)\s+(\d+)\s+(\d+)")
    layout = []

    with open(file_path, 'r') as handle:
        for entry in handle:
            found = entry_pattern.match(entry)
            if found is None:
                continue
            name, width, position = found.groups()
            if name not in wanted:
                continue
            begin = int(position) - 1  # dictionary positions are 1-based
            layout.append((name, begin, begin + int(width)))

    return layout

### Helper function: decodes .fwf format to dataframe using the extracted layout
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
def parse_record_lines(lines, layout, numeric_cols):
    """Build a DataFrame from fixed-width record lines.

    Args:
        lines: Iterable of record strings; they are concatenated as-is, so
            each one is expected to carry its own line terminator.
        layout: Sequence of (name, start, end) tuples giving the column name
            and the character span to read for it.
        numeric_cols: Column names to convert with ``pd.to_numeric``; names
            that are not present in the parsed frame are ignored.

    Returns:
        A DataFrame with one column per layout entry. In the converted
        columns, values that cannot be parsed as numbers become NaN.
    """
    column_names = [field for (field, _, _) in layout]
    spans = [(lo, hi) for (_, lo, hi) in layout]

    buffer = StringIO(''.join(lines))
    frame = pd.read_fwf(buffer, colspecs=spans, names=column_names)

    for col in numeric_cols:
        if col not in frame.columns:
            continue
        frame[col] = pd.to_numeric(frame[col], errors='coerce')
    return frame

### Takes .fwf file and data dictionaries (split by record) as input, and outputs respective dataframes
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
def parse_asec_fixed_width(filepath, family_dict_path, person_dict_path):
    """Parse one fixed-width data file into family and person DataFrames.

    The column layouts are rebuilt from the given per-record dictionaries and
    stored in the module-level FAMILY_LAYOUT / PERSON_LAYOUT before the data
    file is read.

    Args:
        filepath: Path of the fixed-width data file.
        family_dict_path: Path of the family-record data dictionary.
        person_dict_path: Path of the person-record data dictionary.

    Returns:
        A (fam_df, person_df) tuple of DataFrames.
    """
    global FAMILY_LAYOUT, PERSON_LAYOUT
    FAMILY_LAYOUT = extract_layout_from_dict(family_dict_path, FAMILY_NUMERIC_COLS)
    PERSON_LAYOUT = extract_layout_from_dict(person_dict_path, PERSON_NUMERIC_COLS)

    family_lines = []
    person_lines = []
    with open(filepath, 'r') as source:
        for record in source:
            if record.startswith('*'):
                continue
            # The first character selects the record type: '2' is collected
            # as a family record, '3' as a person record; anything else is
            # left out.
            if record.startswith('2'):
                family_lines.append(record)
            elif record.startswith('3'):
                person_lines.append(record)

    return (
        parse_record_lines(family_lines, FAMILY_LAYOUT, FAMILY_NUMERIC_COLS),
        parse_record_lines(person_lines, PERSON_LAYOUT, PERSON_NUMERIC_COLS),
    )


In [None]:
### Build the family/person dataframes for every year shipped as a fixed-width file
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
fwf_years = ['2014', '2015', '2016', '2018']

# dataframes[year] -> {'fam': family dataframe, 'per': person dataframe}
dataframes = {}

for year in fwf_years:
    # The per-record dictionaries must exist on disk before the parse step reads them.
    split_dictionary_by_record(f'asec_codex{year}.txt', year)
    dataframes[year] = {}
    dataframes[year].update(
        zip(
            ('fam', 'per'),
            parse_asec_fixed_width(f'asec{year}.dat', f'family_dict{year}.txt', f'person_dict{year}.txt'),
        )
    )

## Loading .csv files for the remaining years

In [None]:
### Load and process remaining years' datasets
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
csv_years = ['2017', '2019', '2020', '2021', '2022', '2023', '2024']

for year in csv_years:

    dataframes[year] = {}
    dataframes[year]['fam'] = pd.read_csv(f'family{year}.csv')
    dataframes[year]['per'] = pd.read_csv(f'person{year}.csv')

    ### Create RTM_VAL column to match with fwf datasets
    ### (done on the full frame, before the column selection below drops ANN_VAL / DBTN_VAL)
    dataframes[year]['per']['RTM_VAL'] = dataframes[year]['per']['ANN_VAL'] + dataframes[year]['per']['DBTN_VAL']

    ### Select relevant columns and add the YEAR column.
    ### .assign() returns a new frame, so YEAR is never written in place into
    ### the result of a column selection (which can raise SettingWithCopyWarning).
    dataframes[year]['fam'] = dataframes[year]['fam'][
        ['FFPOS', 'FH_SEQ', 'FPERSONS', 'FPOVCUT', 'FAMLIS', 'POVLL', 'FTOTVAL', 'FEARNVAL']
    ].assign(YEAR=int(year))

    dataframes[year]['per'] = dataframes[year]['per'][
        ['PERIDNUM', 'PF_SEQ', 'PH_SEQ', 'A_AGE', 'PEAFEVER', 'A_HGA', 'A_MJOCC', 'PEARNVAL', 'WSAL_VAL', 'CAP_VAL', 'DIV_VAL', 'RTM_VAL']
    ].assign(YEAR=int(year))


In [None]:
### Rename columns for consistency
# Every fixed-width year is extracted with the hyphenated names listed in
# FAMILY_NUMERIC_COLS / PERSON_NUMERIC_COLS, so all of them (not only 2014)
# must be switched to the underscore names used by the CSV years. Otherwise
# the later concat leaves e.g. 'FH-SEQ' and 'FH_SEQ' as two separate columns.
fam_renames = {'FH-SEQ': 'FH_SEQ'}
per_renames = {
    'PF-SEQ': 'PF_SEQ',
    'A-AGE': 'A_AGE',
    'A-HGA': 'A_HGA',
    'A-MJOCC': 'A_MJOCC',
    'PH-SEQ': 'PH_SEQ',
    'DIV-VAL': 'DIV_VAL',
    'RTM-VAL': 'RTM_VAL',
    'WSAL-VAL': 'WSAL_VAL',
}

for year in fwf_years:
    dataframes[year]['fam'].rename(columns=fam_renames, inplace=True)
    dataframes[year]['per'].rename(columns=per_renames, inplace=True)

In [None]:
### add CAP_VAL and YEAR column to fwf dataframes to match
for year in fwf_years:
    # CAP_VAL is not among the columns extracted from the fixed-width files,
    # so it is filled with NaN to give these frames the same columns as the
    # CSV years. float('nan') is used instead of np.nan because the imports
    # at the top of this file never bring numpy in as `np`.
    dataframes[year]['per']['CAP_VAL'] = float('nan')
    dataframes[year]['per']['YEAR'] = int(year)
    dataframes[year]['fam']['YEAR'] = int(year)

## Merging all datasets

In [None]:
# Gather the per-year frames by record type, in the insertion order of `dataframes`.
all_fam_dfs = [frames['fam'] for frames in dataframes.values()]
all_per_dfs = [frames['per'] for frames in dataframes.values()]

# Stack all years into one family table and one person table with a fresh 0..n-1 index.
merged_fam = pd.concat(all_fam_dfs, ignore_index=True)
merged_per = pd.concat(all_per_dfs, ignore_index=True)

In [None]:
### Add extra feature aggregates to family record
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
# Per-family totals of the person-level income components.
# min_count=1 keeps a total as NaN when every contributing value is NaN. A
# plain .sum() would report 0.0, so the years whose CAP_VAL was filled with
# NaN would look like families with zero CAP_VAL rather than missing data.
grouped_per = merged_per.groupby(['YEAR', 'PH_SEQ', 'PF_SEQ'])[['CAP_VAL', 'DIV_VAL', 'RTM_VAL']].sum(min_count=1)

# grouped_per is indexed by (YEAR, PH_SEQ, PF_SEQ); right_on refers to those index levels by name.
# how='inner' keeps only families that have at least one matching person row.
fam_plus = pd.merge(merged_fam, grouped_per, left_on=['YEAR', 'FH_SEQ', 'FFPOS'], right_on=['YEAR', 'PH_SEQ', 'PF_SEQ'], how='inner')

fam_plus.rename(columns = {'CAP_VAL': 'CAP_TOT', 'DIV_VAL': 'DIV_TOT', 'RTM_VAL': 'RTM_TOT'}, inplace = True)
# Family total income divided by the square root of the number of persons in the family.
fam_plus['ADJUSTED_INC'] = fam_plus['FTOTVAL'] / (fam_plus['FPERSONS'])**.5

### Merging family and person dataframes
 ## ---------------------------------------------------------------------------------------------------------------------------------------------
# One output row per person, carrying the columns of the family it belongs to.
merged_asec = pd.merge(merged_per, fam_plus, left_on=['YEAR', 'PH_SEQ', 'PF_SEQ'], right_on=['YEAR', 'FH_SEQ', 'FFPOS'], how='inner')

first_cols = ['YEAR', 'PH_SEQ', 'PF_SEQ']

other_cols = [col for col in merged_asec.columns if col not in first_cols]

# Reorder so the key columns come first
merged_asec = merged_asec[first_cols + other_cols]

## Save merged dataframe to .csv file for efficient loading

In [None]:
# Write the merged table to merged_asec.csv in the working directory,
# leaving out the DataFrame index (index=False).
merged_asec.to_csv('merged_asec.csv', index=False)