In [1]:
from pathlib import Path

import pandas as pd

# Folder that the cells below search recursively for LODES *.csv files
# (path is relative to the notebook's working directory).
external_folder = Path('../../data/external/LODES')
# Folder that receives the reshaped lodes_*.csv files written by the cells below.
interim_folder = Path('../../data/interim/LODES')
"""
Filetypes 
- od
- rac
- wac
- xwalk

OD
[ST]_od_[PART]_[TYPE]_[YEAR].csv.gz where
    [ST] = lowercase, 2-letter postal code for a chosen state
    [PART] = Part of the state file, can have a value of either “main” or “aux”. Complementary parts of
    the state file: the main part includes jobs with both workplace and residence in the state
    and the aux part includes jobs with the workplace in the state and the residence outside of
    the state.
    [TYPE] = Job Type, can have a value of “JT00” for All Jobs, “JT01” for Primary Jobs, “JT02” for
    All Private Jobs, “JT03” for Private Primary Jobs, “JT04” for All Federal Jobs, or “JT05”
    for Federal Primary Jobs.
    [YEAR] = Year of job data. Can have the value of 2002-2015 for most states. 

RAC
[ST]_rac_[SEG]_[TYPE]_[YEAR]_1.csv.gz where
    [ST] = lowercase, 2-letter postal code for a chosen state
    [SEG] = Segment of the workforce, can have the values of “S000”, “SA01”, “SA02”, “SA03”,
    “SE01”, “SE02”, “SE03”, “SI01”, “SI02”, or “SI03”. These correspond to the same
    segments of the workforce as are listed in the OD file structure above.
    [TYPE] = Job Type, can have a value of “JT00” for All Jobs, “JT01” for Primary Jobs, “JT02” for
    All Private Jobs, “JT03” for Private Primary Jobs, “JT04” for All Federal Jobs, or “JT05”
    for Federal Primary Jobs.
    [YEAR] = Year of job data. Can have the value of 2002-2015 for most states.
    
WAC
[ST]_wac_[SEG]_[TYPE]_[YEAR].csv.gz where
    [ST] = lowercase, 2-letter postal code for a chosen state
    [SEG] = Segment of the workforce, can have the values of “S000”, “SA01”, “SA02”, “SA03”,
    “SE01”, “SE02”, “SE03”, “SI01”, “SI02”, or “SI03”. These correspond to the same
    segments of the workforce as are listed in the OD file structure above.
    [TYPE] = Job Type, can have a value of “JT00” for All Jobs, “JT01” for Primary Jobs, “JT02” for
    All Private Jobs, “JT03” for Private Primary Jobs, “JT04” for All Federal Jobs, or “JT05”
    for Federal Primary Jobs.
    [YEAR] = Year of job data. Can have the value of 2002-2015 for most states.
    
XWALK
[ST]_xwalk.csv.gz where
    [ST] = lowercase, 2-letter postal code for a chosen state
"""
# Ending the cell with a print keeps the notes string above from being the
# cell's last expression, so Jupyter does not echo it as the cell output.
print('')




In [2]:
# Inventory the raw extracts: the first two underscore-separated fields of
# each file name are the state code and the dataset kind (od / rac / wac / xwalk).
state = set()
dataset = set()
for f in external_folder.rglob('*.csv'):
    fields = f.stem.split('_')
    state.add(fields[0])
    dataset.add(fields[1])
print(state)
print(dataset)

# parents=True creates any missing intermediate folders (a bare mkdir() raises
# FileNotFoundError when the parent folder is absent); exist_ok=True makes the
# cell safe to re-run once the folder exists.
interim_folder.mkdir(parents=True, exist_ok=True)

{'or'}
{'xwalk', 'od', 'wac', 'rac'}


In [3]:
def parse_od_file(od_file_name):
    """Load one LODES origin-destination (OD) CSV and tag it with its file-name fields.

    The file name is expected to look like ``[ST]_od_[PART]_[TYPE]_[YEAR].csv``.

    Parameters
    ----------
    od_file_name : str or pathlib.Path
        Path of the OD file to read.

    Returns
    -------
    pandas.DataFrame
        The file's rows plus the string columns ``ST``, ``PART``, ``TYPE`` and
        ``YEAR`` taken from the file name. ``w_geocode`` and ``h_geocode`` are
        kept as text.

    Raises
    ------
    IndexError
        If the file name has fewer than five underscore-separated fields.
    """
    od_path = Path(od_file_name)

    # Geocodes are identifiers, not quantities. Left to type inference they are
    # parsed as integers, which silently drops the leading zero of codes that
    # start with a state FIPS code below 10.
    df = pd.read_csv(od_path, dtype={'w_geocode': str, 'h_geocode': str})

    fields = od_path.stem.split('_')

    # fields[1] is the dataset kind ("od") and is not stored.
    df['ST'] = fields[0]
    df['PART'] = fields[2]
    df['TYPE'] = fields[3]
    df['YEAR'] = fields[4]

    return df

# rglob yields files in filesystem order, which varies between machines and
# runs; sorting the paths makes the concatenated frame and the CSV written
# below reproducible.
od_df = pd.concat([parse_od_file(f) for f in sorted(external_folder.rglob('*od*.csv'))])

# Sanity check: show which file-name field values were actually loaded.
for col in ['ST', 'PART', 'TYPE', 'YEAR']:
    print(f"{col}: {od_df[col].unique()}")

# Wide -> long: every column that is not an identifier becomes a
# (variable, value) pair.
long_od_df = pd.melt(od_df, id_vars=['ST', 'PART', 'TYPE', 'YEAR', 'w_geocode', 'h_geocode'])

out_filename = interim_folder / 'lodes_od.csv'

long_od_df.to_csv(out_filename, index=False)

ST: ['or']
PART: ['main' 'aux']
TYPE: ['JT03' 'JT02' 'JT05' 'JT04' 'JT00' 'JT01']
YEAR: ['2015']


In [5]:
def parse_rac_file(rac_file_name):
    """Load one LODES residence area characteristics (RAC) CSV and tag it with its file-name fields.

    The file name is expected to start with ``[ST]_rac_[SEG]_[TYPE]_[YEAR]``.

    Parameters
    ----------
    rac_file_name : str or pathlib.Path
        Path of the RAC file to read.

    Returns
    -------
    pandas.DataFrame
        The file's rows plus the string columns ``ST``, ``SEG``, ``TYPE`` and
        ``YEAR`` taken from the file name. ``h_geocode`` is kept as text.

    Raises
    ------
    IndexError
        If the file name has fewer than five underscore-separated fields.
    """
    rac_path = Path(rac_file_name)

    # The geocode is an identifier, not a quantity. Left to type inference it is
    # parsed as an integer, which silently drops the leading zero of codes that
    # start with a state FIPS code below 10.
    df = pd.read_csv(rac_path, dtype={'h_geocode': str})

    fields = rac_path.stem.split('_')

    # fields[1] is the dataset kind ("rac") and is not stored.
    df['ST'] = fields[0]
    df['SEG'] = fields[2]
    df['TYPE'] = fields[3]
    df['YEAR'] = fields[4]

    return df

# rglob yields files in filesystem order, which varies between machines and
# runs; sorting the paths makes the concatenated frame and the CSV written
# below reproducible.
rac_df = pd.concat([parse_rac_file(f) for f in sorted(external_folder.rglob('*rac*.csv'))])

# Sanity check: show which file-name field values were actually loaded.
for col in ['ST', 'SEG', 'TYPE', 'YEAR']:
    print(f"{col}: {rac_df[col].unique()}")

# Wide -> long: every column that is not an identifier becomes a
# (variable, value) pair.
long_rac_df = pd.melt(rac_df, id_vars=['ST', 'SEG', 'TYPE', 'YEAR', 'h_geocode'])

out_filename = interim_folder / 'lodes_rac.csv'

long_rac_df.to_csv(out_filename, index=False)

ST: ['or']
SEG: ['S000' 'SE01' 'SE03' 'SI03' 'SA03' 'SE02' 'SA01' 'SI02' 'SI01' 'SA02']
TYPE: ['JT00' 'JT01' 'JT04' 'JT05' 'JT02' 'JT03']
YEAR: ['2015']


In [6]:
def parse_wac_file(wac_file_name):
    """Load one LODES workplace area characteristics (WAC) CSV and tag it with its file-name fields.

    The file name is expected to look like ``[ST]_wac_[SEG]_[TYPE]_[YEAR].csv``.

    Parameters
    ----------
    wac_file_name : str or pathlib.Path
        Path of the WAC file to read.

    Returns
    -------
    pandas.DataFrame
        The file's rows plus the string columns ``ST``, ``SEG``, ``TYPE`` and
        ``YEAR`` taken from the file name. ``w_geocode`` is kept as text.

    Raises
    ------
    IndexError
        If the file name has fewer than five underscore-separated fields.
    """
    wac_path = Path(wac_file_name)

    # The geocode is an identifier, not a quantity. Left to type inference it is
    # parsed as an integer, which silently drops the leading zero of codes that
    # start with a state FIPS code below 10.
    df = pd.read_csv(wac_path, dtype={'w_geocode': str})

    fields = wac_path.stem.split('_')

    # fields[1] is the dataset kind ("wac") and is not stored.
    df['ST'] = fields[0]
    df['SEG'] = fields[2]
    df['TYPE'] = fields[3]
    df['YEAR'] = fields[4]

    return df

# rglob yields files in filesystem order, which varies between machines and
# runs; sorting the paths makes the concatenated frame and the CSV written
# below reproducible.
wac_df = pd.concat([parse_wac_file(f) for f in sorted(external_folder.rglob('*wac*.csv'))])

# Sanity check: show which file-name field values were actually loaded.
for col in ['ST', 'SEG', 'TYPE', 'YEAR']:
    print(f"{col}: {wac_df[col].unique()}")

# Wide -> long: every column that is not an identifier becomes a
# (variable, value) pair.
long_wac_df = pd.melt(wac_df, id_vars=['ST', 'SEG', 'TYPE', 'YEAR', 'w_geocode'])

out_filename = interim_folder / 'lodes_wac.csv'

long_wac_df.to_csv(out_filename, index=False)

ST: ['or']
SEG: ['SE03' 'SI03' 'SE01' 'S000' 'SA03' 'SE02' 'SA01' 'SI02' 'SI01' 'SA02']
TYPE: ['JT01' 'JT00' 'JT03' 'JT02' 'JT05' 'JT04']
YEAR: ['2015']


In [8]:
def parse_xwalk_file(xwalk_file_name):
    """Load one LODES geography crosswalk CSV and tag it with its state code.

    The file name is expected to look like ``[ST]_xwalk.csv``.

    Parameters
    ----------
    xwalk_file_name : str or pathlib.Path
        Path of the crosswalk file to read.

    Returns
    -------
    pandas.DataFrame
        The file's rows plus a string column ``ST`` taken from the file name.
        ``tabblk2010`` is kept as text.
    """
    xwalk_path = Path(xwalk_file_name)

    # tabblk2010 is the key the notebook melts on. It is an identifier, so it is
    # read as text to keep any leading zero that integer inference would drop.
    # NOTE(review): other code-like columns in the crosswalk may need the same
    # treatment -- confirm against the LODES crosswalk layout.
    # low_memory=False reads the file in one pass so column types are inferred
    # from the whole column rather than chunk by chunk.
    df = pd.read_csv(xwalk_path, low_memory=False, dtype={'tabblk2010': str})

    fields = xwalk_path.stem.split('_')

    df['ST'] = fields[0]

    return df

# rglob yields files in filesystem order, which varies between machines and
# runs; sorting the paths makes the concatenated frame and the CSV written
# below reproducible.
xwalk_df = pd.concat([parse_xwalk_file(f) for f in sorted(external_folder.rglob('*xwalk*.csv'))])

# Sanity check: show which state codes were actually loaded.
for col in ['ST']:
    print(f"{col}: {xwalk_df[col].unique()}")

# Wide -> long: every column other than the state and block id becomes a
# (variable, value) pair.
long_xwalk_df = pd.melt(xwalk_df, id_vars=['ST', 'tabblk2010'])

out_filename = interim_folder / 'lodes_xwalk.csv'

long_xwalk_df.to_csv(out_filename, index=False)

ST: ['or']
