In [1]:
import xml.etree.ElementTree as ET

# Parse the codebook that accompanies the data extract and keep a handle on
# its root element for the lookups below.
codebook_path = 'usa_00006.xml'
tree = ET.parse(codebook_path)
root = tree.getroot()

# Columns carried into the output without recoding (unless the same name also
# appears in variablesCorrection, which is checked first).
variablesToKeep = [
    'YEAR', 'SUBSAMP', 'SERIAL', 'HHWT', 'STATEFIP', 'COUNTYFIPS',
    'PERNUM', 'PERWT', 'SEX', 'MARST', 'RACE',
]

# Columns that are recoded through the codebook. Each inner dict overrides the
# codebook entry for the given code; None replaces the label with a missing
# value. An empty dict means "use the codebook labels unchanged".
variablesCorrection = {
    'HHTYPE': {'0': None, '9': None},
    'REGION': {'97': None, '99': None},
    'OWNERSHP': {'0': None},
    'AGE': {
        '000': '0',
        '090': '90',
        '100': '100',
        '112': '112',
        '115': '115',
    },
    'SEX': {},
    'MARST': {},
    'RACE': {},
    'STATEFIP': {},
    'HISPAN': {'9': None},
    'BPL': {'999': None},
    'SCHOOL': {'0': None, '9': None},
    'EDUC': {'00': None},
    'EDUCD': {'000': None, '001': None, '999': None},
    'EMPSTAT': {'0': None},
    'OCC': {'999': None},
    'OCC1990': {'999': None},
    'MIGRATE5': {'0': None, '9': None},
    'MIGRATE1': {'0': None, '9': None},
    'MIGPLAC5': {'000': None, '999': None},
    'MIGPLAC1': {'000': None, '997': None, '999': None},
}

# Numeric columns kept as numbers; the listed values are replaced with NaN.
otherNullValues = {
    'RENT': [0, 1],
    'VALUEH': [9999999],
    'INCTOT': [9999999],
}



# Namespace prefix shared by every element looked up in the codebook.
DDI_NS = "{http://www.icpsr.umich.edu/DDI}"

# convertDict:     variable name -> {code string: label or correction}
# variablesForSQL: every column to keep, in codebook order
convertDict = {}
variablesForSQL = []

data_desc = root.find(DDI_NS + "dataDscr")
if data_desc is None:
    # Fail with a clear message instead of "NoneType is not iterable".
    raise ValueError("codebook has no dataDscr element; cannot read variable definitions")

for var in data_desc:
    name = var.get('name')

    if name in variablesCorrection:
        # Recoded variable: start from the codebook's code -> label pairs ...
        labels = {}
        for category in var.findall(DDI_NS + "catgry"):
            valtag = category.find(DDI_NS + "catValu")
            labtag = category.find(DDI_NS + "labl")
            # Categories missing either the code or the label are ignored.
            if valtag is not None and labtag is not None:
                labels[valtag.text] = labtag.text
        # ... then let the hand-written corrections win over the codebook,
        # including codes the codebook does not list at all.
        labels.update(variablesCorrection[name])
        convertDict[name] = labels
        variablesForSQL.append(name)
    elif name in variablesToKeep or name in otherNullValues:
        # Kept without recoding.
        variablesForSQL.append(name)
    # Any other variable is not used for now.

In [2]:
import pandas as pd
import numpy as np

# Load the filtered extract and keep only the columns selected from the
# codebook. The explicit .copy() makes df an independent frame, so adding
# columns below does not trigger pandas' SettingWithCopyWarning.
df = pd.read_csv('usa_00006_filtered.csv')
df = df[variablesForSQL].copy()

# Keep the raw numeric codes of these variables alongside the recoded columns.
for col in ['STATEFIP', 'OCC1990', 'EDUCD', 'OCC', 'BPL']:
    df[col + '_orig'] = df[col]

# Recoded variables whose mapped values are converted to int instead of labels.
staysNumeric = ['AGE']


In [3]:
# Recode each column listed in convertDict from its code to the mapped value.
# Codes absent from the mapping become NaN (Series.map behaviour).
# NOTE: df is overwritten in place, so running this cell twice would look up
# already-converted values as if they were codes and blank those columns.
for var in convertDict:
    codes = convertDict[var]
    column = df[var]
    if var in staysNumeric:
        # Numbers on both sides of the mapping.
        newDict = {int(x): int(y) for x, y in codes.items()}
    elif pd.api.types.is_integer_dtype(column):
        newDict = {int(x): y for x, y in codes.items()}
    elif pd.api.types.is_float_dtype(column):
        newDict = {float(x): y for x, y in codes.items()}
    else:
        # Non-numeric column: use the codebook keys unchanged. Mapping through
        # an empty dict here would silently replace every value with NaN.
        newDict = dict(codes)
    df[var] = column.map(newDict)

# Replace sentinel values in the numeric columns with NaN.
for var in otherNullValues:
    if var not in df.columns:
        # The column is only selected when the codebook lists it.
        continue
    # mask() returns a new Series (upcast to float if anything is replaced)
    # instead of writing NaN into an integer column in place.
    df[var] = df[var].mask(df[var].isin(otherNullValues[var]))

In [4]:
# Write the processed frame to disk (the row index is written as well).
output_path = 'usa00006_processed.csv'
df.to_csv(output_path)