In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

# 1. Configuration

In [1]:
INPUT_FILE = 'data/IPUMS/ipums_1823.csv'
OUTPUT_DIR = 'data/processed/'

In [4]:
MISSING_CODES = {
    'OWNERSHP': 0,
    'OWNERSHPD': 0,
    'SEX': 9,
    'MARST': 9,
    'HISPAN': 9,
    'EDUC': 99,
    'EMPSTAT': [0, 9],
    'EMPSTATD': [0, 99],
    'MIGRATE': [0, 9],
    'MIGRATE1D': [00, 90],
    'RENT': 9999999,
    'HHINCOME': 9999999,
    'VALUEH': [9999998, 9999999],
    'INCTOT': 9999999
}

# 2. Data loading

In [7]:
dtype_dict = {
        'YEAR': 'int32',
        'SAMPLE': 'int32',
        'SERIAL': 'int64',
        'CBSERIAL': 'int64',
        'STATEFIP': 'int16',
        'COUNTYFIP': 'int16',
        'PUMA': 'int32',
        'GQ': 'int8',
        'PERNUM': 'int16',
    }

In [8]:
df = pd.read_csv('../../data/IPUMS/ipums_1722_FLonly.csv', dtype=dtype_dict)

In [9]:
df.shape

(1376799, 34)

In [7]:
df.sample()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,PUMA,STRATA,...,HISPAN,HISPAND,EDUC,EDUCD,EMPSTAT,EMPSTATD,INCTOT,POVERTY,MIGRATE1,MIGRATE1D
972090,2022,202201,305845,2022000125287,209.0,2022003058451,12,0,1398,139812,...,0,0,6,65,1,10,17000,190,1,10


# 2. Cleaning

In [8]:
# Handles NA

for var, codes in MISSING_CODES.items():
    if var in df.columns:
        df[var] = df[var].replace(codes, np.nan)

if 'INCTOT' in df.columns:
    df.INCTOT = df.INCTOT.replace(MISSING_CODES.get('INCTOT', []), np.nan)

In [9]:
# Re-classing numerical columns

num_vars = ['VALUEH', 'HHINCOME', 'RENT', 'HHWT']

for x in num_vars:
    if x in df.columns:
        df[x] = pd.to_numeric(df[x], errors='coerce')

In [10]:
# Homeownership flags

if 'OWNERSHP' in df.columns:
    df['is_owner'] = (df.OWNERSHP == 1).astype(int)
    df['is_renter'] = (df.OWNERSHP == 2).astype(int)

if 'OWNERSHPD' in df.columns:
    df['owns_free_clear'] = (df.OWNERSHPD == 12).astype(int)
    df['owns_with_mortgage'] = (df.OWNERSHPD == 13).astype(int)

In [11]:
df.head()

Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,PUMA,STRATA,...,EMPSTAT,EMPSTATD,INCTOT,POVERTY,MIGRATE1,MIGRATE1D,is_owner,is_renter,owns_free_clear,owns_with_mortgage
0,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,861112,...,1.0,10.0,8500.0,29,1,10.0,0,1,0,0
1,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,861112,...,,,,29,1,10.0,0,1,0,0
2,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,861112,...,,,,29,1,10.0,0,1,0,0
3,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,861112,...,,,,29,1,10.0,0,1,0,0
4,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,861112,...,3.0,30.0,0.0,29,1,10.0,0,1,0,0


In [12]:
df.isnull().sum()

YEAR                       0
SAMPLE                     0
SERIAL                     0
CBSERIAL                   0
HHWT                       0
CLUSTER                    0
STATEFIP                   0
COUNTYFIP                  0
PUMA                       0
STRATA                     0
GQ                         0
OWNERSHP               65512
OWNERSHPD              65512
RENT                       0
HHINCOME               65512
VALUEH                415465
PERNUM                     0
PERWT                      0
NCHILD                     0
SEX                        0
AGE                        0
MARST                      0
RACE                       0
RACED                      0
HISPAN                     0
HISPAND                    0
EDUC                       0
EDUCD                      0
EMPSTAT               197356
EMPSTATD              197356
INCTOT                183160
POVERTY                    0
MIGRATE1                   0
MIGRATE1D              10165
is_owner      

In [18]:
df.to_csv('../../data/processed/FL_IPUMS.csv')