In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw extract from disk and display how many rows it contains
df = pd.read_csv('usa_00003.csv')
df.shape[0]

In [None]:
# Get information about memory usage of df.
# memory_usage="deep" makes pandas measure object-dtype values instead of estimating them.
df.info(memory_usage="deep")

In [None]:
# Get the number of columns with each type (one row per dtype, value = how many columns use it)
df.dtypes.value_counts()

In [None]:
# Work on a copy so the raw `df` stays untouched and can still be inspected later
preprocessed_df = df.copy()

## Set indices

The SAMPLE and SERIAL columns can uniquely identify every household, which when combined with PERNUM (a number that uniquely identifies members in each household) can uniquely identify each person. Since all data is from the same 2021 sample, SAMPLE is unnecessary. Therefore, SERIAL and PERNUM are used as indices (after being downcast to uint dtype in order to use less memory).

In [None]:
# Columns that together identify a person: household serial number + person number
index_columns = ['SERIAL', 'PERNUM']

# Unsigned dtypes cannot represent negatives, so verify none are present before downcasting
assert(all(preprocessed_df[name].min() >= 0 for name in index_columns))

# Shrink each identifier to the smallest unsigned integer dtype that can hold its values
for name in index_columns:
    preprocessed_df[name] = pd.to_numeric(preprocessed_df[name], downcast='unsigned')

print("New datatypes: SERIAL: {}, PERNUM: {}".format(
    preprocessed_df['SERIAL'].dtype, preprocessed_df['PERNUM'].dtype))

# Replace the default index with the (SERIAL, PERNUM) pair and preview the result
preprocessed_df.set_index(index_columns, inplace=True)
preprocessed_df.head()

## Remove unnecessary columns

These columns don't encode any useful data and are therefore dropped.

In [None]:
# All samples are from 2021 IPUMS data, so year and sample columns are unnecessary
unnecessary_columns = ['YEAR', 'SAMPLE']

# Drop unnecessary columns (columns that are already gone are skipped so the cell can be re-run)
for col in unnecessary_columns:
    if col not in preprocessed_df.columns:
        continue
    # A column is only safe to drop if it holds one single value across every row
    assert preprocessed_df[col].nunique(dropna=False) == 1
    # Use positional access for the first row: the frame is indexed by (SERIAL, PERNUM),
    # so `[0]` would be interpreted as a label lookup for SERIAL == 0 instead.
    print('Dropping {} column; original value: {}'.format(col, preprocessed_df[col].iloc[0]))
    preprocessed_df.drop(columns=[col], inplace=True)

In [None]:
# Sanity check: report any column that still holds a single value in every row.
# This should print nothing after previous cell is run. If it prints anything, you can probably drop that column.
for col in preprocessed_df.columns:
    distinct_values = preprocessed_df[col].unique()
    if len(distinct_values) != 1:
        continue
    print(col, distinct_values)

## Drop Quality Columns

Data quality columns give information about whether the response was altered by staff. The method of this alteration is generally not specified. These columns are dropped because they are unnecessary when training the model (though they can be useful for analyzing potential discrepancies in data).

Note that the original data contains 44 data quality flags, so this many columns should be dropped by the next cell.

In [None]:
# Quality flags are identified by name: 'Q' followed by the name of the column they describe
quality_columns = [col for col in preprocessed_df.columns if col.startswith('Q')]

# A 'Q' column with no matching base column might be a regular variable that merely
# starts with 'Q', so point it out for manual review (it is still dropped).
for col in quality_columns:
    if col[1:] not in preprocessed_df.columns:
        print("{} dropped but no column {} exists. Make sure this isn't an error.".format(col, col[1:]))

# Remove every flag in a single call instead of rebuilding the frame once per column
preprocessed_df.drop(columns=quality_columns, inplace=True)
dropped = len(quality_columns)
print('Dropped {} columns'.format(dropped))

## Preprocess columns

The below section will preprocess columns in order to make data easier to understand and to reduce memory usage. Each cell processes a different column.

#### Preprocess binary columns

First, get a list of columns with 2 unique values (which can be turned into boolean columns).

In [None]:
# List every column that takes exactly 2 distinct values; these are candidates for boolean conversion.
# Each line shows the column name, its current dtype, and the distinct-value count.
for col in preprocessed_df.columns:
    column = preprocessed_df[col]
    n_distinct = len(column.unique())
    if n_distinct == 2:
        print('{} ({}): {}'.format(col, column.dtype, n_distinct))

Next, preprocess any columns that should have NaN values (this is necessary as often responses that should be NaN such as no response or N/A are instead coded as 0).

In [None]:
# SCHOOL uses 0 for responses that should be missing, so recode 0 as NaN.
# The result is assigned back to the frame: an in-place replace() on the Series returned by
# `preprocessed_df.SCHOOL` is a chained operation and does not update the frame under
# pandas Copy-on-Write.
preprocessed_df['SCHOOL'] = preprocessed_df['SCHOOL'].replace(0, np.nan)

Below is a list of columns that will be converted to boolean columns. Each tuple contains the current column name, the name of the boolean column that it'll be converted to, and a boolean that determines whether the column will be dropped when the boolean column is created. Unless you have a good reason to keep the original column, you should drop it once the boolean column is created.

In [None]:
# Conversion table for binary columns. Every entry is a tuple of
#   (current column name, new boolean column name, drop current column once converted?)
# The entries are written as topical groups and concatenated in processing order.

# Binary race indicators (a single person can have multiple races; RACNUM stores how many were selected)
_race_conversions = [
    ('RACAMIND', 'isAmericanIndian', True),
    ('RACASIAN', 'isAsian', True),
    ('RACBLK', 'isBlack', True),
    ('RACPACIS', 'isPacificIslander', True),
    ('RACWHT', 'isWhite', True),
    ('RACOTHER', 'isOtherRace', True),
]

# Private health insurance: HCOVPRIV summarizes HINSEMP, HINSPUR, and HINSTRI
_private_insurance_conversions = [
    ('HCOVPRIV', 'hasPrivateHealthInsurance', True),
    # Current/former employer or union insurance, including coverage through a family member's plan
    ('HINSEMP', 'hasEmployerHealthInsurance', True),
    # Insurance purchased directly by the individual or family
    ('HINSPUR', 'hasPurchasedPrivHealthInsurance', True),
    # TRICARE and other military health programs
    ('HINSTRI', 'hasMilitaryHealthInsurance', True),
]

# Public health insurance: HCOVPUB summarizes HINSCARE, HINSCAID, and HINSVA
_public_insurance_conversions = [
    ('HCOVPUB', 'hasPublicHealthInsurance', True),
    ('HINSCARE', 'hasMedicare', True),
    # Medicaid, Medical Assistance, or other government plans for those w/ low income or disability
    ('HINSCAID', 'hasMedicaid', True),
    # Everyone who has ever used/enrolled for VA health care
    ('HINSVA', 'hasVeteransHealthInsurance', True),
]

boolean_column_conversions = (
    # SEX is 1 if male, 2 if female (no other options).
    # The original column is kept (drop flag False) because a later cell compares it with SEX_SP.
    [('SEX', 'isFemale', False)]
    + _race_conversions
    + _private_insurance_conversions
    + _public_insurance_conversions
    # Insurance through the Indian Health Service
    + [('HINSIHS', 'hasIndianHealthInsurance', True)]
    # HCOVANY includes HINSEMP, HINSPUR, HINSTRI, HINSCARE, HINSCAID, HINSVA.
    # IHS insurance is not included in HCOVANY (IPUMS says IHS policies are not always comprehensive).
    + [('HCOVANY', 'hasHealthInsurance', True)]
    # Whether respondent is currently in school
    + [('SCHOOL', 'isInSchool', True)]
)

In [None]:
# Turn each listed two-valued column into a nullable boolean column.
for col, new_col, drop_col in boolean_column_conversions:
    if col not in preprocessed_df.columns:
        # Stay quiet when the boolean column already exists (the cell was simply re-run);
        # warn only when neither the source nor the result is present.
        if new_col not in preprocessed_df.columns:
            print('Warning: {} not in preprocessed_df so {} not generated'.format(col, new_col))
        continue

    # Count distinct non-null values only. unique() reports NaN as a value of its own, so a
    # column with legitimate missing entries (e.g. SCHOOL after 0 -> NaN) would otherwise
    # fail this check even though nulls are explicitly allowed below.
    assert preprocessed_df[col].nunique() == 2
    if preprocessed_df[col].isnull().values.any():
        print('Warning: {} has null values. This means {} will have null values. Make sure this is expected.'.format(col, new_col))

    # Raw codes are expected to be 1 and 2 (e.g. SEX: 1 male, 2 female); subtracting 1 maps
    # them to 0/1, which the nullable "boolean" dtype turns into False/True and keeps nulls as NA.
    preprocessed_df[new_col] = (preprocessed_df[col] - 1).astype("boolean")
    if drop_col:
        preprocessed_df.drop(columns=[col], inplace=True)

In [None]:
# Scratch check: number of respondents NOT in school according to the raw coding.
# Reads SCHOOL from the untouched `df`, because the conversion cell drops it from preprocessed_df.
(~(df.SCHOOL.replace(0, np.nan) - 1).astype("boolean")).sum()

In [None]:
# Scratch check: how many respondents map to True / False after conversion.
# Reads SCHOOL from the untouched `df`, because the conversion cell drops it from preprocessed_df.
(df.SCHOOL.replace(0, np.nan) - 1).astype("boolean").value_counts()

In [None]:
# Scratch check: the intermediate 0/1/NaN values before the cast to boolean.
# Reads SCHOOL from the untouched `df`, because the conversion cell drops it from preprocessed_df.
(df.SCHOOL.replace(0, np.nan) - 1)

Preprocess the HISPAN column into an isHispanic boolean column. Dropping HISPAN and HISPAND (the detailed version of HISPAN) removes information about Hispanic respondents' specific background, so don't drop these columns if you want to preserve this information.

In [None]:
# HISPAN is 0 for non-Hispanic respondents and nonzero for Hispanic respondents.
# Code 9 stands for an unknown value and shouldn't be present in our data, so stop if it shows up.
assert(not (preprocessed_df.HISPAN == 9).any())

# Any nonzero code becomes True
preprocessed_df['isHispanic'] = preprocessed_df.HISPAN.astype(bool)

# Drop these if you don't care about details. HISPAND is the more detailed version of HISPAN.
preprocessed_df.drop(columns=['HISPAN', 'HISPAND'], inplace=True)

sameSexMarriage is true if the respondent's spouse is of the same sex as the respondent, and false otherwise (including if the respondent has no spouse). Note that this is not a perfect analogue for sexuality, as anyone who is not married will be marked as false and the data only includes binary sex, not gender.

In [None]:
# Compare each respondent's sex with the sex recorded for their spouse (SEX_SP)
spouse_has_same_sex = preprocessed_df['SEX'].eq(preprocessed_df['SEX_SP'])
preprocessed_df['sameSexMarriage'] = spouse_has_same_sex

# Both source columns are no longer needed once the flag exists
preprocessed_df.drop(columns=['SEX', 'SEX_SP'], inplace=True)
preprocessed_df.sameSexMarriage.sum()  # Shows number of people in same sex marriages

Converts housing type to binary column. Values of 1-2 refer to households, while values of 3-5 refer to group quarters. Values of 0 or 6 should not be present.

Note that group quarters refers to living arrangements like rooming houses or military barracks, with a large number of units with individuals unrelated to the respondent.

In [None]:
# Codes 0 and 6 are not expected in this data, so every GQ value must lie strictly between them
assert(0 < preprocessed_df.GQ.min() and preprocessed_df.GQ.max() < 6)

# Codes above 2 are treated as group quarters; codes 1-2 are households.
# NOTE(review): IPUMS appears to document GQ = 5 as "additional households under 2000
# definition" rather than group quarters - confirm whether 5 belongs on the household side.
preprocessed_df['isGroupQuarters'] = preprocessed_df['GQ'].gt(2)
preprocessed_df.drop(columns=['GQ'], inplace=True)

Converts birthplace to binary column bornInUS. Note that this includes US outlying areas and territories: American Samoa, Guam, Puerto Rico, US Virgin Islands, and other US possessions.

In [None]:
# Birthplace codes up to 120 count as "born in the US"; the 100-120 range is the subset
# covering US outlying areas and territories.
born_in_us = (preprocessed_df.BPL <= 120)
born_in_outlying_area = (preprocessed_df.BPL >= 100) & born_in_us

print('Number of people born in US: {}. Number of those who were born in US outlying areas: {}'.format(
    born_in_us.sum(), born_in_outlying_area.sum()))

# Keep only the boolean flag; the general and detailed birthplace codes are dropped
preprocessed_df['bornInUS'] = born_in_us
preprocessed_df.drop(columns=['BPL', 'BPLD'], inplace=True)

#### Preprocess Categorical Columns

The next cells preprocess data that is split among multiple different categorical columns.

Converts MARST (marital status) into 3 categorical columns: isMarried, wasMarried, and neverMarried.

Note that people are marked as married whether their spouse is present (MARST = 1) or absent (MARST = 2). Similarly, people are marked as wasMarried whether they are separated (MARST = 3), divorced (MARST = 4), or widowed (MARST = 5).

In [None]:
marital_status = preprocessed_df['MARST']

# 1-2: currently married (spouse present or absent)
preprocessed_df['isMarried'] = marital_status.le(2)
# 3-5: previously married (separated, divorced, or widowed)
preprocessed_df['wasMarried'] = marital_status.between(3, 5)
# 6: never married
preprocessed_df['neverMarried'] = marital_status.eq(6)

preprocessed_df.drop(columns=['MARST'], inplace=True)

Converts SPEAKENG (which includes data on whether and how well the respondent speaks English) into 3 columns:

1. Speaks English
2. Speaks English well
3. Speaks only English

Note that how well the respondent speaks English is self-reported by the respondent rather than being evaluated with an objective metric.

In [None]:
# Ensure only correct values are present in column
assert(sorted(preprocessed_df.SPEAKENG.unique().tolist()) == [0, 1, 3, 4, 5, 6])

speakeng = preprocessed_df.SPEAKENG

# Respondents who didn't answer are coded 0; their flags must be missing rather than False
noResponse = (speakeng == 0)

# Boolean columns for English speaking ability, keyed by the new column name
english_flags = {
    'speaksEnglish': speakeng > 1,
    'speaksOnlyEnglish': speakeng == 3,
    # Note: if respondent only speaks English, they will be marked as speaksEnglishWell
    'speaksEnglishWell': (speakeng > 2) & (speakeng < 6),
}

for col, flag in english_flags.items():
    # Use the nullable "boolean" dtype so missing answers are stored as NA. Writing np.nan
    # into a plain bool column would upcast it to object dtype, which costs far more memory
    # and is inconsistent with the other boolean columns created in this notebook.
    preprocessed_df[col] = flag.astype("boolean").mask(noResponse)

preprocessed_df.drop(columns=['SPEAKENG'], inplace=True)

#### Miscellaneous Preprocessing

In [None]:
# YRMARR is the year in which the respondent was last married. If the respondent was never
# married, YRMARR is 0. This is converted to NaN.
# The result is assigned back to the frame: an in-place replace() on the Series returned by
# `preprocessed_df.YRMARR` is a chained operation and does not update the frame under
# pandas Copy-on-Write.
preprocessed_df['YRMARR'] = preprocessed_df['YRMARR'].replace(0, np.nan)

# Ensure there are no discrepancies in the data: the marriage year must be missing
# exactly for the respondents flagged as neverMarried.
assert (preprocessed_df.YRMARR.isna() == preprocessed_df.neverMarried).all()

In [None]:
# Race is already captured by the boolean race columns, so the coded RACE / RACED columns
# are removed. Don't drop these if you want more detailed information.
race_code_columns = ['RACE', 'RACED']
preprocessed_df.drop(columns=race_code_columns, inplace=True)

In [None]:
# Ancestry is not used to train the model, so all four ancestry columns are removed
ancestry_columns = ['ANCESTR1', 'ANCESTR2', 'ANCESTR1D', 'ANCESTR2D']
preprocessed_df.drop(columns=ancestry_columns, inplace=True)

In [None]:
# Languages spoken at home. These are excluded in favor of SPEAKENG, which provides less granular information.
# NOTE(review): IPUMS appears to name the general variable LANGUAGE (detailed: LANGUAGED) -
# confirm that 'LANGUAGE1' matches the CSV header, otherwise this drop raises a KeyError.
language_columns = ['LANGUAGE1', 'LANGUAGED']
preprocessed_df.drop(columns=language_columns, inplace=True)

In [None]:
# Show which columns remain after preprocessing
preprocessed_df.columns

In [None]:
# Scratch check: number of respondents with birthplace code 950.
# Reads BPL from the untouched `df`, because BPL is dropped from preprocessed_df once bornInUS is created.
(df.BPL == 950).sum()

In [None]:
# Show the distinct values present in the AGE column
preprocessed_df.AGE.unique()

In [None]:
# Report memory usage of the original, unprocessed frame.
# NOTE(review): this inspects `df`, not `preprocessed_df`; if the goal is to show the effect
# of preprocessing, it presumably should report on `preprocessed_df` - confirm.
df.info(memory_usage="deep")

In [None]:
# Get the number of columns with each type (for the original, unprocessed `df`)
df.dtypes.value_counts()

In [None]:
# Show memory usage of each column of df, in bytes.
# No deep=True is passed here, so object-dtype columns are not measured in depth.
df.memory_usage()