In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw extract from disk, then show its row count
df = pd.read_csv(filepath_or_buffer='usa_00003.csv')
df.shape[0]

In [None]:
# Summarise df's columns; memory_usage='deep' requests the real (not estimated) memory footprint
df.info(memory_usage='deep')

In [None]:
# Tally how many columns of df share each dtype
column_dtypes = df.dtypes
column_dtypes.value_counts()

In [None]:
# Work on an independent (deep) copy so the raw frame in df stays untouched
preprocessed_df = df.copy(deep=True)

## Set indices

The SAMPLE and SERIAL columns can uniquely identify every household, which when combined with PERNUM (a number that uniquely identifies members in each household) can uniquely identify each person. Since all data is from the same 2021 sample, SAMPLE is unnecessary. Therefore, SERIAL and PERNUM are used as indices (after being downcast to uint dtype in order to use less memory).

In [None]:
index_columns = ['SERIAL', 'PERNUM']

# Downcasting to an unsigned dtype is only valid when no identifier is negative
assert all(preprocessed_df[name].min() >= 0 for name in index_columns)

# Shrink both identifier columns to the smallest unsigned integer dtype that fits
for name in index_columns:
    preprocessed_df[name] = pd.to_numeric(preprocessed_df[name], downcast='unsigned')

serial_dtype, pernum_dtype = (preprocessed_df[name].dtype for name in index_columns)
print("New datatypes: SERIAL: {}, PERNUM: {}".format(serial_dtype, pernum_dtype))

# (SERIAL, PERNUM) becomes the row index
preprocessed_df.set_index(index_columns, inplace=True)
preprocessed_df.head()

## Remove unnecessary columns

These columns don't encode any useful data and are therefore dropped.

In [None]:
# Every row comes from the 2021 IPUMS sample, so YEAR and SAMPLE carry no information
unnecessary_columns = ['YEAR', 'SAMPLE']

# Drop each of these columns that is still present, first checking it really is constant
for name in unnecessary_columns:
    if name not in preprocessed_df.columns:
        continue
    distinct_values = preprocessed_df[name].unique()
    assert len(distinct_values) == 1
    print('Dropping {} column; original value: {}'.format(name, distinct_values[0]))
    preprocessed_df.drop(columns=[name], inplace=True)

In [None]:
# Sanity check: report any column that holds a single value. Nothing should be printed once
# the previous cell has run; anything that is printed is probably a candidate for dropping.
for name in preprocessed_df.columns:
    distinct_values = preprocessed_df[name].unique()
    if len(distinct_values) != 1:
        continue
    print(name, distinct_values)

## Drop Quality Columns

Data quality columns give information about whether the response was altered by staff. The method of this alteration is generally not specified. These columns are dropped because they are unnecessary when training the model (though they can be useful for analyzing potential discrepancies in data).

Note that the original data contains 44 data quality flags, so this many columns should be dropped by the next cell.

In [None]:
# Quality flags are the columns whose name begins with 'Q'
quality_columns = []
for name in preprocessed_df.columns:
    if name[0] == 'Q':
        quality_columns.append(name)

# A flag is expected to sit next to the column it describes (its name minus the leading 'Q');
# warn when that partner is missing, since the match may then be a false positive.
for flag in quality_columns:
    described = flag[1:]
    if described in preprocessed_df.columns:
        continue
    print(f"Dropping {flag} but no column {described} exists. Make sure this isn't an error.")

preprocessed_df.drop(columns=quality_columns, inplace=True)
print(f'Dropped {len(quality_columns)} columns')

## Preprocess columns

The below section will preprocess columns in order to make data easier to understand and to reduce memory usage. Each cell processes a different column.

#### Preprocess binary columns

First, get a list of columns with 2 unique values (which can be turned into boolean columns).

In [None]:
# List the candidate boolean columns: exactly 2 distinct values, or 3 when one of them is NaN
for name in preprocessed_df.columns:
    column = preprocessed_df[name]
    n_unique = len(column.unique())
    is_binary = n_unique == 2 or (n_unique == 3 and column.isnull().values.any())
    if is_binary:
        print('{} ({}): {}'.format(name, column.dtype, n_unique))

Next, preprocess any columns that should have NaN values (this is necessary as often responses that should be NaN such as no response or N/A are instead coded as 0).

In [None]:
# In SCHOOL and CARPOOL a value of 0 stands for "no response / N/A"; recode it as NaN.
# The result is assigned back to the frame instead of calling replace(..., inplace=True) on
# the column attribute: that chained in-place form acts on a temporary Series, emits a
# FutureWarning in pandas 2.2 and leaves the DataFrame unchanged under Copy-on-Write.
for name in ('SCHOOL', 'CARPOOL'):
    preprocessed_df[name] = preprocessed_df[name].replace(0, np.nan)

Below is a list of columns that will be converted to boolean columns. Each tuple contains the current column name, the name of the boolean column that it'll be converted to, and a boolean that determines whether the column will be dropped when the boolean column is created. Unless you have a good reason to keep the original column, you should drop it once the boolean column is created.

In [None]:
# Each entry is (source column, name of the boolean column to create, drop the source afterwards?)
boolean_column_conversions = [
    # SEX: 1 = male, 2 = female (no other options). The source column is kept (drop flag is False).
    ('SEX', 'isFemale', False),

    # Binary race indicators. One person can report several races; RACNUM holds how many were selected.
    ('RACAMIND', 'isAmericanIndian', True),
    ('RACASIAN', 'isAsian', True),
    ('RACBLK', 'isBlack', True),
    ('RACPACIS', 'isPacificIslander', True),
    ('RACWHT', 'isWhite', True),
    ('RACOTHER', 'isOtherRace', True),

    # --- Health insurance coverage ---

    # Private coverage: the union of HINSEMP, HINSPUR and HINSTRI
    ('HCOVPRIV', 'hasPrivateHealthInsurance', True),
    # Insurance through a current/former employer or union, including coverage via a family member's plan
    ('HINSEMP', 'hasEmployerHealthInsurance', True),
    # Insurance purchased directly by the individual or family
    ('HINSPUR', 'hasPurchasedPrivHealthInsurance', True),
    # TRICARE and other military health programs
    ('HINSTRI', 'hasMilitaryHealthInsurance', True),

    # Public coverage: the union of HINSCARE, HINSCAID and HINSVA
    ('HCOVPUB', 'hasPublicHealthInsurance', True),
    ('HINSCARE', 'hasMedicare', True),
    # Medicaid, Medical Assistance, or other government plans for people with low income or a disability
    ('HINSCAID', 'hasMedicaid', True),
    # Everyone who has ever used or enrolled in VA health care
    ('HINSVA', 'hasVeteransHealthInsurance', True),

    # Insurance obtained through the Indian Health Service
    ('HINSIHS', 'hasIndianHealthInsurance', True),

    # HCOVANY covers HINSEMP, HINSPUR, HINSTRI, HINSCARE, HINSCAID and HINSVA.
    # IHS insurance is not counted in HCOVANY (IPUMS says IHS policies are not always comprehensive).
    ('HCOVANY', 'hasHealthInsurance', True),

    # Whether the respondent is currently in school
    ('SCHOOL', 'isInSchool', True),

    # Whether the respondent carpools (as opposed to driving alone)
    ('CARPOOL', 'carpools', True),
]

In [None]:
# Build one nullable-boolean column per entry of boolean_column_conversions
for source_name, flag_name, drop_source in boolean_column_conversions:
    if source_name not in preprocessed_df.columns:
        # A missing source only matters when the boolean column has not been created yet
        if flag_name not in preprocessed_df.columns:
            print('Warning: {} not in preprocessed_df so {} not generated'.format(source_name, flag_name))
        continue

    source = preprocessed_df[source_name]

    # A binary column has at most two codes, plus optionally NaN
    assert len(source.unique()) <= 3

    # Subtract 1 before casting, so a 1/2 coding (as noted for SEX) becomes False/True
    preprocessed_df[flag_name] = (source - 1).astype("boolean")

    if source.isnull().values.any():
        print('Warning: {} has null values. This means {} will have null values. Make sure this is expected.'.format(source_name, flag_name))
        assert preprocessed_df[flag_name].isna().values.any()

    if drop_source:
        preprocessed_df.drop(columns=[source_name], inplace=True)

Preprocess the HISPAN column into the isHispanic boolean column. Dropping HISPAN and HISPAND (the detailed version of HISPAN) removes information about Hispanic respondents' specific background, so don't drop these columns if you want to preserve this information.

In [None]:
# HISPAN is 0 for non-Hispanic respondents and nonzero for Hispanic respondents
hispan = preprocessed_df.HISPAN

# Code 9 stands for "unknown" and should not occur in this data
assert 9 not in hispan.values

# Any nonzero code casts to True
preprocessed_df['isHispanic'] = hispan.astype(bool)

# HISPAND is the more detailed version of HISPAN. Keep both columns if that detail matters.
for name in ('HISPAN', 'HISPAND'):
    preprocessed_df.drop(columns=[name], inplace=True)

preprocessed_df['isHispanic'].value_counts()

sameSexMarriage is true if the respondent's spouse is of the same sex as the respondent, and false otherwise (including when the respondent has no spouse). Note that this is not a perfect analogue for sexuality: anyone who is not married is marked as false, and the data only includes binary sex, not gender.

In [None]:
# True only where the respondent's SEX equals the spouse's SEX_SP; a missing SEX_SP compares unequal
spouse_has_same_sex = preprocessed_df['SEX'].eq(preprocessed_df['SEX_SP'])
preprocessed_df['sameSexMarriage'] = spouse_has_same_sex

# The comparison must never produce a null
assert not preprocessed_df['sameSexMarriage'].isnull().values.any()

preprocessed_df.drop(columns=['SEX', 'SEX_SP'], inplace=True)
preprocessed_df['sameSexMarriage'].sum()  # Number of people in same sex marriages

mixedRaceMarriage is true if the respondent's spouse is of a different race than the respondent, and false otherwise (including when the respondent has no spouse).

In [None]:
# A marriage counts as mixed-race only when a spouse race is recorded and differs from RACE
has_spouse_race = preprocessed_df.RACE_SP.notnull()
spouse_race_differs = preprocessed_df.RACE.ne(preprocessed_df.RACE_SP)
preprocessed_df['mixedRaceMarriage'] = has_spouse_race & spouse_race_differs

# The flag must never be null
assert not preprocessed_df['mixedRaceMarriage'].isnull().values.any()

# RACE/RACED (and the spouse versions) are dropped because race is already encoded in other
# columns. Keep them if more detailed information is wanted.
preprocessed_df.drop(columns=['RACE', 'RACED', 'RACE_SP', 'RACED_SP'], inplace=True)

preprocessed_df['mixedRaceMarriage'].sum()  # Number of people in mixed race marriages

Converts housing type to binary column. Values of 1-2 refer to households, while values of 3-5 refer to group quarters. Values of 0 or 6 should not be present.

Note that group quarters refers to living arrangements like rooming houses or military barracks, with a large number of units housing individuals unrelated to the respondent.

In [None]:
gq = preprocessed_df['GQ']

# Only codes 1-5 are expected (0 and 6 should not be present)
assert 0 < gq.min() and gq.max() < 6

# Codes 1-2 are households; anything above 2 is group quarters
preprocessed_df['isGroupQuarters'] = gq.gt(2)
preprocessed_df.drop(columns=['GQ'], inplace=True)

Converts birthplace to binary column bornInUS. Note that this includes US outlying areas and territories: American Samoa, Guam, Puerto Rico, US Virgin Islands, and other US possessions.

In [None]:
birthplace = preprocessed_df.BPL

# BPL codes up to 120 count as US-born; the 100-120 range is the US outlying areas
born_in_us = birthplace <= 120
born_in_outlying_area = (birthplace >= 100) & (birthplace <= 120)

print('Number of people born in US: {}. Number of those who were born in US outlying areas: {}'.format(
    born_in_us.sum(), born_in_outlying_area.sum()))

preprocessed_df['bornInUS'] = born_in_us
preprocessed_df.drop(columns=['BPL', 'BPLD'], inplace=True)

#### Preprocess Categorical Columns

The next cells preprocess data that is split among multiple different categorical columns.

Converts MARST (marital status) into 3 categorical columns: isMarried, wasMarried, and neverMarried.

Note that people are marked as married whether their spouse is present (MARST = 1) or absent (MARST = 2). Similarly, people are marked as wasMarried whether they are separated (MARST = 3), divorced (MARST = 4), or widowed (MARST = 5).

In [None]:
marst = preprocessed_df['MARST']

# MARST 1-2: married (spouse present/absent); 3-5: separated, divorced or widowed; 6: never married
preprocessed_df['isMarried'] = marst.le(2)
preprocessed_df['wasMarried'] = marst.between(3, 5)
preprocessed_df['neverMarried'] = marst.eq(6)

preprocessed_df.drop(columns=['MARST'], inplace=True)

Converts SPEAKENG (which includes data on whether and how well the respondant speaks English) into 3 columns:

1. Speaks English
2. Speaks English well
3. Speaks only English

Note that how well the respondant speaks English is self reported by the respondant rather than being evaluated with an objective metric.

In [None]:
speakeng = preprocessed_df.SPEAKENG

# Ensure only the expected codes are present in the column
assert sorted(speakeng.unique().tolist()) == [0, 1, 3, 4, 5, 6]

# Code 0 means the respondent gave no answer
noResponse = (speakeng == 0)

# Boolean flags for English speaking ability, in the order the columns are created.
# Note: a respondent who speaks only English (code 3) is also marked as speaksEnglishWell.
english_ability = {
    'speaksEnglish': speakeng > 1,
    'speaksOnlyEnglish': speakeng == 3,
    'speaksEnglishWell': (speakeng > 2) & (speakeng < 6),
}

for col, flag in english_ability.items():
    # Use the nullable "boolean" dtype (as the other categorical cells do) so that missing
    # answers can be stored without upcasting a plain bool column to object; writing NaN
    # into a bool column is a deprecated incompatible-dtype assignment in recent pandas.
    preprocessed_df[col] = flag.astype("boolean")
    preprocessed_df.loc[noResponse, col] = np.nan

preprocessed_df.drop(['SPEAKENG'], axis=1, inplace=True)

Converts EDUCD (which has data about highest educational attainment) into columns:
1. No schooling
2. Up to grade 4
3. Up to grade 8
4. Some highschool (no diploma)
5. High school diploma
6. Some college, no degree
7. Associate's Degree
8. Bachelor's Degree
9. Bachelor's Degree plus some professional degree
10. Master's Degree
11. Doctoral Degree

Some information about these education levels:
* No schooling is not the same as N/A (which is marked with NaN in each column)
* Up to grade 4 includes those whose maximum educational attainment is preschool or kindergarten (~40k and ~37k respondants respectively). This could be worth separating into another column.
* "Professional degree" in column 9 refers to the fact that ~57k respondants are marked as having a professional degree beyond a bachelor's degree. Note that this professional degree is not a master's or PhD, which is labelled in separate columns.

In [None]:
educd = preprocessed_df.EDUCD

# 999 marks a missing value; checking the bounds guarantees it is absent
assert educd.min() == 1 and educd.max() == 116

# One boolean mask per education level, keyed by the column it becomes (insertion order = column order)
education_masks = {
    'noSchooling': educd == 2,
    'maxGrade4': (educd >= 11) & (educd <= 17),
    'maxGrade8': (educd >= 22) & (educd <= 26),
    'maxSomeHS': (educd >= 30) & (educd <= 61),
    'highSchoolDiploma': (educd == 63) | (educd == 64),
    'someCollege': (educd == 65) | (educd == 71),
    'associatesDegree': educd == 81,
    'bachelorsDegree': educd == 101,
    'mastersDegree': educd == 114,
    'bachelorsPlusProfessionalDegree': educd == 115,
    'doctoralDegree': educd == 116,
}
for name, mask in education_masks.items():
    preprocessed_df[name] = mask.astype("boolean")

# Code 1 is N/A; these rows become NaN in every education column
nanMask = (educd == 1)

# Every respondent must fall into one of the levels or be N/A
covered = nanMask
for name in education_masks:
    covered = covered | preprocessed_df[name]
assert not (~covered).values.any()

# Blank out the N/A rows, checking that exactly those rows end up null
n_missing = nanMask.values.sum()
for name in education_masks:
    preprocessed_df.loc[nanMask, name] = np.nan
    assert preprocessed_df[name].isnull().values.sum() == n_missing

preprocessed_df.drop(columns=['EDUC', 'EDUCD'], inplace=True)

In [None]:
# A positive DEGFIELD2 code means a second degree field was reported
preprocessed_df['has2ndDegree'] = preprocessed_df.DEGFIELD2.gt(0).astype("boolean")

# Consistency check: nobody should have a second degree without holding a degree at all
has_any_degree = (
    preprocessed_df.bachelorsDegree
    | preprocessed_df.mastersDegree
    | preprocessed_df.bachelorsPlusProfessionalDegree
    | preprocessed_df.doctoralDegree
).astype("boolean")
second_degree_without_degree = preprocessed_df.has2ndDegree & ~has_any_degree
assert not second_degree_without_degree.values.any()

# Only the existence of a second degree is kept; its field is discarded
preprocessed_df.drop(columns=['DEGFIELD2', 'DEGFIELD2D'], inplace=True)

The following 3 columns encode a person's employment status:
1. If they are employed
2. If they are unemployed
3. If they are not in the labor force
Note that if a person's employment status is N/A, each of these columns will be set to NaN

In [None]:
empstat = preprocessed_df.EMPSTAT

# EMPSTAT code -> boolean column (1 employed, 2 unemployed, 3 not in labor force)
employment_codes = {'isEmployed': 1, 'isUnemployed': 2, 'isNotInLaborForce': 3}
for name, code in employment_codes.items():
    preprocessed_df[name] = empstat.eq(code).astype("boolean")

# Code 0 is N/A; those rows become NaN in all three columns
nanMask = empstat.eq(0)
n_missing = nanMask.values.sum()
print(f"{n_missing} of {len(preprocessed_df)} respondants are missing employment status")

# Blank out the N/A rows, checking that exactly those rows end up null
for name in employment_codes:
    preprocessed_df.loc[nanMask, name] = np.nan
    assert preprocessed_df[name].isnull().values.sum() == n_missing

# The three columns must be null on exactly the same rows
reference_na = preprocessed_df.isEmployed.isna()
assert all(reference_na.equals(preprocessed_df[name].isna()) for name in employment_codes)

preprocessed_df.drop(columns=['EMPSTAT', 'EMPSTATD'], inplace=True)

The following 4 columns encode information about workers' class. Workers are initially separated into 3 categories: self-employed, working for wages, or N/A. Workers who work for wages are further subdivided into public or private sector workers, as well as a few (~6k) classed as unpaid family workers. Note that more granular information, such as the level of government (federal/state/local) for public sector workers or whether private sector workers work at for-profit or non-profit organizations, is discarded.

The columns are
1. isSelfEmployed
2. isPrivateSector
3. isPublicSector
4. isUnpaidFamilyWorker

If a person responded with N/A, these columns are all marked as NaN.

In [None]:
classwkrd = preprocessed_df.CLASSWKRD

# Only these detailed class-of-worker codes are expected
assert sorted(classwkrd.unique().tolist()) == [0, 13, 14, 22, 23, 25, 27, 28, 29]

# One mask per worker class, keyed by the column it becomes
worker_class_masks = {
    'isSelfEmployed': classwkrd.isin([13, 14]),
    'isPrivateSector': classwkrd.isin([22, 23]),
    'isPublicSector': classwkrd.between(25, 28),
    'isUnpaidFamilyWorker': classwkrd.eq(29),
}
for name, mask in worker_class_masks.items():
    preprocessed_df[name] = mask.astype("boolean")

# Code 0 is N/A; those rows become NaN in all four columns
nanMask = classwkrd.eq(0)
n_missing = nanMask.values.sum()
print(f"{n_missing} of {len(preprocessed_df)} respondants have N/A for CLASSWKRD")

# Blank out the N/A rows, checking that exactly those rows end up null
for name in worker_class_masks:
    preprocessed_df.loc[nanMask, name] = np.nan
    assert preprocessed_df[name].isnull().values.sum() == n_missing

# The four columns must be null on exactly the same rows
reference_na = preprocessed_df.isSelfEmployed.isna()
assert all(reference_na.equals(preprocessed_df[name].isna()) for name in worker_class_masks)

preprocessed_df.drop(columns=['CLASSWKR', 'CLASSWKRD'], inplace=True)

The following 3 columns indicate whether an individual worked in the previous year, and if not, whether they worked 1-5 years ago. N/A responses have each of these columns set to NaN. The columns are:
1. employedLastYear
2. employed1to5YrsAgo
3. unemployedLast5Yrs

In [None]:
workedyr = preprocessed_df.WORKEDYR

# WORKEDYR code -> boolean column
worked_codes = {'employedLastYear': 3, 'employed1to5YrsAgo': 2, 'unemployedLast5Yrs': 1}
for name, code in worked_codes.items():
    preprocessed_df[name] = workedyr.eq(code).astype("boolean")

# Code 0 is N/A; those rows become NaN in all three columns
nanMask = workedyr.eq(0)
n_missing = nanMask.values.sum()
print(f"{n_missing} of {len(preprocessed_df)} respondants have N/A for WORKEDYR")

# Blank out the N/A rows, checking that exactly those rows end up null
for name in worked_codes:
    preprocessed_df.loc[nanMask, name] = np.nan
    assert preprocessed_df[name].isnull().values.sum() == n_missing

# The three columns must be null on exactly the same rows
reference_na = preprocessed_df.employedLastYear.isna()
assert all(reference_na.equals(preprocessed_df[name].isna()) for name in worked_codes)

preprocessed_df.drop(columns=['WORKEDYR'], inplace=True)

The following columns indicate how an individual got to work. Note that "N/A" and "other" responses have each of these columns set to NaN. The columns are:
1. commutePrivateVehicle
2. commutePublicTransportation
3. commuteBikeOrWalk
4. workFromHome
Note that if a worker uses taxis to commute to/from work (~2k respondants), they are considered to use public transport. There are ~1.8 million N/A responses and ~16k "other" responses.

In [None]:
tranwork = preprocessed_df.TRANWORK

# One mask per commute mode, keyed by the column it becomes (TRANWORK code ranges are inclusive)
commute_masks = {
    'commutePrivateVehicle': tranwork.between(10, 20),
    'commutePublicTransportation': tranwork.between(30, 40),
    'commuteBikeOrWalk': tranwork.between(50, 60),
    'workFromHome': tranwork.eq(80),
}
for name, mask in commute_masks.items():
    preprocessed_df[name] = mask.astype("boolean")

# N/A (code 0) and "other" (code 70) both become NaN in all four columns
nanMask = tranwork.isin([0, 70])
n_missing = nanMask.values.sum()
print(f"{n_missing} of {len(preprocessed_df)} respondants have N/A or other for TRANWORK")

# Blank out those rows, checking that exactly those rows end up null
for name in commute_masks:
    preprocessed_df.loc[nanMask, name] = np.nan
    assert preprocessed_df[name].isnull().values.sum() == n_missing

# The four columns must be null on exactly the same rows
reference_na = preprocessed_df.commutePrivateVehicle.isna()
assert all(reference_na.equals(preprocessed_df[name].isna()) for name in commute_masks)

preprocessed_df.drop(columns=['TRANWORK'], inplace=True)

The following 2 columns encode whether someone is attending public or private school. Note that each row with isInSchool=True will be marked as either in public or private. Those not in school will be marked as false, while those for whom data is unavailable will have NaN values in these 2 columns (as well as in isInSchool).

In [None]:
schltype = preprocessed_df.SCHLTYPE
school_type_columns = ['attendingPublicSchool', 'attendingPrivateSchool']

# SCHLTYPE 2 = public school, 3 = private school
preprocessed_df['attendingPublicSchool'] = schltype.eq(2).astype("boolean")
preprocessed_df['attendingPrivateSchool'] = schltype.eq(3).astype("boolean")

# Code 0 is N/A; those rows become NaN below
nanMask = schltype.eq(0)

# Cross-check against isInSchool (which this cell never modifies): the N/A rows must coincide,
# and everyone in school must be in exactly the rows flagged as public or private
in_school_missing = preprocessed_df.isInSchool.isnull()
assert nanMask.equals(in_school_missing)
attends_any_school = preprocessed_df.attendingPublicSchool | preprocessed_df.attendingPrivateSchool
assert preprocessed_df.isInSchool.fillna(False).equals(attends_any_school)

# Blank out the N/A rows, checking that exactly those rows end up null
n_missing = nanMask.values.sum()
for name in school_type_columns:
    preprocessed_df.loc[nanMask, name] = np.nan
    assert preprocessed_df[name].isnull().values.sum() == n_missing

# After blanking, the null rows must still line up with isInSchool
for name in school_type_columns:
    assert in_school_missing.equals(preprocessed_df[name].isnull())

preprocessed_df.drop(columns=['SCHLTYPE'], inplace=True)

#### Miscellaneous Preprocessing

In [None]:
# YRMARR is the year of the respondent's most recent marriage; 0 means never married and
# is converted to NaN.
# The result is assigned back instead of using the chained `df.YRMARR.replace(..., inplace=True)`
# form, which acts on a temporary Series and has no effect on the frame under Copy-on-Write.
preprocessed_df['YRMARR'] = preprocessed_df['YRMARR'].replace(0, np.nan)

# Ensure there are no discrepancies: YRMARR must be missing exactly for the never-married rows
assert (preprocessed_df['YRMARR'].isna() == preprocessed_df['neverMarried']).all()

In [None]:
# YRNATUR is the year the respondent was naturalized as a US citizen; 9999 means never
# naturalized and is converted to NaN.
# Assigned back rather than replaced in place on the column attribute (a chained in-place
# call that does not update the frame under Copy-on-Write).
preprocessed_df['YRNATUR'] = preprocessed_df['YRNATUR'].replace(9999, np.nan)

In [None]:
# Ancestry is not used to train the model, so all four ancestry columns are removed
ancestry_columns = ['ANCESTR1', 'ANCESTR2', 'ANCESTR1D', 'ANCESTR2D']
preprocessed_df.drop(columns=ancestry_columns, inplace=True)

In [None]:
# Citizenship and years-in-the-US are unavailable for most (~2.8 million) respondents, so both are removed
preprocessed_df.drop(columns=['CITIZEN', 'YRSUSA1'], inplace=True)

In [None]:
# Language spoken at home is removed; the less granular SPEAKENG information is used instead
preprocessed_df.drop(columns=['LANGUAGE', 'LANGUAGED'], inplace=True)

In [None]:
# LABFORCE duplicates what was already derived from EMPSTAT
preprocessed_df.drop(columns=['LABFORCE'], inplace=True)

In [None]:
# WKSWORK1 = 0 represents N/A and is converted to NaN.
# Assigned back rather than replaced in place on the column attribute (a chained in-place
# call that does not update the frame under Copy-on-Write).
preprocessed_df['WKSWORK1'] = preprocessed_df['WKSWORK1'].replace(0, np.nan)

In [None]:
# UHRSWORK = 0 represents N/A and is converted to NaN.
# Assigned back rather than replaced in place on the column attribute (a chained in-place
# call that does not update the frame under Copy-on-Write).
preprocessed_df['UHRSWORK'] = preprocessed_df['UHRSWORK'].replace(0, np.nan)

In [None]:
# WKSWORK2 is a less granular version of WKSWORK1 (weeks worked in the last year), so it is removed
preprocessed_df.drop(columns=['WKSWORK2'], inplace=True)

In [None]:
# IND (industry) is removed because the respondent's occupation is more relevant;
# OCC is removed because the 2010 occupation coding (OCC2010) is used instead.
for name in ('IND', 'OCC'):
    preprocessed_df.drop(columns=[name], inplace=True)

In [None]:
# CBSERIAL (the original Census Bureau household serial number) is removed in favor of SERIAL
preprocessed_df.drop(columns=['CBSERIAL'], inplace=True)

In [None]:
adjusted_wage = preprocessed_df.INCWAGE_CPIU_2010

# Neither the "missing" (999998) nor the "N/A" (999999) code may be present, and wages cannot be negative
assert adjusted_wage.max() < 999998
assert adjusted_wage.min() >= 0

# Keep only INCWAGE_CPIU_2010 (wages adjusted for inflation to 2010 dollars)
preprocessed_df.drop(columns=['INCWAGE'], inplace=True)

In [None]:
# Total income (all sources, not just salary) and family income are removed
preprocessed_df.drop(columns=['INCTOT', 'FTOTINC'], inplace=True)

In [None]:
# POVERTY records how far income is above/below the poverty line; keeping it would let the
# model cheat by learning where the poverty line is
preprocessed_df.drop(columns=['POVERTY'], inplace=True)

In [None]:
# The earnings and education scores are calculated by IPUMS; ideally the model should not need them
preprocessed_df.drop(columns=['ERSCOR90', 'EDSCOR90'], inplace=True)

In [None]:
# The number of riders in the vehicle used to get to work is removed
preprocessed_df.drop(columns=['RIDERS'], inplace=True)

In [None]:
# The smallest TRANTIME must be the N/A code 0 (so no negative values are present)
assert preprocessed_df['TRANTIME'].min() == 0

# TRANTIME = 0 represents N/A and is converted to NaN.
# Assigned back rather than replaced in place on the column attribute (a chained in-place
# call that does not update the frame under Copy-on-Write).
preprocessed_df['TRANTIME'] = preprocessed_df['TRANTIME'].replace(0, np.nan)

In [None]:
# Report how many columns remain, then display their names
remaining_columns = preprocessed_df.columns
print(f"{len(remaining_columns)} columns in total")
remaining_columns

In [None]:
# Preview the first five rows of the preprocessed frame
preprocessed_df.head(n=5)

In [None]:
# Tally how many columns of the preprocessed frame share each dtype
preprocessed_dtypes = preprocessed_df.dtypes
preprocessed_dtypes.value_counts()

In [None]:
# Per-column memory usage in bytes (the index is included as its own entry)
preprocessed_df.memory_usage(index=True)

## Save preprocessed data

At this point, all of the simple preprocessing is complete. Preprocessing that will create a lot more categorical columns, such as preprocessing of DEGFIELD, OCC2010, and PWSTATE2 is done in a separate notebook. This ensures the file created by this notebook will not have too many columns.

In [None]:
import csv
import gzip

# Write a gzip-compressed, pipe-delimited copy with every field quoted.
# index=True writes the (SERIAL, PERNUM) index as the leading columns.
gzip_csv_options = dict(
    sep='|',
    header=True,
    index=True,
    compression='gzip',
    quoting=csv.QUOTE_ALL,
    quotechar='"',
    doublequote=True,
    lineterminator='\n',
)
preprocessed_df.to_csv("init_preprocessed_data.csv.gz", **gzip_csv_options)

In [None]:
# Plain comma-separated copy, with a header row and the index written as leading columns
preprocessed_df.to_csv("init_preprocessed_data.csv", index=True, header=True)

In [None]:
# Same data with the index turned back into ordinary columns, so no separate index is written
preprocessed_df.reset_index(drop=False).to_csv(
    "init_preprocessed_data_without_index.csv",
    index=False,
    header=True,
)