In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Read the data
df = pd.read_csv('init_preprocessed_data_without_index.csv')
len(df)

  exec(code_obj, self.user_global_ns, self.user_ns)


3252599

In [3]:
df.columns

Index(['SERIAL', 'PERNUM', 'HHWT', 'CLUSTER', 'STRATA', 'PERWT', 'AGE',
       'YRMARR', 'YRNATUR', 'RACNUM', 'DEGFIELD', 'DEGFIELDD', 'OCC2010',
       'WKSWORK1', 'UHRSWORK', 'INCWAGE_CPIU_2010', 'PWSTATE2', 'PWCOUNTY',
       'PWTYPE', 'TRANTIME', 'isFemale', 'isAmericanIndian', 'isAsian',
       'isBlack', 'isPacificIslander', 'isWhite', 'isOtherRace',
       'hasPrivateHealthInsurance', 'hasEmployerHealthInsurance',
       'hasPurchasedPrivHealthInsurance', 'hasMilitaryHealthInsurance',
       'hasPublicHealthInsurance', 'hasMedicare', 'hasMedicaid',
       'hasVeteransHealthInsurance', 'hasIndianHealthInsurance',
       'hasHealthInsurance', 'isInSchool', 'carpools', 'isHispanic',
       'sameSexMarriage', 'mixedRaceMarriage', 'isGroupQuarters', 'bornInUS',
       'isMarried', 'wasMarried', 'neverMarried', 'speaksEnglish',
       'speaksOnlyEnglish', 'speaksEnglishWell', 'noSchooling', 'maxGrade4',
       'maxGrade8', 'maxSomeHS', 'highSchoolDiploma', 'someCollege',
       'assoc

In [4]:
df.head()

Unnamed: 0,SERIAL,PERNUM,HHWT,CLUSTER,STRATA,PERWT,AGE,YRMARR,YRNATUR,RACNUM,...,isUnpaidFamilyWorker,employedLastYear,employed1to5YrsAgo,unemployedLast5Yrs,commutePrivateVehicle,commutePublicTransportation,commuteBikeOrWalk,workFromHome,attendingPublicSchool,attendingPrivateSchool
0,1,1,13.0,2021000000011,80001,13.0,85,1971.0,,1,...,,False,False,True,,,,,False,False
1,2,1,51.0,2021000000021,80001,51.0,67,1970.0,,1,...,,False,False,True,,,,,False,False
2,3,1,17.0,2021000000031,120001,17.0,74,1991.0,,1,...,,False,False,True,,,,,False,False
3,4,1,61.0,2021000000041,170001,61.0,16,,,1,...,,False,False,True,,,,,False,True
4,5,1,15.0,2021000000051,50001,15.0,83,2016.0,,1,...,,False,False,True,,,,,False,False


Create state fields

In [5]:
num_nostate = (df.PWSTATE2 == 0).values.sum()
original_len = len(df)
print(f"{num_nostate} ({num_nostate/original_len:.2%}) of the rows have no state")

1803778 (55.46%) of the rows have no state


In [6]:
# Remove rows with no state
df = df[df.PWSTATE2 != 0]
assert(len(df) == original_len - num_nostate)
print(f"Removed {num_nostate} rows with no state. {len(df)} rows remain.")

Removed 1803778 rows with no state. 1448821 rows remain.


In [7]:
# Drop columns that are now unnecessary

origNumCols = len(df.columns)
droppedCols = []

for col in df.columns:
    unique = df[col].unique()
    if(len(unique) == 1):
        print(f"Dropping column {col} since it has only one value: {unique[0]}")
        droppedCols.append(col)
    elif(len(unique) == 2 and df[col].isna().values.any()):
        print(f"Warning: Column {col} has two values but you may still want to drop it: {unique[0]} and {unique[1]}")

df.drop(droppedCols, axis=1, inplace=True)
assert(len(df.columns) == origNumCols - len(droppedCols))
print(f"Dropped {len(droppedCols)} columns. {len(df.columns)} columns remain.")

Dropping column isEmployed since it has only one value: True
Dropping column isUnemployed since it has only one value: False
Dropping column isNotInLaborForce since it has only one value: False
Dropping column employedLastYear since it has only one value: True
Dropping column employed1to5YrsAgo since it has only one value: False
Dropping column unemployedLast5Yrs since it has only one value: False
Dropped 6 columns. 72 columns remain.


In [8]:
with(open('ipums_fields/stateField.json')) as f:
    state_mapping = json.load(f)

In [9]:
# Make sure each degree field is in the mapping
stateKeys = sorted([int(k) for k in state_mapping.keys() if int(k) != 0])
# Note that stateKeys doesn't include foreign countries
assert(sorted([val for val in df.PWSTATE2.unique().tolist() if val <= stateKeys[-1]]) == stateKeys)

In [10]:
df["worksOutsideUS"] = (df.PWSTATE2 > stateKeys[-1])
cols_created = 1
for (key, value) in state_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue

    stateName = value.replace(' ', '_')
    df[f"worksIn_{stateName}"] = (df.PWSTATE2 == int(key))
    cols_created += 1

print(f"Created {cols_created} columns")

Created 53 columns


In [11]:
df.drop(columns=['PWSTATE2', 'PWCOUNTY', 'PWTYPE'], inplace=True)

Create degree fields

In [12]:
with(open('ipums_fields/degField.json')) as f:
    deg_mapping = json.load(f)

In [13]:
# Make sure each degree field is in the mapping
assert(sorted(df.DEGFIELD.unique().tolist()) == sorted([int(k) for k in deg_mapping.keys()]))

In [14]:
cols_created = 0
for (key, value) in deg_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue

    degName = value.replace(' ', '_').replace(',', '')
    df[f"hasDegree_{degName}"] = (df.DEGFIELD == int(key))
    cols_created += 1

print(f"Created {cols_created} columns")

Created 37 columns


In [15]:
df.drop(columns=['DEGFIELD', 'DEGFIELDD'], inplace=True)

Create occupation fields

In [16]:
with(open('ipums_fields/occupation2010.json')) as f:
    occ_mapping = json.load(f)

In [17]:
# Make sure each degree field is in the mapping
sortedOcc = sorted([int(k) for k in occ_mapping.keys()])
uniqueCodes = df.OCC2010.unique().tolist()
removedKeys = []

for jobCode in uniqueCodes:
    assert(jobCode in sortedOcc)

for jobCode in sortedOcc:
    if jobCode not in uniqueCodes:
        removedKeys.append(jobCode)
        print(f"No examples of job code {jobCode} (occupation {occ_mapping[str(jobCode)]})")
# assert(sorted(df.OCC2010.unique().tolist()) == )

No examples of job code 9920 (occupation Unemployed, with No Work Experience in the Last 5 Years or Earlier or Never Worked)


In [18]:
cols_created = 0
for (key, value) in occ_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue
    
    # Skip occupations that were removed
    if key in removedKeys:
        continue

    occName = value.replace(' ', '_').replace(',', '')
    df[f"occupation_{occName}"] = (df.OCC2010 == int(key))
    cols_created += 1

print(f"Created {cols_created} columns")

  if sys.path[0] == "":


Created 427 columns


In [19]:
print(f"Created {cols_created} columns")

Created 427 columns


In [20]:
df.drop(columns=['OCC2010'], inplace=True)

In [None]:
df = df.copy()

In [None]:
df.reset_index().to_csv('large_field_preprocessed_data.csv', index=False)