In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Load the initial preprocessed dataset; the cell output is its row count
df = pd.read_csv('init_preprocessed_data_without_index.csv')
df.shape[0]

In [None]:
# Display the column labels of the loaded frame
df.columns

In [None]:
# Preview the first few rows of the frame
df.head()

Create degree fields

In [None]:
# Load the DEGFIELD code -> degree field name lookup (keys are read as strings)
with open('ipums_fields/degField.json') as mapping_file:
    deg_mapping = json.load(mapping_file)

In [None]:
# Sanity check: the distinct DEGFIELD codes in the data must match the
# mapping's keys exactly (no unmapped codes, no unused mapping entries).
data_deg_codes = sorted(df.DEGFIELD.unique().tolist())
mapped_deg_codes = sorted(int(code) for code in deg_mapping)
# On failure, report which codes are on one side only instead of a bare AssertionError
assert data_deg_codes == mapped_deg_codes, (
    f"DEGFIELD codes not in mapping: {sorted(set(data_deg_codes) - set(mapped_deg_codes))}; "
    f"mapping codes not in data: {sorted(set(mapped_deg_codes) - set(data_deg_codes))}"
)

In [None]:
# Build one boolean indicator column per degree field.
# The columns are collected in a dict and attached with a single concat:
# inserting them one at a time (df[col] = ...) fragments the frame and makes
# pandas emit a PerformanceWarning on wide one-hot encodings.
# NOTE(review): assumes no "hasDegree_*" column already exists in df; with
# concat a clash would yield a duplicate label instead of an overwrite.
deg_columns = {}
cols_created = 0
for (key, value) in deg_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue

    # Column-friendly name: spaces become underscores, commas are removed
    degName = value.replace(' ', '_').replace(',', '')
    deg_columns[f"hasDegree_{degName}"] = (df.DEGFIELD == int(key))
    cols_created += 1

# Append all indicator columns at once, in mapping order
df = pd.concat([df, pd.DataFrame(deg_columns, index=df.index)], axis=1)

print(f"Created {cols_created} columns")

In [None]:
# Remove the raw DEGFIELD / DEGFIELDD code columns from the frame
df = df.drop(columns=['DEGFIELD', 'DEGFIELDD'])

Create occupation fields

In [None]:
# Load the OCC2010 code -> occupation name lookup (keys are read as strings)
with open('ipums_fields/occupation2010.json') as mapping_file:
    occ_mapping = json.load(mapping_file)

In [None]:
# Sanity check: the distinct OCC2010 codes in the data must match the
# occupation mapping's keys exactly (no unmapped codes, no unused entries).
data_occ_codes = sorted(df.OCC2010.unique().tolist())
mapped_occ_codes = sorted(int(code) for code in occ_mapping)
# On failure, report which codes are on one side only instead of a bare AssertionError
assert data_occ_codes == mapped_occ_codes, (
    f"OCC2010 codes not in mapping: {sorted(set(data_occ_codes) - set(mapped_occ_codes))}; "
    f"mapping codes not in data: {sorted(set(mapped_occ_codes) - set(data_occ_codes))}"
)

In [None]:
# Build one boolean indicator column per occupation.
# The columns are collected in a dict and attached with a single concat:
# inserting them one at a time (df[col] = ...) fragments the frame and makes
# pandas emit a PerformanceWarning on wide one-hot encodings.
# NOTE(review): assumes no "occupation_*" column already exists in df; with
# concat a clash would yield a duplicate label instead of an overwrite.
occ_columns = {}
cols_created = 0
for (key, value) in occ_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue

    # Column-friendly name: spaces become underscores, commas are removed
    occName = value.replace(' ', '_').replace(',', '')
    occ_columns[f"occupation_{occName}"] = (df.OCC2010 == int(key))
    cols_created += 1

# Append all indicator columns at once, in mapping order
df = pd.concat([df, pd.DataFrame(occ_columns, index=df.index)], axis=1)

print(f"Created {cols_created} columns")

In [None]:
# Remove the raw OCC2010 code column from the frame
df = df.drop(columns=['OCC2010'])

Create state fields

In [None]:
# Load the state code -> state name lookup (keys are read as strings)
with open('ipums_fields/stateField.json') as mapping_file:
    state_mapping = json.load(mapping_file)

In [None]:
# Inspect the distinct PWSTATE2 codes present in the data
df.PWSTATE2.unique()

In [None]:
# Sanity check: the distinct PWSTATE2 codes in the data (up to the largest
# mapped code) must match the state mapping's keys exactly.
stateKeys = sorted(int(k) for k in state_mapping)
# Note that stateKeys doesn't include foreign countries, so codes above the
# largest mapped key are excluded from the comparison
data_state_codes = sorted(val for val in df.PWSTATE2.unique().tolist() if val <= stateKeys[-1])
# On failure, report which codes are on one side only instead of a bare AssertionError
assert data_state_codes == stateKeys, (
    f"PWSTATE2 codes not in mapping: {sorted(set(data_state_codes) - set(stateKeys))}; "
    f"mapping codes not in data: {sorted(set(stateKeys) - set(data_state_codes))}"
)

In [None]:
# Build the place-of-work indicator columns.
# The columns are collected in a dict and attached with a single concat:
# inserting them one at a time (df[col] = ...) fragments the frame and makes
# pandas emit a PerformanceWarning on wide one-hot encodings.
# NOTE(review): assumes no "worksOutsideUS" / "worksIn_*" column already exists
# in df; with concat a clash would yield a duplicate label instead of an overwrite.
state_columns = {
    # Codes above the largest mapped state key are flagged as outside the US
    "worksOutsideUS": (df.PWSTATE2 > stateKeys[-1]),
}
cols_created = 1
for (key, value) in state_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue

    # Column-friendly name: spaces become underscores
    stateName = value.replace(' ', '_')
    state_columns[f"worksIn_{stateName}"] = (df.PWSTATE2 == int(key))
    cols_created += 1

# Append all indicator columns at once: worksOutsideUS first, then mapping order
df = pd.concat([df, pd.DataFrame(state_columns, index=df.index)], axis=1)

print(f"Created {cols_created} columns")

In [None]:
# Remove the raw PWSTATE2 / PWCOUNTY / PWTYPE columns from the frame
df = df.drop(columns=['PWSTATE2', 'PWCOUNTY', 'PWTYPE'])

In [None]:
# Move the current index into a regular column, then write the frame to CSV
# without pandas' own index column.
# NOTE(review): reset_index() without drop=True means the old index is written
# out as an extra column — confirm downstream readers expect it.
exported = df.reset_index()
exported.to_csv('large_field_preprocessed_data.csv', index=False)