In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
# Read the data
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('dataset/init_preprocessed_data_without_index.csv')
# Show the number of rows loaded as the cell output.
len(df)

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first rows of the frame (pandas shows 5 by default).
df.head()

Create state fields

In [None]:
# Count the rows whose PWSTATE2 code is 0 and report their share of the dataset.
original_len = len(df)
num_nostate = df["PWSTATE2"].eq(0).sum()
print(f"{num_nostate} ({num_nostate/original_len:.2%}) of the rows have no state")

In [None]:
# Keep only the rows that carry a state code (PWSTATE2 other than 0).
has_state = df["PWSTATE2"].ne(0)
df = df[has_state]

# The number of rows removed must match the count taken beforehand.
assert len(df) == original_len - num_nostate
print(f"Removed {num_nostate} rows with no state. {len(df)} rows remain.")

In [None]:
# Drop columns that hold a single distinct value in every remaining row.

origNumCols = len(df.columns)
droppedCols = []

for col in df.columns:
    unique = df[col].unique()
    n_distinct = len(unique)

    if n_distinct == 1:
        print(f"Dropping column {col} since it has only one value: {unique[0]}")
        droppedCols.append(col)
        continue

    # Two distinct entries where the column contains NaN: only warn, the column is kept.
    if n_distinct == 2 and df[col].isna().values.any():
        print(f"Warning: Column {col} has two values but you may still want to drop it: {unique[0]} and {unique[1]}")

df.drop(columns=droppedCols, inplace=True)

# Exactly the flagged columns must be gone.
assert len(df.columns) == origNumCols - len(droppedCols)
print(f"Dropped {len(droppedCols)} columns. {len(df.columns)} columns remain.")

In [None]:
# Load the state lookup: JSON object keys are codes (as strings), values are names.
with open('ipums_fields/stateField.json') as mapping_file:
    state_mapping = json.load(mapping_file)

In [None]:
# Make sure the state codes found in the data match the mapping exactly.
# Code 0 (rows with no state) is left out of the expected keys.
stateKeys = sorted(code for code in map(int, state_mapping) if code != 0)

# Only values up to the largest mapped code are compared
# (stateKeys doesn't include foreign countries).
observedStateCodes = sorted(
    val for val in df.PWSTATE2.unique().tolist() if val <= stateKeys[-1]
)
assert observedStateCodes == stateKeys

In [None]:
# Flag rows whose code lies above the largest mapped state code.
df["worksOutsideUS"] = df.PWSTATE2 > stateKeys[-1]
cols_created = 1  # worksOutsideUS is already counted

# The 'N/A' entry gets no column (may want to fill this with NaN later).
named_states = [(key, value) for (key, value) in state_mapping.items() if value != 'N/A']

# One boolean indicator column per named state.
for key, value in named_states:
    stateName = value.replace(' ', '_')
    df[f"worksIn_{stateName}"] = df.PWSTATE2 == int(key)
    cols_created += 1

print(f"Created {cols_created} columns")

In [None]:
# PWSTATE2 has been expanded into the worksIn_* / worksOutsideUS indicator columns;
# PWCOUNTY and PWTYPE are removed here without being encoded.
df.drop(columns=['PWSTATE2', 'PWCOUNTY', 'PWTYPE'], inplace=True)

Create degree fields

In [None]:
# Load the degree-field lookup: JSON object keys are codes (as strings), values are names.
with open('ipums_fields/degField.json') as mapping_file:
    deg_mapping = json.load(mapping_file)

In [None]:
# Make sure the degree codes in the data and the codes in the mapping are the same.
degCodesInData = sorted(df.DEGFIELD.unique().tolist())
degCodesInMapping = sorted(map(int, deg_mapping))
assert degCodesInData == degCodesInMapping

In [None]:
# The 'N/A' entry gets no column (may want to fill this with NaN later).
named_degrees = [(key, value) for (key, value) in deg_mapping.items() if value != 'N/A']

# One boolean indicator column per named degree field.
cols_created = 0
for key, value in named_degrees:
    # Column-friendly name: spaces become underscores, commas are removed.
    degName = value.replace(' ', '_').replace(',', '')
    df[f"hasDegree_{degName}"] = df.DEGFIELD == int(key)
    cols_created += 1

print(f"Created {cols_created} columns")

In [None]:
# DEGFIELD has been expanded into the hasDegree_* indicator columns;
# DEGFIELDD is removed here without being encoded.
df.drop(columns=['DEGFIELD', 'DEGFIELDD'], inplace=True)

Create occupation fields

In [None]:
# Load the occupation lookup: JSON object keys are codes (as strings), values are names.
with open('ipums_fields/occupation2010.json') as mapping_file:
    occ_mapping = json.load(mapping_file)

In [None]:
# Make sure every occupation code present in the data is in the mapping, and
# record the mapped codes that never occur in the data.
sortedOcc = sorted(int(k) for k in occ_mapping.keys())
uniqueCodes = df.OCC2010.unique().tolist()

# Sets keep the membership tests below constant-time.
knownCodes = set(sortedOcc)
seenCodes = set(uniqueCodes)

# Fail with the full list of offending codes rather than stopping at the first one.
unknownCodes = [jobCode for jobCode in uniqueCodes if jobCode not in knownCodes]
assert not unknownCodes, f"OCC2010 codes missing from occ_mapping: {unknownCodes}"

# Mapped codes without any rows, kept as ints in ascending order.
# Note that the keys of occ_mapping are strings, so they need int() before comparison.
removedKeys = [jobCode for jobCode in sortedOcc if jobCode not in seenCodes]
for jobCode in removedKeys:
    print(f"No examples of job code {jobCode} (occupation {occ_mapping[str(jobCode)]})")

In [None]:
# Create one boolean indicator column per occupation that actually occurs in the data.
# occ_mapping keys are strings (JSON object keys) while removedKeys holds codes
# as ints, so both sides are normalised to int before the membership test.
# Comparing the raw string key never matched, which produced an all-False
# column for every occupation without examples.
absentCodes = {int(code) for code in removedKeys}

cols_created = 0
for (key, value) in occ_mapping.items():
    # Skip N/A column (may want to fill this with NaN later)
    if value == 'N/A':
        continue

    # Skip occupations that were removed
    occCode = int(key)
    if occCode in absentCodes:
        continue

    # Column-friendly name: spaces become underscores, commas are removed.
    occName = value.replace(' ', '_').replace(',', '')
    df[f"occupation_{occName}"] = (df.OCC2010 == occCode)
    cols_created += 1

print(f"Created {cols_created} columns")

In [None]:
# Repeats the summary line printed at the end of the previous cell;
# cols_created is not modified in between.
print(f"Created {cols_created} columns")

In [None]:
# OCC2010 has been expanded into the occupation_* indicator columns, so the raw code is removed.
df.drop(columns=['OCC2010'], inplace=True)

In [None]:
# NOTE(review): presumably here to consolidate the frame after the many single-column
# inserts in the cells above (pandas suggests .copy() to de-fragment) — confirm intent.
df = df.copy()

In [None]:
# reset_index() is called without drop=True, so the current row labels become a regular
# column (named 'index' by default) and are written to the CSV as data; index=False
# then leaves out the fresh 0..n-1 index.
# NOTE(review): confirm the extra column is wanted in the output file.
df.reset_index().to_csv('dataset/large_field_preprocessed_data.csv', index=False)