In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml

import pseudopeople as pp
from pseudopeople.configuration import get_configuration
from vivarium.framework.randomness import RandomnessStream
from vivarium.config_tree import ConfigTree

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data

!date
!whoami
!uname -a
!pwd

Tue 11 Apr 2023 05:05:17 PM PDT
ndbs
Linux gen-slurm-sarchive-p0154 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


# Generate default sample decennial census

In [2]:
df_census_sample = pp.generate_decennial_census()
df_census_sample

Unnamed: 0,simulant_id,first_name,date_of_birth,housing_type,city,middle_initial,guardian_2,zipcode,age,year,relation_to_household_head,state,street_number,sex,guardian_1,last_name,street_name,race_ethnicity,unit_number
0,0_2,Melanie,1993-08-05 00:00:00,Standard,Anytown,L,0_-1,00000,26,2020,Reference person,US,10233,Female,0_-1,Herrod,north burgher avenue,White,
1,0_3,Jordan,1993-12-29 00:00:00,Standard,Anytown,C,0_-1,00000,26,2020,Other relative,US,10233,Female,0_-1,Herrod,north burgher avenue,White,
2,0_923,John,1942-06-29 00:00:00,Standard,Anytown,E,0_-1,00000,77,2020,Reference person,US,147-153,Male,0_-1,Davis,browning ave,Black,
3,0_2641,Sharon,1960-10-10 00:00:00,Standard,Anytown,T,0_-1,00000,59,2020,Reference person,US,107,Female,0_-1,Plummer,stallion st,White,
4,0_2801,Ronnie,1946-12-05 00:00:00,Standard,Anytown,A,0_-1,00000,73,2020,Reference person,US,214,Male,0_-1,Yoakum,s vine lane,White,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29570,0_7522,Halle,2014-04-21 00:00:00,Standard,Anytown,R,0_7521,00000,25,2040,Reference person,US,135,Female,0_7520,Carriker,cobblewood drive,White,
29571,0_14524,Keith,1967-05-11 00:00:00,Standard,Anytown,D,0_-1,00000,72,2040,Reference person,US,728,Male,0_-1,Znhtalek,w winchester st,White,
29572,0_14563,Presley,2008-04-02 00:00:00,Standard,Anytown,I,0_14561,00000,31,2040,Other nonrelative,US,728,Female,0_14560,Hill,w winchester st,White,
29573,0_18084,Carol,1971-11-09 00:00:00,Standard,Anytown,M,0_-1,00000,68,2040,Reference person,US,129,,0_-1,Wardell,custer street,White,


In [5]:
df_census_sample.dtypes

simulant_id                   category
first_name                      object
date_of_birth                   object
housing_type                  category
city                            object
middle_initial                  object
guardian_2                    category
zipcode                         object
age                             object
year                             int64
relation_to_household_head      object
state                           object
street_number                   object
sex                             object
guardian_1                    category
last_name                       object
street_name                     object
race_ethnicity                  object
unit_number                     object
dtype: object

# Edit default configuration to zero out all noise so I can get an un-noised sample dataset

In [8]:
config = get_configuration()
config

decennial_census:
    row_noise:
        omission:
            probability:
                default: 0.0145
                    source: None
                baseline: 0.0
                    source: None
    column_noise:
        first_name:
            missing_data:
                row_noise_level:
                    baseline: 0.01
                        source: None
            fake_name:
                row_noise_level:
                    baseline: 0.01
                        source: None
                token_noise_level:
                    baseline: 0.1
                        source: None
            typographic:
                row_noise_level:
                    baseline: 0.01
                        source: None
                token_noise_level:
                    baseline: 0.1
                        source: None
                include_original_token_level:
                    baseline: 0.1
                        source: None
        middle_initial:
            miss

In [18]:
config.decennial_census.column_noise.first_name

missing_data:
    row_noise_level:
        baseline: 0.01
            source: None
fake_name:
    row_noise_level:
        baseline: 0.01
            source: None
    token_noise_level:
        baseline: 0.1
            source: None
typographic:
    row_noise_level:
        baseline: 0.01
            source: None
    token_noise_level:
        baseline: 0.1
            source: None
    include_original_token_level:
        baseline: 0.1
            source: None

## Write a function to zero out all column noise in the default configuration

In [16]:
def get_zero_column_noise_config():
    """Return the default pseudopeople configuration with column noise zeroed.

    Fetches a fresh configuration via ``get_configuration()`` and sets
    ``row_noise_level`` to 0 for every noise type of every column in every
    dataset's ``column_noise`` section.

    Only ``row_noise_level`` under ``column_noise`` is overridden; each
    dataset's ``row_noise`` section and the other per-noise-type settings
    (e.g. ``token_noise_level``) are left as they were.

    Returns
    -------
    The modified configuration object returned by ``get_configuration()``.
    """
    config = get_configuration()
    # The dataset / column / noise-type keys are never used, so iterate over
    # the nested values directly.
    for dataset_config in config.values():
        for column_config in dataset_config['column_noise'].values():
            for noise_config in column_config.values():
                noise_config['row_noise_level'] = 0
    return config

zero_config = get_zero_column_noise_config()
zero_config

decennial_census:
    row_noise:
        omission:
            probability:
                default: 0.0145
                    source: None
                baseline: 0.0
                    source: None
    column_noise:
        first_name:
            missing_data:
                row_noise_level:
                    user: 0
                        source: None
                    baseline: 0.01
                        source: None
            fake_name:
                row_noise_level:
                    user: 0
                        source: None
                    baseline: 0.01
                        source: None
                token_noise_level:
                    baseline: 0.1
                        source: None
            typographic:
                row_noise_level:
                    user: 0
                        source: None
                    baseline: 0.01
                        source: None
                token_noise_level:
                    baseline: 0.1

In [22]:
zero_config.decennial_census.column_noise.first_name

missing_data:
    row_noise_level:
        user: 0
            source: None
        baseline: 0.01
            source: None
fake_name:
    row_noise_level:
        user: 0
            source: None
        baseline: 0.01
            source: None
    token_noise_level:
        baseline: 0.1
            source: None
typographic:
    row_noise_level:
        user: 0
            source: None
        baseline: 0.01
            source: None
    token_noise_level:
        baseline: 0.1
            source: None
    include_original_token_level:
        baseline: 0.1
            source: None

## Apparently I have to pass a `dict` to the data generating functions, not a `ConfigTree`

In [23]:
# This fails without converting zero_config to dict first
df_census_sample0 = pp.generate_decennial_census(configuration=zero_config.to_dict())
df_census_sample0

Unnamed: 0,simulant_id,first_name,date_of_birth,housing_type,city,middle_initial,guardian_2,zipcode,age,year,relation_to_household_head,state,street_number,sex,guardian_1,last_name,street_name,race_ethnicity,unit_number
0,0_2,Melanie,1993-08-05 00:00:00,Standard,Anytown,L,0_-1,00000,26,2020,Reference person,US,10233,Female,0_-1,Herrod,north burgher avenue,White,
1,0_3,Jordan,1993-12-29 00:00:00,Standard,Anytown,C,0_-1,00000,26,2020,Other relative,US,10233,Female,0_-1,Herrod,north burgher avenue,White,
2,0_923,John,1942-06-29 00:00:00,Standard,Anytown,E,0_-1,00000,77,2020,Reference person,US,147-153,Male,0_-1,Davis,browning ave,Black,
3,0_2641,Sharon,1960-10-10 00:00:00,Standard,Anytown,T,0_-1,00000,59,2020,Reference person,US,107,Female,0_-1,Plummer,stallion st,White,
4,0_2801,Ronnie,1946-12-05 00:00:00,Standard,Anytown,A,0_-1,00000,73,2020,Reference person,US,214,Male,0_-1,Yoakum,s vine lane,White,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29570,0_7522,Halle,2014-04-21 00:00:00,Standard,Anytown,R,0_7521,00000,25,2040,Reference person,US,135,Female,0_7520,Carriker,cobblewood drive,White,
29571,0_14524,Keith,1967-05-11 00:00:00,Standard,Anytown,D,0_-1,00000,72,2040,Reference person,US,728,Male,0_-1,Antalek,w winchester st,White,
29572,0_14563,Presley,2008-04-02 00:00:00,Standard,Anytown,I,0_14561,00000,31,2040,Other nonrelative,US,728,Female,0_14560,Hill,w winchester st,White,
29573,0_18084,Carol,1971-11-09 00:00:00,Standard,Anytown,M,0_-1,00000,68,2040,Reference person,US,129,Female,0_-1,Wardell,custer street,White,


# Write functions to compute the percent of noised cells in each column and the percent of rows with any noise

In [31]:
def percent_different_in_columns(df1, df2):
    """Return, per column, the percent of rows whose values differ.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare elementwise with ``!=``; they must be identically
        labeled (same index and columns).

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is
        ``100 * (number of differing cells) / len(df1)``.

    Notes
    -----
    A cell that is missing in *both* frames is treated as unchanged. A cell
    missing in only one frame still counts as a difference.
    """
    # `!=` reports missing-vs-missing as "different", which would count
    # untouched missing cells as noised, so mask those cells out.
    both_missing = df1.isna() & df2.isna()
    differs = (df1 != df2) & ~both_missing
    return 100 * differs.sum() / len(df1)

def percent_of_rows_with_difference(df1, df2):
    """Return the percent of rows that differ in at least one column.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare elementwise with ``!=``; they must be identically
        labeled (same index and columns).

    Returns
    -------
    float
        ``100 * (number of rows with any differing cell) / len(df1)``.

    Notes
    -----
    A cell that is missing in *both* frames is treated as unchanged. A cell
    missing in only one frame still counts as a difference.
    """
    # `!=` reports missing-vs-missing as "different"; without this mask any
    # row containing a shared missing value would be counted as noised.
    both_missing = df1.isna() & df2.isna()
    differs = (df1 != df2) & ~both_missing
    return 100 * differs.any(axis=1).sum() / len(df1)

In [30]:
# 100*(df_census_sample0 != df_census_sample).sum() / len(df_census_sample)
percent_different_in_columns(df_census_sample, df_census_sample0)

simulant_id                   0.000000
first_name                    2.431107
date_of_birth                 3.097210
housing_type                  0.000000
city                          1.602705
middle_initial                1.122570
guardian_2                    0.000000
zipcode                       2.089603
age                           2.826712
year                          0.000000
relation_to_household_head    1.839391
state                         2.011834
street_number                 2.015216
sex                           1.437025
guardian_1                    0.000000
last_name                     2.549451
street_name                   1.653423
race_ethnicity                1.920541
unit_number                   0.185968
dtype: float64

In [32]:
# (df_census_sample0 != df_census_sample).any(axis=1).sum() / len(df_census_sample)
percent_of_rows_with_difference(df_census_sample, df_census_sample0)

23.678782755705832

# Convenience function to cut down on editing after copy/paste

In [48]:
def compare_columns(df1, df2, colname, notna=False):
    """Show where one column differs between two dataframes.

    Thin wrapper around ``pandas.Series.compare`` applied to
    ``df1[colname]`` and ``df2[colname]``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``colname``.
    colname : str
        Name of the column to compare.
    notna : bool, default False
        If true, restrict the comparison to rows where the column is
        non-missing in both frames.

    Returns
    -------
    pandas.DataFrame
        The result of ``Series.compare`` for the selected rows.
    """
    left, right = df1[colname], df2[colname]
    if not notna:
        return left.compare(right)
    # Keep only the rows where neither side is missing.
    both_present = left.notna() & right.notna()
    return left.loc[both_present].compare(right.loc[both_present])

# Look at original value and noised value in each column using `pandas.Series.compare`

# First name

In [33]:
df_census_sample0.first_name.compare(df_census_sample.first_name)

Unnamed: 0,self,other
79,Brian,
94,Kaylee,
172,Caitlin,
238,Zoe,Zow
253,Janet,CHILD
...,...,...
29272,Nicholas,Nichkolas
29324,Davin,
29403,Cynthia,MOTHER
29496,Xavier,Xavifer


In [102]:
compare_columns(df_census_sample0, df_census_sample, 'first_name', notna=True)

Unnamed: 0,self,other
238,Zoe,Zow
253,Janet,CHILD
279,Samantha,W
338,Jodi,MR
383,Brianna,A
...,...,...
29160,Robin,Rpbin
29260,Katie,GENTLEMAN
29272,Nicholas,Nichkolas
29403,Cynthia,MOTHER


# Date of birth

## Looks like dates are still stored as Timestamps?

But they get changed to `str` or `pd.NA` when noise is applied.

## Note: Sometimes numeric noise is added to the `hh:mm:ss` instead of `yyyy-mm-dd`

In [34]:
df_census_sample0.date_of_birth.compare(df_census_sample.date_of_birth)

Unnamed: 0,self,other
23,1973-07-14 00:00:00,1983-07-14 01:20:00
39,1965-03-15 00:00:00,1952-03-15 00:00:00
97,2022-07-07 00:00:00,
147,1947-12-17 00:00:00,1647-12-17 00:00:00
198,1988-04-08 00:00:00,1688-04-08 00:00:00
...,...,...
29430,2039-01-06 00:00:00,
29440,2029-06-24 00:00:00,2029-06-24 00:00:00
29511,1963-11-05 00:00:00,
29532,1969-01-12 00:00:00,1969-01-12 00:00:00


In [94]:
config.decennial_census.column_noise.date_of_birth

missing_data:
    row_noise_level:
        baseline: 0.01
            source: None
numeric_miswriting:
    row_noise_level:
        baseline: 0.01
            source: None
    token_noise_level:
        baseline: 0.1
            source: None
typographic:
    row_noise_level:
        baseline: 0.01
            source: None
    token_noise_level:
        baseline: 0.1
            source: None
    include_original_token_level:
        baseline: 0.1
            source: None

In [95]:
df_census_sample0.date_of_birth.map(type).unique()

array([<class 'pandas._libs.tslibs.timestamps.Timestamp'>], dtype=object)

In [96]:
df_census_sample.date_of_birth.map(type).unique()

array([<class 'pandas._libs.tslibs.timestamps.Timestamp'>, <class 'str'>,
       <class 'pandas._libs.missing.NAType'>], dtype=object)

In [97]:
str_dob = df_census_sample.date_of_birth.map(type) == str
str_dob.sum()

616

In [98]:
df_census_sample0.date_of_birth.loc[str_dob].compare(df_census_sample.loc[str_dob].date_of_birth)

Unnamed: 0,self,other
23,1973-07-14 00:00:00,1983-07-14 01:20:00
39,1965-03-15 00:00:00,1952-03-15 00:00:00
147,1947-12-17 00:00:00,1647-12-17 00:00:00
198,1988-04-08 00:00:00,1688-04-08 00:00:00
242,1973-12-15 00:00:00,1973-12-15 00:04:00
...,...,...
29314,1981-02-20 00:00:00,1981-02-20 00:00:00
29415,2035-05-02 00:00:00,4035-05-02 00:00:00
29440,2029-06-24 00:00:00,2029-06-24 00:00:00
29532,1969-01-12 00:00:00,1969-01-12 00:00:00


### This verifies the noise always changes the type from `Timestamp` to `str` or `pd.NA`:

In [99]:
notna = df_census_sample.date_of_birth.notna()
df_census_sample0.date_of_birth.loc[notna & ~str_dob].compare(
    df_census_sample.loc[notna & ~str_dob].date_of_birth)

Unnamed: 0,self,other


# Housing type has no noise

## Also, only Standard and Carceral show up, but none of the other group quarters (GQ) housing types

In [39]:
df_census_sample0.housing_type.compare(df_census_sample.housing_type, keep_shape=True, keep_equal=True)

Unnamed: 0,self,other
0,Standard,Standard
1,Standard,Standard
2,Standard,Standard
3,Standard,Standard
4,Standard,Standard
...,...,...
29570,Standard,Standard
29571,Standard,Standard
29572,Standard,Standard
29573,Standard,Standard


In [111]:
df_census_sample0.housing_type.unique()

['Standard', 'Carceral']
Categories (7, object): ['Standard', 'Carceral', 'Nursing home', 'Other institutional', 'College', 'Military', 'Other non-institutional']

In [112]:
config.decennial_census.column_noise.housing_type

ConfigurationKeyError: 'No value at name column_noise.housing_type.'

# City

In [35]:
df_census_sample0.city.compare(df_census_sample.city)

Unnamed: 0,self,other
20,Anytown,
32,Anytown,Anttown
130,Anytown,
152,Anytown,Antgown
160,Anytown,
...,...,...
29409,Anytown,Anytpwn
29474,Anytown,
29526,Anytown,Anyfown
29540,Anytown,


# Middle initial -- sometimes we get 2 letters instead of 1

In [40]:
df_census_sample0.middle_initial.compare(df_census_sample.middle_initial)

Unnamed: 0,self,other
51,J,
366,J,
394,K,
453,L,I
510,H,
...,...,...
28607,C,
28678,E,
28687,S,
28854,L,


In [41]:
notna = df_census_sample.middle_initial.notna()
df_census_sample0.middle_initial.loc[notna].compare(df_census_sample.middle_initial.loc[notna])

Unnamed: 0,self,other
453,L,I
828,M,H
888,C,X
1654,E,D
4293,A,W
4567,C,V
4762,E,S
4861,M,J
7261,N,G
8722,J,M


# Zipcode

In [42]:
notna = df_census_sample.zipcode.notna()
df_census_sample0.zipcode.loc[notna].compare(df_census_sample.zipcode.loc[notna])

Unnamed: 0,self,other
8,00000,00100
21,00000,00001
29,00000,20000
102,00000,20000
384,00000,00700
...,...,...
29324,00000,00001
29447,00000,00007
29469,00000,02000
29527,00000,00100


# Age

In [43]:
notna = df_census_sample.age.notna()
df_census_sample0.age.loc[notna].compare(df_census_sample.age.loc[notna])

Unnamed: 0,self,other
112,6,5
224,61,61
230,67,87
351,25,24
354,0,1
...,...,...
29128,33,32
29269,17,17
29444,41,42
29466,51,51


# Relation to household head

In [49]:
compare_columns(df_census_sample0, df_census_sample, 'relation_to_household_head', notna=False)

Unnamed: 0,self,other
26,Other nonrelative,Opp-sex spouse
64,Reference person,Same-sex partner
233,Reference person,Stepchild
289,Reference person,Same-sex spouse
297,Biological child,
...,...,...
29331,Biological child,
29374,Parent,Foster child
29382,Reference person,
29396,Reference person,


In [50]:
compare_columns(df_census_sample0, df_census_sample, 'relation_to_household_head', notna=True)

Unnamed: 0,self,other
26,Other nonrelative,Opp-sex spouse
64,Reference person,Same-sex partner
233,Reference person,Stepchild
289,Reference person,Same-sex spouse
523,Parent,Parent-in-law
...,...,...
29105,Reference person,Parent
29142,Reference person,Sibling
29200,Roommate,Sibling
29231,Reference person,Parent


# State

In [51]:
df_census_sample0.columns

Index(['simulant_id', 'first_name', 'date_of_birth', 'housing_type', 'city',
       'middle_initial', 'guardian_2', 'zipcode', 'age', 'year',
       'relation_to_household_head', 'state', 'street_number', 'sex',
       'guardian_1', 'last_name', 'street_name', 'race_ethnicity',
       'unit_number'],
      dtype='object')

In [54]:
compare_columns(df_census_sample0, df_census_sample, 'state', notna=True)

Unnamed: 0,self,other
126,US,TN
280,US,MS
419,US,VA
525,US,NM
787,US,IL
...,...,...
28990,US,NV
29150,US,OR
29286,US,SD
29447,US,AL


# Street number

## Looks like numeric miswriting adds blank spaces to the end of numbers

In [55]:
compare_columns(df_census_sample0, df_census_sample, 'street_number', notna=True)

Unnamed: 0,self,other
64,123,125
249,6040,6002
334,281,281
357,4048,4048
414,1103,1103
...,...,...
29252,4222,4222
29304,4372,4772
29325,1005,1005
29375,2612,2617


In [56]:
compare_columns(df_census_sample0, df_census_sample, 'street_number', notna=True).loc[334]

self          281
other    281     
Name: 334, dtype: object

In [57]:
compare_columns(df_census_sample0, df_census_sample, 'street_number', notna=True).loc[334, 'self']

'281'

In [58]:
compare_columns(df_census_sample0, df_census_sample, 'street_number', notna=True).loc[334, 'other']

'281     '

# Sex

In [100]:
compare_columns(df_census_sample0, df_census_sample, 'sex', notna=True)

Unnamed: 0,self,other
203,Male,Female
491,Male,Female
1380,Male,Female
1522,Male,Female
1691,Male,Female
...,...,...
28480,Male,Female
28895,Male,Female
29009,Male,Female
29069,Female,Male


# Last name

In [101]:
compare_columns(df_census_sample0, df_census_sample, 'last_name', notna=True)

Unnamed: 0,self,other
74,Bedolla,H
78,Nolf,LADY OF HOUSE
119,Gardner,T
134,Logan,A
198,Lebron,Lefron
...,...,...
29349,Davis,DAUGHTER
29352,Boone,T
29390,Hayes,UNKNOWN
29512,Padilla,BOY


# Street name

In [105]:
compare_columns(df_census_sample0, df_census_sample, 'street_name', notna=True)

Unnamed: 0,self,other
7,westminster dr,weqtminstwf dr
136,fairlawn st,fakrlawn st
485,elena st,elena at
645,daly ave,dxly avs
658,meg brauer way,jeg brauer way
...,...,...
28857,royal crst drive,royal crst dguve
28899,grissom pkwy,grissom okwy
29037,wst avenue,wst avrgue
29108,terrell hl dr,terrekl hl dr


# Race/Ethnicity

In [108]:
compare_columns(df_census_sample0, df_census_sample, 'race_ethnicity', notna=True)

Unnamed: 0,self,other
157,Latino,Asian
312,Latino,NHOPI
353,Latino,NHOPI
413,Black,Multiracial or Other
492,White,AIAN
...,...,...
29073,White,AIAN
29398,Black,Asian
29439,Multiracial or Other,Asian
29474,White,Asian


In [109]:
compare_columns(df_census_sample0, df_census_sample, 'unit_number', notna=True)

Unnamed: 0,self,other
1647,floor number 6 apt number 5,floor number 2 apt number 5
2037,no 50,no 70
3104,unit # 258,ybit # 255
3105,unit # 258,unit # 258
7314,apt 1l,apt 1l
8176,unit 418,unit 419
9418,apt 19,apt 19
9573,no 421r,no 427r
10737,unit # 17 e floor 1,unit # 17 e floor 1
11068,unit 163,unit 163


In [110]:
compare_columns(df_census_sample0, df_census_sample, 'unit_number', notna=True).loc[9418]

self                          apt 19
other    apt 19                     
Name: 9418, dtype: object

# Now let's load a larger dataset -- let's try parquet first

```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04/final_results/parquet/
```

In [3]:
# Root of the synthetic-population project on the shared filesystem.
project_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
# Results of one specific model run (version / location / run timestamp).
model_dir = project_dir + '/results/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04'
# The final results are stored once per file format.
parquet_dir, hdf_dir = (
    f'{model_dir}/final_results/{file_format}' for file_format in ('parquet', 'hdf')
)
# Parquet outputs for Rhode Island only and for the whole USA.
rhode_island_par_dir = parquet_dir + '/states/rhode_island'
usa_par_dir = parquet_dir + '/usa'

!ls -halt $hdf_dir

total 96K
drwxrwsr-x  3 rmudambi IHME-Simulationscience  512 Apr  9 18:49 states
drwxrwsr-x  5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:49 .
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:35 usa
drwxrwsr-x  4 rmudambi IHME-Simulationscience 2.0K Apr  9 15:21 ..
drwxrwsr-x  2 rmudambi IHME-Simulationscience 335K Apr  9 11:55 logs


In [4]:
!ls -halt $model_dir/final_results

total 24K
drwxrwsr-x 5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:49 hdf
drwxrwsr-x 5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:48 parquet
drwxrwsr-x 4 rmudambi IHME-Simulationscience 2.0K Apr  9 15:21 .
lrwxrwxrwx 1 rmudambi IHME-Simulationscience   33 Apr  9 12:57 best -> final_results/2023_04_09_11_07_45
lrwxrwxrwx 1 rmudambi IHME-Simulationscience   33 Apr  9 12:57 latest -> final_results/2023_04_09_11_07_45
drwxrwsr-x 6 rmudambi IHME-Simulationscience 5.0K Apr  9 11:04 ..


In [59]:
# Seed used in the observer output file names, and the file extension to load.
seed = 9847
ext = '.parquet'

# Observer directories are built from the parquet directories defined with
# the other paths (`rhode_island_par_dir`, `usa_par_dir`); the shorter names
# `rhode_island_dir` / `usa_dir` are not defined in this notebook.
ri_w2_dir = f'{rhode_island_par_dir}/tax_w2_observer'
ri_census_dir = f'{rhode_island_par_dir}/decennial_census_observer'
ri_acs_dir = f'{rhode_island_par_dir}/household_survey_observer_acs'

ri_w2_path = f'{ri_w2_dir}/tax_w2_observer_{seed}{ext}'
ri_census_path = f'{ri_census_dir}/decennial_census_observer_{seed}{ext}'
ri_acs_path = f'{ri_acs_dir}/household_survey_observer_acs_{seed}{ext}'

usa_w2_dir = f'{usa_par_dir}/tax_w2_observer'
usa_census_dir = f'{usa_par_dir}/decennial_census_observer'
usa_acs_dir = f'{usa_par_dir}/household_survey_observer_acs'

usa_w2_path = f'{usa_w2_dir}/tax_w2_observer_{seed}{ext}'
usa_census_path = f'{usa_census_dir}/decennial_census_observer_{seed}{ext}'
usa_acs_path = f'{usa_acs_dir}/household_survey_observer_acs_{seed}{ext}'

# Load USA ACS data, first un-noised, then noised

In [64]:
%%time
df_usa_acs = pd.read_parquet(usa_acs_path)
df_usa_acs

CPU times: user 126 ms, sys: 26.9 ms, total: 153 ms
Wall time: 151 ms


Unnamed: 0,state,city,simulant_id,household_id,date_of_birth,sex,street_name,first_name,guardian_1,street_number,unit_number,housing_type,last_name,guardian_2,zipcode,survey_date,age,middle_initial
0,OH,kimball,9847_6213,9847_2465,1939-07-04,Male,pipestone cir,Billy,9847_-1,16130,,Standard,Allyn,9847_-1,43140,2019-01-29,79,O
1,OH,kimball,9847_6214,9847_2465,1964-09-22,Female,pipestone cir,Mary,9847_-1,16130,,Standard,Allyn,9847_-1,43140,2019-01-29,54,T
2,FL,okahumpka,9847_25392,9847_10131,1974-12-15,Male,cr 65,Rodney,9847_-1,23,,Standard,Kuang,9847_-1,33071,2019-01-29,44,B
3,FL,okahumpka,9847_25393,9847_10131,1977-06-06,Female,cr 65,Amber,9847_-1,23,,Standard,Kuang,9847_-1,33071,2019-01-29,41,E
4,FL,okahumpka,9847_25394,9847_10131,2009-11-14,Male,cr 65,Michael,9847_25392,23,,Standard,Kuang,9847_25393,33071,2019-01-29,9,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60381,AZ,gilbert,9847_1372132,9847_229216,2039-10-29,Male,nw 4th ct,Paul,9847_825709,643,,Standard,Dorrough,9847_-1,85736,2041-05-21,1,N
60382,NY,new york,9847_1379697,9847_861241,2001-04-01,Male,table mesa dr,Matthew,9847_-1,2067,,Standard,Pait,9847_-1,10065,2041-05-21,40,J
60383,NY,new york,9847_1379698,9847_861241,1999-04-11,Female,table mesa dr,Makenzie,9847_-1,2067,,Standard,Pait,9847_-1,10065,2041-05-21,42,E
60384,NY,new york,9847_1379699,9847_861241,2035-04-30,Female,table mesa dr,Sophia,9847_1379697,2067,,Standard,Pait,9847_1379698,10065,2041-05-21,6,E


In [65]:
df_usa_acs.dtypes

state                   category
city                    category
simulant_id             category
household_id            category
date_of_birth     datetime64[ns]
sex                     category
street_name             category
first_name              category
guardian_1              category
street_number           category
unit_number             category
housing_type            category
last_name               category
guardian_2              category
zipcode                 category
survey_date       datetime64[ns]
age                        int64
middle_initial          category
dtype: object

In [66]:
%%time
df_usa_acs_noisy = pp.generate_american_communities_survey(usa_acs_path)
df_usa_acs_noisy

CPU times: user 1.59 s, sys: 78.5 ms, total: 1.67 s
Wall time: 1.67 s


Unnamed: 0,state,city,simulant_id,household_id,date_of_birth,sex,street_name,first_name,guardian_1,street_number,unit_number,housing_type,last_name,guardian_2,zipcode,survey_date,age,middle_initial
0,OH,kimball,9847_6213,9847_2465,1939-07-04 00:00:00,Male,pipestone cir,Billy,9847_-1,,,Standard,Allyn,9847_-1,43140,2019-01-29,79,O
1,OH,kimball,9847_6214,9847_2465,1964-09-22 00:00:00,Female,pipestone cir,Mary,9847_-1,16130,,Standard,Allyn,9847_-1,43140,2019-01-29,54,T
2,FL,okahumpka,9847_25392,9847_10131,1974-12-15 00:00:00,Male,cr 65,Rodney,9847_-1,23,,Standard,Kuang,9847_-1,33071,2019-01-29,44,B
3,FL,okahumpka,9847_25393,9847_10131,1977-06-06 00:00:00,Female,cr 65,Amber,9847_-1,23,,Standard,Kuang,9847_-1,33071,2019-01-29,41,E
4,FL,okahumpka,9847_25394,9847_10131,2009-11-14 00:00:00,Male,cr 65,Michael,9847_25392,23,,Standard,Kuang,9847_25393,33071,2019-01-29,9,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60381,AZ,gilbert,9847_1372132,9847_229216,2039-10-29 00:00:00,Male,nw 4th ct,Paul,9847_825709,643,,Standard,Dorrough,9847_-1,85736,2041-05-21,1,N
60382,NY,new york,9847_1379697,9847_861241,2001-04-01 00:00:00,Male,table mesa dr,Matthew,9847_-1,2067,,Standard,Pait,9847_-1,10065,2041-05-21,40,J
60383,NY,new york,9847_1379698,9847_861241,1999-04-11 00:00:00,Female,table mesa dr,Makenzie,9847_-1,2067,,Standard,Pait,9847_-1,10065,2041-05-21,42,E
60384,NY,new york,9847_1379699,9847_861241,2035-04-30 00:00:00,Female,table mesa dr,Sophia,9847_1379697,2067,,Standard,Pait,9847_1379698,10065,2041-05-21,6,E


In [68]:
df_usa_acs_noisy.dtypes

state                     object
city                      object
simulant_id             category
household_id            category
date_of_birth             object
sex                       object
street_name               object
first_name                object
guardian_1              category
street_number             object
unit_number               object
housing_type            category
last_name                 object
guardian_2              category
zipcode                   object
survey_date       datetime64[ns]
age                       object
middle_initial            object
dtype: object

# Can't directly compare noised and un-noised versions because of incompatible datatypes

In [85]:
percent_different_in_columns(df_usa_acs, df_usa_acs_noisy)

AttributeError: 'bool' object has no attribute 'ndim'

# Try converting both to all categorical so I can compare the values

In [69]:
%time df_usa_acs_noisy_cat = df_usa_acs_noisy.astype('category')
df_usa_acs_noisy_cat.dtypes

CPU times: user 512 ms, sys: 0 ns, total: 512 ms
Wall time: 510 ms


state             category
city              category
simulant_id       category
household_id      category
date_of_birth     category
sex               category
street_name       category
first_name        category
guardian_1        category
street_number     category
unit_number       category
housing_type      category
last_name         category
guardian_2        category
zipcode           category
survey_date       category
age               category
middle_initial    category
dtype: object

In [70]:
%time df_usa_acs_cat = df_usa_acs.astype('category')
df_usa_acs_cat.dtypes

CPU times: user 29.4 ms, sys: 0 ns, total: 29.4 ms
Wall time: 26.4 ms


state             category
city              category
simulant_id       category
household_id      category
date_of_birth     category
sex               category
street_name       category
first_name        category
guardian_1        category
street_number     category
unit_number       category
housing_type      category
last_name         category
guardian_2        category
zipcode           category
survey_date       category
age               category
middle_initial    category
dtype: object

# Still doesn't work because categories don't match

In [71]:
percent_different_in_columns(df_usa_acs_cat, df_usa_acs_noisy_cat)

TypeError: Categoricals can only be compared if 'categories' are the same.

# Write a function to conform two dataframes to categorical with matching categories

## Woo hoo! Now I can actually compare them

In [74]:
def to_matching_categorical(df1, df2):
    """Return categorical copies of two dataframes that share categories.

    Both inputs are converted with ``astype('category')``; then, column by
    column, the categories of the two converted frames are unioned and that
    union is set as the category set on both sides. Categoricals can only be
    compared elementwise when their categories are the same, so this makes
    the two results directly comparable.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Dataframes whose ``columns`` indexes must be equal.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The categorical versions of ``df1`` and ``df2``, in that order.
        The inputs themselves are not modified.

    Raises
    ------
    AssertionError
        If the two dataframes do not have equal columns.
    """
    assert df1.columns.equals(df2.columns)
    # astype returns new frames, so the per-column assignments below never
    # touch the caller's data.
    converted = [df1.astype('category'), df2.astype('category')]
    for name in df1.columns:
        left_cats, right_cats = (frame[name].cat.categories for frame in converted)
        shared = left_cats.union(right_cats)
        for frame in converted:
            frame[name] = frame[name].cat.set_categories(shared)
    return tuple(converted)

%time df_usa_acs_cat, df_usa_acs_noisy_cat = to_matching_categorical(df_usa_acs, df_usa_acs_noisy)
percent_different_in_columns(df_usa_acs_cat, df_usa_acs_noisy_cat)

CPU times: user 1.09 s, sys: 39.9 ms, total: 1.13 s
Wall time: 1.13 s


state             1.930911
city              1.702381
simulant_id       0.000000
household_id      0.000000
date_of_birth     2.934455
sex               1.468884
street_name       1.730534
first_name        2.455867
guardian_1        0.000000
street_number     2.330010
unit_number       0.105985
housing_type      0.000000
last_name         2.397907
guardian_2        0.000000
zipcode           2.078296
survey_date       0.000000
age               2.999040
middle_initial    1.157553
dtype: float64

In [75]:
sizemb(df_usa_acs_cat)

23.480935

In [76]:
sizemb(df_usa_acs_noisy_cat)

23.480935

In [77]:
sizemb(df_usa_acs_noisy)

58.486313

# Hmm, it looks like converting to categorical converts `pd.NA` to `np.nan`

That's convenient.

In [78]:
df_usa_acs_cat.street_number.map(type).unique()

array([<class 'str'>], dtype=object)

In [79]:
df_usa_acs_noisy_cat.street_number.map(type).unique()

array([nan, <class 'str'>], dtype=object)

In [80]:
df_usa_acs_noisy_cat.street_number.loc[df_usa_acs_noisy_cat.street_number.isna()]

0        NaN
32       NaN
54       NaN
173      NaN
206      NaN
        ... 
59863    NaN
59919    NaN
59934    NaN
60073    NaN
60239    NaN
Name: street_number, Length: 608, dtype: category
Categories (8363, object): ['', '# 19415', '# 4902', '*', ..., 'w3751', 'w4098', 'w5965', 'w9560']

In [82]:
df_usa_acs_noisy.street_number.map(type).unique()

array([<class 'pandas._libs.missing.NAType'>, <class 'str'>], dtype=object)

In [81]:
df_usa_acs_noisy.street_number.loc[df_usa_acs_noisy.street_number.isna()]

0        <NA>
32       <NA>
54       <NA>
173      <NA>
206      <NA>
         ... 
59863    <NA>
59919    <NA>
59934    <NA>
60073    <NA>
60239    <NA>
Name: street_number, Length: 608, dtype: object

In [83]:
df_usa_acs_noisy_cat.street_number.loc[0]

nan

In [84]:
type(df_usa_acs_noisy_cat.street_number.loc[0])

float

# Check whether USA ACS data contains the race/ethnicity column

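No. Filtering for column names containing 'race' returns no columns for either the original or the noised USA ACS dataframe.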

In [87]:
df_usa_acs.filter(like='race')

0
1
2
3
4
...
60381
60382
60383
60384
60385


In [92]:
df_usa_acs_noisy.filter(like='race')

0
1
2
3
4
...
60381
60382
60383
60384
60385


# Generate sample ACS and CPS data to verify that the sample datasets contain race/ethnicity

In [88]:
df_acs_sample = pp.generate_american_communities_survey()
df_acs_sample

Unnamed: 0,simulant_id,first_name,date_of_birth,housing_type,city,middle_initial,guardian_2,household_id,zipcode,survey_date,age,state,street_number,sex,guardian_1,last_name,street_name,race_ethnicity,unit_number
0,0_10874,Debbie,1955-02-23 00:00:00,Standard,Anytown,P,0_-1,0_4412,00000,2019-01-29,63,US,,Female,0_-1,Brown,circulo coronado,White,
1,0_10875,William,1952-05-22 00:00:00,Standard,Anytown,G,0_-1,0_4412,00000,2019-01-29,66,US,4807,Male,0_-1,Brown,circulo coronado,White,
2,0_10876,Ian,1988-02-16 00:00:00,Standard,Anytown,K,0_-1,0_4412,00000,2019-01-29,30,US,4807,Male,0_-1,Brown,circulo coronado,Multiracial or Other,
3,0_17777,Carol,1936-05-17 00:00:00,Standard,Anytown,M,0_-1,0_7162,00000,2019-02-26,82,US,10751,Female,0_-1,Kupa,knox st,White,
4,0_16445,Charles,1962-04-24 00:00:00,Standard,Anytown,K,0_-1,0_6638,00000,2019-05-21,56,US,6292,Male,0_-1,Beckham,mission blvd,White,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,0_23034,Alaina,2028-02-08 00:00:00,Standard,Anytown,S,0_-1,0_7824,00000,2041-01-29,12,US,,Female,0_5313,Campos,big pines ln,Latino,
298,0_9702,Dawn,1962-12-18 00:00:00,Standard,Anytown,P,0_-1,0_15927,00000,2041-03-26,78,US,1031,Female,0_-1,Kehren,gifford dr,White,
299,0_16972,Addison,2015-09-05 00:00:00,Standard,Anytown,K,0_-1,0_11692,00000,2041-05-21,25,US,2413,Female,0_16971,Huke,prk hill rd,White,
300,0_19414,William,1969-05-12 00:00:00,Standard,Anytown,G,0_-1,0_11692,00000,2041-05-21,71,US,2413,Male,0_-1,Martinez-Garcia,prk hill rd,Latino,


In [89]:
df_acs_sample.filter(like='race')

Unnamed: 0,race_ethnicity
0,White
1,White
2,Multiracial or Other
3,White
4,White
...,...
297,Latino
298,White
299,White
300,Latino


In [90]:
df_cps_sample = pp.generate_current_population_survey()
df_cps_sample

Unnamed: 0,simulant_id,first_name,date_of_birth,housing_type,city,middle_initial,guardian_2,household_id,zipcode,survey_date,age,state,street_number,sex,guardian_1,last_name,street_name,race_ethnicity,unit_number
0,0_2695,Benjamin,1984-02-24 00:00:00,Standard,Anytown,A,0_-1,0_1091,00000,2019-01-29,34,US,1827,Male,0_-1,Sullivan,w gentile st,Multiracial or Other,
1,0_16908,Amanda,1978-05-29 00:00:00,Standard,Anytown,S,0_-1,0_6810,00000,2022-08-09,44,US,2150,Female,0_-1,Pollock,emerald str,White,
2,0_16909,Isabella,2002-07-31 00:00:00,Standard,Anytown,R,0_-1,0_6810,00000,2022-08-09,19,US,2150,Female,0_16908,Pollock,emerald str,White,
3,0_16910,Zahara,2006-10-03 00:00:00,Standard,Anytown,E,0_-1,0_6810,00000,2022-08-09,15,US,2150,Female,0_16908,,emerald str,White,
4,0_2695,Benjamin,1984-02-24 00:00:00,Standard,Anytown,A,0_-1,0_1091,00000,2027-08-03,43,US,1827,Male,0_-1,Sullivan,w gentile st,Multiracial or Other,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1762,0_10747,India,2006-09-08 00:00:00,Standard,Anytown,V,0_10746,0_9054,00000,2041-02-26,34,US,927,Female,0_10745,Kvasager,munroe falls kent rd,White,
1763,0_26719,Paisleigh,2038-12-26 00:00:00,Standard,Anytown,K,0_-1,0_9054,00000,2041-02-26,2,US,927,Female,0_10747,Kvasager,munroe falls kent rd,White,
1764,0_18582,Christopher,1980-01-23 00:00:00,Standard,Anytown,R,0_-1,0_7470,00000,2041-04-23,61,US,621,Male,0_-1,Speakes,jellison st,White,
1765,0_10837,Kathy,1949-11-26 00:00:00,Standard,Anyyown,R,0_-1,0_4398,00000,2041-05-21,91,US,3652,Female,0_-1,Pilot,sw 105th ave,White,


In [91]:
df_cps_sample.filter(like='race')

Unnamed: 0,race_ethnicity
0,Multiracial or Other
1,White
2,White
3,White
4,Multiracial or Other
...,...
1762,White
1763,White
1764,White
1765,White
