In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml

import pseudopeople as pp
from pseudopeople.utilities import get_configuration
from vivarium.framework.randomness import RandomnessStream
from vivarium.config_tree import ConfigTree

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data

# Record run provenance (timestamp, user, host/kernel, working directory) so the
# saved outputs below can be tied to the environment that produced them.
!date
!whoami
!uname -a
!pwd

Mon 10 Apr 2023 03:18:35 PM PDT
ndbs
Linux gen-slurm-sarchive-p0154 5.4.0-135-generic #152-Ubuntu SMP Wed Nov 23 20:19:22 UTC 2022 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


# Find data -- let's try parquet first

```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04/final_results/parquet/
```

In [11]:
# Root of the synthetic-population project on the shared filesystem.
project_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
# Results directory of the one specific model run examined in this notebook.
model_dir = (
    f'{project_dir}/results'
    '/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04'
)
# Parquet results are laid out per state (here Rhode Island) and for the whole USA.
rhode_island_dir = f'{model_dir}/final_results/parquet/states/rhode_island'
usa_dir = f'{model_dir}/final_results/parquet/usa'

# List the observer subdirectories available under the USA results.
!ls -halt $usa_dir

total 284K
drwxrwsr-x  5 rmudambi IHME-Simulationscience 1.5K Apr  9 18:48 ..
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:41 .
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:21 tax_dependents_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:20 tax_1040_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:15 tax_w2_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 social_security_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 wic_observer
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 household_survey_observer_cps
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 household_survey_observer_acs
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:09 decennial_census_observer


In [12]:
# List the Rhode Island W2 parquet files; the listing shows one numbered file
# per shard, each about 49M.
!ls -halt $rhode_island_dir/tax_w2_observer

total 16G
drwxrwsr-x 9 rmudambi IHME-Simulationscience 3.5K Apr 10 11:10 ..
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:10 tax_w2_observer_9971.parquet
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr  9 21:10 .
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:10 tax_w2_observer_9911.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:09 tax_w2_observer_9901.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:09 tax_w2_observer_99.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:09 tax_w2_observer_9888.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:08 tax_w2_observer_9872.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:08 tax_w2_observer_9871.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:08 tax_w2_observer_9859.parquet
-rw-r--r-- 1 rmudambi IHME-Simulationscience  49M Apr  9 21:07 tax_w2_observer_9847.parquet
-rw-r--r-- 1 rmudambi

In [13]:
# Same listing for the full-USA W2 files; here each numbered file is roughly
# 966M and the directory totals 315G.
!ls -halt $usa_dir/tax_w2_observer

total 315G
drwxrwsr-x 10 rmudambi IHME-Simulationscience 4.0K Apr  9 15:41 ..
-rw-r--r--  1 rmudambi IHME-Simulationscience 967M Apr  9 12:47 tax_w2_observer_2689.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 966M Apr  9 12:33 tax_w2_observer_6545.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 967M Apr  9 12:15 tax_w2_observer_9888.parquet
drwxrwsr-x  2 rmudambi IHME-Simulationscience 167K Apr  9 12:15 .
-rw-r--r--  1 rmudambi IHME-Simulationscience 966M Apr  9 12:14 tax_w2_observer_9840.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 967M Apr  9 12:14 tax_w2_observer_9901.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 965M Apr  9 12:14 tax_w2_observer_9911.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 966M Apr  9 12:14 tax_w2_observer_9871.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 966M Apr  9 12:14 tax_w2_observer_9723.parquet
-rw-r--r--  1 rmudambi IHME-Simulationscience 966M Apr  9 12:14 tax_w2_observer_9971.parquet
-rw-r--

In [32]:
# Number embedded in the file name of the shard to load, and the file extension.
seed = 5670
ext = '.parquet'


def _observer_dir_and_path(region_dir, observer, seed, ext):
    """Return ``(directory, file path)`` for one observer's output in a region.

    The directory is ``{region_dir}/{observer}`` and the file inside it is
    ``{observer}_{seed}{ext}``. Building both from the same ``observer`` string
    guarantees that the directory name and the file-name prefix always agree,
    which the previous copy-pasted templates only did by convention.
    """
    observer_dir = f'{region_dir}/{observer}'
    return observer_dir, f'{observer_dir}/{observer}_{seed}{ext}'


# Rhode Island only.
ri_w2_dir, ri_w2_path = _observer_dir_and_path(
    rhode_island_dir, 'tax_w2_observer', seed, ext)
ri_census_dir, ri_census_path = _observer_dir_and_path(
    rhode_island_dir, 'decennial_census_observer', seed, ext)
ri_acs_dir, ri_acs_path = _observer_dir_and_path(
    rhode_island_dir, 'household_survey_observer_acs', seed, ext)

# Full USA.
usa_w2_dir, usa_w2_path = _observer_dir_and_path(
    usa_dir, 'tax_w2_observer', seed, ext)
usa_census_dir, usa_census_path = _observer_dir_and_path(
    usa_dir, 'decennial_census_observer', seed, ext)
usa_acs_dir, usa_acs_path = _observer_dir_and_path(
    usa_dir, 'household_survey_observer_acs', seed, ext)

# Try generating noised W2 data for RI from the parquet file

In [18]:
ri_w2_path

'/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04/final_results/parquet/states/rhode_island/tax_w2_observer/tax_w2_observer_5670.parquet'

In [20]:
%%time
# Generate the noised W2/1099 dataset from the Rhode Island shard at
# `ri_w2_path`; no configuration override is passed.
df_ri_w2_noisy = pp.generate_taxes_w2_and_1099(ri_w2_path)
df_ri_w2_noisy

CPU times: user 17.5 s, sys: 4.81 s, total: 22.3 s
Wall time: 22.3 s


Unnamed: 0,mailing_address_street_name,simulant_id,mailing_address_unit_number,mailing_address_zipcode,tax_year,mailing_address_street_number,date_of_birth,employer_street_name,ssn,employer_id,...,employer_street_number,income,employer_zipcode,first_name,age,employer_state,employer_unit_number,mailing_address_city,mailing_address_state,middle_initial
645,chell road,5670_668,,02881,2019,9100,1974-09-23 00:00:00,e 116 st,730-78-3410,39721,...,26875,16928,95835,Jose,45,CA,,warren,RI,M
884,delbert avenue,5670_964,,02915,2019,5525,1964-09-17 00:00:00,westdale rd nw,889-15-3299,153658,...,607,24677,70180,Tony,55,LA,,providence,RI,P
997,redstone ct,5670_1082,,02864,2019,1004,1960-02-03 00:00:00,randall st,530-84-2449,340968,...,1531,82625,08753,Leona,59,NJ,,cranston,RI,M
1222,n hills blvd,5670_1309,,02909,2019,445,1964-02-26 00:00:00,trinity hls lane,072-23-2212,735259,...,3689,9926,85713,April,55,AZ,,providence,AR,W
1223,n hills blvd,5670_1309,,02909,2019,445,1964-02-26 00:00:00,sparta oaks dr,072-23-2212,1338401,...,5835,6134,81321,April,55,CO,,providence,RI,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22383502,s mobile way,5670_1386266,,02886,2040,151,1986-07-10 00:00:00,oaklawn,428-45-0772,1188968,...,,16882,17701,Robert,54,PA,,west warwick,RI,P
22384285,28th avenue,5670_1390271,,02893,2040,10700,1978-05-13 00:00:00,e 42nd st,187-35-0120,1307191,...,4655,3064,84120,Johanna,62,UT,,south kingstown,RI,S
22384596,state route 11,5670_1391736,,02895,2040,9444,1979-06-19 00:00:00,sw 215th ter,842-31-6424,1703745,...,,474,14437,Rebecca,61,NY,,providence,RI,B
22384731,grace dr nw,5670_1391962,,02908,2040,8430,2005-04-10 00:00:00,s frazier st,211-04-1068,1420370,...,25515,6261,12523,Sophia,35,NY,,westerly,RI,L


In [21]:
df_ri_w2_noisy.dtypes

mailing_address_street_name        object
simulant_id                      category
mailing_address_unit_number        object
mailing_address_zipcode            object
tax_year                            int64
mailing_address_street_number      object
date_of_birth                      object
employer_street_name               object
ssn                                object
employer_id                        object
last_name                          object
employer_city                      object
mailing_address_po_box             object
employer_name                      object
tax_form                           object
employer_street_number             object
income                             object
employer_zipcode                   object
first_name                         object
age                                object
employer_state                     object
employer_unit_number               object
mailing_address_city               object
mailing_address_state             

In [24]:
sizemb(df_ri_w2_noisy)

204.441694

# Load unnoised W2 data for RI

In [22]:
%%time
# Read the same parquet file directly with pandas (no pseudopeople involved) to
# get the un-noised data for comparison with `df_ri_w2_noisy`.
df_ri_w2 = pd.read_parquet(ri_w2_path)
df_ri_w2

CPU times: user 4.69 s, sys: 1.26 s, total: 5.94 s
Wall time: 5.96 s


Unnamed: 0,mailing_address_street_name,simulant_id,mailing_address_unit_number,mailing_address_zipcode,tax_year,mailing_address_street_number,date_of_birth,employer_street_name,ssn,employer_id,...,employer_street_number,income,employer_zipcode,first_name,age,employer_state,employer_unit_number,mailing_address_city,mailing_address_state,middle_initial
645,chell road,5670_668,,02881,2019,9100,1974-09-23,e 116 st,730-78-3410,39721,...,26875,16928,95835,Jose,45,CA,,warren,RI,M
884,delbert avenue,5670_964,,02915,2019,5525,1964-09-17,westdale rd nw,889-15-3299,153658,...,607,24677,70080,Tony,55,LA,,providence,RI,P
997,redstone ct,5670_1082,,02864,2019,1004,1960-02-03,randall st,530-84-2449,340968,...,1531,82625,08753,Leona,59,NJ,,cranston,RI,M
1222,n hills blvd,5670_1309,,02909,2019,445,1964-02-26,trinity hls lane,072-23-2212,735259,...,3689,9926,85713,April,55,AZ,,providence,RI,W
1223,n hills blvd,5670_1309,,02909,2019,445,1964-02-26,sparta oaks dr,072-23-2212,1338401,...,5835,6134,81321,April,55,CO,,providence,RI,W
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22383502,s mobile way,5670_1386266,,02886,2040,151,1986-07-10,oaklawn,428-45-0772,1188968,...,24,16882,17701,Robert,54,PA,,west warwick,RI,P
22384285,28th avenue,5670_1390271,,02893,2040,10700,1978-05-13,e 42nd st,187-35-0120,1307191,...,4655,3064,84120,Johanna,62,UT,,south kingstown,RI,S
22384596,state route 11,5670_1391736,,02895,2040,9444,1979-06-19,sw 215th ter,842-31-6424,1703745,...,,474,14437,Rebecca,61,NY,,providence,RI,B
22384731,grace dr nw,5670_1391962,,02908,2040,8430,2005-04-10,s frazier st,211-04-1068,1420370,...,2555,6261,12523,Sophia,35,NY,,westerly,RI,L


In [23]:
df_ri_w2.dtypes

mailing_address_street_name            category
simulant_id                            category
mailing_address_unit_number            category
mailing_address_zipcode                category
tax_year                                  int64
mailing_address_street_number          category
date_of_birth                    datetime64[ns]
employer_street_name                   category
ssn                                    category
employer_id                               int64
last_name                              category
employer_city                          category
mailing_address_po_box                    int64
employer_name                          category
tax_form                               category
employer_street_number                 category
income                                    int64
employer_zipcode                       category
first_name                             category
age                                       int64
employer_state                         c

In [25]:
sizemb(df_ri_w2)

579.203098

In [26]:
sizemb(df_ri_w2.ssn)

127.17426

In [27]:
sizemb(df_ri_w2_noisy.ssn)

6.957732

In [28]:
sizemb(df_ri_w2_noisy.simulant_id)

114.383364

In [30]:
sizemb(df_ri_w2_noisy.employer_id)

4.966405

In [31]:
sizemb(df_ri_w2_noisy.employer_name)

7.880263

# Load unnoised W2 data for USA

In [33]:
usa_w2_path

'/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v2.0_three_censuses/united_states_of_america/2023_04_08_14_07_04/final_results/parquet/usa/tax_w2_observer/tax_w2_observer_5670.parquet'

In [34]:
%%time
# Read the full-USA shard directly with pandas to get the un-noised data.
df_usa_w2 = pd.read_parquet(usa_w2_path)
df_usa_w2

CPU times: user 10.9 s, sys: 5.28 s, total: 16.2 s
Wall time: 18.2 s


Unnamed: 0,mailing_address_street_name,simulant_id,mailing_address_unit_number,mailing_address_zipcode,tax_year,mailing_address_street_number,date_of_birth,employer_street_name,ssn,employer_id,...,employer_street_number,income,employer_zipcode,first_name,age,employer_state,employer_unit_number,mailing_address_city,mailing_address_state,middle_initial
0,northview dr ne,5670_0,,17013,2019,235,1953-01-19,edgecliff ct,863-79-9332,923417,...,,28043,93101,Phyllis,66,CA,,harrisburg,PA,D
1,northview dr ne,5670_0,,17013,2019,235,1953-01-19,n 52nd st,863-79-9332,1258181,...,309,12141,37013,Phyllis,66,TN,,harrisburg,PA,D
2,tivoli ln,5670_2,,91601,2019,118,1979-01-30,skyview ter,801-14-2038,1631549,...,1960,13517,85308,Rebecca,40,AZ,unit 170,visalia,CA,C
3,tivoli ln,5670_3,,91601,2019,118,1979-04-10,ince dr,801-14-2038,1358821,...,e,13952,85006,Richard,40,AZ,,visalia,CA,D
4,oakview ln,5670_4,,30005,2019,,2001-01-21,stoney crk cir,509-76-7953,1,...,,222,45459,Owen,18,OH,,johns creek,GA,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22384735,old nat'l highway,5670_1391966,,27603,2040,8011,2005-05-18,west lexington stre,466-41-2227,27816,...,1507,2195,37909,Robert,35,TN,,mooresville,NC,K
22384736,n 4th st,5670_1391970,,33401,2040,49,1999-07-03,s oriole av,476-71-8945,1350023,...,1116,12961,89506,Meaghan,41,NV,,lehigh acres,FL,C
22384737,watervliet avnu,5670_1391971,,60612,2040,105,2007-09-23,n browns station dr,229-73-1457,372782,...,115,15346,33308,Anthony,33,FL,,lawrenceville,IL,J
22384738,watervliet avnu,5670_1391972,,60612,2040,105,2011-11-05,southeast 22nd street,075-82-5964,432807,...,3388,1101,21755,Elijah,29,MD,,lawrenceville,IL,E


In [35]:
df_usa_w2.dtypes

mailing_address_street_name            category
simulant_id                            category
mailing_address_unit_number            category
mailing_address_zipcode                category
tax_year                                  int64
mailing_address_street_number          category
date_of_birth                    datetime64[ns]
employer_street_name                   category
ssn                                    category
employer_id                               int64
last_name                              category
employer_city                          category
mailing_address_po_box                    int64
employer_name                          category
tax_form                               category
employer_street_number                 category
income                                    int64
employer_zipcode                       category
first_name                             category
age                                       int64
employer_state                         c

In [36]:
sizemb(df_usa_w2) # 2.8 GB for the unnoised USA W2 data

2810.76879

In [37]:
# Back-of-envelope scale-up of the ~2.8 GB measured above for one USA file.
# NOTE(review): 334 is presumably the number of USA shard files -- TODO confirm.
# If so, the result is the total in GB.
2.8*334

935.1999999999999

In [38]:
type(df_usa_w2)

pandas.core.frame.DataFrame

In [45]:
sizemb(df_usa_w2.first_name)

45.809479

In [61]:
%time sizemb(df_usa_w2.simulant_id)

CPU times: user 171 ms, sys: 7.7 ms, total: 179 ms
Wall time: 178 ms


201.041272

In [62]:
# Size of the same column after casting from categorical to plain str. The saved
# outputs show ~1521 versus ~201 as a categorical, and the cast alone takes
# several seconds.
%time sizemb(df_usa_w2.simulant_id.astype(str))

CPU times: user 6.13 s, sys: 2.64 s, total: 8.77 s
Wall time: 8.76 s


1521.30808

# Generate noised W2 data for USA

In [39]:
%%time
# Noise the full-USA shard with no configuration override. The recorded run took
# about 11.5 minutes of wall time, versus about 22 s for the Rhode Island shard.
df_usa_w2_noisy = pp.generate_taxes_w2_and_1099(usa_w2_path)
df_usa_w2_noisy

CPU times: user 9min 43s, sys: 1min 58s, total: 11min 41s
Wall time: 11min 32s


Unnamed: 0,mailing_address_street_name,simulant_id,mailing_address_unit_number,mailing_address_zipcode,tax_year,mailing_address_street_number,date_of_birth,employer_street_name,ssn,employer_id,...,employer_street_number,income,employer_zipcode,first_name,age,employer_state,employer_unit_number,mailing_address_city,mailing_address_state,middle_initial
0,northview dr ne,5670_0,,,2019,235,1953-01-19 00:00:00,edgecliff ct,863-79-9332,923417,...,,28043,93101,Phyllis,66,CA,,harrisburg,PA,D
1,northview dr ne,5670_0,,17013,2019,235,1953-01-19 00:00:00,n 52nd st,863-79-9332,1258181,...,309,12141,37013,Phyllis,66,TN,,harrisburg,PA,D
2,tivoli ln,5670_2,,91601,2019,118,1979-01-30 00:00:00,skyview ter,801-14-2038,1631549,...,1960,13517,85308,Rebecca,40,AZ,unit 170,visalia,CA,C
3,tivoli ln,5670_3,,91601,2019,118,1979-04-10 00:00:00,ince dr,801-14-2038,1358821,...,e,13952,85006,Richard,40,AZ,,visalia,CA,D
4,oakview ln,5670_4,,30005,2019,,2001-01-21 00:00:00,stoney crk cir,509-76-7953,1,...,,222,45459,Owen,18,OH,,johns creek,,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22384735,old nat'l highway,5670_1391966,,27603,2040,8011,2005-05-18 00:00:00,,466-41-2227,27816,...,1507,2195,37909,Robert,35,TN,,mooresville,NC,K
22384736,n 4th st,5670_1391970,,33401,2040,49,1999-07-03 00:00:00,s oriole av,476-71-8945,1350023,...,1116,12961,89506,Meaghan,41,NV,,lehigh acres,FL,C
22384737,watervliet avnu,5670_1391971,,60612,2040,105,2007-09-23 00:00:00,n browns station dr,229-73-1457,372782,...,115,15346,33308,Anthony,33,FL,,lawrenceville,IL,J
22384738,watervliet avnu,5670_1391972,,60612,2040,105,2011-11-05 00:00:00,southeast 22nd street,075-82-5964,432807,...,3388,1101,21755,Elijah,29,MD,,lawrenceville,IL,E


In [40]:
df_usa_w2_noisy.dtypes

mailing_address_street_name        object
simulant_id                      category
mailing_address_unit_number        object
mailing_address_zipcode            object
tax_year                            int64
mailing_address_street_number      object
date_of_birth                      object
employer_street_name               object
ssn                                object
employer_id                        object
last_name                          object
employer_city                      object
mailing_address_po_box             object
employer_name                      object
tax_form                           object
employer_street_number             object
income                             object
employer_zipcode                   object
first_name                         object
age                                object
employer_state                     object
employer_unit_number               object
mailing_address_city               object
mailing_address_state             

In [42]:
%%time
sizemb(df_usa_w2_noisy) # 31.7 GB for the noised USA W2 data

CPU times: user 52.1 s, sys: 3.25 s, total: 55.4 s
Wall time: 55.2 s


31691.982976

In [43]:
%time sizemb(df_usa_w2_noisy.simulant_id)

CPU times: user 163 ms, sys: 19.6 ms, total: 182 ms
Wall time: 181 ms


201.041272

In [44]:
%time sizemb(df_usa_w2_noisy.employer_id)

CPU times: user 3.24 s, sys: 159 ms, total: 3.4 s
Wall time: 3.37 s


818.948468

In [46]:
%time sizemb(df_usa_w2_noisy.first_name)

CPU times: user 3.25 s, sys: 70.8 ms, total: 3.32 s
Wall time: 3.3 s


1404.080518

# Check configuration

In [49]:
# Fetch the pseudopeople configuration (no arguments passed) and display the
# section that applies to the W2/1099 dataset.
config = get_configuration()
config.taxes_w2_and_1099

omission:
    base: 0.0
        source: initial data
duplication:
    base: 0.05
        source: initial data
age:
    missing_data:
        row_noise_level:
            base: 0.01
                source: initial data
    typographic:
        row_noise_level:
            base: 0.01
                source: initial data
        token_noise_level:
            base: 0.1
                source: initial data
        include_original_token_level:
            base: 0.1
                source: initial data
    age_miswriting:
        row_noise_level:
            base: 0.01
                source: initial data
        token_noise_level:
            base: 0.1
                source: initial data
        possible_perturbations:
            1:
                base: 0.5
                    source: initial data
            -1:
                base: 0.5
                    source: initial data
date_of_birth:
    missing_data:
        row_noise_level:
            base: 0.01
                source: init

In [53]:
# Names of the W2/1099 columns whose configuration has a 'typographic' entry.
# 'omission' and 'duplication' are filtered out first: they hold a bare noise
# level rather than per-noise-type settings, so the membership test on their
# value must never be reached.
[
    column
    for column, column_config in config.taxes_w2_and_1099.items()
    if column not in ['omission', 'duplication']
    if 'typographic' in column_config
]

['age',
 'date_of_birth',
 'employer_city',
 'employer_id',
 'employer_name',
 'employer_street_name',
 'employer_street_number',
 'employer_unit_number',
 'employer_zipcode',
 'first_name',
 'income',
 'last_name',
 'mailing_address_city',
 'mailing_address_street_name',
 'mailing_address_street_number',
 'mailing_address_unit_number',
 'mailing_address_po_box',
 'mailing_address_zipcode',
 'middle_initial',
 'ssn']

# Try generating USA W2 data with all noise except typographic

In [54]:
config.taxes_w2_and_1099.income

missing_data:
    row_noise_level:
        base: 0.01
            source: initial data
typographic:
    row_noise_level:
        base: 0.01
            source: initial data
    token_noise_level:
        base: 0.1
            source: initial data
    include_original_token_level:
        base: 0.1
            source: initial data
numeric_miswriting:
    row_noise_level:
        base: 0.01
            source: initial data
    token_noise_level:
        base: 0.1
            source: initial data

In [58]:
# Work with the W2/1099 section of the configuration in plain-dict form.
w2_config = config.taxes_w2_and_1099.to_dict()

# Columns that have a 'typographic' noise entry. The two row-level keys are
# skipped first because their values are bare numbers, not per-noise-type dicts.
cols_with_typos = [
    name
    for name, settings in w2_config.items()
    if name not in ['omission', 'duplication']
    if 'typographic' in settings
]
print(cols_with_typos)

# Switch typographic noise off for each of those columns by zeroing its row
# noise level; every other noise type keeps its existing setting.
for col in cols_with_typos:
    w2_config[col]['typographic']['row_noise_level'] = 0

w2_config

['age', 'date_of_birth', 'employer_city', 'employer_id', 'employer_name', 'employer_street_name', 'employer_street_number', 'employer_unit_number', 'employer_zipcode', 'first_name', 'income', 'last_name', 'mailing_address_city', 'mailing_address_street_name', 'mailing_address_street_number', 'mailing_address_unit_number', 'mailing_address_po_box', 'mailing_address_zipcode', 'middle_initial', 'ssn']


{'omission': 0.0,
 'duplication': 0.05,
 'age': {'missing_data': {'row_noise_level': 0.01},
  'typographic': {'row_noise_level': 0,
   'token_noise_level': 0.1,
   'include_original_token_level': 0.1},
  'age_miswriting': {'row_noise_level': 0.01,
   'token_noise_level': 0.1,
   'possible_perturbations': {1: 0.5, -1: 0.5}}},
 'date_of_birth': {'missing_data': {'row_noise_level': 0.01},
  'typographic': {'row_noise_level': 0,
   'token_noise_level': 0.1,
   'include_original_token_level': 0.1},
  'numeric_miswriting': {'row_noise_level': 0.01, 'token_noise_level': 0.1}},
 'employer_city': {'missing_data': {'row_noise_level': 0.01},
  'typographic': {'row_noise_level': 0,
   'token_noise_level': 0.1,
   'include_original_token_level': 0.1}},
 'employer_id': {'missing_data': {'row_noise_level': 0.01},
  'typographic': {'row_noise_level': 0,
   'token_noise_level': 0.1,
   'include_original_token_level': 0.1},
  'numeric_miswriting': {'row_noise_level': 0.01, 'token_noise_level': 0.1}},
 '

In [60]:
%%time
# Re-run the USA noising with the modified W2/1099 settings in `w2_config`
# (typographic row_noise_level set to 0 for every column that has it).
# The recorded run took about 7.3 minutes of wall time, versus about 11.5
# minutes for the run without a configuration override.
df_usa_w2_noisy_no_typos = pp.generate_taxes_w2_and_1099(
    usa_w2_path,
    configuration={'taxes_w2_and_1099': w2_config}
)
df_usa_w2_noisy_no_typos

CPU times: user 5min 27s, sys: 1min 50s, total: 7min 18s
Wall time: 7min 17s


Unnamed: 0,mailing_address_street_name,simulant_id,mailing_address_unit_number,mailing_address_zipcode,tax_year,mailing_address_street_number,date_of_birth,employer_street_name,ssn,employer_id,...,employer_street_number,income,employer_zipcode,first_name,age,employer_state,employer_unit_number,mailing_address_city,mailing_address_state,middle_initial
0,northview dr ne,5670_0,,,2019,235,1953-01-19 00:00:00,edgecliff ct,863-79-9332,923417,...,,28043,93101,Phyllis,66,CA,,harrisburg,PA,D
1,northview dr ne,5670_0,,17013,2019,235,1953-01-19 00:00:00,n 52nd st,863-79-9332,1258181,...,309,12141,37013,Phyllis,66,TN,,harrisburg,PA,D
2,tivoli ln,5670_2,,91601,2019,118,1979-01-30 00:00:00,skyview ter,801-14-2038,1631549,...,1960,13517,85308,Rebecca,40,AZ,unit 170,visalia,CA,C
3,tivoli ln,5670_3,,91601,2019,118,1979-04-10 00:00:00,ince dr,801-14-2038,1358821,...,e,13952,85006,Richard,40,AZ,,visalia,CA,D
4,oakview ln,5670_4,,30005,2019,,2001-01-21 00:00:00,stoney crk cir,509-76-7953,1,...,,222,45459,Owen,18,OH,,johns creek,,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22384735,old nat'l highway,5670_1391966,,27603,2040,8011,2005-05-18 00:00:00,,466-41-2227,27816,...,1507,2195,37909,Robert,35,TN,,mooresville,NC,K
22384736,n 4th st,5670_1391970,,33401,2040,49,1999-07-03 00:00:00,s oriole av,476-71-8945,1350023,...,1116,12961,89506,Meaghan,41,NV,,lehigh acres,FL,C
22384737,watervliet avnu,5670_1391971,,60612,2040,105,2007-09-23 00:00:00,n browns station dr,229-73-1457,372782,...,115,15346,33308,Anthony,33,FL,,lawrenceville,IL,J
22384738,watervliet avnu,5670_1391972,,60612,2040,105,2011-11-05 00:00:00,southeast 22nd street,075-82-5964,432807,...,3388,1101,21755,Elijah,29,MD,,lawrenceville,IL,E


# Look for missing values in original USA W2 data

In [63]:
%%time
df_usa_w2.isna().sum()

CPU times: user 987 ms, sys: 336 ms, total: 1.32 s
Wall time: 1.32 s


mailing_address_street_name      0
simulant_id                      0
mailing_address_unit_number      0
mailing_address_zipcode          0
tax_year                         0
mailing_address_street_number    0
date_of_birth                    0
employer_street_name             0
ssn                              0
employer_id                      0
last_name                        0
employer_city                    0
mailing_address_po_box           0
employer_name                    0
tax_form                         0
employer_street_number           0
income                           0
employer_zipcode                 0
first_name                       0
age                              0
employer_state                   0
employer_unit_number             0
mailing_address_city             0
mailing_address_state            0
middle_initial                   0
dtype: int64

In [64]:
%%time
# The isna() check found no missing values, so count empty strings instead: in
# the un-noised data, absent values (e.g. unit numbers) are stored as ''.
(df_usa_w2 == '').sum()

CPU times: user 1.22 s, sys: 127 ms, total: 1.35 s
Wall time: 1.34 s


mailing_address_street_name        715542
simulant_id                             0
mailing_address_unit_number      20930578
mailing_address_zipcode                 0
tax_year                                0
mailing_address_street_number     1913238
date_of_birth                           0
employer_street_name                    0
ssn                                     0
employer_id                             0
last_name                               0
employer_city                           0
mailing_address_po_box                  0
employer_name                           0
tax_form                                0
employer_street_number            1357412
income                                  0
employer_zipcode                        0
first_name                              0
age                                     0
employer_state                          0
employer_unit_number             21157859
mailing_address_city                    0
mailing_address_state             

In [65]:
seed

5670

In [68]:
%%time
# NOTE(review): as saved, this cell raised
#   AttributeError: 'bool' object has no attribute 'ndim'
# The two frames have different dtypes (mostly categorical columns here versus
# object columns in the noised frame, per their dtype listings), which is
# presumably the cause -- TODO confirm. A later cell casts the categorical
# columns to str before repeating this comparison.
(df_usa_w2 != df_usa_w2_noisy).sum()

AttributeError: 'bool' object has no attribute 'ndim'

In [71]:
%%time
# NOTE(review): every visible entry of this result is False, even though the two
# columns display identical values when shown side by side. The un-noised
# column is int64 while the noised one is object dtype, so this is presumably an
# int-versus-str mismatch rather than real noise -- TODO confirm.
df_usa_w2.employer_id == df_usa_w2_noisy.employer_id

CPU times: user 10.4 ms, sys: 8.57 ms, total: 18.9 ms
Wall time: 18.2 ms


0           False
1           False
2           False
3           False
4           False
            ...  
22384735    False
22384736    False
22384737    False
22384738    False
22384739    False
Name: employer_id, Length: 22384740, dtype: bool

In [72]:
pd.concat([df_usa_w2.employer_id, df_usa_w2_noisy.employer_id], axis=1)

Unnamed: 0,employer_id,employer_id.1
0,923417,923417
1,1258181,1258181
2,1631549,1631549
3,1358821,1358821
4,1,1
...,...,...
22384735,27816,27816
22384736,1350023,1350023
22384737,372782,372782
22384738,432807,432807


In [69]:
df_usa_w2.dtypes

mailing_address_street_name            category
simulant_id                            category
mailing_address_unit_number            category
mailing_address_zipcode                category
tax_year                                  int64
mailing_address_street_number          category
date_of_birth                    datetime64[ns]
employer_street_name                   category
ssn                                    category
employer_id                               int64
last_name                              category
employer_city                          category
mailing_address_po_box                    int64
employer_name                          category
tax_form                               category
employer_street_number                 category
income                                    int64
employer_zipcode                       category
first_name                             category
age                                       int64
employer_state                         c

In [70]:
df_usa_w2_noisy.dtypes

mailing_address_street_name        object
simulant_id                      category
mailing_address_unit_number        object
mailing_address_zipcode            object
tax_year                            int64
mailing_address_street_number      object
date_of_birth                      object
employer_street_name               object
ssn                                object
employer_id                        object
last_name                          object
employer_city                      object
mailing_address_po_box             object
employer_name                      object
tax_form                           object
employer_street_number             object
income                             object
employer_zipcode                   object
first_name                         object
age                                object
employer_state                     object
employer_unit_number               object
mailing_address_city               object
mailing_address_state             

In [73]:
df_usa_w2.dtypes.loc[df_usa_w2.dtypes != 'category']

tax_year                           int64
date_of_birth             datetime64[ns]
employer_id                        int64
mailing_address_po_box             int64
income                             int64
age                                int64
dtype: object

In [75]:
%%time
# Take a copy first. convert_dtypes is called without using a return value, so
# it appears to modify its argument in place -- TODO confirm.
%time df_usa_w2_noisy_cat = df_usa_w2_noisy.copy()
# NOTE(review): the saved output contains
#   TypeError: float() argument must be a string or a number, not 'NAType'
# yet the dtypes are still displayed afterwards, with several columns (e.g. ssn,
# employer_id, income, age) left as object. The conversion looks incomplete --
# TODO investigate.
%time datatypes.convert_dtypes(df_usa_w2_noisy_cat)
df_usa_w2_noisy_cat.dtypes

CPU times: user 27.1 s, sys: 14.8 s, total: 41.9 s
Wall time: 41.8 s


TypeError: float() argument must be a string or a number, not 'NAType'

CPU times: user 3min 49s, sys: 1min 4s, total: 4min 54s
Wall time: 4min 53s


mailing_address_street_name      category
simulant_id                        object
mailing_address_unit_number      category
mailing_address_zipcode          category
tax_year                         category
mailing_address_street_number    category
date_of_birth                    category
employer_street_name             category
ssn                                object
employer_id                        object
last_name                        category
employer_city                    category
mailing_address_po_box           category
employer_name                    category
tax_form                         category
employer_street_number           category
income                             object
employer_zipcode                 category
first_name                       category
age                                object
employer_state                   category
employer_unit_number             category
mailing_address_city             category
mailing_address_state            c

In [77]:
df_usa_w2_noisy.loc[df_usa_w2_noisy.employer_name.isna(), 'employer_name']

119         <NA>
159         <NA>
284         <NA>
322         <NA>
323         <NA>
            ... 
22383756    <NA>
22383849    <NA>
22383946    <NA>
22384041    <NA>
22384282    <NA>
Name: employer_name, Length: 222917, dtype: object

In [78]:
df_usa_w2_noisy.employer_name.astype(str)

0                                   Jam CLT Cell
1                          East End Times Square
2                               Dewdney Car care
3                   Hickory Station 55 South Bay
4                                       Military
                            ...                 
22384735                 Fair City Vision Center
22384736                          Golden's Paint
22384737                  Tunxis Hill Counseling
22384738                             Robert E DC
22384739    Alan T And Care Clinic Cancer Center
Name: employer_name, Length: 22384740, dtype: object

In [81]:
[(k, v.iloc[0]) for k,v in df_usa_w2.items()]

[('mailing_address_street_name', 'northview dr ne'),
 ('simulant_id', '5670_0'),
 ('mailing_address_unit_number', ''),
 ('mailing_address_zipcode', '17013'),
 ('tax_year', 2019),
 ('mailing_address_street_number', '235'),
 ('date_of_birth', Timestamp('1953-01-19 00:00:00')),
 ('employer_street_name', 'edgecliff ct'),
 ('ssn', '863-79-9332'),
 ('employer_id', 923417),
 ('last_name', 'Jones'),
 ('employer_city', 'berkeley'),
 ('mailing_address_po_box', 0),
 ('employer_name', 'Jam CLT Cell'),
 ('tax_form', 'W2'),
 ('employer_street_number', ''),
 ('income', 28043),
 ('employer_zipcode', '93101'),
 ('first_name', 'Phyllis'),
 ('age', 66),
 ('employer_state', 'CA'),
 ('employer_unit_number', ''),
 ('mailing_address_city', 'harrisburg'),
 ('mailing_address_state', 'PA'),
 ('middle_initial', 'D')]

In [82]:
%%time
def convert_col_to_str_if_cat(colname, df):
    """Return column ``colname`` of ``df``, cast to ``str`` if it is categorical.

    Parameters
    ----------
    colname
        Label of the column to extract from ``df``.
    df
        DataFrame containing the column.

    Returns
    -------
    pandas.Series
        ``df[colname].astype(str)`` when the column's dtype compares equal to
        ``'category'``; otherwise the column itself, unchanged.
    """
    column = df[colname]
    # Only categorical columns are converted; everything else passes through.
    if df.dtypes[colname] == 'category':
        return column.astype(str)
    return column

%time new_cols = [convert_col_to_str_if_cat(colname, df_usa_w2) for colname in df_usa_w2]
%time df_usa_w2_str = pd.concat(new_cols, axis=1)
df_usa_w2_str.dtypes

CPU times: user 1min 21s, sys: 1min 12s, total: 2min 34s
Wall time: 2min 33s
CPU times: user 21.8 s, sys: 9.55 s, total: 31.4 s
Wall time: 31.3 s
CPU times: user 1min 43s, sys: 1min 22s, total: 3min 5s
Wall time: 3min 5s


mailing_address_street_name              object
simulant_id                              object
mailing_address_unit_number              object
mailing_address_zipcode                  object
tax_year                                  int64
mailing_address_street_number            object
date_of_birth                    datetime64[ns]
employer_street_name                     object
ssn                                      object
employer_id                               int64
last_name                                object
employer_city                            object
mailing_address_po_box                    int64
employer_name                            object
tax_form                                 object
employer_street_number                   object
income                                    int64
employer_zipcode                         object
first_name                               object
age                                       int64
employer_state                          

In [83]:
%%time
(df_usa_w2_str != df_usa_w2_noisy).sum()

CPU times: user 1min 6s, sys: 12.1 s, total: 1min 18s
Wall time: 1min 18s


mailing_address_street_name        358541
simulant_id                             0
mailing_address_unit_number         35982
mailing_address_zipcode            469564
tax_year                                0
mailing_address_street_number      471128
date_of_birth                      665653
employer_street_name               369739
ssn                                494609
employer_id                      22384740
last_name                          553057
employer_city                      352521
mailing_address_po_box           22384740
employer_name                      408523
tax_form                           335557
employer_street_number             482809
income                           22384740
employer_zipcode                   469398
first_name                         547088
age                              22384740
employer_state                     442794
employer_unit_number                30357
mailing_address_city               351495
mailing_address_state             

In [86]:
# NOTE: this raises TypeError -- the noised `income` column contains pd.NA,
# which float() cannot convert (see the traceback below).
df_usa_w2_noisy.income.astype(float) == df_usa_w2_str.income

TypeError: float() argument must be a string or a number, not 'NAType'

In [87]:
%%time
df_usa_w2_noisy.income.loc[df_usa_w2_noisy.income.notna()].str.isdigit().sum()

CPU times: user 20.5 s, sys: 917 ms, total: 21.4 s
Wall time: 21.4 s


219063

In [88]:
%%time
df_usa_w2_noisy.income.loc[df_usa_w2_noisy.income.notna()].str.isdigit().all()

CPU times: user 19.9 s, sys: 728 ms, total: 20.7 s
Wall time: 20.7 s


False

In [89]:
%%time
# Cast every column of the un-noised W2 frame to str in one call (this also
# stringifies the integer and datetime columns), then show the resulting dtypes.
df_usa_w2_str2 = df_usa_w2.astype(str)
df_usa_w2_str2.dtypes

CPU times: user 2min 7s, sys: 1min 16s, total: 3min 23s
Wall time: 3min 23s


mailing_address_street_name      object
simulant_id                      object
mailing_address_unit_number      object
mailing_address_zipcode          object
tax_year                         object
mailing_address_street_number    object
date_of_birth                    object
employer_street_name             object
ssn                              object
employer_id                      object
last_name                        object
employer_city                    object
mailing_address_po_box           object
employer_name                    object
tax_form                         object
employer_street_number           object
income                           object
employer_zipcode                 object
first_name                       object
age                              object
employer_state                   object
employer_unit_number             object
mailing_address_city             object
mailing_address_state            object
middle_initial                   object


In [91]:
df_usa_w2_noisy.columns.to_list()

['mailing_address_street_name',
 'simulant_id',
 'mailing_address_unit_number',
 'mailing_address_zipcode',
 'tax_year',
 'mailing_address_street_number',
 'date_of_birth',
 'employer_street_name',
 'ssn',
 'employer_id',
 'last_name',
 'employer_city',
 'mailing_address_po_box',
 'employer_name',
 'tax_form',
 'employer_street_number',
 'income',
 'employer_zipcode',
 'first_name',
 'age',
 'employer_state',
 'employer_unit_number',
 'mailing_address_city',
 'mailing_address_state',
 'middle_initial']

In [92]:
%%time
# Collect the noised columns, stringifying only `simulant_id` and `tax_year`;
# every other column is passed through unchanged.
# NOTE(review): the dtypes below all print as `object`, but that does not mean
# every value is a str -- e.g. `income` can still hold raw ints at some rows,
# which makes it compare unequal to an all-str frame. Confirm this is intended.
new_cols = [df_usa_w2_noisy[colname] for colname in df_usa_w2_noisy]
for i, col in enumerate(new_cols):
    if col.name in ['simulant_id', 'tax_year']:
        new_cols[i] = col.astype(str)
df_usa_w2_noisy_str = pd.concat(new_cols, axis=1)
df_usa_w2_noisy_str.dtypes

CPU times: user 19.1 s, sys: 9.94 s, total: 29.1 s
Wall time: 29 s


mailing_address_street_name      object
simulant_id                      object
mailing_address_unit_number      object
mailing_address_zipcode          object
tax_year                         object
mailing_address_street_number    object
date_of_birth                    object
employer_street_name             object
ssn                              object
employer_id                      object
last_name                        object
employer_city                    object
mailing_address_po_box           object
employer_name                    object
tax_form                         object
employer_street_number           object
income                           object
employer_zipcode                 object
first_name                       object
age                              object
employer_state                   object
employer_unit_number             object
mailing_address_city             object
mailing_address_state            object
middle_initial                   object


In [93]:
def percent_different_in_columns(df1, df2):
    """Per-column percentage of rows whose values differ between two frames.

    The frames are compared elementwise with ``!=``; the count of unequal
    cells in each column is expressed as a percentage of ``len(df1)``.

    Returns
    -------
    pandas.Series
        Indexed by column label, values in percent.
    """
    n_rows = len(df1)
    mismatches = df1 != df2
    mismatch_counts = mismatches.sum()
    return 100 * mismatch_counts / n_rows

def percent_of_rows_with_difference(df1, df2):
    """Percentage of rows in which at least one column differs.

    The frames are compared elementwise with ``!=``; a row counts as changed
    when any of its cells is unequal. The result is a percentage of
    ``len(df1)``.
    """
    cell_differs = df1 != df2
    row_differs = cell_differs.any(axis=1)
    n_changed_rows = row_differs.sum()
    return 100 * n_changed_rows / len(df1)

In [101]:
def compare_columns(df1, df2, colname, notna=False):
    """Show the rows where column ``colname`` differs between two frames.

    Parameters
    ----------
    df1, df2
        DataFrames that both contain ``colname``.
    colname
        Label of the column to compare.
    notna
        When truthy, restrict the comparison to rows where the column is
        non-missing in *both* frames; when falsy, compare every row.

    Returns
    -------
    pandas.DataFrame
        Result of ``Series.compare`` on the two (optionally filtered) columns.
    """
    left = df1[colname]
    right = df2[colname]
    if notna:
        # Keep only rows that are present on both sides before comparing.
        both_present = left.notna() & right.notna()
        left = left.loc[both_present]
        right = right.loc[both_present]
    return left.compare(right)

In [95]:
%%time
percent_different_in_columns(df_usa_w2_noisy_str, df_usa_w2_str2)

CPU times: user 46.6 s, sys: 9.64 s, total: 56.2 s
Wall time: 56.1 s


mailing_address_street_name        1.601721
simulant_id                        0.000000
mailing_address_unit_number        0.160743
mailing_address_zipcode            2.097697
tax_year                           0.000000
mailing_address_street_number      2.104684
date_of_birth                    100.000000
employer_street_name               1.651746
ssn                                2.209581
employer_id                       99.294430
last_name                          2.470688
employer_city                      1.574827
mailing_address_po_box            99.117023
employer_name                      1.825007
tax_form                           1.499044
employer_street_number             2.156867
income                            99.408856
employer_zipcode                   2.096955
first_name                         2.444022
age                               99.208702
employer_state                     1.978107
employer_unit_number               0.135615
mailing_address_city            

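# Dates are formatted differently in the noised and unnoised versions: the unnoised `date_of_birth` is `datetime64[ns]`, while the noised column is `object` dtype holding `Timestamp` values (see below)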

In [96]:
df_usa_w2_str2.date_of_birth.compare(df_usa_w2_noisy_str.date_of_birth)

Unnamed: 0,self,other
0,1953-01-19,1953-01-19 00:00:00
1,1953-01-19,1953-01-19 00:00:00
2,1979-01-30,1979-01-30 00:00:00
3,1979-04-10,1979-04-10 00:00:00
4,2001-01-21,2001-01-21 00:00:00
...,...,...
22384735,2005-05-18,2005-05-18 00:00:00
22384736,1999-07-03,1999-07-03 00:00:00
22384737,2007-09-23,2007-09-23 00:00:00
22384738,2011-11-05,2011-11-05 00:00:00


In [97]:
df_usa_w2_noisy.date_of_birth

0           1953-01-19 00:00:00
1           1953-01-19 00:00:00
2           1979-01-30 00:00:00
3           1979-04-10 00:00:00
4           2001-01-21 00:00:00
                   ...         
22384735    2005-05-18 00:00:00
22384736    1999-07-03 00:00:00
22384737    2007-09-23 00:00:00
22384738    2011-11-05 00:00:00
22384739    1963-12-20 00:00:00
Name: date_of_birth, Length: 22384740, dtype: object

In [99]:
df_usa_w2_noisy.date_of_birth[0]

Timestamp('1953-01-19 00:00:00')

In [98]:
df_usa_w2.date_of_birth

0          1953-01-19
1          1953-01-19
2          1979-01-30
3          1979-04-10
4          2001-01-21
              ...    
22384735   2005-05-18
22384736   1999-07-03
22384737   2007-09-23
22384738   2011-11-05
22384739   1963-12-20
Name: date_of_birth, Length: 22384740, dtype: datetime64[ns]

In [100]:
df_usa_w2.date_of_birth[0]

Timestamp('1953-01-19 00:00:00')

In [102]:
compare_columns(df_usa_w2_str2, df_usa_w2_noisy_str, 'income', notna=False)

Unnamed: 0,self,other
0,28043,28043
1,12141,12141
2,13517,13517
3,13952,13952
4,222,222
...,...,...
22384735,2195,2195
22384736,12961,12961
22384737,15346,15346
22384738,1101,1101


In [103]:
compare_columns(df_usa_w2_str2, df_usa_w2_noisy_str, 'income', notna=False).loc[0]

self     28043
other    28043
Name: 0, dtype: object

In [104]:
df_usa_w2_str2.loc[0, 'income']

'28043'

In [105]:
df_usa_w2_noisy_str.loc[0, 'income']

28043

In [106]:
# Boolean mask of rows where the stringified original income compares equal to
# the income in the noised frame; then display the original values at those rows.
same = df_usa_w2_str2.income == df_usa_w2_noisy_str.income
df_usa_w2_str2.loc[same, 'income']

364         428731
387          30712
406          20692
466         133355
575          49253
             ...  
22383594     16463
22384130      5383
22384188     11129
22384588      3551
22384706     19027
Name: income, Length: 132326, dtype: object

In [107]:
df_usa_w2_str2.loc[364, 'income']

'428731'

In [108]:
df_usa_w2_noisy_str.loc[364, 'income']

'428731'