In [3]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 50)

import os
# Use this to see how much memory the dataframes use
from sys import getsizeof

import yaml
import re

import pseudopeople as psp
from pseudopeople.configuration import get_configuration
from vivarium.framework.randomness import RandomnessStream
from vivarium.config_tree import ConfigTree

from vivarium_research_prl.utils import sizemb, MappingViaAttributes
from vivarium_research_prl import datatypes, alpha, data_loading
from vivarium_research_prl.noise import corruption, fake_names, noisify
from vivarium_research_prl.find_kids import datasets, noisify_data


# Record when, by whom, on which machine, and from which directory the notebook was run.
!date
!whoami
!uname -a
!pwd

Tue 02 May 2023 06:05:50 PM PDT
ndbs
Linux int-slurm-sarchive-p0002 5.4.0-144-generic #161-Ubuntu SMP Fri Feb 3 14:49:04 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/pseudopeople_testing


In [4]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Data!

U.S.
```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v2.1_updated_emigration/united_states_of_america/2023_04_19_16_30_28/final_results/2023_04_20_11_23_26/pseudopeople_input_data_usa_1.0.0
```

Rhode Island
```
/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/v2.1_updated_emigration/united_states_of_america/2023_04_19_16_30_28/final_results/2023_04_20_11_23_26/states/pseudopeople_input_data_ri_1.0.0
```

In [5]:
# Root of the synthetic-population project on the shared drive.
project_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'

# Results of one model version / simulation run underneath the project root.
model_dir = (
    project_dir
    + '/results'
    + '/v2.1_updated_emigration/united_states_of_america/2023_04_19_16_30_28'
)

# Post-processed ("final") results for that run.
run_dir = model_dir + '/final_results/2023_04_20_11_23_26'

# pseudopeople input data: the Rhode Island subset and the full U.S. population.
rhode_island_dir = run_dir + '/states/pseudopeople_input_data_ri_1.0.0'
usa_dir = run_dir + '/pseudopeople_input_data_usa_1.0.0'

# List the Rhode Island input directory (long format, human-readable sizes,
# hidden entries included, sorted by modification time).
!ls -halt $rhode_island_dir

total 260K
drwxrwsr-x 3 rmudambi IHME-Simulationscience 1.0K Apr 21 16:01 ..
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:55 current_population_survey
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:54 taxes_dependents
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:53 women_infants_and_children
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:53 american_community_survey
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:53 decennial_census
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:52 taxes_1040
drwxrwsr-x 2 rmudambi IHME-Simulationscience 167K Apr 21 14:50 taxes_w2_and_1099
drwxrwsr-x 9 rmudambi IHME-Simulationscience 3.5K Apr 20 16:02 .


# Test configuration interface

In [13]:
# Fetch the configuration returned by psp.get_config() with no arguments;
# its top-level keys (shown below) are dataset names.
config = psp.get_config()
config.keys()

dict_keys(['decennial_census', 'american_community_survey', 'current_population_survey', 'women_infants_and_children', 'social_security', 'taxes_w2_and_1099'])

In [8]:
# Inspect the decennial census entry of the full configuration.
config['decennial_census']

{'row_noise': {'omit_row': {'row_probability': 0.0145}},
 'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.01},
   'use_fake_name': {'cell_probability': 0.01},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'middle_initial': {'leave_blank': {'cell_probability': 0.01},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'last_name': {'leave_blank': {'cell_probability': 0.01},
   'use_fake_name': {'cell_probability': 0.01},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'age': {'leave_blank': {'cell_probability': 0.01},
   'misreport_age': {'cell_probability': 0.01,
    'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'date_of_birth': {'leave_blank': {'cell_probability': 0.01},
   'write_wrong_digits': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'toke

In [9]:
# Request a single dataset's configuration by passing the dataset name to get_config.
census_config = psp.get_config('decennial_census')
census_config

{'row_noise': {'omit_row': {'row_probability': 0.0145}},
 'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.01},
   'use_fake_name': {'cell_probability': 0.01},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'middle_initial': {'leave_blank': {'cell_probability': 0.01},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'last_name': {'leave_blank': {'cell_probability': 0.01},
   'use_fake_name': {'cell_probability': 0.01},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'age': {'leave_blank': {'cell_probability': 0.01},
   'misreport_age': {'cell_probability': 0.01,
    'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}},
   'make_typos': {'cell_probability': 0.01, 'token_probability': 0.1}},
  'date_of_birth': {'leave_blank': {'cell_probability': 0.01},
   'write_wrong_digits': {'cell_probability': 0.01, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0.01, 'toke

# Write a function to get a config with zero noise

In [10]:
def get_zero_noise_config(row_or_col='both'):
    """Return the configuration from ``psp.get_config()`` with noise probabilities set to 0.

    Parameters
    ----------
    row_or_col : {'row', 'column', 'both'}, default 'both'
        Which kind of noise to switch off: row noise only, column noise
        only, or both.

    Returns
    -------
    The configuration object obtained from ``psp.get_config()``, modified
    in place.  Only ``'row_probability'`` and ``'cell_probability'``
    entries are touched; other settings (e.g. ``'token_probability'``)
    keep their values.

    Raises
    ------
    ValueError
        If ``row_or_col`` is not one of the three accepted values.
    """
    if row_or_col not in ('row', 'column', 'both'):
        raise ValueError("row_or_col must be 'row', 'column', or 'both'")

    zero_rows = row_or_col in ('row', 'both')
    zero_columns = row_or_col in ('column', 'both')

    full_config = psp.get_config()
    for per_dataset in full_config.values():
        if zero_rows:
            for row_settings in per_dataset['row_noise'].values():
                row_settings['row_probability'] = 0
        if zero_columns:
            for noise_types in per_dataset['column_noise'].values():
                for settings in noise_types.values():
                    # Zero at most one key per noise type, preferring
                    # 'row_probability' over 'cell_probability'.
                    for prob_key in ('row_probability', 'cell_probability'):
                        if prob_key in settings:
                            settings[prob_key] = 0
                            break
    return full_config

# Build the zeroed configuration with the default row_or_col='both' and inspect one dataset.
# Only row_probability / cell_probability are set to 0; token_probability keeps its value.
zero_config = get_zero_noise_config()
zero_config['decennial_census']

{'row_noise': {'omit_row': {'row_probability': 0}},
 'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0},
   'use_fake_name': {'cell_probability': 0},
   'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
  'middle_initial': {'leave_blank': {'cell_probability': 0},
   'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
  'last_name': {'leave_blank': {'cell_probability': 0},
   'use_fake_name': {'cell_probability': 0},
   'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
  'age': {'leave_blank': {'cell_probability': 0},
   'misreport_age': {'cell_probability': 0,
    'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}},
   'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
  'date_of_birth': {'leave_blank': {'cell_probability': 0},
   'write_wrong_digits': {'cell_probability': 0, 'token_probability': 0.1},
   'make_typos': {'cell_probability': 0, 'token_probability': 0.1}},
  'street_number': {'lea

In [12]:
# Abie's solution
def recursive_zero(d):
    """Set every entry whose key mentions 'probability' to 0.0, in place.

    Walks a nested dict.  Any key whose string form contains
    'probability' has its value replaced by 0.0 (whatever that value
    was); every other value is descended into.  Non-dict inputs are
    ignored.  Returns None.
    """
    if isinstance(d, dict):
        for key in d.keys():
            if 'probability' not in str(key):
                # Not a probability setting: look for nested settings below it.
                recursive_zero(d[key])
                continue
            d[key] = 0.0
            
# Apply recursive_zero to a fresh configuration (mutated in place) and inspect one dataset.
# This zeroes every key containing 'probability', token_probability included.
zero_config2 = psp.get_config()
recursive_zero(zero_config2)
zero_config2['decennial_census']

{'row_noise': {'omit_row': {'row_probability': 0.0}},
 'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.0},
   'use_fake_name': {'cell_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'middle_initial': {'leave_blank': {'cell_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'last_name': {'leave_blank': {'cell_probability': 0.0},
   'use_fake_name': {'cell_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'age': {'leave_blank': {'cell_probability': 0.0},
   'misreport_age': {'cell_probability': 0.0,
    'possible_age_differences': {-2: 0.1, -1: 0.4, 1: 0.4, 2: 0.1}},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'date_of_birth': {'leave_blank': {'cell_probability': 0.0},
   'write_wrong_digits': {'cell_probability': 0.0, 'token_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0

# Generate WIC data for Rhode Island

In [18]:
%%time
# Generate the WIC dataset from the Rhode Island input directory; no `config`
# argument is passed here.
wic = psp.generate_women_infants_and_children(rhode_island_dir)
wic

Noising data:   0%|                                                          | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   0%|▏                                                 | 1/334 [00:00<02:17,  2.42it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   1%|▎                                                 | 2/334 [00:00<02:09,  2.56it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   1%|▍                                                 | 3/334 [00:01<02:05,  2.64it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   1%|▌                                                 | 4/334 [00:01<02:03,  2.67it/s][A
Applying noise:   0%|                                     

Noising data:  12%|█████▋                                           | 39/334 [00:13<01:45,  2.80it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  12%|█████▊                                           | 40/334 [00:14<01:44,  2.82it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  12%|██████                                           | 41/334 [00:14<01:43,  2.83it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  13%|██████▏                                          | 42/334 [00:14<01:43,  2.83it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  13%|██████▎                                          | 43/334 [00:15<01:43,  2.82it/s][A
Applying noise:   0%|                                  

Noising data:  23%|███████████▍                                     | 78/334 [00:27<01:32,  2.76it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  24%|███████████▌                                     | 79/334 [00:28<01:31,  2.80it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  24%|███████████▋                                     | 80/334 [00:28<01:29,  2.83it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  24%|███████████▉                                     | 81/334 [00:28<01:29,  2.82it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  25%|████████████                                     | 82/334 [00:29<01:29,  2.82it/s][A
Applying noise:   0%|                                  

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 43.83type/s][A
Noising data:  35%|████████████████▊                               | 117/334 [00:42<01:27,  2.48it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  35%|████████████████▉                               | 118/334 [00:42<01:25,  2.53it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  36%|█████████████████                               | 119/334 [00:42<01:23,  2.58it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  36%|█████████████████▏                              | 120/334 [00:43<01:20,  2.65it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  36%|█████████████████▍                  

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  47%|██████████████████████▍                         | 156/334 [00:56<01:05,  2.73it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  47%|██████████████████████▌                         | 157/334 [00:56<01:04,  2.74it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  47%|██████████████████████▋                         | 158/334 [00:57<01:03,  2.76it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  48%|██████████████████████▊                         | 159/334 [00:57<01:03,  2.75it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  48%|██████████████████████▉             

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  58%|████████████████████████████                    | 195/334 [01:10<00:50,  2.74it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  59%|████████████████████████████▏                   | 196/334 [01:10<00:50,  2.72it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  59%|████████████████████████████▎                   | 197/334 [01:11<00:50,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  59%|████████████████████████████▍                   | 198/334 [01:11<00:50,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  60%|████████████████████████████▌       

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  70%|█████████████████████████████████▋              | 234/334 [01:24<00:36,  2.73it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  70%|█████████████████████████████████▊              | 235/334 [01:25<00:36,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  71%|█████████████████████████████████▉              | 236/334 [01:25<00:36,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  71%|██████████████████████████████████              | 237/334 [01:26<00:35,  2.72it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  71%|██████████████████████████████████▏ 

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  82%|███████████████████████████████████████▏        | 273/334 [01:39<00:22,  2.69it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  82%|███████████████████████████████████████▍        | 274/334 [01:39<00:22,  2.71it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  82%|███████████████████████████████████████▌        | 275/334 [01:40<00:21,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  83%|███████████████████████████████████████▋        | 276/334 [01:40<00:21,  2.66it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  83%|████████████████████████████████████

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  93%|████████████████████████████████████████████▊   | 312/334 [01:53<00:08,  2.68it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  94%|████████████████████████████████████████████▉   | 313/334 [01:54<00:07,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  94%|█████████████████████████████████████████████▏  | 314/334 [01:54<00:07,  2.70it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  94%|█████████████████████████████████████████████▎  | 315/334 [01:54<00:07,  2.71it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  95%|████████████████████████████████████

CPU times: user 1min 54s, sys: 6.3 s, total: 2min
Wall time: 2min 2s




Unnamed: 0,household_id,simulant_id,first_name,middle_initial,last_name,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,sex,race_ethnicity
0,2599_16688,2599_41986,Sarah,C,Mccarty,06/02/1983,2498,e interstate 30,,cranston,RI,02840,Female,White
1,2599_17590,2599_44131,Nicole,A,Lewis,05/22/1991,,9th avnu,,west warwick,RI,02888,Female,White
2,2599_68676,2599_171233,Lindsay,J,Milligan,07/27/1988,2121,winter rdg ln,apt 101,west warwick,RI,02861,Female,White
3,2599_75319,2599_187606,Jennifer,C,Granados Rodriguez,12/21/1984,9824,holly ave,,cranston,RI,02860,Female,Latino
4,2599_75596,2599_188275,Kerry,J,Noel,07/10/1996,2124,justbrand lane,,newport,RI,02920,Female,Black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14425,7745_214563,7745_1010123,Liam,J,Newman,08/08/2019,46,haverford dr,,south kingstown,RI,02813,Male,White
14426,7745_31844,7745_1012353,Isaac,I,Dellinger,09/21/2019,e1853,starview ln,,e greenwich,RI,02831,Male,White
14427,7745_228895,7745_1014120,Elizabeth,N,Coker,10/24/2019,807,road 56,,east providence,RI,02864,Female,Asian
14428,7745_61219,7745_1016457,Isabel,I,Perez-Flores,12/30/2019,17419,park dr s,,middletown,RI,02915,Female,Latino


In [25]:
# Memory footprint of the WIC dataframe as reported by sizemb
# (presumably in megabytes — confirm in vivarium_research_prl.utils).
sizemb(wic)

9.811167

# Generate WIC data with no noise

## And try passing a sub-configuration to see if it works

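### Nope, doesn't work: passing a single dataset's sub-configuration raises a `ConfigurationError`, because the top-level keys of `config` must be dataset names. I think we should fix this.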

In [15]:
# Take only the WIC dataset's configuration and zero its probabilities in place.
wic0_config = psp.get_config('women_infants_and_children')
recursive_zero(wic0_config)
wic0_config

{'row_noise': {'omit_row': {'row_probability': 0.0}},
 'column_noise': {'first_name': {'leave_blank': {'cell_probability': 0.0},
   'use_fake_name': {'cell_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'middle_initial': {'leave_blank': {'cell_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'last_name': {'leave_blank': {'cell_probability': 0.0},
   'use_fake_name': {'cell_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'date_of_birth': {'leave_blank': {'cell_probability': 0.0},
   'write_wrong_digits': {'cell_probability': 0.0, 'token_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'street_number': {'leave_blank': {'cell_probability': 0.0},
   'write_wrong_digits': {'cell_probability': 0.0, 'token_probability': 0.0},
   'make_typos': {'cell_probability': 0.0, 'token_probability': 0.0}},
  'street_name': {'le

In [20]:
%%time
# Pass the single-dataset sub-configuration directly as `config`.
# This raises ConfigurationError (see output): its top-level key 'row_noise'
# is rejected as an invalid dataset name.
wic0 = psp.generate_women_infants_and_children(rhode_island_dir, config=wic0_config)
wic0

ConfigurationError: Invalid dataset 'row_noise' provided. Valid datasets are ['decennial_census', 'american_community_survey', 'current_population_survey', 'women_infants_and_children', 'social_security', 'taxes_w2_and_1099'].

In [21]:
%%time
# Pass the full zeroed configuration (keyed by dataset name) instead; this call succeeds.
# NOTE(review): the rows displayed below look identical to the default-config
# output above (e.g. '9th avnu', 'e1853', the blank street_number) — confirm
# whether those values come from the input data or whether noise is still applied.
wic0 = psp.generate_women_infants_and_children(rhode_island_dir, config=zero_config)
wic0

Noising data:   0%|                                                          | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   0%|▏                                                 | 1/334 [00:00<01:57,  2.83it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   1%|▎                                                 | 2/334 [00:00<01:53,  2.93it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   1%|▍                                                 | 3/334 [00:01<01:50,  3.00it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:   1%|▌                                                 | 4/334 [00:01<01:50,  2.98it/s][A
Applying noise:   0%|                                     

Noising data:  12%|█████▋                                           | 39/334 [00:12<01:38,  3.01it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  12%|█████▊                                           | 40/334 [00:13<01:38,  2.97it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  12%|██████                                           | 41/334 [00:13<01:38,  2.98it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  13%|██████▏                                          | 42/334 [00:14<01:37,  3.00it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  13%|██████▎                                          | 43/334 [00:14<01:37,  3.00it/s][A
Applying noise:   0%|                                  

Noising data:  23%|███████████▍                                     | 78/334 [00:26<01:26,  2.97it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  24%|███████████▌                                     | 79/334 [00:26<01:25,  2.99it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  24%|███████████▋                                     | 80/334 [00:26<01:24,  2.99it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  24%|███████████▉                                     | 81/334 [00:27<01:25,  2.96it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  25%|████████████                                     | 82/334 [00:27<01:24,  2.97it/s][A
Applying noise:   0%|                                  

Noising data:  35%|████████████████▊                               | 117/334 [00:39<01:12,  2.99it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  35%|████████████████▉                               | 118/334 [00:39<01:12,  2.97it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  36%|█████████████████                               | 119/334 [00:39<01:12,  2.98it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  36%|█████████████████▏                              | 120/334 [00:40<01:12,  2.95it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  36%|█████████████████▍                              | 121/334 [00:40<01:11,  2.99it/s][A
Applying noise:   0%|                                  

Noising data:  47%|██████████████████████▍                         | 156/334 [00:52<01:00,  2.96it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  47%|██████████████████████▌                         | 157/334 [00:52<00:59,  2.97it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  47%|██████████████████████▋                         | 158/334 [00:52<00:59,  2.98it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  48%|██████████████████████▊                         | 159/334 [00:53<00:58,  2.99it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  48%|██████████████████████▉                         | 160/334 [00:53<00:58,  2.96it/s][A
Applying noise:   0%|                                  

Noising data:  58%|████████████████████████████                    | 195/334 [01:05<00:46,  2.99it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  59%|████████████████████████████▏                   | 196/334 [01:05<00:46,  2.98it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  59%|████████████████████████████▎                   | 197/334 [01:06<00:46,  2.97it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  59%|████████████████████████████▍                   | 198/334 [01:06<00:46,  2.95it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  60%|████████████████████████████▌                   | 199/334 [01:06<00:45,  2.96it/s][A
Applying noise:   0%|                                  

Noising data:  70%|█████████████████████████████████▋              | 234/334 [01:18<00:33,  2.98it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  70%|█████████████████████████████████▊              | 235/334 [01:18<00:33,  2.97it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  71%|█████████████████████████████████▉              | 236/334 [01:19<00:33,  2.95it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  71%|██████████████████████████████████              | 237/334 [01:19<00:32,  2.95it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  71%|██████████████████████████████████▏             | 238/334 [01:19<00:32,  2.96it/s][A
Applying noise:   0%|                                  

Noising data:  82%|███████████████████████████████████████▏        | 273/334 [01:31<00:20,  2.95it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  82%|███████████████████████████████████████▍        | 274/334 [01:32<00:20,  2.96it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  82%|███████████████████████████████████████▌        | 275/334 [01:32<00:19,  2.95it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  83%|███████████████████████████████████████▋        | 276/334 [01:32<00:19,  2.96it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  83%|███████████████████████████████████████▊        | 277/334 [01:33<00:19,  2.96it/s][A
Applying noise:   0%|                                  

Noising data:  93%|████████████████████████████████████████████▊   | 312/334 [01:45<00:07,  2.91it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  94%|████████████████████████████████████████████▉   | 313/334 [01:45<00:07,  2.94it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  94%|█████████████████████████████████████████████▏  | 314/334 [01:45<00:06,  2.94it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  94%|█████████████████████████████████████████████▎  | 315/334 [01:46<00:06,  2.92it/s][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Noising data:  95%|█████████████████████████████████████████████▍  | 316/334 [01:46<00:06,  2.95it/s][A
Applying noise:   0%|                                  

CPU times: user 1min 47s, sys: 5.32 s, total: 1min 52s
Wall time: 1min 52s




Unnamed: 0,household_id,simulant_id,first_name,middle_initial,last_name,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,sex,race_ethnicity
0,2599_16688,2599_41986,Sarah,C,Mccarty,06/02/1983,2498,e interstate 30,,cranston,RI,02840,Female,White
1,2599_17590,2599_44131,Nicole,A,Lewis,05/22/1991,,9th avnu,,west warwick,RI,02888,Female,White
2,2599_68676,2599_171233,Lindsay,J,Milligan,07/27/1988,2121,winter rdg ln,apt 101,west warwick,RI,02861,Female,White
3,2599_75319,2599_187606,Jennifer,C,Granados Rodriguez,12/21/1984,9824,holly ave,,cranston,RI,02860,Female,Latino
4,2599_75596,2599_188275,Kerry,J,Noel,07/10/1996,2124,justbrand lane,,newport,RI,02920,Female,Black
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14425,7745_214563,7745_1010123,Liam,J,Newman,08/08/2019,46,haverford dr,,south kingstown,RI,02813,Male,White
14426,7745_31844,7745_1012353,Isaac,I,Dellinger,09/21/2019,e1853,starview ln,,e greenwich,RI,02831,Male,White
14427,7745_228895,7745_1014120,Elizabeth,N,Coker,10/24/2019,807,road 56,,east providence,RI,02864,Female,Asian
14428,7745_61219,7745_1016457,Isabel,I,Perez-Flores,12/30/2019,17419,park dr s,,middletown,RI,02915,Female,Latino


# Check datatypes

In [22]:
# Column dtypes of the WIC dataframe generated without a `config` argument.
wic.dtypes

household_id        object
simulant_id         object
first_name          object
middle_initial      object
last_name           object
date_of_birth       object
street_number       object
street_name         object
unit_number         object
city                object
state             category
zipcode             object
sex               category
race_ethnicity    category
dtype: object

In [24]:
# Good, looks like everything is either string or NaN
# For each column, map every value to its Python type and keep the distinct types.
wic.apply(lambda col: col.map(type).unique())

household_id                       [<class 'str'>]
simulant_id                        [<class 'str'>]
first_name        [<class 'str'>, <class 'float'>]
middle_initial    [<class 'str'>, <class 'float'>]
last_name         [<class 'str'>, <class 'float'>]
date_of_birth                      [<class 'str'>]
street_number     [<class 'str'>, <class 'float'>]
street_name       [<class 'str'>, <class 'float'>]
unit_number       [<class 'float'>, <class 'str'>]
city                               [<class 'str'>]
state                         [<class 'str'>, nan]
zipcode                            [<class 'str'>]
sex                           [<class 'str'>, nan]
race_ethnicity                     [<class 'str'>]
dtype: object

In [26]:
# Per-column count of cells that are exactly the empty string; all zeros means
# missing data is no longer encoded as ''.
wic.eq('').sum()

household_id      0
simulant_id       0
first_name        0
middle_initial    0
last_name         0
date_of_birth     0
street_number     0
street_name       0
unit_number       0
city              0
state             0
zipcode           0
sex               0
race_ethnicity    0
dtype: int64

# Generate census data

In [27]:
%%time
# Generate the noised decennial census dataset from the Rhode Island input directory.
# NOTE: slow -- the recorded run took about 11 minutes of wall time (334 progress-bar steps).
census = psp.generate_decennial_census(rhode_island_dir)
# Bare expression so the (truncated) frame is displayed as the cell output.
census

Noising data:   0%|                                                          | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.22type/s][A
Noising data:   0%|▏                                                 | 1/334 [00:02<11:09,  2.01s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.58type/s][A
Noising data:   1%|▎                                                 | 2/334 [00:03<11:02,  2.00s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 29.88type/s][A
Noising data:   1%|▍                                      

Noising data:   8%|███▊                                             | 26/334 [00:51<10:07,  1.97s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.32type/s][A
Noising data:   8%|███▉                                             | 27/334 [00:53<10:04,  1.97s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 28.55type/s][A
Noising data:   8%|████                                             | 28/334 [00:54<10:04,  1.98s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.10type/s][A
Noising data:   9%|████▎                               

Noising data:  16%|███████▋                                         | 52/334 [01:42<09:19,  1.98s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.03type/s][A
Noising data:  16%|███████▊                                         | 53/334 [01:44<09:14,  1.97s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 29.90type/s][A
Noising data:  16%|███████▉                                         | 54/334 [01:46<09:12,  1.97s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.90type/s][A
Noising data:  16%|████████                            

Noising data:  23%|███████████▍                                     | 78/334 [02:33<08:38,  2.02s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 34.30type/s][A
Noising data:  24%|███████████▌                                     | 79/334 [02:35<08:30,  2.00s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.69type/s][A
Noising data:  24%|███████████▋                                     | 80/334 [02:37<08:25,  1.99s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 33.49type/s][A
Noising data:  24%|███████████▉                        

Noising data:  31%|██████████████▉                                 | 104/334 [03:25<07:46,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.58type/s][A
Noising data:  31%|███████████████                                 | 105/334 [03:27<07:41,  2.01s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.18type/s][A
Noising data:  32%|███████████████▏                                | 106/334 [03:29<07:36,  2.00s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.43type/s][A
Noising data:  32%|███████████████▍                    

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.42type/s][A
Noising data:  39%|██████████████████▋                             | 130/334 [04:17<06:58,  2.05s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.96type/s][A
Noising data:  39%|██████████████████▊                             | 131/334 [04:19<06:53,  2.04s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.21type/s][A
Noising data:  40%|██████████████████▉                             | 132/334 [04:21<06:49,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 16.15type/s][A
Noising data:  47%|██████████████████████▍                         | 156/334 [05:09<06:09,  2.07s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.98type/s][A
Noising data:  47%|██████████████████████▌                         | 157/334 [05:11<06:04,  2.06s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.64type/s][A
Noising data:  47%|██████████████████████▋                         | 158/334 [05:13<06:00,  2.05s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.05type/s][A
Noising data:  54%|██████████████████████████▏                     | 182/334 [06:02<05:05,  2.01s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 17.34type/s][A
Noising data:  55%|██████████████████████████▎                     | 183/334 [06:04<05:13,  2.08s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.69type/s][A
Noising data:  55%|██████████████████████████▍                     | 184/334 [06:06<05:08,  2.06s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.27type/s][A
Noising data:  62%|█████████████████████████████▉                  | 208/334 [06:54<04:15,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.30type/s][A
Noising data:  63%|██████████████████████████████                  | 209/334 [06:56<04:12,  2.02s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 29.67type/s][A
Noising data:  63%|██████████████████████████████▏                 | 210/334 [06:58<04:11,  2.02s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 29.73type/s][A
Noising data:  70%|█████████████████████████████████▋              | 234/334 [07:47<03:22,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.72type/s][A
Noising data:  70%|█████████████████████████████████▊              | 235/334 [07:49<03:20,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.56type/s][A
Noising data:  71%|█████████████████████████████████▉              | 236/334 [07:51<03:19,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.83type/s][A
Noising data:  78%|█████████████████████████████████████▎          | 260/334 [08:40<02:30,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.35type/s][A
Noising data:  78%|█████████████████████████████████████▌          | 261/334 [08:42<02:28,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.92type/s][A
Noising data:  78%|█████████████████████████████████████▋          | 262/334 [08:44<02:25,  2.02s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.36type/s][A
Noising data:  86%|█████████████████████████████████████████       | 286/334 [09:33<01:38,  2.04s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 33.19type/s][A
Noising data:  86%|█████████████████████████████████████████▏      | 287/334 [09:35<01:35,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 31.87type/s][A
Noising data:  86%|█████████████████████████████████████████▍      | 288/334 [09:37<01:33,  2.03s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.67type/s][A
Noising data:  93%|████████████████████████████████████████████▊   | 312/334 [10:26<00:44,  2.04s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 32.27type/s][A
Noising data:  94%|████████████████████████████████████████████▉   | 313/334 [10:29<00:42,  2.04s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|████████████████████████████████████████████████| 8/8 [00:00<00:00, 30.71type/s][A
Noising data:  94%|█████████████████████████████████████████████▏  | 314/334 [10:31<00:40,  2.04s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise: 100%|██████████████████████████████████

CPU times: user 9min 45s, sys: 1min 36s, total: 11min 22s
Wall time: 11min 13s


Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity
0,2477_591,John,T,Everetts,82,03/22/1938,6292,e secretariat dr,,providence,RI,02886,Reference person,Male,White
1,2477_2188,Susan,F,Dorn,63,01/08/1957,41468,e chenango pl,,jamestown,RI,02886,Reference person,Female,White
2,2477_2189,John,P,Dorn,75,02/07/1945,41468,e chenango pl,,jamestown,RI,02886,Opp-sex spouse,Male,White
3,2477_2465,Kassandra,R,,26,06/79/1993,138,elastic plant rd,,cranston,RI,02814,Reference person,Female,Latino
4,2477_2466,Sophia,E,Medrano Garcia,7,05/07/2012,130,elastic plant rd,,cranston,RI,02814,Biological child,Female,Latino
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072363,8133_1021295,Arielle,T,Parrella,17,03/26/2003,106,parkview ave,,pawtucket,RI,02888,Biological child,Female,Latino
1072364,8133_1021308,Devin,B,Scoggins,9,11/18/2010,5758,c.r. 2461,,warwick,RI,02891,Biological child,Male,White
1072365,8133_1021349,Braden,A,Corbett,22,05/10/1997,68,harbor woods cir,,lincoln,RI,02903,Roommate,Male,Black
1072366,8133_1021967,Cameron,A,Horan,0,04/09/2020,7120,w franklin ave,,johnston,RI,02909,Other nonrelative,Male,White


In [28]:
# Memory footprint of the freshly generated census frame, via the project helper `sizemb`
# (presumably megabytes, going by its name -- TODO confirm).
sizemb(census)

695.035011

In [29]:
# Column dtypes as returned by pseudopeople, for comparison with the converted copy.
census.dtypes

simulant_id                       object
first_name                        object
middle_initial                    object
last_name                         object
age                               object
date_of_birth                     object
street_number                     object
street_name                       object
unit_number                       object
city                              object
state                           category
zipcode                           object
relation_to_reference_person    category
sex                             category
race_ethnicity                  category
dtype: object

In [30]:
%%time
# Convert a copy so that `census` itself keeps the dtypes it was generated with.
census_c = census.copy()
# The return value is not used: the helper appears to modify `census_c` in place
# (the dtypes displayed below differ from those of `census`).
datatypes.convert_to_int_and_categorical(census_c)
census_c.dtypes

CPU times: user 6.98 s, sys: 400 ms, total: 7.38 s
Wall time: 7.38 s


simulant_id                        int64
first_name                      category
middle_initial                  category
last_name                       category
age                             category
date_of_birth                   category
street_number                   category
street_name                     category
unit_number                     category
city                            category
state                           category
zipcode                         category
relation_to_reference_person    category
sex                             category
race_ethnicity                  category
dtype: object

In [31]:
# Footprint after the dtype conversion; compare with sizemb(census) above.
sizemb(census_c)

90.590814

In [32]:
# Display the converted frame for a visual comparison with `census`
# (in the recorded output simulant_id is now an integer rather than a string like '2477_591').
census_c

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,street_number,street_name,unit_number,city,state,zipcode,relation_to_reference_person,sex,race_ethnicity
0,2477000000591,John,T,Everetts,82,03/22/1938,6292,e secretariat dr,,providence,RI,02886,Reference person,Male,White
1,2477000002188,Susan,F,Dorn,63,01/08/1957,41468,e chenango pl,,jamestown,RI,02886,Reference person,Female,White
2,2477000002189,John,P,Dorn,75,02/07/1945,41468,e chenango pl,,jamestown,RI,02886,Opp-sex spouse,Male,White
3,2477000002465,Kassandra,R,,26,06/79/1993,138,elastic plant rd,,cranston,RI,02814,Reference person,Female,Latino
4,2477000002466,Sophia,E,Medrano Garcia,7,05/07/2012,130,elastic plant rd,,cranston,RI,02814,Biological child,Female,Latino
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1072363,8133001021295,Arielle,T,Parrella,17,03/26/2003,106,parkview ave,,pawtucket,RI,02888,Biological child,Female,Latino
1072364,8133001021308,Devin,B,Scoggins,9,11/18/2010,5758,c.r. 2461,,warwick,RI,02891,Biological child,Male,White
1072365,8133001021349,Braden,A,Corbett,22,05/10/1997,68,harbor woods cir,,lincoln,RI,02903,Roommate,Male,Black
1072366,8133001021967,Cameron,A,Horan,0,04/09/2020,7120,w franklin ave,,johnston,RI,02909,Other nonrelative,Male,White


# Generate W2/1099 data

In [33]:
%%time
# Generate the noised W2/1099 tax dataset from the same Rhode Island input directory.
# NOTE: slow -- the recorded run took about 29 minutes of wall time.
w2 = psp.generate_taxes_w2_and_1099(rhode_island_dir)
# Bare expression so the (truncated) frame is displayed as the cell output.
w2

Noising data:   0%|                                                          | 0/334 [00:00<?, ?it/s]
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.26type/s][A
Noising data:   0%|▏                                                 | 1/334 [00:05<30:15,  5.45s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.05type/s][A
Noising data:   1%|▎                                                 | 2/334 [00:10<28:49,  5.21s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.57type/s][A
Noising data:   1%|▍                                      

Noising data:   8%|███▊                                             | 26/334 [02:10<25:32,  4.98s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 49.26type/s][A
Noising data:   8%|███▉                                             | 27/334 [02:15<25:24,  4.97s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.94type/s][A
Noising data:   8%|████                                             | 28/334 [02:20<25:19,  4.96s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.50type/s][A
Noising data:   9%|████▎                               

Noising data:  16%|███████▋                                         | 52/334 [04:21<23:54,  5.09s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 49.00type/s][A
Noising data:  16%|███████▊                                         | 53/334 [04:26<23:43,  5.07s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.33type/s][A
Noising data:  16%|███████▉                                         | 54/334 [04:31<23:43,  5.08s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 48.44type/s][A
Noising data:  16%|████████                            

Noising data:  23%|███████████▍                                     | 78/334 [06:32<21:29,  5.04s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.14type/s][A
Noising data:  24%|███████████▌                                     | 79/334 [06:37<21:21,  5.02s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  62%|██████████████████████████████                  | 5/8 [00:00<00:00, 48.63type/s][A
Noising data:  24%|███████████▋                                     | 80/334 [06:42<21:36,  5.10s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 46.80type/s][A
Noising data:  24%|███████████▉                        

Noising data:  31%|██████████████▉                                 | 104/334 [08:44<19:29,  5.08s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 44.06type/s][A
Noising data:  31%|███████████████                                 | 105/334 [08:49<19:23,  5.08s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.19type/s][A
Noising data:  32%|███████████████▏                                | 106/334 [08:54<19:11,  5.05s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  50%|████████████████████████                        | 4/8 [00:00<00:00, 37.45type/s][A
Applying noise: 100%|██████████████████████████████████

Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.30type/s][A
Noising data:  39%|██████████████████▋                             | 130/334 [10:56<17:28,  5.14s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 44.45type/s][A
Noising data:  39%|██████████████████▊                             | 131/334 [11:01<17:24,  5.15s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.24type/s][A
Noising data:  40%|██████████████████▉                             | 132/334 [11:07<17:13,  5.12s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|██████████████████████████████████

Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 49.54type/s][A
Noising data:  47%|██████████████████████▍                         | 156/334 [13:10<15:12,  5.13s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.14type/s][A
Noising data:  47%|██████████████████████▌                         | 157/334 [13:15<15:07,  5.13s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 54.08type/s][A
Noising data:  47%|██████████████████████▋                         | 158/334 [13:20<14:57,  5.10s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|██████████████████████████████████

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 51.07type/s][A
Noising data:  54%|██████████████████████████▏                     | 182/334 [15:24<13:12,  5.21s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.09type/s][A
Noising data:  55%|██████████████████████████▎                     | 183/334 [15:30<13:04,  5.19s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 46.35type/s][A
Noising data:  55%|██████████████████████████▍                     | 184/334 [15:35<13:01,  5.21s/it][A
Applying noise:   0%|                                  

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.84type/s][A
Noising data:  62%|█████████████████████████████▉                  | 208/334 [17:39<10:50,  5.16s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.18type/s][A
Noising data:  63%|██████████████████████████████                  | 209/334 [17:44<10:43,  5.15s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 48.89type/s][A
Noising data:  63%|██████████████████████████████▏                 | 210/334 [17:49<10:36,  5.13s/it][A
Applying noise:   0%|                                  

Noising data:  70%|█████████████████████████████████▍              | 233/334 [19:50<08:46,  5.21s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.31type/s][A
Noising data:  70%|█████████████████████████████████▋              | 234/334 [19:55<08:40,  5.20s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.79type/s][A
Noising data:  70%|█████████████████████████████████▊              | 235/334 [20:00<08:35,  5.20s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 47.90type/s][A
Noising data:  71%|█████████████████████████████████▉  

Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 49.77type/s][A
Noising data:  78%|█████████████████████████████████████▏          | 259/334 [22:06<06:39,  5.32s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 48.67type/s][A
Noising data:  78%|█████████████████████████████████████▎          | 260/334 [22:12<06:32,  5.31s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 45.38type/s][A
Noising data:  78%|█████████████████████████████████████▌          | 261/334 [22:17<06:27,  5.31s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|██████████████████████████████████

Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 46.29type/s][A
Noising data:  85%|████████████████████████████████████████▉       | 285/334 [24:24<04:17,  5.26s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 48.67type/s][A
Noising data:  86%|█████████████████████████████████████████       | 286/334 [24:29<04:11,  5.25s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 46.64type/s][A
Noising data:  86%|█████████████████████████████████████████▏      | 287/334 [24:34<04:07,  5.26s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|██████████████████████████████████

Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 52.77type/s][A
Noising data:  93%|████████████████████████████████████████████▋   | 311/334 [26:41<02:02,  5.33s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 44.07type/s][A
Noising data:  93%|████████████████████████████████████████████▊   | 312/334 [26:47<01:56,  5.32s/it][A
Applying noise:   0%|                                                        | 0/8 [00:00<?, ?type/s][A
Applying noise:  75%|████████████████████████████████████            | 6/8 [00:00<00:00, 53.74type/s][A
Noising data:  94%|████████████████████████████████████████████▉   | 313/334 [26:52<01:51,  5.29s/it][A
Applying noise:   0%|                                  

CPU times: user 25min 20s, sys: 3min 54s, total: 29min 15s
Wall time: 28min 44s


Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,income,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form
0,1490_588,Adriana,M,Baumert,41,02/25/1979,1029,westgate way,,,...,55820,325622,Westminster Arts,,e federal st,,birmingham,AL,35648,W2
1,1490_589,David,L,Baumert,39,03/10/1981,1029,westgate way,,,...,77760,1598657,Empanadas Golden Boys Painting,6628,hawks haven rd,,statesville,NC,28168,W2
2,1490_590,Abigael,M,Baumert,22,05/07/1998,1029,westgate way,,,...,9451,849149,Alamo Community Church of Education,9712,3rd street,,five forks,SC,29926,W2
3,1490_1161,Ronald,D,Wright,38,02/23/1982,2115,nort elk lane,,,...,12349,99503,Northwest Pizza & Boutique,3223,wellesley avenue,,winter springs,FL,32571,W2
4,1490_1161,Ronald,,Wright,38,02/23/1982,2115,nort elk lane,,,...,7196,217787,Saint Helens Nail Spa,155,7th avenue,,new york,NY,11217,W2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043909,5072_1025338,Joshua,,Beil,30,07/20/1990,1425,old kings hwy s,,,...,16370,413828,Henry E DDS,3,nantucket dr,,beckley,WV,25701,W2
1043910,5072_1029528,Marcus,D,Salu Jacquot,26,11/21/1994,422,katherine claire ln,,,...,8159,568939,GCS University Settlement Early Childhood Educ...,833,w pike st,,flint,MI,48122,W2
1043911,5072_1030814,Amelia,T,Alcamo,65,07/08/1955,408,lake ridge drive nrth,,,...,4964,771438,The Church of Touch Miami,722,everbreeze dr,,fort myers,FL,32826,W2
1043912,5072_1030814,Amelia,T,Alcamo,65,07/08/1955,408,lake ridge drive nrth,,,...,10930,1203472,Green Gables Estate Bellevue Dental,10005,n crescent dr,,paradise,TX,75020,W2


In [34]:
# Size of the unconverted `w2` frame as reported by sizemb (presumably megabytes,
# going by the helper's name); baseline for the converted copy measured further down.
sizemb(w2)

1243.378012

In [35]:
# Column dtypes of the unconverted frame: almost everything is generic `object`.
w2.dtypes

simulant_id                        object
first_name                         object
middle_initial                     object
last_name                          object
age                                object
date_of_birth                      object
mailing_address_street_number      object
mailing_address_street_name        object
mailing_address_unit_number        object
mailing_address_po_box             object
mailing_address_city               object
mailing_address_state            category
mailing_address_zipcode            object
ssn                                object
income                             object
employer_id                        object
employer_name                      object
employer_street_number             object
employer_street_name               object
employer_unit_number               object
employer_city                      object
employer_state                   category
employer_zipcode                   object
tax_form                         c

In [37]:
%%time
# Convert a copy so the original `w2` stays available for before/after comparisons.
w2_c = w2.copy()
# The return value is discarded; the dtypes displayed below differ from w2's, so the
# helper evidently modifies `w2_c` in place.
datatypes.convert_to_int_and_categorical(w2_c)
w2_c.dtypes

CPU times: user 18.1 s, sys: 1.13 s, total: 19.3 s
Wall time: 19.3 s


simulant_id                         int64
first_name                       category
middle_initial                   category
last_name                        category
age                              category
date_of_birth                    category
mailing_address_street_number    category
mailing_address_street_name      category
mailing_address_unit_number      category
mailing_address_po_box           category
mailing_address_city             category
mailing_address_state            category
mailing_address_zipcode          category
ssn                                 int32
income                           category
employer_id                      category
employer_name                    category
employer_street_number           category
employer_street_name             category
employer_unit_number             category
employer_city                    category
employer_state                   category
employer_zipcode                 category
tax_form                         c

In [38]:
# Size of the converted copy, for comparison with sizemb(w2) above.
sizemb(w2_c)

277.230335

In [58]:
%%time
100 * w2_c.apply(sizemb) / w2.apply(sizemb)

CPU times: user 2.89 s, sys: 0 ns, total: 2.89 s
Wall time: 2.88 s


simulant_id                       11.800439
first_name                         4.749150
middle_initial                     1.736546
last_name                         19.216926
age                                5.569373
date_of_birth                     10.903457
mailing_address_street_number     12.695500
mailing_address_street_name       41.349076
mailing_address_unit_number        8.836174
mailing_address_po_box             8.175968
mailing_address_city               3.576431
mailing_address_state            100.000000
mailing_address_zipcode            3.404489
ssn                                5.919064
income                            40.392641
employer_id                      116.813893
employer_name                     82.212841
employer_street_number            13.024058
employer_street_name              47.003967
employer_unit_number              10.472774
employer_city                      5.988089
employer_state                   100.000000
employer_zipcode                

In [41]:
# For each categorical column of w2_c: number of category levels as a percentage of
# the number of rows. The two non-categorical columns are dropped first because the
# `.cat` accessor only exists on categorical data.
w2_categoricals = w2_c.drop(columns=['simulant_id', 'ssn'])
w2_categoricals.apply(
    lambda col: 100 * len(col.cat.categories) / len(col)
)

first_name                        1.140994
middle_initial                    0.002491
last_name                         9.406522
age                               0.018392
date_of_birth                     3.342038
mailing_address_street_number     4.019776
mailing_address_street_name      23.161582
mailing_address_unit_number       1.207571
mailing_address_po_box            1.075089
mailing_address_city              0.365739
mailing_address_state             0.004885
mailing_address_zipcode           0.112940
income                           17.229868
employer_id                      60.810756
employer_name                    56.693655
employer_street_number            4.328709
employer_street_name             28.889066
employer_unit_number              1.533555
employer_city                     2.122014
employer_state                    0.004885
employer_zipcode                  3.159264
tax_form                          0.000192
dtype: float64

In [42]:
# Same measure for the converted census frame: category levels as a percentage of
# rows, for every column except the non-categorical simulant_id.
census_categoricals = census_c.drop(columns=['simulant_id'])
census_categoricals.apply(
    lambda col: 100 * len(col.cat.categories) / len(col)
)

first_name                       1.453139
middle_initial                   0.002425
last_name                        9.667390
age                              0.021168
date_of_birth                    4.167692
street_number                    3.889243
street_name                     24.382861
unit_number                      1.293866
city                             0.275092
state                            0.004756
zipcode                          0.126822
relation_to_reference_person     0.001772
sex                              0.000187
race_ethnicity                   0.000653
dtype: float64

In [43]:
# employer_id in the unconverted frame (object dtype), to compare with the converted column.
w2.employer_id

0           325622
1          1598657
2           849149
3            99503
4           217787
            ...   
1043909     413828
1043910     568939
1043911     771438
1043912    1203472
1043913    1124899
Name: employer_id, Length: 1043914, dtype: object

In [44]:
# employer_id after conversion. The category listing in the output mixes ints with
# strings, some carrying trailing spaces (e.g. '999615 '), so equal-looking ids can
# be distinct categories.
w2_c.employer_id

0           325622
1          1598657
2           849149
3            99503
4           217787
            ...   
1043909     413828
1043910     568939
1043911     771438
1043912    1203472
1043913    1124899
Name: employer_id, Length: 1043914, dtype: category
Categories (634812, object): [1, 2, 3, 6, ..., '999615 ', '999833', '999922', '999964 ']

In [45]:
# For each column, list the distinct Python types of its values to find the
# columns that hold a mix of types.
w2.apply(lambda col: col.map(type).unique())

simulant_id                                                      [<class 'str'>]
first_name                                      [<class 'str'>, <class 'float'>]
middle_initial                                  [<class 'str'>, <class 'float'>]
last_name                                       [<class 'str'>, <class 'float'>]
age                              [<class 'int'>, <class 'str'>, <class 'float'>]
date_of_birth                                   [<class 'str'>, <class 'float'>]
mailing_address_street_number                   [<class 'str'>, <class 'float'>]
mailing_address_street_name                     [<class 'str'>, <class 'float'>]
mailing_address_unit_number                     [<class 'float'>, <class 'str'>]
mailing_address_po_box                          [<class 'float'>, <class 'str'>]
mailing_address_city                            [<class 'str'>, <class 'float'>]
mailing_address_state                                       [<class 'str'>, nan]
mailing_address_zipcode     

In [47]:
# How many income values of each Python type the unconverted frame holds.
%time w2.income.map(type).value_counts()

CPU times: user 100 ms, sys: 3.41 ms, total: 104 ms
Wall time: 101 ms


<class 'int'>      1012641
<class 'str'>        20549
<class 'float'>      10724
Name: income, dtype: int64

In [49]:
# Same count on the converted column. The int and str counts match the unconverted
# frame; the float entries (presumably the missing values) no longer get a row.
%time w2_c.income.map(type).value_counts()

CPU times: user 109 ms, sys: 3.54 ms, total: 113 ms
Wall time: 110 ms


<class 'int'>    1012641
<class 'str'>      20549
Name: income, dtype: int64

In [50]:
# Show the income entries that are stored as strings rather than numbers.
income_is_str = w2.income.map(type) == str
w2.income.loc[income_is_str]

13           10393
33            5823
63           64044
75           33047
220          85849
            ...   
1043641     19775 
1043654      62139
1043789    1511776
1043811      22784
1043819     306332
Name: income, Length: 20549, dtype: object

In [54]:
# Non-missing incomes whose string form is not purely digits (e.g. '1654 ' with a
# trailing space). Both halves of the mask are built from `w2_c`, so the cell does
# not depend on `w2` having an identical index. Missing values are excluded
# explicitly because astype(str) would turn them into text that fails isdigit().
income_c = w2_c.income
has_value = income_c.notna()
all_digits = income_c.astype(str).str.isdigit()
income_c.loc[has_value & ~all_digits]

374         1654 
536         8117 
1665        4316 
1736        5520 
2037        4637 
            ...  
1043192    1597  
1043430    21251 
1043471    35086 
1043629    1523  
1043641    19775 
Name: income, Length: 8099, dtype: category
Categories (179865, object): [8, 14, 23, 24, ..., '9995  ', '9998', '9998  ', '99989']

In [55]:
# Look at one flagged entry verbatim so the trailing whitespace shows in the repr.
w2_c.loc[374, 'income']

'1654 '

In [56]:
# Converted date_of_birth column. The category listing in the output includes
# impossible dates such as '00/01/1931' and '99/13/1939' (presumably introduced by
# the noising step; confirm).
w2_c.date_of_birth

0          02/25/1979
1          03/10/1981
2          05/07/1998
3          02/23/1982
4          02/23/1982
              ...    
1043909    07/20/1990
1043910    11/21/1994
1043911    07/08/1955
1043912    07/08/1955
1043913    03/19/1964
Name: date_of_birth, Length: 1043914, dtype: category
Categories (34888, object): ['00/01/1931', '00/01/1968', '00/01/1990', '00/02/1950', ..., '98/04/1939', '98/09/1936', '99/04/1937', '99/13/1939']

In [59]:
# Scratch check: splitting a string that has no underscore yields a one-element list.
'abc'.split('_')

['abc']

In [61]:
# Scratch check: star-unpacking still works when there is nothing after the first
# piece; `y` simply ends up as an empty list.
x, *y = 'abc'.split('_')

In [62]:
# First piece of the split.
x

'abc'

In [63]:
# Remaining pieces of the split (empty when the string has no underscore).
y

[]

In [65]:
# Scratch check: baseline int() parse of a clean digit string.
int('123')

123

In [66]:
# Scratch check: int() tolerates trailing whitespace, so padded values such as
# '1654 ' still parse to the intended number.
int('123 ')

123

# Check employer addresses across shards

In [67]:
# Unconverted W2 frame, shown for reference (simulant_id is a string such as '1490_588').
w2

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,income,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form
0,1490_588,Adriana,M,Baumert,41,02/25/1979,1029,westgate way,,,...,55820,325622,Westminster Arts,,e federal st,,birmingham,AL,35648,W2
1,1490_589,David,L,Baumert,39,03/10/1981,1029,westgate way,,,...,77760,1598657,Empanadas Golden Boys Painting,6628,hawks haven rd,,statesville,NC,28168,W2
2,1490_590,Abigael,M,Baumert,22,05/07/1998,1029,westgate way,,,...,9451,849149,Alamo Community Church of Education,9712,3rd street,,five forks,SC,29926,W2
3,1490_1161,Ronald,D,Wright,38,02/23/1982,2115,nort elk lane,,,...,12349,99503,Northwest Pizza & Boutique,3223,wellesley avenue,,winter springs,FL,32571,W2
4,1490_1161,Ronald,,Wright,38,02/23/1982,2115,nort elk lane,,,...,7196,217787,Saint Helens Nail Spa,155,7th avenue,,new york,NY,11217,W2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043909,5072_1025338,Joshua,,Beil,30,07/20/1990,1425,old kings hwy s,,,...,16370,413828,Henry E DDS,3,nantucket dr,,beckley,WV,25701,W2
1043910,5072_1029528,Marcus,D,Salu Jacquot,26,11/21/1994,422,katherine claire ln,,,...,8159,568939,GCS University Settlement Early Childhood Educ...,833,w pike st,,flint,MI,48122,W2
1043911,5072_1030814,Amelia,T,Alcamo,65,07/08/1955,408,lake ridge drive nrth,,,...,4964,771438,The Church of Touch Miami,722,everbreeze dr,,fort myers,FL,32826,W2
1043912,5072_1030814,Amelia,T,Alcamo,65,07/08/1955,408,lake ridge drive nrth,,,...,10930,1203472,Green Gables Estate Bellevue Dental,10005,n crescent dr,,paradise,TX,75020,W2


In [68]:
# Converted copy, shown for reference (simulant_id is now an integer such as 1490000000588).
w2_c

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,income,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form
0,1490000000588,Adriana,M,Baumert,41,02/25/1979,1029,westgate way,,,...,55820,325622,Westminster Arts,,e federal st,,birmingham,AL,35648,W2
1,1490000000589,David,L,Baumert,39,03/10/1981,1029,westgate way,,,...,77760,1598657,Empanadas Golden Boys Painting,6628,hawks haven rd,,statesville,NC,28168,W2
2,1490000000590,Abigael,M,Baumert,22,05/07/1998,1029,westgate way,,,...,9451,849149,Alamo Community Church of Education,9712,3rd street,,five forks,SC,29926,W2
3,1490000001161,Ronald,D,Wright,38,02/23/1982,2115,nort elk lane,,,...,12349,99503,Northwest Pizza & Boutique,3223,wellesley avenue,,winter springs,FL,32571,W2
4,1490000001161,Ronald,,Wright,38,02/23/1982,2115,nort elk lane,,,...,7196,217787,Saint Helens Nail Spa,155,7th avenue,,new york,NY,11217,W2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1043909,5072001025338,Joshua,,Beil,30,07/20/1990,1425,old kings hwy s,,,...,16370,413828,Henry E DDS,3,nantucket dr,,beckley,WV,25701,W2
1043910,5072001029528,Marcus,D,Salu Jacquot,26,11/21/1994,422,katherine claire ln,,,...,8159,568939,GCS University Settlement Early Childhood Educ...,833,w pike st,,flint,MI,48122,W2
1043911,5072001030814,Amelia,T,Alcamo,65,07/08/1955,408,lake ridge drive nrth,,,...,4964,771438,The Church of Touch Miami,722,everbreeze dr,,fort myers,FL,32826,W2
1043912,5072001030814,Amelia,T,Alcamo,65,07/08/1955,408,lake ridge drive nrth,,,...,10930,1203472,Green Gables Estate Bellevue Dental,10005,n crescent dr,,paradise,TX,75020,W2


In [69]:
# Number of W2 rows per simulant; a count above 1 means the simulant appears on several rows.
w2_c.simulant_id.value_counts()

2721000912572    7
5046000328771    7
5114000736833    7
3304000818598    6
2965000931899    6
                ..
7137000830882    1
7137000831420    1
7137000831562    1
7137000833626    1
5072001032414    1
Name: simulant_id, Length: 757448, dtype: int64

In [70]:
# Number of W2 rows per employer_id category.
# NOTE(review): the label 1 appears on two separate rows of the result (4102 and 41)
# because int and string versions of an id are counted as different categories.
w2_c.employer_id.value_counts()

1          4102
80980        43
1389791      43
1            41
596498       39
           ... 
729141        1
729139        1
729138        1
729135        1
999964        1
Name: employer_id, Length: 634812, dtype: int64

In [74]:
# Show the index labels themselves: their repr reveals which ids are ints and which
# are strings (quoted, sometimes space-padded).
w2_c.employer_id.value_counts().index

CategoricalIndex([1, 80980, 1389791, '1      ', 596498, 970539, 891635, '1',
                  904393, 404051,
                  ...
                  729153, 729150, 729148, 729145, 729142, 729141, 729139,
                  729138, 729135, '999964 '],
                 categories=[1, 2, 3, 6, ..., '999615 ', '999833', '999922', '999964 '], ordered=False, dtype='category', length=634812)

In [76]:
# All unconverted rows for one frequently occurring employer, to compare the
# employer address recorded on each row.
# NOTE(review): comparing against the int 80980 can only match int-typed entries;
# this column also holds string ids, so string variants of this id (if any exist)
# would be missing from the result. TODO confirm.
w2.query("employer_id == 80980")

Unnamed: 0,simulant_id,first_name,middle_initial,last_name,age,date_of_birth,mailing_address_street_number,mailing_address_street_name,mailing_address_unit_number,mailing_address_po_box,...,income,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode,tax_form
2857,1490_972262,Christian,T,Strong,23.0,04/20/1997,3503,springside way,,,...,28003.0,80980,Clinch Vly Sch,575.0,selborne la,,marque,TX,77039.0,W2
7424,5046_526026,Bettye,S,Masopust,80.0,04/27/1940,7440,e 163rd street,,,...,3950.0,80980,Clinch Vly Sch,32.0,n brown ave,,la porte,TX,77039.0,W2
19988,5004_863151,Rebecca,S,Hoagland,25.0,05/04/1995,2456,w garfield st,,,...,13421.0,80980,Clinch Vly Sch,318.0,n sr 235,,oakland,TX,77093.0,W2
26561,1832_252597,Jose,A,White,39.0,06/28/1981,1461,kimbleton ave,,,...,18208.0,80980,Clinch Vly Sch,12240.0,nw harris st,,fort worth,TX,77026.0,W2
40417,6793_879099,Gino,M,Hull,24.0,10/14/1996,n53w 37214,orangeview ave,,,...,15294.0,80980,Clinch Vly Sch,802.0,klamath falls dr,,conroe,TX,77032.0,W2
54741,4528_877154,Joseph,A,Miller,33.0,01/28/1987,3311,covington ln,,,...,64045.0,80980,Clinch Vly Sch,219.0,bagby court,,hillcrest,TX,77093.0,W2
77667,5619_844785,David,D,Wine,69.0,06/20/1951,5130,s parker rd,,,...,54625.0,80980,Clinch Vly Sch,711.0,orchlee st,,oletha,TX,77039.0,W2
146800,4943_396797,Josephine,K,Houska,19.0,10/08/2001,1710,flying a trl,,,...,1825.0,80980,Clinch Vly Sch,5053.0,hickory marsh ln,,oak ridge,TX,77060.0,W2
164062,1777_629973,Elaine,L,Ashley,65.0,09/13/1955,3130,railroad st,,,...,9373.0,80980,Clinch Vly Sch,7143.0,canterbury lane,,brownwood,TX,77060.0,W2
172911,1648_703406,Jesse,R,Whitacre,46.0,12/29/1974,e9424,bayview ave,,,...,55612.0,80980,Clinch Vly Sch,614.0,skylark drive,,houston,TX,77039.0,W2


In [79]:
# Simulant and employer columns for every converted row tied to employer 80980,
# ordered by simulant_id, to compare the employer address recorded on each row.
is_employer_80980 = w2_c.employer_id == 80980
(
    w2_c.loc[is_employer_80980]
    .filter(regex="simulant|employer")
    .sort_values('simulant_id')
)

Unnamed: 0,simulant_id,employer_id,employer_name,employer_street_number,employer_street_name,employer_unit_number,employer_city,employer_state,employer_zipcode
584631,40000911527,80980,Clinch Vly Sch,661.0,cochise rd,,houston,TX,77060.0
603205,99000157237,80980,Clinch Vly Sch,386.0,harmon avenue,,the colony,TX,77060.0
991912,315000093525,80980,Clinch Vly Sch,8480.0,birch ln,,van,TX,77039.0
663983,446000999098,80980,Clinch Vly Sch,1078.0,n sunapee loop,,throckmorton,TX,77039.0
932433,1452000530381,80980,Clinch Vly Sch,7951.0,nw 302nd st,aprt 4,richardson,TX,77093.0
2857,1490000972262,80980,Clinch Vly Sch,575.0,selborne la,,marque,TX,77039.0
172911,1648000703406,80980,Clinch Vly Sch,614.0,skylark drive,,houston,TX,77039.0
164062,1777000629973,80980,Clinch Vly Sch,7143.0,canterbury lane,,brownwood,TX,77060.0
26561,1832000252597,80980,Clinch Vly Sch,12240.0,nw harris st,,fort worth,TX,77026.0
994557,2284000004186,80980,Clinch Vly Sch,41.0,duff rd,,georgetown,TX,77026.0


In [81]:
# Integer-divide away the low ID_PAD_WIDTH decimal digits of the converted id. This
# recovers the leading part that came before the underscore in the original string
# id (e.g. 1490 for '1490_588'), presumably the shard identifier; confirm.
w2_c.simulant_id // 10**datatypes.ID_PAD_WIDTH

0          1490
1          1490
2          1490
3          1490
4          1490
           ... 
1043909    5072
1043910    5072
1043911    5072
1043912    5072
1043913    5072
Name: simulant_id, Length: 1043914, dtype: int64

In [85]:
%%time
# Is every employer_id purely digits once surrounding whitespace is stripped?
# Missing values are not excluded here; astype(str) turns them into text, so they
# are tested too.
w2_c.employer_id.astype(str).str.strip().str.isdigit().all()

CPU times: user 1.16 s, sys: 3.6 ms, total: 1.17 s
Wall time: 1.16 s


False

In [86]:
%%time
eid = w2_c.employer_id.astype(str).str.strip()
eid.loc[~eid.str.isdigit()]

CPU times: user 1.04 s, sys: 19.7 ms, total: 1.06 s
Wall time: 1.06 s


58         nan
154        nan
193        nan
194        nan
202        nan
          ... 
1043333    nan
1043413    nan
1043421    nan
1043579    nan
1043903    nan
Name: employer_id, Length: 12289, dtype: object

In [87]:
%%time
w2_c.employer_id.loc[w2_c.employer_id.notna()].astype(str).str.strip().str.isdigit().all()

CPU times: user 734 ms, sys: 44 ms, total: 778 ms
Wall time: 776 ms


True