In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

from vivarium_research_prl.noise import corruption, fake_names

!date
!whoami
!uname -a
!pwd

Fri 02 Dec 2022 03:25:47 PM PST
ndbs
Linux int-slurm-sarchive-p0012 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/noise


In [2]:
%load_ext autoreload
%autoreload 2

# Descriptions from Abie's original notebooks:

## Reproduce data corruption like GeCO

Informed by reading the GeCO manual (https://dmm.anu.edu.au/geco/flex-data-gen-manual.pdf), but without looking at the GeCO source code, since doing so might conflict with the license we end up using for this sim.

## NORC report on PVS includes lists of fake names

It is a pdf, though, so I'm going to copy them into this notebook and make something machine readable out of them.

**Question:** Can we find a link to the NORC report Abie used?


# Check module imports

In [3]:
dir(corruption)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'df_k',
 'df_ocr',
 'df_phonetic',
 'df_qwerty',
 'di',
 'dj',
 'i',
 'j',
 'k',
 'keyboard_corrupt',
 'nbr_val',
 'nbrs',
 'np',
 'ocr_corrupt',
 'ocr_error_dict',
 'pd',
 'phonetic_corrupt',
 'phonetic_error_dict',
 'qwerty_error_dict',
 'val']

In [4]:
dir(fake_names)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 '_fake_first_name_string',
 '_fake_last_name_string',
 'fake_first_names',
 'fake_last_names']

# Test OCR noise

In [5]:
corruption.df_ocr

Unnamed: 0,ocr_true,ocr_err
0,5,S
1,5,s
2,2,Z
3,2,z
4,1,|
...,...,...
44,l<,k
45,1<,k
46,m,rn
47,l,|


In [6]:
# Distribution of the lengths (in characters) of the `ocr_true` strings
corruption.df_ocr.ocr_true.map(len).value_counts()

1    34
2    13
3     2
Name: ocr_true, dtype: int64

In [7]:
corruption.ocr_error_dict

{'0': ['o', 'O'],
 '1': ['|'],
 '12': ['R'],
 '13': ['B'],
 '17': ['n'],
 '1<': ['k'],
 '1>': ['b'],
 '2': ['Z', 'z'],
 '5': ['S', 's'],
 '6': ['G'],
 'A': ['4'],
 'B': ['8'],
 'D': ['O'],
 'E': ['F'],
 'F': ['P'],
 'I-I': ['H'],
 'IJ': ['U'],
 'LI': ['U'],
 'Q': ['O'],
 'U': ['V'],
 'Y': ['V'],
 'cl': ['d'],
 'g': ['9', 'q'],
 'h': ['b'],
 'i': ["'l", ':'],
 'iii': ['m'],
 'j': ['i'],
 'k': ['lc'],
 'l': ['J', '1', 'I', '|'],
 'l<': ['k'],
 'l>': ['b'],
 'lJ': ['U'],
 'lo': ['b'],
 'm': ['n', 'rn'],
 'q': ['9', '4'],
 'ri': ['n'],
 'u': ['v'],
 'w': ['vv'],
 'y': ['v']}

In [8]:
corruption.ocr_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.5)

'tbe 9u:ck brovvn fox iumps over tbe Jazy do9'

# Test phonetic corruption

In [9]:
corruption.df_phonetic

Unnamed: 0,where,orig,new,pre,post,pattern,start
0,ALL,h,@,,,,
1,END,e,@,,,,
2,ALL,t,d,,,,
3,ALL,d,t,,,,
4,ALL,c,k,,,,
...,...,...,...,...,...,...,...
351,ALL,zza,sa,,,,
352,MIDDLE,z,s,n;-1;t,,y;slavo,
353,MIDDLE,ks,x,,,,
354,MIDDLE,cks,x,y;-1;a;i;u;e;o,,,


In [10]:
# Distribution of the lengths (in characters) of the `orig` strings
corruption.df_phonetic.orig.map(len).value_counts()

2    144
3     82
4     71
1     33
5     17
6      8
7      1
Name: orig, dtype: int64

In [11]:
corruption.phonetic_error_dict

{'aa': ['ar'],
 'acce': ['akse'],
 'acch': ['aksh'],
 'acci': ['aksi'],
 'ach': ['k'],
 'achb': ['akb'],
 'achf': ['akf'],
 'achh': ['akh'],
 'achl': ['akl'],
 'achm': ['akm'],
 'achn': ['akn'],
 'achr': ['akr'],
 'achv': ['akv'],
 'achw': ['akw'],
 'aggi': ['aji', 'aki'],
 'ah': ['h'],
 'aiss': ['ai'],
 'aisz': ['ai'],
 'alle': ['ale'],
 'archit': ['arkit'],
 'au': ['o'],
 'augh': ['arf'],
 'aux': ['auks'],
 'aw': ['a'],
 'b': ['p'],
 'bacher': ['baker'],
 'bb': ['p'],
 'btl': ['tl'],
 'c': ['k'],
 'ca': ['ka'],
 'caesar': ['sesar'],
 'cc': ['k', 'k'],
 'cce': ['xi'],
 'cch': ['xh'],
 'cci': ['xi'],
 'ce': ['se'],
 'cg': ['k', 'k'],
 'ch': ['x', 'x'],
 'chae': ['kae'],
 'charac': ['karak'],
 'charis': ['karis'],
 'chb': ['kb'],
 'chem': ['kem'],
 'chf': ['kf'],
 'chh': ['kh'],
 'chia': ['kia', 'kia'],
 'chl': ['kl'],
 'chm': ['km'],
 'chn': ['kn'],
 'chor': ['kor'],
 'chr': ['kr'],
 'chs': ['ks'],
 'cht': ['kt'],
 'chv': ['kv'],
 'chw': ['kw'],
 'chym': ['kym'],
 'ci': ['si'],
 'cia':

In [12]:
corruption.phonetic_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.5)

'te kwuik bahon voecs jums ofr t lasy tok'

# Keyboard corruption

In [13]:
corruption.df_qwerty

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,q,w,e,r,t,y,u,i,o,p
1,a,s,d,f,g,h,j,k,l,
2,z,x,c,v,b,n,m,,,
3,#,,,,,,,,,
4,7,8,9,,,,,,,
5,4,5,6,,,,,,,
6,1,2,3,,,,,,,


In [14]:
# Blank positions in the keyboard layout table are NaN, which str() renders as 'nan'
str(corruption.df_qwerty.loc[3,3])

'nan'

In [15]:
# The original version included NaNs as neighbors, but I removed those
corruption.qwerty_error_dict

{'q': ['w', 'a', 's'],
 'w': ['q', 'e', 'a', 's', 'd'],
 'e': ['w', 'r', 's', 'd', 'f'],
 'r': ['e', 't', 'd', 'f', 'g'],
 't': ['r', 'y', 'f', 'g', 'h'],
 'y': ['t', 'u', 'g', 'h', 'j'],
 'u': ['y', 'i', 'h', 'j', 'k'],
 'i': ['u', 'o', 'j', 'k', 'l'],
 'o': ['i', 'p', 'k', 'l'],
 'p': ['o', 'l'],
 'a': ['q', 'w', 's', 'z', 'x'],
 's': ['q', 'w', 'e', 'a', 'd', 'z', 'x', 'c'],
 'd': ['w', 'e', 'r', 's', 'f', 'x', 'c', 'v'],
 'f': ['e', 'r', 't', 'd', 'g', 'c', 'v', 'b'],
 'g': ['r', 't', 'y', 'f', 'h', 'v', 'b', 'n'],
 'h': ['t', 'y', 'u', 'g', 'j', 'b', 'n', 'm'],
 'j': ['y', 'u', 'i', 'h', 'k', 'n', 'm'],
 'k': ['u', 'i', 'o', 'j', 'l', 'm'],
 'l': ['i', 'o', 'p', 'k'],
 'z': ['a', 's', 'x'],
 'x': ['a', 's', 'd', 'z', 'c'],
 'c': ['s', 'd', 'f', 'x', 'v'],
 'v': ['d', 'f', 'g', 'c', 'b'],
 'b': ['f', 'g', 'h', 'v', 'n'],
 'n': ['g', 'h', 'j', 'b', 'm'],
 'm': ['h', 'j', 'k', 'n'],
 '7': ['8', '4', '5'],
 '8': ['7', '9', '4', '5', '6'],
 '9': ['8', '5', '6'],
 '4': ['7', '8', '5', '

In [16]:
# Originally, NaNs were included in the dictionary as floats,
# but now there should be only strings and no NaNs
list(map(type, corruption.qwerty_error_dict['x']))

[str, str, str, str, str]

In [17]:
corruption.keyboard_corrupt(
    "the quick brown fox jumps over the lazy dog", corrupted_pr=.1, addl_pr=.9)

'rthe quick brown fox jumps over the plazy dog'

In [18]:
corruption.keyboard_corrupt(
    "the quick brown fox jumps over the lazy dog", corrupted_pr=.2, addl_pr=.1)

'tts wuisk frown fox jumps over the lazy dog'

In [19]:
# Originally, letters sometimes got replaced with the string 'nan' because of
# the NaNs in the dict, but that should not happen anymore
corruption.keyboard_corrupt(
    "the quick brown fox jumps over the lazy dog", corrupted_pr=.9, addl_pr=.9)

'tmwe aquuscok nbroewn bfoc uumpxs obvreer ythwe plaxzty fdofg'

# Get some simulated data to test with

In [20]:
# Root directory of the synthetic-population simulation outputs
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
# Run-specific subdirectory. No trailing slash: file names are appended to
# `output_dir` with '/', and a trailing slash here would produce '//' in the path
output_subdir = 'results/special_last_names/florida/2022_10_14_10_49_32/population_table'
# Full directory containing the population tables for this run
output_dir = f'{project_output_dir}/{output_subdir}'

!ls -l $output_dir

total 32224
-rw-rw-r-- 1 albrja   IHME-Simulationscience 12622072 Oct 20 23:08 decennial_census.hdf
-rwxrwxrwx 1 beatrixh IHME-Simulationscience 20364830 Nov 14 16:42 state_table.hdf


In [21]:
# Path to the decennial census HDF file inside `output_dir`
decennial_census_path = f'{output_dir}/decennial_census.hdf'
# Open the store in read mode; the `with` block closes the file on exit
with pd.HDFStore(decennial_census_path, 'r') as census_hdf:
    # Print a summary of the store (file path plus one line per stored key)
    print(census_hdf.info())
    # Keep the list of keys so it is still available after the store is closed
    census_keys = census_hdf.keys()

<class 'pandas.io.pytables.HDFStore'>
File path: /mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/special_last_names/florida/2022_10_14_10_49_32/population_table//decennial_census.hdf
/year_2020            frame        (shape->[47444,10])
/year_2030            frame        (shape->[46440,10])
/year_2040            frame        (shape->[44626,10])


In [22]:
# Keys exist for years 2020, 2030, 2040; only the years listed here are loaded
years = [2020]
# Map each year to the table stored under the `year_<year>` key of the HDF file
census = {year: pd.read_hdf(decennial_census_path, f'year_{year}') for year in years}

# Report the (rows, columns) shape of each loaded table
for year in years:
    print(year, census[year].shape)

2020 (47444, 10)


In [23]:
census[2020]

Unnamed: 0,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial
0,Margaret,Clark,68.681336,1951-07-27,"1344 winoka rd brooksville, fl",34601,Reference person,Female,Black,J
1,Jeffrey,Littlejohn,52.913882,1967-05-03,"927 23rd st clearwater, fl",34698,Reference person,Male,Black,V
2,Briana,Jackson,13.566889,2006-09-07,"927 23rd st clearwater, fl",34698,Biological child,Female,Black,A
3,Benjamin,Cox,21.444732,1998-10-21,"927 23rd st clearwater, fl",34698,Stepchild,Male,Black,D
4,Willie,Tucker,72.478355,1947-10-09,"8904 167th place fleming island, fl",32003,Reference person,Male,White,J
...,...,...,...,...,...,...,...,...,...,...
49993,Thomas,Gutierrez,29.374925,1990-11-16,"2210 henn hyde rd ne hollywood, fl",33021,Institutionalized GQ pop,Male,Latino,B
49994,Marcus,Roman,31.731657,1988-07-08,"2210 henn hyde rd ne hollywood, fl",33021,Institutionalized GQ pop,Male,Multiracial or Other,S
49997,Christian,Rosales,36.294295,1983-12-16,"701 haber rd vero beach, fl",32968,Institutionalized GQ pop,Male,Latino,C
49998,Phillip,Morton,34.806130,1985-06-11,"114 s frnt st fort myers, fl",33919,Institutionalized GQ pop,Male,White,J


# Add keyboard noise to addresses and see how long it takes on 50,000 records

Try it two different ways:

1. With `Series.map`, using a `lambda` to get a partially applied function
2. Using `np.vectorize`

Both took a little more than 2 seconds. It looks like `np.vectorize` is slightly faster, but it returns a NumPy array rather than a pandas `Series`.

In [24]:
%%time
census[2020].address.map(lambda s: corruption.keyboard_corrupt(s, .01, .01))

CPU times: user 2.12 s, sys: 45.1 ms, total: 2.16 s
Wall time: 2.14 s


0             1344 winoka rd  brooksville, fl
1                 927 23rd st  clearwater, fl
2                 927 23rd st  clearwater, fl
3                 927 23rd st  clearwater, fl
4        8904 167th place  fleming island, fl
                         ...                 
49993     2210 henn hyde rd ne  hollywood, fl
49994     2210 henn hyde rd ne  hollywood, fl
49997            701 haber rd  vero beach, fl
49998           114 s frnt st  fort myers, fl
49999            701 haber rd  vero beach, fl
Name: address, Length: 47444, dtype: object

In [25]:
%%time
np.vectorize(corruption.keyboard_corrupt)(census[2020].address, .01, .01)

CPU times: user 2.15 s, sys: 16.5 ms, total: 2.16 s
Wall time: 2.14 s


array(['1344 winoka rd  brooksville, fl', '927 23rd st  clearwater, fl',
       '927 23rd st  clearwater, fl', ..., '701 haber rd  vero beach, fl',
       '114 s frnt st  fort myers, fl', '701 haber rd  vero beach, fl'],
      dtype='<U69')

# Do a more precise speed test of `Series.map` vs. `np.vectorize`

They're almost the same -- `np.vectorize` seems to give a slight speedup of about 2%. Let's plan to stick with `Series.map` for now.

In [26]:
%timeit census[2020].address.map(lambda s: corruption.keyboard_corrupt(s, .01, .01))

2.14 s ± 6.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [27]:
%timeit np.vectorize(corruption.keyboard_corrupt)(census[2020].address, .01, .01)

2.14 s ± 13.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
# Ratio of two hand-entered run times (presumably `Series.map` over `np.vectorize`).
# NOTE(review): 2.1 does not appear in the timing outputs shown in this notebook
# (both `%timeit` results report 2.14 s) -- presumably from an earlier run; confirm
2.14/2.1

1.019047619047619

In [29]:
# Ratio of two hand-entered run times (presumably `Series.map` over `np.vectorize`).
# NOTE(review): 2.08 does not appear in the timing outputs shown in this notebook
# -- presumably from an earlier run; confirm
2.12/2.08

1.0192307692307692

# See how long it takes to add phonetic noise to 50,000 addresses

Phonetic is the most complicated type of noise, and addresses are the longest field in this dataset, so this should take the most time. It looks like it takes about twice as long as keyboard noise.

In [30]:
%%time
census[2020].address.map(lambda s: corruption.phonetic_corrupt(s, .01))

CPU times: user 4.26 s, sys: 56.4 ms, total: 4.31 s
Wall time: 4.27 s


0             1344 winoka rd  brooksville, fl
1                 927 23rd st  clearwater, fl
2                 927 23rd st  clearwater, fl
3                 927 23rd st  clearwater, fl
4        8904 167th place  fleming island, fl
                         ...                 
49993     2210 henn hyde rd ne  holywood, fle
49994      2210 henn hyde rd ne  holywood, fl
49997            701 haber rd  vero beach, fl
49998           114 s frnt st  fort myers, fl
49999             701 haber rd  vero beac, fl
Name: address, Length: 47444, dtype: object

In [31]:
%timeit census[2020].address.map(lambda s: corruption.phonetic_corrupt(s, .01))

4.28 s ± 11.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
%%time
census[2020].last_name.map(lambda s: corruption.phonetic_corrupt(s, .01))

CPU times: user 1.39 s, sys: 4.14 ms, total: 1.39 s
Wall time: 1.37 s


0             Clark
1        Littlejohn
2           Jackson
3               Cox
4            Tucker
            ...    
49993     Gutierrez
49994         Roman
49997       Rosales
49998        Morton
49999         Tally
Name: last_name, Length: 47444, dtype: object

# Look at mean lengths of last names and addresses

Since the corruption functions act independently on individual characters or substrings with a fixed probability, if we wanted X% of records to be corrupted, we could divide X% by the mean string length and pass the result to the corruption function as its probability, which would affect approximately the right proportion of records. However, it would be more efficient to first select X% of the records to corrupt and then apply the noise to only those records, using whatever separate token-level probability we think is appropriate.

In [33]:
census[2020].last_name.map(len).mean()

6.801766292892673

In [34]:
census[2020].address.map(len).mean()

33.42391029424163

In [35]:
census[2020].last_name

0             Clark
1        Littlejohn
2           Jackson
3               Cox
4            Tucker
            ...    
49993     Gutierrez
49994         Roman
49997       Rosales
49998        Morton
49999        Talley
Name: last_name, Length: 47444, dtype: object

# Test OCR noise on 50,000 records

Looks like this is slightly faster than keyboard noise, by about 7.5%.

In [36]:
%%time
census[2020].address.map(lambda s: corruption.ocr_corrupt(s, 0.01))

CPU times: user 1.99 s, sys: 16 ms, total: 2.01 s
Wall time: 1.99 s


0             1344 winoka rd  brooksville, fl
1                 927 23rd st  clearwater, fl
2                 927 23rd st  clearwater, fl
3                 927 23rd st  clearwater, fl
4        8904 167th place  fleming island, fl
                         ...                 
49993     2210 henn hyde rd ne  hollywood, fl
49994     22|0 henn hyde rd ne  hollywood, fl
49997            701 haber rd  vero beach, fl
49998           114 s frnt st  fort myers, fl
49999            701 haber rd  vero beach, fl
Name: address, Length: 47444, dtype: object

In [37]:
%timeit census[2020].address.map(lambda s: corruption.ocr_corrupt(s, 0.01))

1.98 s ± 16.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
# Fractional time saved by OCR noise (1.98 s per loop) relative to keyboard noise (2.14 s per loop)
1-1.98/2.14

0.07476635514018692

# Test fake names module

In [39]:
fake_names.fake_first_names

['GIRL',
 'MOM',
 'A',
 'GOH',
 'MOTHER',
 'ADULT',
 'GRANDCHILD',
 'MR',
 'ADULT MALE',
 'GRANDDAUGHTER',
 'MRS',
 'B',
 'GRANDSON',
 'MS',
 'BABY',
 'H',
 'N',
 'BOY',
 'HIJA',
 'NEPHEW',
 'BROTHER',
 'HIJO',
 'NINO',
 'C',
 'HOUSE',
 'O',
 'CHILD',
 'HUSBAND',
 'OLDEST',
 'CHILD F',
 'INMATE',
 'ONE',
 'COH',
 'J',
 'P',
 'D',
 'K',
 'PERSON',
 'DAD',
 'KID',
 'R',
 'DAU',
 'L',
 'RESIDENT',
 'DAUGHTER',
 'LADY',
 'RESPONDENT',
 'DAUGHTER OF',
 'LADY IN THE',
 'S',
 'DOH',
 'LADY OF',
 'SENOR',
 'E',
 'LADY OF HOUSE',
 'SENORA',
 'F',
 'LADY OF THE',
 'SISTER',
 'FATHER',
 'LOH',
 'SOH',
 'FEMALE',
 'M',
 'SON',
 'FEMALE CHILD',
 'MALE',
 'SON OF',
 'FRIEND',
 'MALE CHILD',
 'T',
 'G',
 'MAN',
 'V',
 'GENT',
 'MAN IN THE',
 'W',
 'GENTELMAN',
 'MAN OF',
 'WIFE',
 'GENTLE',
 'MAN OF THE',
 'WOMAN',
 'GENTLEMAN',
 'MINOR',
 'YOUNGEST',
 'GENTLEMAN OF',
 'MISS',
 'GENTLEMEN',
 'MOH']

# See what the fake names look like when converted to title case

Looks like every word gets capitalized, including prepositions/articles like "Of The" or "De La".

In [40]:
# Title-case each fake first name to inspect how multi-word entries get capitalized
[name.title() for name in fake_names.fake_first_names]

['Girl',
 'Mom',
 'A',
 'Goh',
 'Mother',
 'Adult',
 'Grandchild',
 'Mr',
 'Adult Male',
 'Granddaughter',
 'Mrs',
 'B',
 'Grandson',
 'Ms',
 'Baby',
 'H',
 'N',
 'Boy',
 'Hija',
 'Nephew',
 'Brother',
 'Hijo',
 'Nino',
 'C',
 'House',
 'O',
 'Child',
 'Husband',
 'Oldest',
 'Child F',
 'Inmate',
 'One',
 'Coh',
 'J',
 'P',
 'D',
 'K',
 'Person',
 'Dad',
 'Kid',
 'R',
 'Dau',
 'L',
 'Resident',
 'Daughter',
 'Lady',
 'Respondent',
 'Daughter Of',
 'Lady In The',
 'S',
 'Doh',
 'Lady Of',
 'Senor',
 'E',
 'Lady Of House',
 'Senora',
 'F',
 'Lady Of The',
 'Sister',
 'Father',
 'Loh',
 'Soh',
 'Female',
 'M',
 'Son',
 'Female Child',
 'Male',
 'Son Of',
 'Friend',
 'Male Child',
 'T',
 'G',
 'Man',
 'V',
 'Gent',
 'Man In The',
 'W',
 'Gentelman',
 'Man Of',
 'Wife',
 'Gentle',
 'Man Of The',
 'Woman',
 'Gentleman',
 'Minor',
 'Youngest',
 'Gentleman Of',
 'Miss',
 'Gentlemen',
 'Moh']

In [41]:
# Title-case each fake last name to inspect how multi-word entries get capitalized
[name.title() for name in fake_names.fake_last_names]

['Hh',
 'Of The House',
 'A',
 'Hhm',
 'One',
 'Adult',
 'Home',
 'Owner',
 'Anon',
 'House',
 'P',
 'Anonymous',
 'Household',
 'Parent',
 'Apellido',
 'Householder',
 'Person',
 'B',
 'Husband',
 'R',
 'Boy',
 'J',
 'Ref',
 'C',
 'K',
 'Refuse',
 'Casa',
 'L',
 'Resident',
 'Child',
 'Lady',
 'Resp',
 'Coh',
 'Lady Of House',
 'Respondant',
 'D',
 'Lady Of The House',
 'Respondent',
 'Daughter',
 'Last Name',
 'S',
 'De Casa',
 'Loh',
 'Soh',
 'De La Casa',
 'M',
 'Son',
 'Declined',
 'Male',
 'T',
 'Doe',
 'Man',
 'The House',
 'Doh',
 'Man Of The House',
 'Three',
 'Dont Know',
 'Moh',
 'Two',
 'E',
 'N',
 'Unk',
 'F',
 'Na',
 'Unknown',
 'Female',
 'No',
 'W',
 'Four',
 'No Last Name',
 'Wife',
 'Friend',
 'No Name',
 'X',
 'G',
 'None',
 'Xxx',
 'Girl',
 'O',
 'Y',
 'Goh',
 'Occupant',
 'Younger',
 'H',
 'Of House',
 'H Age',
 'Of The Home']

In [42]:
# Show the built-in documentation for the string `title` method used above
help('abd'.title)

Help on built-in function title:

title() method of builtins.str instance
    Return a version of the string where each word is titlecased.
    
    More specifically, words start with uppercased characters and all remaining
    cased characters have lower case.

