In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

from vivarium_research_prl.noise import corruption

# Record provenance for this run: timestamp, user, host/kernel, and working directory.
!date
!whoami
!uname -a
!pwd

Thu 01 Dec 2022 12:13:52 AM PST
ndbs
Linux int-slurm-sarchive-p0012 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/noise


In [2]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to the corruption module are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Get some data

In [3]:
# Location of the simulated-population outputs used as test data.
# `output_subdir` intentionally has NO trailing slash: file paths are built as
# f'{output_dir}/<file name>', so a trailing slash here would yield a doubled
# '//' in every resulting path.
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
output_subdir = 'results/special_last_names/florida/2022_10_14_10_49_32/population_table'
output_dir = f'{project_output_dir}/{output_subdir}'

!ls -l $output_dir

total 32224
-rw-rw-r-- 1 albrja   IHME-Simulationscience 12622072 Oct 20 23:08 decennial_census.hdf
-rwxrwxrwx 1 beatrixh IHME-Simulationscience 20364830 Nov 14 16:42 state_table.hdf


In [4]:
# Full path of the decennial census HDF file inside the output directory.
decennial_census_path = f'{output_dir}/decennial_census.hdf'
# Open the store read-only; the context manager closes it when the block exits.
with pd.HDFStore(decennial_census_path, 'r') as census_hdf:
    # Print a summary of the tables stored in the file.
    print(census_hdf.info())
    # Save the store's keys (one per stored table).
    census_keys = census_hdf.keys()

<class 'pandas.io.pytables.HDFStore'>
File path: /mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/special_last_names/florida/2022_10_14_10_49_32/population_table//decennial_census.hdf
/year_2020            frame        (shape->[47444,10])
/year_2030            frame        (shape->[46440,10])
/year_2040            frame        (shape->[44626,10])


In [5]:
# The store holds one table per census year (2020, 2030, 2040);
# list here the years that should be loaded.
years = [2020]

# Read each requested year's table into a dict keyed by year.
census = {}
for year in years:
    census[year] = pd.read_hdf(decennial_census_path, key=f'year_{year}')

# Report the shape of every loaded table.
for year in years:
    print(f'{year} {census[year].shape}')

2020 (47444, 10)


# Check module imports

In [6]:
# List the names exposed by the imported corruption module.
# NOTE(review): the saved output does not include df_qwerty, qwerty_error_dict,
# or keyboard_corrupt, which later cells use; it appears to predate those
# additions, so re-run this cell to refresh it.
dir(corruption)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'df_k',
 'df_ocr',
 'df_phonetic',
 'k',
 'np',
 'ocr_corrupt',
 'ocr_error_dict',
 'pd',
 'phonetic_corrupt',
 'phonetic_error_dict']

# Test OCR noise

In [7]:
# Display the OCR lookup table; each row pairs an `ocr_true` string with an
# `ocr_err` string.
corruption.df_ocr

Unnamed: 0,ocr_true,ocr_err
0,5,S
1,5,s
2,2,Z
3,2,z
4,1,|
...,...,...
44,l<,k
45,1<,k
46,m,rn
47,l,|


In [8]:
# Count how many `ocr_true` entries have each string length (in characters).
corruption.df_ocr.ocr_true.map(len).value_counts()

1    34
2    13
3     2
Name: ocr_true, dtype: int64

In [9]:
# Display the OCR lookup as a dict: each key string maps to a list of
# candidate replacement strings.
corruption.ocr_error_dict

{'0': ['o', 'O'],
 '1': ['|'],
 '12': ['R'],
 '13': ['B'],
 '17': ['n'],
 '1<': ['k'],
 '1>': ['b'],
 '2': ['Z', 'z'],
 '5': ['S', 's'],
 '6': ['G'],
 'A': ['4'],
 'B': ['8'],
 'D': ['O'],
 'E': ['F'],
 'F': ['P'],
 'I-I': ['H'],
 'IJ': ['U'],
 'LI': ['U'],
 'Q': ['O'],
 'U': ['V'],
 'Y': ['V'],
 'cl': ['d'],
 'g': ['9', 'q'],
 'h': ['b'],
 'i': ["'l", ':'],
 'iii': ['m'],
 'j': ['i'],
 'k': ['lc'],
 'l': ['J', '1', 'I', '|'],
 'l<': ['k'],
 'l>': ['b'],
 'lJ': ['U'],
 'lo': ['b'],
 'm': ['n', 'rn'],
 'q': ['9', '4'],
 'ri': ['n'],
 'u': ['v'],
 'w': ['vv'],
 'y': ['v']}

In [10]:
# Apply OCR corruption to a pangram to eyeball the result.
# NOTE(review): corrupted_pr is presumably the probability of corrupting each
# eligible token; confirm against the corruption module.
corruption.ocr_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.5)

'the 4viclc brown fox ivmps over the 1azv do9'

# Test phonetic corruption

In [11]:
# Display the phonetic substitution table (columns: where, orig, new, pre,
# post, pattern, start).
corruption.df_phonetic

Unnamed: 0,where,orig,new,pre,post,pattern,start
0,ALL,h,@,,,,
1,END,e,@,,,,
2,ALL,t,d,,,,
3,ALL,d,t,,,,
4,ALL,c,k,,,,
...,...,...,...,...,...,...,...
351,ALL,zza,sa,,,,
352,MIDDLE,z,s,n;-1;t,,y;slavo,
353,MIDDLE,ks,x,,,,
354,MIDDLE,cks,x,y;-1;a;i;u;e;o,,,


In [12]:
# Count how many `orig` entries have each string length (in characters).
corruption.df_phonetic.orig.map(len).value_counts()

2    144
3     82
4     71
1     33
5     17
6      8
7      1
Name: orig, dtype: int64

In [13]:
# Display the phonetic lookup as a dict: each key string maps to a list of
# candidate replacement strings.
corruption.phonetic_error_dict

{'aa': ['ar'],
 'acce': ['akse'],
 'acch': ['aksh'],
 'acci': ['aksi'],
 'ach': ['k'],
 'achb': ['akb'],
 'achf': ['akf'],
 'achh': ['akh'],
 'achl': ['akl'],
 'achm': ['akm'],
 'achn': ['akn'],
 'achr': ['akr'],
 'achv': ['akv'],
 'achw': ['akw'],
 'aggi': ['aji', 'aki'],
 'ah': ['h'],
 'aiss': ['ai'],
 'aisz': ['ai'],
 'alle': ['ale'],
 'archit': ['arkit'],
 'au': ['o'],
 'augh': ['arf'],
 'aux': ['auks'],
 'aw': ['a'],
 'b': ['p'],
 'bacher': ['baker'],
 'bb': ['p'],
 'btl': ['tl'],
 'c': ['k'],
 'ca': ['ka'],
 'caesar': ['sesar'],
 'cc': ['k', 'k'],
 'cce': ['xi'],
 'cch': ['xh'],
 'cci': ['xi'],
 'ce': ['se'],
 'cg': ['k', 'k'],
 'ch': ['x', 'x'],
 'chae': ['kae'],
 'charac': ['karak'],
 'charis': ['karis'],
 'chb': ['kb'],
 'chem': ['kem'],
 'chf': ['kf'],
 'chh': ['kh'],
 'chia': ['kia', 'kia'],
 'chl': ['kl'],
 'chm': ['km'],
 'chn': ['kn'],
 'chor': ['kor'],
 'chr': ['kr'],
 'chs': ['ks'],
 'cht': ['kt'],
 'chv': ['kv'],
 'chw': ['kw'],
 'chym': ['kym'],
 'ci': ['si'],
 'cia':

In [14]:
# Apply phonetic corruption to the same pangram to eyeball the result.
# NOTE(review): corrupted_pr is presumably the probability of corrupting each
# eligible token; confirm against the corruption module.
corruption.phonetic_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.5)

'te kuycck bron vox jums ofah te lazy dok'

# Test keyboard corruption

In [16]:
# Display the keyboard layout grid: letter rows, a '#' row, and digit rows.
# Cells with no key are blank (NaN).
corruption.df_qwerty

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,q,w,e,r,t,y,u,i,o,p
1,a,s,d,f,g,h,j,k,l,
2,z,x,c,v,b,n,m,,,
3,#,,,,,,,,,
4,7,8,9,,,,,,,
5,4,5,6,,,,,,,
6,1,2,3,,,,,,,


In [28]:
# A blank cell of the keyboard grid is NaN; str() turns it into the text 'nan'.
str(corruption.df_qwerty.loc[3,3])

'nan'

In [17]:
# Display the keyboard lookup dict: each key maps to a list of adjacent keys.
# Note that some lists contain NaN entries.
corruption.qwerty_error_dict

{'q': ['w', 'a', 's'],
 'w': ['q', 'e', 'a', 's', 'd'],
 'e': ['w', 'r', 's', 'd', 'f'],
 'r': ['e', 't', 'd', 'f', 'g'],
 't': ['r', 'y', 'f', 'g', 'h'],
 'y': ['t', 'u', 'g', 'h', 'j'],
 'u': ['y', 'i', 'h', 'j', 'k'],
 'i': ['u', 'o', 'j', 'k', 'l'],
 'o': ['i', 'p', 'k', 'l', nan],
 'p': ['o', 'l', nan],
 'a': ['q', 'w', 's', 'z', 'x'],
 's': ['q', 'w', 'e', 'a', 'd', 'z', 'x', 'c'],
 'd': ['w', 'e', 'r', 's', 'f', 'x', 'c', 'v'],
 'f': ['e', 'r', 't', 'd', 'g', 'c', 'v', 'b'],
 'g': ['r', 't', 'y', 'f', 'h', 'v', 'b', 'n'],
 'h': ['t', 'y', 'u', 'g', 'j', 'b', 'n', 'm'],
 'j': ['y', 'u', 'i', 'h', 'k', 'n', 'm', nan],
 'k': ['u', 'i', 'o', 'j', 'l', 'm', nan, nan],
 'l': ['i', 'o', 'p', 'k', nan, nan, nan, nan],
 'z': ['a', 's', 'x', nan],
 'x': ['a', 's', 'd', 'z', 'c', nan, nan],
 'c': ['s', 'd', 'f', 'x', 'v', nan, nan, nan],
 'v': ['d', 'f', 'g', 'c', 'b', nan, nan, nan],
 'b': ['f', 'g', 'h', 'v', 'n', nan, nan, nan],
 'n': ['g', 'h', 'j', 'b', 'm', nan, nan, nan],
 'm': ['h'

In [34]:
# The NaN placeholders in the dictionary are floats, not strings:
# inspect the element types in the list stored under key 'x'.
list(map(type, corruption.qwerty_error_dict['x']))

[str, str, str, str, str, float, float]

In [30]:
# Low corrupted_pr, high addl_pr.
# Known issue: the NaN entries in qwerty_error_dict can be picked as a
# "neighboring key", which puts the literal text 'nan' into the result
# (visible in the saved output).
# NOTE(review): judging from the saved outputs, addl_pr looks like the chance
# that the wrong key is typed in addition to, rather than instead of, the
# intended one; confirm against the corruption module.
corruption.keyboard_corrupt(
    "the quick brown fox jumps over the lazy dog", corrupted_pr=.1, addl_pr=.9)

'the quicjk brown fox jumps ovwr thwe nanlazgy sog'

In [32]:
# Moderate corrupted_pr, low addl_pr: in the saved output the affected
# letters are substituted rather than gaining an extra character.
# NOTE(review): parameter semantics inferred from outputs only; confirm
# against the corruption module.
corruption.keyboard_corrupt(
    "the quick brown fox jumps over the lazy dog", corrupted_pr=.2, addl_pr=.1)

'the quicm brown fox jumpc over the lazy dog'

In [35]:
# Heavy corruption: both probabilities set to 0.9.
# The literal text 'nan' can again appear in the result because of the NaN
# entries in qwerty_error_dict.
corruption.keyboard_corrupt(
    "the quick brown fox jumps over the lazy dog", corrupted_pr=.9, addl_pr=.9)

'rtjhwe aqukifck nanbfroawn vfozx mjumpas ogvdedr gtthse iazu wdoyg'