In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 8)
!date

%load_ext autoreload
%autoreload 2

Tue Apr 12 19:49:44 PDT 2022


# Reproduce data corruption like GeCO

Informed by reading the GeCO manual (https://dmm.anu.edu.au/geco/flex-data-gen-manual.pdf), but deliberately without looking at the GeCO source code, since its license might conflict with the license we end up using for this sim.


In [2]:
# Change into the directory holding the lookup CSVs; the pd.read_csv calls in this notebook use bare file names.
%cd ~/projects/2022/geco-data-generator-corruptor/lookup-files/
# !ls -halt

/ihme/homes/abie/projects/2022/geco-data-generator-corruptor/lookup-files


In [3]:
# Load the OCR confusion table as (true token, misread token) pairs.
# The first two lines of the file are skipped and the column names are supplied here.
df_ocr = pd.read_csv(
    'ocr-variations-upper-lower.csv',
    names=['ocr_true', 'ocr_err'],
    header=None,
    skiprows=[0, 1],
)
df_ocr['ocr_true'].map(len).value_counts() # how many characters in the true string?

1    34
2    13
3     2
Name: ocr_true, dtype: int64

# Algorithm sketch

For each token decide if it is OCRed correctly, and if it is not, decide how it goes wrong.

Since there are tokens of length 1, 2, and 3, how should overlapping matches be handled?  I guess I can start with threes, then twos, then ones, at each location in a string.

In [7]:
# Lookup: true token -> list of every misreading listed for it in df_ocr.
ocr_error_dict = {
    true_token: list(variants['ocr_err'])
    for true_token, variants in df_ocr.groupby('ocr_true')
}

In [62]:
def ocr_corrupt(truth, corrupted_pr):
    """Return a copy of ``truth`` with simulated OCR misreadings.

    The string is scanned left to right. At each position the 3-, 2- and
    1-character tokens starting there are tried in that order; every token
    that is a key of the module-level ``ocr_error_dict`` gets its own
    independent draw, and on success is replaced by a uniformly chosen
    variant from ``ocr_error_dict[token]``. Scanning then resumes after the
    replaced token. Characters with no successful draw are copied unchanged.

    Parameters
    ----------
    truth : str
        The clean input string.
    corrupted_pr : float
        Probability that any single matching token is corrupted. Because
        several token lengths can match at one position, the per-position
        error rate can exceed this value.

    Returns
    -------
    str
        The possibly corrupted string.
    """
    pieces = []
    i = 0
    while i < len(truth):
        for token_length in (3, 2, 1):  # longest token first
            token = truth[i:i + token_length]
            if len(token) < token_length:
                # The slice ran off the end of the string. Skipping it keeps a
                # trailing short token from being tested once per token
                # length, which would inflate its corruption probability.
                continue
            if token in ocr_error_dict and np.random.uniform() < corrupted_pr:
                pieces.append(str(np.random.choice(ocr_error_dict[token])))
                i += token_length
                break
        else:
            # No corruption happened at this position: keep the character.
            pieces.append(truth[i])
            i += 1
    return ''.join(pieces)

# Quick smoke test on a pangram; no random seed is set in this cell, so the result differs between runs.
ocr_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.5)

"tbe 4u'lclc brovvn fox jurnps over the |azy doq"

# Hardest one: phonetic corruption

This includes an undocumented microlanguage, with commands like `n;-1;t` to mean: do not use this rule if the character before it is a `t`.

In [64]:
# Load the phonetic substitution rules, skipping the first two lines of the file
# and naming the seven columns explicitly.
phonetic_columns = ['where', 'orig', 'new', 'pre', 'post', 'pattern', 'start']
df_phonetic = pd.read_csv(
    'phonetic-variations.csv',
    names=phonetic_columns,
    header=None,
    skiprows=[0, 1],
)
df_phonetic

Unnamed: 0,where,orig,new,pre,post,pattern,start
0,ALL,h,@,,,,
1,END,e,@,,,,
2,ALL,t,d,,,,
3,ALL,d,t,,,,
...,...,...,...,...,...,...,...
352,MIDDLE,z,s,n;-1;t,,y;slavo,
353,MIDDLE,ks,x,,,,
354,MIDDLE,cks,x,y;-1;a;i;u;e;o,,,
355,END,l,le,y;-1;ai,,,


In [66]:
# How many characters long are the source tokens of the phonetic rules?
df_phonetic['orig'].map(len).value_counts()

2    144
3     82
4     71
1     33
5     17
6      8
7      1
Name: orig, dtype: int64

In [65]:
# Lookup: original token -> list of replacement strings, with every '@' removed
# from the replacements.
phonetic_error_dict = {
    orig_token: list(rules['new'].str.replace('@', ''))
    for orig_token, rules in df_phonetic.groupby('orig')
}

In [77]:
def phonetic_corrupt(truth, corrupted_pr):
    """Return a copy of ``truth`` with simulated phonetic misspellings.

    The string is scanned left to right. At each position, tokens of length
    7 down to 1 starting there are tried in that order; every token that is
    a key of the module-level ``phonetic_error_dict`` gets its own
    independent draw, and on success is replaced by a uniformly chosen entry
    of ``phonetic_error_dict[token]``. Scanning then resumes after the
    replaced token. Characters with no successful draw are copied unchanged.

    Parameters
    ----------
    truth : str
        The clean input string.
    corrupted_pr : float
        Probability that any single matching token is corrupted. Because
        several token lengths can match at one position, the per-position
        error rate can exceed this value.

    Returns
    -------
    str
        The possibly corrupted string.
    """
    pieces = []
    i = 0
    while i < len(truth):
        for token_length in (7, 6, 5, 4, 3, 2, 1):  # longest token first
            token = truth[i:i + token_length]
            if len(token) < token_length:
                # The slice ran off the end of the string. Skipping it keeps a
                # trailing short token from being tested once per token
                # length, which would inflate its corruption probability.
                continue
            if token in phonetic_error_dict and np.random.uniform() < corrupted_pr:
                # TODO: only consider possibilities allowed by where, pre, post, pattern, and start values
                pieces.append(str(np.random.choice(phonetic_error_dict[token])))
                i += token_length
                break
        else:
            # No corruption happened at this position: keep the character.
            pieces.append(truth[i])
            i += 1
    return ''.join(pieces)

# Quick smoke test on a pangram; no random seed is set in this cell, so the result differs between runs.
phonetic_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.5)

'dhe kwuyk bahon fox jumps ovah dh lazi dok'

In [82]:
# Load the keyboard layout grid (one cell per key), skipping the first two lines of the file.
df_qwerty = pd.read_csv(
    'qwerty-keyboard.csv',
    header=None,
    skiprows=[0, 1],
)
df_qwerty

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,q,w,e,r,t,y,u,i,o,p
1,a,s,d,f,g,h,j,k,l,
2,z,x,c,v,b,n,m,,,
3,#,,,,,,,,,
4,7,8,9,,,,,,,
5,4,5,6,,,,,,,
6,1,2,3,,,,,,,


In [90]:
# Lookup: key -> list of keys in the (up to) eight surrounding grid cells of df_qwerty.
# Empty grid cells (NaN) and the '#' marker are neither keys nor neighbours; without
# the NaN check a missing cell would be stored as a neighbour and could later be
# emitted as the literal text "nan" by np.random.choice.
qwerty_error_dict = {}
for i in df_qwerty.index:
    for j in df_qwerty.columns:
        val = df_qwerty.loc[i, j]
        if pd.isna(val) or val == '#':
            continue
        nbrs = []
        for di in [-1, 0, 1]:
            for dj in [-1, 0, 1]:
                if di == 0 and dj == 0:
                    continue  # only actual nbrs, not val itself
                if i+di in df_qwerty.index and j+dj in df_qwerty.columns:
                    nbr_val = df_qwerty.loc[i+di, j+dj]
                    if pd.notna(nbr_val) and nbr_val != '#':
                        nbrs.append(nbr_val)
        qwerty_error_dict[val] = nbrs

In [92]:
def keyboard_corrupt(truth, corrupted_pr, addl_pr):
    """Return a copy of ``truth`` with simulated keyboard typos.

    Each character that is a key of the module-level ``qwerty_error_dict``
    is, with probability ``corrupted_pr``, replaced by a uniformly chosen
    entry of ``qwerty_error_dict[char]``. When that happens, the original
    character is additionally appended after the wrong one with probability
    ``addl_pr`` (the typo becomes an insertion rather than a substitution).
    All other characters are copied unchanged.

    Parameters
    ----------
    truth : str
        The clean input string.
    corrupted_pr : float
        Probability that an eligible character is mistyped.
    addl_pr : float
        Probability, given a mistype, that the intended character is also
        kept after the wrong one.

    Returns
    -------
    str
        The possibly corrupted string.
    """
    pieces = []
    for ch in truth:
        # Eligibility is decided by the keyboard table itself, so every key it
        # contains can be corrupted and the lookup below cannot raise KeyError.
        nbrs = qwerty_error_dict.get(ch)
        # An empty neighbour list is treated as "not eligible" because
        # np.random.choice raises on an empty sequence.
        if nbrs and np.random.uniform() < corrupted_pr:
            pieces.append(str(np.random.choice(nbrs)))
            if np.random.uniform() < addl_pr:
                pieces.append(ch)
        else:
            pieces.append(ch)
    return ''.join(pieces)

# Quick smoke test on a pangram; no random seed is set in this cell, so the result differs between runs.
keyboard_corrupt("the quick brown fox jumps over the lazy dog", corrupted_pr=.1, addl_pr=.9)

'the quick brown fox jumps ovedr tuhe lazy dog'