<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpustools/blob/main/S105ocrCorrectionV06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# correction of ocr-generated text
- using correction dictionaries
- using rules

### Notes on the development
- Algorithm:
1. We extract rewrite rules from the annotations
2. If a word is not found in our corpus it can be an error:

>>
- We Apply the longest-first strategy for rewriting
- We Apply all the rules, converting the word into several candidates, where possible
- We check which candidates exist in Wikipedia dictionary

3. We print all candidates for annotation in a spreadsheet
4. Record which rules are most productive ...


Further investigation:
- introduce Levenshtein distance?
- introduce language model to check which rewriting operation to apply?


## Service functions

In [1]:
import re, os, sys


In [47]:
# critical path function...
# function for Armenian tokenization
def tokenizeTextHY(SFIn):
    LLParagraphs = []
    with open(SFIn, 'r') as FIn:
        countpara = 0
        for SLine in FIn:
            countpara += 1
            if countpara % 100000 == 0: print(countpara)
            SLine = SLine.strip()
            if SLine == '': continue
            LLine = re.split('([ ,\.\:։;\'\"\(\)\-\–\!\?\{\}\t\«\»]+)', SLine)
            # if LLine == '': continue
            if LLine: LLParagraphs.append(LLine)
    return LLParagraphs


# This section applies corrections to an Armenian texts and records which words have been corrected

In [96]:
def llPara2dict(LLParagraphs):
    DFreq = {}
    p=0
    for LPara in LLParagraphs:
        p+=1
        if p%200000 == 0:
            print(p)
        # if LPara == [] or LPara == ['']: continue
        if LPara == []: continue
        # FOut.write(str(LPara) + '\n')
        i = 0 # counting words
        for el in LPara:
            i+=1 # index of next word
            try:
                DFreq[el] += 1
            except:
                DFreq[el] = 1
    return DFreq


In [98]:
def printFrqDictinary(DWiki, FOut):
    for key, val in sorted(DWiki.items(), key=lambda item: item[1], reverse=True):
        FOut.write(f'{key}\t{val}\n')

## Downloading Armenian corpus, correcting lines, tokenizing

In [None]:
# core starts here - critical

!wget https://heibox.uni-heidelberg.de/f/c977e87cf2b244e6801b/?dl=1
!mv index.html?dl=1 KorpusARM.tgz


In [None]:
!tar xvzf KorpusARM.tgz
!mkdir KorpusARM1
!mkdir KorpusARM1/stage01
# concatenating files
!cat korpusARM/hyFiktion/* >KorpusARM1/stage01/hyFiktion.txt
!cat korpusARM/hyNatur/* >KorpusARM1/stage01/hyNatur.txt
!cat korpusARM/hyRecht/* >KorpusARM1/stage01/hyRecht.txt
!mkdir KorpusARM1/stage02

# function for Armenian line breaks:

def correctLineBreaksHY(FName, FNameOut):
    FIn = open(FName, 'r')
    FOut = open(FNameOut, 'w')
    countHyphens = 0
    for SLine in FIn:
        SLine = SLine.strip()
        if SLine == '':
            FOut.write('\n\n')
            continue
        if SLine[-1] == '-':
            SLine2write = SLine[:-1]
            FOut.write(SLine2write)
            countHyphens +=1
            continue
        FOut.write(SLine + ' ')
    FOut.flush()
    print(str(countHyphens) + ' hyphens corrected')
    return

correctLineBreaksHY('KorpusARM1/stage01/hyFiktion.txt', 'KorpusARM1/stage02/hyFiktion.txt')
correctLineBreaksHY('KorpusARM1/stage01/hyNatur.txt', 'KorpusARM1/stage02/hyNatur.txt')
correctLineBreaksHY('KorpusARM1/stage01/hyRecht.txt', 'KorpusARM1/stage02/hyRecht.txt')


In [5]:

!wc KorpusARM1/stage02/hyFiktion.txt
!wc KorpusARM1/stage02/hyNatur.txt
!wc KorpusARM1/stage02/hyRecht.txt

   6208   92131 1081755 KorpusARM1/stage02/hyFiktion.txt
  3642  67142 870081 KorpusARM1/stage02/hyNatur.txt
   8940   86621 1288655 KorpusARM1/stage02/hyRecht.txt


### now we tokenize the Armenian corpora

In [48]:
try:
    del LLParaHyF
except:
    sys.stderr.write(f'LLParaHyF doens\'t need to be deleted, not yet created...\n')

try:
    del LLParaHyN
except:
    sys.stderr.write(f'LLParaHyN doens\'t need to be deleted, not yet created...\n')

try:
    del LLParaHyR
except:
    sys.stderr.write(f'LLParaHyR doens\'t need to be deleted, not yet created...\n')

LLParaHyF doens't need to be deleted, not yet created...
LLParaHyN doens't need to be deleted, not yet created...
LLParaHyR doens't need to be deleted, not yet created...


In [49]:
LLParaHyF = tokenizeTextHY('/content/KorpusARM1/stage02/hyFiktion.txt')
LLParaHyN = tokenizeTextHY('/content/KorpusARM1/stage02/hyNatur.txt')
LLParaHyR = tokenizeTextHY('/content/KorpusARM1/stage02/hyRecht.txt')

In [50]:
# we check how our specialized corpora was tokenized...
print(LLParaHyF[1])
print(len(LLParaHyF[1]))
print(len(LLParaHyF))

print(LLParaHyN[1])
print(len(LLParaHyN[1]))
print(len(LLParaHyN))

print(LLParaHyR[1])
print(len(LLParaHyR[1]))
print(len(LLParaHyR))

['', '-- ', 'Բայց', ' ', 'դու', ' ', 'ինճ', ' ', 'անմիջապես', ' ', 'ամբողջովին', ' ', 'չպիտի', ' ', 'կուլ', ' ', 'տաս', ',--- ', 'ասաց', ' ', 'նա', ' ', 'մեղմորեն', ':', '']
25
3104
['Կան', ' ', 'մահվան', ' ', 'քարոզիչներ', ', ', 'ն', ' ', 'երկիրը', ' ', 'լիքն', ' ', 'է', ' ', 'նրանցով', ', ', 'ում', ' ', 'ոլետք', ' ', 'է', ' ', 'կյանքից', ' ', 'հեռացում', ' ', 'քարոզվի', ':', '']
29
1821
['ԳԼՈՒԽ', ' ', '1', ' ', 'ՀԻՄՆԱԿԱՆ', ' ', 'ԴՐՈՒՅԹՆԵՐ']
7
4471


In [97]:
DHyF = llPara2dict(LLParaHyF)
print(len(DHyF))

DHyN = llPara2dict(LLParaHyN)
print(len(DHyN))

DHyR = llPara2dict(LLParaHyR)
print(len(DHyR))

20748
21272
10849


## Wikipedia


In [None]:
# downloading wikipedia
### downloading Armenian Wikipedia
!wget https://heibox.uni-heidelberg.de/f/d1f866a61bd545318213/?dl=1
!mv index.html?dl=1 hywiki-20221101-pages-articles.txt.gz
!gunzip hywiki-20221101-pages-articles.txt.gz
# the length of wikipedia



In [None]:
!wc hywiki-20221101-pages-articles.txt

In [16]:
try:
    del LLParaWiki
except:
    sys.stderr.write(f'LLParaWiki doens\'t need to be deleted, not yet created...\n')

LLParaWiki doens't need to be deleted, not yet created...


In [None]:
LLParaWiki = tokenizeTextHY('/content/hywiki-20221101-pages-articles.txt')

In [18]:
print(LLParaWiki[1])

['Հայաստան', ' , ', 'պաշտոնական', ' ', 'անվանումը՝', ' ', 'Հայաստանի', ' ', 'Հանրապետություն', ', ', 'պետություն', ' ', 'Առաջավոր', ' ', 'Ասիայում՝', ' ', 'Հայկական', ' ', 'լեռնաշխարհի', ' ', 'հյուսիսարևելյան', ' ', 'մասում', '։ ', 'Քաղաքական', ' ', 'և', ' ', 'մշակութային', ' ', 'իմաստով', ', ', 'սակայն', ', ', 'գտնվում', ' ', 'է', ' ', 'հարավարևելյան', ' ', 'Եվրոպայի', ' ', 'Կովկասյան', ' ', 'տարածաշրջանում', '։ ', 'Հյուսիսում', ' ', 'սահմանակցում', ' ', 'է', ' ', 'Վրաստանին', ', ', 'արևելքում՝', ' ', 'Ադրբեջանին', ', ', 'հարավում՝', ' ', 'Իրանին', ', ', 'իսկ', ' ', 'արևմուտքում՝', ' ', 'Թուրքիային', '։ ', 'Հարավարևելյան', ' ', 'կողմում', ' ', 'Բերձորի', ' ', 'միջանցքով', ' ', 'կապվում', ' ', 'է', ' ', 'Արցախի', ' ', 'Հանրապետությանը', ', ', 'իսկ', ' ', 'հարավ', '-', 'արևմուտքում', ' ', 'Ադրբեջանի', ' ', 'էքսկլավ', ' ', 'Նախիջևանի', ' ', 'Ինքնավար', ' ', 'Հանրապետությունն', ' ', 'է', '։ ', 'Այժմյան', ' ', 'ՀՀ', '-', 'ն', ' ', 'զբաղեցնում', ' ', 'է', ' ', 'պատմական', ' ', 'Հայաստանի', 

In [19]:
len(LLParaWiki[1])

137

In [20]:
len(LLParaWiki)

2153019

In [22]:
DWiki = llPara2dict(LLParaWiki)
print(len(DWiki))

200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000
2071984


In [23]:
len(DWiki)

2071984

In [24]:
FOut = open('hywiki-frqDict.txt', 'w')
for key, val in sorted(DWiki.items(), key=lambda item: item[1], reverse=True):
    FOut.write(f'{key}\t{val}\n')

In [25]:
!wc hywiki-frqDict.txt

 2071974  4211029 42482907 hywiki-frqDict.txt


In [None]:
!head --lines=40 hywiki-frqDict.txt

## Discover candidate rewriting rules systematically

In [53]:
def getPrefInfSuf(wrd1, wrd2):
    try:
        cpref = os.path.commonprefix([wrd1, wrd2])
        drw1 = wrd1[::-1]
        drw2 = wrd2[::-1]
        ffusc = os.path.commonprefix([drw1, drw2])
        csuff = ffusc[::-1]
    except:
        sys.stderr.write('error finding pref- and suffix')
        cpref = None
        csuff = None

    try:
        wrd1minpref = wrd1.removeprefix(cpref)
        wrd2minpref = wrd2.removeprefix(cpref)
        wrd1centre = wrd1minpref.removesuffix(csuff)
        wrd2centre = wrd2minpref.removesuffix(csuff)
    except:
        sys.stderr.write('error finding centre 1 and 2')
        wrd1centre = None
        wrd2centre = None

    return cpref, wrd1centre, wrd2centre, csuff


P12, I1, I2, S12 = getPrefInfSuf('[перепливи]', '[перелови]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[розгубився]', '[розгубивсь]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[вловив]', '[зловив]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[переходити]', '[перешкодити]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[переходити]', '[перешкоджати]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[ходити]', '[перешкоджати]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)




P12, I1, I2, S12 = getPrefInfSuf('[մերճակա]', '[մերձակա]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[առջն]', '[առջև]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[թեթնություն]', '[թեթևություն]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[Եթենա]', '[եթե նա]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[ննա]', '[նա]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)

P12, I1, I2, S12 = getPrefInfSuf('[ճեռքերն]', '[ձեռքից]')
print(P12, ' | ', I1, ' > ', I2, ' | ', S12)




[пере  |  пли  >  ло  |  ви]
[розгубивс  |  я  >  ь  |  ]
[  |  в  >  з  |  ловив]
[пере  |  х  >  шк  |  одити]
[пере  |  ходи  >  шкоджа  |  ти]
[  |  ходи  >  перешкоджа  |  ти]
[մեր  |  ճ  >  ձ  |  ակա]
[առջ  |  ն  >  և  |  ]
[թեթ  |  ն  >  և  |  ություն]
[  |  Եթե  >  եթե   |  նա]
[ն  |    >  ա]  |  նա]
[  |  ճեռքերն  >  ձեռքից  |  ]


In [58]:
def createSufList(a):
    ''' odyty] --> odyty], odyty, odyt, ody, od, o, "" '''
    LSuf = []
    for j in range(1,len(a)+1):
        prefix = a[:j]
        LSuf.append(prefix)
        # print(j, len(prefix), LSuf)
    LSuf.insert(0, '')
    return LSuf

createSufList('odyty]')

['', 'o', 'od', 'ody', 'odyt', 'odyty', 'odyty]']

In [59]:
createSufList('ություն]')

['', 'ո', 'ու', 'ութ', 'ությ', 'ությո', 'ությու', 'ություն', 'ություն]']

In [60]:
def createPrefList(s):
    LPref = [s[-i:] for i in range(1, len(s) + 1)]
    LPref.insert(0, '')
    # print(LPref)
    return LPref

createPrefList('[pere')

['', 'e', 're', 'ere', 'pere', '[pere']

In [61]:
createPrefList('[թեթ')

['', 'թ', 'եթ', 'թեթ', '[թեթ']

### we create the list of potential rewrite rules


In [85]:
def twoWords2listOfRules(W1, W2):
    DTRules = {}
    DTRulesLen = {}
    LTRules = []
    P12, I1, I2, S12 = getPrefInfSuf(W1, W2)
    LPref12 = createPrefList(P12)
    LSuf12 = createSufList(S12)

    for pref in LPref12:
        for suf in LSuf12:
            SlhsRule = pref + I1 + suf
            SrhsRule = pref + I2 + suf
            TRules = (SlhsRule, SrhsRule)
            LTRules.append(TRules)
            try:
                DTRules[TRules] += 1
            except:
                DTRules[TRules] = 1

            try:
                DTRulesLen[TRules] = len(SlhsRule)
            except:
                continue
    return LTRules, DTRules, DTRulesLen, P12, I1, I2, S12

def printFrqTDict(DTFrq, FOut = None):
    for key, val in sorted(DTFrq.items(), key=lambda x:x[1], reverse=True):
        LHS, RHS = key
        if FOut:
            FOut.write(f'{LHS}, {RHS}, {str(val)}\n')
        else:
            print(LHS, RHS, str(val))
    if FOut:
        FOut.flush()

def printTList(LTuples, FOut = None):
    for el in LTuples:
        LHS, RHS = el
        if FOut:
            FOut.write(f'{LHS}, {RHS}\n')
        else:
            print(LHS, RHS)
    if FOut:
        FOut.flush()



In [86]:
LTRules, DTRules, DTRulesLen, P12, I1, I2, S12 = twoWords2listOfRules('[перепливи]', '[перелови]')
# FOut = open('printTRules0-test-uk.txt', 'w')
# printFrqTDict(DTRules, FOut)
FOut0 = open('printTRulesLen-test-uk.txt', 'w')
FOut0.write(f'{P12}|{I1}>{I2}|{S12}\n')
printFrqTDict(DTRulesLen, FOut0)
FOut1 = open('printTRules-test-uk.txt', 'w')
printTList(LTRules, FOut1)

In [83]:
LTRules, DTRules, DTRulesLen, P12, I1, I2, S12 = twoWords2listOfRules('[թեթնություն]', '[թեթևություն]')
FOut0 = open('printTRulesLen-test-hy.txt', 'w')
FOut0.write(f'{P12}|{I1}>{I2}|{S12}\n')
printFrqTDict(DTRulesLen, FOut0)



In [None]:
# ... todo: check if we need this function (possibly -- substring)
def findLongestMatch(SInput, DTRulesLength):
    SOutput = None
    for key, val in sorted(DTRulesLength.items(), key=lambda x:x[1], reverse=True):
        LHS, RHS = key
        if LHS in SInput:
            SOutput = SInput.replace(LHS, RHS, 1)
            break
    return SOutput

SOutput1 = findLongestMatch('[перепливи]', DTRulesLen)
if SOutput1: print(SOutput1)

SOutput1 = findLongestMatch('[перешкоджав]', DTRulesLen)
if SOutput1: print(SOutput1)

SOutput1 = findLongestMatch('[проходжав]', DTRulesLen)
if SOutput1: print(SOutput1)


[перелови]


Algorithm:
- Further we create a common dictionary for all the rewrite strings, we record their frequencies, then we resort them by their length...

## File with corrections

In [None]:
# Dowloading the file with corrections
# !wget https://heibox.uni-heidelberg.de/f/14706c04a4024b2f937d/?dl=1
# without ճեր
!wget https://heibox.uni-heidelberg.de/f/4a24540473564788853d/?dl=1

!mv index.html?dl=1 Pilot-Corrections-all.tsv


In [63]:
!wc Pilot-Corrections-all.tsv

  324  2854 26730 Pilot-Corrections-all.tsv


In [75]:
# critical path function
def readCorrectionsFrq(colNumberOri, colNumberCorrect, colNumberFrq, SFIn, SFOut = None):
    LTWrongCorrect = []
    '''
    if type(LTWrongCorrect) == list:
        pass
    '''

    DWrongCorrect = {}
    FOut = open(SFOut, 'w')
    with open(SFIn, 'r') as FIn:
        count = 0
        for SLine in FIn:
            count += 1
            if count == 1: continue
            SLine = SLine.rstrip('\n')
            LLine = SLine.split('\t')
            SWrong = LLine[colNumberOri]
            SCorrect = LLine[colNumberCorrect]
            SFrq = LLine[colNumberFrq]
            if SWrong != '' and SCorrect != '' and SWrong != SCorrect:
                # TWrongCorrect = (f'[{SWrong}]', f'[{SCorrect}]', f'{SFrq}')
                TWrongCorrect = (f'[{SWrong}]', f'[{SCorrect}]', int(SFrq))
                LTWrongCorrect.append(TWrongCorrect)
                if SWrong in DWrongCorrect.keys():
                    SCorrect1 = DWrongCorrect[SWrong]
                    if SCorrect1 != SCorrect:
                        print(SWrong + '\t' + SCorrect1 + '\t' + SCorrect)
                DWrongCorrect[SWrong] = SCorrect
    if SFOut:
        for SWrong, SCorrect, SFrq in LTWrongCorrect:
            FOut.write(f'{SWrong}\t{SCorrect}\t{SFrq}\n')
        FOut.flush()
    print(len(DWrongCorrect))

    return LTWrongCorrect, DWrongCorrect


In [76]:
# reading corrections for word forms, with the purpose of generalizing them
# goal: to display candidates for correction -- based on existing corrections (?)
LTWrongCorrectWordF, DWrongCorrectWordF = readCorrectionsFrq(1, 4, 9, '/content/Pilot-Corrections-all.tsv', SFOut = 'Pilot-Corrections-all-WordForm.tsv')
# LTWrongCorrectLemmaF, DWrongCorrectLemmaF = readCorrectionsFrq(3, 6, 9, '/content/Pilot-Corrections-all.tsv', SFOut = 'Pilot-Corrections-all-Lemma.tsv')
print(LTWrongCorrectWordF)
# print(LTWrongCorrectLemmaF)
# ինձ|ինչ
# առջև|առջևից
# ինչ|ինձ
# ինչ|ես
# գիտենալ|իմանալ


ինճ	ինձ	ինչ
առջնից	առջև	առջևից
ինճ	ինչ	ինձ
171
[('[մերճակա]', '[մերձակա]', 4), ('[առջն]', '[առջև]', 4), ('[թեթնություն]', '[թեթևություն]', 4), ('[Եթենա]', '[եթե նա]', 4), ('[ննա]', '[նա]', 4), ('[ճեռքերն]', '[ձեռքից]', 4), ('[ճայն]', '[ձայն]', 4), ('[ճեռքով]', '[ձեռքով]', 4), ('[այլնս]', '[այլևս]', 4), ('[ետնի]', '[ետևի]', 4), ('[կեղնները]', '[կեղևները]', 4), ('[ճկան]', '[ձկան]', 4), ('[ննա]', '[նա]', 4), ('[բանաձնի]', '[բանաձևի]', 4), ('[առնտրական]', '[առևտրական]', 4), ('[արնելյան]', '[արևելյան]', 4), ('[կուղնորվի]', '[կուղևորվի]', 4), ('[նս]', '[ևս]', 4), ('[հետնեց]', '[հետևել]', 4), ('[նուխիսկ]', '[նույնիսկ]', 4), ('[երնույթ]', '[երևույթ]', 4), ('[քրտնքով]', '[քրտինքով]', 4), ('[արվարճանում]', '[արվարձանում]', 4), ('[անճամբ]', '[անձամբ]', 4), ('[ճայնը]', '[ձայնը]', 4), ('[ճգվում]', '[ձգվում]', 4), ('[ճիու]', '[ձիու]', 4), ('[դարճնում]', '[դարձնում]', 4), ('[ուղնորվում]', '[ուղևորվում ]', 4), ('[իջնանատան]', '[իջևանատան]', 4), ('[ճգտում]', '[ձգտում]', 4), ('[դրսնորում]', '[դրսևորում]

In [77]:
print(len(DWrongCorrectWordF))
# print(len(DWrongCorrectLemmaF))


171


In [None]:
for key, value in sorted(DWrongCorrectWordF.items()):
    print(f'{key}\t{value}')

In [None]:
for SWrong, SCorrect, Frq in sorted(LTWrongCorrectWordF, key=lambda x:x[2], reverse=True):
    print(SWrong, SCorrect, Frq)



In [87]:
print(len(LTWrongCorrectWordF))

192


## Extracting a common dictionary of rewrite rules
- from file with corrections

In [94]:
def extractRulesFromLTCorrections(LTWrongCorrect, FOut = None, FOutLen = None):
    # the common dictionary of rewrite ruels;
    '''
      structure:
        DRewriteRules[ (LHS, RHS) ] = Frq_CountSame
      dictionary for full information
        DRewriteRulesInfo []

    '''
    DRewriteRules = {} # production dictionary
    DRewriteRulesLen = {} # sorted by the length

    # DRewriteRulesInfo = {}  # full information associated with dictionary entries (in case we need them)

    for SWrong, SCorrect, Frq in sorted(LTWrongCorrectWordF, key=lambda x:x[2], reverse=True):
        LTRules, DTRules, DTRulesLen, P12, I1, I2, S12 = twoWords2listOfRules(SWrong, SCorrect)
        for el in LTRules:
            try:
                LHS, RHS = el
                if LHS == RHS or LHS == '' or RHS == '': continue

                # dictionary sorted by the length
                DRewriteRulesLen[el] = len(LHS)

                try:
                    DRewriteRules[el] += 1
                except:
                    DRewriteRules[el] = 1
            except:
                sys.stderr.write(f'{el} - cannot be recognised!\n')

    if FOut:
        printFrqTDict(DRewriteRules, FOut)

    if FOutLen:
        printFrqTDict(DRewriteRulesLen, FOutLen)

    return DRewriteRules, DRewriteRulesLen




In [95]:
FOutDRewriteRules = open('printTRulesAllFrq-hy.txt', 'w')
FOutDRewriteRulesLen = open('printTRulesAllLen-hy.txt', 'w')

DRewriteRules, DRewriteRulesLen = extractRulesFromLTCorrections(LTWrongCorrectWordF, FOut = FOutDRewriteRules, FOutLen = FOutDRewriteRulesLen)

print(len(DRewriteRules))
print(len(DRewriteRulesLen))


2725
2725


## Comparing dictionaries
- collecting a frq list of potential OCR errors: words in corpus which are not in Wikipedia

In [105]:
def dictDifference(D1, D2, FOut1 = None, FOut2 = None, FOut12 = None):
    DinD1 = {}
    DinD2 = {}
    DCommon = {}


    for key, frqD1 in D1.items():
        if key in D2.keys():
            frqD2 = D2[key]
            TFrqs = (frqD1, frqD2)
            DCommon[key] = TFrqs
        else:
            DinD1[key] = frqD1

    for key, frqD2 in D2.items():
        if key in D1.keys():
            frqD1 = D1[key]
            TFrqs = (frqD1, frqD2)
            DCommon[key] = TFrqs
        else:
            DinD2[key] = frqD2

    if FOut1:
        printFrqDictinary(DinD1, FOut1)
    if FOut2:
        printFrqDictinary(DinD2, FOut2)
    if FOut12:
        printFrqDictinary(DCommon, FOut12)

    return DinD1, DinD2, DCommon



In [109]:
print(len(DHyF))
print(len(DHyN))
print(len(DHyR))
print(len(DWiki))

20748
21272
10849
2071984


In [110]:
FD1WF = open('dictFD1.txt', 'w')
FD2WF = open('dictFD2.txt', 'w')
FDCommonWF = open('dictFDCommon.txt', 'w')
DinD1WF, DinD2WF, DCommonWF = dictDifference(DWiki, DHyF, FOut1 = FD1WF, FOut2 = FD2WF, FOut12 = FDCommonWF)
print(len(DinD1WF))
print(len(DinD2WF))
print(len(DCommonWF))


2055434
4198
16550


In [111]:
FD1WN = open('dictND1.txt', 'w')
FD2WN = open('dictND2.txt', 'w')
FDCommonWN = open('dictNDCommon.txt', 'w')
DinD1WN, DinD2WN, DCommonWN = dictDifference(DWiki, DHyN, FOut1 = FD1WN, FOut2 = FD2WN, FOut12 = FDCommonWN)
print(len(DinD1WN))
print(len(DinD2WN))
print(len(DCommonWN))


2059280
8568
12704


In [112]:
FD1WR = open('dictRD1.txt', 'w')
FD2WR = open('dictRD2.txt', 'w')
FDCommonWR = open('dictRDCommon.txt', 'w')
DinD1WR, DinD2WR, DCommonWR = dictDifference(DWiki, DHyR, FOut1 = FD1WR, FOut2 = FD2WR, FOut12 = FDCommonWR)
print(len(DinD1WR))
print(len(DinD2WR))
print(len(DCommonWR))

2062989
1854
8995


## Attempting to correct non-found items
Algorithm:
- for each non-found item:
  - go over each rule
  - find if it has a substring among LHS rules (one or more)
  - apply that, check if the correction exists in wikipedia

- save all corrections, from longest to shortest



In [113]:
print(len(DRewriteRulesLen))

2725


In [126]:
def applyRule2String(TRule, SInput):
    '''
    applies one rule (tuple: left-right hand side) to a string,
      returns all possible candidates which result from rewriting (later to be checked)
    '''
    LOutput = []

    try:
        LHS, RHS = TRule
    except:
        sys.stderr.write(f'{TRule} RuleNotRecognized\n')
        return None
    try:
        EscLHS = re.escape(LHS)
        # print(EscLHS)
        RLHS = re.compile(EscLHS)
    except:
        sys.stderr.write(f'{TRule} Compilation failed\n')
        return None


    if LHS in SInput:
        SNew = re.sub(RLHS, RHS, SInput)
        LOutput.append(SNew)
        SNew1 = re.sub(RLHS, RHS, SInput, count=1)
        LOutput.append(SNew1)
        SNew2 = re.sub(RLHS, RHS, SInput, count=2)
        LOutput.append(SNew2)

        EOutputUniq = set(LOutput)
        LOutputUniq = list(EOutputUniq)
    else:
        LOutputUniq = []

    return LOutputUniq

LOutputUniq = applyRule2String(('пли', 'ло'), '[перепливи]')
print(LOutputUniq)

LOutputUniq = applyRule2String(('[առանճ', '[առանձ'), '[առանճնահատուկ]')
print(LOutputUniq)





['[перелови]']
['[առանձնահատուկ]']


In [125]:
def applyAllRules2Word(SInput, DRewriteRulesLen, DWiki):
    '''
    apply all existing rules to a given word,
      check them in wikipedia
      save uniq rewritings
    '''
    LCandidates = []
    SInputBR = f'[{SInput}]' # adding brackets to match brackets in rules
    for TRule, Len in sorted(DRewriteRulesLen.items(), key=lambda item: item[1], reverse=True ): # for each rule, sorted by length
        LOutputUniq = applyRule2String(TRule, SInputBR)
        for SOutputWordBR in LOutputUniq:
            SOutputWord = SOutputWordBR.strip('[]')
            if SOutputWord in DWiki.keys():
                TOutput = (SOutputWord,TRule,Len)
                LCandidates.append(TOutput)

    return LCandidates

print(len(DWiki))
print(len(DRewriteRulesLen))

LCandidates =  applyAllRules2Word('առանճնահատուկ', DRewriteRulesLen, DWiki)
print(LCandidates)

2071984
2725
[('առանձնահատուկ', ('[առանճ', '[առանձ'), 6), ('առանձնահատուկ', ('առանճ', 'առանձ'), 5), ('առանձնահատուկ', ('անճնա', 'անձնա'), 5), ('առանձնահատուկ', ('ռանճ', 'ռանձ'), 4), ('առանձնահատուկ', ('նճնա', 'նձնա'), 4), ('առանձնահատուկ', ('անճն', 'անձն'), 4), ('առանձնահատուկ', ('անճ', 'անձ'), 3), ('առանձնահատուկ', ('ճնա', 'ձնա'), 3), ('առանձնահատուկ', ('նճն', 'նձն'), 3), ('առանձնահատուկ', ('նճ', 'նձ'), 2), ('առանձնահատուկ', ('ճն', 'ձն'), 2), ('առանձնահատուկ', ('ճ', 'ձ'), 1)]


In [128]:
def applyAllrules3ListUniq(LCandidates):
    LUniqCandidates = []
    DUniqCandidates = {}

    for TCandRuleFreq in LCandidates:
        try:
            SCand, TLHSRHS, IFrq = TCandRuleFreq
            TOut = TLHSRHS, IFrq
            # LHS, RHS = TLHSRHS
        except:
            sys.stderr.write(f'{TCandRuleFreq} - tuple not recognized\n')
            TOut = None

        try:
            TValues = DUniqCandidates[SCand]


        except:
            TValues = []

        TValues.append(TOut)
        DUniqCandidates[SCand] = TValues

    for SCand, TVal in sorted(DUniqCandidates.items(), key=lambda item: item[1][1], reverse=True ):
        LUniqCandidates.append((SCand, TVal))



    return(LUniqCandidates)

LUniqCandidates = applyAllrules3ListUniq(LCandidates)
print(LUniqCandidates)
for el in LUniqCandidates:
    print(el)


[('առանձնահատուկ', [(('[առանճ', '[առանձ'), 6), (('առանճ', 'առանձ'), 5), (('անճնա', 'անձնա'), 5), (('ռանճ', 'ռանձ'), 4), (('նճնա', 'նձնա'), 4), (('անճն', 'անձն'), 4), (('անճ', 'անձ'), 3), (('ճնա', 'ձնա'), 3), (('նճն', 'նձն'), 3), (('նճ', 'նձ'), 2), (('ճն', 'ձն'), 2), (('ճ', 'ձ'), 1)])]
('առանձնահատուկ', [(('[առանճ', '[առանձ'), 6), (('առանճ', 'առանձ'), 5), (('անճնա', 'անձնա'), 5), (('ռանճ', 'ռանձ'), 4), (('նճնա', 'նձնա'), 4), (('անճն', 'անձն'), 4), (('անճ', 'անձ'), 3), (('ճնա', 'ձնա'), 3), (('նճն', 'նձն'), 3), (('նճ', 'նձ'), 2), (('ճն', 'ձն'), 2), (('ճ', 'ձ'), 1)])


In [None]:
# apply corrections to all items in the list not in wikipedia


def applyCorrectionRules(DinD2, DRewriteRulesLen, DWiki):

    for key, val in DinD2.items(): # for each word that is not found in Wikipedia
        LPossibleCandidates = []
        SWord = f'[{key}]' # adding being/end symbols
        for TRule, Len in sorted(DRewriteRulesLen.items(), key=lambda item: item[1], reverse=True ):
            LOutputUniq = applyRule2String(TRule, SWord)
            for STestWord in LOutputUniq:
                STestWord0 = STestWord.strip('][')
                if STestWord0 in DWiki.keys():
                    LPossibleCandidates.append(STestWord0)





    return