In [1]:
import os
import cv2
import pytesseract
from pytesseract import Output
import json
from fuzzywuzzy import fuzz 
import numpy as np
from sklearn.linear_model import LinearRegression

import asyncio
import requests
from concurrent.futures import ThreadPoolExecutor
import json
import pandas as pd
import os
import sys

### Getting args from command line

In [2]:
# This is used when running as a script in a parallel cluster
def GetArgs():
    """Read the working directory and the mesa range from the command line.

    ``sys.argv[1]`` is the directory to change into; ``sys.argv[2]`` and
    ``sys.argv[3]`` are parsed as integers.

    Returns:
        tuple: ``(start, end)`` as integers.
    """
    argv = sys.argv
    print('Number of arguments:', len(argv), 'arguments.')
    print('Argument List:', str(argv))
    # Change directory first so the relative paths used elsewhere in the
    # file resolve against the directory given on the command line.
    os.chdir(argv[1])
    print("current directory is : " + os.getcwd())
    first = int(argv[2])
    last = int(argv[3])
    return first, last

### Open JSON datafile

In [3]:
def AbrirDatosMesa(fname):
    """Parse the JSON file at ``fname`` and return the resulting object."""
    with open(fname) as handle:
        return json.load(handle)

### Build the filename of a mesa's JSON file and load it

In [4]:
def GetDatosMesa(mesa,path="./data/"):
    """Load the JSON data of one mesa.

    The file name is ``mesa_`` followed by the mesa number zero-padded to
    six digits and the ``.json`` extension; it is appended directly to
    ``path`` (so ``path`` must end with a separator).
    """
    json_name = "mesa_{0:06d}.json".format(mesa)
    return AbrirDatosMesa(path + json_name)

### Use Google's Tesseract OCR engine to extract readable text

In [5]:
def ExtractData(img):
    """Run Tesseract on ``img`` and return ``image_to_data`` output as a dict.

    A character whitelist made of the upper- and lower-case ASCII letters
    is passed to Tesseract through the ``config`` string.
    """
    letters_only = (
        "-c tessedit_char_whitelist="
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
    )
    return pytesseract.image_to_data(img, output_type=Output.DICT, config=letters_only)

### Extract data from JSON file 

In [6]:
def ExtractDataActa(acta,data):
    """Collect the words that have to be masked on one acta.

    Reads the entries of ``data['TE'][acta-1]['VOTOSTE' + str(acta-1)]`` and
    uses the ``'D'`` text of each entry.

    Args:
        acta: 1-based acta number.
        data: parsed mesa JSON. It is only read, never modified, so the
            function can be called repeatedly on the same object.

    Returns:
        tuple: ``(masked_words, wlenbycode, extra_masks)`` where

        * ``masked_words`` holds the ``'D'`` text of the first 24 entries
          with every non-alphabetic character removed,
        * ``wlenbycode`` holds the length of the original (unstripped)
          ``'D'`` text of those same entries,
        * ``extra_masks`` holds the whitespace-separated words of the
          ``'D'`` text of every entry after the 24th.
    """
    rows=data['TE'][acta-1]['VOTOSTE'+str(acta-1)]
    masked_words=[] # party names in the first column
    wlenbycode=[] # length of those party names (as written on the acta)
    extra_masks=[] # parties in the second column of the acta
    maxA=24-1 # index of the last entry that belongs to the first column
    for idx,row in enumerate(rows):
        label=row['D']
        if idx>maxA:
            extra_masks+=label.split()
        else:
            wlenbycode.append(len(label))
            # Keep the cleaned name local. Writing it back into ``data``
            # would make a later call on the same data measure the already
            # stripped text and return different lengths.
            masked_words.append(''.join(ch for ch in label if ch.isalpha()))
    return masked_words,wlenbycode,extra_masks

### Fuzzy (non strict) matching

In [7]:
def FuzzyMatch(word,matching_words):
    """Find the first candidate whose ``fuzz.ratio`` with ``word`` exceeds 95.

    Returns:
        tuple: ``(True, candidate)`` for the first candidate above the
        threshold, otherwise ``(False, -1)``.
    """
    threshold = 95
    # Lazy generator: candidates are scored one at a time, in order.
    hits = (cand for cand in matching_words if fuzz.ratio(word, cand) > threshold)
    for hit in hits:
        return True,hit
    return False,-1

### Mask information in the acta so that as little information as possible is leaked

In [8]:
def MaskWords(img,d,masked_words,wlenbycode,extra_masks):
    """Paint over recognised words on ``img`` and label the party rows.

    The function draws directly on ``img`` and returns that same object.

    Steps:
      1. Draw an "A" label, a "<n> rows" label and a filled black bar at
         fixed positions.
      2. For every OCR entry longer than one character, normalise its text
         ('0' -> 'O', letters only, upper case) and fill its box when it
         matches a word in ``masked_words`` (exactly or via FuzzyMatch) or a
         word in the extra mask list (via FuzzyMatch).
      3. If at least one row of ``masked_words`` was not located but more
         than one was, fit two linear regressions (row index -> top, row
         index -> left) on the located rows and use them to draw a cover
         rectangle and an "A<k>" label for every row.

    Args:
        img: image to draw on (modified in place).
        d: OCR dictionary as returned by ExtractData; its ``'text'`` entries
            are replaced by their normalised form.
        masked_words: names to mask; the list index is used as the row code.
        wlenbycode: per-row name length, used to size the cover rectangle.
        extra_masks: additional words to black out.

    Returns:
        ``img`` with the masks and labels drawn.
    """
    emasks=['TOTAL','VOTOS','PAPELETAS','RECIBIDAS','SIGLAS','VALIDOS','NULOS','BLANCO','VALIDAMENTE','EMITIDOS','OBSERVACIONES','CANTIDAD','AGREGAN']
    emasks+=extra_masks
    # row code -> (left, top, width, height) of the leftmost hit for that row
    party_codes={}
    p1d=(450, 100)

    bar=(750,0)
    barWH=(200,130)
    color1=(0, 255, 0) # exact match / estimated row cover
    color2=(255,0,0)   # fuzzy match against masked_words
    color3=(0,0,0)     # fuzzy match against the extra mask words
    cv2.putText(img, "A", (p1d[0], p1d[1]),cv2.FONT_HERSHEY_SIMPLEX, 4, (255, 0, 0), 3)
    cv2.putText(img, "{} rows".format(len(masked_words)), (10, p1d[1]),cv2.FONT_HERSHEY_SIMPLEX, 1.5, (255, 0, 0), 3)
    cv2.rectangle(img, (bar[0],bar[1]), (bar[0] + barWH[0], bar[1] + barWH[1]), color3, -1)

    def remember_leftmost(code, idx):
        # Keep only the leftmost box seen so far for each row code.
        box=(d['left'][idx], d['top'][idx], d['width'][idx], d['height'][idx])
        if code not in party_codes or box[0]<party_codes[code][0]:
            party_codes[code]=box

    n_boxes = len(d['level'])
    for i in range(n_boxes):
        maskw=0 # 0: leave alone, 1: exact, 2: fuzzy party name, 3: extra mask word
        if len(d['text'][i])>1:
            text=d['text'][i].replace('0','O')
            text=''.join(ch for ch in text if ch.isalpha()).upper()
            d['text'][i]=text
            if text in masked_words:
                # Text found verbatim in the list.
                maskw=1
                color=color1
                remember_leftmost(masked_words.index(text), i)
            else:
                # Otherwise fall back to fuzzy matching.
                match=FuzzyMatch(text,masked_words)
                if match[0]:
                    maskw=2
                    color=color2
                    mw=match[1]
                    remember_leftmost(masked_words.index(mw), i)
                # An extra-mask hit takes precedence over a fuzzy party hit.
                match=FuzzyMatch(text,emasks)
                if match[0]:
                    mw=match[1]
                    maskw=3
                    color=color3

        if maskw:
            (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
            if maskw==2 or maskw==3:
                # For fuzzy hits the box width is derived from the matched
                # word (14 px per character) instead of the OCR box width.
                w=len(mw)*14
            cv2.rectangle(img, (x, y), (x + w, y + h), color, -1)

    add_extra=any(code not in party_codes for code in range(len(masked_words)))
    if add_extra and len(party_codes)>1:
        codes=np.array(list(party_codes.keys())).reshape(-1, 1)
        tops=np.array([box[1] for box in party_codes.values()]).reshape(-1, 1)
        lefts=np.array([box[0] for box in party_codes.values()]).reshape(-1, 1)
        regYC = LinearRegression().fit(codes,tops)
        regXC = LinearRegression().fit(codes,lefts)
        for row in range(len(masked_words)):
            xa=np.array(row).reshape(-1, 1)
            # ndarray.item() replaces np.asscalar, which was removed from
            # NumPy; both turn the single-element prediction into a scalar.
            yc=int(regYC.predict(xa).item())
            xc=int(regXC.predict(xa).item())

            covl=wlenbycode[row]*14

            cv2.rectangle(img, (0, yc), (xc+covl,yc+18), color1, -1)
            cv2.putText(img, 'A'+str(row+1), (xc, yc+20),cv2.FONT_HERSHEY_SIMPLEX, 1.2, (255, 0, 0), 3)
    return img

### Helper function to draw the bounding boxes listed in a Tesseract OCR dictionary

In [9]:
def DrawBoxes(img,d):
    """Return a copy of ``img`` with one outlined rectangle per OCR entry.

    ``d`` is an OCR dictionary with parallel ``'level'``, ``'left'``,
    ``'top'``, ``'width'`` and ``'height'`` lists; ``img`` itself is left
    untouched.
    """
    canvas = img.copy()
    outline = (0, 255, 0)
    for idx in range(len(d['level'])):
        left, top = d['left'][idx], d['top'][idx]
        right = left + d['width'][idx]
        bottom = top + d['height'][idx]
        cv2.rectangle(canvas, (left, top), (right, bottom), outline, 2)
    return canvas

### Get the area of interest based on heuristic anchor words

In [10]:
def GetBoundingBox(d,acta,masked_words):
    """Locate the crop rectangle of an acta from two anchor words.

    The top edge comes from the first exact occurrence of 'PAPELETAS' and
    the bottom-right corner from the last exact occurrence of 'VALIDAMENTE'
    (plus a fixed padding). Missing anchors fall back to fixed default
    corners, which depend on whether ``acta`` is 1.

    Args:
        d: OCR dictionary as returned by ExtractData.
        acta: acta number; only ``acta == 1`` changes the default top edge.
        masked_words: words used for the fallback sanity check.

    Returns:
        tuple: ``(p1, p2, problem)`` with the top-left corner, the
        bottom-right corner and a flag that is True when neither anchor was
        found and no OCR text fuzzy-matches any of ``masked_words``.
    """
    anchor_top='PAPELETAS'
    anchor_bottom='VALIDAMENTE'
    pad_bottom=(0,1200)
    pad_top=(0,0)
    default_p1=(0, 600) if acta == 1 else (0, 320)
    default_p2=(940, 2100)
    expected_w=default_p2[0]-default_p1[0]
    expected_h=default_p2[1]-default_p1[1]
    min_ratio=0.9
    fail_ratio=0.7

    def too_small(a, b, ratio):
        # True when the rectangle a-b is narrower or shorter than the given
        # fraction of the default rectangle.
        return b[0]-a[0]<expected_w*ratio or b[1]-a[1]<expected_h*ratio

    top_idx=GetIndex(d,anchor_top,True)
    bottom_idx=GetIndex(d,anchor_bottom,False)

    problem=False
    if top_idx==-1 and bottom_idx==-1:
        # No anchor at all: the page is only trusted if at least one OCR
        # token looks like one of the expected names.
        problem=not any(FuzzyMatch(token,masked_words)[0] for token in d['text'])

    if top_idx==-1:
        p1=default_p1
    else:
        p1=(0, d['top'][top_idx]+pad_top[1])

    if bottom_idx==-1:
        p2=default_p2
    else:
        p2=(d['left'][bottom_idx]+d['width'][bottom_idx]+pad_bottom[0],
            d['top'][bottom_idx]+d['height'][bottom_idx]+pad_bottom[1])

    if too_small(p1,p2,min_ratio):
        print("EXPANDING AREA")
        p1,p2=default_p1,default_p2
        # NOTE(review): this check runs after the corners were reset to the
        # defaults, so the measured size equals the expected size and the
        # branch below cannot trigger. Confirm whether the detected area was
        # meant to be tested before the reset.
        if too_small(p1,p2,fail_ratio):
            print("AREA TOO SMALL")
            problem=True

    return (p1,p2,problem)

### Helper function to find the index of a word in a list

In [11]:
def GetIndex(d,word,first=True):
    """Return the position of ``word`` in ``d['text']`` or -1 if absent.

    With ``first=True`` the first occurrence is returned, otherwise the
    last one. Only exact matches count.
    """
    tokens = d['text']
    if word not in tokens:
        return -1
    if first:
        return tokens.index(word)
    # Search the reversed list and convert the offset back to a forward index.
    offset_from_end = tokens[::-1].index(word)
    return len(tokens) - 1 - offset_from_end

### Process every Acta for each Mesa

In [14]:
def ProcessFile(mesa):
    """Crop and mask the acta images (1 to 5) that belong to one mesa.

    For every acta image found under ``data/`` the function runs OCR, finds
    the crop rectangle with GetBoundingBox, retries on a 180-degree rotated
    copy when the first attempt is flagged as a problem, masks the crop with
    MaskWords and writes the result to ``./data_mturk_pres/``. Images that
    are still flagged after the rotation are copied to ``./problems/``;
    images fixed by the rotation are saved there with a ``FIXED_`` prefix.

    Args:
        mesa: integer mesa number; used to build the zero-padded file names.

    Returns:
        ``(True, -1)`` when the mesa JSON file does not exist, otherwise
        ``None``.
    """
    print("Mesa",mesa)
    path="data/"
    out_path='./data_mturk_pres/'
    problems_path="./problems/"
    mfname="mesa_"+'{0:06d}'.format(mesa)+'.json'

    if not os.path.isfile(path+mfname):
        print("Datos ",mfname," NO EXISTE")
        return (True,-1)

    # cv2.imwrite does not create missing directories, so make sure both
    # output folders exist before any image is written. exist_ok keeps this
    # safe when several worker threads get here at the same time.
    for folder in (out_path,problems_path):
        os.makedirs(folder,exist_ok=True)

    mdata=GetDatosMesa(mesa)
    for acta in range(1,6):
        fname='{0:06d}'.format(mesa*10+acta)+'.jpg'
        fname2='problem_{0:06d}'.format(mesa*10+acta)+'.jpg'

        if not os.path.isfile(path+fname):
            print("ACTA ",fname," NO EXISTE")
            continue

        masked_words,wlenbycode,extra_masks=ExtractDataActa(acta,mdata)

        image = cv2.imread(path+fname)
        if image is None:
            # cv2.imread signals an unreadable or corrupt file by returning
            # None; skip it instead of failing inside the OCR call and
            # losing the remaining actas of this mesa.
            print("ACTA ",fname," NO SE PUDO LEER")
            continue

        # Extract data for bounding box
        d=ExtractData(image)

        (p1,p2,problem)=GetBoundingBox(d,acta,masked_words)

        if problem:
            # The scan may be upside down: rotate 180 degrees around the
            # image centre and try again.
            (h, w) = image.shape[:2]
            center = (w / 2, h / 2)
            M = cv2.getRotationMatrix2D(center, 180, 1.0)
            rotated180 = cv2.warpAffine(image.copy(), M, (w, h))

            d=ExtractData(rotated180)
            (p1n,p2n,problem)=GetBoundingBox(d,acta,masked_words)

            if problem:
                # Still unresolved: keep a copy for inspection and carry on
                # with the unrotated image and the first rectangle.
                cv2.imwrite(problems_path+fname2,image)
            else:
                p1=p1n
                p2=p2n
                print("ROTATION PROBLEM ",fname2)
                image=rotated180
                cv2.imwrite(problems_path+"FIXED_"+fname2,image)

        crop_img = image[p1[1]:p2[1], p1[0]:p2[0]]

        # Extract data for masking
        d=ExtractData(crop_img)

        crop_img=MaskWords(crop_img,d,masked_words,wlenbycode,extra_masks)

        cv2.imwrite(out_path+fname,crop_img)
    return

### Task scheduler

In [None]:
async def get_data_asynchronous(path="data/",startFromZero=True):
    """Run ProcessFile for a set of mesas on a pool of ten worker threads.

    Args:
        path: accepted but not used by the body.
        startFromZero: when True, process every mesa from the module-level
            ``start`` to ``end`` (inclusive). When False, read
            'data/results.csv' and process only the rows whose ``datos``
            equals False or whose ``actas`` is below 5.

    Returns:
        None, once every scheduled ProcessFile call has finished.
    """
    if startFromZero:
        # ``start`` and ``end`` are module-level names set by the run cell.
        mesas=range(start,end+1)
    else:
        results=pd.read_csv('data/results.csv')
        mesas=[
            int(row['mesa'])
            for _, row in results.iterrows()
            if row['datos']==False or row['actas']<5
        ]

    with ThreadPoolExecutor(max_workers=10) as executor:
        loop = asyncio.get_event_loop()
        tasks=[]
        for mesa in mesas:
            tasks.append(loop.run_in_executor(executor, ProcessFile, mesa))
        await asyncio.gather(*tasks)

### Run

In [None]:
# Script entry point: change into the working directory given on the command
# line and read the mesa range. `start` and `end` are module-level names that
# get_data_asynchronous reads when it is called with startFromZero=True.
start,end=GetArgs()
# Wrap the coroutine in a future on the event loop and block until it finishes.
loop = asyncio.get_event_loop()
future = asyncio.ensure_future(get_data_asynchronous())
loop.run_until_complete(future)