In [18]:
import pandas as pd
from os import getcwd
import numpy as np

In [4]:
# Folder params
WORK_DIRECTORY = getcwd()
ARRAYS_PATH = WORK_DIRECTORY + '\\areas_result\\'
TMP_PATH = WORK_DIRECTORY + '\\temps\\'
LOCAL_PATH = WORK_DIRECTORY + '\\local_result\\'

In [56]:
def center_of(bounding_box):
    xs = np.array([bounding_box[0],bounding_box[2],bounding_box[4],bounding_box[6]])
    ys = np.array([bounding_box[1],bounding_box[3],bounding_box[5],bounding_box[7]])
    return int(xs.sum() / xs.size) , int(ys.sum() / ys.size)


def dni_rect_from_ce(x, y, cv2_v, img, areas_dict, lines_dict):
    w = img.shape[1]
    h = img.shape[0]

    color = (0, 0, 255)
    thicknes = 2
    fields = []
    res_dict = {}

    for key in areas_dict.keys():
        tras_vector = np.array([areas_dict[key]['tras_x'], areas_dict[key]['tras_y']])
        point = np.array([x, y]) + tras_vector
        point_2 = point + np.array([areas_dict[key]['width'], areas_dict[key]['height']])

        point = np.where(point<=0,0,point)
        point_2[0] = w if point_2[0]>= w else point_2[0]
        point_2[1] = h if point_2[1]>= h else point_2[1]

        cv2_v.rectangle(img,(point[0],point[1]),(point_2[0],point_2[1]),color,thicknes)
        res_dict[key] = (point, point_2)
        # print('{} {}'.format(point,point_2))

    for key in lines_dict.keys():
        tras_vector = np.array([lines_dict[key]['tras_x'], lines_dict[key]['tras_y']])
        point = np.array([x, y]) + tras_vector
        point_2 = point + np.array([lines_dict[key]['width'], lines_dict[key]['height']])

        point = np.where(point<=0,0,point)
        point_2[0] = w if point_2[0]>= w else point_2[0]
        point_2[1] = h if point_2[1]>= h else point_2[1]

        # cv2_v.rectangle(img,(point[0],point[1]),(point_2[0],point_2[1]),color,thicknes)
        res_dict[key] = (point, point_2)

    # return res_dict['dni'][0], res_dict['dni'][1], res_dict['tel'][0], res_dict['tel'][1]
    return res_dict


def is_in_box(center, top_left_point, bottom_right_point):
    x = center[0] > top_left_point[0] and center[0] < bottom_right_point[0]
    y = center[1] > top_left_point[1] and center[1] < bottom_right_point[1]
    return (x and y)


def get_fields_from_json(azure_json, areas_dict):
    fields = {}
    scores = {}
    j=0

    # Setting empty arrays in result
    for key in areas_dict.keys():
        fields[key] = []
        scores[key] = []

    for line in azure_json['recognitionResult']['lines']:
        j += 1
        # print('Line {}: {},{}'.format(j,line['text'],line['boundingBox']))

        # Drawing areas
        cv2.rectangle(image,(line['boundingBox'][0],line['boundingBox'][1]),(line['boundingBox'][2],line['boundingBox'][5]),(0,0,0),1)
        cv2.circle(image,center_of(line['boundingBox']), 4, (0,255,255), -1)

        # Searching in areas_dict
        c = center_of(line['boundingBox'])
        t = line['text']
        ws = []
        scos = []
        for w in line['words']:
            ws.append(w['text'])
            try:
                scos.append(w['confidence'])
            except:
                scos.append('Normal')

        for key in areas_dict.keys():
            # print('C {} está en {}?'.format(c, areas_dict[key]))
            if is_in_box(c, areas_dict[key][0], areas_dict[key][1]):
                # print('String: {} --> {}'.format(t,key))
                fields[key].append(ws)
                scores[key].append(scos)

    return fields, scores


def basic_digit_clean(string, num_eqs):
    for k in num_eqs.keys():
        string = string.replace(k, num_eqs[k])
    return string


def basic_field_clean(string):
    a = string.find(':')
    if a == len(string)-1:
        return string[:-1]
    else:
        if a >= 0:
            return string[a + 1:]
        return string


def post_num_field(l_field, l_score, banned, num_eqs={}, separator=""):
    value = ""
    score = 0.0
    ct = 0

    for i in range(len(l_field)):
        for j in range(len(l_field[i])):
            # print(j)
            if l_field[i][j] not in banned:
                ct += 1
                value += separator + l_field[i][j]
                # print('{}'.format(l_score[i][j]))
                score += eqs[l_score[i][j]]
    value = basic_digit_clean(value, num_eqs)
    return (value.strip(), score / ct) if ct > 0 else (value, 0.0)


def get_dni_from_line(string, num_eqs):
    a = string.find('DNI:')
    b = string.find('Telefono:')
    if a >= 0 and b >= 0:
        return basic_digit_clean(string[a+len('DNI:'):b],num_eqs)
    a = string.find('DNI')
    b = string.find('Telefono')
    if a >= 0 and b >= 0:
        return basic_digit_clean(string[a+len('DNI'):b],num_eqs)
    else:
        return ""


def get_tel_from_line(string, num_eqs):
    b = string.find('Telefono:')
    if b >= 0:
        return basic_digit_clean(string[b+len('Telefono:'):],num_eqs)
    b = string.find('Telefono')
    if b >= 0:
        return basic_digit_clean(string[b+len('Telefono'):],num_eqs)
    else:
        return ""


def clean_letters(word):
    ret = ""
    for c in word:
        if '0' <= c and c <= '9':
            ret += str(c)
    return ret


def has_letters(word):
    ret = True
    for c in word:
        if not '0' <= c and c <= '9':
            return not False
    return not True

def digits(word):
    for c in word:
        if not c.isdigit():
            return False
    return True

In [57]:
digits(':25595015')

False

In [50]:
has_letters(':25595015')

False

In [65]:
clean_letters(':25595015')

'25595015'

In [55]:
':'.isdigit()

False

In [7]:
# Getting the local result
local_result = pd.read_csv(LOCAL_PATH+'result.csv', dtype={'DNI':str, 'Telefono':str})

In [10]:
local_result.head(2)

Unnamed: 0,idCupon,NombreArchivo,DNI,AcertividadDNI,Telefono,AcertividadTelefono,idCampania,idUsuario,idEstado,idBatch,LocalJsonOCR,Ruta,FechaHora,Azure
0,133690,MarzoCompras_AV_B3_20190708103533_00199,41417807,56.03,414400970,63.22,4,1,1,20191030134239,"{""dni_area"": [93, 433, 126, 201], ""telefono_ar...",/cupones/marzo_compras_2018/04/B3/MarzoCompras...,2019-10-30 14:39:58.470973,1
1,133691,MarzoCompras_AV_B3_20190708103533_00200,46682740,62.45,943524335,73.95,4,1,1,20191030134239,"{""dni_area"": [87, 427, 131, 206], ""telefono_ar...",/cupones/marzo_compras_2018/04/B3/MarzoCompras...,2019-10-30 14:39:58.470973,1


In [32]:
# Getting the local result
ruta='C:\\git\\cuponesWong\\CuponesWong\\notebooks_flow\\azure_result\\'
bd_azure = pd.read_csv(ruta+'azure_result.csv', dtype={'DNI':str, 'Telefono':str})
bd_azure = bd_azure.fillna('')

In [33]:
bd_azure.columns

Index(['NombreArchivo', 'DNI', 'AcertividadDNI', 'Telefono',
       'AcertividadTelefono', 'NombreCompleto', 'AcertividadNombreCompleto',
       'Direccion', 'AcertividadDireccion', 'Distrito', 'AcertividadDistrito',
       'Correo', 'AcertividadCorreo', 'AzureJsonOCR', 'idCampania',
       'idUsuario', 'idEstado'],
      dtype='object')

In [60]:
bd_azure['dni_letters'] = bd_azure['DNI'].apply(has_letters)
bd_azure['dni_digs'] = bd_azure['DNI'].apply(digits)

In [61]:
bd_azure[bd_azure['NombreArchivo']=='MarzoCompras_AV_B3_20190708103537_00380']

Unnamed: 0,NombreArchivo,DNI,AcertividadDNI,Telefono,AcertividadTelefono,NombreCompleto,AcertividadNombreCompleto,Direccion,AcertividadDireccion,Distrito,AcertividadDistrito,Correo,AcertividadCorreo,AzureJsonOCR,idCampania,idUsuario,idEstado,dni_letters,dni_letters2,dni_digs
132,MarzoCompras_AV_B3_20190708103537_00380,:25595015,60.0,43252068,0.0,Nombres y Apellidos. JOLLA GLOUANA ClesiAS MAR...,66.0,AS. QUArdiA ChALACA 1868 urb. ANGAMOS,70.0,SELLAUis TA - CALLAO,75.0,,0.0,"{""status"": ""Succeeded"", ""recognitionResult"": {...",4,1,2,False,False,False


In [64]:
bd_azure[np.logical_and(np.logical_not(bd_azure['dni_digs']),bd_azure['AcertividadDNI']>0.0)][['DNI','AcertividadDNI']]

Unnamed: 0,DNI,AcertividadDNI
6,4073s020,60.000000
55,42166x23,80.000000
125,701bass1,60.000000
132,:25595015,60.000000
174,:25715821,70.909091
187,:7434721J,65.000000
192,:10199T0N,66.000000
214,0B093401,70.000000
215,0B093401,60.000000
220,080934e1,70.000000


In [26]:
# Merge for updating in database
data_merge = pd.merge(bd_azure, local_result[['idCupon','NombreArchivo','DNI','AcertividadDNI','Telefono','AcertividadTelefono']], how='left', on='NombreArchivo',)
data_merge['DNI_def'] = np.where(data_merge['AcertividadDNI_y']>=89.00,
                                 data_merge['DNI_y'],
                                 np.where(np.logical_and(data_merge['AcertividadDNI_x'] == 0.0, data_merge['AcertividadDNI_y'] != 0.0),
                                          data_merge['DNI_y'],
                                          data_merge['DNI_x']
                                         )
                                )
data_merge['AcertDNI_def'] = np.where(data_merge['AcertividadDNI_y']>=89.00,
                                 data_merge['AcertividadDNI_y'],
                                 np.where(np.logical_and(data_merge['AcertividadDNI_x'] == 0.0, data_merge['AcertividadDNI_y'] != 0.0),
                                          data_merge['AcertividadDNI_y'],
                                          data_merge['AcertividadDNI_x']
                                         )
                                )
data_merge['Telefono_def'] = np.where(data_merge['AcertividadTelefono_y']>=90.00,
                                 data_merge['Telefono_y'],
                                 np.where(np.logical_and(data_merge['AcertividadTelefono_x'] == 0.0, data_merge['AcertividadTelefono_y'] != 0.0),
                                          data_merge['Telefono_y'],
                                          data_merge['Telefono_x']
                                         )
                                )
data_merge['AcertTelefono_def'] = np.where(data_merge['AcertividadTelefono_y']>=90.00,
                                 data_merge['AcertividadTelefono_y'],
                                 np.where(np.logical_and(data_merge['AcertividadTelefono_x'] == 0.0, data_merge['AcertividadTelefono_y'] != 0.0),
                                          data_merge['AcertividadTelefono_y'],
                                          data_merge['AcertividadTelefono_x']
                                         )
                                )

In [25]:
data_merge[['DNI_y','DNI_x','AcertividadDNI_y','AcertividadDNI_x','DNI_def','AcertDNI_def']][data_merge['AcertividadDNI_x']==0]

Unnamed: 0,DNI_y,DNI_x,AcertividadDNI_y,AcertividadDNI_x,DNI_def,AcertDNI_def
3,7175548,7755981,0.00,0.0,7755981,0.00
8,745379266,4537997690,0.00,0.0,4537997690,0.00
10,72262144,936813,54.61,0.0,72262144,54.61
24,727,7288791,0.00,0.0,7288791,0.00
30,2644406,,0.00,0.0,,0.00
37,32935,IN0,0.00,0.0,IN0,0.00
42,09922622,0993262,58.71,0.0,09922622,58.71
53,10398,,0.00,0.0,,0.00
63,7755503,,0.00,0.0,,0.00
73,1001022,1004109,0.00,0.0,1004109,0.00


In [21]:
data_merge[['Telefono_y','Telefono_x','AcertividadTelefono_y','AcertividadTelefono_x','Telefono_def','AcertTelefono_def']]

Unnamed: 0,Telefono_y,Telefono_x,AcertividadTelefono_y,AcertividadTelefono_x,Telefono_def,AcertTelefono_def
0,414400970,914486978,63.22,90.0,914486978,90.00
1,943524335,943524335,73.95,81.0,943524335,81.00
2,74803744,948333444,75.69,70.0,948333444,70.00
3,9452570,945257940,85.09,60.0,945257940,60.00
4,977120001,997128001,70.65,60.0,997128001,60.00
5,953723816,953723816,74.41,60.0,953723816,60.00
6,74109847,9401918497,50.31,0.0,9401918497,0.00
7,943524335,943524335,58.05,75.0,943524335,75.00
8,22533937,922583937,66.89,67.5,922583937,67.50
9,75200507,93XX0x(2,87.44,0.0,93XX0x(2,0.00
