In [60]:
import numpy as np
import pandas as pd
import cv2
import pytesseract
from glob import glob
import spacy
import re
import string
import warnings
warnings.filterwarnings('ignore')

### Cleaning text

In [9]:
def cleanText(txt):
    """Normalise a single OCR token.

    The value is converted to ``str`` and lower-cased, then every whitespace
    character and every character of a fixed punctuation set is deleted.
    Characters outside that set (for example ``@ . , / - _``) are kept.

    Parameters
    ----------
    txt : any
        Raw token; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned token, which may be empty.
    """
    unwanted = string.whitespace + '!#$%&\'()*+:;<=>?[\\]^`{|}~'
    # One translation table that deletes both character groups in a single pass.
    deletions = str.maketrans('', '', unwanted)
    return str(txt).lower().translate(deletions)

In [10]:
### Load the spaCy pipeline stored in ./output/model-best/ (called on the OCR text below)
model_ner = spacy.load('./output/model-best/')

In [197]:
## 1. Load image
IMAGE_PATH = '../test-img/linguaGrupa.JPG'
image = cv2.imread(IMAGE_PATH)
if image is None:
    # cv2.imread does not raise for a missing or unreadable file; it returns
    # None, which would otherwise only surface later as an obscure error.
    raise FileNotFoundError('Could not read image: {}'.format(IMAGE_PATH))
# Optional preview of the loaded image:
# cv2.imshow('Company', image)
# cv2.waitKey(0)
# cv2.destroyAllWindows()


## 2. Extract data using Pytesseract
# The returned text is tab-separated: the first line is used as the header,
# every following line as one row.
tessData = pytesseract.image_to_data(image, lang='hrv')
tessList = [row.split('\t') for row in tessData.split('\n')]
df = (
    pd.DataFrame(tessList[1:], columns=tessList[0])
    .dropna()  # drop rows with missing values
    .assign(text=lambda frame: frame['text'].apply(cleanText))  # text cleaning
)


## 3. Convert data into content
# Keep only rows whose cleaned text is non-empty. .copy() makes df_clean an
# independent frame, so later cells can add columns to it without writing
# into a slice of df.
df_clean = df.query('text != "" ').copy()
content = " ".join(df_clean['text'])  # all words joined by single spaces
print(content)


## 4. Get prediction from model
doc = model_ner(content)



q naša adresa ulica eugena kumičića 10, 10 000 zagreb kontaktirajte nas 01/ 6576 666 pošaljite upit s infoglinguagrupa.hr £ 9 linl pratite nas pravni podaci lingua grupa d.o.o. sjedište eugena kumičića 10, 10 000 zagreb, republika hrvatska mb 01298402 oib 04550695038 pdv broj hro4550695038 mbs 080156450 kod trgovačkog suda u zagreb direktorica ivana lieli iban hr5523400091102709207 kod privredna banka zagreb d.d. swift pbzghr2x


## Prediction display

In [12]:
from spacy import displacy

In [198]:
# Render the entities predicted for `doc` with displaCy's 'ent' style
displacy.render(doc, style='ent')

### Tagging each word

In [199]:
# Export the processed document as JSON-style data and list its top-level keys
docjson = doc.to_json()
docjson.keys()

dict_keys(['text', 'ents', 'tokens'])

In [200]:
# Document text as stored in the JSON export; token offsets are sliced from it below
doc_text = docjson['text']
doc_text

'q naša adresa ulica eugena kumičića 10, 10 000 zagreb kontaktirajte nas 01/ 6576 666 pošaljite upit s infoglinguagrupa.hr £ 9 linl pratite nas pravni podaci lingua grupa d.o.o. sjedište eugena kumičića 10, 10 000 zagreb, republika hrvatska mb 01298402 oib 04550695038 pdv broj hro4550695038 mbs 080156450 kod trgovačkog suda u zagreb direktorica ivana lieli iban hr5523400091102709207 kod privredna banka zagreb d.d. swift pbzghr2x'

In [201]:
# One row per token from the JSON export of the document.
dataframe_tokens = pd.DataFrame(docjson['tokens'])
# Recover each token's surface string by slicing the document text with the
# token's start/end offsets. The two columns are zipped directly instead of
# indexing a row Series by position (x[0], x[1]), which pandas deprecates for
# label-indexed Series.
dataframe_tokens['token'] = [
    doc_text[start:end]
    for start, end in zip(dataframe_tokens['start'], dataframe_tokens['end'])
]
dataframe_tokens.head(10)

Unnamed: 0,id,start,end,token
0,0,0,1,q
1,1,2,6,naša
2,2,7,13,adresa
3,3,14,19,ulica
4,4,20,26,eugena
5,5,27,35,kumičića
6,6,36,38,10
7,7,38,39,","
8,8,40,42,10
9,9,43,46,000


In [None]:
### Merging tables

In [202]:
# Start offset and label of every predicted entity.
right_table = pd.DataFrame(docjson['ents'])[['start', 'label']]
# Left join on the start offset: every token row is kept, and tokens that do
# not start an entity get NaN in 'label'. A 'label' column left over from an
# earlier run of this cell is dropped first; otherwise re-running would create
# 'label_x'/'label_y' columns and break the cells that read 'label'.
dataframe_tokens = pd.merge(
    dataframe_tokens.drop(columns='label', errors='ignore'),
    right_table,
    how='left',
    on='start',
)

In [214]:
# Inspect up to the first 50 token rows together with their labels
dataframe_tokens.head(50)

Unnamed: 0,id,start,end,token,label
0,0,0,1,q,O
1,1,2,6,naša,O
2,2,7,13,adresa,O
3,3,14,19,ulica,B-STREET
4,4,20,26,eugena,I-STREET
5,5,27,35,kumičića,I-STREET
6,6,36,38,10,I-STREET
7,7,38,39,",",O
8,8,40,42,10,B-POST_NUMBER
9,9,43,46,000,I-POST_NUMBER


In [None]:
### Filling NaN values with 'O'

In [213]:
# Tokens without a matching entity have NaN in 'label' after the left join;
# mark them as 'O'. Only the 'label' column is filled, so a missing value in
# any other column is not silently replaced by the string 'O'.
dataframe_tokens['label'] = dataframe_tokens['label'].fillna('O')
dataframe_tokens.head(50)

Unnamed: 0,id,start,end,token,label
0,0,0,1,q,O
1,1,2,6,naša,O
2,2,7,13,adresa,O
3,3,14,19,ulica,B-STREET
4,4,20,26,eugena,I-STREET
5,5,27,35,kumičića,I-STREET
6,6,36,38,10,I-STREET
7,7,38,39,",",O
8,8,40,42,10,B-POST_NUMBER
9,9,43,46,000,I-POST_NUMBER


In [205]:
# Inspect the first 10 OCR rows that have non-empty cleaned text
df_clean.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
4,5,1,1,1,1,1,88,44,28,38,65.234695,q
5,5,1,1,1,1,2,168,41,55,16,96.709244,naša
6,5,1,1,1,1,3,230,41,76,16,96.231819,adresa
8,5,1,1,1,2,1,168,68,50,17,96.725449,ulica
9,5,1,1,1,2,2,227,69,81,23,96.252228,eugena
10,5,1,1,1,2,3,317,68,93,17,96.059944,kumičića
11,5,1,1,1,2,4,418,69,29,19,95.628365,10
12,5,1,1,1,2,5,455,69,24,16,95.628365,10
13,5,1,1,1,2,6,487,69,47,16,93.609779,000
14,5,1,1,1,2,7,541,69,77,23,96.422165,zagreb


In [206]:
# Work on an independent frame so the new columns are not written into a slice.
df_clean = df_clean.copy()
# Character offsets of every word inside the space-joined text: 'end' is the
# running total of (word length + 1 separator) minus the trailing separator,
# and 'start' is 'end' minus the word's own length. Computed column-wise
# rather than by indexing each row Series by position.
word_len = df_clean['text'].str.len()
df_clean['end'] = (word_len + 1).cumsum() - 1
df_clean['start'] = df_clean['end'] - word_len
df_clean.head(8)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start
4,5,1,1,1,1,1,88,44,28,38,65.234695,q,1,0
5,5,1,1,1,1,2,168,41,55,16,96.709244,naša,6,2
6,5,1,1,1,1,3,230,41,76,16,96.231819,adresa,13,7
8,5,1,1,1,2,1,168,68,50,17,96.725449,ulica,19,14
9,5,1,1,1,2,2,227,69,81,23,96.252228,eugena,26,20
10,5,1,1,1,2,3,317,68,93,17,96.059944,kumičića,35,27
11,5,1,1,1,2,4,418,69,29,19,95.628365,10,39,36
12,5,1,1,1,2,5,455,69,24,16,95.628365,10,42,40


In [None]:
### inner join 'df_clean' and 'dataframe_tokens'

In [207]:
# Attach the model's token and label to each OCR word by matching start
# offsets; the inner join keeps only words whose start offset equals a token start.
token_labels = dataframe_tokens[['start', 'token', 'label']]
dataframe_info = df_clean.merge(token_labels, how='inner', on='start')
dataframe_info.head(10)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
0,5,1,1,1,1,1,88,44,28,38,65.234695,q,1,0,q,O
1,5,1,1,1,1,2,168,41,55,16,96.709244,naša,6,2,naša,O
2,5,1,1,1,1,3,230,41,76,16,96.231819,adresa,13,7,adresa,O
3,5,1,1,1,2,1,168,68,50,17,96.725449,ulica,19,14,ulica,B-STREET
4,5,1,1,1,2,2,227,69,81,23,96.252228,eugena,26,20,eugena,I-STREET
5,5,1,1,1,2,3,317,68,93,17,96.059944,kumičića,35,27,kumičića,I-STREET
6,5,1,1,1,2,4,418,69,29,19,95.628365,10,39,36,10,I-STREET
7,5,1,1,1,2,5,455,69,24,16,95.628365,10,42,40,10,B-POST_NUMBER
8,5,1,1,1,2,6,487,69,47,16,93.609779,000,46,43,000,I-POST_NUMBER
9,5,1,1,1,2,7,541,69,77,23,96.422165,zagreb,53,47,zagreb,B-CITY


## Bounding box

In [208]:
# Keep only the words that carry an entity label. .copy() makes box_df an
# independent frame, because later cells assign new columns to it and would
# otherwise be writing into a slice of dataframe_info.
box_df = dataframe_info.query("label != 'O' ").copy()
img = image.copy()  # draw on a copy so the loaded image stays untouched
for x, y, w, h, label in box_df[['left', 'top', 'width', 'height', 'label']].values:
    # The geometry columns may still be strings parsed from the OCR output,
    # so convert them to plain ints before handing them to OpenCV.
    x, y, w, h = int(x), int(y), int(w), int(h)

    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(img, str(label), (x, y), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 255), 2)

# Opens a native window and blocks until a key is pressed.
cv2.imshow('Prediction', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
### Grouping tags

In [209]:
# Remove the leading 'B-' / 'I-' marker from every label. The anchored pattern
# only matches that prefix, so re-running the cell leaves already-stripped
# labels alone (cutting the first two characters blindly would turn 'STREET'
# into 'REET' on a second run).
box_df = box_df.assign(
    label=box_df['label'].str.replace(r'^[BI]-', '', regex=True)
)
box_df.head(12)


Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label
3,5,1,1,1,2,1,168,68,50,17,96.725449,ulica,19,14,ulica,STREET
4,5,1,1,1,2,2,227,69,81,23,96.252228,eugena,26,20,eugena,STREET
5,5,1,1,1,2,3,317,68,93,17,96.059944,kumičića,35,27,kumičića,STREET
6,5,1,1,1,2,4,418,69,29,19,95.628365,10,39,36,10,STREET
7,5,1,1,1,2,5,455,69,24,16,95.628365,10,42,40,10,POST_NUMBER
8,5,1,1,1,2,6,487,69,47,16,93.609779,000,46,43,000,POST_NUMBER
9,5,1,1,1,2,7,541,69,77,23,96.422165,zagreb,53,47,zagreb,CITY
12,5,1,2,1,2,1,168,181,40,19,90.302757,01/,75,72,01/,PHONE
13,5,1,2,1,2,2,217,182,62,16,93.356613,6576,80,76,6576,PHONE
14,5,1,2,1,2,3,288,182,44,16,95.348938,666,84,81,666,CELL-PHONE


In [None]:
### Group consecutive rows that share the same label

In [210]:
class groupgenerator():
    """Number runs of identical consecutive values.

    The instance remembers the last value it was given and a running integer
    id. The id is incremented every time a value differs from the remembered
    one, so equal neighbouring values share an id.
    """

    def __init__(self):
        self.id = 0     # id of the current run (0 until the first change)
        self.text = ''  # value seen by the most recent call

    def getgroup(self, text):
        """Return the run id for ``text``, opening a new run if it changed."""
        if self.text != text:
            self.text = text
            self.id += 1
        return self.id


group_gen = groupgenerator()

In [211]:
# Use a fresh counter so that re-running this cell numbers the groups from 1
# again instead of continuing from wherever the previous run stopped.
group_gen = groupgenerator()
box_df['group'] = box_df['label'].apply(group_gen.getgroup)

In [212]:
# Inspect up to the first 25 labelled words with their group ids
box_df.head(25)

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,end,start,token,label,group
3,5,1,1,1,2,1,168,68,50,17,96.725449,ulica,19,14,ulica,STREET,1
4,5,1,1,1,2,2,227,69,81,23,96.252228,eugena,26,20,eugena,STREET,1
5,5,1,1,1,2,3,317,68,93,17,96.059944,kumičića,35,27,kumičića,STREET,1
6,5,1,1,1,2,4,418,69,29,19,95.628365,10,39,36,10,STREET,1
7,5,1,1,1,2,5,455,69,24,16,95.628365,10,42,40,10,POST_NUMBER,2
8,5,1,1,1,2,6,487,69,47,16,93.609779,000,46,43,000,POST_NUMBER,2
9,5,1,1,1,2,7,541,69,77,23,96.422165,zagreb,53,47,zagreb,CITY,3
12,5,1,2,1,2,1,168,181,40,19,90.302757,01/,75,72,01/,PHONE,4
13,5,1,2,1,2,2,217,182,62,16,93.356613,6576,80,76,6576,PHONE,4
14,5,1,2,1,2,3,288,182,44,16,95.348938,666,84,81,666,CELL-PHONE,5


In [97]:
### Right and bottom of bounding box

In [100]:
# Convert the box geometry to integers, then derive the right and bottom
# edges from the left/top corner plus the box size.
bbox_cols = ['left', 'top', 'width', 'height']
box_df[bbox_cols] = box_df[bbox_cols].astype(int)
box_df['right'] = box_df['left'].add(box_df['width'])
box_df['bottom'] = box_df['top'].add(box_df['height'])

In [103]:
### tagging: groupby group

In [120]:
# Columns needed to build one box and one text per group, grouped by group id.
col_group = ['left', 'top', 'right', 'bottom', 'label', 'token', 'group']
group_tag_img = box_df.loc[:, col_group].groupby('group')

In [172]:
# Collapse every group into one row: the enclosing box of all its words
# (smallest left/top, largest right/bottom), its unique label value(s) and the
# words joined by spaces. The reducers are given by name ('min'/'max') because
# passing the Python builtins min/max to agg is deprecated in pandas.
img_tagging = group_tag_img.agg({
    'left': 'min',
    'right': 'max',
    'top': 'min',
    'bottom': 'max',
    'label': np.unique,
    'token': lambda x: " ".join(x)
})


In [173]:
# A group id changes whenever the label changes, so every group has exactly
# one label and the aggregated cell holds a single value (wrapped in a
# one-element array). Take that element directly instead of slicing the
# array's text representation by character count; values that are already
# plain strings are kept, so re-running the cell does not truncate them.
img_tagging['label'] = img_tagging['label'].apply(
    lambda labels: str(labels if isinstance(labels, str) else labels[0])
)

img_tagging

Unnamed: 0_level_0,left,right,top,bottom,label,token
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,19,255,18,45,COMPANY,kramel d.o.o
2,72,219,90,107,OIB,10000305435
3,18,149,126,144,STREET,kladare 19c
4,17,84,157,174,POST_NUMBER,33405
5,93,175,156,174,CITY,kladare
6,18,113,186,204,COUNTRY,hrvatska
7,48,236,225,242,CELL_PHONE,385 97 6616845
8,48,348,261,285,EMAIL,kramelservicesegmail.com
9,78,117,298,316,FAX,08h
10,80,201,336,353,MBS,070153287


In [175]:
img_bb = image.copy()  # draw on a copy of the loaded image

# Row layout of img_tagging: left, right, top, bottom, label, token.
for left, right, top, bottom, label, token in img_tagging.values:
    top_left = (left, top)
    bottom_right = (right, bottom)
    cv2.rectangle(img_bb, top_left, bottom_right, (0,255,0), 2)
    cv2.putText(img_bb, label, top_left, cv2.FONT_HERSHEY_PLAIN, 1.2, (255,0,255),2)

# Opens a native window and blocks until a key is pressed.
cv2.imshow('Company', img_bb)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [None]:
### Additional cleaning for phone, website and email

In [180]:
def parser(text, label):
    """Normalise the text of one token according to its entity label.

    Parameters
    ----------
    text : str
        Token text.
    label : str
        Entity label without the BIO prefix (e.g. ``'EMAIL'``).

    Returns
    -------
    str
        * ``PHONE`` / ``CELL_PHONE`` / ``FAX``: digits only.
        * ``EMAIL``: lower-cased; only letters, digits, ``@ _ . -`` and
          spaces are kept.
        * ``WEBSITE``: lower-cased; only letters, digits, ``: / . % # -`` and
          spaces are kept.
        * ``COMPANY`` / ``STREET`` / ``CITY`` / ``COUNTRY``: title-cased.
        * any other label: ``text`` unchanged.
    """
    if label in ('PHONE', 'CELL_PHONE', 'FAX'):
        # Lower-casing is pointless here: every non-digit is removed anyway.
        text = re.sub(r'\D', '', text)

    elif label == 'EMAIL':
        # Raw string: '\-' in a normal string literal is an invalid escape.
        text = re.sub(r'[^A-Za-z0-9@_.\- ]', '', text.lower())

    elif label == 'WEBSITE':
        text = re.sub(r'[^A-Za-z0-9:/.%#\- ]', '', text.lower())

    elif label in ('COMPANY', 'STREET', 'CITY', 'COUNTRY'):
        # CITY is included so it is formatted like the other name-like labels.
        text = text.lower().title()

    return text

In [186]:
# Entities

In [194]:
# Collect the parsed token texts per entity label.
info_array = dataframe_info[['token','label']].values
entities = dict(COMPANY=[], STREET=[], CITY=[], COUNTRY=[], 
                PHONE=[], CELL_PHONE=[], FAX=[],POST_NUMBER=[], 
                EMAIL=[], WEBSITE=[],MBS=[], IBAN=[], OIB=[])
previous = 'O'

for token, label in info_array:
    bio_tag = label[:1]
    # Hyphens are mapped to underscores so a label such as 'CELL-PHONE' (which
    # appears in this notebook's model output) lands on the declared
    # 'CELL_PHONE' key and is cleaned by parser() like the other phone labels.
    label_tag = label[2:].replace('-', '_')

    if bio_tag in ('B', 'I'):
        # 1. parse the token (only entity tokens need it)
        text = parser(token, label_tag)

        # 2. labels that were not pre-declared get their own list instead of
        #    raising KeyError
        collected = entities.setdefault(label_tag, [])

        if bio_tag == 'B' or previous != label_tag or not collected:
            # a 'B' tag or a change of label starts a new entity
            collected.append(text)
        else:
            # an 'I' tag following the same label continues the last entity;
            # name-like labels are joined with a space, the rest directly
            separator = " " if label_tag in ('COMPANY', 'STREET', 'CITY', 'COUNTRY') else ""
            collected[-1] = collected[-1] + separator + text
    previous = label_tag

In [196]:
# Show the collected entities (label -> list of parsed strings)
entities

{'COMPANY': ['Kramel D.O.O'],
 'STREET': ['Kladare 19C'],
 'CITY': ['kladare'],
 'COUNTRY': ['Hrvatska'],
 'PHONE': [],
 'CELL_PHONE': ['385976616845'],
 'FAX': ['08'],
 'POST_NUMBER': ['33405'],
 'EMAIL': ['kramelservicesegmail.com'],
 'WEBSITE': [],
 'MBS': ['070153287'],
 'IBAN': ['hr1724840081135193856'],
 'OIB': ['10000305435']}