In [1]:
import os
# Expose no CUDA devices to this process, so the models run on the CPU
# (the next cell shows flair.device reporting 'cpu').
# NOTE(review): presumably this must run before flair/torch is imported — keep this cell first.
os.environ["CUDA_VISIBLE_DEVICES"]=""

In [2]:
import flair
flair.device

device(type='cpu')

In [3]:
### flair dependencies

from flair.models import SequenceTagger
from flair.data import Sentence

In [4]:
### hack to load pdf2txt
import sys
sys.path.insert(0,'Pieter/eml-to-txt/pdf2text/')

### pdf2text dependencies
### xmltodict

from src.pdf.converter import PDFConverter
from src.pdf.writer import PDFWriter
from src.pdf.util import FrontendIndexMapping


def process_pdf(pdf_file):
    """Extract the text of a PDF file using the pdf2text helpers.

    Parameters
    ----------
    pdf_file
        Path of the PDF to read; passed unchanged to
        ``PDFWriter.copy_pdf_to_tmp``.

    Returns
    -------
    The value of ``FrontendIndexMapping(pdf_as_json).text()``. The rest of
    the notebook slices and tokenizes it as a string.
    """
    # Copy pdf-file to temp
    PDFWriter.copy_pdf_to_tmp(pdf_file)

    # Convert temporary pdf file. The converter takes no argument, so it
    # presumably picks up the copy made above — keep the two calls in this order.
    # The second return value (word boundaries) is not needed here.
    pdf_as_json, _word_boundaries = PDFConverter.convert_pdf_to_dict()

    # Convert json to text
    return FrontendIndexMapping(pdf_as_json).text()


In [5]:
import os
import numpy as np

In [6]:
folder = 'data/Mail_Exports'

In [7]:
# Pick one file from `folder` at random to run the pipeline on.
# NOTE(review): no random seed is set in the visible cells, so a re-run may pick a
# different file than the one shown in the saved outputs.
file = np.random.choice(os.listdir(folder))
file

'Orderbevestiging 23191868 - R2657068.pdf'

In [8]:
pdf_txt = process_pdf(os.path.join(folder,file))

In [9]:
import spacy

In [10]:
nlp = spacy.blank("nl")

In [11]:
### tokenize pdf_txt with spaCy; keep each token's text and its char offset in pdf_txt
tokens = [
    {'text': str(tok), 'offset': tok.idx}
    for tok in nlp(pdf_txt)
]
tokens[:10]

[{'text': '\n', 'offset': 0},
 {'text': 'ORIGINEEL', 'offset': 1},
 {'text': '\n ', 'offset': 10},
 {'text': 'Opdrachtbevestiging', 'offset': 12},
 {'text': '23191868', 'offset': 32},
 {'text': '\n', 'offset': 40},
 {'text': 'Besteldatum', 'offset': 41},
 {'text': ':', 'offset': 52},
 {'text': '\t', 'offset': 53},
 {'text': '03.08.2020', 'offset': 54}]

In [12]:
### drop tokens that consist only of whitespace (offsets of the others are unchanged)
tokens = [tok for tok in tokens if tok['text'].strip()]
tokens[:10]

[{'text': 'ORIGINEEL', 'offset': 1},
 {'text': 'Opdrachtbevestiging', 'offset': 12},
 {'text': '23191868', 'offset': 32},
 {'text': 'Besteldatum', 'offset': 41},
 {'text': ':', 'offset': 52},
 {'text': '03.08.2020', 'offset': 54},
 {'text': 'Pittway', 'offset': 65},
 {'text': 'BVBA', 'offset': 73},
 {'text': 'Uw', 'offset': 78},
 {'text': 'Referentie', 'offset': 81}]

In [13]:
# Sliding-window settings, both counted in tokens: a new window starts every
# `stride` tokens and spans up to `context_lenght` tokens, so consecutive
# windows overlap and most tokens are tagged several times.
stride = 10
context_lenght = 40  # (sic) spelling kept: other cells reference this name

In [14]:
%%time
# Cut the token list into overlapping windows. For every window build
#   * a flair Sentence from the space-joined token texts, and
#   * a list of (position in the joined text, position in pdf_txt) pairs,
#     one per token, so predictions can be mapped back to pdf_txt later.
sentences = []
offset_mappings = []
for window_start in range(0, len(tokens), stride):
    window = tokens[window_start:window_start + context_lenght]

    print('*'*100)
    text = ' '.join(tok['text'] for tok in window)
    print(text)
    sentences.append(Sentence(text))

    offset_mapping = []
    cursor = 0  # where the current token starts inside `text`
    for tok in window:
        offset_mapping.append((cursor, tok['offset']))
        cursor += len(tok['text']) + 1  # +1 for the joining space
    offset_mappings.append(offset_mapping)

    print(len(text))
    print(offset_mapping)

****************************************************************************************************
ORIGINEEL Opdrachtbevestiging 23191868 Besteldatum : 03.08.2020 Pittway BVBA Uw Referentie : R2657068 Hermes Plaza Hermeslaan 1H Uw besteldatum : 03.08.2020 1831 Diegem België Opdrachtgever : 1539304 Verzendadres : 1670631 Van Marcke N.V. Van Marcke N.V. LAR BlokZ 5 A LAR
272
[(0, 1), (10, 12), (30, 32), (39, 41), (51, 52), (53, 54), (64, 65), (72, 73), (77, 78), (80, 81), (91, 91), (93, 93), (102, 102), (109, 109), (115, 115), (126, 126), (129, 129), (132, 132), (144, 143), (146, 145), (157, 156), (162, 161), (169, 168), (176, 175), (190, 188), (192, 190), (200, 198), (213, 210), (215, 212), (223, 220), (227, 224), (234, 231), (239, 236), (243, 240), (250, 247), (255, 252), (259, 256), (265, 262), (267, 264), (269, 266)]
****************************************************************************************************
: R2657068 Hermes Plaza Hermeslaan 1H Uw besteldatum : 03.08.2020 

In [15]:
%%time

# load the model you trained
model = SequenceTagger.load('flair/models/first_test/best-model.pt')

# tag every window; nothing is printed or returned here — the predictions are
# read back later from the same `sentences` objects
# NOTE(review): all_tag_prob=True presumably requests per-tag probabilities — confirm against the flair docs
model.predict(sentences,all_tag_prob=True)



2020-08-14 16:49:04,119 loading file flair/models/first_test/best-model.pt
CPU times: user 14.6 s, sys: 782 ms, total: 15.4 s
Wall time: 4.22 s


In [16]:
def find_offset(offset_mapping,x):
    """Map a character position in a window's text to a position in the source text.

    Parameters
    ----------
    offset_mapping
        Non-empty sequence of ``(window_pos, source_pos)`` pairs, one per
        token, sorted by ``window_pos``. Each pair gives where a token starts
        in the window text and where the same token starts in the source text.
    x
        Character position in the window text.

    Returns
    -------
    int
        ``x`` shifted by the delta of the last token that starts at or before
        ``x``. If ``x`` lies before the first token, the first token's delta
        is used.

    Raises
    ------
    ValueError
        If ``offset_mapping`` is empty.
    """
    if not offset_mapping:
        raise ValueError("offset_mapping must contain at least one entry")

    # Track the last entry whose window position is <= x. Tracking it directly,
    # rather than indexing with ``i - 1`` after the loop, keeps the result correct
    # when the loop runs to the end, i.e. when x lies in the last token.
    anchor = offset_mapping[0]
    for om in offset_mapping:
        if om[0] > x:
            break
        anchor = om
    return anchor[1] - anchor[0] + x

In [17]:
from collections import defaultdict
# Gather every predicted entity across all windows. Spans are translated back
# to pdf_txt coordinates and used, together with the label, as the key; the
# same span seen in several overlapping windows accumulates one score per window.
labels = defaultdict(list)

for sentence, offset_mapping in zip(sentences, offset_mappings):
    print('*'*100)
    print(sentence)

    for entity in sentence.to_dict(tag_type='ner')['entities']:
        print(entity)

        start = find_offset(offset_mapping, entity['start_pos'])
        end = find_offset(offset_mapping, entity['end_pos'])
        top_label = entity['labels'][0]  # only the first label is used

        labels[(top_label.value, start, end)].append(top_label.score)

****************************************************************************************************
Sentence: "ORIGINEEL Opdrachtbevestiging 23191868 Besteldatum : 03.08.2020 Pittway BVBA Uw Referentie : R2657068 Hermes Plaza Hermeslaan 1H Uw besteldatum : 03.08.2020 1831 Diegem België Opdrachtgever : 1539304 Verzendadres : 1670631 Van Marcke N.V. Van Marcke N.V. LAR BlokZ 5 A LAR"   [− Tokens: 40  − Token-Labels: "ORIGINEEL Opdrachtbevestiging 23191868 Besteldatum : 03.08.2020 Pittway BVBA Uw Referentie : R2657068 <REFERENTIE> Hermes Plaza Hermeslaan 1H Uw besteldatum : 03.08.2020 1831 Diegem België Opdrachtgever : 1539304 Verzendadres : 1670631 Van Marcke N.V. Van Marcke N.V. LAR BlokZ 5 A LAR"]
{'text': 'R2657068', 'start_pos': 93, 'end_pos': 101, 'labels': [REFERENTIE (1.0)]}
****************************************************************************************************
Sentence: ": R2657068 Hermes Plaza Hermeslaan 1H Uw besteldatum : 03.08.2020 1831 Diegem België Opdrachtgev

In [18]:
labels

defaultdict(list,
            {('REFERENTIE', 93, 101): [0.9999934434890747, 0.9999409914016724],
             ('LEVERDATUM', 740, 750): [0.9986662864685059,
              0.9987465143203735,
              0.9993253946304321,
              0.9985598921775818],
             ('REFERENTIE', 846, 854): [0.9994608759880066,
              0.9999886751174927,
              0.99998939037323,
              0.9999936819076538],
             ('REFERENTIE', 2034, 2042): [0.9999904632568359,
              0.9999938011169434,
              0.9999972581863403,
              0.999992847442627],
             ('LEVERDATUM', 2086, 2096): [0.7886056900024414,
              0.9164460301399231,
              0.9050818085670471],
             ('LEVERDATUM', 1995, 2005): [0.7457555532455444]})

In [19]:
# Collapse the per-window scores of each (label, start, end) span into their mean.
# `labels` is overwritten in place, so the isinstance guard makes a re-run of this
# cell a no-op instead of failing with sum() on an already-averaged float.
labels = {
    key: (sum(value)/len(value) if isinstance(value, list) else value)
    for key,value in labels.items()
}

In [20]:
labels

{('REFERENTIE', 93, 101): 0.9999672174453735,
 ('LEVERDATUM', 740, 750): 0.9988245218992233,
 ('REFERENTIE', 846, 854): 0.9998581558465958,
 ('REFERENTIE', 2034, 2042): 0.9999935925006866,
 ('LEVERDATUM', 2086, 2096): 0.8700445095698038,
 ('LEVERDATUM', 1995, 2005): 0.7457555532455444}

In [21]:
labels

{('REFERENTIE', 93, 101): 0.9999672174453735,
 ('LEVERDATUM', 740, 750): 0.9988245218992233,
 ('REFERENTIE', 846, 854): 0.9998581558465958,
 ('REFERENTIE', 2034, 2042): 0.9999935925006866,
 ('LEVERDATUM', 2086, 2096): 0.8700445095698038,
 ('LEVERDATUM', 1995, 2005): 0.7457555532455444}

In [22]:
# Return a list of named entities, one dict per entity, with the keys:
#         # "label"
#         # "start"
#         # "end"
#         # "score"
#         # "context"

In [23]:
# Turn the averaged scores into result dicts and resolve overlapping spans.
# A new span is accepted only if its score is strictly higher than that of
# every already-accepted span it overlaps; in that case those spans are
# replaced by it. Otherwise the accepted list is left untouched, so a span is
# never dropped in favour of a candidate that ends up rejected itself.
results = []
for (label, start, end), score in labels.items():

    new_result = {
        'label' : label,
        'start' : start,
        'end' : end,
        'score' : score,
        'context' : pdf_txt[start:end],
    }

    # Split the accepted results into those that share at least one character
    # with the new span and those that do not (order is preserved).
    overlapping = []
    disjoint = []
    for result in results:
        # Half-open [start, end) intervals intersect iff the later start lies
        # before the earlier end; empty spans never intersect anything.
        if max(result['start'], start) < min(result['end'], end):
            overlapping.append(result)
        else:
            disjoint.append(result)

    # all() over an empty list is True, so non-overlapping spans are always added.
    if all(score > result['score'] for result in overlapping):
        results = disjoint + [new_result]


In [24]:
file,results

('Orderbevestiging 23191868 - R2657068.pdf',
 [{'label': 'REFERENTIE',
   'start': 93,
   'end': 101,
   'score': 0.9999672174453735,
   'context': 'R2657068'},
  {'label': 'LEVERDATUM',
   'start': 740,
   'end': 750,
   'score': 0.9988245218992233,
   'context': '06.08.2020'},
  {'label': 'REFERENTIE',
   'start': 846,
   'end': 854,
   'score': 0.9998581558465958,
   'context': 'R2657068'},
  {'label': 'REFERENTIE',
   'start': 2034,
   'end': 2042,
   'score': 0.9999935925006866,
   'context': 'R2657068'},
  {'label': 'LEVERDATUM',
   'start': 2086,
   'end': 2096,
   'score': 0.8700445095698038,
   'context': '03.08.2020'},
  {'label': 'LEVERDATUM',
   'start': 1995,
   'end': 2005,
   'score': 0.7457555532455444,
   'context': '03.08.2020'}])

In [25]:
# 'Opdrachtbevestiging_0045478233_20200804.PDF'

# [{'label': 'REFERENTIE',
#   'start': 222,
#   'end': 231,
#   'score': 0.9989293217658997,
#   'context': '500-15221'},
#  {'label': 'LEVERDATUM',
#   'start': 651,
#   'end': 661,
#   'score': 0.9854911118745804,
#   'context': '06.08.2020'}]

In [26]:
# ('Conf. de commande 1574183346.pdf',
#  [{'label': 'REFERENTIE',
#    'start': 180,
#    'end': 188,
#    'score': 0.9999979337056478,
#    'context': 'R2654812'},
#   {'label': 'LEVERDATUM',
#    'start': 273,
#    'end': 283,
#    'score': 0.9919492304325104,
#    'context': '07.08.2020'},
#   {'label': 'LEVERDATUM',
#    'start': 2513,
#    'end': 2523,
#    'score': 0.9869064688682556,
#    'context': '03.08.2020'}])

In [27]:
# ('jobrpt_YARCCLIENTDD_SMEJ2_28422.pdf',
#  [{'label': 'REFERENTIE',
#    'start': 201,
#    'end': 209,
#    'score': 0.9999919831752777,
#    'context': 'R2657052'},
#   {'label': 'LEVERDATUM',
#    'start': 763,
#    'end': 773,
#    'score': 0.9988783895969391,
#    'context': '10/09/2020'},
#   {'label': 'LEVERDATUM',
#    'start': 861,
#    'end': 871,
#    'score': 0.9997783601284027,
#    'context': '10/09/2020'}])

In [None]:
# ('Orderbevestiging 23191868 - R2657068.pdf',
#  [{'label': 'REFERENTIE',
#    'start': 93,
#    'end': 101,
#    'score': 0.9999672174453735,
#    'context': 'R2657068'},
#   {'label': 'LEVERDATUM',
#    'start': 740,
#    'end': 750,
#    'score': 0.9988245218992233,
#    'context': '06.08.2020'},
#   {'label': 'REFERENTIE',
#    'start': 846,
#    'end': 854,
#    'score': 0.9998581558465958,
#    'context': 'R2657068'},
#   {'label': 'REFERENTIE',
#    'start': 2034,
#    'end': 2042,
#    'score': 0.9999935925006866,
#    'context': 'R2657068'},
#   {'label': 'LEVERDATUM',
#    'start': 2086,
#    'end': 2096,
#    'score': 0.8700445095698038,
#    'context': '03.08.2020'},
#   {'label': 'LEVERDATUM',
#    'start': 1995,
#    'end': 2005,
#    'score': 0.7457555532455444,
#    'context': '03.08.2020'}])