In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]=""

In [2]:
import flair
flair.device

device(type='cpu')

In [3]:
### flair dependancies

from flair.models import SequenceTagger
from flair.data import Sentence

In [5]:
### hack to load pdf2txt
import sys
sys.path.insert(0,'Pieter/eml-to-txt/pdf2text/')

### pdf2text dependancies
### xmltodict

from src.pdf.converter import PDFConverter
from src.pdf.writer import PDFWriter
from src.pdf.util import FrontendIndexMapping


def process_pdf(pdf_file):
    # Copy pdf-file to temp
    PDFWriter.copy_pdf_to_tmp(pdf_file)

    # Convert temporary pdf file
    pdf_as_json, word_boundaries = PDFConverter.convert_pdf_to_dict()

    # Convert json to text
    frontend_index_mapping = FrontendIndexMapping(
        pdf_as_json
    )

    return frontend_index_mapping.text()


In [221]:
import os
import numpy as np

In [222]:
folder = 'data/Mail_Exports'

In [223]:
file = np.random.choice(os.listdir(folder))
file

'Opdrachtbevestiging_0045478233_20200804.PDF'

In [224]:
pdf_txt = process_pdf(os.path.join(folder,file))

In [225]:
import spacy

In [226]:
nlp = spacy.blank("nl")

In [227]:
### get text and char offset from spacy tokens
tokens = [{'text':str(token),'offset':token.idx} for token in nlp(pdf_txt)]
tokens[:10]

[{'text': '\n', 'offset': 0},
 {'text': 'TTI', 'offset': 1},
 {'text': 'Central', 'offset': 5},
 {'text': 'Europe', 'offset': 13},
 {'text': '\n', 'offset': 19},
 {'text': 'Opdrachtbevestiging', 'offset': 20},
 {'text': '\t', 'offset': 39},
 {'text': 'Pagina', 'offset': 40},
 {'text': ' ', 'offset': 47},
 {'text': '1', 'offset': 48}]

In [228]:
### remove whitespace
tokens = [token for token in tokens if len(token['text'].strip())>0]
tokens[:10]

[{'text': 'TTI', 'offset': 1},
 {'text': 'Central', 'offset': 5},
 {'text': 'Europe', 'offset': 13},
 {'text': 'Opdrachtbevestiging', 'offset': 20},
 {'text': 'Pagina', 'offset': 40},
 {'text': '1', 'offset': 48},
 {'text': '/1', 'offset': 50},
 {'text': 'Opdrachtbevestigingsnr', 'offset': 53},
 {'text': '.', 'offset': 75},
 {'text': '0045478233', 'offset': 77}]

In [229]:
stride = 10
context_lenght = 40

In [230]:
%%time
sentences = []
offset_mappings = []
for x in range(0,len(tokens),stride):
    print('*'*100)
    text = ' '.join([token['text'] for token in tokens[x:x+context_lenght]])
    print(text)
    sentences.append(Sentence(text))
    
    offset_mapping = []
    offset_this = 0
    for token in tokens[x:x+context_lenght]:
        offset_mapping.append((offset_this,token['offset']))
        offset_this += len(token['text'])+1
    offset_mappings.append(offset_mapping)
    print(len(text))
    print(offset_mapping)

****************************************************************************************************
TTI Central Europe Opdrachtbevestiging Pagina 1 /1 Opdrachtbevestigingsnr . 0045478233 Besteldatum 04.08.2020 Klantnr . 0400033121 Techtronic Industries Central Europe GmbH , Postfach 100833 , DE-40708 Hilden Inkoopordernr . 500-15221 Inkooporder datum 04.08.2020 Contactpersoon TTI Adnane Boujnane Factuuradres Leveringsadres VAN MARCKE
337
[(0, 1), (4, 5), (12, 13), (19, 20), (39, 40), (46, 48), (48, 50), (51, 53), (74, 75), (76, 77), (87, 88), (99, 100), (110, 111), (118, 118), (120, 120), (131, 131), (142, 142), (153, 153), (161, 161), (168, 168), (173, 172), (175, 174), (184, 183), (191, 189), (193, 191), (202, 200), (209, 207), (223, 220), (225, 222), (235, 232), (247, 244), (253, 250), (264, 261), (279, 276), (283, 280), (290, 287), (299, 296), (312, 309), (327, 324), (331, 328)]
****************************************************************************************************
Be

In [231]:
%%time

# load the model you trained
model = SequenceTagger.load('flair/models/first_test/best-model.pt')

# predict tags and print
model.predict(sentences,all_tag_prob=True)



2020-08-14 16:00:07,426 loading file flair/models/first_test/best-model.pt
CPU times: user 9.42 s, sys: 612 ms, total: 10 s
Wall time: 3.45 s


In [232]:
def find_offset(offset_mapping,x):
    for i,om in enumerate(offset_mapping):
        if om[0]>x:
            break
    om = offset_mapping[i-1]
    return om[1] - om[0] + x

In [233]:
from collections import defaultdict
labels = defaultdict(list)

for sentence,offset_mapping in zip(sentences,offset_mappings):
    print('*'*100)
    print(sentence)
    
    tags = sentence.to_dict(tag_type='ner')
    
    for entitie in tags['entities']:
        print(entitie)
    
        start = find_offset(offset_mapping,entitie['start_pos'])
        end = find_offset(offset_mapping,entitie['end_pos'])
        label = entitie['labels'][0].value
        score = entitie['labels'][0].score
        
        labels[(label,start,end)].append(score) 

****************************************************************************************************
Sentence: "TTI Central Europe Opdrachtbevestiging Pagina 1 /1 Opdrachtbevestigingsnr . 0045478233 Besteldatum 04.08.2020 Klantnr . 0400033121 Techtronic Industries Central Europe GmbH , Postfach 100833 , DE-40708 Hilden Inkoopordernr . 500-15221 Inkooporder datum 04.08.2020 Contactpersoon TTI Adnane Boujnane Factuuradres Leveringsadres VAN MARCKE"   [− Tokens: 40  − Token-Labels: "TTI Central Europe Opdrachtbevestiging Pagina 1 /1 Opdrachtbevestigingsnr . 0045478233 Besteldatum 04.08.2020 Klantnr . 0400033121 Techtronic Industries Central Europe GmbH , Postfach 100833 , DE-40708 Hilden Inkoopordernr . 500-15221 <REFERENTIE> Inkooporder datum 04.08.2020 Contactpersoon TTI Adnane Boujnane Factuuradres Leveringsadres VAN MARCKE"]
{'text': '500-15221', 'start_pos': 225, 'end_pos': 234, 'labels': [REFERENTIE (0.9997)]}
*************************************************************************

In [234]:
labels

defaultdict(list,
            {('REFERENTIE', 222, 231): [0.9997028708457947,
              0.9986907839775085,
              0.9983943104743958],
             ('LEVERDATUM', 651, 661): [0.9842538833618164,
              0.9931427240371704,
              0.9974974989891052,
              0.9670703411102295]})

In [236]:
labels = {key:sum(value)/len(value) for key,value in labels.items()}

In [237]:
labels

{('REFERENTIE', 222, 231): 0.9989293217658997,
 ('LEVERDATUM', 651, 661): 0.9854911118745804}

In [None]:
# Return list of named entities (dict):
#         # "label"
#         # "start"
#         # "end"
#         # "score"
#         # "context

In [242]:
results = []
for key,value in labels.items():
    print(label)
    
    results.append({
        'label':key[0],
        'start':key[1],
        'end':key[2],
        'score':value,
        'context':pdf_txt[key[1]:key[2]],
    })

(('LEVERDATUM', 651, 661), 0.9854911118745804)
(('LEVERDATUM', 651, 661), 0.9854911118745804)


In [243]:
results

[{'label': 'REFERENTIE',
  'start': 222,
  'end': 231,
  'score': 0.9989293217658997,
  'context': '500-15221'},
 {'label': 'LEVERDATUM',
  'start': 651,
  'end': 661,
  'score': 0.9854911118745804,
  'context': '06.08.2020'}]

In [None]:
'Opdrachtbevestiging_0045478233_20200804.PDF'


In [None]:
# [{'label': 'REFERENTIE',
#   'start': 222,
#   'end': 231,
#   'score': 0.9989293217658997,
#   'context': '500-15221'},
#  {'label': 'LEVERDATUM',
#   'start': 651,
#   'end': 661,
#   'score': 0.9854911118745804,
#   'context': '06.08.2020'}]