In [20]:
import spacy
from spacy.tokens import DocBin
from spacy import displacy
from spacy.util import filter_spans
import pickle
import fitz
import pathlib as pl
import random
import re
from rich import print as prt

In [2]:
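# Disabled debugging snippet: prints the text and label of every entity that survives
# the v2 -> v3 conversion. `utils` and `train_data` must already be defined before
# un-commenting it, and the conversion call also writes train.spacy to disk.
# for i in utils.v2Tov3Converter(train_data):
#     for j in i.ents:
#         prt(j.text, " ", j.label_)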
    

In [3]:
# Load the pickled training set.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# The `with` block guarantees the file handle is closed once the data has been read.
with open('train_data/train_data.pkl', 'rb') as pkl_file:
    train_data = pickle.load(pkl_file)

In [10]:
class utils:
    """Helpers for preparing spaCy v2-style NER training data and converting it to v3.

    The training data handled here is a sequence of ``(text, annotations)`` pairs where
    ``annotations['entities']`` is a sequence of ``(start, end, label)`` character
    offsets into ``text``. All helpers are static and are called on the class itself,
    e.g. ``utils.v2Tov3Converter(train_data)``.
    """

    @staticmethod
    def prepareData(train_data):
        """Return the training data with every tuple converted to a list.

        Args:
            train_data: iterable of ``(text, annotations)`` pairs.

        Returns:
            list: ``[text, annotations]`` lists whose ``'entities'`` value is a list of
            ``[start, end, label]`` lists. The input annotations are not modified.
        """
        data = []
        for text, annot in train_data:
            entities = [[strt, end, lbl] for strt, end, lbl in annot['entities']]
            # Build a new dict instead of assigning into `annot`, so the caller's
            # data is left untouched.
            data.append([text, {**annot, 'entities': entities}])
        return data

    @staticmethod
    def renderData(data, start=0, end=1, serve=False):
        """Visualise the entities of ``data[start:end]`` with displaCy.

        Args:
            data: sequence of ``(text, annotations)`` pairs.
            start: index of the first example to show.
            end: index one past the last example to show.
            serve: when true use ``displacy.serve``, otherwise ``displacy.render``.

        Every example in the slice is shown. Nothing is rendered when the slice is empty.
        """
        nlp = spacy.blank('en')
        docs = []
        for text, annotations in data[start:end]:
            doc = nlp.make_doc(text)
            spans = []
            # Distinct loop names: the slice bounds `start`/`end` must not be shadowed.
            for ent_start, ent_end, label in annotations['entities']:
                span = doc.char_span(ent_start, ent_end, label=label)
                # char_span yields None when the offsets cannot be mapped to a span.
                if span is not None:
                    spans.append(span)
            # Overlapping spans cannot be assigned to doc.ents; keep the filtered set,
            # as the converter below does.
            doc.ents = filter_spans(spans)
            docs.append(doc)
        if not docs:
            return
        if serve:
            displacy.serve(docs, style='ent')
        else:
            displacy.render(docs, style='ent')

    @staticmethod
    def remove_whitespace_entities(doc):
        """Drop entities whose text consists only of whitespace and return ``doc``."""
        doc.ents = [e for e in doc.ents if not e.text.isspace()]
        return doc

    # convert training dataset from v2 to v3 using docbin
    # note: filter_spans is used to get rid of the overlapping-span errors
    @staticmethod
    def v2Tov3Converter(data, filename="train", alnum_only=True):
        """Convert v2-style training data to spaCy v3 docs and save them as a DocBin.

        Args:
            data: iterable of ``(text, annotations)`` pairs.
            filename: output path without extension; ``".spacy"`` is appended.
            alnum_only: when true (the default, matching the historical behaviour)
                only entities made up purely of alphanumeric characters are kept.
                NOTE(review): this drops every multi-word or punctuated entity
                (anything containing a space, dot, hyphen, ...). Pass ``False`` to
                keep them; then only empty spans and spans with leading or trailing
                whitespace are rejected.

        Returns:
            list: the docs stored in the DocBin, read back with the blank pipeline's vocab.
        """
        nlp = spacy.blank("en")
        # the DocBin will store the example documents
        db = DocBin()
        for text, annotations in data:
            doc = nlp.make_doc(text)
            ents = []
            for start, end, label in annotations['entities']:
                span = doc.char_span(start, end, label=label)
                # Offsets that cannot be mapped to a span are skipped.
                if span is None:
                    continue
                span_text = span.text
                if alnum_only:
                    # str.isalnum() is false for empty, whitespace-only and
                    # whitespace-padded text, so no separate checks are needed.
                    if not span_text.isalnum():
                        continue
                elif not span_text or span_text != span_text.strip():
                    continue
                ents.append(span)

            # filter_spans resolves overlaps so the assignment to doc.ents is valid.
            doc.ents = filter_spans(ents)
            db.add(doc)
        db.to_disk(filename + ".spacy")
        return list(db.get_docs(nlp.vocab))

    @staticmethod
    def trim_entity_spans(data: list) -> list:
        """Remove leading and trailing whitespace from entity spans.

        Args:
            data (list): ``(text, annotations)`` pairs in spaCy JSON format.

        Returns:
            list: ``[text, {'entities': [[start, end, label], ...]}]`` entries with
            trimmed offsets. Offsets are clamped to the bounds of ``text`` and entities
            that contain nothing but whitespace are dropped.
        """
        invalid_span_tokens = re.compile(r'\s')
        cleaned_data = []
        for text, annotations in data:
            valid_entities = []
            for start, end, label in annotations['entities']:
                # Clamp first so an offset past the end of the text cannot raise.
                valid_start = max(start, 0)
                valid_end = min(end, len(text))
                # Move the start forward past whitespace, never beyond the end.
                while valid_start < valid_end and invalid_span_tokens.match(
                        text[valid_start]):
                    valid_start += 1
                # Move the end backward past whitespace, never before the start.
                while valid_end > valid_start and invalid_span_tokens.match(
                        text[valid_end - 1]):
                    valid_end -= 1
                # An empty range means the entity was whitespace only.
                if valid_start < valid_end:
                    valid_entities.append([valid_start, valid_end, label])
            cleaned_data.append([text, {'entities': valid_entities}])
        return cleaned_data

In [11]:
# Convert the loaded annotations into spaCy v3 docs; the call also writes train.spacy.
docs = utils.v2Tov3Converter(data=train_data, filename="train")

In [26]:
# Generate config.cfg, a training config for an English pipeline whose only component is
# `ner`. --force overwrites an existing config.cfg.
!spacy init config --lang en --pipeline ner config.cfg --force

ℹ Generated config template specific for your use case
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train the pipeline described by config.cfg and save the model under ./trained_model/.
# NOTE(review): --paths.dev points at the same train.spacy file as --paths.train, so the
# scores reported during training are not measured on held-out data.
!python -m spacy train config.cfg --output ./trained_model/ --paths.train ./train.spacy --paths.dev ./train.spacy

## Loading and testing the trained model

In [43]:
class tools:
    """Helpers for loading the trained pipeline and running it on the text of PDF files.

    All helpers are static and are called on the class itself, e.g. ``tools.loadPdfs()``.
    """

    @staticmethod
    def loadModel(model_dir = './trained_model/model-last'):
        """Load and return the spaCy pipeline stored in ``model_dir``."""
        return spacy.load(model_dir)

    @staticmethod
    def loadPdfs(dir='./data for testing'):
        """Return the ``*.pdf`` paths directly inside ``dir`` as a sorted list.

        Sorting makes positional indexing into the result reproducible; the order in
        which ``Path.glob`` yields entries is not guaranteed.
        """
        return sorted(pl.Path(dir).glob("*.pdf"))

    @staticmethod
    def extractTextFromPdf(path, pdf_number):
        """Return the text of one PDF with newlines replaced by spaces.

        Args:
            path: indexable collection of PDF paths, e.g. the result of ``loadPdfs``.
            pdf_number: index into ``path`` of the file to read.

        Returns:
            str: the text of all pages concatenated, with every ``'\\n'`` turned into
            a single space.
        """
        page_texts = []
        # The context manager closes the document even if extraction fails.
        with fitz.open(path[pdf_number]) as pdf:
            for page in pdf:
                # PyMuPDF renamed Page.getText to Page.get_text; prefer the new name
                # and fall back to the old one on releases that predate it.
                extract = getattr(page, "get_text", None) or page.getText
                page_texts.append(str(extract()))
        return "".join(page_texts).replace('\n', ' ')

    @staticmethod
    def render(text, model=None):
        """Run a pipeline on ``text`` and display its entities with displaCy.

        Args:
            text: the text to analyse.
            model: pipeline to use; when omitted the module-level ``nlp`` is used,
                which must already have been defined.
        """
        pipeline = nlp if model is None else model
        displacy.render(pipeline(text), style='ent')
    
    

In [45]:
# Load the trained pipeline. The name `nlp` has to be a global: tools.render reads it.
nlp = tools.loadModel(model_dir='./trained_model/model-last')

# Collect the test PDFs, extract the text of the one at index 1 and show its entities.
pdfs = tools.loadPdfs(dir='./data for testing')
text = tools.extractTextFromPdf(path=pdfs, pdf_number=1)
tools.render(text=text)