In [1]:
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy
from spacy.training import Example
from spacy.scorer import Scorer
import re
import os
from spacy.language import Language
from prettytable import PrettyTable
from IPython.display import clear_output
from collections import Counter

In [2]:
EXPRESIONES_REGULARES = {
    "EMAIL": "((?<=^)|(?<=\\s))([\\w-]+(\\.[\\w-]+)*@([\\w-]+\\.)+\\w+)(?=$|[\\s\\.])",
    "DNI": "((?<=^)|(?<=\\s))(([A-Za-z][- ]?((\\d\\.?\\d{3}\\.?\\d{3}[ -]?[A-Za-z])|(\\d{2}\\.?\\d{3}\\.?\\d{3})))|(\\d{2}\\.?\\d{3}\\.?\\d{3}[\\. -]?[A-Za-z])|([XxYyZz][- ]?\\d{8}[- ]?[A-Za-z]))(?=$|[\\s\\.])",
    "CUENTA_BANCARIA": "ES[\\s.-]?\\d{2}[\\s.-]?\\d{4}[\\s.-]?\\d{4}[\\s.-]?\\d{4}[\\s.-]?\\d{4}[\\s.-]?\\d{4}",
    "MATRICULA": "\\b(?=\\w)\\d{4}\\s?-?\\s?[b-df-hj-ñp-tv-zB-DF-HJ-ÑP-TV-Z]{3,4}\\b(?<=\\w)",
    "CSV": "\\w{5}-\\w{5}-\\w{5}",
    "URL": "https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)",
    "REF_CATASTRAL": "\\b(?=\\w)(\\d{7}\\s?\\w{2}\\s?\\d{4}\\s?\\w\\s?\\d{4}\\s?\\w{2}|\\d{5}\\s?\\w\\s?\\d{12}\\s?\\w{2})\\b(?<=\\w)",
    "TELEFONO": "((?<=^)|(?<=\\s))[6789]((((\\d{2}(?P<sep>[-. ]?)\\d{2})|(\\d{1}(?P<sep2>[-. ]?)\\d{3}))((?P=sep)|(?P=sep2))?\\d{2}((?P=sep)|(?P=sep2))?\\d{2}))(?=$|[\\s\\.])",
}

In [3]:
@Language.component("regex")
def regexComponent(doc):
        
    text = doc.text
    regexes = EXPRESIONES_REGULARES
    
    chars_to_tokens = {}
    
    for token in doc:
        for i in range(token.idx, token.idx + len(token.text)):
            chars_to_tokens[i] = token.i
    for label, regex in regexes.items():
        for match in re.finditer(re.compile(regex), text):
            start, end = match.span()
            span = doc.char_span(start, end, label=label)
            if span is not None:
                if span not in doc.ents:
                    try:
                        doc.ents += (span,)
                    except ValueError as e:
                        print('Error ({}): {} {}\n{}'.format(str(1), span.label_, span, e))
            else:
                start_token = chars_to_tokens.get(start)
                end_token = chars_to_tokens.get(end)

                if start_token is not None and end_token is not None:
                    span = Span(doc, start_token, end_token + 1, label=label)
                    try:
                        doc.ents += (span,)
                    except ValueError as e:
                        pass
                else:
                    pass
    return doc

In [4]:
def getListaDocsAnotados():
    path_doc_anotados = "annotated/"
    
    lista_anotados = os.listdir(path_doc_anotados)
    lista_anotados = [x for x in lista_anotados if re.match(r"^doc_*", x)]
    print("Docs anotados totales: ", len(lista_anotados) )
    
    path_test = "test/"
    lista_test = os.listdir(path_test)
    lista_test = [x for x in lista_test if re.match(r"^doc_*", x)]
    
    lista_documentos_anotados = [x.replace(".spacy", ".tsv") for x in lista_test if x.replace(".spacy", ".tsv") in lista_anotados]
    print("Docs Anotados TEST: ", len(lista_documentos_anotados) )
    return lista_documentos_anotados

In [5]:
def cargarDocAnotado(ficheroAnotado):
        
    palabras, entidades = zip(* [linea.rstrip().split("\t") for linea in open(ficheroAnotado, "r", encoding = "utf-8") if linea != "\n"] )
    
    palabras, entidades = list(palabras), list(entidades)
    
    doc = Doc(nlp.vocab, words = palabras, ents = entidades)
            
    return doc

def cargarTextoDocumento(fichero):
    with open(fichero, 'r', encoding='utf-8') as file:
        text = ' '.join([x.rstrip() for x in file])
    return text        

In [6]:
nlp = spacy.load("./MODELOS/modelE/")

print("Pipeline: ", nlp.pipe_names)

Pipeline:  ['tok2vec', 'ner']


In [7]:
COLORS = {"NOMBRE": "#E6B0AA", 
          "APELLIDO": "#73C1A2",
          "DNI": "#AF601A",
          "DIRECCION":"#D2B4DE ", 
          "CIUDAD":"#A9CCE3", 
          "PROVINCIA":"#AED6F1",
          "CP": "#F5B7B1", 
          "TELEFONO": "#A3E4D7" , 
          "REF_CATASTRAL":"#F9E79F", 
          "SEGURIDAD_SOCIAL":"#FAD7A0", 
          "CUENTA_BANCARIA": "#5DADE2", 
          "EMAIL": "#EDBB99", 
          "MATRICULA": "#5D6D7E",
          "CSV": "#DAF7A6",
          "URL": "#FF5733",
          }
ENTITIES = list( COLORS.keys() )
options = {"ents": ENTITIES, "colors": COLORS}

### Métricas sin RegEX

In [8]:
lista_documentos_anotados = getListaDocsAnotados()
lista_documentos_anotados[:3]

Docs anotados totales:  309
Docs Anotados TEST:  62


['doc_38.txt.tsv', 'doc_273.txt.tsv', 'doc_191.txt.tsv']

In [9]:
ejemplos = []

for idx, documento in enumerate(lista_documentos_anotados):
    print(f"Procesando doc {idx + 1} de {len(lista_documentos_anotados)}")
    clear_output(True)
    anotado = cargarDocAnotado(f"annotated/{documento}")
    prediccion = nlp( anotado.text )
    ejemplos.append( Example(prediccion, anotado) )

Procesando doc 62 de 62


In [10]:
scorer = Scorer(nlp)
resultados = scorer.score(ejemplos)

In [11]:
metricas_globales_tabla = PrettyTable(["PRECISIÓN", "RECALL", "F1"])

metricas_globales_tabla.add_row( [ round(resultados["ents_p"], 3), round(resultados["ents_r"], 3), round(resultados["ents_f"], 3)] ) 

print("SIN REGEX")
print(metricas_globales_tabla)

SIN REGEX
+-----------+--------+-------+
| PRECISIÓN | RECALL |   F1  |
+-----------+--------+-------+
|   0.953   |  0.93  | 0.942 |
+-----------+--------+-------+


In [12]:
metricas_entidad_tabla = PrettyTable(["ENTIDAD", "PRECISIÓN", "RECALL", "F1"])

for entidad, metricas in resultados["ents_per_type"].items():
    metricas_entidad_tabla.add_row( [entidad, round(metricas["p"], 3), round(metricas["r"], 3), round(metricas["f"], 3)] )
    
print(metricas_entidad_tabla)

+-----------------+-----------+--------+-------+
|     ENTIDAD     | PRECISIÓN | RECALL |   F1  |
+-----------------+-----------+--------+-------+
|      NOMBRE     |   0.953   | 0.961  | 0.957 |
|     APELLIDO    |   0.962   | 0.968  | 0.965 |
|       CSV       |   0.968   | 0.968  | 0.968 |
|      CIUDAD     |   0.995   | 0.987  | 0.991 |
|       URL       |    1.0    | 0.966  | 0.983 |
|    DIRECCION    |   0.887   | 0.588  | 0.707 |
| CUENTA_BANCARIA |   0.056   | 0.067  | 0.061 |
|       DNI       |   0.871   |  0.9   | 0.885 |
|        CP       |   0.929   | 0.788  | 0.852 |
|     TELEFONO    |    1.0    |  1.0   |  1.0  |
|    PROVINCIA    |    1.0    |  1.0   |  1.0  |
|      EMAIL      |    0.0    |  0.0   |  0.0  |
|  REF_CATASTRAL  |    0.0    |  0.0   |  0.0  |
|    MATRICULA    |    0.0    |  0.0   |  0.0  |
+-----------------+-----------+--------+-------+


### Métricas con RegEX

In [13]:
nlp = spacy.load("./MODELOS/modelE/")
nlp.add_pipe("regex", before = 'ner')

print("Pipeline: ", nlp.pipe_names)

Pipeline:  ['tok2vec', 'regex', 'ner']


In [14]:
lista_documentos_anotados = getListaDocsAnotados()
lista_documentos_anotados[:3]

Docs anotados totales:  309
Docs Anotados TEST:  62


['doc_38.txt.tsv', 'doc_273.txt.tsv', 'doc_191.txt.tsv']

In [15]:
ejemplos = []

for idx, documento in enumerate(lista_documentos_anotados):
    print(f"Procesando doc {idx + 1} de {len(lista_documentos_anotados)}")
    clear_output(True)
    anotado = cargarDocAnotado(f"annotated/{documento}")
    prediccion = nlp( anotado.text )
    ejemplos.append( Example(prediccion, anotado) )

Procesando doc 62 de 62


In [16]:
scorer = Scorer(nlp)
resultados = scorer.score(ejemplos)

In [17]:
metricas_globales_tabla = PrettyTable(["PRECISIÓN", "RECALL", "F1"])

metricas_globales_tabla.add_row( [ round(resultados["ents_p"], 3), round(resultados["ents_r"], 3), round(resultados["ents_f"], 3)] ) 

print("CON REGEX")
print(metricas_globales_tabla)

CON REGEX
+-----------+--------+-------+
| PRECISIÓN | RECALL |   F1  |
+-----------+--------+-------+
|   0.936   | 0.949  | 0.945 |
+-----------+--------+-------+


In [18]:
metricas_entidad_tabla = PrettyTable(["ENTIDAD", "PRECISIÓN", "RECALL", "F1"])

for entidad, metricas in resultados["ents_per_type"].items():
    metricas_entidad_tabla.add_row( [entidad, round(metricas["p"], 3), round(metricas["r"], 3), round(metricas["f"], 3)] )
    
print(metricas_entidad_tabla)

+-----------------+-----------+--------+-------+
|     ENTIDAD     | PRECISIÓN | RECALL |   F1  |
+-----------------+-----------+--------+-------+
|      NOMBRE     |   0.953   | 0.961  | 0.957 |
|     APELLIDO    |   0.962   | 0.968  | 0.965 |
|       CSV       |   0.968   | 0.968  | 0.968 |
|      CIUDAD     |   0.995   | 0.987  | 0.991 |
|     TELEFONO    |    1.0    |  1.0   |  1.0  |
|       URL       |    1.0    | 0.966  | 0.983 |
|    DIRECCION    |   0.887   | 0.588  | 0.707 |
|    MATRICULA    |    1.0    |  1.0   |  1.0  |
| CUENTA_BANCARIA |    1.0    |  0.8   | 0.889 |
|       DNI       |   0.915   | 0.973  | 0.942 |
|        CP       |   0,929   | 0.788  | 0.852 |
|    PROVINCIA    |    1.0    |  1.0   |  1.0  |
|      EMAIL      |    1.0    |  1.0   |  1.0  |
|  REF_CATASTRAL  |    1.0    | 0.812  | 0.891 |
+-----------------+-----------+--------+-------+
