In [1]:
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
import os
from pathlib import Path
import re
import pickle
import random
import numpy as np
import eli5

NUM_JANELA=4

In [5]:
def getTiposEntidade():
    return ['Problema','Teste','Tratamento','Anatomia']
    
def replaceWhiteSpaces(str):
    return re.sub('\s{2,}',' ',str)

def save_obj(name, obj):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def load_obj(name):
    print('Load obj em: ', 'obj/' + name + '.pkl')
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


In [23]:
def read_clusters(cluster_file):
    word2cluster = {}
    try:
        with open(cluster_file, encoding='utf-8') as i:
            for num, line in enumerate(i):
                if line:
                    word, cluster = line.strip().split('\t')
                    word2cluster[word] = cluster
    except:
        print(line)
        print(num)
        raise
    return word2cluster

def word2features(sent, i):
    word = sent[i][0]
    postag = tipoPostaggerTokens(word, dicPostagger)
    cluster = word2cluster[word.lower()] if word.lower() in word2cluster else "0"
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[:3]': word[:3],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'word.cluster': cluster
    }
    temFeatureOrdemPalavra = False
    temFeatureOrdemPalavraFinal = False
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:word.cluster': cluster
        })
    else:
        features['BOS'] = True
        temFeatureOrdemPalavra = True
    
    if i > 1:
        word1 = sent[i-2][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:word.istitle()': word1.istitle(),
            '-2:word.isupper()': word1.isupper(),
            '-2:postag': postag1,
            '-2:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Segunda_palavra'] = True
            temFeatureOrdemPalavra = True

    if i > 2:
        word1 = sent[i-3][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-3:word.lower()': word1.lower(),
            '-3:word.istitle()': word1.istitle(),
            '-3:word.isupper()': word1.isupper(),
            '-3:postag': postag1,
            '-3:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Terceira_palavra'] = True
            temFeatureOrdemPalavra = True
    
    if i > 3:
        word1 = sent[i-4][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-4:word.lower()': word1.lower(),
            '-4:word.istitle()': word1.istitle(),
            '-4:word.isupper()': word1.isupper(),
            '-4:postag': postag1,
            '-4:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Quarta_palavra'] = True
            temFeatureOrdemPalavra = True
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:word.cluster': cluster
        })
    else:
        features['EOS'] = True
        temFeatureOrdemPalavraFinal = True
    if i < len(sent)-2:
        word1 = sent[i+2][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:word.istitle()': word1.istitle(),
            '+2:word.isupper()': word1.isupper(),
            '+2:postag': postag1,
            '+2:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Ultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    if i < len(sent)-3:
        word1 = sent[i+3][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+3:word.lower()': word1.lower(),
            '+3:word.istitle()': word1.istitle(),
            '+3:word.isupper()': word1.isupper(),
            '+3:postag': postag1,
            '+3:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Penultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    
    if i < len(sent)-4:
        word1 = sent[i+4][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+4:word.lower()': word1.lower(),
            '+4:word.istitle()': word1.istitle(),
            '+4:word.isupper()': word1.isupper(),
            '+4:postag': postag1,
            '+4:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Antepenultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
       

def sent2labels(sent):
    try:
        return [label for token, label in sent]
    except:
        print(sent)
        raise
        

def sent2tokens(sent):
    return [token for token, postag, label in sent]

#word2cluster = read_clusters(r"clusters/cluster-50.tsv")
word2cluster = read_clusters(r"clusters/cluster-300.tsv")


In [24]:
dicPostagger = load_obj('../spanclassification/obj/dic_postagger')
def tipoPostaggerTokens(token, dicPostagger):
    postagger = 'N' # na duvida é N
    if token.lower() in dicPostagger.keys():
        postagger = dicPostagger.get(token.lower())
    return postagger
tipoPostaggerTokens('coração', dicPostagger)

Load obj em:  obj/../spanclassification/obj/dic_postagger.pkl


'N'

In [439]:
dic_sentencesTrain = load_obj('../spanclassification/obj/dic_sentencesTrain')
dic_sentencesDev = load_obj('../spanclassification/obj/dic_sentencesDev')
dic_sentencesTest = load_obj('../spanclassification/obj/dic_sentencesTestNested')
dic_sentencesTest[0]

Load obj em:  obj/../spanclassification/obj/dic_sentencesTrain.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesDev.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesTestNested.pkl


[[['Lucas', 0, 43],
  [',', 1, 48],
  ['74', 2, 50],
  ['anos', 3, 53],
  ['.', 4, 57]],
 []]

In [239]:
len(dic_sentencesTest)

506

In [277]:

def gravarArquivosBinarios_OLD(dic_sentences, tipo):
    # gerar arquivo treinamento
    f_entidade = open(r'crf/nested_'+tipo+'.conll', 'w', encoding='utf-8')

    num_entidade_total=0
    num_entidade=0

    # TODO - refazer.. qdo vem entidade isolada, nao está gravando...
    print('\nGravando arquivo de {} '.format(tipo))
    numKeysDuplicadosTeste=[]
    numKeys2DuplicadosTeste=[]
    j=-1
    for i in range(len(dic_sentences)):
        tokens = dic_sentences[i][0]
        ents = dic_sentences[i][1]
        indiceEnts=[]
        duplicaFrase=False
        entidadesUsadas=[]
        for token in tokens:
            #print('token:', token)
            indiceToken = token[1]
            temEntidade=False
            tag='O'
            for ent in ents:
                #print(ent)
                if indiceToken in ent[1]: #and ent[2]==entidade:
                    if not temEntidade:
                        tag = ent[2]
                        num_entidade=num_entidade+1
                        temEntidade = True
                        entidadesUsadas.append(ent[1])
                    else:
                        #print('aaaaaaaaa:', ents)
                        duplicaFrase = True
                        break
                        
                    #break
            #if tag != entidade:
            #    tag='O'
            tokenGravar = token[0].replace(' ','')
            tokenGravar = tokenGravar.strip()
            f_entidade.write(tokenGravar+' '+tag+'\n')
            num_entidade_total=num_entidade_total+1
        f_entidade.write('\n')
        j=j+1
        
        if duplicaFrase:
            if tipo=='test':
                numKeysDuplicadosTeste.append(i)
                numKeys2DuplicadosTeste.append(j)
            for token in tokens:
                #print('token:', token)
                indiceToken = token[1]
                tag='O'
                for ent in ents:
                    #print(ent)
                    if indiceToken in ent[1] and indiceToken not in entidadesUsadas:
                        tag = ent[2]
                        num_entidade=num_entidade+1
                        temEntidade = True
                tokenGravar = token[0].replace(' ','')
                tokenGravar = tokenGravar.strip()
                f_entidade.write(tokenGravar+' '+tag+'\n')
                num_entidade_total=num_entidade_total+1
            f_entidade.write('\n')
            j=j+1
        #if i>15:
        #    break
    f_entidade.close()

    print('num_entidade:', num_entidade)
    print('num_entidade_total:', num_entidade_total)
    return numKeysDuplicadosTeste, numKeys2DuplicadosTeste

#numKeysDuplicadosTeste, numKeys2DuplicadosTeste = gravarArquivosBinarios(dic_sentencesTest, 'test')
#_, _ = gravarArquivosBinarios(dic_sentencesTrain, 'train')
#_, _ = gravarArquivosBinarios(dic_sentencesDev, 'dev')


Gravando arquivo de test 
num_entidade: 2391
num_entidade_total: 6663

Gravando arquivo de train 
num_entidade: 6406
num_entidade_total: 16828

Gravando arquivo de dev 
num_entidade: 1555
num_entidade_total: 4777


In [440]:
# para teste. nao preciso replicar
# senao ficad dificil depois juntar tudo
def gravarArquivosBinarios(dic_sentences, tipo):
    # gerar arquivo treinamento
    f_entidade = open(r'crf/nested_'+tipo+'.conll', 'w', encoding='utf-8')

    num_entidade_total=0
    num_entidade=0

    # TODO - refazer.. qdo vem entidade isolada, nao está gravando...
    print('\nGravando arquivo de {} '.format(tipo))
    numKeysDuplicadosTeste=[]
    numKeys2DuplicadosTeste=[]
    j=-1
    for i in range(len(dic_sentences)):
        tokens = dic_sentences[i][0]
        ents = dic_sentences[i][1]
        indiceEnts=[]
        duplicaFrase=False
        entidadesUsadas=[]
        for token in tokens:
            #print('token:', token)
            indiceToken = token[1]
            temEntidade=False
            tag='O'
            for ent in ents:
                #print(ent)
                if indiceToken in ent[1]: #and ent[2]==entidade:
                    if not temEntidade:
                        tag = ent[2]
                        num_entidade=num_entidade+1
                        temEntidade = True
                        entidadesUsadas.append(ent[1])
                    else:
                        #print('aaaaaaaaa:', ents)
                        duplicaFrase = True
                        break
                        
                    #break
            #if tag != entidade:
            #    tag='O'
            tokenGravar = token[0].replace(' ','')
            tokenGravar = tokenGravar.strip()
            f_entidade.write(tokenGravar+' '+tag+'\n')
            num_entidade_total=num_entidade_total+1
        f_entidade.write('\n')
        j=j+1
        
        if tipo!='test':
            if duplicaFrase:
                if tipo=='test':
                    numKeysDuplicadosTeste.append(i)
                    numKeys2DuplicadosTeste.append(j)
                for token in tokens:
                    #print('token:', token)
                    indiceToken = token[1]
                    tag='O'
                    for ent in ents:
                        #print(ent)
                        if indiceToken in ent[1] and indiceToken not in entidadesUsadas:
                            tag = ent[2]
                            num_entidade=num_entidade+1
                            temEntidade = True
                    tokenGravar = token[0].replace(' ','')
                    tokenGravar = tokenGravar.strip()
                    f_entidade.write(tokenGravar+' '+tag+'\n')
                    num_entidade_total=num_entidade_total+1
                f_entidade.write('\n')
                j=j+1
        #if i>15:
        #    break
    f_entidade.close()

    print('num_entidade:', num_entidade)
    print('num_entidade_total:', num_entidade_total)
    return numKeysDuplicadosTeste, numKeys2DuplicadosTeste

numKeysDuplicadosTeste, numKeys2DuplicadosTeste = gravarArquivosBinarios(dic_sentencesTest, 'test')
_, _ = gravarArquivosBinarios(dic_sentencesTrain, 'train')
_, _ = gravarArquivosBinarios(dic_sentencesDev, 'dev')


Gravando arquivo de test 
num_entidade: 1647
num_entidade_total: 5463

Gravando arquivo de train 
num_entidade: 6406
num_entidade_total: 16828

Gravando arquivo de dev 
num_entidade: 1555
num_entidade_total: 4777


In [441]:
print(len(numKeysDuplicadosTeste))
numKeysDuplicadosTeste[:10]

0


[]

In [442]:
print(len(numKeys2DuplicadosTeste))
numKeys2DuplicadosTeste[:10]

0


[]

In [445]:
dic_sentencesTest[13]

[[['Ecocardiograma', 0, 934],
  ['-', 1, 949],
  ['ventrículo', 2, 951],
  ['esquerdo', 3, 962],
  ['com', 4, 971],
  ['hipertrofia', 5, 975],
  ['concentrica', 6, 987],
  ['de', 7, 999],
  ['grau', 8, 1002],
  ['discreto', 9, 1007],
  ['e', 10, 1016],
  ['função', 11, 1018],
  ['sistólica', 12, 1025],
  ['preservada', 13, 1035],
  ['.', 14, 1045]],
 [['Ecocardiograma', [0], 'Teste'],
  ['ventrículo esquerdo com hipertrofia concentrica de grau discreto',
   [2, 3, 4, 5, 6, 7, 8, 9],
   'Problema'],
  ['ventrículo esquerdo', [2, 3], 'Anatomia']]]

In [245]:
#pathTrain=r'../spanclassification/preProcessamento/data-ner-binario/nested_train.conll'
#pathDev=r'../spanclassification/preProcessamento/data-ner-binario/nested_dev.conll'
#pathTest=r'../spanclassification/preProcessamento/data-ner-binario/nested_test.conll'

tipos = getTiposEntidade()
#tipos=['Anatomia']
X_train = []
X_dev = []
X_test = []
y_train = []
y_dev = []
y_test = []

pathTrain=r'crf\nested_train.conll'
pathDev=r'crf\nested_dev.conll'
pathTest=r'crf\nested_test.conll'

with open(pathTest, encoding='utf-8') as f:
  testdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]
with open(pathDev, encoding='utf-8') as f:
  devdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]
with open(pathTrain, encoding='utf-8') as f:
  traindata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]

X_train = [sent2features(s) for s in traindata]
y_train = [sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]
#devdata[:2]
traindata[:2]

[[('Dispneia', 'Problema'),
  ('importante', 'Problema'),
  ('aos', 'Problema'),
  ('esforços', 'Problema'),
  ('+', 'O'),
  ('dor', 'Problema'),
  ('tipo', 'Problema'),
  ('peso', 'Problema'),
  ('no', 'Problema'),
  ('peito', 'Problema'),
  ('no', 'Problema'),
  ('esforço', 'Problema'),
  ('.', 'O')],
 [('Dispneia', 'Problema'),
  ('importante', 'Problema'),
  ('aos', 'Problema'),
  ('esforços', 'Problema'),
  ('+', 'O'),
  ('dor', 'Problema'),
  ('tipo', 'Problema'),
  ('peso', 'Problema'),
  ('no', 'Problema'),
  ('peito', 'Anatomia'),
  ('no', 'Problema'),
  ('esforço', 'Problema'),
  ('.', 'O')]]

In [250]:
testdata[13]

[('Ecocardiograma', 'Teste'),
 ('-', 'O'),
 ('ventrículo', 'Problema'),
 ('esquerdo', 'Problema'),
 ('com', 'Problema'),
 ('hipertrofia', 'Problema'),
 ('concentrica', 'Problema'),
 ('de', 'Problema'),
 ('grau', 'Problema'),
 ('discreto', 'Problema'),
 ('e', 'O'),
 ('função', 'O'),
 ('sistólica', 'O'),
 ('preservada', 'O'),
 ('.', 'O')]

In [251]:
testdata[14]

[('Ecocardiograma', 'Teste'),
 ('-', 'O'),
 ('ventrículo', 'Anatomia'),
 ('esquerdo', 'Anatomia'),
 ('com', 'Problema'),
 ('hipertrofia', 'Problema'),
 ('concentrica', 'Problema'),
 ('de', 'Problema'),
 ('grau', 'Problema'),
 ('discreto', 'Problema'),
 ('e', 'O'),
 ('função', 'O'),
 ('sistólica', 'O'),
 ('preservada', 'O'),
 ('.', 'O')]

In [252]:
testdata[15]

[('aumento', 'Problema'),
 ('moderado', 'Problema'),
 ('de', 'Problema'),
 ('átrio', 'Problema'),
 ('esquerdo', 'Problema'),
 ('.', 'O')]

In [262]:
testdata[19]

[('A', 'O'), (':', 'O'), ('FA', 'Problema'), ('.', 'O')]

In [240]:
len(testdata)

589

In [74]:
X_test[0]

[{'bias': 1.0,
  'word.lower()': 'lucas',
  'word[-3:]': 'cas',
  'word[:3]': 'Luc',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'N',
  'word.cluster': '0',
  'BOS': True,
  '+1:word.lower()': ',',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'PU',
  '+1:word.cluster': '22',
  '+2:word.lower()': '74',
  '+2:word.istitle()': False,
  '+2:word.isupper()': False,
  '+2:postag': 'NUM',
  '+2:word.cluster': '299',
  '+3:word.lower()': 'anos',
  '+3:word.istitle()': False,
  '+3:word.isupper()': False,
  '+3:postag': 'N',
  '+3:word.cluster': '134',
  '+4:word.lower()': '.',
  '+4:word.istitle()': False,
  '+4:word.isupper()': False,
  '+4:postag': 'PU',
  '+4:word.cluster': '153'},
 {'bias': 1.0,
  'word.lower()': ',',
  'word[-3:]': ',',
  'word[:3]': ',',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'PU',
  'word.cluster': '22',
  '-1:word.lower()': 'lucas',
  '-1:

In [75]:
y_test[0:2]

[['O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'Problema',
  'O',
  'O',
  'O',
  'Tratamento',
  'Tratamento',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [76]:
print(len(X_train))
print(len(y_train))
print(len(X_train[0]))

1541
1541
13


In [77]:
print(len(traindata))
print(len(X_train))
print(len(y_train))

1541
1541
1541


## Janela de 4 vizinhos antes e depois

Precisa reforçar os outros.. se mandar O, vai achar q é tudo O.. gera um arquivo só, só duplica a frase qdo tem nested..

In [78]:
import time

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False)

start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 1541/1541 [00:00<00:00, 2485.48it/s]





loading dev data to CRFsuite: 100%|██████████| 466/466 [00:00<00:00, 2251.18it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 33449
Seconds required: 0.197

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.06  loss=18005.74 active=33001 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.694 0.191  feature_norm=1.00
Iter 2   time=0.04  loss=16567.56 active=31276 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.694 0.191  feature_norm=1.07
Iter 3   time=0.03  loss=15273.65 active=30914 precision=0.205  recall=0.260  F1=0.221  Acc(item/seq)=0.602 0.152  feature_norm=2.04
Iter 4   time=0.03  loss=13563.52 active=32697 precision=0.201  recall=0.230  F1=0.213  Acc(item/seq)=0.663 0.191  feature_norm=1.94
Iter 5   time=0.03  loss=12821.43 active=32842 pr

Iter 61  time=0.04  loss=1408.29  active=15760 precision=0.815  recall=0.731  F1=0.759  Acc(item/seq)=0.870 0.567  feature_norm=52.36
Iter 62  time=0.03  loss=1407.23  active=15673 precision=0.821  recall=0.731  F1=0.763  Acc(item/seq)=0.873 0.573  feature_norm=52.39
Iter 63  time=0.04  loss=1406.16  active=15602 precision=0.814  recall=0.732  F1=0.760  Acc(item/seq)=0.870 0.567  feature_norm=52.43
Iter 64  time=0.04  loss=1405.22  active=15544 precision=0.821  recall=0.731  F1=0.762  Acc(item/seq)=0.873 0.571  feature_norm=52.46
Iter 65  time=0.04  loss=1404.27  active=15520 precision=0.815  recall=0.731  F1=0.760  Acc(item/seq)=0.870 0.564  feature_norm=52.50
Iter 66  time=0.04  loss=1403.38  active=15468 precision=0.818  recall=0.730  F1=0.761  Acc(item/seq)=0.871 0.569  feature_norm=52.53
Iter 67  time=0.03  loss=1402.46  active=15416 precision=0.826  recall=0.735  F1=0.767  Acc(item/seq)=0.873 0.567  feature_norm=52.57
Iter 68  time=0.03  loss=1401.65  active=15348 precision=0.828

In [79]:
import eli5

eli5.show_weights(crf, top=30)

From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,1.809,-1.481,-1.178,-1.155,0.169
O,-2.241,2.079,-1.182,-0.544,0.028
Problema,-0.419,-1.425,2.487,-1.546,-2.078
Teste,-0.568,0.24,-1.563,2.496,0.0
Tratamento,-0.404,-1.177,-3.009,-2.261,3.577

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.806,word.cluster:126,,,
+2.425,word.cluster:286,,,
+2.173,word.cluster:295,,,
+2.148,word.lower():ao,,,
+2.104,word[:3]:car,,,
+2.056,word.lower():cpp,,,
+1.981,word.lower():ae,,,
+1.981,word.cluster:170,,,
+1.980,word.lower():mmii,,,
+1.918,word.cluster:199,,,

Weight?,Feature
+3.806,word.cluster:126
+2.425,word.cluster:286
+2.173,word.cluster:295
+2.148,word.lower():ao
+2.104,word[:3]:car
+2.056,word.lower():cpp
+1.981,word.lower():ae
+1.981,word.cluster:170
+1.980,word.lower():mmii
+1.918,word.cluster:199

Weight?,Feature
+3.166,postag:PU
+2.759,word.cluster:93
+2.296,word.cluster:292
+2.246,word.cluster:23
+2.233,word[:3]:POS
+2.203,word.cluster:153
+2.103,word.cluster:69
+2.073,word.cluster:25
+2.039,postag:KC
+2.038,word.cluster:105

Weight?,Feature
+4.549,word.cluster:117
+3.151,word.lower():comorbidades
+2.848,word.cluster:118
+2.718,word.cluster:122
+2.326,-1:word.lower():sem
+2.317,word.cluster:255
+2.260,word.lower():ss
+2.183,word[:3]:DIS
+2.159,word[:3]:HIP
+2.040,word.cluster:200

Weight?,Feature
+6.067,word.cluster:284
+3.205,word.cluster:17
+2.877,word.cluster:260
+2.504,word.cluster:259
+2.387,word.lower():ecg
+2.267,word.cluster:289
+2.230,word.cluster:108
+2.195,word.cluster:193
+2.181,word[-3:]:ECG
+2.181,word[:3]:ECG

Weight?,Feature
+4.745,word.cluster:27
+3.262,word.cluster:211
+3.249,word.cluster:1
+2.421,word.cluster:273
+2.198,word.cluster:16
+2.096,+1:word.cluster:273
+1.937,word.lower():angioplastia
+1.877,word.lower():cx
+1.853,word[:3]:tra
+1.852,word.cluster:72


In [80]:
testdata[0]

[('Lucas', 'O'), (',', 'O'), ('74', 'O'), ('anos', 'O'), ('.', 'O')]

In [81]:
y_pred = crf.predict_marginals(X_test)
y_pred[:5]

[[{'Problema': 0.992177204220076,
   'O': 0.006201498917039438,
   'Anatomia': 0.0006639669346637978,
   'Teste': 0.00018642006944193484,
   'Tratamento': 0.000770909858778943},
  {'Problema': 0.005327598196854566,
   'O': 0.9946424779484917,
   'Anatomia': 1.7162040847125747e-05,
   'Teste': 4.415461413532976e-06,
   'Tratamento': 8.34635239313311e-06},
  {'Problema': 0.9243929585964284,
   'O': 0.06739710475763984,
   'Anatomia': 0.00034346724606564347,
   'Teste': 0.0006144088087414378,
   'Tratamento': 0.007252060591124852},
  {'Problema': 0.004553117716456589,
   'O': 0.9953848344421016,
   'Anatomia': 1.7008304938785828e-05,
   'Teste': 4.798204145419547e-06,
   'Tratamento': 4.024133235773012e-05},
  {'Problema': 0.014218262719198372,
   'O': 0.9856292423154552,
   'Anatomia': 5.134589099585765e-05,
   'Teste': 9.723048382459091e-05,
   'Tratamento': 3.918590526018474e-06},
  {'Problema': 0.9990616342973843,
   'O': 0.0003600983540736084,
   'Anatomia': 5.4253882977234525e-05,
 

In [446]:
pathTest=r'crf\nested_test.conll'

with open(pathTest, encoding='utf-8') as f:
  testdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]


X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]

y_pred = best_crf.predict_marginals(X_test)
y_pred[0]

[{'Problema': 0.0011584839962665168,
  'O': 0.9980480577339003,
  'Anatomia': 5.752104115307515e-05,
  'Teste': 0.00045762274215182906,
  'Tratamento': 0.00027831448652810897},
 {'Problema': 2.2338037049176784e-08,
  'O': 0.9999953321153664,
  'Anatomia': 2.3347718479893095e-08,
  'Teste': 2.2853150313672976e-06,
  'Tratamento': 2.3368838465482383e-06},
 {'Problema': 4.500118907650077e-06,
  'O': 0.999937567236606,
  'Anatomia': 1.0056091261506069e-07,
  'Teste': 8.184822083834582e-07,
  'Tratamento': 5.701360136545488e-05},
 {'Problema': 0.00018924865811912125,
  'O': 0.9997216080237941,
  'Anatomia': 5.7251179606799095e-06,
  'Teste': 2.062068141631455e-05,
  'Tratamento': 6.279751871016857e-05},
 {'Problema': 2.962716091909806e-08,
  'O': 0.9999982971380729,
  'Anatomia': 4.516306665675139e-07,
  'Teste': 1.0907518627124168e-06,
  'Tratamento': 1.3085223708718555e-07}]

In [447]:
y_pred[0][3]

{'Problema': 0.00018924865811912125,
 'O': 0.9997216080237941,
 'Anatomia': 5.7251179606799095e-06,
 'Teste': 2.062068141631455e-05,
 'Tratamento': 6.279751871016857e-05}

In [451]:

probabilities3 = []
for num, y in enumerate(y_pred): # para cada frase
    probabilities2 = []
    # para cada palavra da frase
    #print('y:', y)
    for palavra in y:
        sequence_prob = []
        #print('palavra:', palavra)
        v = palavra.values()
        #print('v:', v)
        for valor in v:
            #if valor >=0.25:
            if valor >=0.35:
                sequence_prob.append('ENT')
            else:
                sequence_prob.append('O')

        #print('sequence_prob:', sequence_prob)
        entidades = []
        for i, valor in enumerate(sequence_prob):
            if i==0 and valor=='ENT': # Problema
                entidades.append('Problema')
            if i==1 and valor=='ENT': # O
                entidades.append('O')
            if i==2 and valor=='ENT': # Anatomia
                entidades.append('Anatomia')
            if i==3 and valor=='ENT': # Teste
                entidades.append('Teste')
            if i==4 and valor=='ENT': # Tratamento
                entidades.append('Tratamento')
        probabilities2.append(entidades)
    probabilities3.append(probabilities2)
    #if num>5:
    #    break

              
            
probabilities3[1]

[['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['Problema'],
 ['O'],
 ['O'],
 ['O'],
 ['Tratamento'],
 ['Tratamento'],
 ['O'],
 ['O'],
 ['O'],
 ['O'],
 ['O']]

In [452]:
i=0
for value, pred in zip(dic_sentencesTest.values(), probabilities3):
    print('---------------')
    print(value[1])
    #print(pred)
    indices_pred = [['', [j], p] for j, p in enumerate(pred) if p!=['O']]
    print(indices_pred)
    #for p in pred
    i=i+1
    if i>10:
        break

---------------
[]
[]
---------------
[['FA', [8], 'Problema'], ['marevan 5mg', [12, 13], 'Tratamento']]
[['', [8], ['Problema']], ['', [12], ['Tratamento']], ['', [13], ['Tratamento']]]
---------------
[['Comorbidades', [0], 'Problema'], ['DM', [2], 'Problema'], ['metformina 850mg', [9, 10], 'Tratamento'], ['acarbose', [16], 'Tratamento'], ['glicazida 60mg', [22, 23], 'Tratamento'], ['insulina', [29], 'Tratamento']]
[['', [0], ['Problema']], ['', [2], ['Problema']], ['', [9], ['Tratamento']], ['', [10], ['Tratamento']], ['', [16], ['Tratamento']], ['', [17], ['Tratamento']], ['', [22], ['Tratamento']], ['', [23], ['Tratamento']], ['', [29], ['Tratamento']]]
---------------
[['HAS', [0], 'Problema'], ['losartana 50mg', [7, 8], 'Tratamento'], ['digoxina', [12], 'Tratamento'], ['carvedilol 25', [20, 21], 'Tratamento'], ['HCTZ', [26], 'Tratamento']]
[['', [0], ['Problema']], ['', [7], ['Tratamento']], ['', [8], ['Tratamento']], ['', [12], ['Tratamento']], ['', [20], ['Tratamento']], ['', 

## BEST CRF

In [124]:
# cluster 50

import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.6min finished


              precision    recall  f1-score   support

    Problema      0.774     0.821     0.797      1120
       Teste      0.904     0.801     0.849       366
  Tratamento      0.884     0.884     0.884       484
    Anatomia      0.696     0.447     0.544       262

   micro avg      0.812     0.787     0.800      2232
   macro avg      0.815     0.738     0.769      2232
weighted avg      0.810     0.787     0.795      2232

best params: {'c1': 0.07142426429277902, 'c2': 0.014106307808927127}
best CV score: 0.7425037694235993
model size: 0.65M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,1.963,-0.661,-1.665,-1.125,-0.623
O,-3.483,2.087,-2.602,-1.284,-1.816
Problema,-0.659,-1.057,2.455,-1.433,-3.77
Teste,-1.927,-0.201,-3.003,1.988,-4.229
Tratamento,-0.381,-0.632,-3.671,-2.355,3.468

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+4.735,word.cluster:126,,,
+3.446,word.lower():ao,,,
+3.356,word.lower():cpp,,,
+3.006,word.cluster:286,,,
+2.970,word.lower():acv,,,
+2.683,word[:3]:car,,,
+2.672,word.cluster:209,,,
+2.519,word[:3]:CAR,,,
+2.437,-3:word.lower():vd,,,
+2.436,word.lower():vao,,,

Weight?,Feature
+4.735,word.cluster:126
+3.446,word.lower():ao
+3.356,word.lower():cpp
+3.006,word.cluster:286
+2.970,word.lower():acv
+2.683,word[:3]:car
+2.672,word.cluster:209
+2.519,word[:3]:CAR
+2.437,-3:word.lower():vd
+2.436,word.lower():vao

Weight?,Feature
+4.491,postag:PU
+4.307,word.cluster:93
+4.087,word.cluster:15
+3.796,word.cluster:292
+3.671,postag:KC
+3.615,word.lower():ou
+3.564,word.cluster:69
+3.456,word[:3]:POS
+3.281,+2:word.cluster:187
+3.270,-1:word.lower():ra

Weight?,Feature
+6.296,word.cluster:117
+4.836,word.lower():comorbidades
+4.045,word.lower():chagas
+3.957,word.cluster:118
+3.851,-1:word.lower():fao
+3.725,-1:word.lower():quaisquer
+3.701,word.cluster:122
+3.603,word.lower():afebril
+3.570,word[:3]:Hip
+3.498,word[:3]:DIS

Weight?,Feature
+7.406,word.cluster:284
+4.902,word.cluster:17
+4.601,word.cluster:260
+4.507,word.cluster:259
+4.007,word.lower():ecg
+3.724,word.cluster:279
+3.718,word.cluster:108
+3.553,word.lower():avaliação
+3.443,+2:word.lower():84
+3.294,word.lower():exame

Weight?,Feature
+6.541,word.cluster:27
+5.872,word.cluster:1
+4.480,word.cluster:211
+4.067,word.cluster:132
+3.671,word.lower():cx
+3.655,word.cluster:5
+3.381,word[:3]:SEL
+3.298,word[:3]:tra
+3.158,+3:word.lower():safena
+3.111,word.cluster:72


In [453]:
y_pred = best_crf.predict_marginals(X_test)

print(len(y_pred))
print(len(X_test))


506
506


In [454]:
len(dic_sentencesTest)

506

In [455]:
y_pred[13]

[{'Problema': 0.00893034522878443,
  'O': 0.008342843296807304,
  'Anatomia': 0.002548609174850487,
  'Teste': 0.9792286295669081,
  'Tratamento': 0.0009495727326497741},
 {'Problema': 0.0015734799574070755,
  'O': 0.9953234704927209,
  'Anatomia': 0.00014682172760204878,
  'Teste': 0.0028405609755131267,
  'Tratamento': 0.0001156668467574828},
 {'Problema': 0.24188924900620395,
  'O': 0.031371467701772925,
  'Anatomia': 0.10255857360196831,
  'Teste': 0.3374216724288666,
  'Tratamento': 0.2867590372611886},
 {'Problema': 0.296502288229737,
  'O': 0.02230230838268296,
  'Anatomia': 0.053849658434839485,
  'Teste': 0.33252971645596985,
  'Tratamento': 0.2948160284967711},
 {'Problema': 0.2372203560225328,
  'O': 0.7614143788836624,
  'Anatomia': 0.0009076336686349938,
  'Teste': 0.0002387240816547958,
  'Tratamento': 0.00021890734351531852},
 {'Problema': 0.9961593649268679,
  'O': 0.0026840616853349764,
  'Anatomia': 0.0010233123406347229,
  'Teste': 0.00013182289250464202,
  'Tratamen

In [556]:
dic={}
num=-1
num2=-1
for frase in y_pred: # para cada frase
    num=num+1
    num2=num2+1
    #print(frase)
    # para cada token
    tokens=dic_sentencesTest[num2][0]
    entidades=[] # entidades do token
    for numT, token in enumerate(frase):
        #print('token>', token['Problema'])
        for k, tag in token.items():
            #print('tag:', tag)
            if tag>0.3:
            #if tag>0.25:
                if k!='O':
                    entidades.append(['', numT, k])
   
    dic[num2] = [tokens, entidades]
    #if num>20:
    #    break
    
dic[13]

[[['Ecocardiograma', 0, 934],
  ['-', 1, 949],
  ['ventrículo', 2, 951],
  ['esquerdo', 3, 962],
  ['com', 4, 971],
  ['hipertrofia', 5, 975],
  ['concentrica', 6, 987],
  ['de', 7, 999],
  ['grau', 8, 1002],
  ['discreto', 9, 1007],
  ['e', 10, 1016],
  ['função', 11, 1018],
  ['sistólica', 12, 1025],
  ['preservada', 13, 1035],
  ['.', 14, 1045]],
 [['', 0, 'Teste'],
  ['', 2, 'Teste'],
  ['', 3, 'Teste'],
  ['', 5, 'Problema'],
  ['', 6, 'Problema'],
  ['', 7, 'Problema'],
  ['', 8, 'Problema'],
  ['', 9, 'Problema']]]

In [557]:
dic[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [['', 0, 'Problema'],
  ['', 1, 'Problema'],
  ['', 2, 'Problema'],
  ['', 3, 'Problema'],
  ['', 3, 'Anatomia'],
  ['', 4, 'Problema'],
  ['', 4, 'Anatomia']]]

In [558]:
dic[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [['', 0, 'Problema'],
  ['', 1, 'Problema'],
  ['', 2, 'Problema'],
  ['', 3, 'Problema'],
  ['', 3, 'Anatomia'],
  ['', 4, 'Problema'],
  ['', 4, 'Anatomia']]]

In [559]:
dic_sentencesTest[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [['aumento moderado de átrio esquerdo', [0, 1, 2, 3, 4], 'Problema'],
  ['átrio esquerdo', [3, 4], 'Anatomia']]]

In [560]:
def getDicSentences2(dicGabaritoNested): 
    dicGabaritoNested2={}
    for key, values in dicGabaritoNested.items():
        tokens = values[0]
        #print('tokens:', tokens)
        entidades=values[1]
        entidadesForm=[]
        entidadeString=''
        entidadeIndices=[]
        entidadeTag=''    
        ultimoIndice=-10
        ultimoTipo=''
        print('key:', key)
        entidades_ordenadas = sorted(entidades, key=lambda x: x[2])
        for ent in entidades_ordenadas:
                    #print('ent:', ent)
                    try:
                        #print('tipoEnt:', tipoEnt)
                        if ent[1] == ultimoIndice+1 and ultimoTipo == ent[2]: # sao seguidas
                            entidadeString = entidadeString+' '+ent[0]
                            entidadeIndices.append(ent[1])
                        else: # mudou tipo
                            if ultimoIndice>0: # se for primeira vez, nao grava
                                entidadesForm.append([entidadeString, entidadeIndices, entidadeTag])
                            entidadeString = ent[0]
                            entidadeIndices = [ent[1]]
                            entidadeTag = ent[2]
                        ultimoIndice = ent[1]
                        ultimoTipo =ent[2]
                    except:
                        print('ent:', ent)
                        print('tokens:', tokens)
                        print('key:', key)
                        raise

        if len(entidadeIndices)>0:
            entidadesForm.append([entidadeString, entidadeIndices, entidadeTag])
        if key in dicGabaritoNested2:
            dicGabaritoNested2[key][1].append(entidadesForm)
        else:
            dicGabaritoNested2[key]=[tokens, entidadesForm]
        entidadesForm=[]
        #if key>14:
        #    break
    
    return dicGabaritoNested2


In [561]:
lista = [['', 0, 'Problema'], ['', 1, 'Problema'], ['', 3, 'Anatomia']]
lista_ordenada = sorted(lista, key=lambda x: x[2])
print(lista_ordenada)

[['', 3, 'Anatomia'], ['', 0, 'Problema'], ['', 1, 'Problema']]


In [562]:
dicPred2 = getDicSentences2(dic)
dicPred2[1]

key: 0
key: 1
key: 2
key: 3
key: 4
key: 5
key: 6
key: 7
key: 8
key: 9
key: 10
key: 11
key: 12
key: 13
key: 14
key: 15
key: 16
key: 17
key: 18
key: 19
key: 20
key: 21
key: 22
key: 23
key: 24
key: 25
key: 26
key: 27
key: 28
key: 29
key: 30
key: 31
key: 32
key: 33
key: 34
key: 35
key: 36
key: 37
key: 38
key: 39
key: 40
key: 41
key: 42
key: 43
key: 44
key: 45
key: 46
key: 47
key: 48
key: 49
key: 50
key: 51
key: 52
key: 53
key: 54
key: 55
key: 56
key: 57
key: 58
key: 59
key: 60
key: 61
key: 62
key: 63
key: 64
key: 65
key: 66
key: 67
key: 68
key: 69
key: 70
key: 71
key: 72
key: 73
key: 74
key: 75
key: 76
key: 77
key: 78
key: 79
key: 80
key: 81
key: 82
key: 83
key: 84
key: 85
key: 86
key: 87
key: 88
key: 89
key: 90
key: 91
key: 92
key: 93
key: 94
key: 95
key: 96
key: 97
key: 98
key: 99
key: 100
key: 101
key: 102
key: 103
key: 104
key: 105
key: 106
key: 107
key: 108
key: 109
key: 110
key: 111
key: 112
key: 113
key: 114
key: 115
key: 116
key: 117
key: 118
key: 119
key: 120
key: 121
key: 122
key

[[['Em', 0, 59],
  ['acompanhamento', 1, 62],
  ['no', 2, 77],
  ['ambualtorio', 3, 80],
  ['há', 4, 92],
  ['5', 5, 95],
  ['anos', 6, 97],
  ['por', 7, 102],
  ['FA', 8, 106],
  [',', 9, 108],
  ['uso', 10, 110],
  ['de', 11, 114],
  ['marevan', 12, 117],
  ['5mg', 13, 125],
  ['1', 14, 129],
  ['x', 15, 131],
  ['ao', 16, 133],
  ['dia', 17, 136],
  ['.', 18, 139]],
 [['', [8], 'Problema'], [' ', [12, 13], 'Tratamento']]]

In [563]:
dic_sentencesTest[15]

[[['calcificação', 0, 1083],
  ['mitral', 1, 1096],
  ['e', 2, 1103],
  ['aórtica', 3, 1105],
  ['com', 4, 1113],
  ['refluxo', 5, 1117],
  ['leve', 6, 1125],
  ['.', 7, 1129]],
 [['calcificação mitral e aórtica com refluxo leve',
   [0, 1, 2, 3, 4, 5, 6],
   'Problema'],
  ['mitral', [1], 'Anatomia'],
  ['aórtica', [3], 'Anatomia']]]

In [564]:
dicPred2[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [[' ', [3, 4], 'Anatomia'], ['    ', [0, 1, 2, 3, 4], 'Problema']]]

In [565]:
dicPred2[20]

[[['Otimizo', 0, 1284],
  ['dose', 1, 1292],
  ['da', 2, 1297],
  ['sinvastatina', 3, 1300],
  ['para', 4, 1313],
  ['40mg', 5, 1318],
  ['/', 6, 1323],
  ['dia', 7, 1324],
  ['.', 8, 1327]],
 [['  ', [3, 4, 5], 'Tratamento']]]

In [566]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os
from pathlib import Path
import re
import pickle
# ver qtos o modelo apenas de ner acertaria
from transformers import AutoTokenizer, AutoModelForTokenClassification
import nltk    
from nltk import tokenize 
import torch
from transformers import BertTokenizer,BertForTokenClassification
import numpy as np
import json   
from importlib import reload  # Python 3.4+
import random
from torch.utils.data import DataLoader
from importlib import reload 
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
import joblib
import os


In [567]:
print(len(dicPred2))
print(len(dic_sentencesTest))
print(dicPred2[2])
print(dic_sentencesTest[2])

506
506
[[['Comorbidades', 0, 142], [':', 1, 154], ['DM', 2, 156], ['há', 3, 159], ['10', 4, 162], ['anos', 5, 165], ['em', 6, 170], ['uso', 7, 173], ['de', 8, 177], ['metformina', 9, 180], ['850mg', 10, 191], ['3', 11, 197], ['cp', 12, 199], ['/', 13, 201], ['dia', 14, 202], [',', 15, 205], ['acarbose', 16, 207], ['1', 17, 216], ['cp', 18, 218], ['/', 19, 220], ['dia', 20, 221], ['e', 21, 225], ['glicazida', 22, 227], ['60mg', 23, 237], ['2', 24, 242], ['cp', 25, 244], ['/', 26, 246], ['dia', 27, 247], ['e', 28, 251], ['insulina', 29, 253], ['(', 30, 262], ['24', 31, 263], ['-', 32, 266], ['0', 33, 268], ['-', 34, 270], ['24', 35, 272], [')', 36, 274], ['.', 37, 275]], [['', [2], 'Problema'], [' ', [9, 10], 'Tratamento'], [' ', [16, 17], 'Tratamento'], [' ', [22, 23], 'Tratamento'], ['', [29], 'Tratamento']]]
[[['Comorbidades', 0, 142], [':', 1, 154], ['DM', 2, 156], ['há', 3, 159], ['10', 4, 162], ['anos', 5, 165], ['em', 6, 170], ['uso', 7, 173], ['de', 8, 177], ['metformina', 9, 18

In [568]:
dic_sentencesTest[20]

[[['Otimizo', 0, 1284],
  ['dose', 1, 1292],
  ['da', 2, 1297],
  ['sinvastatina', 3, 1300],
  ['para', 4, 1313],
  ['40mg', 5, 1318],
  ['/', 6, 1323],
  ['dia', 7, 1324],
  ['.', 8, 1327]],
 [['sinvastatina para 40mg', [3, 4, 5], 'Tratamento']]]

In [569]:
dicPred2[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [[' ', [3, 4], 'Anatomia'], ['    ', [0, 1, 2, 3, 4], 'Problema']]]

In [570]:


listaEnts = [['DM', [9, 10], 'Problema'], ['DM', [9, 10], 'Anatomia'], ['glicazida 60mg', [22, 23], 'Tratamento'], ['insulina', [29], 'Teste'], ['DM', [9, 10], 'Tratamento']]
def AgrupaEntidades(listaEnts):
    listaEnts2=[]
    for ent in listaEnts:
        #print(ent)
        #ntGravar=ent
        labels=[ent[2]]
        for ent2 in listaEnts:
            if ent[1]==ent2[1] and ent[2]!=ent2[2]:
                labels.append(ent2[2])
        
        ent[1].sort()
        labels.sort()
        entGravar=[ent[0], ent[1], labels]
        listaEnts2.append(entGravar)
        

    listaEnts3=[]
    for ent in listaEnts2:
        if ent not in listaEnts3:
            listaEnts3.append(ent)
    
    return listaEnts3
AgrupaEntidades(listaEnts)

def AvalFinalNova(dic_entidades_gabarito, dic_entidades_preditas):

    region_true_list, region_pred_list = list(), list() 
    for i in range(0, len(dic_entidades_gabarito), 1):
        listaEntidades = []
        entidades_gabarito = dic_entidades_gabarito[i][1]
        entidades_preditas = dic_entidades_preditas[i][1]

        entidades_gabarito = AgrupaEntidades(entidades_gabarito)
        entidades_preditas = AgrupaEntidades(entidades_preditas)

        #print('entidades_gabarito:', entidades_gabarito)
        #print('entidades_preditas:', entidades_preditas)

        for entidade_gabarito in entidades_gabarito:
            tem=0
            #print('--entidade_gabarito--:', entidade_gabarito)
            for entidade_predita in entidades_preditas:
                #print('entidade_predita:', entidade_predita)
                if entidade_predita[1]==entidade_gabarito[1]:
                    listaEntidades.append([entidade_gabarito[0],entidade_gabarito[1],entidade_gabarito[2],entidade_predita[2]])
                    tem=1
            if tem==0:
                listaEntidades.append([entidade_gabarito[0],entidade_gabarito[1],entidade_gabarito[2],['O']])

        for entidade_predita in entidades_preditas:
            #print('entidade_predita:', entidade_predita)
            tem=0
            for entidade in listaEntidades:
                #print('entidade:', entidade)
                if entidade[1]==entidade_predita[1] and entidade[3]==entidade_predita[2]:
                    tem=1
            if tem==0:
                listaEntidades.append([entidade_predita[0],entidade_predita[1],['O'],entidade_predita[2]])
                break

        #print('listaEntidades:>>>', listaEntidades)

        for entidade in listaEntidades:
            #print(entidade)
            tags_gabarito = entidade[2]
            tags_prevista = entidade[3]

            #print('tags_gabarito:', tags_gabarito)
            #print('tags_prevista:', tags_prevista)

            if len(tags_gabarito)==len(tags_prevista):
                region_true_list = region_true_list + [t for t in tags_gabarito]
                region_pred_list = region_pred_list + [t for t in tags_prevista]           
            else:
                if len(tags_gabarito) > len(tags_prevista):
                    for tag_gabarito in tags_gabarito:
                        region_true_list = region_true_list + [tag_gabarito]
                        if tag_gabarito in tags_prevista:
                            region_pred_list = region_pred_list + [tag_gabarito]
                        else:
                            region_pred_list = region_pred_list + ['O']
                else:
                    for tag_prevista in tags_prevista:
                        region_pred_list = region_pred_list + [tag_prevista]
                        if tag_prevista in tags_gabarito:
                            region_true_list = region_true_list + [tag_prevista]
                        else:
                            region_true_list = region_true_list + ['O']
                
    
    #print('region_true_list:', region_true_list)
    #print('region_pred_list:', region_pred_list)
    #print(classification_report(region_true_list, region_pred_list, labels=['O', 'PROTEIN', 'DNA', 'RNA', 'CELL_TYPE', 'CELL_LINE'], target_names=['O', 'PROTEIN', 'DNA', 'RNA', 'CELL_TYPE', 'CELL_LINE'], digits=6))
    print(classification_report(region_true_list, region_pred_list, digits=6))

    return region_true_list, region_pred_list

In [571]:

region_true_list, region_pred_list= AvalFinalNova(dic_sentencesTest, dicPred2)

print('---sem o O ----')
print(classification_report(region_true_list, region_pred_list, digits=6, labels=getTiposEntidade()))
print(confusion_matrix(region_true_list, region_pred_list, labels=getTiposEntidade()))



              precision    recall  f1-score   support

    Anatomia   0.862385  0.479592  0.616393       196
           O   0.000000  0.000000  0.000000       134
    Problema   0.765625  0.724852  0.744681       338
       Teste   0.871921  0.725410  0.791946       244
  Tratamento   0.835821  0.785047  0.809639       214

    accuracy                       0.607460      1126
   macro avg   0.667150  0.542980  0.592532      1126
weighted avg   0.727729  0.607460  0.656317      1126

---sem o O ----
              precision    recall  f1-score   support

    Problema   0.765625  0.724852  0.744681       338
       Teste   0.871921  0.725410  0.791946       244
  Tratamento   0.835821  0.785047  0.809639       214
    Anatomia   0.862385  0.479592  0.616393       196

   micro avg   0.821128  0.689516  0.749589       992
   macro avg   0.833938  0.678725  0.740665       992
weighted avg   0.826031  0.689516  0.744973       992

[[245   2   2   0]
 [  2 177   1   0]
 [  3   0 168   0]
 [ 

In [572]:
'''
threshold: 0.25

              precision    recall  f1-score   support

    Anatomia   0.858407  0.497436  0.629870       195
           O   0.000000  0.000000  0.000000       144
    Problema   0.753894  0.715976  0.734446       338
       Teste   0.868293  0.729508  0.792873       244
  Tratamento   0.809756  0.775701  0.792363       214

    accuracy                       0.601762      1135
   macro avg   0.658070  0.543724  0.589910      1135
weighted avg   0.711328  0.601762  0.646779      1135

---sem o O ----
              precision    recall  f1-score   support

    Problema   0.753894  0.715976  0.734446       338
       Teste   0.868293  0.729508  0.792873       244
  Tratamento   0.809756  0.775701  0.792363       214
    Anatomia   0.858407  0.497436  0.629870       195

   micro avg   0.809242  0.689203  0.744414       991
   macro avg   0.822587  0.679655  0.737388       991
weighted avg   0.814689  0.689203  0.740761       991

[[242   3   3   0]
 [  2 178   1   0]
 [  4   0 166   0]
 [  1   2   1  97]]
'''

'\nthreshold: 0.25\n\n              precision    recall  f1-score   support\n\n    Anatomia   0.858407  0.497436  0.629870       195\n           O   0.000000  0.000000  0.000000       144\n    Problema   0.753894  0.715976  0.734446       338\n       Teste   0.868293  0.729508  0.792873       244\n  Tratamento   0.809756  0.775701  0.792363       214\n\n    accuracy                       0.601762      1135\n   macro avg   0.658070  0.543724  0.589910      1135\nweighted avg   0.711328  0.601762  0.646779      1135\n\n---sem o O ----\n              precision    recall  f1-score   support\n\n    Problema   0.753894  0.715976  0.734446       338\n       Teste   0.868293  0.729508  0.792873       244\n  Tratamento   0.809756  0.775701  0.792363       214\n    Anatomia   0.858407  0.497436  0.629870       195\n\n   micro avg   0.809242  0.689203  0.744414       991\n   macro avg   0.822587  0.679655  0.737388       991\nweighted avg   0.814689  0.689203  0.740761       991\n\n[[242   3   3 