In [1]:
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
import os
from pathlib import Path
import re
import pickle
import random
import numpy as np
import eli5

NUM_JANELA=4

In [2]:
def getTiposEntidade():
    return ['Procedures','Disorders','ChemicalsDrugs','Abbreviation']
    
def replaceWhiteSpaces(str):
    return re.sub('\s{2,}',' ',str)

def save_obj(name, obj):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def load_obj(name):
    print('Load obj em: ', 'obj/' + name + '.pkl')
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


In [3]:
def read_clusters(cluster_file):
    word2cluster = {}
    try:
        with open(cluster_file, encoding='utf-8') as i:
            for num, line in enumerate(i):
                if line:
                    word, cluster = line.strip().split('\t')
                    word2cluster[word] = cluster
    except:
        print(line)
        print(num)
        raise
    return word2cluster

def word2features(sent, i):
    word = sent[i][0]
    postag = tipoPostaggerTokens(word, dicPostagger)
    cluster = word2cluster[word.lower()] if word.lower() in word2cluster else "0"
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[:3]': word[:3],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'word.cluster': cluster
    }
    temFeatureOrdemPalavra = False
    temFeatureOrdemPalavraFinal = False
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:word.cluster': cluster
        })
    else:
        features['BOS'] = True
        temFeatureOrdemPalavra = True
    
    if i > 1:
        word1 = sent[i-2][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:word.istitle()': word1.istitle(),
            '-2:word.isupper()': word1.isupper(),
            '-2:postag': postag1,
            '-2:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Segunda_palavra'] = True
            temFeatureOrdemPalavra = True

    if i > 2:
        word1 = sent[i-3][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-3:word.lower()': word1.lower(),
            '-3:word.istitle()': word1.istitle(),
            '-3:word.isupper()': word1.isupper(),
            '-3:postag': postag1,
            '-3:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Terceira_palavra'] = True
            temFeatureOrdemPalavra = True
    
    if i > 3:
        word1 = sent[i-4][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-4:word.lower()': word1.lower(),
            '-4:word.istitle()': word1.istitle(),
            '-4:word.isupper()': word1.isupper(),
            '-4:postag': postag1,
            '-4:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Quarta_palavra'] = True
            temFeatureOrdemPalavra = True
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:word.cluster': cluster
        })
    else:
        features['EOS'] = True
        temFeatureOrdemPalavraFinal = True
    if i < len(sent)-2:
        word1 = sent[i+2][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:word.istitle()': word1.istitle(),
            '+2:word.isupper()': word1.isupper(),
            '+2:postag': postag1,
            '+2:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Ultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    if i < len(sent)-3:
        word1 = sent[i+3][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+3:word.lower()': word1.lower(),
            '+3:word.istitle()': word1.istitle(),
            '+3:word.isupper()': word1.isupper(),
            '+3:postag': postag1,
            '+3:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Penultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    
    if i < len(sent)-4:
        word1 = sent[i+4][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+4:word.lower()': word1.lower(),
            '+4:word.istitle()': word1.istitle(),
            '+4:word.isupper()': word1.isupper(),
            '+4:postag': postag1,
            '+4:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Antepenultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
       

def sent2labels(sent):
    try:
        return [label for token, label in sent]
    except:
        print(sent)
        raise
        

def sent2tokens(sent):
    return [token for token, postag, label in sent]

#word2cluster = read_clusters(r"clusters/cluster-50.tsv")
word2cluster = read_clusters(r"clusters/cluster-300.tsv")


In [4]:
dicPostagger = load_obj('../spanclassification/obj/dic_postagger')
def tipoPostaggerTokens(token, dicPostagger):
    postagger = 'N' # na duvida é N
    if token.lower() in dicPostagger.keys():
        postagger = dicPostagger.get(token.lower())
    return postagger
tipoPostaggerTokens('coração', dicPostagger)

Load obj em:  obj/../spanclassification/obj/dic_postagger.pkl


'N'

In [6]:
dic_sentencesTrain = load_obj('../bionested/obj/dic_sentencesTrainSemClinBr')
dic_sentencesDev = load_obj('../bionested/obj/dic_sentencesDevSemClinBr')
dic_sentencesTest = load_obj('../bionested/obj/dic_sentencesTestSemClinBr')
dic_sentencesTest[0]

Load obj em:  obj/../bionested/obj/dic_sentencesTrainSemClinBr.pkl
Load obj em:  obj/../bionested/obj/dic_sentencesDevSemClinBr.pkl
Load obj em:  obj/../bionested/obj/dic_sentencesTestSemClinBr.pkl


[[['Conduta', 0, 0],
  [':', 1, 8],
  ['-', 2, 10],
  ['Alprazolam', 3, 12],
  ['1', 4, 23],
  ['mg', 5, 25],
  ['a', 6, 28],
  ['noite', 7, 30],
  [',', 8, 36],
  ['-', 9, 38],
  ['Retorno', 10, 40],
  ['em', 11, 48],
  ['3', 12, 51],
  ['meses', 13, 53],
  ['com', 14, 59],
  ['exames', 15, 63],
  ['de', 16, 70],
  ['rotina', 17, 73]],
 [['exames de rotina', [15, 16, 17], 'Procedures']]]

In [15]:
dic_sentencesTest[18]

[[['Paciente', 0, 0],
  ['Has', 1, 9],
  ['ha', 2, 13],
  ['14', 3, 16],
  ['anos', 4, 19]],
 [['Has', [1], 'Disorders'], ['Has', [1], 'Abbreviation']]]

In [7]:
len(dic_sentencesTest)

1905

In [19]:
# para teste. nao preciso replicar
# senao ficad dificil depois juntar tudo
def gravarArquivosBinarios(dic_sentences, tipo):
    # gerar arquivo treinamento
    f_entidade = open(r'crf/semclinbr_'+tipo+'.conll', 'w', encoding='utf-8')

    num_entidade_total=0
    num_entidade=0

    # TODO - refazer.. qdo vem entidade isolada, nao está gravando...
    print('\nGravando arquivo de {} '.format(tipo))
    numKeysDuplicadosTeste=[]
    numKeys2DuplicadosTeste=[]
    j=-1
    for i in range(len(dic_sentences)):
        tokens = dic_sentences[i][0]
        ents = dic_sentences[i][1]
        indiceEnts=[]
        duplicaFrase=False
        entidadesUsadas=[]
        for token in tokens:
            #print('token:', token)
            indiceToken = token[1]
            temEntidade=False
            tag='O'
            for ent in ents:
                #print(ent)
                if indiceToken in ent[1]: #and ent[2]==entidade:
                    if not temEntidade:
                        tag = ent[2]
                        num_entidade=num_entidade+1
                        temEntidade = True
                        entidadesUsadas.append(ent[1])
                    else:
                        #print('aaaaaaaaa:', ents)
                        duplicaFrase = True
                        break
                        
                    #break
            #if tag != entidade:
            #    tag='O'
            tokenGravar = token[0].replace(' ','')
            tokenGravar = tokenGravar.strip()
            f_entidade.write(tokenGravar+' '+tag+'\n')
            num_entidade_total=num_entidade_total+1
        f_entidade.write('\n')
        j=j+1
        
        if tipo!='test':
            if duplicaFrase:
                if tipo=='test':
                    numKeysDuplicadosTeste.append(i)
                    numKeys2DuplicadosTeste.append(j)
                for token in tokens:
                    #print('token:', token)
                    indiceToken = token[1]
                    tag='O'
                    for ent in ents:
                        #print(ent)
                        if indiceToken in ent[1] and indiceToken not in entidadesUsadas:
                            tag = ent[2]
                            num_entidade=num_entidade+1
                            temEntidade = True
                    tokenGravar = token[0].replace(' ','')
                    tokenGravar = tokenGravar.strip()
                    f_entidade.write(tokenGravar+' '+tag+'\n')
                    num_entidade_total=num_entidade_total+1
                f_entidade.write('\n')
                j=j+1
        #if i>15:
        #    break
    f_entidade.close()

    print('num_entidade:', num_entidade)
    print('num_entidade_total:', num_entidade_total)
    return numKeysDuplicadosTeste, numKeys2DuplicadosTeste

numKeysDuplicadosTeste, numKeys2DuplicadosTeste = gravarArquivosBinarios(dic_sentencesTest, 'test')
_, _ = gravarArquivosBinarios(dic_sentencesTrain, 'train')
_, _ = gravarArquivosBinarios(dic_sentencesDev, 'dev')


Gravando arquivo de test 
num_entidade: 9338
num_entidade_total: 29840

Gravando arquivo de train 
num_entidade: 48779
num_entidade_total: 124689

Gravando arquivo de dev 
num_entidade: 17536
num_entidade_total: 50348


In [20]:
dic_sentencesTest[13]

[[['KOH', 0, 0], ['NEGATIVO', 1, 4]], []]

In [21]:
#pathTrain=r'../spanclassification/preProcessamento/data-ner-binario/nested_train.conll'
#pathDev=r'../spanclassification/preProcessamento/data-ner-binario/nested_dev.conll'
#pathTest=r'../spanclassification/preProcessamento/data-ner-binario/nested_test.conll'

tipos = getTiposEntidade()
#tipos=['Anatomia']
X_train = []
X_dev = []
X_test = []
y_train = []
y_dev = []
y_test = []

pathTrain=r'crf\semclinbr_train.conll'
pathDev=r'crf\semclinbr_dev.conll'
pathTest=r'crf\semclinbr_test.conll'

with open(pathTest, encoding='utf-8') as f:
  testdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]
with open(pathDev, encoding='utf-8') as f:
  devdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]
with open(pathTrain, encoding='utf-8') as f:
  traindata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]

X_train = [sent2features(s) for s in traindata]
y_train = [sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]
#devdata[:2]
traindata[:2]

[[('POI', 'Abbreviation'),
  ('DE', 'O'),
  ('LAVAGEM', 'Procedures'),
  ('+', 'O'),
  ('CURETA', 'Procedures'),
  ('DE', 'O'),
  ('TECIDO', 'Disorders'),
  ('NECROTICO', 'Disorders')],
 [('18', 'O'),
  (':', 'O'),
  ('00', 'O'),
  (':', 'O'),
  ('PACIENTE', 'O'),
  ('RETORNOU', 'O'),
  ('DO', 'O'),
  ('CC', 'Abbreviation'),
  ('LUCIDO', 'Disorders'),
  (',', 'O'),
  ('ORIENTADO', 'Disorders'),
  (',', 'O'),
  ('COMUNICATIVO', 'Disorders'),
  (';', 'O'),
  ('MANTEM', 'O'),
  ('AVP', 'Disorders'),
  ('COM', 'O'),
  ('STP', 'Procedures')]]

In [22]:
testdata[13]

[('KOH', 'O'), ('NEGATIVO', 'O')]

In [23]:
testdata[14]

[('MAMAS', 'O'),
 ('DE', 'O'),
 ('MEDIO', 'O'),
 ('VOLUME', 'O'),
 (',', 'O'),
 ('COM', 'O'),
 ('PARENQUIMA', 'O'),
 ('ESPESSO', 'O'),
 (',', 'O'),
 ('SEM', 'O'),
 ('NODULOS', 'Disorders'),
 ('MPALPAVEIS', 'Disorders'),
 (',', 'O'),
 ('DERRAME', 'Disorders'),
 ('PAPILAR', 'Disorders'),
 ('NEGATIVO', 'Disorders'),
 ('E', 'O'),
 ('AXILAS', 'Disorders'),
 ('LIVRES', 'Disorders')]

In [24]:
testdata[15]

[('A', 'O'),
 ('#', 'O'),
 ('ROTINA', 'Procedures'),
 ('GINECOLOGICA', 'Procedures')]

In [25]:
testdata[19]

[('Paciente', 'O'),
 ('queixa', 'O'),
 ('-', 'O'),
 ('se', 'O'),
 ('de', 'O'),
 ('artralgia', 'Disorders'),
 ('generalizada', 'Disorders'),
 (',', 'O'),
 ('sem', 'O'),
 ('demais', 'O'),
 ('alteracoes', 'Disorders')]

In [26]:
len(testdata)

1905

In [27]:
X_test[0]

[{'bias': 1.0,
  'word.lower()': 'conduta',
  'word[-3:]': 'uta',
  'word[:3]': 'Con',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'N',
  'word.cluster': '49',
  'BOS': True,
  '+1:word.lower()': ':',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'PU',
  '+1:word.cluster': '242',
  '+2:word.lower()': '-',
  '+2:word.istitle()': False,
  '+2:word.isupper()': False,
  '+2:postag': 'PU',
  '+2:word.cluster': '242',
  '+3:word.lower()': 'alprazolam',
  '+3:word.istitle()': True,
  '+3:word.isupper()': False,
  '+3:postag': 'N',
  '+3:word.cluster': '132',
  '+4:word.lower()': '1',
  '+4:word.istitle()': False,
  '+4:word.isupper()': False,
  '+4:postag': 'NUM',
  '+4:word.cluster': '267'},
 {'bias': 1.0,
  'word.lower()': ':',
  'word[-3:]': ':',
  'word[:3]': ':',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'PU',
  'word.cluster': '242',
  '-1:word.lower()': 'condu

In [28]:
y_test[0:2]

[['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'Procedures',
  'Procedures',
  'Procedures'],
 ['Abbreviation', 'Abbreviation', 'O', 'O', 'O']]

In [29]:
print(len(X_train))
print(len(y_train))
print(len(X_train[0]))

7187
7187
8


In [30]:
print(len(traindata))
print(len(X_train))
print(len(y_train))

7187
7187
7187


## Janela de 4 vizinhos antes e depois

Precisa reforçar os outros.. se mandar O, vai achar q é tudo O.. gera um arquivo só, só duplica a frase qdo tem nested..

In [31]:
import time

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False)

start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 7187/7187 [00:06<00:00, 1195.37it/s]





loading dev data to CRFsuite: 100%|██████████| 2577/2577 [00:02<00:00, 997.40it/s] 



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 112634
Seconds required: 1.677

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.51  loss=140274.61 active=110893 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.696 0.111  feature_norm=1.00
Iter 2   time=0.37  loss=130082.86 active=103841 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.696 0.111  feature_norm=0.90
Iter 3   time=0.41  loss=121564.29 active=104576 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.696 0.111  feature_norm=1.05
Iter 4   time=0.37  loss=111184.98 active=108873 precision=0.233  recall=0.203  F1=0.171  Acc(item/seq)=0.697 0.112  feature_norm=1.53
Iter 5   time=0.37  loss=95362.61 active

Iter 61  time=0.40  loss=19094.18 active=85721 precision=0.575  recall=0.485  F1=0.515  Acc(item/seq)=0.719 0.218  feature_norm=165.65
Iter 62  time=0.35  loss=19027.57 active=85322 precision=0.581  recall=0.480  F1=0.514  Acc(item/seq)=0.721 0.218  feature_norm=165.82
Iter 63  time=0.39  loss=18967.56 active=85119 precision=0.574  recall=0.485  F1=0.515  Acc(item/seq)=0.718 0.218  feature_norm=166.05
Iter 64  time=0.28  loss=18910.04 active=84822 precision=0.577  recall=0.480  F1=0.513  Acc(item/seq)=0.720 0.218  feature_norm=166.20
Iter 65  time=0.27  loss=18855.86 active=84493 precision=0.575  recall=0.486  F1=0.516  Acc(item/seq)=0.719 0.218  feature_norm=166.41
Iter 66  time=0.33  loss=18800.15 active=83976 precision=0.578  recall=0.484  F1=0.516  Acc(item/seq)=0.720 0.220  feature_norm=166.58
Iter 67  time=0.38  loss=18750.24 active=83405 precision=0.576  recall=0.487  F1=0.517  Acc(item/seq)=0.719 0.217  feature_norm=166.83
Iter 68  time=0.36  loss=18702.64 active=82993 precisio

In [32]:
import eli5

eli5.show_weights(crf, top=30)



From \ To,Abbreviation,ChemicalsDrugs,Disorders,O,Procedures
Abbreviation,2.864,-2.053,-2.175,-0.546,-1.737
ChemicalsDrugs,-2.277,3.169,-3.108,-0.484,-1.427
Disorders,-2.631,0.0,3.07,-0.776,-2.586
O,-0.941,-0.262,-1.097,2.23,-0.618
Procedures,-1.931,-0.815,-2.696,-0.548,3.419

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+4.124,word.lower():fenta,,,
+3.759,word.lower():po,,,
+3.710,-1:word.lower():hipocoradas,,,
+3.307,word.lower():pot,,,
+3.240,-1:word.lower():waterman,,,
+3.218,+3:word.lower():endoprotese,,,
+3.210,-2:word.lower():rpm,,,
+3.187,+3:word.lower():vitima,,,
+3.110,-3:word.lower():dle,,,
+3.107,-1:word.lower():antebraco,,,

Weight?,Feature
+4.124,word.lower():fenta
+3.759,word.lower():po
+3.710,-1:word.lower():hipocoradas
+3.307,word.lower():pot
+3.240,-1:word.lower():waterman
+3.218,+3:word.lower():endoprotese
+3.210,-2:word.lower():rpm
+3.187,+3:word.lower():vitima
+3.110,-3:word.lower():dle
+3.107,-1:word.lower():antebraco

Weight?,Feature
+4.160,word.cluster:132
+4.011,word.cluster:27
+3.998,word.cluster:5
+3.689,word.cluster:1
+3.395,word.lower():dormonid
+3.202,word.cluster:216
+3.088,word[-3:]:INA
+3.023,word.lower():propofol
+2.873,word[-3:]:ina
+2.778,-1:word.lower():infundindo

Weight?,Feature
+3.738,word.lower():comorbidades
+3.623,+2:word.lower():stp
+3.597,word.lower():insulinodependente
+3.412,-3:word.lower():hipotenso
+3.381,+1:word.lower():agudo
+3.220,-3:word.lower():prec
+3.204,word.lower():trauma
+3.063,word.lower():multiplas
+3.011,word.lower():eupineico
+2.999,+2:word.lower():adventicios

Weight?,Feature
+6.108,word.lower():ausentes
+4.908,+3:word.lower():borra
+4.494,word.lower():paciente
+4.227,+2:word.lower():cominicativo
+4.176,-3:word.lower():58
+3.972,word.lower():externamente
+3.897,-1:word.lower():ig
+3.823,+4:word.lower():cominicativo
+3.745,-3:word.lower():tipoia
+3.677,+4:word.lower():deamublando

Weight?,Feature
+4.591,-1:word.lower():acompanahda
+4.122,+4:word.lower():seguie
+3.844,word[-3:]:mia
+3.753,word.lower():palpacao
+3.582,word.lower():curativo
+3.311,word[-3:]:pia
+3.301,word.lower():tala
+3.220,-1:word.lower():cinta
+3.206,-1:word.lower():flogisticos
+3.195,word.lower():prescricao


In [33]:
testdata[0]

[('Conduta', 'O'),
 (':', 'O'),
 ('-', 'O'),
 ('Alprazolam', 'O'),
 ('1', 'O'),
 ('mg', 'O'),
 ('a', 'O'),
 ('noite', 'O'),
 (',', 'O'),
 ('-', 'O'),
 ('Retorno', 'O'),
 ('em', 'O'),
 ('3', 'O'),
 ('meses', 'O'),
 ('com', 'O'),
 ('exames', 'Procedures'),
 ('de', 'Procedures'),
 ('rotina', 'Procedures')]

In [35]:
y_pred = crf.predict_marginals(X_test)
y_pred[:5]

[[{'Abbreviation': 0.005991433420153,
   'O': 0.9785915756958916,
   'Procedures': 0.0153595122837672,
   'Disorders': 2.448926061335424e-05,
   'ChemicalsDrugs': 3.2989339574831604e-05},
  {'Abbreviation': 3.632617363632022e-06,
   'O': 0.9999887256814695,
   'Procedures': 3.928200773881827e-06,
   'Disorders': 1.56119711201699e-06,
   'ChemicalsDrugs': 2.1523032811781794e-06},
  {'Abbreviation': 3.358775711151873e-05,
   'O': 0.997253345951055,
   'Procedures': 0.0001790808106998358,
   'Disorders': 1.3495860184106903e-05,
   'ChemicalsDrugs': 0.0025204896209497455},
  {'Abbreviation': 0.013542474775889022,
   'O': 0.473770956524168,
   'Procedures': 0.009206003603785724,
   'Disorders': 0.0010050456604264967,
   'ChemicalsDrugs': 0.5024755194357309},
  {'Abbreviation': 0.00361451384908663,
   'O': 0.9960833810338756,
   'Procedures': 6.264271175850558e-05,
   'Disorders': 7.490760086667097e-06,
   'ChemicalsDrugs': 0.00023197164519262316},
  {'Abbreviation': 0.002844596230591919,
  

In [36]:
y_pred[0][3]

{'Abbreviation': 0.013542474775889022,
 'O': 0.473770956524168,
 'Procedures': 0.009206003603785724,
 'Disorders': 0.0010050456604264967,
 'ChemicalsDrugs': 0.5024755194357309}

In [37]:

probabilities3 = []
for num, y in enumerate(y_pred): # para cada frase
    probabilities2 = []
    # para cada palavra da frase
    #print('y:', y)
    for palavra in y:
        sequence_prob = []
        #print('palavra:', palavra)
        v = palavra.values()
        #print('v:', v)
        for valor in v:
            #if valor >=0.25:
            if valor >=0.35:
                sequence_prob.append('ENT')
            else:
                sequence_prob.append('O')

        #print('sequence_prob:', sequence_prob)
        entidades = []
        for i, valor in enumerate(sequence_prob):
            if i==0 and valor=='ENT': # Problema
                entidades.append('Problema')
            if i==1 and valor=='ENT': # O
                entidades.append('O')
            if i==2 and valor=='ENT': # Anatomia
                entidades.append('Anatomia')
            if i==3 and valor=='ENT': # Teste
                entidades.append('Teste')
            if i==4 and valor=='ENT': # Tratamento
                entidades.append('Tratamento')
        probabilities2.append(entidades)
    probabilities3.append(probabilities2)
    #if num>5:
    #    break

              
            
probabilities3[1]

[['O'], ['O'], ['O'], ['O'], ['O']]

In [38]:
i=0
for value, pred in zip(dic_sentencesTest.values(), probabilities3):
    print('---------------')
    print(value[1])
    #print(pred)
    indices_pred = [['', [j], p] for j, p in enumerate(pred) if p!=['O']]
    print(indices_pred)
    #for p in pred
    i=i+1
    if i>10:
        break

---------------
[['exames de rotina', [15, 16, 17], 'Procedures']]
[['', [3], ['O', 'Tratamento']], ['', [15], ['Anatomia']], ['', [16], ['Anatomia']], ['', [17], ['Anatomia']]]
---------------
[['R3 NEFRO', [0, 1], 'Abbreviation']]
[]
---------------
[['ENCAMINHADA', [6], 'Procedures'], ['ACOMPANHAMENTO GINECOLOGICO DE ROTINA', [8, 9, 10, 11], 'Procedures']]
[['', [4], ['Problema', 'O']], ['', [6], ['O', 'Anatomia']], ['', [8], ['Anatomia']], ['', [9], ['Anatomia']], ['', [10], ['Anatomia']], ['', [11], ['Anatomia']]]
---------------
[['COLETA', [1], 'Procedures'], ['CITOLOGIA ONCOTICA', [3, 4], 'Procedures']]
[['', [1], []], ['', [3], ['Teste']], ['', [4], ['Teste']]]
---------------
[['CORRIMENTO', [4], 'Disorders'], ['COLORACAO BRANCO AMARELADA', [10, 11, 12], 'Disorders'], ['ODOR FORTE', [17, 18], 'Disorders']]
[['', [17], ['Teste']], ['', [18], ['Teste']]]
---------------
[['PALPACAO', [15], 'Procedures'], ['PEQUENOS NODULOS', [3, 4], 'Disorders']]
[['', [1], ['Teste']], ['', [2]

## BEST CRF

In [None]:
# cluster 50

import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
y_pred = best_crf.predict_marginals(X_test)

print(len(y_pred))
print(len(X_test))


In [None]:
len(dic_sentencesTest)

In [None]:
y_pred[13]

In [None]:
dic={}
num=-1
num2=-1
for frase in y_pred: # para cada frase
    num=num+1
    num2=num2+1
    #print(frase)
    # para cada token
    tokens=dic_sentencesTest[num2][0]
    entidades=[] # entidades do token
    for numT, token in enumerate(frase):
        #print('token>', token['Problema'])
        for k, tag in token.items():
            #print('tag:', tag)
            if tag>0.3:
            #if tag>0.25:
                if k!='O':
                    entidades.append(['', numT, k])
   
    dic[num2] = [tokens, entidades]
    #if num>20:
    #    break
    
dic[13]

In [None]:
dic[14]

In [None]:
dic[14]

In [None]:
dic_sentencesTest[14]

In [560]:
def getDicSentences2(dicGabaritoNested): 
    dicGabaritoNested2={}
    for key, values in dicGabaritoNested.items():
        tokens = values[0]
        #print('tokens:', tokens)
        entidades=values[1]
        entidadesForm=[]
        entidadeString=''
        entidadeIndices=[]
        entidadeTag=''    
        ultimoIndice=-10
        ultimoTipo=''
        print('key:', key)
        entidades_ordenadas = sorted(entidades, key=lambda x: x[2])
        for ent in entidades_ordenadas:
                    #print('ent:', ent)
                    try:
                        #print('tipoEnt:', tipoEnt)
                        if ent[1] == ultimoIndice+1 and ultimoTipo == ent[2]: # sao seguidas
                            entidadeString = entidadeString+' '+ent[0]
                            entidadeIndices.append(ent[1])
                        else: # mudou tipo
                            if ultimoIndice>0: # se for primeira vez, nao grava
                                entidadesForm.append([entidadeString, entidadeIndices, entidadeTag])
                            entidadeString = ent[0]
                            entidadeIndices = [ent[1]]
                            entidadeTag = ent[2]
                        ultimoIndice = ent[1]
                        ultimoTipo =ent[2]
                    except:
                        print('ent:', ent)
                        print('tokens:', tokens)
                        print('key:', key)
                        raise

        if len(entidadeIndices)>0:
            entidadesForm.append([entidadeString, entidadeIndices, entidadeTag])
        if key in dicGabaritoNested2:
            dicGabaritoNested2[key][1].append(entidadesForm)
        else:
            dicGabaritoNested2[key]=[tokens, entidadesForm]
        entidadesForm=[]
        #if key>14:
        #    break
    
    return dicGabaritoNested2


In [561]:
lista = [['', 0, 'Problema'], ['', 1, 'Problema'], ['', 3, 'Anatomia']]
lista_ordenada = sorted(lista, key=lambda x: x[2])
print(lista_ordenada)

[['', 3, 'Anatomia'], ['', 0, 'Problema'], ['', 1, 'Problema']]


In [562]:
dicPred2 = getDicSentences2(dic)
dicPred2[1]

key: 0
key: 1
key: 2
key: 3
key: 4
key: 5
key: 6
key: 7
key: 8
key: 9
key: 10
key: 11
key: 12
key: 13
key: 14
key: 15
key: 16
key: 17
key: 18
key: 19
key: 20
key: 21
key: 22
key: 23
key: 24
key: 25
key: 26
key: 27
key: 28
key: 29
key: 30
key: 31
key: 32
key: 33
key: 34
key: 35
key: 36
key: 37
key: 38
key: 39
key: 40
key: 41
key: 42
key: 43
key: 44
key: 45
key: 46
key: 47
key: 48
key: 49
key: 50
key: 51
key: 52
key: 53
key: 54
key: 55
key: 56
key: 57
key: 58
key: 59
key: 60
key: 61
key: 62
key: 63
key: 64
key: 65
key: 66
key: 67
key: 68
key: 69
key: 70
key: 71
key: 72
key: 73
key: 74
key: 75
key: 76
key: 77
key: 78
key: 79
key: 80
key: 81
key: 82
key: 83
key: 84
key: 85
key: 86
key: 87
key: 88
key: 89
key: 90
key: 91
key: 92
key: 93
key: 94
key: 95
key: 96
key: 97
key: 98
key: 99
key: 100
key: 101
key: 102
key: 103
key: 104
key: 105
key: 106
key: 107
key: 108
key: 109
key: 110
key: 111
key: 112
key: 113
key: 114
key: 115
key: 116
key: 117
key: 118
key: 119
key: 120
key: 121
key: 122
key

[[['Em', 0, 59],
  ['acompanhamento', 1, 62],
  ['no', 2, 77],
  ['ambualtorio', 3, 80],
  ['há', 4, 92],
  ['5', 5, 95],
  ['anos', 6, 97],
  ['por', 7, 102],
  ['FA', 8, 106],
  [',', 9, 108],
  ['uso', 10, 110],
  ['de', 11, 114],
  ['marevan', 12, 117],
  ['5mg', 13, 125],
  ['1', 14, 129],
  ['x', 15, 131],
  ['ao', 16, 133],
  ['dia', 17, 136],
  ['.', 18, 139]],
 [['', [8], 'Problema'], [' ', [12, 13], 'Tratamento']]]

In [563]:
dic_sentencesTest[15]

[[['calcificação', 0, 1083],
  ['mitral', 1, 1096],
  ['e', 2, 1103],
  ['aórtica', 3, 1105],
  ['com', 4, 1113],
  ['refluxo', 5, 1117],
  ['leve', 6, 1125],
  ['.', 7, 1129]],
 [['calcificação mitral e aórtica com refluxo leve',
   [0, 1, 2, 3, 4, 5, 6],
   'Problema'],
  ['mitral', [1], 'Anatomia'],
  ['aórtica', [3], 'Anatomia']]]

In [564]:
dicPred2[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [[' ', [3, 4], 'Anatomia'], ['    ', [0, 1, 2, 3, 4], 'Problema']]]

In [565]:
dicPred2[20]

[[['Otimizo', 0, 1284],
  ['dose', 1, 1292],
  ['da', 2, 1297],
  ['sinvastatina', 3, 1300],
  ['para', 4, 1313],
  ['40mg', 5, 1318],
  ['/', 6, 1323],
  ['dia', 7, 1324],
  ['.', 8, 1327]],
 [['  ', [3, 4, 5], 'Tratamento']]]

In [566]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import os
from pathlib import Path
import re
import pickle
# ver qtos o modelo apenas de ner acertaria
from transformers import AutoTokenizer, AutoModelForTokenClassification
import nltk    
from nltk import tokenize 
import torch
from transformers import BertTokenizer,BertForTokenClassification
import numpy as np
import json   
from importlib import reload  # Python 3.4+
import random
from torch.utils.data import DataLoader
from importlib import reload 
from sklearn_crfsuite.metrics import flat_f1_score, flat_classification_report
import joblib
import os


In [567]:
print(len(dicPred2))
print(len(dic_sentencesTest))
print(dicPred2[2])
print(dic_sentencesTest[2])

506
506
[[['Comorbidades', 0, 142], [':', 1, 154], ['DM', 2, 156], ['há', 3, 159], ['10', 4, 162], ['anos', 5, 165], ['em', 6, 170], ['uso', 7, 173], ['de', 8, 177], ['metformina', 9, 180], ['850mg', 10, 191], ['3', 11, 197], ['cp', 12, 199], ['/', 13, 201], ['dia', 14, 202], [',', 15, 205], ['acarbose', 16, 207], ['1', 17, 216], ['cp', 18, 218], ['/', 19, 220], ['dia', 20, 221], ['e', 21, 225], ['glicazida', 22, 227], ['60mg', 23, 237], ['2', 24, 242], ['cp', 25, 244], ['/', 26, 246], ['dia', 27, 247], ['e', 28, 251], ['insulina', 29, 253], ['(', 30, 262], ['24', 31, 263], ['-', 32, 266], ['0', 33, 268], ['-', 34, 270], ['24', 35, 272], [')', 36, 274], ['.', 37, 275]], [['', [2], 'Problema'], [' ', [9, 10], 'Tratamento'], [' ', [16, 17], 'Tratamento'], [' ', [22, 23], 'Tratamento'], ['', [29], 'Tratamento']]]
[[['Comorbidades', 0, 142], [':', 1, 154], ['DM', 2, 156], ['há', 3, 159], ['10', 4, 162], ['anos', 5, 165], ['em', 6, 170], ['uso', 7, 173], ['de', 8, 177], ['metformina', 9, 18

In [568]:
dic_sentencesTest[20]

[[['Otimizo', 0, 1284],
  ['dose', 1, 1292],
  ['da', 2, 1297],
  ['sinvastatina', 3, 1300],
  ['para', 4, 1313],
  ['40mg', 5, 1318],
  ['/', 6, 1323],
  ['dia', 7, 1324],
  ['.', 8, 1327]],
 [['sinvastatina para 40mg', [3, 4, 5], 'Tratamento']]]

In [569]:
dicPred2[14]

[[['aumento', 0, 1047],
  ['moderado', 1, 1055],
  ['de', 2, 1064],
  ['átrio', 3, 1067],
  ['esquerdo', 4, 1073],
  ['.', 5, 1081]],
 [[' ', [3, 4], 'Anatomia'], ['    ', [0, 1, 2, 3, 4], 'Problema']]]

In [570]:


listaEnts = [['DM', [9, 10], 'Problema'], ['DM', [9, 10], 'Anatomia'], ['glicazida 60mg', [22, 23], 'Tratamento'], ['insulina', [29], 'Teste'], ['DM', [9, 10], 'Tratamento']]
def AgrupaEntidades(listaEnts):
    listaEnts2=[]
    for ent in listaEnts:
        #print(ent)
        #ntGravar=ent
        labels=[ent[2]]
        for ent2 in listaEnts:
            if ent[1]==ent2[1] and ent[2]!=ent2[2]:
                labels.append(ent2[2])
        
        ent[1].sort()
        labels.sort()
        entGravar=[ent[0], ent[1], labels]
        listaEnts2.append(entGravar)
        

    listaEnts3=[]
    for ent in listaEnts2:
        if ent not in listaEnts3:
            listaEnts3.append(ent)
    
    return listaEnts3
AgrupaEntidades(listaEnts)

def AvalFinalNova(dic_entidades_gabarito, dic_entidades_preditas):

    region_true_list, region_pred_list = list(), list() 
    for i in range(0, len(dic_entidades_gabarito), 1):
        listaEntidades = []
        entidades_gabarito = dic_entidades_gabarito[i][1]
        entidades_preditas = dic_entidades_preditas[i][1]

        entidades_gabarito = AgrupaEntidades(entidades_gabarito)
        entidades_preditas = AgrupaEntidades(entidades_preditas)

        #print('entidades_gabarito:', entidades_gabarito)
        #print('entidades_preditas:', entidades_preditas)

        for entidade_gabarito in entidades_gabarito:
            tem=0
            #print('--entidade_gabarito--:', entidade_gabarito)
            for entidade_predita in entidades_preditas:
                #print('entidade_predita:', entidade_predita)
                if entidade_predita[1]==entidade_gabarito[1]:
                    listaEntidades.append([entidade_gabarito[0],entidade_gabarito[1],entidade_gabarito[2],entidade_predita[2]])
                    tem=1
            if tem==0:
                listaEntidades.append([entidade_gabarito[0],entidade_gabarito[1],entidade_gabarito[2],['O']])

        for entidade_predita in entidades_preditas:
            #print('entidade_predita:', entidade_predita)
            tem=0
            for entidade in listaEntidades:
                #print('entidade:', entidade)
                if entidade[1]==entidade_predita[1] and entidade[3]==entidade_predita[2]:
                    tem=1
            if tem==0:
                listaEntidades.append([entidade_predita[0],entidade_predita[1],['O'],entidade_predita[2]])
                break

        #print('listaEntidades:>>>', listaEntidades)

        for entidade in listaEntidades:
            #print(entidade)
            tags_gabarito = entidade[2]
            tags_prevista = entidade[3]

            #print('tags_gabarito:', tags_gabarito)
            #print('tags_prevista:', tags_prevista)

            if len(tags_gabarito)==len(tags_prevista):
                region_true_list = region_true_list + [t for t in tags_gabarito]
                region_pred_list = region_pred_list + [t for t in tags_prevista]           
            else:
                if len(tags_gabarito) > len(tags_prevista):
                    for tag_gabarito in tags_gabarito:
                        region_true_list = region_true_list + [tag_gabarito]
                        if tag_gabarito in tags_prevista:
                            region_pred_list = region_pred_list + [tag_gabarito]
                        else:
                            region_pred_list = region_pred_list + ['O']
                else:
                    for tag_prevista in tags_prevista:
                        region_pred_list = region_pred_list + [tag_prevista]
                        if tag_prevista in tags_gabarito:
                            region_true_list = region_true_list + [tag_prevista]
                        else:
                            region_true_list = region_true_list + ['O']
                
    
    #print('region_true_list:', region_true_list)
    #print('region_pred_list:', region_pred_list)
    #print(classification_report(region_true_list, region_pred_list, labels=['O', 'PROTEIN', 'DNA', 'RNA', 'CELL_TYPE', 'CELL_LINE'], target_names=['O', 'PROTEIN', 'DNA', 'RNA', 'CELL_TYPE', 'CELL_LINE'], digits=6))
    print(classification_report(region_true_list, region_pred_list, digits=6))

    return region_true_list, region_pred_list

In [571]:

region_true_list, region_pred_list= AvalFinalNova(dic_sentencesTest, dicPred2)

print('---sem o O ----')
print(classification_report(region_true_list, region_pred_list, digits=6, labels=getTiposEntidade()))
print(confusion_matrix(region_true_list, region_pred_list, labels=getTiposEntidade()))



              precision    recall  f1-score   support

    Anatomia   0.862385  0.479592  0.616393       196
           O   0.000000  0.000000  0.000000       134
    Problema   0.765625  0.724852  0.744681       338
       Teste   0.871921  0.725410  0.791946       244
  Tratamento   0.835821  0.785047  0.809639       214

    accuracy                       0.607460      1126
   macro avg   0.667150  0.542980  0.592532      1126
weighted avg   0.727729  0.607460  0.656317      1126

---sem o O ----
              precision    recall  f1-score   support

    Problema   0.765625  0.724852  0.744681       338
       Teste   0.871921  0.725410  0.791946       244
  Tratamento   0.835821  0.785047  0.809639       214
    Anatomia   0.862385  0.479592  0.616393       196

   micro avg   0.821128  0.689516  0.749589       992
   macro avg   0.833938  0.678725  0.740665       992
weighted avg   0.826031  0.689516  0.744973       992

[[245   2   2   0]
 [  2 177   1   0]
 [  3   0 168   0]
 [ 

In [572]:
'''
threshold: 0.25

              precision    recall  f1-score   support

    Anatomia   0.858407  0.497436  0.629870       195
           O   0.000000  0.000000  0.000000       144
    Problema   0.753894  0.715976  0.734446       338
       Teste   0.868293  0.729508  0.792873       244
  Tratamento   0.809756  0.775701  0.792363       214

    accuracy                       0.601762      1135
   macro avg   0.658070  0.543724  0.589910      1135
weighted avg   0.711328  0.601762  0.646779      1135

---sem o O ----
              precision    recall  f1-score   support

    Problema   0.753894  0.715976  0.734446       338
       Teste   0.868293  0.729508  0.792873       244
  Tratamento   0.809756  0.775701  0.792363       214
    Anatomia   0.858407  0.497436  0.629870       195

   micro avg   0.809242  0.689203  0.744414       991
   macro avg   0.822587  0.679655  0.737388       991
weighted avg   0.814689  0.689203  0.740761       991

[[242   3   3   0]
 [  2 178   1   0]
 [  4   0 166   0]
 [  1   2   1  97]]
'''

'\nthreshold: 0.25\n\n              precision    recall  f1-score   support\n\n    Anatomia   0.858407  0.497436  0.629870       195\n           O   0.000000  0.000000  0.000000       144\n    Problema   0.753894  0.715976  0.734446       338\n       Teste   0.868293  0.729508  0.792873       244\n  Tratamento   0.809756  0.775701  0.792363       214\n\n    accuracy                       0.601762      1135\n   macro avg   0.658070  0.543724  0.589910      1135\nweighted avg   0.711328  0.601762  0.646779      1135\n\n---sem o O ----\n              precision    recall  f1-score   support\n\n    Problema   0.753894  0.715976  0.734446       338\n       Teste   0.868293  0.729508  0.792873       244\n  Tratamento   0.809756  0.775701  0.792363       214\n    Anatomia   0.858407  0.497436  0.629870       195\n\n   micro avg   0.809242  0.689203  0.744414       991\n   macro avg   0.822587  0.679655  0.737388       991\nweighted avg   0.814689  0.689203  0.740761       991\n\n[[242   3   3 

In [573]:
save_obj('dicPredCRFNested',dicPred2)


In [574]:
import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_nested_multilabel"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

['models/best_crf_nested_multilabel']