In [1]:
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
import os
from pathlib import Path
import re
import pickle
import random
import numpy as np
import eli5

NUM_JANELA=4

In [5]:
def getTiposEntidade():
    """Return the entity types the tagger works with.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.
    """
    tipos = ['Problema', 'Teste', 'Tratamento', 'Anatomia']
    return tipos
    
def replaceWhiteSpaces(str):
    """Collapse each run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    untouched.

    Args:
        str: text to normalise. The name shadows the builtin but is kept
            because it is part of the function's signature.

    Returns:
        The text with every run of 2+ whitespace characters replaced by ' '.
    """
    # Raw string: in a normal literal '\s' is an invalid escape sequence, which
    # recent Python versions warn about and plan to reject.
    return re.sub(r'\s{2,}', ' ', str)

def save_obj(name, obj):
    """Pickle ``obj`` into the file ``<name>.pkl``, replacing any existing file.

    Args:
        name: destination path without the ``.pkl`` extension.
        obj: any picklable object.
    """
    destino = name + '.pkl'
    with open(destino, 'wb') as arquivo:
        pickle.dump(obj, arquivo)

def load_obj(name):
    """Load and return the object pickled in ``<name>.pkl``.

    Args:
        name: path of the pickle file without the ``.pkl`` extension.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: if ``<name>.pkl`` does not exist.
    """
    caminho = name + '.pkl'
    # Log the path that is really opened (the message used to prepend an
    # 'obj/' directory that was never part of the path).
    print('Load obj em: ', caminho)
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    with open(caminho, 'rb') as f:
        return pickle.load(f)


In [23]:
def read_clusters(cluster_file):
    """Read a ``word<TAB>cluster`` file into a dictionary.

    Blank lines are skipped. Every other line must contain exactly one tab
    once surrounding whitespace has been stripped.

    Args:
        cluster_file: path of the UTF-8 encoded, tab-separated file.

    Returns:
        dict mapping each word to its cluster id (kept as a string).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not have exactly two columns;
            the offending line and its 0-based number are printed first.
    """
    word2cluster = {}
    with open(cluster_file, encoding='utf-8') as handle:
        for num, line in enumerate(handle):
            conteudo = line.strip()
            if not conteudo:
                # Lines read from a file keep their '\n', so the emptiness
                # check has to be done after stripping.
                continue
            try:
                word, cluster = conteudo.split('\t')
            except ValueError:
                # Only the parsing step is guarded, so `line` and `num` are
                # always bound when they are reported.
                print(line)
                print(num)
                raise
            word2cluster[word] = cluster
    return word2cluster

def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    The features describe the token itself plus up to four neighbours on each
    side. Only the first element of each sentence item (the token text) is
    read. POS tags come from ``tipoPostaggerTokens``/``dicPostagger`` and
    cluster ids from ``word2cluster`` ("0" when the lower-cased token is not
    in the table); all three are module-level names.

    When fewer than four neighbours exist on a side, one positional marker is
    set for that side, named after how close the token is to the sentence
    boundary: 'BOS', 'Segunda_palavra', 'Terceira_palavra', 'Quarta_palavra'
    on the left; 'EOS', 'Ultima_palavra', 'Penultima_palavra',
    'Antepenultima_palavra' on the right.

    Args:
        sent: sequence of items whose element 0 is the token text.
        i: index of the token to describe.

    Returns:
        dict of feature name -> value.
    """
    def _consultar(token):
        # Lower-cased form, POS tag and cluster id of one token.
        minusculo = token.lower()
        etiqueta = tipoPostaggerTokens(token, dicPostagger)
        grupo = word2cluster[minusculo] if minusculo in word2cluster else "0"
        return minusculo, etiqueta, grupo

    def _vizinho(token, prefixo):
        # Features of a neighbouring token, keyed with an offset prefix such as '-2' or '+1'.
        minusculo, etiqueta, grupo = _consultar(token)
        return {
            prefixo + ':word.lower()': minusculo,
            prefixo + ':word.istitle()': token.istitle(),
            prefixo + ':word.isupper()': token.isupper(),
            prefixo + ':postag': etiqueta,
            prefixo + ':word.cluster': grupo,
        }

    word = sent[i][0]
    minusculo, postag, cluster = _consultar(word)
    features = {
        'bias': 1.0,
        'word.lower()': minusculo,
        'word[-3:]': word[-3:],
        'word[:3]': word[:3],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'word.cluster': cluster,
    }

    # Left context: walk outwards until the sentence start is reached. The
    # first missing neighbour sets its marker and ends the walk, because all
    # farther neighbours are missing too.
    marcadores_inicio = ('BOS', 'Segunda_palavra', 'Terceira_palavra', 'Quarta_palavra')
    for distancia, marcador in enumerate(marcadores_inicio, start=1):
        if i < distancia:
            features[marcador] = True
            break
        features.update(_vizinho(sent[i - distancia][0], '-' + str(distancia)))

    # Right context: same walk towards the sentence end.
    marcadores_fim = ('EOS', 'Ultima_palavra', 'Penultima_palavra', 'Antepenultima_palavra')
    tamanho = len(sent)
    for distancia, marcador in enumerate(marcadores_fim, start=1):
        if i >= tamanho - distancia:
            features[marcador] = True
            break
        features.update(_vizinho(sent[i + distancia][0], '+' + str(distancia)))

    return features


def sent2features(sent):
    """Return the feature dictionaries of all tokens of ``sent``, in order."""
    caracteristicas = []
    for posicao in range(len(sent)):
        caracteristicas.append(word2features(sent, posicao))
    return caracteristicas
       

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``.

    Args:
        sent: iterable of 2-item sequences ``(token, label)``.

    Returns:
        list with the labels, in sentence order.

    Raises:
        Whatever the unpacking raises (typically ValueError for an item that
        is not a pair); the offending sentence is printed before re-raising.
    """
    try:
        return [label for _token, label in sent]
    except Exception:
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt/SystemExit pass straight through.
        print(sent)
        raise
        

def sent2tokens(sent):
    """Return the token text (first element) of every item in ``sent``.

    Works for the ``(token, label)`` pairs used in this notebook as well as
    for ``(token, postag, label)`` triples; the previous implementation only
    accepted triples and raised ValueError on pairs.
    """
    return [item[0] for item in sent]

# Alternative cluster table (the cluster-50 file), kept for quick switching:
#word2cluster = read_clusters(r"clusters/cluster-50.tsv")
# Word -> cluster-id table that word2features reads as a module-level global.
word2cluster = read_clusters(r"clusters/cluster-300.tsv")


In [24]:
dicPostagger = load_obj('../spanclassification/obj/dic_postagger')
def tipoPostaggerTokens(token, dicPostagger):
    """Look up the POS tag of ``token`` (case-insensitively) in ``dicPostagger``.

    Args:
        token: token text.
        dicPostagger: mapping from lower-cased token to POS tag.

    Returns:
        The stored tag, or 'N' when the token is not in the mapping.
    """
    chave = token.lower()
    # when in doubt, assume 'N'
    return dicPostagger.get(chave) if chave in dicPostagger else 'N'
tipoPostaggerTokens('coração', dicPostagger)

Load obj em:  obj/../spanclassification/obj/dic_postagger.pkl


'N'

In [56]:
# Sentence dictionaries for the three splits, unpickled from ../spanclassification/obj.
# NOTE(review): load_obj unpickles these files — only load pickles from a trusted source.
dic_sentencesTrain = load_obj('../spanclassification/obj/dic_sentencesTrain')
dic_sentencesDev = load_obj('../spanclassification/obj/dic_sentencesDev')
dic_sentencesTest = load_obj('../spanclassification/obj/dic_sentencesTestNested')
# Peek at the first test entry: a [tokens, entities] pair where each token is
# [text, token index, number] (the third value is presumably a character offset — TODO confirm).
dic_sentencesTest[0]

Load obj em:  obj/../spanclassification/obj/dic_sentencesTrain.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesDev.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesTestNested.pkl


[[['Lucas', 0, 43],
  [',', 1, 48],
  ['74', 2, 50],
  ['anos', 3, 53],
  ['.', 4, 57]],
 []]

In [70]:
def gravarArquivosBinarios(dic_sentences, tipo):
    """Write the sentences to ``crf/nested_<tipo>.conll`` (one ``token tag`` per line).

    Each sentence is written once, every token tagged with the type of the
    first entity that covers it ('O' when none does). When some token is
    covered by more than one entity, the whole sentence is written a second
    time; in that copy each token takes the type of the last entity covering
    it, so all other tags are kept and only the overlapping tokens change.
    Sentences are separated by a blank line.

    Args:
        dic_sentences: mapping with keys 0..len-1; each value is
            ``[tokens, entities]`` where a token is ``[text, index, ...]`` and
            an entity is ``[text, [token indices], type]``.
        tipo: split name used in the file name (e.g. 'train', 'dev', 'test').

    Side effects:
        Creates the ``crf`` directory if needed, (over)writes the file and
        prints the number of entity tags and of token lines written.
    """
    def _linha(token, tag):
        # Spaces inside a token would break the space-separated format.
        tokenGravar = token[0].replace(' ', '').strip()
        return tokenGravar + ' ' + tag + '\n'

    num_entidade_total = 0
    num_entidade = 0

    # TODO - redo: an isolated entity is reportedly not being written.
    print('\nGravando arquivo de {} '.format(tipo))

    os.makedirs('crf', exist_ok=True)
    # `with` guarantees the file is closed even if a malformed entry raises.
    with open(r'crf/nested_'+tipo+'.conll', 'w', encoding='utf-8') as f_entidade:
        for i in range(len(dic_sentences)):
            tokens = dic_sentences[i][0]
            ents = dic_sentences[i][1]
            duplicaFrase = False

            # First copy: the first covering entity wins.
            for token in tokens:
                indiceToken = token[1]
                temEntidade = False
                tag = 'O'
                for ent in ents:
                    if indiceToken in ent[1]:
                        if not temEntidade:
                            tag = ent[2]
                            num_entidade = num_entidade + 1
                            temEntidade = True
                        else:
                            # A second entity covers this token: nested case.
                            duplicaFrase = True
                f_entidade.write(_linha(token, tag))
                num_entidade_total = num_entidade_total + 1
            f_entidade.write('\n')

            if duplicaFrase:
                # Second copy: the last covering entity wins. The former extra
                # test `indiceToken not in entidadesUsadas` compared an int
                # with a list of index lists and was therefore always true; it
                # was removed without changing the output.
                for token in tokens:
                    indiceToken = token[1]
                    tag = 'O'
                    for ent in ents:
                        if indiceToken in ent[1]:
                            tag = ent[2]
                            num_entidade = num_entidade + 1
                    f_entidade.write(_linha(token, tag))
                    num_entidade_total = num_entidade_total + 1
                f_entidade.write('\n')

    print('num_entidade:', num_entidade)
    print('num_entidade_total:', num_entidade_total)

gravarArquivosBinarios(dic_sentencesTest, 'test')
gravarArquivosBinarios(dic_sentencesTrain, 'train')
gravarArquivosBinarios(dic_sentencesDev, 'dev')


Gravando arquivo de test 
num_entidade: 2391
num_entidade_total: 6663

Gravando arquivo de train 
num_entidade: 6406
num_entidade_total: 16828

Gravando arquivo de dev 
num_entidade: 1555
num_entidade_total: 4777


In [73]:
#pathTrain=r'../spanclassification/preProcessamento/data-ner-binario/nested_train.conll'
#pathDev=r'../spanclassification/preProcessamento/data-ner-binario/nested_dev.conll'
#pathTest=r'../spanclassification/preProcessamento/data-ner-binario/nested_test.conll'

tipos = getTiposEntidade()
#tipos=['Anatomia']

# Same forward-slash paths that gravarArquivosBinarios writes to; the previous
# backslash form (r'crf\nested_...') only resolved on Windows.
pathTrain = 'crf/nested_train.conll'
pathDev = 'crf/nested_dev.conll'
pathTest = 'crf/nested_test.conll'


def _lerConll(path):
    """Read a ``token tag`` file into a list of sentences.

    Sentences are separated by blank lines and each one becomes a list of
    tuples obtained by splitting a line on single spaces. Blank lines and
    empty sentences (e.g. the one produced by the blank line that ends the
    file) are skipped, so they never reach sent2labels.
    """
    with open(path, encoding='utf-8') as f:
        blocos = f.read().split('\n\n')
    frases = []
    for bloco in blocos:
        frase = [tuple(linha.split(' ')) for linha in bloco.split('\n') if linha.strip()]
        if frase:
            frases.append(frase)
    return frases


testdata = _lerConll(pathTest)
devdata = _lerConll(pathDev)
traindata = _lerConll(pathTrain)

X_train = [sent2features(s) for s in traindata]
y_train = [sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]
#devdata[:2]
traindata[:2]

[[('Dispneia', 'Problema'),
  ('importante', 'Problema'),
  ('aos', 'Problema'),
  ('esforços', 'Problema'),
  ('+', 'O'),
  ('dor', 'Problema'),
  ('tipo', 'Problema'),
  ('peso', 'Problema'),
  ('no', 'Problema'),
  ('peito', 'Problema'),
  ('no', 'Problema'),
  ('esforço', 'Problema'),
  ('.', 'O')],
 [('Dispneia', 'Problema'),
  ('importante', 'Problema'),
  ('aos', 'Problema'),
  ('esforços', 'Problema'),
  ('+', 'O'),
  ('dor', 'Problema'),
  ('tipo', 'Problema'),
  ('peso', 'Problema'),
  ('no', 'Problema'),
  ('peito', 'Anatomia'),
  ('no', 'Problema'),
  ('esforço', 'Problema'),
  ('.', 'O')]]

In [74]:
X_test[0]

[{'bias': 1.0,
  'word.lower()': 'lucas',
  'word[-3:]': 'cas',
  'word[:3]': 'Luc',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'N',
  'word.cluster': '0',
  'BOS': True,
  '+1:word.lower()': ',',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'PU',
  '+1:word.cluster': '22',
  '+2:word.lower()': '74',
  '+2:word.istitle()': False,
  '+2:word.isupper()': False,
  '+2:postag': 'NUM',
  '+2:word.cluster': '299',
  '+3:word.lower()': 'anos',
  '+3:word.istitle()': False,
  '+3:word.isupper()': False,
  '+3:postag': 'N',
  '+3:word.cluster': '134',
  '+4:word.lower()': '.',
  '+4:word.istitle()': False,
  '+4:word.isupper()': False,
  '+4:postag': 'PU',
  '+4:word.cluster': '153'},
 {'bias': 1.0,
  'word.lower()': ',',
  'word[-3:]': ',',
  'word[:3]': ',',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'PU',
  'word.cluster': '22',
  '-1:word.lower()': 'lucas',
  '-1:

In [75]:
y_test[0:2]

[['O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'Problema',
  'O',
  'O',
  'O',
  'Tratamento',
  'Tratamento',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [76]:
print(len(X_train))
print(len(y_train))
print(len(X_train[0]))

1541
1541
13


In [77]:
print(len(traindata))
print(len(X_train))
print(len(y_train))

1541
1541
1541


## Janela de 4 vizinhos antes e depois

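É preciso reforçar os demais rótulos: se a frase duplicada for gravada apenas com "O", o modelo tende a achar que tudo é "O". Por isso é gerado um único arquivo, e a frase só é duplicada quando há entidades aninhadas (nested).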

In [78]:
import time

# crfsuite is an implementation of Conditional Random Fields.
#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

# Baseline CRF with fixed regularisation (c1, c2).
# `verbose` is a real boolean now; the former string 'true' only worked
# because any non-empty string is truthy.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          verbose=True,
          max_iterations=100,
          all_possible_transitions=False)

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments.
start = time.perf_counter()
# The dev split is passed as hold-out data, so each training iteration
# reports precision/recall/F1 on it.
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.perf_counter()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 1541/1541 [00:00<00:00, 2485.48it/s]





loading dev data to CRFsuite: 100%|██████████| 466/466 [00:00<00:00, 2251.18it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 33449
Seconds required: 0.197

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.06  loss=18005.74 active=33001 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.694 0.191  feature_norm=1.00
Iter 2   time=0.04  loss=16567.56 active=31276 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.694 0.191  feature_norm=1.07
Iter 3   time=0.03  loss=15273.65 active=30914 precision=0.205  recall=0.260  F1=0.221  Acc(item/seq)=0.602 0.152  feature_norm=2.04
Iter 4   time=0.03  loss=13563.52 active=32697 precision=0.201  recall=0.230  F1=0.213  Acc(item/seq)=0.663 0.191  feature_norm=1.94
Iter 5   time=0.03  loss=12821.43 active=32842 pr

Iter 61  time=0.04  loss=1408.29  active=15760 precision=0.815  recall=0.731  F1=0.759  Acc(item/seq)=0.870 0.567  feature_norm=52.36
Iter 62  time=0.03  loss=1407.23  active=15673 precision=0.821  recall=0.731  F1=0.763  Acc(item/seq)=0.873 0.573  feature_norm=52.39
Iter 63  time=0.04  loss=1406.16  active=15602 precision=0.814  recall=0.732  F1=0.760  Acc(item/seq)=0.870 0.567  feature_norm=52.43
Iter 64  time=0.04  loss=1405.22  active=15544 precision=0.821  recall=0.731  F1=0.762  Acc(item/seq)=0.873 0.571  feature_norm=52.46
Iter 65  time=0.04  loss=1404.27  active=15520 precision=0.815  recall=0.731  F1=0.760  Acc(item/seq)=0.870 0.564  feature_norm=52.50
Iter 66  time=0.04  loss=1403.38  active=15468 precision=0.818  recall=0.730  F1=0.761  Acc(item/seq)=0.871 0.569  feature_norm=52.53
Iter 67  time=0.03  loss=1402.46  active=15416 precision=0.826  recall=0.735  F1=0.767  Acc(item/seq)=0.873 0.567  feature_norm=52.57
Iter 68  time=0.03  loss=1401.65  active=15348 precision=0.828

In [79]:
import eli5

eli5.show_weights(crf, top=30)

From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,1.809,-1.481,-1.178,-1.155,0.169
O,-2.241,2.079,-1.182,-0.544,0.028
Problema,-0.419,-1.425,2.487,-1.546,-2.078
Teste,-0.568,0.24,-1.563,2.496,0.0
Tratamento,-0.404,-1.177,-3.009,-2.261,3.577

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.806,word.cluster:126,,,
+2.425,word.cluster:286,,,
+2.173,word.cluster:295,,,
+2.148,word.lower():ao,,,
+2.104,word[:3]:car,,,
+2.056,word.lower():cpp,,,
+1.981,word.lower():ae,,,
+1.981,word.cluster:170,,,
+1.980,word.lower():mmii,,,
+1.918,word.cluster:199,,,

Weight?,Feature
+3.806,word.cluster:126
+2.425,word.cluster:286
+2.173,word.cluster:295
+2.148,word.lower():ao
+2.104,word[:3]:car
+2.056,word.lower():cpp
+1.981,word.lower():ae
+1.981,word.cluster:170
+1.980,word.lower():mmii
+1.918,word.cluster:199

Weight?,Feature
+3.166,postag:PU
+2.759,word.cluster:93
+2.296,word.cluster:292
+2.246,word.cluster:23
+2.233,word[:3]:POS
+2.203,word.cluster:153
+2.103,word.cluster:69
+2.073,word.cluster:25
+2.039,postag:KC
+2.038,word.cluster:105

Weight?,Feature
+4.549,word.cluster:117
+3.151,word.lower():comorbidades
+2.848,word.cluster:118
+2.718,word.cluster:122
+2.326,-1:word.lower():sem
+2.317,word.cluster:255
+2.260,word.lower():ss
+2.183,word[:3]:DIS
+2.159,word[:3]:HIP
+2.040,word.cluster:200

Weight?,Feature
+6.067,word.cluster:284
+3.205,word.cluster:17
+2.877,word.cluster:260
+2.504,word.cluster:259
+2.387,word.lower():ecg
+2.267,word.cluster:289
+2.230,word.cluster:108
+2.195,word.cluster:193
+2.181,word[-3:]:ECG
+2.181,word[:3]:ECG

Weight?,Feature
+4.745,word.cluster:27
+3.262,word.cluster:211
+3.249,word.cluster:1
+2.421,word.cluster:273
+2.198,word.cluster:16
+2.096,+1:word.cluster:273
+1.937,word.lower():angioplastia
+1.877,word.lower():cx
+1.853,word[:3]:tra
+1.852,word.cluster:72


In [80]:
testdata[0]

[('Lucas', 'O'), (',', 'O'), ('74', 'O'), ('anos', 'O'), ('.', 'O')]

In [81]:
y_pred = crf.predict_marginals(X_test)
y_pred[:5]

[[{'Problema': 0.992177204220076,
   'O': 0.006201498917039438,
   'Anatomia': 0.0006639669346637978,
   'Teste': 0.00018642006944193484,
   'Tratamento': 0.000770909858778943},
  {'Problema': 0.005327598196854566,
   'O': 0.9946424779484917,
   'Anatomia': 1.7162040847125747e-05,
   'Teste': 4.415461413532976e-06,
   'Tratamento': 8.34635239313311e-06},
  {'Problema': 0.9243929585964284,
   'O': 0.06739710475763984,
   'Anatomia': 0.00034346724606564347,
   'Teste': 0.0006144088087414378,
   'Tratamento': 0.007252060591124852},
  {'Problema': 0.004553117716456589,
   'O': 0.9953848344421016,
   'Anatomia': 1.7008304938785828e-05,
   'Teste': 4.798204145419547e-06,
   'Tratamento': 4.024133235773012e-05},
  {'Problema': 0.014218262719198372,
   'O': 0.9856292423154552,
   'Anatomia': 5.134589099585765e-05,
   'Teste': 9.723048382459091e-05,
   'Tratamento': 3.918590526018474e-06},
  {'Problema': 0.9990616342973843,
   'O': 0.0003600983540736084,
   'Anatomia': 5.4253882977234525e-05,
 

In [101]:
y_pred[0][3]

{'Problema': 0.0009324260106915946,
 'O': 0.9986534899336298,
 'Anatomia': 1.8282167793443225e-05,
 'Teste': 0.00012592784136012113,
 'Tratamento': 0.0002698740465249659}

In [110]:

# Minimum marginal probability for a label to be kept for a token. Several
# labels can pass at once, which is how nested entities are recovered.
LIMIAR_PROBABILIDADE = 0.25

# For every sentence and every token, keep the labels whose marginal reaches
# the threshold. Each label is read from its own dictionary key instead of
# being inferred from its position in `values()`, so the result no longer
# depends on the order in which the model happens to list its labels.
probabilities3 = [
    [
        [rotulo for rotulo, prob in palavra.items() if prob >= LIMIAR_PROBABILIDADE]
        for palavra in frase
    ]
    for frase in y_pred
]

probabilities3

[[['O'], ['O'], ['O'], ['O'], ['O']],
 [['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O']],
 [['Problema'],
  ['O'],
  ['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O', 'Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O']],
 [['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O', 'Teste', 'Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O']],
 [['Problema

In [123]:
# Debug view: print gold entities next to the predicted labels for the first
# 11 test sentences.
# NOTE(review): gravarArquivosBinarios writes a sentence twice when entities
# overlap, so probabilities3 (built from the .conll file) can hold more
# entries than dic_sentencesTest; after the first duplicated sentence this zip
# would pair gold and predictions of different sentences — confirm.
i=0
for value, pred in zip(dic_sentencesTest.values(), probabilities3):
    print('---------------')
    # gold entities of the sentence
    print(value[1])
    #print(pred)
    # predicted labels of every token that is not plain 'O'
    indices_pred = [['', [j], p] for j, p in enumerate(pred) if p!=['O']]
    print(indices_pred)
    #for p in pred
    i=i+1
    if i>10:
        break

---------------
[]
[]
---------------
[['FA', [8], 'Problema'], ['marevan 5mg', [12, 13], 'Tratamento']]
[['', [8], ['Problema']], ['', [12], ['Tratamento']], ['', [13], ['Tratamento']]]
---------------
[['Comorbidades', [0], 'Problema'], ['DM', [2], 'Problema'], ['metformina 850mg', [9, 10], 'Tratamento'], ['acarbose', [16], 'Tratamento'], ['glicazida 60mg', [22, 23], 'Tratamento'], ['insulina', [29], 'Tratamento']]
[['', [0], ['Problema']], ['', [2], ['Problema']], ['', [9], ['Tratamento']], ['', [10], ['Tratamento']], ['', [16], ['Tratamento']], ['', [17], ['O', 'Tratamento']], ['', [22], ['Tratamento']], ['', [23], ['Tratamento']], ['', [29], ['Tratamento']]]
---------------
[['HAS', [0], 'Problema'], ['losartana 50mg', [7, 8], 'Tratamento'], ['digoxina', [12], 'Tratamento'], ['carvedilol 25', [20, 21], 'Tratamento'], ['HCTZ', [26], 'Tratamento']]
[['', [0], ['Problema']], ['', [7], ['Tratamento']], ['', [8], ['Tratamento']], ['', [12], ['O', 'Teste', 'Tratamento']], ['', [20], ['T

## BEST CRF

In [124]:
# Randomised search over the CRF regularisation strengths (c1, c2), using the
# word clusters currently loaded in word2cluster.

# Import the submodule explicitly: a plain `import scipy` does not guarantee
# that `scipy.stats` has been loaded.
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Score on the entity labels only, so that the dominant 'O' class does not
# inflate the F1.
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

# random_state fixes the sampled (c1, c2) candidates, making the search
# repeatable across kernel restarts.
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=42)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.6min finished


              precision    recall  f1-score   support

    Problema      0.774     0.821     0.797      1120
       Teste      0.904     0.801     0.849       366
  Tratamento      0.884     0.884     0.884       484
    Anatomia      0.696     0.447     0.544       262

   micro avg      0.812     0.787     0.800      2232
   macro avg      0.815     0.738     0.769      2232
weighted avg      0.810     0.787     0.795      2232

best params: {'c1': 0.07142426429277902, 'c2': 0.014106307808927127}
best CV score: 0.7425037694235993
model size: 0.65M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,1.963,-0.661,-1.665,-1.125,-0.623
O,-3.483,2.087,-2.602,-1.284,-1.816
Problema,-0.659,-1.057,2.455,-1.433,-3.77
Teste,-1.927,-0.201,-3.003,1.988,-4.229
Tratamento,-0.381,-0.632,-3.671,-2.355,3.468

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+4.735,word.cluster:126,,,
+3.446,word.lower():ao,,,
+3.356,word.lower():cpp,,,
+3.006,word.cluster:286,,,
+2.970,word.lower():acv,,,
+2.683,word[:3]:car,,,
+2.672,word.cluster:209,,,
+2.519,word[:3]:CAR,,,
+2.437,-3:word.lower():vd,,,
+2.436,word.lower():vao,,,

Weight?,Feature
+4.735,word.cluster:126
+3.446,word.lower():ao
+3.356,word.lower():cpp
+3.006,word.cluster:286
+2.970,word.lower():acv
+2.683,word[:3]:car
+2.672,word.cluster:209
+2.519,word[:3]:CAR
+2.437,-3:word.lower():vd
+2.436,word.lower():vao

Weight?,Feature
+4.491,postag:PU
+4.307,word.cluster:93
+4.087,word.cluster:15
+3.796,word.cluster:292
+3.671,postag:KC
+3.615,word.lower():ou
+3.564,word.cluster:69
+3.456,word[:3]:POS
+3.281,+2:word.cluster:187
+3.270,-1:word.lower():ra

Weight?,Feature
+6.296,word.cluster:117
+4.836,word.lower():comorbidades
+4.045,word.lower():chagas
+3.957,word.cluster:118
+3.851,-1:word.lower():fao
+3.725,-1:word.lower():quaisquer
+3.701,word.cluster:122
+3.603,word.lower():afebril
+3.570,word[:3]:Hip
+3.498,word[:3]:DIS

Weight?,Feature
+7.406,word.cluster:284
+4.902,word.cluster:17
+4.601,word.cluster:260
+4.507,word.cluster:259
+4.007,word.lower():ecg
+3.724,word.cluster:279
+3.718,word.cluster:108
+3.553,word.lower():avaliação
+3.443,+2:word.lower():84
+3.294,word.lower():exame

Weight?,Feature
+6.541,word.cluster:27
+5.872,word.cluster:1
+4.480,word.cluster:211
+4.067,word.cluster:132
+3.671,word.lower():cx
+3.655,word.cluster:5
+3.381,word[:3]:SEL
+3.298,word[:3]:tra
+3.158,+3:word.lower():safena
+3.111,word.cluster:72


In [134]:
# Per-token marginal probabilities of every label, from the tuned model.
y_pred = best_crf.predict_marginals(X_test)

# Minimum marginal probability for a label to be kept for a token. Several
# labels can pass at once, which is how nested entities are recovered.
LIMIAR_PROBABILIDADE = 0.25

# For every sentence and every token, keep the labels whose marginal reaches
# the threshold. Each label is read from its own dictionary key instead of
# being inferred from its position in `values()`, so the result no longer
# depends on the order in which the model happens to list its labels.
probabilities3 = [
    [
        [rotulo for rotulo, prob in palavra.items() if prob >= LIMIAR_PROBABILIDADE]
        for palavra in frase
    ]
    for frase in y_pred
]

probabilities3


[[['O'], ['O'], ['O'], ['O'], ['O']],
 [['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O']],
 [['Problema'],
  ['O'],
  ['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O']],
 [['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O']],
 [['Problema'],
  ['O'],
  ['O'

In [145]:
def _fraseDuplicadaNoConll(tokens, ents):
    """True when gravarArquivosBinarios wrote this sentence twice.

    That happens when at least one token index belongs to more than one
    entity's index list.
    """
    return any(
        sum(1 for ent in ents if token[1] in ent[1]) > 1
        for token in tokens
    )


# Pair every gold sentence with its prediction. The test .conll file holds an
# extra copy of each sentence with overlapping entities, so probabilities3 has
# one extra entry per such sentence; a plain zip() would drift out of step
# after the first one. Both copies have the same tokens and therefore the same
# features and marginals, so the extra entry is simply skipped.
dicPred = {}
predicoes = iter(probabilities3)
for i, value in enumerate(dic_sentencesTest.values()):
    pred = next(predicoes, None)
    if pred is None:
        # Fewer predictions than sentences: stop, as zip() would.
        break
    if _fraseDuplicadaNoConll(value[0], value[1]):
        next(predicoes, None)
    # Keep [text placeholder, token index, labels] for tokens not plain 'O'.
    indices_pred = [['', j, p] for j, p in enumerate(pred) if p != ['O']]
    dicPred[i] = [value[0], indices_pred]

dicPred[0]

[[['Lucas', 0, 43],
  [',', 1, 48],
  ['74', 2, 50],
  ['anos', 3, 53],
  ['.', 4, 57]],
 []]

In [146]:
dicPred[1]

[[['Em', 0, 59],
  ['acompanhamento', 1, 62],
  ['no', 2, 77],
  ['ambualtorio', 3, 80],
  ['há', 4, 92],
  ['5', 5, 95],
  ['anos', 6, 97],
  ['por', 7, 102],
  ['FA', 8, 106],
  [',', 9, 108],
  ['uso', 10, 110],
  ['de', 11, 114],
  ['marevan', 12, 117],
  ['5mg', 13, 125],
  ['1', 14, 129],
  ['x', 15, 131],
  ['ao', 16, 133],
  ['dia', 17, 136],
  ['.', 18, 139]],
 [['', 8, ['Problema']], ['', 12, ['Tratamento']], ['', 13, ['Tratamento']]]]

In [189]:
def getDicSentences2(dicGabaritoNested):
    """Merge per-token predictions into entity spans.

    Consecutive token indices carrying the same label are joined into a
    single span. Spans are tracked per label, so a token with several labels
    (nested entities) extends or opens one span per label without
    interrupting the others. The label 'O' never produces a span.

    Fixes over the previous version: an entity starting at token index 0 is
    no longer dropped; a multi-label token no longer splits the surrounding
    span; no spans of type 'O' are emitted.

    Args:
        dicGabaritoNested: mapping key -> ``[tokens, entities]`` where each
            entity is ``[text, token index, [labels]]``, ordered by token
            index.

    Returns:
        dict key -> ``[tokens, spans]`` where each span is
        ``[text joined with ' ', [token indices], label]``, ordered by the
        position at which the span starts.
    """
    dicGabaritoNested2 = {}
    for key, values in dicGabaritoNested.items():
        tokens = values[0]
        entidades = values[1]
        entidadesForm = []
        # label -> span still being extended (the same list object that is
        # stored in entidadesForm, so extending it updates the result too)
        abertas = {}
        for ent in entidades:
            for tipoEnt in ent[2]:
                if tipoEnt == 'O':
                    continue
                span = abertas.get(tipoEnt)
                if span is not None and ent[1] == span[1][-1] + 1:
                    # contiguous with the open span of this label: extend it
                    span[0] = span[0] + ' ' + ent[0]
                    span[1].append(ent[1])
                else:
                    # first token of a new span of this label
                    span = [ent[0], [ent[1]], tipoEnt]
                    abertas[tipoEnt] = span
                    entidadesForm.append(span)
        dicGabaritoNested2[key] = [tokens, entidadesForm]

    return dicGabaritoNested2


In [190]:
dicPred2 = getDicSentences2(dicPred)
dicPred2[0]

aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa
aaa


[[['Lucas', 0, 43],
  [',', 1, 48],
  ['74', 2, 50],
  ['anos', 3, 53],
  ['.', 4, 57]],
 []]

In [191]:
dicPred2[1]

[[['Em', 0, 59],
  ['acompanhamento', 1, 62],
  ['no', 2, 77],
  ['ambualtorio', 3, 80],
  ['há', 4, 92],
  ['5', 5, 95],
  ['anos', 6, 97],
  ['por', 7, 102],
  ['FA', 8, 106],
  [',', 9, 108],
  ['uso', 10, 110],
  ['de', 11, 114],
  ['marevan', 12, 117],
  ['5mg', 13, 125],
  ['1', 14, 129],
  ['x', 15, 131],
  ['ao', 16, 133],
  ['dia', 17, 136],
  ['.', 18, 139]],
 [['', [8], 'Problema'], [' ', [12, 13], 'Tratamento']]]