In [1]:
import nltk
import sklearn
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn_crfsuite as crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
import os
from pathlib import Path
import re
import pickle
import random
import numpy as np
import eli5

NUM_JANELA=4

In [5]:
def getTiposEntidade():
    return ['Problema','Teste','Tratamento','Anatomia']
    
def replaceWhiteSpaces(str):
    return re.sub('\s{2,}',' ',str)

def save_obj(name, obj):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def load_obj(name):
    print('Load obj em: ', 'obj/' + name + '.pkl')
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


In [23]:
def read_clusters(cluster_file):
    word2cluster = {}
    try:
        with open(cluster_file, encoding='utf-8') as i:
            for num, line in enumerate(i):
                if line:
                    word, cluster = line.strip().split('\t')
                    word2cluster[word] = cluster
    except:
        print(line)
        print(num)
        raise
    return word2cluster

def word2features(sent, i):
    word = sent[i][0]
    postag = tipoPostaggerTokens(word, dicPostagger)
    cluster = word2cluster[word.lower()] if word.lower() in word2cluster else "0"
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[:3]': word[:3],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'word.cluster': cluster
    }
    temFeatureOrdemPalavra = False
    temFeatureOrdemPalavraFinal = False
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:word.cluster': cluster
        })
    else:
        features['BOS'] = True
        temFeatureOrdemPalavra = True
    
    if i > 1:
        word1 = sent[i-2][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-2:word.lower()': word1.lower(),
            '-2:word.istitle()': word1.istitle(),
            '-2:word.isupper()': word1.isupper(),
            '-2:postag': postag1,
            '-2:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Segunda_palavra'] = True
            temFeatureOrdemPalavra = True

    if i > 2:
        word1 = sent[i-3][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-3:word.lower()': word1.lower(),
            '-3:word.istitle()': word1.istitle(),
            '-3:word.isupper()': word1.isupper(),
            '-3:postag': postag1,
            '-3:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Terceira_palavra'] = True
            temFeatureOrdemPalavra = True
    
    if i > 3:
        word1 = sent[i-4][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '-4:word.lower()': word1.lower(),
            '-4:word.istitle()': word1.istitle(),
            '-4:word.isupper()': word1.isupper(),
            '-4:postag': postag1,
            '-4:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavra:
            features['Quarta_palavra'] = True
            temFeatureOrdemPalavra = True
    
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:word.cluster': cluster
        })
    else:
        features['EOS'] = True
        temFeatureOrdemPalavraFinal = True
    if i < len(sent)-2:
        word1 = sent[i+2][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+2:word.lower()': word1.lower(),
            '+2:word.istitle()': word1.istitle(),
            '+2:word.isupper()': word1.isupper(),
            '+2:postag': postag1,
            '+2:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Ultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    if i < len(sent)-3:
        word1 = sent[i+3][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+3:word.lower()': word1.lower(),
            '+3:word.istitle()': word1.istitle(),
            '+3:word.isupper()': word1.isupper(),
            '+3:postag': postag1,
            '+3:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Penultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    
    if i < len(sent)-4:
        word1 = sent[i+4][0]
        postag1 = tipoPostaggerTokens(word1, dicPostagger)
        cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
        features.update({
            '+4:word.lower()': word1.lower(),
            '+4:word.istitle()': word1.istitle(),
            '+4:word.isupper()': word1.isupper(),
            '+4:postag': postag1,
            '+4:word.cluster': cluster
        })
    else:
        if not temFeatureOrdemPalavraFinal:
            features['Antepenultima_palavra'] = True
            temFeatureOrdemPalavraFinal = True
    

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
       

def sent2labels(sent):
    try:
        return [label for token, label in sent]
    except:
        print(sent)
        raise
        

def sent2tokens(sent):
    return [token for token, postag, label in sent]

#word2cluster = read_clusters(r"clusters/cluster-50.tsv")
word2cluster = read_clusters(r"clusters/cluster-300.tsv")


In [24]:
dicPostagger = load_obj('../spanclassification/obj/dic_postagger')
def tipoPostaggerTokens(token, dicPostagger):
    postagger = 'N' # na duvida é N
    if token.lower() in dicPostagger.keys():
        postagger = dicPostagger.get(token.lower())
    return postagger
tipoPostaggerTokens('coração', dicPostagger)

Load obj em:  obj/../spanclassification/obj/dic_postagger.pkl


'N'

In [56]:
dic_sentencesTrain = load_obj('../spanclassification/obj/dic_sentencesTrain')
dic_sentencesDev = load_obj('../spanclassification/obj/dic_sentencesDev')
dic_sentencesTest = load_obj('../spanclassification/obj/dic_sentencesTestNested')
dic_sentencesTest[0]

Load obj em:  obj/../spanclassification/obj/dic_sentencesTrain.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesDev.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesTestNested.pkl


[[['Lucas', 0, 43],
  [',', 1, 48],
  ['74', 2, 50],
  ['anos', 3, 53],
  ['.', 4, 57]],
 []]

In [70]:
def gravarArquivosBinarios(dic_sentences, tipo):
    # gerar arquivo treinamento
    f_entidade = open(r'crf/nested_'+tipo+'.conll', 'w', encoding='utf-8')

    num_entidade_total=0
    num_entidade=0

    # TODO - refazer.. qdo vem entidade isolada, nao está gravando...
    print('\nGravando arquivo de {} '.format(tipo))

    for i in range(len(dic_sentences)):
        tokens = dic_sentences[i][0]
        ents = dic_sentences[i][1]
        indiceEnts=[]
        duplicaFrase=False
        entidadesUsadas=[]
        for token in tokens:
            #print('token:', token)
            indiceToken = token[1]
            temEntidade=False
            tag='O'
            for ent in ents:
                #print(ent)
                if indiceToken in ent[1]: #and ent[2]==entidade:
                    if not temEntidade:
                        tag = ent[2]
                        num_entidade=num_entidade+1
                        temEntidade = True
                        entidadesUsadas.append(ent[1])
                    else:
                        #print('aaaaaaaaa:', ents)
                        duplicaFrase = True
                    #break
            #if tag != entidade:
            #    tag='O'
            tokenGravar = token[0].replace(' ','')
            tokenGravar = tokenGravar.strip()
            f_entidade.write(tokenGravar+' '+tag+'\n')
            num_entidade_total=num_entidade_total+1
        f_entidade.write('\n')
        
        if duplicaFrase:
            for token in tokens:
                #print('token:', token)
                indiceToken = token[1]
                tag='O'
                for ent in ents:
                    #print(ent)
                    if indiceToken in ent[1] and indiceToken not in entidadesUsadas:
                        tag = ent[2]
                        num_entidade=num_entidade+1
                        temEntidade = True
                tokenGravar = token[0].replace(' ','')
                tokenGravar = tokenGravar.strip()
                f_entidade.write(tokenGravar+' '+tag+'\n')
                num_entidade_total=num_entidade_total+1
            f_entidade.write('\n')
        #if i>15:
        #    break
    f_entidade.close()

    print('num_entidade:', num_entidade)
    print('num_entidade_total:', num_entidade_total)

gravarArquivosBinarios(dic_sentencesTest, 'test')
gravarArquivosBinarios(dic_sentencesTrain, 'train')
gravarArquivosBinarios(dic_sentencesDev, 'dev')


Gravando arquivo de test 
num_entidade: 2391
num_entidade_total: 6663

Gravando arquivo de train 
num_entidade: 6406
num_entidade_total: 16828

Gravando arquivo de dev 
num_entidade: 1555
num_entidade_total: 4777


In [73]:
#pathTrain=r'../spanclassification/preProcessamento/data-ner-binario/nested_train.conll'
#pathDev=r'../spanclassification/preProcessamento/data-ner-binario/nested_dev.conll'
#pathTest=r'../spanclassification/preProcessamento/data-ner-binario/nested_test.conll'

tipos = getTiposEntidade()
#tipos=['Anatomia']
X_train = []
X_dev = []
X_test = []
y_train = []
y_dev = []
y_test = []

pathTrain=r'crf\nested_train.conll'
pathDev=r'crf\nested_dev.conll'
pathTest=r'crf\nested_test.conll'

with open(pathTest, encoding='utf-8') as f:
  testdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]
with open(pathDev, encoding='utf-8') as f:
  devdata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]
with open(pathTrain, encoding='utf-8') as f:
  traindata = [[tuple(w.split(' ')) for w in snt.split('\n')] for snt in f.read().split('\n\n')]

X_train = [sent2features(s) for s in traindata]
y_train = [sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]
#devdata[:2]
traindata[:2]

[[('Dispneia', 'Problema'),
  ('importante', 'Problema'),
  ('aos', 'Problema'),
  ('esforços', 'Problema'),
  ('+', 'O'),
  ('dor', 'Problema'),
  ('tipo', 'Problema'),
  ('peso', 'Problema'),
  ('no', 'Problema'),
  ('peito', 'Problema'),
  ('no', 'Problema'),
  ('esforço', 'Problema'),
  ('.', 'O')],
 [('Dispneia', 'Problema'),
  ('importante', 'Problema'),
  ('aos', 'Problema'),
  ('esforços', 'Problema'),
  ('+', 'O'),
  ('dor', 'Problema'),
  ('tipo', 'Problema'),
  ('peso', 'Problema'),
  ('no', 'Problema'),
  ('peito', 'Anatomia'),
  ('no', 'Problema'),
  ('esforço', 'Problema'),
  ('.', 'O')]]

In [74]:
X_test[0]

[{'bias': 1.0,
  'word.lower()': 'lucas',
  'word[-3:]': 'cas',
  'word[:3]': 'Luc',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'N',
  'word.cluster': '0',
  'BOS': True,
  '+1:word.lower()': ',',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'PU',
  '+1:word.cluster': '22',
  '+2:word.lower()': '74',
  '+2:word.istitle()': False,
  '+2:word.isupper()': False,
  '+2:postag': 'NUM',
  '+2:word.cluster': '299',
  '+3:word.lower()': 'anos',
  '+3:word.istitle()': False,
  '+3:word.isupper()': False,
  '+3:postag': 'N',
  '+3:word.cluster': '134',
  '+4:word.lower()': '.',
  '+4:word.istitle()': False,
  '+4:word.isupper()': False,
  '+4:postag': 'PU',
  '+4:word.cluster': '153'},
 {'bias': 1.0,
  'word.lower()': ',',
  'word[-3:]': ',',
  'word[:3]': ',',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'PU',
  'word.cluster': '22',
  '-1:word.lower()': 'lucas',
  '-1:

In [75]:
y_test[0:2]

[['O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'Problema',
  'O',
  'O',
  'O',
  'Tratamento',
  'Tratamento',
  'O',
  'O',
  'O',
  'O',
  'O']]

In [76]:
print(len(X_train))
print(len(y_train))
print(len(X_train[0]))

1541
1541
13


In [77]:
print(len(traindata))
print(len(X_train))
print(len(y_train))

1541
1541
1541


## Janela de 4 vizinhos antes e depois

Precisa reforçar os outros.. se mandar O, vai achar q é tudo O.. gera um arquivo só, só duplica a frase qdo tem nested..

In [78]:
import time

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False)

start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 1541/1541 [00:00<00:00, 2485.48it/s]





loading dev data to CRFsuite: 100%|██████████| 466/466 [00:00<00:00, 2251.18it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 33449
Seconds required: 0.197

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.06  loss=18005.74 active=33001 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.694 0.191  feature_norm=1.00
Iter 2   time=0.04  loss=16567.56 active=31276 precision=0.139  recall=0.200  F1=0.164  Acc(item/seq)=0.694 0.191  feature_norm=1.07
Iter 3   time=0.03  loss=15273.65 active=30914 precision=0.205  recall=0.260  F1=0.221  Acc(item/seq)=0.602 0.152  feature_norm=2.04
Iter 4   time=0.03  loss=13563.52 active=32697 precision=0.201  recall=0.230  F1=0.213  Acc(item/seq)=0.663 0.191  feature_norm=1.94
Iter 5   time=0.03  loss=12821.43 active=32842 pr

Iter 61  time=0.04  loss=1408.29  active=15760 precision=0.815  recall=0.731  F1=0.759  Acc(item/seq)=0.870 0.567  feature_norm=52.36
Iter 62  time=0.03  loss=1407.23  active=15673 precision=0.821  recall=0.731  F1=0.763  Acc(item/seq)=0.873 0.573  feature_norm=52.39
Iter 63  time=0.04  loss=1406.16  active=15602 precision=0.814  recall=0.732  F1=0.760  Acc(item/seq)=0.870 0.567  feature_norm=52.43
Iter 64  time=0.04  loss=1405.22  active=15544 precision=0.821  recall=0.731  F1=0.762  Acc(item/seq)=0.873 0.571  feature_norm=52.46
Iter 65  time=0.04  loss=1404.27  active=15520 precision=0.815  recall=0.731  F1=0.760  Acc(item/seq)=0.870 0.564  feature_norm=52.50
Iter 66  time=0.04  loss=1403.38  active=15468 precision=0.818  recall=0.730  F1=0.761  Acc(item/seq)=0.871 0.569  feature_norm=52.53
Iter 67  time=0.03  loss=1402.46  active=15416 precision=0.826  recall=0.735  F1=0.767  Acc(item/seq)=0.873 0.567  feature_norm=52.57
Iter 68  time=0.03  loss=1401.65  active=15348 precision=0.828

In [79]:
import eli5

eli5.show_weights(crf, top=30)

From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,1.809,-1.481,-1.178,-1.155,0.169
O,-2.241,2.079,-1.182,-0.544,0.028
Problema,-0.419,-1.425,2.487,-1.546,-2.078
Teste,-0.568,0.24,-1.563,2.496,0.0
Tratamento,-0.404,-1.177,-3.009,-2.261,3.577

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.806,word.cluster:126,,,
+2.425,word.cluster:286,,,
+2.173,word.cluster:295,,,
+2.148,word.lower():ao,,,
+2.104,word[:3]:car,,,
+2.056,word.lower():cpp,,,
+1.981,word.lower():ae,,,
+1.981,word.cluster:170,,,
+1.980,word.lower():mmii,,,
+1.918,word.cluster:199,,,

Weight?,Feature
+3.806,word.cluster:126
+2.425,word.cluster:286
+2.173,word.cluster:295
+2.148,word.lower():ao
+2.104,word[:3]:car
+2.056,word.lower():cpp
+1.981,word.lower():ae
+1.981,word.cluster:170
+1.980,word.lower():mmii
+1.918,word.cluster:199

Weight?,Feature
+3.166,postag:PU
+2.759,word.cluster:93
+2.296,word.cluster:292
+2.246,word.cluster:23
+2.233,word[:3]:POS
+2.203,word.cluster:153
+2.103,word.cluster:69
+2.073,word.cluster:25
+2.039,postag:KC
+2.038,word.cluster:105

Weight?,Feature
+4.549,word.cluster:117
+3.151,word.lower():comorbidades
+2.848,word.cluster:118
+2.718,word.cluster:122
+2.326,-1:word.lower():sem
+2.317,word.cluster:255
+2.260,word.lower():ss
+2.183,word[:3]:DIS
+2.159,word[:3]:HIP
+2.040,word.cluster:200

Weight?,Feature
+6.067,word.cluster:284
+3.205,word.cluster:17
+2.877,word.cluster:260
+2.504,word.cluster:259
+2.387,word.lower():ecg
+2.267,word.cluster:289
+2.230,word.cluster:108
+2.195,word.cluster:193
+2.181,word[-3:]:ECG
+2.181,word[:3]:ECG

Weight?,Feature
+4.745,word.cluster:27
+3.262,word.cluster:211
+3.249,word.cluster:1
+2.421,word.cluster:273
+2.198,word.cluster:16
+2.096,+1:word.cluster:273
+1.937,word.lower():angioplastia
+1.877,word.lower():cx
+1.853,word[:3]:tra
+1.852,word.cluster:72


In [80]:
testdata[0]

[('Lucas', 'O'), (',', 'O'), ('74', 'O'), ('anos', 'O'), ('.', 'O')]

In [81]:
y_pred = crf.predict_marginals(X_dev)
y_pred[:5]

[[{'Problema': 0.992177204220076,
   'O': 0.006201498917039438,
   'Anatomia': 0.0006639669346637978,
   'Teste': 0.00018642006944193484,
   'Tratamento': 0.000770909858778943},
  {'Problema': 0.005327598196854566,
   'O': 0.9946424779484917,
   'Anatomia': 1.7162040847125747e-05,
   'Teste': 4.415461413532976e-06,
   'Tratamento': 8.34635239313311e-06},
  {'Problema': 0.9243929585964284,
   'O': 0.06739710475763984,
   'Anatomia': 0.00034346724606564347,
   'Teste': 0.0006144088087414378,
   'Tratamento': 0.007252060591124852},
  {'Problema': 0.004553117716456589,
   'O': 0.9953848344421016,
   'Anatomia': 1.7008304938785828e-05,
   'Teste': 4.798204145419547e-06,
   'Tratamento': 4.024133235773012e-05},
  {'Problema': 0.014218262719198372,
   'O': 0.9856292423154552,
   'Anatomia': 5.134589099585765e-05,
   'Teste': 9.723048382459091e-05,
   'Tratamento': 3.918590526018474e-06},
  {'Problema': 0.9990616342973843,
   'O': 0.0003600983540736084,
   'Anatomia': 5.4253882977234525e-05,
 

In [101]:
y_pred[0][3]

{'Problema': 0.0009324260106915946,
 'O': 0.9986534899336298,
 'Anatomia': 1.8282167793443225e-05,
 'Teste': 0.00012592784136012113,
 'Tratamento': 0.0002698740465249659}

In [109]:

probabilities3 = []
for num, y in enumerate(y_pred): # para cada frase
    probabilities2 = []
    # para cada palavra da frase
    #print('y:', y)
    for palavra in y:
        sequence_prob = []
        #print('palavra:', palavra)
        v = palavra.values()
        #print('v:', v)
        for valor in v:
            if valor >=0.25:
                sequence_prob.append('ENT')
            else:
                sequence_prob.append('O')

        #print('sequence_prob:', sequence_prob)
        entidades = []
        for i, valor in enumerate(sequence_prob):
            if i==0 and valor=='ENT': # Problema
                entidades.append('Problema')
            if i==1 and valor=='ENT': # O
                entidades.append('O')
            if i==2 and valor=='ENT': # Anatomia
                entidades.append('Anatomia')
            if i==3 and valor=='ENT': # Teste
                entidades.append('Teste')
            if i==4 and valor=='ENT': # Tratamento
                entidades.append('Tratamento')
        probabilities2.append(entidades)
    probabilities3.append(probabilities2)
    #if num>5:
    #    break

              
            
probabilities3

y: [{'Problema': 0.004393964671940167, 'O': 0.9928741862105271, 'Anatomia': 0.0001606806599963722, 'Teste': 0.0014384353798509873, 'Tratamento': 0.0011327330776847197}, {'Problema': 4.80901943882659e-07, 'O': 0.9999550694678075, 'Anatomia': 3.6729647097224244e-07, 'Teste': 1.9280256209206254e-05, 'Tratamento': 2.4802077567925276e-05}, {'Problema': 3.85303819864706e-05, 'O': 0.9996734119568931, 'Anatomia': 1.010521757170217e-06, 'Teste': 8.586798205637833e-06, 'Tratamento': 0.00027846034115724396}, {'Problema': 0.0009324260106915946, 'O': 0.9986534899336298, 'Anatomia': 1.8282167793443225e-05, 'Teste': 0.00012592784136012113, 'Tratamento': 0.0002698740465249659}, {'Problema': 4.896492638804341e-07, 'O': 0.9999884305954868, 'Anatomia': 4.228689102534893e-06, 'Teste': 4.957577318260102e-06, 'Tratamento': 1.893488828566144e-06}]
palavra: {'Problema': 0.004393964671940167, 'O': 0.9928741862105271, 'Anatomia': 0.0001606806599963722, 'Teste': 0.0014384353798509873, 'Tratamento': 0.00113273307

[[['O'], ['O'], ['O'], ['O'], ['O']],
 [['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O']],
 [['Problema'],
  ['O'],
  ['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O', 'Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O']],
 [['Problema'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O', 'Teste', 'Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['Tratamento'],
  ['O'],
  ['O'],
  ['O'],
  ['O'],
  ['Tratamento'],
  ['O']],
 [['Problema

# janela de 10 - nao melhora

In [106]:
# cluster 50

import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.9min finished


              precision    recall  f1-score   support

         ENT      0.892     0.723     0.799      1806

   micro avg      0.892     0.723     0.799      1806
   macro avg      0.892     0.723     0.799      1806
weighted avg      0.892     0.723     0.799      1806





NameError: name 'OUTPUT_PATH' is not defined

In [108]:
import eli5

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)

best params: {'c1': 0.05408072153584, 'c2': 0.025342765201054995}
best CV score: 0.7954955818528687
model size: 0.70M




From \ To,ENT,O
ENT,0.612,-0.982
O,-0.986,0.347

Weight?,Feature
Weight?,Feature
+3.509,+1:word.lower():irradiaçao
+2.966,+3:word.lower():troca
+2.848,+3:word.lower():várias
+2.789,+1:word.lower():duração
+2.636,+1:word.lower():cateterismo
+2.283,+1:word.lower():sentado
+1.870,+2:word.lower():cavidade
+1.853,-1:postag:ENT
+1.813,-2:word.lower():>
+1.779,word.lower():muita

Weight?,Feature
+3.509,+1:word.lower():irradiaçao
+2.966,+3:word.lower():troca
+2.848,+3:word.lower():várias
+2.789,+1:word.lower():duração
+2.636,+1:word.lower():cateterismo
+2.283,+1:word.lower():sentado
+1.870,+2:word.lower():cavidade
+1.853,-1:postag:ENT
+1.813,-2:word.lower():>
+1.779,word.lower():muita

Weight?,Feature
+4.745,+1:word.lower():ss
+3.927,word[:3]:nor
+3.865,-1:word.lower():mg
+3.865,-3:word.lower():mg
+3.780,word.lower():refere
+3.687,word.lower():ou
+3.645,postag:PU
+3.637,EOS
+3.527,word[-3:]:mHg
+3.496,-2:word.lower():levotirox


In [112]:
# cluster 300, janela 5

#X_train = word2features(lista, word2cluster, dicPostagger)
#X_train = [sent2features(s, word2cluster) for s in train_sents]
X_train = [sent2features(s) for s in traindata]
y_train =[sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]

import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  5.9min finished


              precision    recall  f1-score   support

         ENT      0.892     0.709     0.790      1806

   micro avg      0.892     0.709     0.790      1806
   macro avg      0.892     0.709     0.790      1806
weighted avg      0.892     0.709     0.790      1806

best params: {'c1': 0.17041205830636758, 'c2': 0.00275779785894667}
best CV score: 0.8004865319740259
model size: 0.56M


From \ To,ENT,O
ENT,0.75,-0.685
O,-0.806,0.519

Weight?,Feature
Weight?,Feature
+2.831,+1:word.lower():irradiaçao
+2.826,word.lower():peito
+2.545,+3:word.lower():várias
+2.391,word[:3]:Mui
+2.274,+4:word.lower():angiosplatia
+2.229,word.lower():muita
+2.112,+4:word.lower():150
+1.965,+3:word.lower():troca
+1.945,+3:word.lower():safena
+1.904,+1:word.lower():duração

Weight?,Feature
+2.831,+1:word.lower():irradiaçao
+2.826,word.lower():peito
+2.545,+3:word.lower():várias
+2.391,word[:3]:Mui
+2.274,+4:word.lower():angiosplatia
+2.229,word.lower():muita
+2.112,+4:word.lower():150
+1.965,+3:word.lower():troca
+1.945,+3:word.lower():safena
+1.904,+1:word.lower():duração

Weight?,Feature
+7.735,word.cluster:93
+6.606,word.cluster:298
+6.043,word.lower():ou
+6.010,word.lower():mmhg
+5.877,word.cluster:25
+5.605,EOS
+4.981,+1:word.cluster:193
+4.849,word.cluster:248
+4.740,word.lower():há
+4.424,word.lower():nega


In [121]:
tag_index = best_crf.classes_.index('ENT')
print(tag_index)
tag_index = best_crf.classes_.index('O')
print(tag_index)

0
1


In [125]:
import time

# class_weight={'0': 1, '1': 5})

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False,  class_weight={'0': 2, '1': 1})

tag_index = best_crf.classes_.index('ENT')
print('ENT:', tag_index)
tag_index = best_crf.classes_.index('O')
print('O', tag_index)

# sem balanceamento - 77,9 de F1
start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

#print('CRF model was trained!')

TypeError: __init__() got an unexpected keyword argument 'class_weight'

In [65]:
import time

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False)

start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 1319/1319 [00:00<00:00, 3501.49it/s]





loading dev data to CRFsuite: 100%|██████████| 416/416 [00:00<00:00, 3423.07it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 26345
Seconds required: 0.100

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.03  loss=13044.14 active=26009 precision=0.143  recall=0.200  F1=0.167  Acc(item/seq)=0.717 0.214  feature_norm=1.00
Iter 2   time=0.02  loss=11803.30 active=24798 precision=0.143  recall=0.200  F1=0.167  Acc(item/seq)=0.717 0.214  feature_norm=1.03
Iter 3   time=0.02  loss=9435.91  active=24645 precision=0.483  recall=0.341  F1=0.312  Acc(item/seq)=0.769 0.243  feature_norm=1.88
Iter 4   time=0.02  loss=8234.84  active=25550 precision=0.501  recall=0.353  F1=0.364  Acc(item/seq)=0.796 0.250  feature_norm=2.11
Iter 5   time=0.02  loss=6902.72  active=25798 pr

Iter 64  time=0.02  loss=796.28   active=8051  precision=0.936  recall=0.844  F1=0.882  Acc(item/seq)=0.956 0.690  feature_norm=46.79
Iter 65  time=0.02  loss=795.95   active=8037  precision=0.933  recall=0.845  F1=0.881  Acc(item/seq)=0.956 0.685  feature_norm=46.81
Iter 66  time=0.02  loss=795.59   active=8027  precision=0.936  recall=0.844  F1=0.881  Acc(item/seq)=0.956 0.688  feature_norm=46.82
Iter 67  time=0.02  loss=795.35   active=8019  precision=0.933  recall=0.845  F1=0.881  Acc(item/seq)=0.956 0.685  feature_norm=46.84
Iter 68  time=0.02  loss=795.01   active=8005  precision=0.936  recall=0.844  F1=0.881  Acc(item/seq)=0.956 0.688  feature_norm=46.84
Iter 69  time=0.02  loss=794.77   active=7986  precision=0.933  recall=0.846  F1=0.881  Acc(item/seq)=0.956 0.688  feature_norm=46.87
Iter 70  time=0.02  loss=794.49   active=7971  precision=0.936  recall=0.844  F1=0.881  Acc(item/seq)=0.956 0.688  feature_norm=46.87
Iter 71  time=0.02  loss=794.26   active=7948  precision=0.933

## Janela de 3 vizinhos antes e depois


In [72]:
import time

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False)

start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 1319/1319 [00:00<00:00, 4303.40it/s]





loading dev data to CRFsuite: 100%|██████████| 416/416 [00:00<00:00, 4037.73it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 21792
Seconds required: 0.097

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.03  loss=12913.99 active=21519 precision=0.143  recall=0.200  F1=0.167  Acc(item/seq)=0.717 0.214  feature_norm=1.00
Iter 2   time=0.02  loss=11685.19 active=20574 precision=0.143  recall=0.200  F1=0.167  Acc(item/seq)=0.717 0.214  feature_norm=1.09
Iter 3   time=0.02  loss=8670.78  active=20435 precision=0.473  recall=0.365  F1=0.337  Acc(item/seq)=0.775 0.248  feature_norm=2.50
Iter 4   time=0.02  loss=7264.99  active=21202 precision=0.503  recall=0.418  F1=0.441  Acc(item/seq)=0.825 0.255  feature_norm=2.82
Iter 5   time=0.02  loss=6352.30  active=21377 pr

Iter 63  time=0.02  loss=819.72   active=7182  precision=0.948  recall=0.845  F1=0.884  Acc(item/seq)=0.955 0.685  feature_norm=47.76
Iter 64  time=0.02  loss=819.39   active=7167  precision=0.947  recall=0.845  F1=0.883  Acc(item/seq)=0.955 0.685  feature_norm=47.77
Iter 65  time=0.02  loss=819.14   active=7146  precision=0.948  recall=0.845  F1=0.884  Acc(item/seq)=0.955 0.685  feature_norm=47.77
Iter 66  time=0.02  loss=818.86   active=7143  precision=0.948  recall=0.845  F1=0.884  Acc(item/seq)=0.955 0.685  feature_norm=47.78
Iter 67  time=0.02  loss=818.66   active=7132  precision=0.948  recall=0.844  F1=0.883  Acc(item/seq)=0.955 0.685  feature_norm=47.77
Iter 68  time=0.02  loss=818.39   active=7110  precision=0.948  recall=0.845  F1=0.884  Acc(item/seq)=0.955 0.685  feature_norm=47.79
Iter 69  time=0.02  loss=818.16   active=7101  precision=0.948  recall=0.843  F1=0.883  Acc(item/seq)=0.955 0.685  feature_norm=47.78
Iter 70  time=0.02  loss=817.92   active=7103  precision=0.948

## Janela de 2 vizinhos antes e depois

In [52]:
import time

#crfsuite is an implementation for Conditional Random Field

#!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF

crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         verbose='true',
         max_iterations = 100,
         all_possible_transitions = False)

start = time.time()
crf.fit(X_train, y_train, X_dev=X_dev, y_dev=y_dev)
#crf.fit(X_train, y_train)
stop = time.time()
print(f"Training time: {round(stop - start,2)}s")

print('CRF model was trained!')

loading training data to CRFsuite: 100%|██████████| 1319/1319 [00:00<00:00, 5221.01it/s]





loading dev data to CRFsuite: 100%|██████████| 416/416 [00:00<00:00, 4914.06it/s]



Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 17071
Seconds required: 0.085

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.02  loss=12838.87 active=16868 precision=0.143  recall=0.200  F1=0.167  Acc(item/seq)=0.717 0.214  feature_norm=1.00
Iter 2   time=0.01  loss=11581.15 active=16125 precision=0.143  recall=0.200  F1=0.167  Acc(item/seq)=0.717 0.214  feature_norm=1.12
Iter 3   time=0.01  loss=7980.63  active=16058 precision=0.484  recall=0.427  F1=0.426  Acc(item/seq)=0.814 0.267  feature_norm=2.84
Iter 4   time=0.01  loss=6654.99  active=16604 precision=0.522  recall=0.450  F1=0.476  Acc(item/seq)=0.846 0.308  feature_norm=3.28
Iter 5   time=0.01  loss=6065.41  active=16832 pr

Iter 73  time=0.01  loss=856.55   active=6080  precision=0.947  recall=0.849  F1=0.886  Acc(item/seq)=0.955 0.690  feature_norm=49.58
Iter 74  time=0.01  loss=856.40   active=6074  precision=0.948  recall=0.848  F1=0.886  Acc(item/seq)=0.955 0.692  feature_norm=49.59
Iter 75  time=0.01  loss=856.28   active=6062  precision=0.947  recall=0.850  F1=0.887  Acc(item/seq)=0.956 0.690  feature_norm=49.60
Iter 76  time=0.01  loss=856.15   active=6056  precision=0.948  recall=0.849  F1=0.887  Acc(item/seq)=0.956 0.692  feature_norm=49.61
Iter 77  time=0.01  loss=856.05   active=6054  precision=0.947  recall=0.850  F1=0.887  Acc(item/seq)=0.956 0.690  feature_norm=49.62
Iter 78  time=0.01  loss=855.93   active=6047  precision=0.948  recall=0.849  F1=0.886  Acc(item/seq)=0.955 0.692  feature_norm=49.63
Iter 79  time=0.02  loss=855.82   active=6049  precision=0.948  recall=0.849  F1=0.887  Acc(item/seq)=0.956 0.692  feature_norm=49.64
Iter 80  time=0.01  loss=855.71   active=6046  precision=0.949

In [54]:
#!pip install eli5

import eli5

eli5.show_weights(crf, top=30)

From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,0.881,0.052,-1.514,-0.643,0.0
O,0.662,0.665,-0.069,0.073,1.14
Problema,0.0,0.624,1.156,-0.212,0.0
Teste,0.0,0.746,-0.472,1.131,0.0
Tratamento,0.0,0.054,-1.455,-1.122,1.997

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+2.220,word.lower():cpp,,,
+2.181,-1:word.lower():carotida,,,
+2.146,+1:word.lower():estenose,,,
+2.027,+2:word.lower():20,,,
+2.018,word.lower():pulmonar,,,
+1.921,word.cluster:41,,,
+1.878,word.lower():mmii,,,
+1.770,word[:3]:ABD,,,
+1.761,word.lower():abdome,,,
+1.752,word.lower():abd,,,

Weight?,Feature
+2.220,word.lower():cpp
+2.181,-1:word.lower():carotida
+2.146,+1:word.lower():estenose
+2.027,+2:word.lower():20
+2.018,word.lower():pulmonar
+1.921,word.cluster:41
+1.878,word.lower():mmii
+1.770,word[:3]:ABD
+1.761,word.lower():abdome
+1.752,word.lower():abd

Weight?,Feature
+3.534,postag:PU
+3.421,-1:word.lower():mg
+3.354,+1:word.lower():ss
+3.183,word.lower():refere
+3.164,word.lower():nega
+2.989,-2:word.lower():faleceu
+2.719,word.cluster:2
+2.640,postag:KC
+2.423,word.lower():ou
+2.324,word.cluster:43

Weight?,Feature
+3.294,-1:word.lower():sem
+2.955,+1:word.lower():irradiaçao
+2.856,+1:word.lower():estavel
+2.717,+1:postag:Problema
+2.432,word.lower():ss
+2.431,word[-3:]:DES
+2.305,word[:3]:Hip
+2.101,"+1:word.lower():,"
+2.083,word[:3]:INF
+2.061,word[:3]:HIP

Weight?,Feature
+4.032,word.cluster:46
+3.404,word.cluster:45
+2.852,word.cluster:16
+2.626,word.lower():exame
+2.427,word.cluster:48
+2.423,word.cluster:37
+2.408,word[:3]:PA
+2.408,word[-3:]:PA
+2.408,word.lower():pa
+2.044,word[:3]:ECO

Weight?,Feature
+2.856,+1:postag:Tratamento
+2.749,word.cluster:28
+2.498,+1:word.lower():5mg
+2.481,word[:3]:Med
+2.341,word[:3]:SEL
+2.118,word[:3]:MED
+2.094,word[-3:]:ACO
+2.093,word.lower():angioplastia
+2.015,word[-3:]:lol
+1.998,-1:postag:Tratamento


# Finding the optimal hyperparameters


## Cluster 50

In [73]:
import scipy
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.6min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000027F06C8FD00>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000027F06C8FCD0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['Problema', 'Teste', 'Tratamento', 'Anatomia']),
                   verbose=1)

In [75]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.025025294252710672, 'c2': 0.0044731390067481855}
best CV score: 0.9085100773498992
model size: 0.38M


In [76]:
# com janela de 3
best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

              precision    recall  f1-score   support

    Problema      0.952     0.955     0.954       795
       Teste      0.930     0.909     0.920       308
  Tratamento      0.968     0.944     0.956       446
    Anatomia      0.895     0.716     0.795        95

   micro avg      0.950     0.929     0.939      1644
   macro avg      0.936     0.881     0.906      1644
weighted avg      0.949     0.929     0.939      1644



In [56]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.0710867769766405, 'c2': 0.007025286647796449}
best CV score: 0.9114276297168044
model size: 0.26M


In [57]:
# com janela de 2
best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

              precision    recall  f1-score   support

    Problema      0.955     0.955     0.955       795
       Teste      0.936     0.899     0.917       308
  Tratamento      0.968     0.935     0.951       446
    Anatomia      0.918     0.705     0.798        95

   micro avg      0.953     0.925     0.939      1644
   macro avg      0.944     0.874     0.905      1644
weighted avg      0.953     0.925     0.938      1644





In [58]:
import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_ner_cluster50"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

['models/best_crf_ner_cluster50']

## Cluster 5


In [78]:
word2cluster = read_clusters(r"clusters/cluster-5.tsv")

X_train = [sent2features(s) for s in traindata]
y_train =[sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]


crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_ner_cluster5"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.3min finished


              precision    recall  f1-score   support

    Problema      0.948     0.948     0.948       795
       Teste      0.955     0.896     0.925       308
  Tratamento      0.961     0.948     0.955       446
    Anatomia      0.899     0.747     0.816        95

   micro avg      0.951     0.927     0.939      1644
   macro avg      0.941     0.885     0.911      1644
weighted avg      0.950     0.927     0.938      1644

best params: {'c1': 0.05058570873780858, 'c2': 0.02849278797510965}
best CV score: 0.9041782839454671
model size: 0.47M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,0.783,0.054,-1.842,-0.873,-0.912
O,0.027,0.793,0.038,0.047,0.547
Problema,-0.887,0.0,1.34,-0.033,-2.287
Teste,-1.053,0.193,-0.857,0.708,-2.495
Tratamento,-1.85,0.011,-1.38,-1.378,2.035

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.410,word.lower():ae,,,
+3.260,word.lower():cpp,,,
+3.048,word.lower():mmii,,,
+3.025,word.lower():pulmonar,,,
+2.856,+1:word.lower():estenose,,,
+2.749,+2:word.lower():20,,,
+2.607,word.lower():ve,,,
+2.598,-1:word.lower():carotida,,,
+2.598,-3:word.lower():carotida,,,
+2.533,word[:3]:Abd,,,

Weight?,Feature
+3.410,word.lower():ae
+3.260,word.lower():cpp
+3.048,word.lower():mmii
+3.025,word.lower():pulmonar
+2.856,+1:word.lower():estenose
+2.749,+2:word.lower():20
+2.607,word.lower():ve
+2.598,-1:word.lower():carotida
+2.598,-3:word.lower():carotida
+2.533,word[:3]:Abd

Weight?,Feature
+5.234,postag:PU
+4.631,+1:word.lower():ss
+3.761,-2:word.lower():faleceu
+2.978,+1:word.lower():dor
+2.967,word.lower():refere
+2.950,word[-3:]:mHg
+2.936,EOS
+2.891,word[:3]:nor
+2.840,+1:word.lower():função
+2.836,+1:word.lower():lab

Weight?,Feature
+5.272,+1:word.lower():irradiaçao
+3.666,+1:word.lower():estavel
+3.544,-1:word.lower():sem
+3.459,word[:3]:Hip
+3.229,word[-3:]:DES
+3.199,word[:3]:INF
+3.126,word[:3]:Dis
+3.086,-1:postag:Problema
+2.981,word.lower():ss
+2.973,word[-3:]:eia

Weight?,Feature
+4.372,word[:3]:ECO
+3.844,+2:word.lower():84
+3.454,word[:3]:CIN
+3.451,word.lower():exames
+3.430,word.lower():exame
+3.373,word[:3]:PA
+3.373,word.lower():pa
+3.373,word[-3:]:PA
+3.357,word.lower():holter
+3.313,word[:3]:CAT

Weight?,Feature
+4.185,word[:3]:MED
+4.175,word[:3]:Med
+4.158,word.lower():angioplastia
+3.499,+1:word.lower():5mg
+3.480,-1:postag:Tratamento
+3.465,word.lower():medicamentos
+3.449,word.lower():atc
+3.353,+1:postag:Tratamento
+3.326,word.lower():tratamento
+3.274,word[:3]:SEL


## Cluster 10

In [79]:
word2cluster = read_clusters(r"clusters/cluster-10.tsv")

X_train = [sent2features(s) for s in traindata]
y_train =[sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]


crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_ner_cluster10"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished


              precision    recall  f1-score   support

    Problema      0.952     0.952     0.952       795
       Teste      0.933     0.903     0.917       308
  Tratamento      0.959     0.935     0.947       446
    Anatomia      0.895     0.716     0.795        95

   micro avg      0.948     0.925     0.936      1644
   macro avg      0.935     0.876     0.903      1644
weighted avg      0.947     0.925     0.935      1644

best params: {'c1': 0.03143530062660093, 'c2': 0.011407537452349636}
best CV score: 0.9025164743902709
model size: 0.44M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,0.586,0.013,-1.79,-1.039,-1.092
O,-0.019,0.985,-0.124,-0.044,0.321
Problema,-1.606,-0.297,1.782,-0.458,-3.177
Teste,-1.132,0.214,-0.886,1.062,-3.339
Tratamento,-2.827,-0.375,-1.711,-1.922,1.816

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+4.058,word.lower():mmii,,,
+3.988,word.lower():ve,,,
+3.936,word.lower():cpp,,,
+3.720,word.lower():ae,,,
+3.419,word.lower():pulmonar,,,
+3.378,+1:word.lower():estenose,,,
+3.261,-1:word.lower():carotida,,,
+3.261,-3:word.lower():carotida,,,
+3.246,word.lower():vd,,,
+3.106,word[:3]:Abd,,,

Weight?,Feature
+4.058,word.lower():mmii
+3.988,word.lower():ve
+3.936,word.lower():cpp
+3.720,word.lower():ae
+3.419,word.lower():pulmonar
+3.378,+1:word.lower():estenose
+3.261,-1:word.lower():carotida
+3.261,-3:word.lower():carotida
+3.246,word.lower():vd
+3.106,word[:3]:Abd

Weight?,Feature
+6.095,postag:PU
+5.403,+1:word.lower():ss
+5.345,word[:3]:POS
+4.983,-2:word.lower():faleceu
+4.343,word.lower():x
+4.157,word.lower():refere
+3.877,word[:3]:evi
+3.754,+1:word.lower():lab
+3.693,word.lower():ou
+3.643,EOS

Weight?,Feature
+6.316,+1:word.lower():irradiaçao
+4.485,word[:3]:Hip
+4.224,+1:word.lower():estavel
+4.157,word[:3]:INF
+3.998,word.lower():ss
+3.988,-1:word.lower():sem
+3.610,word.lower():tabagismo
+3.569,word.lower():dor
+3.542,word[-3:]:DES
+3.485,word.lower():chagas

Weight?,Feature
+4.799,+2:word.lower():84
+4.358,word[:3]:CAT
+4.074,word.lower():exame
+4.042,word.lower():cate
+3.898,word.lower():avaliação
+3.878,word.cluster:9
+3.831,word[:3]:ECO
+3.651,word[-3:]:PA
+3.651,word[:3]:PA
+3.651,word.lower():pa

Weight?,Feature
+5.151,word.lower():angioplastia
+5.056,word[:3]:Med
+4.497,word.lower():tratamento
+4.330,word[:3]:MED
+4.061,word[:3]:SEL
+4.048,+1:word.lower():5mg
+3.656,word.lower():cx
+3.557,word.lower():rvm
+3.549,word.lower():atc
+3.507,word.lower():anticoagulação


## Cluster 50

In [80]:
word2cluster = read_clusters(r"clusters/cluster-10.tsv")

X_train = [sent2features(s) for s in traindata]
y_train =[sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]


crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_ner_cluster10"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.5min finished


              precision    recall  f1-score   support

    Problema      0.957     0.951     0.954       795
       Teste      0.939     0.903     0.921       308
  Tratamento      0.961     0.935     0.948       446
    Anatomia      0.875     0.737     0.800        95

   micro avg      0.951     0.925     0.938      1644
   macro avg      0.933     0.881     0.906      1644
weighted avg      0.950     0.925     0.937      1644

best params: {'c1': 0.022118110448088233, 'c2': 0.027860023218946085}
best CV score: 0.9038591874502191
model size: 0.58M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,0.838,-0.018,-1.727,-0.736,-1.032
O,0.062,0.889,-0.274,0.013,0.311
Problema,-0.985,-0.067,1.571,-0.278,-2.435
Teste,-0.898,0.314,-0.885,0.931,-2.576
Tratamento,-2.364,-0.335,-1.662,-1.477,1.746

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.197,word.lower():mmii,,,
+3.036,word.lower():cpp,,,
+2.870,word.lower():ve,,,
+2.775,word.lower():ae,,,
+2.645,-1:word.lower():carotida,,,
+2.645,-3:word.lower():carotida,,,
+2.538,word.lower():pulmonar,,,
+2.483,+1:word.lower():estenose,,,
+2.427,+2:word.lower():20,,,
+2.396,word.lower():acv,,,

Weight?,Feature
+3.197,word.lower():mmii
+3.036,word.lower():cpp
+2.870,word.lower():ve
+2.775,word.lower():ae
+2.645,-1:word.lower():carotida
+2.645,-3:word.lower():carotida
+2.538,word.lower():pulmonar
+2.483,+1:word.lower():estenose
+2.427,+2:word.lower():20
+2.396,word.lower():acv

Weight?,Feature
+4.876,postag:PU
+4.217,+1:word.lower():ss
+3.822,-2:word.lower():faleceu
+3.498,word[:3]:POS
+3.338,word.lower():x
+3.115,word.lower():refere
+3.086,word.cluster:8
+2.993,word.lower():ou
+2.873,+1:word.lower():familiar
+2.773,+1:word.lower():dor

Weight?,Feature
+4.948,+1:word.lower():irradiaçao
+3.605,-1:word.lower():sem
+3.579,word[:3]:Hip
+3.374,+1:word.lower():estavel
+3.306,word.lower():ss
+2.825,word[:3]:HIP
+2.777,word[:3]:Dis
+2.725,word[-3:]:DES
+2.723,word.lower():tabagismo
+2.628,+1:word.lower():há

Weight?,Feature
+3.776,+2:word.lower():84
+3.472,word.cluster:9
+3.301,word[:3]:CAT
+3.182,word.lower():exame
+3.090,word[:3]:PA
+3.090,word.lower():pa
+3.090,word[-3:]:PA
+3.069,word.lower():cate
+2.972,word[:3]:ECO
+2.886,word.lower():avaliação

Weight?,Feature
+3.866,word.lower():angioplastia
+3.817,word[:3]:Med
+3.748,word[:3]:MED
+3.290,+1:word.lower():5mg
+3.181,-1:postag:Tratamento
+3.148,word.lower():tratamento
+2.974,word.lower():medicações
+2.902,word.lower():atc
+2.745,word.lower():cx
+2.735,word[:3]:SEL


## Cluster 100 

In [81]:
word2cluster = read_clusters(r"clusters/cluster-100.tsv")

X_train = [sent2features(s) for s in traindata]
y_train =[sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]

crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_ner_cluster100"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.2s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished


              precision    recall  f1-score   support

    Problema      0.945     0.956     0.951       795
       Teste      0.942     0.899     0.920       308
  Tratamento      0.970     0.937     0.953       446
    Anatomia      0.895     0.716     0.795        95

   micro avg      0.949     0.926     0.938      1644
   macro avg      0.938     0.877     0.905      1644
weighted avg      0.948     0.926     0.937      1644

best params: {'c1': 0.029081952698140505, 'c2': 0.01750489686490583}
best CV score: 0.911749142381645
model size: 0.45M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,0.838,0.005,-1.903,-0.762,-0.957
O,0.016,0.707,0.033,0.1,0.313
Problema,-1.514,-0.476,1.614,-0.394,-2.92
Teste,-0.796,0.245,-0.925,1.506,-2.795
Tratamento,-2.686,-0.592,-1.648,-1.655,1.523

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+3.786,word.cluster:36,,,
+3.205,word.lower():mmii,,,
+3.045,word.lower():ae,,,
+2.937,+1:word.lower():estenose,,,
+2.910,-1:word.lower():carotida,,,
+2.910,-3:word.lower():carotida,,,
+2.873,word.lower():cpp,,,
+2.757,+2:word.lower():20,,,
+2.646,word.cluster:30,,,
+2.588,word.lower():ve,,,

Weight?,Feature
+3.786,word.cluster:36
+3.205,word.lower():mmii
+3.045,word.lower():ae
+2.937,+1:word.lower():estenose
+2.910,-1:word.lower():carotida
+2.910,-3:word.lower():carotida
+2.873,word.lower():cpp
+2.757,+2:word.lower():20
+2.646,word.cluster:30
+2.588,word.lower():ve

Weight?,Feature
+5.294,+1:word.lower():ss
+5.152,postag:PU
+4.301,word.cluster:26
+4.278,word.lower():x
+4.128,-2:word.lower():faleceu
+3.743,word.cluster:4
+3.516,word[:3]:POS
+3.338,-2:word.lower():ambu
+3.325,+1:word.lower():lab
+3.259,+1:word.lower():função

Weight?,Feature
+6.140,+1:word.lower():irradiaçao
+4.437,word.cluster:27
+3.871,-1:word.lower():sem
+3.665,word.lower():comorbidades
+3.570,+3:word.lower():várias
+3.532,word.lower():ss
+3.492,+1:word.lower():estavel
+3.305,word[-3:]:DES
+3.295,word[:3]:Hip
+3.282,word[:3]:INF

Weight?,Feature
+8.768,word.cluster:95
+5.252,word.cluster:91
+4.672,word.cluster:5
+3.441,+2:word.lower():84
+3.374,word.cluster:82
+3.136,word.lower():fr
+2.962,word.lower():exame
+2.708,word.lower():pa
+2.708,word[-3:]:PA
+2.708,word[:3]:PA

Weight?,Feature
+4.472,word.cluster:73
+4.053,-1:postag:Tratamento
+3.978,word.cluster:6
+3.626,+1:word.lower():5mg
+3.313,+1:postag:Tratamento
+3.265,word[:3]:SEL
+3.244,word[:3]:Med
+3.133,word[-3:]:lol
+2.970,word.lower():anticoagulação
+2.683,word.cluster:25


## Cluster 300 (melhor)

In [82]:
word2cluster = read_clusters(r"clusters/cluster-300.tsv")

X_train = [sent2features(s) for s in traindata]
y_train =[sent2labels(s) for s in traindata]
X_dev = [sent2features(s) for s in devdata]
y_dev = [sent2labels(s) for s in devdata]
X_test = [sent2features(s) for s in testdata]
y_test = [sent2labels(s) for s in testdata]


crf = crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=getTiposEntidade())

rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

best_crf = rs.best_estimator_
y_pred = best_crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=getTiposEntidade(), digits=3
))

import joblib
import os

OUTPUT_PATH = "models/"
OUTPUT_FILE = "best_crf_ner_cluster300"

if not os.path.exists(OUTPUT_PATH):
    os.mkdir(OUTPUT_PATH)

joblib.dump(best_crf, os.path.join(OUTPUT_PATH, OUTPUT_FILE))

print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

eli5.show_weights(best_crf, top=30)



Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.4min finished


              precision    recall  f1-score   support

    Problema      0.958     0.952     0.955       795
       Teste      0.936     0.903     0.919       308
  Tratamento      0.977     0.960     0.968       446
    Anatomia      0.900     0.758     0.823        95

   micro avg      0.956     0.934     0.945      1644
   macro avg      0.943     0.893     0.916      1644
weighted avg      0.956     0.934     0.944      1644

best params: {'c1': 0.04358790768620843, 'c2': 0.01833886983260396}
best CV score: 0.9160357967942182
model size: 0.40M


From \ To,Anatomia,O,Problema,Teste,Tratamento
Anatomia,0.614,0.044,-1.95,-0.976,-1.232
O,0.065,1.041,0.118,0.106,0.301
Problema,-1.457,-0.045,1.722,-0.448,-2.836
Teste,-1.133,0.27,-0.861,0.948,-2.87
Tratamento,-2.542,-0.199,-1.72,-1.766,1.572

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
+5.746,word.cluster:126,,,
+3.515,word.cluster:295,,,
+3.174,word.cluster:209,,,
+2.989,+1:word.lower():estenose,,,
+2.956,+2:word.lower():20,,,
+2.855,word.lower():cpp,,,
+2.810,word.lower():mmii,,,
+2.807,-3:word.lower():carotida,,,
+2.807,-1:word.lower():carotida,,,
+2.741,word.lower():ae,,,

Weight?,Feature
+5.746,word.cluster:126
+3.515,word.cluster:295
+3.174,word.cluster:209
+2.989,+1:word.lower():estenose
+2.956,+2:word.lower():20
+2.855,word.lower():cpp
+2.810,word.lower():mmii
+2.807,-3:word.lower():carotida
+2.807,-1:word.lower():carotida
+2.741,word.lower():ae

Weight?,Feature
+4.759,-2:word.lower():faleceu
+4.645,+1:word.lower():ss
+4.633,word.cluster:93
+3.766,postag:PU
+3.440,+1:word.lower():lab
+3.313,+1:word.lower():função
+3.207,word.cluster:129
+3.197,word.cluster:25
+3.151,word.cluster:292
+3.074,word.lower():nega

Weight?,Feature
+6.085,word.cluster:117
+5.852,+1:word.lower():irradiaçao
+3.852,+1:word.lower():estavel
+3.834,-1:word.lower():sem
+3.615,word[:3]:INF
+3.591,word.lower():comorbidades
+3.579,word.lower():ss
+3.078,word[-3:]:DES
+3.027,+3:word.lower():várias
+2.994,word.lower():afebril

Weight?,Feature
+8.673,word.cluster:284
+6.067,word.cluster:17
+5.002,word.cluster:260
+4.472,word.cluster:259
+4.035,+2:word.lower():84
+3.803,word.cluster:279
+3.456,word.cluster:130
+3.294,word[:3]:ECG
+3.294,word[-3:]:ECG
+3.244,word.lower():ecg

Weight?,Feature
+6.933,word.cluster:27
+5.566,word.cluster:1
+4.626,word.cluster:211
+3.755,+1:word.lower():5mg
+3.693,-1:postag:Tratamento
+3.468,word.cluster:132
+3.364,word.cluster:222
+3.313,+1:postag:Tratamento
+3.102,word[:3]:Med
+3.081,word.lower():angioplastia


## Agora com as regioes
(pode até ser binário)

In [83]:
# gerar as possibeis regioes, se são entidades, atribuir a label, senao, recebe 'O'

dic_sentencesTrain = load_obj(r'../spanclassification/obj/dic_sentencesTrain')
dic_sentencesDev = load_obj(r'../spanclassification/obj/dic_sentencesDev')
dic_sentencesTest = load_obj(r'../spanclassification/obj/dic_sentencesTest')
dic_sentencesTest[2]

Load obj em:  obj/../spanclassification/obj/dic_sentencesTrain.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesDev.pkl
Load obj em:  obj/../spanclassification/obj/dic_sentencesTest.pkl


[[['Comorbidades', 0, 142],
  [':', 1, 154],
  ['DM', 2, 156],
  ['há', 3, 159],
  ['10', 4, 162],
  ['anos', 5, 165],
  ['em', 6, 170],
  ['uso', 7, 173],
  ['de', 8, 177],
  ['metformina', 9, 180],
  ['850mg', 10, 191],
  ['3', 11, 197],
  ['cp', 12, 199],
  ['/', 13, 201],
  ['dia', 14, 202],
  [',', 15, 205],
  ['acarbose', 16, 207],
  ['1', 17, 216],
  ['cp', 18, 218],
  ['/', 19, 220],
  ['dia', 20, 221],
  ['e', 21, 225],
  ['glicazida', 22, 227],
  ['60mg', 23, 237],
  ['2', 24, 242],
  ['cp', 25, 244],
  ['/', 26, 246],
  ['dia', 27, 247],
  ['e', 28, 251],
  ['insulina', 29, 253],
  ['(', 30, 262],
  ['24', 31, 263],
  ['-', 32, 266],
  ['0', 33, 268],
  ['-', 34, 270],
  ['24', 35, 272],
  [')', 36, 274],
  ['.', 37, 275]],
 [['Comorbidades', [0], 'Problema'],
  ['DM', [2], 'Problema'],
  ['metformina 850mg', [9, 10], 'Tratamento'],
  ['acarbose', [16], 'Tratamento'],
  ['glicazida 60mg', [22, 23], 'Tratamento'],
  ['insulina', [29], 'Tratamento']]]

In [84]:
def powerset(entidade):    
    lista=list()
    for i in range(1, len(entidade)+1):
        #lista.append(entidade[i-1])
        for j in range(len(entidade) - i + 1):
            lista.append(entidade[j:j + i])
    return lista
        
#print(list(powerset([4, 5, 6])))
print(powerset(['dor', 'no', 'peito']))

[['dor'], ['no'], ['peito'], ['dor', 'no'], ['no', 'peito'], ['dor', 'no', 'peito']]


In [153]:
# aqui - gerar exemplos negativos tbm e add no dic
#dic_sentencesTest = load_obj(r'../spanclassification/obj/dic_sentencesTest')

def getListaCombinacaoEntidades(dic_sentences):
    listaRetorno=list()
    for key,value in dic_sentences.items():
        tokens=value[0]
        soTokens=[t[0] for t in tokens]
        soIndices=[t[1] for t in tokens]
        maiorIndice = soIndices[-1]
        #print('soIndices:', soIndices)
        entidadesPositivas=value[1]
        if len(entidadesPositivas)>0:
            #entidadesPositivasIndices=entidadesPositivas[1]
            entidadesPositivasIndices=[ent[1] for ent in entidadesPositivas]
        else:
            entidadesPositivasIndices=[]
        #print('tokens:', tokens)
        #print('entidadesPositivas:',entidadesPositivas)
        #print('entidadesPositivasIndices:',entidadesPositivasIndices)
        dicTokens={}
        for token in tokens:
            #print('token:', token)
            dicTokens[token[1]]=token[0]
        # fazendo a combinação
        indicesNovaEntidade=powerset(soIndices)
        #print(indicesNovaEntidade)
        listaEntidadesFrase=list()
        for indiceNovaEntidade in indicesNovaEntidade:
            vizinhosAntes = list()
            vizinhosDepois = list()
            label=0
            #print('indiceNovaEntidade:',indiceNovaEntidade)
            #print('entidadesPositivasIndices:',entidadesPositivasIndices)
            if indiceNovaEntidade in entidadesPositivasIndices:
                #print('ja tem')
                label=1
            tokensNovaEntidade = [dicTokens[i] for i in indiceNovaEntidade]
            inicioEnt = indiceNovaEntidade[0]
            fimEnt = indiceNovaEntidade[-1]
            #print('inicioEnt:', inicioEnt)
            #print('fimEnt:', fimEnt)
            for i in range(inicioEnt-1, inicioEnt-4, -1):
                if i>=0:
                    vizinhosAntes.append(i)
            for i in range(fimEnt+1, fimEnt+5, 1):
                if i<=maiorIndice:
                    vizinhosDepois.append(i)
            vizinhosAntes.sort()
            vizinhosAntesTokens=[dicTokens[i] for i in vizinhosAntes]
            vizinhosDepoisTokens=[dicTokens[i] for i in vizinhosDepois]
            listaEntidadesFrase.append([' '.join(tokensNovaEntidade), vizinhosAntesTokens, vizinhosDepoisTokens, label])
        listaRetorno.append(listaEntidadesFrase)
        listaEntidadesFrase=list()
        #listaEntidadesFrase.append([indiceNovaEntidade, vizinhosAntes, vizinhosDepois, label])
        #print(listaEntidadesFrase)
        #print(powerset(['dor', 'no', 'peito']))
    return listaRetorno

In [154]:
listaTest = getListaCombinacaoEntidades(dic_sentencesTest)
listaTest[0:2]

[[['Lucas', [], [',', '74', 'anos', '.'], 0],
  [',', ['Lucas'], ['74', 'anos', '.'], 0],
  ['74', ['Lucas', ','], ['anos', '.'], 0],
  ['anos', ['Lucas', ',', '74'], ['.'], 0],
  ['.', [',', '74', 'anos'], [], 0],
  ['Lucas ,', [], ['74', 'anos', '.'], 0],
  [', 74', ['Lucas'], ['anos', '.'], 0],
  ['74 anos', ['Lucas', ','], ['.'], 0],
  ['anos .', ['Lucas', ',', '74'], [], 0],
  ['Lucas , 74', [], ['anos', '.'], 0],
  [', 74 anos', ['Lucas'], ['.'], 0],
  ['74 anos .', ['Lucas', ','], [], 0],
  ['Lucas , 74 anos', [], ['.'], 0],
  [', 74 anos .', ['Lucas'], [], 0],
  ['Lucas , 74 anos .', [], [], 0]],
 [['Em', [], ['acompanhamento', 'no', 'ambualtorio', 'há'], 0],
  ['acompanhamento', ['Em'], ['no', 'ambualtorio', 'há', '5'], 0],
  ['no', ['Em', 'acompanhamento'], ['ambualtorio', 'há', '5', 'anos'], 0],
  ['ambualtorio',
   ['Em', 'acompanhamento', 'no'],
   ['há', '5', 'anos', 'por'],
   0],
  ['há',
   ['acompanhamento', 'no', 'ambualtorio'],
   ['5', 'anos', 'por', 'FA'],
   0],


In [161]:
listaDev = getListaCombinacaoEntidades(dic_sentencesDev)
listaDev[0:2]

[[['HAS', [], [',', 'ICC', ',', 'nega'], 1],
  [',', ['HAS'], ['ICC', ',', 'nega', 'DM'], 0],
  ['ICC', ['HAS', ','], [',', 'nega', 'DM', '.'], 1],
  [',', ['HAS', ',', 'ICC'], ['nega', 'DM', '.'], 0],
  ['nega', [',', 'ICC', ','], ['DM', '.'], 0],
  ['DM', ['ICC', ',', 'nega'], ['.'], 1],
  ['.', [',', 'nega', 'DM'], [], 0],
  ['HAS ,', [], ['ICC', ',', 'nega', 'DM'], 0],
  [', ICC', ['HAS'], [',', 'nega', 'DM', '.'], 0],
  ['ICC ,', ['HAS', ','], ['nega', 'DM', '.'], 0],
  [', nega', ['HAS', ',', 'ICC'], ['DM', '.'], 0],
  ['nega DM', [',', 'ICC', ','], ['.'], 0],
  ['DM .', ['ICC', ',', 'nega'], [], 0],
  ['HAS , ICC', [], [',', 'nega', 'DM', '.'], 0],
  [', ICC ,', ['HAS'], ['nega', 'DM', '.'], 0],
  ['ICC , nega', ['HAS', ','], ['DM', '.'], 0],
  [', nega DM', ['HAS', ',', 'ICC'], ['.'], 0],
  ['nega DM .', [',', 'ICC', ','], [], 0],
  ['HAS , ICC ,', [], ['nega', 'DM', '.'], 0],
  [', ICC , nega', ['HAS'], ['DM', '.'], 0],
  ['ICC , nega DM', ['HAS', ','], ['.'], 0],
  [', nega D

In [155]:
listaTrain = getListaCombinacaoEntidades(dic_sentencesTrain)
listaTrain[0:2]

[[['Dispneia', [], ['importante', 'aos', 'esforços', '+'], 0],
  ['importante', ['Dispneia'], ['aos', 'esforços', '+', 'dor'], 0],
  ['aos', ['Dispneia', 'importante'], ['esforços', '+', 'dor', 'tipo'], 0],
  ['esforços',
   ['Dispneia', 'importante', 'aos'],
   ['+', 'dor', 'tipo', 'peso'],
   0],
  ['+', ['importante', 'aos', 'esforços'], ['dor', 'tipo', 'peso', 'no'], 0],
  ['dor', ['aos', 'esforços', '+'], ['tipo', 'peso', 'no', 'peito'], 0],
  ['tipo', ['esforços', '+', 'dor'], ['peso', 'no', 'peito', 'no'], 0],
  ['peso', ['+', 'dor', 'tipo'], ['no', 'peito', 'no', 'esforço'], 0],
  ['no', ['dor', 'tipo', 'peso'], ['peito', 'no', 'esforço', '.'], 0],
  ['peito', ['tipo', 'peso', 'no'], ['no', 'esforço', '.'], 1],
  ['no', ['peso', 'no', 'peito'], ['esforço', '.'], 0],
  ['esforço', ['no', 'peito', 'no'], ['.'], 0],
  ['.', ['peito', 'no', 'esforço'], [], 0],
  ['Dispneia importante', [], ['aos', 'esforços', '+', 'dor'], 0],
  ['importante aos', ['Dispneia'], ['esforços', '+', 'do

In [190]:
def read_clusters(cluster_file):
    word2cluster = {}
    try:
        with open(cluster_file, encoding='utf-8') as i:
            for num, line in enumerate(i):
                if line:
                    word, cluster = line.strip().split('\t')
                    word2cluster[word] = cluster
    except:
        print(line)
        print(num)
        raise
    return word2cluster


def word2features(sent, word2cluster, dicPostagger):
    try:
        features = list()
        entidades = sent[0]
        for entidade in entidades.split():
            postag = tipoPostaggerTokens(entidade, dicPostagger)
            features.extend([
            'bias',
            'word.lower=' + entidade.lower(),
            'word[-3:]=' + entidade[-3:],
            'word[:3]=' + entidade[:3],
            'word.isupper=%s' % entidade.isupper(),
            'word.istitle=%s' % entidade.istitle(),
            'word.isdigit=%s' % entidade.isdigit(),
            'word.cluster=%s' % word2cluster[entidade.lower()] if entidade.lower() in word2cluster else "0",
            'postag=' + postag,
            ])
        # palavras anteriores
        vizinhosAntes = sent[1] 
        if len(vizinhosAntes)>0:
            for num, vizinhoAntes in enumerate(vizinhosAntes):
                word1 = vizinhoAntes
                postag1 =  tipoPostaggerTokens(vizinhoAntes, dicPostagger)
                cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
                features.extend([
                    '-'+str(num+1)+':word.lower=' + word1.lower(),
                    '-'+str(num+1)+':word.istitle=%s' % word1.istitle(),
                    '-'+str(num+1)+':word.isupper=%s' % word1.isupper(),
                    '-'+str(num+1)+':postag=' + postag1,
                    '-'+str(num+1)+':word.cluster='+ cluster
                ])
        else:
            features.append('BOS')

        # próximas palavras
        vizinhosDepois = sent[2]
        if len(vizinhosDepois)>0:
            for num, vizinhoDepois in enumerate(vizinhosDepois):
                word1 = vizinhoDepois
                postag1 =  tipoPostaggerTokens(vizinhoDepois, dicPostagger)
                cluster = word2cluster[word1.lower()] if word1.lower() in word2cluster else "0"
                features.extend([
                    '+'+str(num+1)+':word.lower=' + word1.lower(),
                    '+'+str(num+1)+':word.istitle=%s' % word1.istitle(),
                    '+'+str(num+1)+':word.isupper=%s' % word1.isupper(),
                    '+'+str(num+1)+':postag=' + postag1,
                    '+'+str(num+1)+':word.cluster='+ cluster
                ])
        else:
            features.append('EOS')
    except:
        print('sent:', sent)
        raise
    return features


def sent2features(lista, word2cluster, dicPostagger):
    #return word2features(lista, word2cluster, dicPostagger)
    return [word2features(lista[i], word2cluster, dicPostagger) for i in range(len(lista))]

def sent2labels(lista):
    #return [label for _, _, _, label in lista]
    return [str(label) for _, _, _, label in lista]


#def sent2tokens(sent):
#    return [token for token, postag, label in sent]

word2cluster = read_clusters(r"clusters/cluster-50.tsv")
#word2cluster = read_clusters(r"clusters/cluster-5.tsv")
#word2cluster = read_clusters(r"clusters/cluster-10.tsv")
#word2cluster = read_clusters(r"clusters/cluster-100.tsv")
#word2cluster = read_clusters(r"clusters/cluster-300.tsv")

In [191]:
X_train = [sent2features(s, word2cluster, dicPostagger) for s in listaTrain]
y_train = [sent2labels(s) for s in listaTrain]
X_train[0]

[['bias',
  'word.lower=dispneia',
  'word[-3:]=eia',
  'word[:3]=Dis',
  'word.isupper=False',
  'word.istitle=True',
  'word.isdigit=False',
  'word.cluster=25',
  'postag=N',
  'BOS',
  '+1:word.lower=importante',
  '+1:word.istitle=False',
  '+1:word.isupper=False',
  '+1:postag=ADJ',
  '+1:word.cluster=26',
  '+2:word.lower=aos',
  '+2:word.istitle=False',
  '+2:word.isupper=False',
  '+2:postag=ART',
  '+2:word.cluster=43',
  '+3:word.lower=esforços',
  '+3:word.istitle=False',
  '+3:word.isupper=False',
  '+3:postag=N',
  '+3:word.cluster=25',
  '+4:word.lower=+',
  '+4:word.istitle=False',
  '+4:word.isupper=False',
  '+4:postag=N',
  '+4:word.cluster=17'],
 ['bias',
  'word.lower=importante',
  'word[-3:]=nte',
  'word[:3]=imp',
  'word.isupper=False',
  'word.istitle=False',
  'word.isdigit=False',
  'word.cluster=26',
  'postag=ADJ',
  '-1:word.lower=dispneia',
  '-1:word.istitle=True',
  '-1:word.isupper=False',
  '-1:postag=N',
  '-1:word.cluster=25',
  '+1:word.lower=aos'

In [192]:
y_train[:2]

[['0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0'],
 ['1',
  '0',
  '1',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0']]

In [193]:
X_dev = [sent2features(s, word2cluster, dicPostagger) for s in listaDev]
y_dev = [sent2labels(s) for s in listaDev]

In [194]:
X_test = [sent2features(s, word2cluster, dicPostagger) for s in listaTest]
y_test = [sent2labels(s) for s in listaTest]

In [197]:
print(len(listaTrain))
print(len(X_train))
print(len(y_train))

1319
1319
1319
