In [1]:
import os

# Restrict this process to GPU 2. The variable must be spelled CUDA_VISIBLE_DEVICES
# (plural); the singular spelling is not a recognized variable, so the restriction
# was silently ignored. It has to be set before any CUDA-using library is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

# Preparation

`xmen dict conf/xmen.yaml --code src/gazetteer.py`
(extend the SympTEMIST gazetteer with UMLS aliases and save as jsonl file)

`xmen index conf/xmen.yaml --all`
(compute TF-IDF and SapBERT indices)

In [63]:
def count_ents(ds):
    """Print the total number of entities across all documents of ``ds``.

    ``ds['entities']`` is iterated as a sequence of per-document entity
    collections; the function prints the overall count and returns ``None``.
    """
    total = 0
    for doc_entities in ds['entities']:
        for _entity in doc_entities:
            total += 1
    print(total)

In [2]:
from datasets import load_dataset, DatasetDict, load_from_disk
from pathlib import Path
from xmen.evaluation import *

In [3]:
# Cache directory holding the SympTEMIST artifacts used below (KB file and index folder).
base_path = Path.home().joinpath('.cache', 'xmen', 'symptemist')

In [4]:
# Load the SympTEMIST linking configuration through a local checkout of the BigBIO
# loader script (the path is relative to the notebook's working directory).
symptemist_data = load_dataset(
    path="../../biomedical/bigbio/hub/hub_repos/symptemist/symptemist.py", 
    name="symptemist_linking_bigbio_kb"
)
# Keep handles on the two official splits; a validation split is carved out of
# the training split further below.
train_data = symptemist_data['train']
test_data = symptemist_data['test']

Found cached dataset symptemist (/home/Florian.Borchert/.cache/huggingface/datasets/symptemist/symptemist_linking_bigbio_kb/2.2.0/3dbf79e4e7b75f1bb9c18feedb52bbb78832208c888945f61f41b6ed603cf32c)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# The validation document IDs were sampled once (20% of the training documents,
# seed 42) with the commented-out code below and persisted to disk, so every run
# of the notebook uses exactly the same split.
#import random
#random.seed(42)

#doc_ids = symptemist_data['train']['document_id']
#n_valid = int(0.2 * len(doc_ids))
#n_valid
#valid_doc_ids = random.sample(doc_ids, n_valid)
#with open('../data/subtrack2_valid_docids.txt', 'w') as fh:
#     for v in valid_doc_ids:
#        fh.write(v + '\n')

# Read the persisted IDs (one per line). The context manager closes the file
# handle right away instead of leaving it to the garbage collector.
with open('../data/subtrack2_valid_docids.txt', 'r', encoding='utf-8') as fh:
    valid_doc_ids = [doc_id.strip() for doc_id in fh]
len(valid_doc_ids)

60

In [6]:
# Split the official training data into train/validation by the fixed list of
# validation document IDs; the official test split is passed through unchanged.
dataset = DatasetDict({
    'train': train_data.filter(lambda doc: doc['document_id'] not in valid_doc_ids),
    'validation': train_data.filter(lambda doc: doc['document_id'] in valid_doc_ids),
    'test': test_data,
})
dataset

Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/symptemist/symptemist_linking_bigbio_kb/2.2.0/3dbf79e4e7b75f1bb9c18feedb52bbb78832208c888945f61f41b6ed603cf32c/cache-ce59562e02ad8e8e.arrow
Loading cached processed dataset at /home/Florian.Borchert/.cache/huggingface/datasets/symptemist/symptemist_linking_bigbio_kb/2.2.0/3dbf79e4e7b75f1bb9c18feedb52bbb78832208c888945f61f41b6ed603cf32c/cache-fbd11820a1aceef7.arrow


DatasetDict({
    train: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 244
    })
    validation: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 60
    })
    test: Dataset({
        features: ['id', 'document_id', 'passages', 'entities', 'events', 'coreferences', 'relations'],
        num_rows: 247
    })
})

In [7]:
from xmen.data import get_cuis
# Concatenate the codes returned for the train and validation splits.
# NOTE(review): presumably one gold code per annotated entity (duplicates kept) —
# confirm against xmen.data.get_cuis.
cuis = get_cuis(dataset['train']) + get_cuis(dataset['validation'])
len(cuis)

3484

In [8]:
from xmen import load_kb
# Load the knowledge base from the JSONL file in the cache directory
# (written by the `xmen dict` preparation step described at the top).
kb = load_kb(base_path / 'symptemist.jsonl')

In [9]:
# Distinct gold codes that have no entry in the knowledge base
{code for code in cuis if code not in kb.cui_to_entity}

{'NO_CODE'}

In [10]:
# Number of aliases: sum of the sizes of the code collections stored per alias
sum(len(alias_cuis) for alias_cuis in kb.alias_to_cuis.values())

1079623

# Candidate Generation

In [11]:
from xmen.linkers import default_ensemble

In [None]:
# Build the default candidate-generation ensemble from the index folder in the
# cache directory (created by the `xmen index` preparation step).
linker = default_ensemble(base_path / 'index')

In [None]:
# Candidates from the 'ngram' linker alone, predicted for the whole `dataset`
# (the result is indexed by split name below).
candidates_ngram = linker.linkers_fn['ngram']().predict_batch(dataset)

In [None]:
# Evaluate the n-gram candidates at k on the two labelled splits
for heading, split in (('Training Set:', 'train'), ('Validation Set:', 'validation')):
    print(heading)
    _ = evaluate_at_k(dataset[split], candidates_ngram[split])

In [None]:
# Candidates from the 'sapbert' linker alone, predicted in batches of 128.
candidates_sap = linker.linkers_fn['sapbert']().predict_batch(dataset, batch_size=128)

In [None]:
# Evaluate the SapBERT candidates at k on the two labelled splits
for heading, split in (('Training Set:', 'train'), ('Validation Set:', 'validation')):
    print(heading)
    _ = evaluate_at_k(dataset[split], candidates_sap[split])

In [None]:
# Ensemble prediction with up to 64 candidates per mention. The per-linker
# predictions from the cells above are handed in via `reuse_preds`
# (presumably so they are not recomputed — confirm against the xMEN docs).
candidates = linker.predict_batch(
    dataset,
    batch_size=128,
    top_k=64,
    reuse_preds={'sapbert': candidates_sap, 'ngram': candidates_ngram},
)

In [None]:
# Evaluate the ensemble candidates at k on the two labelled splits
for heading, split in (('Training Set:', 'train'), ('Validation Set:', 'validation')):
    print(heading)
    _ = evaluate_at_k(dataset[split], candidates[split])

In [None]:
# Persist the ensemble candidates; the reranking section reloads them from this
# path, so the candidate-generation cells above do not have to be re-run.
candidates.save_to_disk('../data/candidates')

In [None]:
# Display the ensemble candidates object for inspection.
candidates

# Prepare Data for Reranking

In [12]:
from xmen.reranking import CrossEncoderReranker

In [13]:
# Reload the candidates saved at the end of the candidate-generation section.
candidates = load_from_disk('../data/candidates')

In [15]:
# Recall@k of the reloaded candidates on the validation split
# (the return value is discarded; the metrics appear in the cell output).
_ = evaluate_at_k(dataset['validation'], candidates['validation'])

Recall@1 0.4217335058214748
Recall@2 0.5433376455368694
Recall@4 0.6261319534282018
Recall@8 0.6817593790426908
Recall@16 0.7309184993531694
Recall@32 0.7684346701164295
Recall@64 0.8020698576972833


In [17]:
# Prepare the cross-encoder data from the candidates, the gold-annotated dataset
# and the knowledge base; the result is indexed by split name below.
ce_dataset = CrossEncoderReranker.prepare_data(candidates, dataset, kb)

Context length: 128
Use NIL values: True


  0%|          | 0/2711 [00:00<?, ?it/s]

  0%|          | 0/2711 [00:00<?, ?it/s]

  0%|          | 0/2711 [00:00<?, ?it/s]

  0%|          | 0/773 [00:00<?, ?it/s]

  0%|          | 0/773 [00:00<?, ?it/s]

  0%|          | 0/773 [00:00<?, ?it/s]

  0%|          | 0/3104 [00:00<?, ?it/s]

  0%|          | 0/3104 [00:00<?, ?it/s]

  0%|          | 0/3104 [00:00<?, ?it/s]

# Reranker Training

In [None]:
from xmen.reranking.cross_encoder import CrossEncoderTrainingArgs
# Training configuration: 20 epochs, starting from the PlanTL-GOB-ES
# biomedical-clinical Spanish RoBERTa checkpoint.
args = CrossEncoderTrainingArgs(num_train_epochs=20, model_name='PlanTL-GOB-ES/roberta-base-biomedical-clinical-es')

In [None]:
import datetime
# Each training run gets its own output folder named after the start time
# (format YYYYmmdd-HHMMSS), relative to the notebook's working directory.
output_dir = Path('output') / 'cross_encoder' / datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
import wandb

# Start a Weights & Biases run in the "symptemist" project.
wandb.init(project="symptemist")

try:
    rr = CrossEncoderReranker()
    # Train on the train split and evaluate on the validation split;
    # wandb.log is passed as the evaluation callback.
    rr.fit(args, ce_dataset['train'].dataset, ce_dataset['validation'].dataset, show_progress_bar=False, eval_callback=wandb.log, output_dir=output_dir)
finally:
    # Close the W&B run even if training raised, provided a run is active.
    if run := wandb.run:
        run.finish()

# Prediction on Validation / Test Set

In [18]:
# Load a trained reranker from a specific, hard-coded training run folder;
# update the timestamp when a different run should be evaluated.
# NOTE(review): device=0 refers to the first *visible* GPU — confirm together
# with the GPU selection in the first cell.
rr = CrossEncoderReranker.load('output/cross_encoder/20230920-134756/', device=0)

In [19]:
# Rerank the validation candidates with the loaded cross-encoder and persist the result.
pred_validation = rr.rerank_batch(candidates['validation'], ce_dataset['validation'])
pred_validation.save_to_disk('../data/pred_validation')

Batches:   0%|          | 0/773 [00:00<?, ?it/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/60 [00:00<?, ? examples/s]

In [21]:
# Evaluate the reranked validation predictions against the gold annotations.
evaluate(dataset['validation'], pred_validation)

{'strict': {'precision': 0.69558599695586,
  'recall': 0.5912031047865459,
  'fscore': 0.6391608391608392,
  'ptp': 457,
  'fp': 200,
  'rtp': 457,
  'fn': 316,
  'n_docs_system': 60,
  'n_annos_system': 657,
  'n_docs_gold': 60,
  'n_annos_gold': 773}}

In [20]:
# Same reranking with allow_nil=False, persisted separately.
# NOTE(review): presumably this keeps a prediction for every mention instead of
# letting the reranker abstain — confirm against CrossEncoderReranker.rerank_batch.
pred_validation_no_nil = rr.rerank_batch(candidates['validation'], ce_dataset['validation'], allow_nil=False)
pred_validation_no_nil.save_to_disk('../data/pred_validation_no_nil')

Batches:   0%|          | 0/773 [00:00<?, ?it/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/60 [00:00<?, ? examples/s]

In [22]:
# Evaluate the allow_nil=False validation predictions against the gold annotations.
evaluate(dataset['validation'], pred_validation_no_nil)

{'strict': {'precision': 0.6093143596377749,
  'recall': 0.6093143596377749,
  'fscore': 0.6093143596377749,
  'ptp': 471,
  'fp': 302,
  'rtp': 471,
  'fn': 302,
  'n_docs_system': 60,
  'n_annos_system': 773,
  'n_docs_gold': 60,
  'n_annos_gold': 773}}

In [23]:
# Rerank the test candidates and persist the result.
pred_test = rr.rerank_batch(candidates['test'], ce_dataset['test'])
pred_test.save_to_disk('../data/pred_test')

Batches:   0%|          | 0/3104 [00:00<?, ?it/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/247 [00:00<?, ? examples/s]

In [24]:
# Test-set reranking with allow_nil=False, persisted separately.
pred_test_no_nil = rr.rerank_batch(candidates['test'], ce_dataset['test'], allow_nil=False)
pred_test_no_nil.save_to_disk('../data/pred_test_no_nil')

Batches:   0%|          | 0/3104 [00:00<?, ?it/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/247 [00:00<?, ? examples/s]

# Post-Processing

In [25]:
# Build an exact-match lookup table (mention text -> gold code) from the training split.
# pandas is imported here explicitly: the notebook's only other pandas import
# sits in a later cell, so this cell must not rely on it.
import pandas as pd

# One record per training entity: whitespace-joined mention text and the db_id
# of its first normalization.
str2cui = pd.DataFrame([
    {'text': ' '.join(e['text']), 'norm': e['normalized'][0]['db_id']}
    for d in dataset['train']['entities']
    for e in d
])
# Distinct codes per mention text (inspected for uniqueness in the next cell).
str2norm = str2cui.groupby('text').agg(set).norm

# All codes per mention text, duplicates included.
str2cui_list = str2cui.groupby('text').agg(list).norm

# One code per mention text. A dead assignment to `lookup` that was overwritten
# immediately has been removed. sorted() makes the pick deterministic should a
# text ever map to several codes (plain set iteration order is arbitrary).
lookup = str2norm.map(lambda s: sorted(s)[0])

lookup

text
5HIAA en orina de 24 horas estaba dentro de los parámetros normales    171250001
A nivel analítico no presentaba alteración                             166315009
ALT y AST mayores de 20 veces el valor normal                          707724006
ALT y AST menos de 3 veces el valor normal                             166642001
AMA (Anticuerpos antimitocondriales) negativos                         310293008
                                                                         ...    
éxitus                                                                 419099009
íleon por engrosamiento parietal                                       312895004
íleon terminal una mucosa extremadamente irregular                     312895004
óbito                                                                  419099009
β-HCG normal                                                            33809001
Name: norm, Length: 1985, dtype: object

In [26]:
# Sanity check: select mention texts whose set of codes does not contain exactly
# one element. An empty result means the text -> code mapping is unique.
str2norm[str2norm.map(len) != 1]

Series([], Name: norm, dtype: object)

In [27]:
def transform_lookup(sample):
    """Prepend exact-match lookup predictions to the entities of one document.

    For every entity whose whitespace-joined mention text occurs in the index of
    the module-level ``lookup`` series, an entry with score 1.0 and
    ``predicted_by=['lookup']`` is inserted in front of the entity's existing
    ``normalized`` entries.

    Args:
        sample: document dict with an ``'entities'`` list; each entity provides
            ``'text'`` (list of strings) and ``'normalized'`` (list of dicts).

    Returns:
        ``{'entities': [...]}`` made of new entity dicts and new ``normalized``
        lists. The input ``sample`` is not modified (the earlier shallow copy
        still mutated the caller's nested lists in place).
    """
    entities = []
    for e in sample['entities']:
        t = ' '.join(e['text'])
        # Copy so that inserting the lookup entry never touches the caller's list.
        norm = list(e['normalized'])
        if t in lookup.index:
            train_cui = lookup.loc[t]
            entry = {'db_name': 'SNOMED_CT', 'db_id': train_cui, 'score' : 1.0, 'predicted_by' : ['lookup']}
            norm.insert(0, entry)
        # Rebuild the entity dict; 'normalized' keeps its original key position.
        entities.append({**e, 'normalized': norm})
    return { 'entities' : entities }

In [30]:
from xmen.data import filter_and_apply_threshold

def get_all_predictions(pred, pred_no_nil):
    """Build the four prediction variants used for evaluation and submission.

    Both inputs are first mapped through ``transform_lookup``. Each mapped
    dataset is additionally passed through ``filter_and_apply_threshold`` with
    ``k=64`` and ``threshold=0.02``.

    Returns:
        dict with the keys ``'xmen'``, ``'xmen_no_nil'``, ``'xmen_02'`` and
        ``'xmen_no_nil_02'``, in that insertion order.
    """
    with_lookup = pred.map(transform_lookup)
    with_lookup_no_nil = pred_no_nil.map(transform_lookup)

    def _thresholded(ds):
        return filter_and_apply_threshold(ds, k=64, threshold=0.02)

    results = {}
    results['xmen'] = with_lookup
    results['xmen_no_nil'] = with_lookup_no_nil
    results['xmen_02'] = _thresholded(with_lookup)
    results['xmen_no_nil_02'] = _thresholded(with_lookup_no_nil)
    return results

In [32]:
# Compute all four prediction variants for the validation and the test split.
results_valid = get_all_predictions(pred_validation, pred_validation_no_nil)
results_test = get_all_predictions(pred_test, pred_test_no_nil)

Loading cached processed dataset at /mnt/nfs/home/Florian.Borchert/workspace/symptemist_biocreative_2023/data/candidates/validation/cache-be7bfafbf6fb8015.arrow
Loading cached processed dataset at /mnt/nfs/home/Florian.Borchert/workspace/symptemist_biocreative_2023/data/candidates/validation/cache-f703e5ffe7edf423.arrow


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

Map:   0%|          | 0/247 [00:00<?, ? examples/s]

# Create Submission TSV

In [74]:
def write_tsv(folder, input_tsv, name, result):
    """Write the top-ranked prediction per mention as a submission TSV.

    Args:
        folder: output directory; created (with parents) if it does not exist.
        input_tsv: DataFrame with at least the columns ``filename``, ``label``,
            ``span_ini``, ``span_end`` and ``text``; one row per mention.
        name: file name of the TSV inside ``folder``.
        result: iterable of document dicts with ``document_id`` and ``entities``;
            each entity provides ``offsets`` (its first offset pair identifies
            the mention) and ``normalized`` (ranked predictions with ``db_id``).

    Rows whose entity has no prediction are left out of the file. The span
    columns are renamed to ``start_span`` / ``end_span`` in the output.

    Raises:
        AssertionError: if the number of distinct mentions in ``input_tsv`` and
            in ``result`` differ, or a row has no matching entity.
    """
    pred_tsv = input_tsv[['filename', 'label', 'span_ini', 'span_end', 'text']].copy()
    # Index entities by (document, start, end) of their first offset pair.
    ents = {
        (d['document_id'], e['offsets'][0][0], e['offsets'][0][1]): e
        for d in result
        for e in d['entities']
    }

    n_unique_spans = len(pred_tsv.drop_duplicates(subset=['filename', 'span_ini', 'span_end']))
    assert len(ents) == n_unique_spans, (len(ents), n_unique_spans)

    # Collect one code (or None) per row, so the 'code' column always exists,
    # even when no entity has any prediction.
    codes = []
    for filename, span_ini, span_end in zip(pred_tsv['filename'], pred_tsv['span_ini'], pred_tsv['span_end']):
        e_idx = (filename, span_ini, span_end)
        assert e_idx in ents, e_idx
        norms = ents[e_idx]['normalized']
        codes.append(norms[0]['db_id'] if len(norms) > 0 else None)
    output = pred_tsv.assign(code=codes)

    out = Path(folder) / name
    # Make sure the target folder exists before writing.
    out.parent.mkdir(parents=True, exist_ok=True)
    (
        output[output['code'].notna()]
        .rename(columns={'span_ini' : 'start_span', 'span_end' : 'end_span'})
        .to_csv(out, sep='\t', index=False)
    )
    # Report only after the file has actually been written.
    print('Written to:', out)

## Validation Set

In [50]:
import pandas as pd
# Official subtask-2 training annotations (tab-separated).
train_tsv = pd.read_csv(Path('../data/symptemist-train_all_subtasks+gazetteer+multilingual_230919/symptemist_train/subtask2-linking/symptemist_tsv_train_subtask2.tsv'), sep='\t')
# Keep only the rows of the held-out validation documents.
valid_tsv = train_tsv[train_tsv.filename.isin(valid_doc_ids)]
# Save the gold validation annotations with the span columns renamed to
# start_span / end_span, the same names write_tsv uses for its output.
valid_tsv.rename(columns={'span_ini' : 'start_span', 'span_end' : 'end_span'}).to_csv('../data/valid_subtask2.tsv', sep='\t', index=False)

In [51]:
# Number of rows (gold annotations) in the validation TSV.
len(valid_tsv)

773

In [52]:
# For each prediction variant: write the validation TSV named after the variant
# and print its evaluation against the gold validation annotations.
for k, v in results_valid.items():
    write_tsv('submission/validation', valid_tsv, k + '.tsv', v)
    print(evaluate(dataset['validation'], v))

Written to: submission/validation/xmen.tsv
{'strict': {'precision': 0.706766917293233, 'recall': 0.6080206985769728, 'fscore': 0.6536856745479833, 'ptp': 470, 'fp': 195, 'rtp': 470, 'fn': 303, 'n_docs_system': 60, 'n_annos_system': 665, 'n_docs_gold': 60, 'n_annos_gold': 773}}
Written to: submission/validation/xmen_no_nil.tsv
{'strict': {'precision': 0.6261319534282018, 'recall': 0.6261319534282018, 'fscore': 0.6261319534282018, 'ptp': 484, 'fp': 289, 'rtp': 484, 'fn': 289, 'n_docs_system': 60, 'n_annos_system': 773, 'n_docs_gold': 60, 'n_annos_gold': 773}}
Written to: submission/validation/xmen_02.tsv
{'strict': {'precision': 0.8924731182795699, 'recall': 0.536869340232859, 'fscore': 0.6704361873990307, 'ptp': 415, 'fp': 50, 'rtp': 415, 'fn': 358, 'n_docs_system': 60, 'n_annos_system': 465, 'n_docs_gold': 60, 'n_annos_gold': 773}}
Written to: submission/validation/xmen_no_nil_02.tsv
{'strict': {'precision': 0.8919491525423728, 'recall': 0.5446313065976714, 'fscore': 0.6763052208835341

## Test Set

In [75]:
#test_tsv.drop_duplicates(subset=['filename', 'span_ini', 'span_end'])

In [76]:
# Official subtask-2 test mentions (tab-separated); displayed for inspection.
test_tsv = pd.read_csv(Path('../data/symptemist-train_all_subtasks+gazetteer+multilingual+test_all_subtasks+bg_231006/symptemist_test/subtask2-linking/symptemist_tsv_test_subtask2.tsv'), sep='\t')
test_tsv

Unnamed: 0,filename,label,span_ini,span_end,text,code
0,es-S1138-123X2005000200006-2,SINTOMA,227,257,tensión de los tejidos blandos,
1,es-S1138-123X2005000200006-2,SINTOMA,579,606,segmentos estaban alineados,
2,es-S0211-69952011000100019-1,SINTOMA,676,716,normalización de la radiografía de tórax,
3,es-S0211-69952011000100019-1,SINTOMA,390,411,incomodidad abdominal,
4,es-S0211-69952011000100019-1,SINTOMA,413,445,dolor pleurítico en el hemitórax,
...,...,...,...,...,...,...
3099,es-S0212-16112011000300031-1,SINTOMA,1502,1523,estómago de retención,
3100,es-S0212-16112011000300031-1,SINTOMA,1527,1559,alteraciones hidroelectrolíticas,
3101,es-S0212-16112011000300031-1,SINTOMA,2321,2339,hipotonía gástrica,
3102,es-S0212-16112011000300031-1,SINTOMA,255,332,"estudio bioquímico completo, hemograma y funci...",


In [101]:
# Write one submission file per prediction variant; the file name is prefixed
# with the variant's 1-based position in results_test.
for i, (k, v) in enumerate(results_test.items()):
    write_tsv('submission/test/subtask2-linking', test_tsv, f"{i+1}-{k}.tsv", v)

Written to: submission/test/subtask2-linking/1-xmen.tsv
Written to: submission/test/subtask2-linking/2-xmen_no_nil.tsv
Written to: submission/test/subtask2-linking/3-xmen_02.tsv
Written to: submission/test/subtask2-linking/4-xmen_no_nil_02.tsv


In [104]:
# Read the written submission files back in, in the order of results_test.
pxs = [pd.read_csv(f'submission/test/subtask2-linking/{i+1}-{k}.tsv', sep='\t') for i, k in enumerate(results_test.keys())]

In [112]:
# First submission file (variant 'xmen').
pxs[0]

Unnamed: 0,filename,label,start_span,end_span,text,code
0,es-S1138-123X2005000200006-2,SINTOMA,227,257,tensión de los tejidos blandos,416809004
1,es-S0211-69952011000100019-1,SINTOMA,676,716,normalización de la radiografía de tórax,168733007
2,es-S0211-69952011000100019-1,SINTOMA,390,411,incomodidad abdominal,43364001
3,es-S0211-69952011000100019-1,SINTOMA,413,445,dolor pleurítico en el hemitórax,2237002
4,es-S0211-69952011000100019-1,SINTOMA,1576,1584,falleció,419099009
...,...,...,...,...,...,...
2636,es-S0212-16112011000300031-1,SINTOMA,1502,1523,estómago de retención,307227006
2637,es-S0212-16112011000300031-1,SINTOMA,1527,1559,alteraciones hidroelectrolíticas,76314005
2638,es-S0212-16112011000300031-1,SINTOMA,2321,2339,hipotonía gástrica,46218001
2639,es-S0212-16112011000300031-1,SINTOMA,255,332,"estudio bioquímico completo, hemograma y funci...",312397004


In [111]:
# Second submission file (variant 'xmen_no_nil').
pxs[1]

Unnamed: 0,filename,label,start_span,end_span,text,code
0,es-S1138-123X2005000200006-2,SINTOMA,227,257,tensión de los tejidos blandos,416809004
1,es-S1138-123X2005000200006-2,SINTOMA,579,606,segmentos estaban alineados,251256007
2,es-S0211-69952011000100019-1,SINTOMA,676,716,normalización de la radiografía de tórax,168733007
3,es-S0211-69952011000100019-1,SINTOMA,390,411,incomodidad abdominal,43364001
4,es-S0211-69952011000100019-1,SINTOMA,413,445,dolor pleurítico en el hemitórax,2237002
...,...,...,...,...,...,...
3099,es-S0212-16112011000300031-1,SINTOMA,1502,1523,estómago de retención,307227006
3100,es-S0212-16112011000300031-1,SINTOMA,1527,1559,alteraciones hidroelectrolíticas,76314005
3101,es-S0212-16112011000300031-1,SINTOMA,2321,2339,hipotonía gástrica,46218001
3102,es-S0212-16112011000300031-1,SINTOMA,255,332,"estudio bioquímico completo, hemograma y funci...",312397004


In [134]:
!cd submission/test; zip -r ../symptemist_HPI-DHC_subtask2_20231009.zip . -x "**/.*" -x ".*"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: subtask2-linking/ (stored 0%)
  adding: subtask2-linking/1-xmen.tsv (deflated 73%)
  adding: subtask2-linking/2-xmen_no_nil.tsv (deflated 72%)
  adding: subtask2-linking/3-xmen_02.tsv (deflated 75%)
  adding: subtask2-linking/4-xmen_no_nil_02.tsv (deflated 75%)
  adding: README.txt (deflated 45%)


In [115]:
# Load a subtask-1 (NER) prediction file for comparison with the linking input.
# NOTE(review): the variable is called train_preds but is compared against
# test_tsv below — confirm which split this file covers.
train_preds = pd.read_csv('../data/subtask1-ner/1-1e5_best_withcontext.tsv', sep='\t')

In [123]:
# Preview the first rows of the NER predictions.
train_preds.head(20)

Unnamed: 0,filename,label,start_span,end_span,text
0,es-S0365-66912012000500003-1,SINTOMA,133,145,dolor ocular
1,es-S0365-66912012000500003-1,SINTOMA,148,202,disminución progresiva de agudeza visual en am...
2,es-S0365-66912012000500003-1,SINTOMA,598,640,hemorragia en astilla peripapilar inferior
3,es-S0365-66912012000500003-1,SINTOMA,642,688,pliegues retinocoroideos temporales superiores
4,es-S0365-66912012000500003-1,SINTOMA,691,709,exudados lipídicos
5,es-S0365-66912012000500003-1,SINTOMA,718,733,retina aplicada
6,es-S0365-66912012000500003-1,SINTOMA,790,819,engrosamiento escleral difuso
7,es-S0365-66912012000500003-1,SINTOMA,830,859,fluido en la cápsula de Tenon
8,es-S0365-66912012000500003-1,SINTOMA,885,899,signo «de la T
9,es-S0365-66912012000500003-1,SINTOMA,1693,1705,dolor ocular


In [129]:
# Predicted mentions whose text contains ' y ' (Spanish "and").
train_preds[train_preds.text.str.contains(' y ')]

Unnamed: 0,filename,label,start_span,end_span,text
23,es-S0210-48062009000600012-1,SINTOMA,3129,3181,"captación a nivel presacro, subpleural (D8) y ..."
31,es-S1698-44472004000400012-1,SINTOMA,1046,1141,hemograma y el estudio de coagulación postoper...
44,es-S0213-12852005000400003-1,SINTOMA,586,639,inflamación de los ganglios cervicales y subma...
51,es-S2254-28842014000200009-1,SINTOMA,604,650,Menstruación muy duradera (8 días) y abundante
53,es-S2254-28842014000200009-1,SINTOMA,672,696,palidez cutánea y mucosa
...,...,...,...,...,...
2757,es-S0365-66912008000700010-1,SINTOMA,335,371,pupilas isocóricas y normorreactivas
2758,es-S0365-66912008000700010-1,SINTOMA,501,561,papila del OD aparecía hiperémica y ligerament...
2765,es-S1130-63432014000400016-1,SINTOMA,239,327,"exantema maculopapuloeritematoso, evanescente ..."
2773,es-S1130-63432014000400016-1,SINTOMA,2830,2911,manifestaciones articulares en codos y carpos ...


In [131]:
# Document selected for the side-by-side comparison in the next two cells.
fname = 'es-S0210-48062009000600012-1'

In [132]:
# NER-predicted mentions of the selected document, ordered by start offset.
train_preds.query('filename == @fname').sort_values('start_span')

Unnamed: 0,filename,label,start_span,end_span,text
16,es-S0210-48062009000600012-1,SINTOMA,391,430,controles de test del aliento negativos
17,es-S0210-48062009000600012-1,SINTOMA,535,554,síndrome prostático
18,es-S0210-48062009000600012-1,SINTOMA,936,945,fallecido
19,es-S0210-48062009000600012-1,SINTOMA,1016,1041,sintomatología prostática
20,es-S0210-48062009000600012-1,SINTOMA,1127,1149,próstata firboelástica
21,es-S0210-48062009000600012-1,SINTOMA,1588,1612,tejido prostático normal
22,es-S0210-48062009000600012-1,SINTOMA,2902,2920,masa paravertebral
23,es-S0210-48062009000600012-1,SINTOMA,3129,3181,"captación a nivel presacro, subpleural (D8) y ..."
24,es-S0210-48062009000600012-1,SINTOMA,3197,3228,cariotipo con resultado de 46XY
25,es-S0210-48062009000600012-1,SINTOMA,3938,3950,asintomático


In [133]:
# Mentions of the same document in the subtask-2 test TSV, ordered by start offset.
test_tsv.query('filename == @fname').sort_values('span_ini')

Unnamed: 0,filename,label,span_ini,span_end,text,code
1291,es-S0210-48062009000600012-1,SINTOMA,404,430,test del aliento negativos,
1296,es-S0210-48062009000600012-1,SINTOMA,535,554,síndrome prostático,
1297,es-S0210-48062009000600012-1,SINTOMA,936,945,fallecido,
1288,es-S0210-48062009000600012-1,SINTOMA,1016,1041,sintomatología prostática,
1289,es-S0210-48062009000600012-1,SINTOMA,1127,1149,próstata firboelástica,
1293,es-S0210-48062009000600012-1,SINTOMA,1154,1162,dolorosa,
1298,es-S0210-48062009000600012-1,SINTOMA,1588,1612,tejido prostático normal,
1299,es-S0210-48062009000600012-1,SINTOMA,1640,1654,estroma normal,
1300,es-S0210-48062009000600012-1,SINTOMA,1927,1941,estroma normal,
1294,es-S0210-48062009000600012-1,SINTOMA,2459,2477,traslocación 14-18,
