In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from collections import defaultdict
import pandas as pd

import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel  

import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
snomed_corpus = "../symptemist-train_all_subtasks+gazetteer+multilingual+test_all_subtasks+bg_231006/symptemist_gazetteer/symptemist_gazetter_snomed_ES_v2.tsv"
snomed = pd.read_csv(snomed_corpus,sep="\t")


snomed_code_dictionary = dict()

for row in snomed.iterrows():
    if row[1]['term'].lower() in snomed_code_dictionary.keys():
        snomed_code_dictionary[row[1]['term'].lower()] = snomed_code_dictionary[row[1]['term'].lower()]+ "+"+ str(row[1]['code'])
    else:
        snomed_code_dictionary[row[1]['term'].lower()] = str(row[1]['code'])


In [43]:
data_file = "../symptemist-train_all_subtasks+gazetteer+multilingual+test_all_subtasks+bg_231006/symptemist_train/subtask2-linking/symptemist_tsv_train_subtask2.tsv"
#for 3
# data_file = "../symptemist-train_all_subtasks+gazetteer+multilingual+test_all_subtasks+bg_231006/symptemist_multilingual-silver-standard/train/en/symptemist_train_en.tsv"

data = pd.read_csv(data_file,sep="\t")
data = data.sample(frac=1, random_state=42)

val_data = data[int(-len(data)*0.5):]
data = data[:int(len(data)*0.5)]

data_dictionary = dict()
for row in data.iterrows():
    if row[1]['text'].lower() in data_dictionary.keys():
        data_dictionary[row[1]['text'].lower()] = data_dictionary[row[1]['text'].lower()]+ "+"+ str(row[1]['code'])
    else:
        data_dictionary[row[1]['text'].lower()] = str(row[1]['code'])



In [44]:
# data_files = defaultdict()

# for file in data['filename'].unique():
#     with open(f"../symptemist-train_all_subtasks+gazetteer+multilingual+test_task1_230929/symptemist_train/subtask1-ner/txt/{file}.txt", "r") as f:
#         data_files[file] = f.read()

# val_data_text = []
# for sample in val_data.iterrows():
#     val_data_text.append(data_files[sample[1]['filename']][sample[1]['span_ini']-100:sample[1]['span_end']+100:])

# # span_ini	span_end



In [45]:
# checkpoint ="cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR-large"
checkpoint ="cambridgeltl/SapBERT-UMLS-2020AB-all-lang-from-XLMR"
# checkpoint = "Blaxzter/LaBSE-sentence-embeddings"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)  
model = AutoModel.from_pretrained(checkpoint).cuda()

In [46]:
def get_embeddings(text, batch_size):

    embeddings = []
    for i in tqdm(np.arange(0, len(text), batch_size)):
        tokens = tokenizer.batch_encode_plus(text[i:i+batch_size], 
                                           padding="max_length", 
                                           max_length=25, 
                                           truncation=True,
                                           return_tensors="pt")
        toks_cuda = {}
        for k,v in tokens.items():
            toks_cuda[k] = v.cuda()
        # cls_rep = model(**toks_cuda)[0][:,0,:] # use CLS representation as the embedding
        cls_rep = model(**toks_cuda)[0].mean(axis=1)
        # print(cls_rep.shape)
        # break
        # [:,0,:] # use CLS representation as the embedding

        embeddings.append(cls_rep.cpu().detach().numpy())
    
    embeddings = np.concatenate(embeddings, axis=0)
    return embeddings

In [47]:
batch_size = 64
snomed_text = list(snomed_code_dictionary.keys())
val_data_text = list(val_data['text'].str.lower())

snomed_embeddings = get_embeddings(snomed_text, batch_size)
val_embeddings = get_embeddings(val_data_text, batch_size)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2575/2575 [02:09<00:00, 19.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:01<00:00, 19.98it/s]


In [48]:
def get_norm_emb(emb):
    return emb/np.linalg.norm(emb,ord=2, axis=-1, keepdims=True)

def get_scores(corpus_embeddings, val_embeddings):
    lookup_codes_emb_norm = get_norm_emb(corpus_embeddings).T
        
    val_embed_norm = get_norm_emb(val_embeddings) 
    scores = val_embed_norm @ lookup_codes_emb_norm
    return scores


codes = list(snomed_code_dictionary.values())
scores = get_scores(snomed_embeddings,val_embeddings)




In [55]:
list_of_codes_per_sample = []

for text, score, index in zip(val_data_text, scores, np.argmax(scores, axis=-1)):
    if text in data_dictionary.keys():
        list_of_codes_per_sample.append(str(data_dictionary[text]))
    elif text in snomed_code_dictionary.keys():
        list_of_codes_per_sample.append(str(snomed_code_dictionary[text]))
    else:
        if score[index]>0.5:
            list_of_codes_per_sample.append(codes[index])
        else:
            list_of_codes_per_sample.append(-1)
        
    
correct = 0
for i,j,k in zip(list_of_codes_per_sample, val_data['code'], val_data_text):
    if i == -1:
        i = 'NO_CODE'
    for code in i.split("+"):
        if code == j:
            correct += 1
            break
    # else:
    #     try:
    #         print(list(snomed[snomed['code']==int(i)]['term']), list(snomed[snomed['code']==int(j)]['term']), k)
    #     except:
    #         print(i,j)
print(correct/len(list_of_codes_per_sample))

0.5614236509758898


In [10]:
# seed evaluation:
#42: 91.95
#100: 92.53
#1000 91.37
#109560 91.52
#20% none 88.2

# 40 validation - 90.23
#80 59.16
#50 90.66
#50 large -> 90.8

In [11]:


for threshhold in [0.50,0.55, 0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95]:
    list_of_codes_per_sample = []
    for index, score in zip(np.argmax(scores, axis=-1), scores):
        if score[index]>threshhold:
            list_of_codes_per_sample.append(codes[index])
        else:
            list_of_codes_per_sample.append(-1)
    
    correct = 0
    for i,j in zip(list_of_codes_per_sample, val_data['code']):
        if i == -1:
            i = 'NO_CODE'
        if i == j:
            correct += 1
    print(threshhold, correct/len(list_of_codes_per_sample))

0.5 0.4410919540229885
0.55 0.4425287356321839
0.6 0.4410919540229885
0.65 0.4425287356321839
0.7 0.4410919540229885
0.75 0.4339080459770115
0.8 0.4267241379310345
0.85 0.39798850574712646
0.9 0.35201149425287354
0.95 0.3045977011494253


In [12]:
for threshhold in[0.50,0.55, 0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,0.99,0.98, 1.0]:
    list_of_codes_per_sample = []
    confidence_counter =0
    for index, score in zip(np.argmax(scores, axis=-1), scores):
        if score[index]>threshhold:
            list_of_codes_per_sample.append(codes[index])
        else:
            list_of_codes_per_sample.append(-1)
    
    correct = 0
    for i,j,k in zip(list_of_codes_per_sample, val_data['code'], val_data_text):
        if i != -1:
            confidence_counter+=1
            if i == j:
                correct += 1
            # else:
                # try:
                #     print(list(snomed[snomed['code']==int(i)]['term']), list(snomed[snomed['code']==int(j)]['term']), k)
                # except:
                #     print(i,j)
    print(threshhold, correct/confidence_counter, confidence_counter)

0.5 0.4410919540229885 696
0.55 0.4442836468885673 691
0.6 0.4473684210526316 684
0.65 0.4686064318529862 653
0.7 0.49917627677100496 607
0.75 0.5471349353049908 541
0.8 0.6247288503253796 461
0.85 0.7223719676549866 371
0.9 0.7892976588628763 299
0.95 0.8218623481781376 247
0.99 0.8430232558139535 172
0.98 0.8449197860962567 187
1.0 0.7592592592592593 54


In [13]:
list(snomed[snomed['code']==int(j)].term)

['mala oclusión dental']

In [42]:
for threshhold in [0.2, 0.15, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0001]:
    list_of_codes_per_sample = []
    confidence_counter = 0

    
    for score in scores:
        top_index, second_index = np.argsort(score)[::-1][:2]
        
        if (score[top_index] - score[second_index])> threshhold:
            list_of_codes_per_sample.append(codes[top_index])
        else:
            list_of_codes_per_sample.append(-1)
    
    correct = 0
    for i,j in zip(list_of_codes_per_sample, val_data['code']):
        if i != -1:
            confidence_counter+=1
        if i == j:
            correct += 1
    print(threshhold, correct/len(list_of_codes_per_sample))


# tmp

0.2 0.011494252873563218
0.15 0.08189655172413793
0.1 0.17672413793103448
0.05 0.3132183908045977
0.01 0.4425287356321839
0.005 0.46264367816091956
0.001 0.4755747126436782
0.0001 0.47701149425287354


In [15]:
np.max(scores, axis =0)

array([0.86541665, 0.8390726 , 0.50508165, ..., 0.5093457 , 0.5625357 ,
       0.6734853 ], dtype=float32)

In [16]:
scores.sort()

In [17]:
snomed.dtype()

AttributeError: 'DataFrame' object has no attribute 'dtype'

In [None]:
last_score = []
for sample in scores:
    last_score.append(sample[-1]-sample[-2])



In [None]:
last_score

In [None]:
import seaborn as sns

In [None]:
sns.histplot(last_score)

In [None]:
last_score

In [None]:
!ps

In [None]:
#0.43 - large
# 0.39 - base 

In [None]:
    
run_data["code"] = list_of_codes_per_sample

basename_run_file = os.path.basename(run_file)

run_data.to_csv(os.path.join(output_folder,basename_run_file), sep="\t", index=False)
    

In [None]:
counter = 0
for text_sample,code in zip(data['text'], data['code']):
    if text_sample.lower() in snomed_code_dictionary.keys():
        
        if str(code) != str(snomed_code_dictionary[text_sample.lower()]):
            # print(text_sample)
            print(text_sample, code, snomed_code_dictionary[text_sample.lower()])
            counter+=1

counter

In [None]:

print(snomed[snomed['code']==29740003])
print(snomed[snomed['code']==88275004])

difference between the train and actual codes is 80...

In [None]:
for i in snomed.iterrows():
    print(i[1]['term'])
    break

In [None]:
len(snomed['code'].unique())

In [None]:
val_data