In [1]:
import pandas as pd
import numpy as np
from numba import jit, njit, prange
import glob
import os

In [2]:
emb_cols = [str(col) for col in range(767+1)]

In [3]:
@njit
def cosine_sim(a, b): 
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
[cosine_sim(np.random.normal(size=(800,)),np.random.normal(size=(800,))) for _ in range(20000)];

In [4]:
@njit(parallel=True)
def return_similarities(embedding, concept_embeddings):
    result = np.zeros((concept_embeddings.shape[0], ))
    for i in prange(concept_embeddings.shape[0]):
        result[i] = cosine_sim(embedding, concept_embeddings[i])
    return result

In [5]:
try:
    import google.colab
    colab = True
    from google.colab import drive
    drive.mount('/content/drive')
    
    emb_files = {
        'bertopic_test': '/content/drive/MyDrive/NLP_files/embeddings/bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_embeddings_biobert.csv',
        'bertopic_train': '/content/drive/MyDrive/NLP_files/embeddings/bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_embeddings_biobert.csv',
        'LDA_test': '/content/drive/MyDrive/NLP_files/embeddings/LDA_test_2022-11-23_17-47-24_embeddings_biobert.csv',
        'LDA_train': '/content/drive/MyDrive/NLP_files/embeddings/LDA_train_2022-11-23_17-47-24_embeddings_biobert.csv'
    }
    
    emb_glob_regex = '/content/drive/MyDrive/NLP_files/concepts_embeddings_*_biobert.csv'
    
    device_name = 'cuda'
    import sys
    sys.path.append('/content/drive/MyDrive/NLP/bert_embeddings')
    print(sys.path[-1])
except ImportError:
    colab = False
    
    emb_files = {
        'bertopic_test': '../results/embeddings/bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_embeddings_biobert.csv',
        'bertopic_train': '../results/embeddings/bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_embeddings_biobert.csv',
        'LDA_test': '../results/embeddings/LDA_test_2022-12-07_15-04-23_embeddings_biobert.csv',
        'LDA_train': '../results/embeddings/LDA_train_2022-12-07_15-04-23_embeddings_biobert.csv'
    }
    emb_glob_regex = '../results/embeddings/concepts_embeddings_*_biobert.csv'
    
    device_name = 'cpu'

In [6]:
concepts_emb_pretrained = pd.concat([pd.read_csv(file) for file in glob.glob(emb_glob_regex)]).reset_index(drop=True)

In [7]:
concepts_emb_pretrained_raw = np.ascontiguousarray(concepts_emb_pretrained[emb_cols].values.astype(np.float32))

In [8]:
concepts_emb_pretrained = concepts_emb_pretrained[['concept_name', 'CUI']]

In [9]:
n_words = 1

In [10]:
def return_embedded_words(row):
    embedding = row[0].astype(np.float32).flatten()
    #cosine_similarities = np.apply_along_axis(lambda x: cosine_sim(embedding, x), axis=1, arr=concepts_emb_pretrained_raw)
    cosine_similarities = return_similarities(embedding, concepts_emb_pretrained_raw)
    argsort_indices = np.argsort(cosine_similarities)
    chosen_concepts_names = set()
    i = 0
    while len(chosen_concepts_names) != n_words: 
        max_indices = argsort_indices[-(n_words+i):]
        chosen_concepts_names = set(concepts_emb_pretrained.iloc[max_indices]['concept_name'])
        i += 1
        
    return concepts_emb_pretrained.iloc[max_indices]['concept_name'].drop_duplicates(), concepts_emb_pretrained.iloc[max_indices]['CUI']

In [11]:
def collect_list(x): 
    result= {}
    result['tagged_words'] = list(x['tagged_word'])
    return pd.Series(result, index=['tagged_words'])

In [12]:
def postprocess_keywords(x):
    result = {}
    result['text_to_annotate'] = ', '.join(x['topic_keyword'])
    result['ncbo_annotations_pairs'] = []
    for topic_keyword, tagged_words in zip(x['topic_keyword'], x['tagged_word']): 
        for tagged_word in tagged_words: 
            result['ncbo_annotations_pairs'].append([topic_keyword, tagged_word])
    return pd.Series(result, index=['text_to_annotate', 'ncbo_annotations_pairs'])

In [13]:
import tqdm
chosen_concepts_names = set()
for name, filename in emb_files.items():
    print(filename)
    keyword_embs = pd.read_csv(filename)
    distict_keywords = keyword_embs.groupby('topic_keyword').apply(lambda x: x[emb_cols].values[0]).reset_index()
    tagged_words = []
    CUIs = []
    similarities = [] 
    for i, row in tqdm.tqdm(distict_keywords.iterrows(), total=len(distict_keywords)):
        tagged_word, CUI = return_embedded_words(row)
        tagged_words.append(tagged_word.values)
        CUIs.append(CUI.values)
        chosen_concepts_names.update(tagged_word.values)
    distict_keywords['tagged_word'] = tagged_words
    distict_keywords['CUI'] = CUIs
    print(len(keyword_embs))
    keyword_embs = keyword_embs.merge(distict_keywords)
    print(len(keyword_embs))
    output_filename = filename.replace('embeddings', 'emb_tagger')
    output_filename = os.path.join(os.path.dirname(output_filename), 'tagged_1_word_' + os.path.basename(output_filename))
    print(output_filename)
    #keyword_embs.groupby('PMID').apply(collect_list).reset_index().to_csv('../results/tagged_bertopic_bert_pretrained.csv', index=False)

    keyword_embs.drop([str(col) for col in list(range(767+1))] + [0],axis=1)\
        .groupby('PMID').apply(postprocess_keywords).reset_index().to_csv(output_filename, index=False)

../results/embeddings/bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_embeddings_biobert.csv


100%|████████████████████████████████████████████████████████████████████████████████| 480/480 [05:21<00:00,  1.49it/s]


8790
8790
../results/emb_tagger\tagged_1_word_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_emb_tagger_biobert.csv
../results/embeddings/bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_embeddings_biobert.csv


100%|████████████████████████████████████████████████████████████████████████████████| 506/506 [05:27<00:00,  1.55it/s]


35130
35130
../results/emb_tagger\tagged_1_word_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_emb_tagger_biobert.csv
../results/embeddings/LDA_test_2022-12-07_15-04-23_embeddings_biobert.csv


100%|████████████████████████████████████████████████████████████████████████████████| 155/155 [01:37<00:00,  1.60it/s]


8790
8790
../results/emb_tagger\tagged_1_word_LDA_test_2022-12-07_15-04-23_emb_tagger_biobert.csv
../results/embeddings/LDA_train_2022-12-07_15-04-23_embeddings_biobert.csv


100%|████████████████████████████████████████████████████████████████████████████████| 265/265 [02:55<00:00,  1.51it/s]


35130
35130
../results/emb_tagger\tagged_1_word_LDA_train_2022-12-07_15-04-23_emb_tagger_biobert.csv


In [17]:
concepts_emb_pretrained = pd.concat([pd.read_csv(file) for file in glob.glob(emb_glob_regex)]).reset_index(drop=True)

In [18]:
pd.DataFrame(chosen_concepts_names, columns=['concept_name'])\
    .merge(concepts_emb_pretrained.drop('CUI', axis=1).drop_duplicates())\
    .to_csv('../results/emb_tagger/used_concepts_embeddings.csv', index=False)

In [19]:
pd.DataFrame(chosen_concepts_names, columns=['concept_name'])\
    .merge(concepts_emb_pretrained.drop('CUI', axis=1).drop_duplicates())

Unnamed: 0,concept_name,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,Geometridae,0.026590,-0.218336,-0.082368,-0.125464,-0.205064,-0.166346,0.290295,0.103192,0.150951,...,-0.398013,0.176718,-0.168824,0.150870,0.042568,0.272103,0.286327,-0.215377,0.032077,0.400268
1,NK cell tolerance induction,0.031189,-0.107167,0.055883,0.164044,-0.215502,0.102412,-0.240992,0.025580,0.241655,...,-0.249865,0.088483,-0.034737,0.261345,-0.084805,0.123435,0.371982,0.027173,-0.094913,-0.069846
2,adhesive foot cushion,-0.038001,-0.153222,0.044902,0.116850,-0.018270,-0.232527,-0.115886,0.081946,-0.131368,...,-0.163994,0.042573,0.153897,0.064127,-0.205107,0.295644,0.255009,-0.025804,0.168556,0.181749
3,nursing,-0.334731,-0.160043,-0.046998,-0.065790,0.012464,-0.120366,0.113284,0.087155,0.040135,...,-0.262575,0.154686,-0.128484,0.154836,-0.068263,0.218019,0.191168,-0.538231,0.199024,0.244326
4,Acute anterior uveitis,-0.230938,-0.058553,-0.138428,0.220002,0.152350,0.090824,0.146693,0.247187,-0.039041,...,-0.155126,-0.240387,-0.119872,-0.147565,-0.264947,0.052113,0.272022,0.123350,-0.229727,0.016395
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1604,Escherichia fergusonii,0.000133,0.078904,-0.139959,-0.189859,-0.056166,-0.162164,0.048942,0.208812,-0.172592,...,-0.168959,-0.005957,0.009746,0.192924,-0.241182,-0.148843,-0.131791,0.150299,0.046157,-0.130218
1605,bacterium F01,0.019230,-0.070695,-0.020400,-0.008321,-0.174546,-0.019121,0.063023,-0.031484,0.056122,...,0.040141,0.117656,0.127575,0.194082,-0.006614,0.104087,0.005277,-0.137691,0.366202,0.056038
1606,physiology,-0.128424,-0.039474,0.095588,-0.082102,-0.047596,-0.358291,0.248942,0.206391,-0.344330,...,-0.240900,0.299748,-0.272029,0.157699,-0.182038,0.142554,0.158865,-0.443456,0.126494,0.130544
1607,Macular crystals,-0.209252,-0.261402,0.006632,0.075071,-0.007613,0.137105,0.028207,0.122092,-0.131646,...,-0.205228,-0.200365,0.097290,0.144611,-0.206496,0.200568,0.262804,0.285512,-0.147594,0.308372


In [20]:
glob.glob('../results/emb_tagger\\tagged*.csv')

['../results/emb_tagger\\tagged_1_word_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_1_word_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_1_word_LDA_test_2022-12-07_15-04-23_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_1_word_LDA_train_2022-12-07_15-04-23_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_test_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_bertopic_lemmatize_nostopwords_data_2022-11-28_19-41-48_train_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_bertopic_processed_data_2022-11-22_23-14-24_test_emb_tagged_biobert.csv',
 '../results/emb_tagger\\tagged_bertopic_processed_data_2022-11-22_23-14-24_train_emb_tagger_biobert.csv',
 '../results/emb_tagger\\tagged_LDA_test_2022-12-07_15-04-23_emb_tagger_biobert.csv',
 '../results/emb_tagger\\t

In [34]:
pres = pd.read_csv('../results/emb_tagger\\tagged_1_word_LDA_test_2022-12-07_15-04-23_emb_tagger_biobert.csv')[['text_to_annotate','ncbo_annotations_pairs']]\
    .drop_duplicates()
pres['ncbo_annotations_pairs'] = pres['ncbo_annotations_pairs'].apply(eval)

In [49]:
pairs = np.array(sum(pres['ncbo_annotations_pairs'].drop_duplicates().values, []))
pairs = pd.DataFrame(pairs).drop_duplicates()
pairs[pairs[0].str.lower() == pairs[1].str.lower()].sample(20)

Unnamed: 0,0,1
65,resistance,Resistance
256,site,Site
139,body,Body
3,study,Study
54,brain,Brain
157,P,P
114,depression,Depression
217,implant,Implant
64,DNA,DNA
77,pain,Pain


In [48]:
pd.DataFrame([['', ''], ['imaging', 'analysis'], ['neuron', 'Neurites'], ['', '']])

Unnamed: 0,0,1
0,,
1,imaging,analysis
2,neuron,Neurites
3,,


In [33]:
pres['ncbo_annotations_pairs'].drop_duplicates().values

array(["[['risk', 'Simon'], ['95', 'Harvest'], ['patient', 'Soldier'], ['study', 'Study'], ['factor', 'analysis'], ['associated', 'Detective'], ['polymorphism', 'Genetic polymorphism'], ['level', 'education'], ['disease', 'Disease'], ['population', 'Population']]",
       "[['patient', 'Soldier'], ['study', 'Study'], ['disease', 'Disease'], ['response', 'Response'], ['forest', 'Cherokee'], ['plant', 'Rosemary'], ['result', 'Cliff'], ['2', '2+'], ['target', 'MVP'], ['food', 'Simon']]",
       "[['patient', 'Soldier'], ['study', 'Study'], ['associated', 'Detective'], ['data', 'Data'], ['condition', 'condition list'], ['sample', 'Detective'], ['low', 'Middle'], ['disorder', 'physiology'], ['increased', 'Polytechnic'], ['change', 'Distribution']]",
       "[['study', 'Study'], ['muscle', 'Ham'], ['model', 'Models'], ['alarm', 'Floor'], ['group', 'Group'], ['tissue', 'metabolism'], ['exercise', 'nursing'], ['delay', 'commercial'], ['growth', 'physiology'], ['rate', 'education']]",
       "[