In [13]:
import pandas as pd
import numpy as np
from numba import jit, njit, prange
import glob
import os

In [14]:
emb_cols = [str(col) for col in range(767+1)]

In [15]:
@njit
def cosine_sim(a, b): 
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
[cosine_sim(np.random.normal(size=(800,)),np.random.normal(size=(800,))) for _ in range(20000)];

In [16]:
@njit(parallel=True)
def return_similarities(embedding, concept_embeddings):
    result = np.zeros((concept_embeddings.shape[0], ))
    for i in prange(concept_embeddings.shape[0]):
        result[i] = cosine_sim(embedding, concept_embeddings[i])
    return result

In [17]:
try:
    import google.colab
    colab = True
    from google.colab import drive
    drive.mount('/content/drive')
    
    emb_files = {
        'bertopic_test': '/content/drive/MyDrive/NLP_files/embeddings/bertopic_processed_data_2022-11-22_23-14-24_test_embeddings_biobert.csv',
        'bertopic_train': '/content/drive/MyDrive/NLP_files/embeddings/bertopic_processed_data_2022-11-22_23-14-24_train_embeddings_biobert.csv',
        'LDA_test': '/content/drive/MyDrive/NLP_files/embeddings/LDA_test_2022-11-23_17-47-24_embeddings_biobert.csv',
        'LDA_train': '/content/drive/MyDrive/NLP_files/embeddings/LDA_train_2022-11-23_17-47-24_embeddings_biobert.csv'
    }
    
    emb_glob_regex = '/content/drive/MyDrive/NLP_files/concepts_embeddings_*_biobert.csv'
    
    device_name = 'cuda'
    sys.path.append('/content/drive/MyDrive/NLP/bert_embeddings')
    print(sys.path[-1])
except ImportError:
    colab = False
    
    emb_files = {
        'bertopic_test': '../results/embeddings/bertopic_processed_data_2022-11-22_23-14-24_test_embeddings_biobert.csv',
        'bertopic_train': '../results/embeddings/bertopic_processed_data_2022-11-22_23-14-24_train_embeddings_biobert.csv',
        'LDA_test': '../results/embeddings/LDA_test_2022-11-23_17-47-24_embeddings_biobert.csv',
        'LDA_train': '../results/embeddings/LDA_train_2022-11-23_17-47-24_embeddings_biobert.csv'
    }
    emb_glob_regex = '../results/embeddings/concepts_embeddings_*_biobert.csv'
    
    device_name = 'cpu'

In [18]:
concepts_emb_pretrained = pd.concat([pd.read_csv(file) for file in glob.glob(emb_glob_regex)])

In [19]:
concepts_emb_pretrained_raw = np.ascontiguousarray(concepts_emb_pretrained[emb_cols].values.astype(np.float32))

In [20]:
concepts_emb_pretrained = concepts_emb_pretrained[['concept_name']]

In [21]:
def return_embedded_word(row):
    embedding = row.values[5:].astype(np.float32).flatten()
    #cosine_similarities = np.apply_along_axis(lambda x: cosine_sim(embedding, x), axis=1, arr=concepts_emb_pretrained_raw)
    cosine_similarities = return_similarities(embedding, concepts_emb_pretrained_raw)
    max_idx = cosine_similarities.argmax()
    return concepts_emb_pretrained.iloc[max_idx]['concept_name'], concepts_emb_pretrained.iloc[max_idx]['CUI']

In [22]:
def collect_list(x): 
    result= {}
    result['tagged_words'] = list(x['tagged_word'])
    return pd.Series(result, index=['tagged_words'])

In [23]:
import tqdm
for name, filename in emb_files.items():
    keyword_embs = pd.read_csv(filename).head(5)
    tagged_words = []
    CUIs = []
    for i, row in tqdm.tqdm(keyword_embs.iterrows(), total=len(keyword_embs)):
        tagged_word, CUI = return_embedded_word(row)
        tagged_words.append(tagged_word)
        CUIs.append(CUI)
    keyword_embs['tagged_word'] = tagged_words
    keyword_embs['CUI'] = CUIs
    output_filename = filename.replace('embeddings', 'emb_tagged')
    output_filename = os.path.join(os.path.dirname(output_filename), 'tagged_' + os.path.basename(output_filename))
    print(output_filename)
    #keyword_embs.groupby('PMID').apply(collect_list).reset_index().to_csv('../results/tagged_bertopic_bert_pretrained.csv', index=False)
    keyword_embs.to_csv(output_filename, index=False)
    break

  0%|                                                                                            | 0/5 [00:01<?, ?it/s]


KeyError: 'CUI'

In [None]:
keyword_embs