In [1]:
import pandas as pd
import numpy as np
import sys
import tqdm
import os
import glob

In [2]:
# Replace any previous checkout of the project with a fresh clone.
# The first command recursively deletes the existing directory before cloning.
# NOTE(review): both commands target /content/drive/MyDrive, but
# drive.mount('/content/drive') is only called in a later cell — confirm that
# Drive is already mounted before running this cell.
!rm -R /content/drive/MyDrive/NLP
!git clone https://github.com/hadaszm/NLP.git /content/drive/MyDrive/NLP

Cloning into '/content/drive/MyDrive/NLP'...
remote: Enumerating objects: 227, done.[K
remote: Counting objects: 100% (186/186), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 227 (delta 92), reused 143 (delta 51), pack-reused 41[K
Receiving objects: 100% (227/227), 120.64 MiB | 21.40 MiB/s, done.
Resolving deltas: 100% (102/102), done.
Checking out files: 100% (27/27), done.


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Environment-specific configuration.
# Only the Colab probe itself sits inside the `try`, so an ImportError raised
# later (e.g. while setting up Drive) is not mistaken for "not running on Colab".
try:
    import google.colab
except ImportError:
    # Local run: relative paths inside the repository, CPU inference.
    colab = False
    fine_tuned_filepath = '../models/mybert.pickle'
    concepts_filepath = '../data/concepts_strings_with_ids.csv'
    embeddings_output_filepath = '../results/embeddings/concepts_embeddings.csv'
    device_name = 'cpu'
else:
    # Colab run: data lives on the mounted Google Drive, GPU inference.
    colab = True
    from google.colab import drive
    drive.mount('/content/drive')

    fine_tuned_filepath = '/content/drive/MyDrive/NLP_files/mybert.pickle'
    concepts_filepath = '/content/drive/MyDrive/NLP_files/concepts_strings_with_ids.csv'
    embeddings_output_filepath = '/content/drive/MyDrive/NLP_files/concepts_embeddings.csv'
    device_name = 'cuda'
    # Make the cloned helper package importable.
    sys.path.append('/content/drive/MyDrive/NLP/bert_embeddings')
    print(sys.path[-1])

# Import only the helper this notebook uses instead of `import *`, so the
# notebook namespace is not silently overwritten by whatever emb_helpers exports.
from emb_helpers import return_embeddings_for_concepts


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import AutoTokenizer, AutoModel

# Load the pretrained BioBERT checkpoint, then move it to the device selected
# in the configuration cell (`device_name`).
biobert_checkpoint = "dmis-lab/biobert-v1.1"
biobert = AutoModel.from_pretrained(biobert_checkpoint)
biobert = biobert.to(device_name)

In [8]:
# Embed every chunked concepts file (`<concepts basename>_<id>.csv`) with BioBERT
# and write one embeddings CSV per chunk. Chunks whose output file already exists
# are skipped, so the cell can be re-run after an interruption.
#
# The pattern requires the `_<id>` suffix: a bare `*.csv` suffix would also match
# `concepts_filepath` itself and produce a bogus `..._ids_biobert.csv` output.
chunk_pattern = os.path.join(
    os.path.dirname(concepts_filepath),
    os.path.basename(concepts_filepath).replace('.csv', '_*.csv'),
)
for file in sorted(glob.glob(chunk_pattern)):
    print(file)
    # The chunk id is whatever follows the last underscore in the file name.
    id_ = file.split('_')[-1].replace('.csv', '')
    output_filename = embeddings_output_filepath.replace('.csv', f'_{id_}_biobert.csv')
    if os.path.exists(output_filename):
        continue
    concepts = pd.read_csv(file)
    concepts_emb = return_embeddings_for_concepts(concepts['best_string'], biobert, device_name)
    # NOTE(review): assumes the helper returns a DataFrame whose index lines up
    # with `concepts` (column assignment aligns on index) — verify in emb_helpers.
    concepts_emb['concept_name'] = concepts['best_string']
    concepts_emb['CUI'] = concepts['CUI']
    # Put the identifier columns first. The result must be assigned back: the
    # original bare expression was discarded and had no effect on the saved file.
    embedding_columns = [c for c in concepts_emb.columns if c not in ('CUI', 'concept_name')]
    concepts_emb = concepts_emb[['CUI', 'concept_name'] + embedding_columns]
    concepts_emb.to_csv(output_filename, index=False)

/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_1.csv


100%|██████████| 100000/100000 [23:01<00:00, 72.37it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_0.csv


100%|██████████| 100000/100000 [22:41<00:00, 73.43it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_6.csv


100%|██████████| 100000/100000 [23:28<00:00, 70.99it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_5.csv


100%|██████████| 100000/100000 [22:44<00:00, 73.28it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_7.csv


100%|██████████| 100000/100000 [22:26<00:00, 74.27it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_4.csv


100%|██████████| 100000/100000 [22:25<00:00, 74.32it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_2.csv


100%|██████████| 100000/100000 [23:21<00:00, 71.37it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_8.csv


100%|██████████| 100000/100000 [22:14<00:00, 74.96it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_3.csv


100%|██████████| 100000/100000 [22:09<00:00, 75.23it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_10.csv


100%|██████████| 73954/73954 [16:22<00:00, 75.30it/s]


/content/drive/MyDrive/NLP_files/concepts_strings_with_ids_9.csv


100%|██████████| 100000/100000 [22:09<00:00, 75.21it/s]


In [9]:
# Embed the ground-truth concepts with BioBERT and save them next to the
# chunked embeddings as `<embeddings basename>_biobert.csv`.
file = '../data/ground_truth_concepts_strings_with_ids.csv'
output_filename = embeddings_output_filepath.replace('.csv', '_biobert.csv')

concepts = pd.read_csv(file)
concepts_emb = return_embeddings_for_concepts(concepts['best_string'], biobert, device_name)
# NOTE(review): assumes the helper returns a DataFrame whose index lines up
# with `concepts` (column assignment aligns on index) — verify in emb_helpers.
concepts_emb['concept_name'] = concepts['best_string']
concepts_emb['CUI'] = concepts['CUI']
# Put the identifier columns first. The result must be assigned back: the
# original bare expression was discarded and had no effect on the saved file.
embedding_columns = [c for c in concepts_emb.columns if c not in ('CUI', 'concept_name')]
concepts_emb = concepts_emb[['CUI', 'concept_name'] + embedding_columns]
print(output_filename)
concepts_emb.to_csv(output_filename, index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 6906/6906 [04:06<00:00, 28.01it/s]


../results/embeddings/concepts_embeddings_biobert.csv


In [14]:
# Distinct CUIs present in the ground-truth embeddings frame; used below to
# filter those concepts out of the chunked embedding files.
gt_cuis = {cui for cui in concepts_emb['CUI']}

In [15]:
# Remove the ground-truth concepts from every chunked embeddings file.
# The unfiltered content is kept once as `old_<name>`; the filtered frame then
# overwrites the original file.
for file in glob.glob('../results/embeddings/concepts_embeddings_*_biobert.csv'):
    saved_embs = pd.read_csv(file)
    backup_file = file.replace('concepts_embeddings', 'old_concepts_embeddings')
    # Write the backup only the first time: on a re-run `file` is already
    # filtered, and overwriting the backup would destroy the pristine copy.
    if not os.path.exists(backup_file):
        saved_embs.to_csv(backup_file, index=False)
    filtered_embs = saved_embs.loc[~saved_embs['CUI'].isin(gt_cuis)]
    filtered_embs.to_csv(file, index=False)

In [17]:
# Store the ground-truth embeddings as an additional chunk (id 12) alongside
# the other per-chunk embedding files.
ground_truth_chunk_path = '../results/embeddings/concepts_embeddings_12_biobert.csv'
concepts_emb.to_csv(ground_truth_chunk_path, index=False)