In [1]:
import ast
import glob
import os
import sys

import numpy as np
import pandas as pd
import tqdm

In [2]:
# Environment setup: pick input/output locations and the torch device name
# depending on whether the notebook runs on Google Colab or locally.
# Colab is detected by attempting to import `google.colab`; any ImportError
# raised inside the `try` body selects the local configuration instead.
try:
    import google.colab
    colab = True
    from google.colab import drive
    # Glob pattern for the NCBO annotation files (expanded with glob.glob below).
    ncbo_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/bertopic_ncbo/*'
    # Directory that receives the embeddings CSV.
    embeddings_output_filepath = '/content/drive/MyDrive/NLP_files/CRAFT/data/embeddings/'
    drive.mount('/content/drive')
    # NOTE(review): assumes the Colab runtime has a GPU attached — confirm.
    device_name = 'cuda'
    # Presumably makes `emb_helpers` importable from Drive — TODO confirm location.
    sys.path.append('/content/drive/MyDrive/NLP/bert_embeddings')
except ImportError:
    colab = False
    ncbo_input = '../../0.RESULTS/*_ncbo/*'
    embeddings_output_filepath = '../../0.RESULTS/embeddings/'
    device_name = 'cpu'
# NOTE(review): star import hides where names come from; `return_embeddings_for_concepts`
# (used below, not defined in this notebook) presumably comes from here — consider
# importing it explicitly once it is confirmed no other names are needed.
from emb_helpers import *


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Make sure the output directory exists before anything is written to it.
# `makedirs` also creates missing parent directories (plain `mkdir` raises
# FileNotFoundError for those), and `exist_ok=True` makes the cell safe to
# re-run while still failing loudly if the path exists but is not a directory.
os.makedirs(embeddings_output_filepath, exist_ok=True)

In [4]:
from transformers import AutoModel, AutoTokenizer

# Load the pretrained BioBERT v1.1 checkpoint, then place the model on the
# device named by `device_name`.
biobert = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")
biobert = biobert.to(device_name)

In [5]:
# Gather every file matched by the input glob and stack them into one frame.
# glob returns matches in arbitrary order, so sort to keep the row order of
# `df` (and the `.iloc[0]` previews below) reproducible across machines.
files = sorted(glob.glob(ncbo_input))
if not files:
    # Without this, pd.concat fails with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No NCBO annotation files matched {ncbo_input!r}")
df = pd.concat([pd.read_csv(file) for file in files])

In [7]:
# Peek at the raw value of the first row: at this point the column holds the
# string form of a list of annotation rows, not a parsed list.
df['ncbo_annotations_pairs'].iloc[0]

"[['MOUSE', 'Rodentia', 'ANCESTOR', 'https://data.bioontology.org/ontologies/CL/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FNCBITaxon_10090/ancestors'], ['DOPAMINE', 'organonitrogen compound', 'ANCESTOR', 'https://data.bioontology.org/ontologies/CL/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FCHEBI_18243/ancestors'], ['BEHAVIOR', 'behavior', 'DIRECT', 'https://data.bioontology.org/ontologies/CL/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGO_0007610'], ['BEHAVIOR', 'multicellular organismal process', 'ANCESTOR', 'https://data.bioontology.org/ontologies/GO/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGO_0007610/ancestors'], ['DOPAMINE', 'ammonium ion derivative', 'ANCESTOR', 'https://data.bioontology.org/ontologies/UBERON/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FCHEBI_59905/ancestors'], ['BEHAVIOR', 'multicellular organismal process', 'ANCESTOR', 'https://data.bioontology.org/ontologies/UBERON/classes/http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FGO_0007610/ancestors

In [8]:
# Preview the flattened form of the first row only: the first two fields of
# every annotation row, concatenated into one flat list.
# - Only the first entry is parsed instead of the whole column.
# - ast.literal_eval parses the list literal without executing arbitrary code
#   read from the input files (the previous `eval` would).
# - If the embedding cell has already overwritten the column with flattened
#   lists, the entry is no longer a string and is shown as-is.
first_entry = df['ncbo_annotations_pairs'].iloc[0]
(
    [term for row in ast.literal_eval(first_entry) for term in row[:2]]
    if isinstance(first_entry, str)
    else first_entry
)

['MOUSE',
 'Rodentia',
 'DOPAMINE',
 'organonitrogen compound',
 'BEHAVIOR',
 'behavior',
 'BEHAVIOR',
 'multicellular organismal process',
 'DOPAMINE',
 'ammonium ion derivative',
 'BEHAVIOR',
 'multicellular organismal process',
 'DOPAMINE',
 'main group molecular entity',
 'RECEPTOR',
 'Receptor',
 'MOUSE',
 'Deuterostomia',
 'LEARNING',
 'biological_process',
 'DOPAMINE',
 'polyatomic cation',
 'BEHAVIOR',
 'behavior',
 'DOPAMINE',
 'phenols',
 'MOUSE',
 'Craniata <chordates>',
 'DOPAMINE',
 's-block molecular entity',
 'DOPAMINE',
 'monoamine',
 'MOUSE',
 'Amniota',
 'TASTE',
 'biological_process',
 'DOPAMINE',
 'molecular entity',
 'MOUSE',
 'Muridae',
 'DOPAMINE',
 'organic aromatic compound',
 'DOPAMINE',
 'organic cyclic compound',
 'MOUSE',
 'Mus <subgenus>',
 'MOUSE',
 'Dipnotetrapodomorpha',
 'DOPAMINE',
 'benzenediols',
 'MOUSE',
 'Boreoeutheria',
 'MOUSE',
 'Euarchontoglires',
 'DOPAMINE',
 'material entity',
 'TASTE',
 'sensory perception of taste',
 'TASTE',
 'nervous s

In [9]:
def _flatten_annotation_pairs(raw):
    """Flatten one `ncbo_annotations_pairs` cell into a flat list of terms.

    Parameters
    ----------
    raw : str or list
        Either the string form of a list of annotation rows (as read from the
        CSV), or a list that an earlier run of this cell already flattened.

    Returns
    -------
    list
        The first two fields of every row, concatenated in row order. A value
        that is not a string is returned unchanged so the cell can be re-run.
    """
    if not isinstance(raw, str):
        return raw
    # literal_eval only accepts Python literals, so file contents are never executed.
    return [term for row in ast.literal_eval(raw) for term in row[:2]]


# Replace the raw strings with flattened term lists (kept in the same column).
df['ncbo_annotations_pairs'] = df['ncbo_annotations_pairs'].apply(_flatten_annotation_pairs)

# Unique terms across all rows. Sorted so the row order of the saved CSV is
# reproducible (plain set iteration order changes between interpreter runs).
# Assumes every term is a string — TODO confirm against the input files.
unique_words = sorted({word for words in df['ncbo_annotations_pairs'] for word in words})
all_words_to_embed = pd.DataFrame(unique_words, columns=['words'])

# NOTE(review): the third argument is presumably the device; pass the same
# device the model was moved to instead of a hard-coded 'cpu' — confirm
# against emb_helpers.
embeddings = return_embeddings_for_concepts(all_words_to_embed['words'], biobert, device_name)
embeddings['words'] = all_words_to_embed['words']

# NOTE(review): a discarded expression here used to build a "words first"
# column order without assigning it; it had no effect, so the CSV keeps the
# embedding columns first and `words` last, exactly as before.
basename = 'ncbo_embeddings.csv'
print(basename)
embeddings.to_csv(os.path.join(embeddings_output_filepath, basename), index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 854/854 [00:31<00:00, 27.25it/s]


ncbo_embeddings.csv
