In [1]:
import pandas as pd
import numpy as np

In [2]:
from transformers import BertTokenizer, BertForPreTraining, BertModel
import torch
# Placeholder sentence; this module-level name is not read by any cell shown here.
my_sentence = "Whatever your sentence is"
# Tokenizer and baseline encoder both come from the public 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_pretrained = BertModel.from_pretrained('bert-base-uncased')
# NOTE(security): torch.load unpickles the file, which can execute arbitrary code --
# only load checkpoints from a trusted source.
# NOTE(review): presumably this is a whole pickled BERT model (not a state_dict), since it
# is later passed wherever model_pretrained is accepted -- confirm. map_location keeps it on CPU.
model_fine_tuned = torch.load('../models/mybert.pickle', map_location=torch.device('cpu'))

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def return_embeddings(my_sentence, model=model_pretrained):
    """Embed a text using the final hidden state of its first real token.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The vector at sequence position 1 of the last hidden layer is
    returned, i.e. the first token after the leading special token the BERT
    tokenizer inserts.

    NOTE(review): a keyword that is split into several word pieces is
    represented by its first piece only -- confirm this is intended.

    Args:
        my_sentence: Text to embed (a single keyword in this notebook).
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        numpy array of shape ``(1, hidden_size)``.
    """
    encoded = tokenizer(my_sentence, return_tensors="pt")
    # Pure inference: no_grad avoids building an autograd graph for every call.
    with torch.no_grad():
        output = model(**encoded)

    # [0] selects the only sequence in the batch, [1] the first token after the leading special token.
    first_token = output.last_hidden_state[0][1]
    return first_token.detach().numpy().reshape(1, -1)
return_embeddings('Whatever').shape

(1, 768)

In [4]:
def process_extracted_keywords_file(filename):
    """Load a keyword-extraction CSV and return one row per (document, keyword).

    The ``topic_keywords`` column is expected to hold the string form of a list
    of ``(keyword, probability)`` pairs. Each pair becomes its own row, with the
    pair split into ``topic_keyword`` and ``probability`` columns; all other
    columns are repeated for every keyword of the original row, and the original
    row index is kept (so it contains repeats).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        DataFrame without ``topic_keywords`` (and without the ``Unnamed: 0``
        index column, when present), plus ``topic_keyword`` and ``probability``.
    """
    # 'Unnamed: 0' is the index column written by DataFrame.to_csv; tolerate files saved without it.
    df = pd.read_csv(filename).drop(columns='Unnamed: 0', errors='ignore')
    # NOTE(security): eval executes whatever expression the cell contains. Only run this on
    # files produced by this project; consider ast.literal_eval if the cells are plain literals.
    df['topic_keywords'] = df['topic_keywords'].apply(eval)
    df = df.explode('topic_keywords')
    # After explode the index contains repeats, so the new columns are assigned positionally
    # from plain lists instead of going through a row-wise DataFrame.apply.
    pairs = df['topic_keywords'].tolist()
    df['topic_keyword'] = [pair[0] for pair in pairs]
    df['probability'] = [pair[1] for pair in pairs]

    return df.drop(columns='topic_keywords')
# Parse the BERTopic keyword export for the train split into one row per (document, keyword).
bertopic_keywords = process_extracted_keywords_file('../results/bertopic/bertopic_processed_data_2022-11-22_23-14-24_train.csv')

In [5]:
import tqdm

In [6]:
# Preview the first rows of the parsed keyword table.
bertopic_keywords.head()

Unnamed: 0,PMID,topic_number,topic_probs,topic_keyword,probability
0,25763772,52,1.0,pneumonia,0.108995
0,25763772,52,1.0,antimicrobi,0.060756
0,25763772,52,1.0,isol,0.059366
0,25763772,52,1.0,hmkp,0.049176
0,25763772,52,1.0,infect,0.04822


In [7]:
def return_embeddings_for_keywords(keywords, model=model_pretrained):
    """Embed every distinct keyword once with ``model``.

    Args:
        keywords: DataFrame with a ``topic_keyword`` column.
        model: Encoder forwarded to ``return_embeddings`` for every keyword.

    Returns:
        DataFrame with one row per distinct keyword: integer-named columns
        ``0 .. dim-1`` holding the embedding, plus ``topic_keyword``.
    """
    # De-duplicate on the keyword itself so each keyword is embedded exactly once,
    # even if the caller passes a frame with additional columns.
    unique_keywords = keywords['topic_keyword'].drop_duplicates().reset_index(drop=True)
    # Forward the caller's model; previously model_pretrained was hard-coded here,
    # so every call produced pretrained embeddings regardless of the argument.
    vectors = [
        return_embeddings(keyword, model=model)
        for keyword in tqdm.tqdm(unique_keywords)
    ]
    matrix = np.concatenate(vectors, axis=0)
    embeddings = pd.DataFrame(matrix, columns=list(range(matrix.shape[1])))
    # Both sides carry a fresh 0..n-1 index, so this assignment lines up row by row.
    embeddings['topic_keyword'] = unique_keywords

    return embeddings

def add_embeddings_to_keywords(df, model=model_pretrained):
    """Attach the keyword embeddings computed with ``model`` to ``df``.

    Args:
        df: DataFrame with a ``topic_keyword`` column.
        model: Encoder used to embed the keywords.

    Returns:
        ``df`` merged with the embedding columns of each row's keyword.
    """
    keywords_embeddings = return_embeddings_for_keywords(df[['topic_keyword']], model=model)
    # merge() without `on` joins on the column names both frames share; the embedding frame
    # contributes 'topic_keyword' plus integer-named columns.
    return df.merge(keywords_embeddings)

bertopic_keywords = add_embeddings_to_keywords(bertopic_keywords, model=model_pretrained)


100%|████████████████████████████████████████████████████████████████████████████████| 494/494 [00:15<00:00, 31.39it/s]


In [8]:
import os

In [10]:
import glob

In [11]:
# For every model and every keyword file (LDA and BERTopic exports), compute keyword
# embeddings and write them next to each other under ../results/embeddings.
models = {'model_pretrained': model_pretrained, 'model_fine_tuned': model_fine_tuned}
output_dir = '../results/embeddings'
# DataFrame.to_csv does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)
# The input file list does not depend on the model, so collect it once.
keyword_files = glob.glob('../results/LDA/LDA*.csv') + glob.glob('../results/bertopic/bertopic_processed*.csv')
for model_name, model in models.items():
    for file in keyword_files:
        extracted_keywords = process_extracted_keywords_file(file)
        embeddings_and_keywords = add_embeddings_to_keywords(extracted_keywords, model=model)
        # <input stem>_embeddings_<model name>.csv; splitext only touches the real extension.
        stem, extension = os.path.splitext(os.path.basename(file))
        new_file = os.path.join(output_dir, f'{stem}_embeddings_{model_name}{extension}')
        embeddings_and_keywords.to_csv(new_file, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:03<00:00, 28.92it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [00:03<00:00, 28.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 435/435 [00:14<00:00, 30.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 494/494 [00:16<00:00, 30.66it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 91/91 [00:02<00:00, 34.25it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 110/110 [00:03<00:00, 30.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 435/435 [00:13<00:00, 32.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 494/494 [00:16<00:00, 29.50it/s]


In [56]:
def return_embeddings_for_concept(my_sentence, model=model_pretrained):
    """Embed a (possibly multi-word) concept name as the mean of its token vectors.

    The text is tokenized with the module-level ``tokenizer`` and run through
    ``model``. The last-hidden-layer vectors at positions ``1:-1`` (everything
    except the first and last positions, where the BERT tokenizer places its
    special tokens) are averaged.

    Args:
        my_sentence: Concept name to embed.
        model: Encoder whose output exposes ``last_hidden_state``.

    Returns:
        numpy array of shape ``(1, hidden_size)``.
    """
    encoded = tokenizer(my_sentence, return_tensors="pt")
    # Pure inference: no_grad avoids building an autograd graph for every call.
    with torch.no_grad():
        output = model(**encoded)

    # [0] selects the only sequence in the batch; 1:-1 drops the first and last positions.
    inner_tokens = output.last_hidden_state[0][1:-1]
    return inner_tokens.detach().numpy().mean(0).reshape(1, -1)
return_embeddings_for_concept('tetramethyl phenylene diamine oxidase').shape

(1, 768)

In [63]:
def return_embeddings_for_concepts(concepts, model=model_pretrained):
    """Embed each concept name and stack the results into a DataFrame.

    Args:
        concepts: pandas object whose flattened ``.values`` are the concept names.
        model: Encoder passed to ``return_embeddings_for_concept``.

    Returns:
        DataFrame with one row per concept, in input order, and integer-named
        columns ``0 .. dim-1``.
    """
    concept_names = concepts.values.flatten()
    per_concept = [
        return_embeddings_for_concept(name, model)
        for name in tqdm.tqdm(concept_names)
    ]
    stacked = np.concatenate(per_concept)
    column_labels = list(range(per_concept[0].shape[1]))
    return pd.DataFrame(stacked, columns=column_labels)

In [64]:
# Load the id-to-concept mapping; its 'concept_name' column is what gets embedded.
concepts = pd.read_csv('../data/ids_to_concept_mapping.csv')

In [65]:
# Embed every concept name with the pretrained model (one row per concept, in input order).
concepts_emb_pretrained = return_embeddings_for_concepts(concepts['concept_name'],model_pretrained)
# Attach the names; this relies on both frames sharing the same row index.
concepts_emb_pretrained['concept_name'] = concepts['concept_name']
# Display with concept_name first. NOTE(review): the reordered frame is only shown, not
# assigned, so concepts_emb_pretrained itself keeps concept_name as its last column -- confirm.
concepts_emb_pretrained[['concept_name'] + list(range(len(concepts_emb_pretrained.columns)-1))]
#concepts_emb_fine_tuned = return_embeddings_for_concepts(concepts['concept_name'], model_fine_tuned)

100%|████████████████████████████████████████████████████████████████████████████| 17820/17820 [10:38<00:00, 27.90it/s]


Unnamed: 0,concept_name,0,1,2,3,4,5,6,7,8,...,758,759,760,761,762,763,764,765,766,767
0,"1-Methyl-4-phenyl-1,2,3,6-tetrahydropyridine",0.039947,0.140954,0.112286,0.019872,-0.627875,-0.140577,0.265062,-0.083067,0.147241,...,-0.444273,-0.129947,-0.134103,-0.280778,-0.013810,-0.340783,0.322789,-0.312991,0.317635,0.460104
1,1-Methyl-4-phenylpyridinium,0.381213,0.021037,0.062805,-0.061094,-0.553361,-0.300442,0.249189,-0.332862,0.221136,...,-0.361582,-0.275238,-0.059798,0.296815,0.187730,0.084299,0.269819,-0.218715,0.139012,0.597568
2,"3,4-Dihydroxyphenylacetic Acid",0.171835,-0.114181,0.318304,0.050699,-0.267773,-0.190374,0.147208,0.094964,0.007978,...,-0.517286,-0.011329,0.074155,0.210840,-0.050799,-0.191697,0.219117,-0.474767,0.070690,0.468977
3,pyrithione zinc 1 % TOPICAL SHAMPOO,0.241237,0.200548,-0.288182,0.026677,0.260056,0.030032,0.483646,0.125540,0.379671,...,-0.180128,-0.358755,0.051714,0.283720,0.347812,-0.109365,-0.440637,-0.548938,-0.322496,0.481040
4,dalfampridine,-0.058877,-0.604588,-0.206819,-0.186415,0.075803,-0.584705,0.222073,0.177960,-0.130236,...,0.234394,-0.276867,-0.082531,0.517761,0.217361,0.455573,-0.094413,-0.332777,-0.164161,0.802149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17815,Autistic symptoms,-0.551843,-0.047219,0.370944,-0.351795,0.676788,0.112030,0.460379,1.037875,-0.871300,...,0.124645,0.209162,0.314340,0.079829,-0.261706,-0.279989,-0.782161,-0.272435,-0.170435,-0.658430
17816,Small size at birth,-0.649714,-0.323830,-0.414301,-0.658360,0.316826,0.155497,0.037140,0.983196,-0.379999,...,-0.043060,-0.135122,0.691281,0.282643,-0.433693,-0.373392,0.052395,-0.240808,-0.344765,0.143974
17817,Autism or autism spectrum disorder,0.321699,0.568425,-0.042600,-0.629903,0.768433,0.176140,0.331890,1.196353,-1.236493,...,-0.216357,0.000727,0.521696,-0.216283,-0.131417,-0.228413,-0.508352,-0.286844,-0.175067,-0.401804
17818,Childhood epilepsy,-0.249563,-0.016884,-0.459402,-0.227199,0.905198,0.043063,-0.096856,0.748580,-0.148533,...,-0.297823,0.086314,0.337602,-0.252247,0.005456,-0.003812,-0.340069,-0.440074,-0.550789,-0.702810


In [67]:
# Write the pretrained-model concept embeddings to CSV; index=False omits the row index.
concepts_emb_pretrained.to_csv('../results/embeddings/concepts_emb_pretrained_model.csv', index=False)