In [1]:
import pandas as pd
import numpy as np
import sys
import tqdm
import os
import glob
from collections import Counter

In [2]:
# Environment configuration: detect Google Colab, then choose the input globs,
# the embeddings output directory and the torch device for that environment.
try:
    # Only the Colab imports are guarded, so an ImportError raised by any other
    # statement can no longer be mistaken for "not running on Colab".
    import google.colab
    from google.colab import drive
    colab = True
except ImportError:
    colab = False

if colab:
    # Imported here (not at the top of the notebook) so the local branch keeps
    # running exactly as before; torch is only needed for the GPU check below.
    import torch

    # Mount Drive first: every Colab path below lives under /content/drive.
    drive.mount('/content/drive')
    lda_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/lda/*whole*'
    bertopic_input = '/content/drive/MyDrive/NLP_files/CRAFT/results/bertopic/bertopic_lemmatize_nostopwords_data_2023-01-02_10-04-20.csv'

    embeddings_output_filepath = '/content/drive/MyDrive/NLP_files/CRAFT/data/embeddings/'
    # A Colab runtime is not guaranteed to have a GPU attached; fall back to
    # the CPU instead of failing later when the model is moved to the device.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Extends the module search path before the emb_helpers import below.
    sys.path.append('/content/drive/MyDrive/NLP/bert_embeddings')
else:
    lda_input = '../../0.RESULTS/lda/*lda*'
    bertopic_input = '../../0.RESULTS/bertopic/bertopic*.csv'

    embeddings_output_filepath = '../../0.RESULTS/embeddings/'
    device_name = 'cpu'

# Star import kept on purpose: the set of helper names used by the rest of the
# notebook is not fully visible from here (return_embeddings_for_concepts is
# one of them). Locally, emb_helpers must already be importable.
from emb_helpers import *


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Make sure the embeddings output directory exists.
# os.makedirs also creates missing parent directories (os.mkdir would raise
# FileNotFoundError), and exist_ok=True makes re-running this cell a no-op.
# If the path exists but is a regular file, this now raises instead of being
# silently ignored, so the problem surfaces here and not at write time.
os.makedirs(embeddings_output_filepath, exist_ok=True)

In [4]:
from transformers import AutoTokenizer, AutoModel

# Load the pretrained "dmis-lab/biobert-v1.1" checkpoint, then move the model
# to the device selected in the configuration cell.
biobert = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")
biobert = biobert.to(device_name)

In [5]:
# Collect every result file: LDA matches first, then BERTopic matches.
# glob returns matches in a filesystem-dependent order, so each group is sorted
# to keep positional access (e.g. files[1]) stable across machines and runs.
files = sorted(glob.glob(lda_input)) + sorted(glob.glob(bertopic_input))

In [6]:
# Sanity check: how often each topic number occurs in the second collected
# file, most frequent first.
topic_numbers = pd.read_csv(files[1])['topic_number']
Counter(topic_numbers).most_common()

[(-1, 25),
 (0, 20),
 (1, 9),
 (2, 7),
 (4, 6),
 (3, 6),
 (5, 6),
 (6, 5),
 (8, 5),
 (7, 5),
 (9, 3)]

In [7]:
# Build one embedding row per unique topic keyword found in the result files
# and save the table as 'keywords_embeddings.csv' in the output directory.
if not files:
    # pd.concat([]) would fail with a far less helpful message.
    raise FileNotFoundError(
        f'No result files matched {lda_input!r} or {bertopic_input!r}'
    )

df = pd.concat([pd.read_csv(file) for file in files])

# NOTE(security): eval() executes whatever expression is stored in the
# 'topic_keywords' column. Only run this on result files you produced yourself;
# ast.literal_eval is the safer choice if the column holds plain literals.
keyword_lists = df['topic_keywords'].apply(eval)

# Each parsed entry is iterated as (keyword, prob) pairs. A set comprehension
# replaces sum(lists, []), which re-copies the accumulated list on every step,
# and sorting gives the rows of the output file a reproducible order
# (iteration order of a set of strings differs between interpreter runs).
unique_keywords = sorted(
    {keyword for pairs in keyword_lists for keyword, prob in pairs}
)
all_keywords = pd.DataFrame(unique_keywords, columns=['keyword'])

# Use the configured device instead of a hard-coded 'cpu', so the call agrees
# with the device the model was moved to.
# NOTE(review): assumes the third argument is the torch device — confirm
# against emb_helpers.
embeddings = return_embeddings_for_concepts(all_keywords['keyword'], biobert, device_name)

# NOTE(review): this assignment aligns on the index, so it presumably relies on
# the helper returning one row per keyword with a default index — confirm.
embeddings['keyword'] = all_keywords['keyword']
# Put 'keyword' first, followed by the integer-named embedding columns.
embeddings = embeddings[['keyword'] + list(range(len(embeddings.columns)-1))]
embeddings.to_csv(os.path.join(embeddings_output_filepath, 'keywords_embeddings.csv'), index=False)

100%|████████████████████████████████████████████████████████████████████████████████| 125/125 [00:03<00:00, 33.24it/s]
