# System Preparation

## Install requirements

In [None]:
import importlib
import subprocess
import sys

# pip arguments for the CUDA 11.8 build of PyTorch
# (other builds: https://pytorch.org/get-started/locally/)
TORCH_INSTALL_ARGS = ['install', 'torch', 'torchvision', 'torchaudio',
                      '--index-url', 'https://download.pytorch.org/whl/cu118']


def _pip(*args):
    """Run pip with the interpreter of this kernel and raise if pip fails.

    ``sys.executable -m pip`` guarantees the packages land in the environment
    the notebook is running in; ``check=True`` turns a non-zero exit status
    into ``subprocess.CalledProcessError`` instead of passing silently.
    """
    subprocess.run([sys.executable, '-m', 'pip', *args], check=True)


try:
    # Check if the module is already installed
    importlib.import_module('torch')
    print("torch is already installed.")
except ImportError:
    # If the module is not installed, try installing it
    _pip(*TORCH_INSTALL_ARGS)
    print("torch was installed correctly.")

import torch
print("Torch version:",torch.__version__)
print("Is CUDA enabled?",torch.cuda.is_available())
if torch.cuda.is_available():
    # Smoke test: allocate one tensor on the GPU
    print(torch.randn(1).cuda())
else:
    # No CUDA support in the installed build: replace it with the CUDA build.
    # '-y' answers pip's confirmation prompt, which cannot be answered from a
    # subprocess. The torch module imported above stays loaded, so the kernel
    # has to be restarted before the new build is picked up.
    _pip('uninstall', '-y', 'torch')
    _pip(*TORCH_INSTALL_ARGS)

In [None]:
#install requirements
!pip install nltk
!pip install gdown
#!pip install pandas
!pip install bertopic
!pip install wordcloud
!pip install matplotlib
#!pip install bertopic[visualization]  

# Import packages

In [None]:
# Import packages
import os
import io
import sys
import ast
import re
import random
import zipfile
import requests
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
if not 'google.colab' in sys.modules:
    !pip install pyperclip
    import pyperclip as pc

# Plotly: Set notebook mode to work offline
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode()

# Folder prefix (trailing slash included) for the intermediate files
# loaded and saved further down
save_path = 'data/'

# If the current working directory path contains "notebooks", move one level
# up so that relative paths resolve from the parent folder.
launch_dir = os.getcwd()
if launch_dir.find('notebooks') != -1:
    print(launch_dir)
    os.chdir(os.path.dirname(launch_dir))
    print('cwd:', os.getcwd())


# Define functions

def simple_bool(message):
    """Ask a yes/no question on the console and return the answer as a bool.

    The prompt is ``message`` followed by ``" (y/n): "``. The reply is
    stripped of surrounding whitespace and lower-cased; "y", "yes", "yea" and
    "sure" give True, any other reply (an empty one included) gives False.
    """
    # strip() so that an accidental trailing blank ("y ") is still a yes
    answer = input(message+" (y/n): ").strip().lower()
    return answer in ("y", "yes", "yea", "sure")

def get_and_extract(file, dir = None, ext = '.zip'):
    """Download ``<file>.zip`` from Zenodo record 8205724 and unpack it.

    Parameters
    ----------
    file : str
        Archive name without extension (e.g. ``'grpm_dataset'``).
    dir : str, optional
        Folder the archive is extracted into. Defaults to the working
        directory at the time of the call.
    ext : str
        Extension used only to build the file name shown in the success
        message; the download URL always ends in ``.zip``.

    Returns nothing; the outcome is reported with ``print``.
    """
    # Resolve the default here, not in the signature: a default of
    # os.getcwd() is evaluated once, when the function is defined, and would
    # ignore any later os.chdir().
    if dir is None:
        dir = os.getcwd()
    url='https://zenodo.org/record/8205724/files/'+file+'.zip?download=1'
    zip_file_name = file+ext
    # Download the ZIP file (the whole archive is held in memory)
    response = requests.get(url)
    if response.status_code != 200:
        print("Failed to download the ZIP file.")
        return
    # Extract the ZIP contents straight from the in-memory buffer
    with io.BytesIO(response.content) as zip_buffer:
        with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
            zip_ref.extractall(dir)
    print(f"ZIP file '{zip_file_name}' extracted to '{dir}' successfully.")
        
def get_gitfile(url, flag='', dir = None):
    """Download one file from a GitHub URL and save it locally.

    Parameters
    ----------
    url : str
        File URL; a ``/blob/`` page URL is turned into its ``/raw/`` form.
    flag : str
        Prefix prepended to the saved file name.
    dir : str, optional
        Destination folder. Defaults to the working directory at the time of
        the call.

    The file is saved as ``flag`` + last path component of the URL. Returns
    nothing; the outcome is reported with ``print``.
    """
    # Resolved at call time: an os.getcwd() default in the signature would be
    # frozen when the function is defined.
    if dir is None:
        dir = os.getcwd()
    # Rewrite only the '/blob/' path segment; replacing every "blob"
    # substring would also mangle repository or file names containing it.
    url = url.replace('/blob/', '/raw/', 1)
    response = requests.get(url)
    file_name = flag + url.rsplit('/',1)[1]
    file_path = os.path.join(dir, file_name)
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"File downloaded successfully. Saved as {file_name}")
    else:
        print("Unable to download the file.")

def load_preprocessed(doc_name = 'abs_preprocessed.txt'):
    """Read ``save_path + doc_name`` (UTF-8) into a list of stripped lines.

    Prints the name of the imported file and returns the list, one item per
    line of the file.
    """
    source = save_path+doc_name
    with open(source, 'r',encoding='utf-8') as handle:
        docs_processed = [str(row.strip()) for row in handle]

    print("Imported list:", doc_name)
    return docs_processed

#get_gitfile("https://raw.githubusercontent.com/johndef64/pyutilities_datascience/main/general_utilities.py")

# Import Source Datasets
https://zenodo.org/record/8205724  DOI: 10.5281/zenodo.8205724

### Import GRPM dataset

In [None]:
# Offer to fetch the GRPM archive only when the table read below is missing
grpm_missing = not os.path.exists("grpm_dataset/grpm_db_pcg/grpm_table_output.csv")
if grpm_missing and simple_bool('Download source Dataset from Zenodo?'):
    timea = datetime.now()
    get_and_extract('grpm_dataset')
    print('Download and extraction time ',datetime.now()-timea)

def pmidstr(df):
    """Cast the ``pmids`` column of ``df`` to ``str``, modifying ``df`` in place."""
    as_text = df['pmids'].astype(str)
    df['pmids'] = as_text

# Load the GRPM tables from the grpm_db_pcg, grpm_db_rna and grpm_db_pseudo
# folders; the first CSV column is read as index and then discarded.
grpm_dataset = pd.read_csv("grpm_dataset/grpm_db_pcg/grpm_table_output.csv", index_col=0).reset_index(drop=True)
grpm_dataset_rna = pd.read_csv("grpm_dataset/grpm_db_rna/grpm_table_output.csv", index_col=0).reset_index(drop=True)
grpm_dataset_pseudo = pd.read_csv("grpm_dataset/grpm_db_pseudo/grpm_table_output.csv", index_col=0).reset_index(drop=True)

# PMIDs are handled as strings from here on
pmidstr(grpm_dataset)
pmidstr(grpm_dataset_rna)
pmidstr(grpm_dataset_pseudo)

# The table reported here is the GRPM one (the label used to say "nbib")
print('grpm dataset shape:',grpm_dataset.shape)
grpm_dataset.head()

### Import NBIB dataset

In [None]:
# Forward slashes work on every platform; the previous backslash-separated
# literal only resolved on Windows and relied on invalid escape sequences.
nbib_table = "grpm_dataset/grpm_db_pcg/complete_nbibtable.csv"

# Offer to fetch the NBIB archive only when the table is missing
if not os.path.exists(nbib_table):
    # The minute mark after "7" used to be lost: 7'' closed and reopened the
    # string literal instead of adding a quote character.
    if simple_bool("Download nbib-data from Zenodo?\n (size: 5GB unpacked; average import time: 7')"):
        timea = datetime.now()
        get_and_extract('nbib_data')
        print('Download and extraction time: ',datetime.now()-timea)

# Rows without MeSH descriptors are dropped
nbib_dataset = pd.read_csv(nbib_table, index_col=0).dropna(subset=['descriptors']).reset_index(drop=True)
print('nbib dataset shape:',nbib_dataset.shape)
nbib_dataset.head()

### Import MeSH Terms

In [None]:
# Reference MeSH terms: the 'Preferred Label' column of ref-mesh.csv
mesh_table = pd.read_csv(save_path+'ref-mesh.csv')
df_mesh = mesh_table['Preferred Label']
list_mesh = df_mesh.to_list()
docs_mesh = str(list_mesh)
mesh_count = len(list_mesh)
print(f'MeSH list count: {mesh_count}')
print(f'MeSH list sample: {list_mesh[:10]}')
#pc.copy(df_mesh.to_latex())
#df_mesh.to_csv('S1_MeSH.csv')
df_mesh

# Preprocess Data

## NBIB filtering by MeSH

In [None]:
# filtering source dataset
timea = time.time()
# GRPM rows whose MeSH term belongs to the reference list
filteres_grpm = grpm_dataset[grpm_dataset.mesh.isin(list_mesh)]

# NBIB records cited by those rows. Both sides are compared as strings:
# pmids was cast to str right after loading, whereas pubmed_id keeps the
# dtype inferred by read_csv, and isin() does not match 123 with '123'.
wanted_pmids = set(filteres_grpm.pmids.astype(str))
nbib_subset = nbib_dataset[nbib_dataset.pubmed_id.astype(str).isin(wanted_pmids)]

print((time.time()-timea)/60,'minutes')
print('pubmed id count:',nbib_subset.pubmed_id.nunique())
# NOTE(review): the "-1" presumably discounts one placeholder abstract value - confirm
print('pubmed id with abstract count:',nbib_subset.abstract.nunique()-1)
nbib_subset.head()

## save/load NBIB subset

In [None]:
# Optionally write the filtered table; the file name carries the MeSH-term count
if simple_bool('save nbib subset?'):
    subset_file = f"{save_path}nbib_subset_{mesh_count}.csv"
    nbib_subset.reset_index(drop=True).to_csv(subset_file)

# Optionally replace it with the subset stored as nbib_subset_251.csv
if simple_bool('load nbib subset?'):
    nbib_subset = pd.read_csv(f"{save_path}nbib_subset_251.csv", index_col=0)

print('nbib subset shape:', nbib_subset.shape)

## NBIB subset cleaning

In [None]:
if simple_bool('load nbib_subset_clean?'):
    # Reuse the cleaned table saved earlier and rebuild the document list from it
    nbib_subset_clean = pd.read_csv(save_path+'nbib_subset_clean_251.csv', index_col=0)
    docs = nbib_subset_clean.combined.drop_duplicates().to_list()
else:
    # Keep the bibliographic columns, drop duplicated rows and rows without abstract
    clean_columns = ['gene', 'pubmed_id', 'publication_date', 'title','abstract', 'authors', 'publication_types','place_of_publication','nlm_journal_id', 'descriptors', 'keywords',  'pii', 'doi', 'publication_status']
    deduplicated = nbib_subset[clean_columns].drop_duplicates()
    nbib_subset_clean = deduplicated.dropna(subset='abstract')

    # Prefix every abstract with its PMID and the ' $ ' flag
    pmid_text = nbib_subset_clean['pubmed_id'].astype('str')
    nbib_subset_clean['combined'] = pmid_text + ' $ ' + nbib_subset_clean['abstract']

def displaysub():
    """Print shape and unique PMID / abstract counts of ``nbib_subset_clean`` and display its head."""
    table = nbib_subset_clean
    print('nbib subset shape:',table.shape)
    print('pubmed id count:', table.pubmed_id.nunique())
    print('abstract count:', table.abstract.nunique())
    display(table.head())

displaysub()

In [None]:
if simple_bool('save nbib_subset_clean?'):
    nbib_subset_clean.reset_index(drop=True).to_csv(save_path+'nbib_subset_clean_'+str(mesh_count)+'.csv')

## Source Dataset Statistics

In [None]:
# GRPM rows restricted to the reference MeSH terms, one row per distinct
# (gene, rsid, pmids, mesh) combination
stat_columns = ['gene', 'rsid', 'pmids', 'mesh']
in_mesh_list = grpm_dataset.mesh.isin(list_mesh)
filteres_grpm = grpm_dataset[in_mesh_list]
filteres_grpm = filteres_grpm[stat_columns].drop_duplicates()

# The summary of the filtered table also goes to the system clipboard
filtered_summary = filteres_grpm.describe()
filtered_summary.to_clipboard()

print('Original Dataset')
display(grpm_dataset[stat_columns].drop_duplicates().describe())

print('Filtered by MeSH')
display(filtered_summary)

In [None]:
# Stats
keep_l = ['gene', 'pubmed_id', 'publication_date', 'authors', 'publication_types','nlm_journal_id', 'descriptors', 'keywords']
keep_s = ['gene', 'pubmed_id']

# Work on an explicit copy: assigning a column on a bare column selection of
# nbib_subset_clean raises SettingWithCopyWarning and, with pandas
# Copy-on-Write, does not modify what the author expects.
data = nbib_subset_clean[keep_s].copy()#.dropna(subset=['authors'])#, 'descriptors'])

# Statistics
# PMIDs as strings, so describe() reports count/unique/top/freq for them
data['pubmed_id'] =  data['pubmed_id'].astype(str)
# Convert string representations of lists to actual lists (unused with keep_s)
#data['authors'] = data['authors'].apply(ast.literal_eval)
#data['publication_types'] = data['publication_types'].apply(ast.literal_eval)
#data['descriptors'] = data['descriptors'].apply(ast.literal_eval)
#data['keywords'] = data['keywords'].apply(ast.literal_eval)

# Display basic statistics
print('Data Preprocessed for Topic Modeling:')
display(data.describe())

# Number of Publications per Gene (the 60 most frequent genes)
gene_counts = data['gene'].value_counts()
plt.figure(figsize=(15, 5))
gene_counts.head(60).plot(kind='bar')
plt.title('Number of Publications per Gene')
plt.xlabel('Gene')
plt.ylabel('Number of Publications')
plt.show()

## Prepare abstracts

In [None]:
# Build the document list from the cleaned subset, loading it from disk
# first when it is not in memory and the user agrees
if 'nbib_subset_clean' in globals():
    docs = nbib_subset_clean.combined.drop_duplicates().to_list()
elif simple_bool('load nbib_subset_clean?'):
    nbib_subset_clean = pd.read_csv(save_path+'nbib_subset_clean_251.csv', index_col=0)
    docs = nbib_subset_clean.combined.drop_duplicates().to_list()

pmid_total = nbib_subset.pubmed_id.nunique()
abstract_total = nbib_subset_clean.abstract.nunique()
print('PMID count:',pmid_total,
      '\nabstract count:',abstract_total,
      '\nPMID without abstract:',pmid_total-abstract_total)

## Corpus normalization

### - nltk normalization (optional)
1.  import nltk for stemming and stopword removal
2.  remove stopwords and then stem the abstracts

In [None]:
#download stopwords and the punkt tokenizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')

#tokenize, remove stopwords and stem
# import nltk for stemming and stopword removal
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import time
import random
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def stem_and_remove_stopwords(text):
    """Tokenize ``text``, lower-case it, drop stopwords and stem what is left.

    Uses the ``stop_words`` set and the ``stemmer`` defined in this cell and
    returns the stemmed tokens joined by single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in stop_words:
            continue
        kept.append(stemmer.stem(token))
    return " ".join(kept)

docs_to_process = docs#[100:1600]#random.sample(docs, 100)

# Normalize every document and report the elapsed seconds
timea = time.time()
docs_processed = list(map(stem_and_remove_stopwords, docs_to_process))
print(time.time()-timea)

# Second document before and after normalization
print(docs_to_process[1],'\n')
print(docs_processed[1])
len(docs_processed)

### - no-nltk normalization 
(best for BERTopic)

In [None]:
# lower text keeping acronyms uppercase
def lower(text):
    """Lower-case ``text`` word by word, leaving acronym-like words untouched.

    The text is split on whitespace; a word containing two or more
    consecutive capitals A-Z (e.g. ``BRCA1``, ``mRNA``) is kept as is, any
    other word is lower-cased. Words are re-joined with single spaces.
    """
    rebuilt = []
    for token in text.split():
        if re.search('[A-Z]{2,}', token):
            rebuilt.append(token)
        else:
            rebuilt.append(token.lower())
    return ' '.join(rebuilt)

# lower text keeping genes uppercase
''' Deprecated (too slow)
genes = pd.concat([grpm_dataset.gene, grpm_dataset_rna.gene, grpm_dataset_pseudo.gene])
genes= genes.drop_duplicates().reset_index(drop=True)
genes

def lower_gene(text):
    lowered_text = text.lower()
    for j in range(len(genes)):
        gene = genes[j]
        lowered_text = lowered_text.replace(' '+gene.lower()+' ', ' '+gene+' ')
        lowered_text = lowered_text.replace('('+gene.lower()+')', '('+gene+')')
        lowered_text = lowered_text.replace(' '+gene.lower()+',', ' '+gene+',')
        lowered_text = lowered_text.replace(' '+gene.lower()+'.', ' '+gene+'.')
        if j%100 == 0:
            print(gene.ljust(10), 'remaning:',len(genes)-j)
    return lowered_text
'''

docs_to_process = docs#[100:1600]#random.sample(docs, 100)

# Normalize docs
timea = time.time()
# Lower-case each document on its own. The previous version converted the
# whole list to its repr, lower-cased that single string and parsed it back
# with ast.literal_eval: that builds two extra copies of the corpus and can
# alter escape sequences of the repr (e.g. '\U000e0001' lower-cased to
# '\u000e0001' evaluates to different characters).
docs_processed = [lower(doc) for doc in docs_to_process]
print(time.time()-timea)

# Second document before and after normalization
print(docs_to_process[1],'\n')
print(docs_processed[1])
len(docs_processed)

In [None]:
# Write the normalized abstracts to a text file, one document per line;
# the file name carries the MeSH-term count
if simple_bool('save abs_preprocessed?'):
    doc_name = f'abs_preprocessed_{mesh_count}.txt'
    lines_out = '\n'.join(str(doc) for doc in docs_processed)
    with open(save_path+doc_name, 'w', encoding='utf-8') as file:
        file.write(lines_out)
    print(f"List saved as {doc_name}")

In [None]:
# load docs_processed.txt as a list
if simple_bool('load abs_preprocessed?'):
    # Keep the returned list: the call alone reads the file and then
    # throws the result away.
    docs_processed = load_preprocessed()

# Topic modeling (TM)


1.   Default setting - Vanilla (Extras)
2.   Custom - choose embedding model and set clustering parameters

## Define Parameters

Custom settings
https://maartengr.github.io/BERTopic/getting_started/tips_and_tricks/tips_and_tricks.html

In [None]:
import re
from hdbscan import HDBSCAN
from umap import UMAP
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.representation import KeyBERTInspired


# Step 1 - Extract embeddings (SBERT)
# Candidate sentence-transformers models; the one at index 0 is used below.
models = ["allenai-specter",
          "all-mpnet-base-v2",
          "all-MiniLM-L6-v2"
         ] 
    # ref: https://www.sbert.net/docs/pretrained_models.html 

#- allenai-specter: model trained on scientific citations and can be used to estimate the similarity of two publications. We can use it to find similar papers. 
#- all-mpnet-base-v2: general purpose model, The all-mpnet-base-v2 model provides the best quality
#- all-MiniLM-L6-v2: 5 times faster and still offers good quality.

base_embedder = models[0]  # name of the selected model
sentence_transformer = SentenceTransformer(base_embedder) # loaded SentenceTransformer instance


# Step 2 - Reduce dimensionality 
# uniform manifold approximation and projection (UMAP) to reduce the dimension of embeddings
# random_state is fixed for reproducibility; it is also reused in the file names written by later cells.
random_state = 1337 #1000 #1337
umap_model = UMAP(n_neighbors  = 15, #num of high dimensional neighbours
                  n_components = 5,  # default:5 #30
                  min_dist     = 0.0, 
                  random_state = random_state) # default:None
            #  ref: https://stackoverflow.com/questions/71320201/how-to-fix-random-seed-for-bertopic


# Step 3 - Cluster reduced embeddings 
# HDBSCAN (hierarchical density-based spatial clustering of applications with Noise)  to generate semantically similar document clusters. 
# Since HDBSCAN is a density-based clustering algorithm, the number of clusters is automatically chosen based on the minimum distance to be considered as a neighbor. 
min_cluster_size = 100 #5 default HDBSCAN()
hdbscan_model = HDBSCAN(min_cluster_size = min_cluster_size,
                        metric='euclidean', 
                        cluster_selection_method='eom', 
                        prediction_data=True)


# Step 4 - Tokenize topics 
vectorizer_model = CountVectorizer(stop_words="english", lowercase=False) # lowercase=False to keep genes uppercase


# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True) # False default


# Step 6 - Fine-tune topic representations with a bertopic.representation model
representation_model = MaximalMarginalRelevance(diversity = 0.7,  # 0.1 default
                                                top_n_words = 15) # 10 default
#representation_model = KeyBERTInspired()


# Use the representation model in BERTopic on top of the default pipeline

# All steps together
# NOTE(review): a custom hdbscan_model is passed, so cluster size is presumably
# governed by min_cluster_size above rather than by min_topic_size - confirm.
topic_model = BERTopic(
    min_topic_size = 10,                         # 10 default
    top_n_words = 15,                            # 10 default
    calculate_probabilities = True,
    embedding_model = sentence_transformer,      # Step 1 - Extract embeddings
    umap_model = umap_model,                     # Step 2 - Reduce dimensionality
    hdbscan_model = hdbscan_model,               # Step 3 - Cluster reduced embeddings
    vectorizer_model = vectorizer_model,         # Step 4 - Tokenize topics
    ctfidf_model = ctfidf_model,                 # Step 5 - Extract topic words
    representation_model= representation_model   # Step 6 - (Optional) Fine-tune topic representations
)

In [None]:
len(docs_processed)

## Fit Transform

In [None]:
# Fit Transform
#topics, probs = topic_model.fit_transform(docs_processed)
#topics_fit = topic_model.fit(docs_processed)

In [None]:
## If you want to split embedding phase, use it as follows:

# Step 1 Embedding documents with sentence_transformer
embedding_file = 'nbib_subset_clean_embeddings.txt'

# Fall back to the saved corpus when docs_processed is not defined yet
if 'docs_processed' not in globals():
    docs_processed = load_preprocessed()

# Either read embeddings stored as text or encode the corpus now
reuse_saved = simple_bool('Load pre-generated embeddings?')
embeddings = (np.loadtxt(save_path + embedding_file) if reuse_saved
              else sentence_transformer.encode(docs_processed, show_progress_bar=True))

# Get Topic Model using custom embeddings
topics, probs = topic_model.fit_transform(docs_processed, embeddings=embeddings)
#topics = topic_model.fit(docs_processed, embeddings=embeddings)
print('random_state:',random_state)

In [None]:
print(embeddings.shape)
topic_model.get_topic_info()

## Save Embeddings

In [None]:
if simple_bool('save grpm_abs_embeddings?'):
    filename = 'nbib_grpm_abs_embeddings.txt'
    np.savetxt(save_path+filename, embeddings)

## Save Model

In [None]:
if simple_bool('Save topics model?'):
    filename= "topic_model_seed"+str(random_state)
    # Store the model under save_path, next to its probabilities: the
    # "Load Model" cell reads both files from save_path, whereas the model
    # used to be written to the current working directory.
    topic_model.save(save_path+filename)
    np.savetxt(save_path+filename+'_probs.txt', probs)

## Load Model

In [None]:
from bertopic import BERTopic
if simple_bool('Load topics model?'):
    filename = 'topic_model_251'
    topic_model = BERTopic.load(save_path+filename)
    probs = np.loadtxt(save_path+filename+'_probs.txt')

if 'docs_processed' not in globals():
    docs_processed = load_preprocessed()

In [None]:
display(topic_model.get_topic_info().head())
pd.DataFrame(probs).head()
#topic_model.get_document_info(docs_processed).Probability

## Custom Labels

In [None]:
#basic custom labels
topic_labels = topic_model.generate_topic_labels(
    nr_words=4,
    topic_prefix=True,
    word_length=10,
    separator=" ",
)

# Fixed-width variant: every label cut or space-padded to common_length
# characters, followed by one blank
common_length = 30
padded_list = [label[:common_length].ljust(common_length) + " " for label in topic_labels]

if simple_bool('use padded list?'):
    topic_labels = padded_list

topic_model.set_topic_labels(topic_labels)
topic_model.get_topic_info()#.head()

In [None]:
# Return top n words for a specific topic and their c-TF-IDF scores
print('Top n words for a specific topic and their c-TF-IDF scores')
topic_id = 0
topic = topic_model.get_topic(topic_id, full=False)
pd.DataFrame(topic)#[1].sum()

## Topic Info

In [None]:
# Return topics with top n words and their c-TF-IDF score
all_topics = topic_model.get_topics()
topic_df = pd.DataFrame(all_topics)#.T#.head()

# Probabilities
probs_df = pd.DataFrame(probs)

# Topic FREQ
topic_freq = topic_model.get_topic_freq()
# Topic -1 is the outlier bucket of BERTopic. Select it by topic id:
# Count[0] is an index-label lookup, so it returns whatever row carries
# label 0, which need not be the outlier row (and is wrong when there are
# no outliers at all). The sum is 0 when topic -1 is absent.
outliers = topic_freq.loc[topic_freq.Topic == -1, 'Count'].sum()

print('total',topic_freq.Count.sum())
print('outliers',outliers)
print('ingroup',topic_freq.Count.sum()-outliers)

# Topic INFO
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
#mesh_count, random_state = 251,'v2',1337
if simple_bool('save topic info?'):
    filename = 'topic_info_abs_seed'+str(random_state)+'.csv'
    topic_info.to_csv(save_path+filename)

## Document Info

In [None]:
if 'docs_processed' not in locals():
    docs_processed = load_preprocessed()

doc_info = topic_model.get_document_info(docs_processed)

# Every document is "<pmid> $ <abstract>". Cut at the FIRST ' $ ' only, with
# a literal separator: str.split(' $ ') treats a multi-character pattern as
# a regex ('$' = end of string) that never matches, and split('$').str[1]
# truncates abstracts that contain a further '$'.
flag_split = doc_info.Document.str.partition(' $ ')
# First whitespace-delimited token of the part before the flag
doc_info['pubmed_id'] = flag_split[0].str.split().str[0]
# Everything after the flag (no leading blank any more)
doc_info['Abstract']  = flag_split[2]

doc_info_s = doc_info[['Topic','pubmed_id', 'Abstract', 'Name', 'Representation', 'Top_n_words', 'Probability','Representative_document']]

print(doc_info.columns)
doc_info_s[doc_info_s.Topic == 1].reset_index().sort_values(by='index').head()#.T

In [None]:
if simple_bool('Save document info?'):
    filename = 'doc_info_'+str(mesh_count)+'_rndm'+str(random_state)+'.csv'
    doc_info.to_csv(save_path+filename)

In [None]:
if simple_bool('Load document info?'):
    doc_info = pd.read_csv('data/doc_info_251_rndm1337.csv', index_col=0)
doc_info

## Get PMID and Genes

In [None]:
# get pubmed_id per topic
def get_pmids(topic_id):
    """Return the ``pubmed_id`` values of ``doc_info`` rows assigned to ``topic_id``.

    The result is a string Series with a fresh 0..n-1 index.
    """
    # (a discarded len(...) call that filtered the table a second time was removed)
    doc_topic = doc_info[doc_info.Topic == topic_id]
    pmid_topic = doc_topic.pubmed_id.reset_index(drop=True).astype(str)
    return pmid_topic

#get gene per topic
def get_genes(topic_id):
    """Return the distinct genes of ``grpm_dataset`` rows whose PMID belongs to ``topic_id``.

    PMIDs of the topic come from ``get_pmids``; the result is a Series with a
    fresh 0..n-1 index.
    """
    pmid_column = grpm_dataset['pmids']
    cites_topic = pmid_column.isin(get_pmids(topic_id))
    distinct_genes = grpm_dataset[cites_topic].gene.drop_duplicates()
    return distinct_genes.reset_index(drop=True)

if 'grpm_dataset' not in globals():
    grpm_dataset = pd.read_csv("grpm_dataset/grpm_db_pcg/grpm_table_output.csv", index_col=0).reset_index(drop=True)
    grpm_dataset['pmids'] = grpm_dataset['pmids'].astype(str)
    
get_genes(2)

In [None]:
# Get Representative Documents
print(topic_info['Representative_Docs'][1])
print('\n',topic_info['Representation'][1])

doc_info_s_rep = doc_info_s[doc_info_s.Representative_document==True]
doc_info_s_rep

In [None]:
# Print the top n words for / c-TF-IDF scores for a specific topic (here the 0)
for n in range(2):
    print(pd.Series(topic_model.get_topic(n))) 

## Topic naming

### Generate names (GPT3.5)

In [1]:
import importlib
import subprocess
import sys

# Check if pychatgpt is already installed
try:
    importlib.import_module('pychatgpt')
except ImportError:
    # If not installed, install it with the interpreter of this kernel
    # (a bare "pip" may belong to another environment); check=True raises
    # subprocess.CalledProcessError when the installation fails.
    subprocess.run([sys.executable, '-m', 'pip', 'install', 'git+https://github.com/johndef64/pychatgpt.git'], check=True)

In [2]:
# Import module
from pychatgpt import GPT
op = GPT()
op.model = 3

op.expand_chat('''You are an helpful scientific assistant''', 'system')
topic_info['CustomLabel_GPT3.5'] = ''
free_user = False

gpt_label = 'a short label of the topic. Reply only with the result.'
gpt_description = 'a short but accurate description of the topic. Reply only with the result.'

op.clearchat()
def generate(what, label):
    """Ask the chat model for one text per topic and store it in ``topic_info[label]``.

    For each row of ``topic_info`` the prompt lists up to three randomly
    sampled representative abstracts of the row's topic (from ``doc_info``),
    the topic's keyword representation, and finally the request ``what``.
    The reply is stored as ``"<row number>.<reply>"`` in column ``label``,
    which is created when missing.

    Works on the notebook globals ``topic_info``, ``doc_info``, ``op`` and
    ``free_user``, pauses periodically between requests, and returns nothing.

    Parameters
    ----------
    what : str
        Request appended to the prompt (e.g. ``gpt_label``).
    label : str
        Name of the ``topic_info`` column receiving the replies.
    """
    if label not in topic_info.columns:
        topic_info[label] = None

    for i in range(len(topic_info)):
        # Read the topic id from the row itself; "i-1" is only right while
        # the outlier topic (-1) occupies the first row.
        topic_id = topic_info.Topic.iloc[i]

        # One combined mask: indexing the filtered frame with a second mask
        # built on the full frame relies on pandas re-aligning it.
        is_candidate = (doc_info.Topic == topic_id) & (doc_info.Representative_document == True)
        candidates = doc_info[is_candidate]
        # Never request more abstracts than are available (sample() raises otherwise)
        sample = min(3, len(candidates))
        sample_docs = str(candidates.sample(sample).Abstract)

        prompt2 = """
        I have topic that contains the following documents: """ +sample_docs+ """
        The topic is described by the following keywords: """ +str(topic_info.Representation.iloc[i])+ """
        
        Based on the above information, give me """+what

        op.send_message(prompt2, model=op.model, print_token= False, save_chat=False)
        # Single-step assignment: topic_info[label][i] = ... is a chained
        # assignment that pandas may apply to a temporary copy.
        topic_info.at[topic_info.index[i], label] = str(i)+'.'+op.reply

        # Pause once every 20 requests
        if i>=3 and i % 20 == 0:
            print('\nsleeping... (61 sec)')
            time.sleep(61)

        # With free_user set, additionally pause once every 3 requests
        if free_user:
            if i>=3 and i % 3 == 0:
                print('\nsleeping... (55 sec)')
                time.sleep(55)
        print('\n')

generate(gpt_label, 'CustomLabel_GPT3.5')
generate(gpt_description, 'Description_GPT3.5')

topic_labels_gpt = topic_info['CustomLabel_GPT3.5']
topic_info.head()

Hello! How can I assist you today?
 <prompt tokens: 27>


In [None]:
if simple_bool('Save topic info GPT?'):
    filename = 'topic_info_seed'+str(random_state)+'_GPT.csv'
    topic_info.to_csv(save_path+filename)

if simple_bool('Load topic info GPT?'):
    topic_info = pd.read_csv(save_path+'topic_info_seed1337_GPT.csv', index_col=0) 

topic_info[['Topic', 'Count', 'CustomLabel_GPT3.5','Representation']]#[17:].to_clipboard(index=False)#csv('table.csv')
#['CustomLabel_gpt-3.5'][0]

In [None]:
latex_table = topic_info[['Topic', 'Count', 'CustomLabel_GPT3.5']].to_latex(index=False)
import pyperclip as pc
pc.copy(latex_table)

RPM (requests per minute) – The maximum requests allowed per minute  
RPD (requests per day) – The maximum requests allowed per day  

TPM (tokens per minute) – The maximum tokens allowed to be sent per minute

In [None]:
data = {'TIER': ['Free', 'Tier 1', 'Tier 2', 'Tier 3', 'Tier 4', 'Tier 5'],
        'QUALIFICATION': ['User must be in an allowed geography', '$5 paid', '$50 paid and 7+ days since first successful payment','$100 paid and 7+ days since first successful payment', '$250 paid and 14+ days since first successful payment','$1000 paid and 30+ days since first successful payment'],
        'MAX CREDITS': ['$100', '$100', '$250', '$500', '$1000', '$1000'],
        'REQUEST LIMITS': ['3 RPM, 200 RPD', '500 RPM, 10K RPD', '5000 RPM', '5000 RPM', '10K RPM', '10K RPM'],
        'TOKEN LIMITS': ['10K TPM', '20K TPM', '40K TPM', '80K TPM', '300K TPM', '300K TPM']}
print('Openai API reference:')
pd.DataFrame(data)

-- GOAL: PMID clustering and topic definition ('openai')

### Set topic labels

In [None]:
# basic custom labels 
topic_model.set_topic_labels(topic_labels)

In [None]:
# gpt-3.5 custom labels 
topic_model.set_topic_labels(topic_labels_gpt.to_list())
topic_model.get_topic_info().head()

# Topic Visualization

Topic Visualization
Evaluating topic models can be rather difficult due to the somewhat subjective nature of evaluation. Visualizing different aspects of the topic model helps in understanding the model and makes it easier to tweak the model to your liking.

| Method                                      | Code                                    |
|--------------------------------------------|-----------------------------------------|
| Visualize Topics                            | `.visualize_topics()`                     |
| Visualize Documents                         | `.visualize_documents()`                  |
| Visualize Document Hierarchy                | `.visualize_hierarchical_documents()`     |
| Visualize Topic Hierarchy                   | `.visualize_hierarchy()`                  |
| Visualize Topic Tree                        | `.get_topic_tree(hierarchical_topics)`    |
| Visualize Topic Terms                       | `.visualize_barchart()`                   |
| Visualize Topic Similarity                  | `.visualize_heatmap()`                    |
| Visualize Term Score Decline                | `.visualize_term_rank()`                  |
| Visualize Topic Probability Distribution    | `.visualize_distribution(probs[0])`       |
| Visualize Topics over Time                  | `.visualize_topics_over_time(topics_over_time)` |
| Visualize Topics per Class                  | `.visualize_topics_per_class(topics_per_class)` |

## Document visualization

In [None]:
# visualize_documents (sample) ! Computationally intensive !

topic_model.set_topic_labels(topic_labels)
topic_model.visualize_documents(docs_processed, sample= 0.05, custom_labels=True)  
#topic_model.visualize_documents(docs_processed, embeddings=embeddings)

In [None]:
# visualize_documents (sample) (using embeddings, faster)
if 'embeddings' not in locals():
    embedding_file = 'abs_embeddings_seed1337.txt'
    embeddings = np.loadtxt(save_path+embedding_file)

topic_model.set_topic_labels(topic_labels)
topic_model.visualize_documents(docs_processed, embeddings=embeddings,sample= 0.1, custom_labels=False)

## Topic Visualization

### Intertopic Distance Map

After having trained our BERTopic model, we can iteratively go through hundreds of topics to get a good understanding of the topics that were extracted. However, that takes quite some time and lacks a global representation. Instead, we can visualize the topics that were generated in a way very similar to LDAvis.

We embed our c-TF-IDF representation of the topics in 2D using Umap and then visualize the two dimensions using plotly such that we can create an interactive view.

In [None]:
topic_model.visualize_topics() #can help you visualize topics generated with their sizes and corresponding words.

# Post-processing

## Topic Evaluation
thanks to Topic Clusterization and Intertopic Distance Map 

Results:
- based on 251 MeSH terms, we retrieved 37.000 papers from the GRPM Dataset
- Topic Modeling on abstracts, we got 50 Topics
- Topic Evaluation, from 50 Topics to X0 topics
- from 37.000 papers to 10.238 papers, XX.000 excluded and 13.YYY outliers
- from 8.280 genes to 3.911 genes (cross papers with GRPM ds)
- get genes network for selected topics 

### Curated Topic Evaluation  

In [None]:
# Topic ids kept for the output dataset, grouped by cluster; a cluster
# comment followed by no ids means no topic of that cluster is listed.
# NOTE(review): presumably hand-picked from the Intertopic Distance Map - confirm.
pertinent_topics = [ 11,28,48,18,2,5,  #cluster1:
                     9,35, #cluster2
                     12,#cluster3
                     #cluster4
                     37,32,19,40,26,33, #cluster5
                     3,31,#cluster6
                     #cluster7
                     #cluster8
                     #cluster9
                     8,20,4,34#cluster10
                     ]

# Topic ids left out. This list is not used by the code visible in this
# notebook. Ids 39 and 16 appear twice and the "#cluster3" comment is
# repeated on two consecutive rows.
non_pertinent_topics = [ 27,47, #cluster1
                         1,#cluster2
                         0,30,21,49,36, #cluster3
                         39,13,#cluster3
                         16,39, #cluster4
                         #cluster5
                         29,#cluster6
                         45,25,7,44,24,23,14,#cluster7
                         38,50,46,10,42,17,#cluster8
                         41,6,16,15,43,22,#cluster9
                         #cluster10
                         ]
print('Selected Topics:')
# Rows of topic_info belonging to the selected topics
topic_info[topic_info.Topic.isin(pertinent_topics)]

### Get Representative Docs

In [None]:
#Get Representative_document abstracts for pertinent topics
doc_info_s_rep = doc_info_s[doc_info_s.Representative_document==True]
doc_info_s_rep_pert = doc_info_s_rep[doc_info_s_rep.Topic.isin(pertinent_topics)]
doc_info_s_rep_pert.Abstract.to_list()
#pc.copy(str(doc_info_s_rep_pert.Abstract.to_list()))
#pc.copy(str(doc_info_s_rep_pert.pubmed_id.to_list()))

In [None]:
# visualize_documents (sample)
if 'embeddings' not in locals():
    embedding_file = 'abs_embeddings_seed1337.txt'
    embeddings = np.loadtxt(save_path+embedding_file)
    
topic_model.set_topic_labels(topic_labels)
topic_model.visualize_documents(docs_processed, topics= pertinent_topics, embeddings=embeddings,sample= 0.1, custom_labels=False)

In [ ]:
!pip install squarify

In [None]:
# Evaluation Results
topic_info_pertinent = topic_info[['Topic', 'Count','CustomName', 'Representation']][topic_info.Topic.isin(pertinent_topics)]
doc_info_s_pertinent = doc_info_s[doc_info_s.Topic.isin(pertinent_topics)]

total_count = topic_info.Count.sum()
pertinent_count = topic_info_pertinent.Count.sum()
# Outliers are the documents of topic -1. Select them by topic id:
# Count[0] is an index-label lookup and only returns the outlier count while
# the row labelled 0 happens to be topic -1. The sum is 0 without outliers.
outliers = topic_info.loc[topic_info.Topic == -1, 'Count'].sum()
non_pertinent_count = total_count-outliers-pertinent_count

print('total count:', total_count)
print('outliers count:', outliers)
print('non pertinent count:', non_pertinent_count)
print('pertinent count:', pertinent_count)

# Alternative interactive treemap, kept for reference. It used to be a bare
# string literal, which the notebook echoed as the output of this cell.
# import plotly.express as px
# data = {
#     'Type': ['Pertinent Count', 'Non-Pertinent Count', 'Outliers'],
#     'Value': [pertinent_count, non_pertinent_count, outliers]
# }
# fig = px.treemap(data, path=['Type'], values='Value', title='Treemap')
# fig.show()

In [None]:
import matplotlib.pyplot as plt
import squarify

# Treemap of the evaluation split: one tile per category, as (label, size, colour).
treemap_parts = [
    ('Pertinent Count', pertinent_count, 'green'),
    ('Non-Pertinent Count', non_pertinent_count, 'red'),
    ('Outliers', outliers, 'grey'),
]
labels = [part[0] for part in treemap_parts]
sizes = [part[1] for part in treemap_parts]
colors = [part[2] for part in treemap_parts]

plt.figure(figsize=(4, 4))
squarify.plot(sizes=sizes, label=labels, color=colors, alpha=0.7)
#plt.axis('off')
#plt.title('Treemap')
# Save before show(): the figure is written to disk while it is still current.
plt.savefig('evaluation_placeholder.png', dpi=300)
plt.show()

# Cell output: the pertinent topics table.
topic_info_pertinent#.describe()

## Output Dataset 
### Output Dataset Stats

In [None]:
# Compare the dataset filtered by MeSH term with the subset selected through
# the pertinent topics.
filteres_grpm = grpm_dataset[grpm_dataset.mesh.isin(list_mesh)]
filteres_grpm = filteres_grpm[['gene', 'rsid', 'pmids', 'mesh']].drop_duplicates()
filtered_stats = filteres_grpm.describe()  # computed once, reused below

# Copying to the clipboard is a convenience only: without a clipboard backend
# (headless server, Colab) pandas raises a RuntimeError subclass, which must
# not abort the rest of the cell.
try:
    filtered_stats.to_clipboard()
except RuntimeError as err:
    print('clipboard not available, skipping copy:', err)

# Rows of the source dataset whose PMID belongs to a pertinent topic.
pertinent_pmid = doc_info[doc_info.Topic.isin(pertinent_topics)].pubmed_id
modeled_grpm = grpm_dataset[grpm_dataset.pmids.isin(pertinent_pmid)]

print('Filtered by MeSH')
display(filtered_stats)
print('Selected by Topic')
display(modeled_grpm.describe())


### Get and Enrich Output Dataset

In [None]:
# Build the enriched output dataset: topic assignment + NBIB metadata + the
# genes / rsids linked to each PMID.

# Topic assignment of every pertinent document.
output_ds = doc_info_s_pertinent[['Topic', 'pubmed_id', 'Abstract', 'Name', 'Representation']]

# filter NBIB: distinct metadata rows, ids cast to str before matching.
nbib_columns = ['pubmed_id', 'title', 'abstract', 'keywords', 'descriptors']
filterd_nbib = nbib_subset_clean[nbib_columns].drop_duplicates()
filterd_nbib = filterd_nbib.assign(pubmed_id=filterd_nbib.pubmed_id.astype(str))
output_nbib = filterd_nbib[filterd_nbib.pubmed_id.isin(output_ds.pubmed_id)]
#output_nbib.to_csv(save_path+'RAG_LLM_nutrigentic_dataset.csv')

# MERGE NBIB & TOPICS
output_ds_topic = pd.merge(
    output_ds[['Topic', 'pubmed_id', 'Name', 'Representation']],
    output_nbib,
    on='pubmed_id',
).rename(columns={
    'Topic': 'topic_id',
    'Name': 'topic_name',
    'Representation': 'topic_representation',
})
#output_ds_topic.to_csv(save_path+'output_dataset_bertopic_nutrigentics.csv')

# MERGE NBIB-TOPICS & GENE-RSID
# One row per PMID, with its rsids and genes collected into lists.
df = modeled_grpm[['gene', 'rsid', 'pmids']]
transformed_df = df.groupby('pmids').agg({'rsid': list, 'gene': list}).reset_index()

output_columns = ['topic_id', 'pubmed_id', 'topic_name', 'topic_representation',
                  'title', 'abstract', 'keywords', 'descriptors', 'genes', 'rsids']
output_ds_topic_gene_rsid = (
    pd.merge(output_ds_topic, transformed_df, left_on='pubmed_id', right_on='pmids')
    .rename(columns={'gene': 'genes', 'rsid': 'rsids'})
    .drop(columns='pmids')
)
output_ds_topic_gene_rsid = output_ds_topic_gene_rsid[output_columns]
output_ds_topic_gene_rsid#.to_csv(save_path+'output_dataset_bertopic_nutrigentics.csv')

### Load Enriched Output Dataset

In [ ]:
output_ds_topic_gene_rsid = pd.read_csv(save_path+'output_dataset_bertopic_nutrigenetics.csv', index_col=0)
output_ds_topic_gene_rsid

In [ ]:
# get enriched representative docs
# Show title / abstract / keywords of the representative documents of one topic.
topic_id = 2
doc_info_s_rep_pmid_bool = doc_info_s_rep[['pubmed_id', 'Representative_document']]

# filter for representative docs
# Compare ids as strings on both sides: when the dataset was loaded from CSV
# its pubmed_id column is presumably parsed as integers, while other cells of
# this notebook cast ids to str -- a mixed comparison would match nothing.
representative_ids = doc_info_s_rep_pmid_bool.pubmed_id.astype(str)
is_representative = output_ds_topic_gene_rsid.pubmed_id.astype(str).isin(representative_ids)
output_ds_topic_gene_rsid_rep = output_ds_topic_gene_rsid[is_representative]

# Build the topic mask from the frame being indexed (the original used the
# unfiltered frame, which relies on pandas re-aligning the mask and warns).
in_topic = output_ds_topic_gene_rsid_rep.topic_id == topic_id
output_ds_topic_gene_rsid_rep[in_topic][['title', 'abstract', 'keywords']]

### Extras

In [None]:
# to latex
# Export the pertinent topics (id, size, GPT label) as a LaTeX table and copy
# it to the clipboard.
# .copy() makes `df` an independent frame, so the column assignment below does
# not write into a slice of topic_info (chained-assignment warning).
df = topic_info[['Topic', 'Count', 'CustomLabel_GPT3.5']].copy()
# Strip everything up to and including the first '.' of each label.
df['CustomLabel_GPT3.5'] = df['CustomLabel_GPT3.5'].str.replace(r'^.*?\.', '', regex=True)
df_isin = df[df.Topic.isin(pertinent_topics)]

latex_table = df_isin.to_latex(index=False, caption='Pertinent Topics')
import pyperclip as pc
pc.copy(latex_table)

In [None]:
# Return top n words for a specific topic and their c-TF-IDF scores
print('Top n words for a specific topic and their c-TF-IDF scores')
# Look labels up by topic id, not by row position: when topic_info's first row
# is the outlier topic (-1), row label i holds topic i-1, so indexing
# CustomName with the topic id would print the neighbouring topic's label.
custom_name_by_topic = topic_info.set_index('Topic')['CustomName']
for topic_id in pertinent_topics:
    topic = topic_model.get_topic(topic_id, full=False)
    print('\n', custom_name_by_topic.loc[topic_id])
    print(pd.DataFrame(topic))#[1].sum()

The topic with id -1 groups all abstracts that were not assigned to any topic (outliers). We will ignore it.

# Display Results

### Visualize Probabilities or Distribution


In [None]:
# To visualize the probabilities of topic assignment
# Plots the topic probability distribution of the first three documents.
# Assumes `probs` holds one probability vector per document -- TODO confirm the
# model was fitted with probabilities enabled.
import plotly.graph_objects as go
for n in range(3):
    # visualize_distribution returns its own figure, so no empty placeholder
    # Figure is created beforehand.
    fig = topic_model.visualize_distribution(probs[n],
                                             min_probability=0,
                                             custom_labels=False)
    fig.show()

In [None]:
!pip install jinja2

In [None]:
# Calculate the topic distributions on a token-level
topic_distr, topic_token_distr = topic_model.approximate_distribution(docs_processed, calculate_tokens=True, min_similarity=0)

In [None]:
# Visualize the token-level distributions
# NOTE: despite its name, `topic_n` is used as a document position here: it
# indexes both docs_processed and topic_token_distr.
topic_n = 4
df = topic_model.visualize_approximate_distribution(docs_processed[topic_n], topic_token_distr[topic_n])
df

In [None]:
# Look up the doc_info row(s) whose text equals the document at position
# `topic_n` of docs_processed (`topic_n` is a document position, not a topic id).
topic_n = 2
doc_info[doc_info.Document == docs_processed[topic_n]].reset_index().sort_values(by='index')#.head()#.T

### Similarity Matrix
Having generated topic embeddings (from both c-TF-IDF and the document embeddings), we can build a similarity matrix by computing the cosine similarity between every pair of topic embeddings. The result is a matrix indicating how similar the topics are to each other.

In [None]:
!pip install -U kaleido

In [None]:
import plotly.graph_objects as go

def show_similarty(topics, top_n_topics=51, width=1100, height=1100):
    """Display BERTopic's topic-similarity heatmap with custom labels.

    Args:
        topics: topic ids forwarded to ``visualize_heatmap``; ``None`` lets
            BERTopic choose the topics itself.
        top_n_topics: number of topics forwarded to ``visualize_heatmap``
            (the previously hard-coded 51 is kept as the default).
        width: figure width in pixels.
        height: figure height in pixels.

    Returns:
        None. The figure is only shown, so a call on the last line of a cell
        does not render it a second time.
    """
    # visualize_heatmap builds the figure itself; no placeholder is needed.
    fig = topic_model.visualize_heatmap(topics=topics,
                                        top_n_topics=top_n_topics,
                                        #n_clusters = 20,
                                        custom_labels=True,
                                        width=width,
                                        height=height)
    #fig.write_image("fig1.png", engine='kaleido')
    fig.show()
    
show_similarty(None) # this is correct, in-line with the Intertopic distance map
#show_similarty(topic_info.Topic[:len(topic_info.Topic)-10]) # !!! reorder topic doesn't work
#show_similarty(hierarchy[:len(hierarchy)]) # !!! reorder topic doesn't work
#show_similarty(pertinent_topics) # !!! cherry-picking topics doesn't work
#show_similarty([7, 24,0,1,34,14,23]) 
#show_similarty([7, 14,20,14,3,4,2])
# Conclusion: the heatmap is fixed 

### Similarity Matrix (custom)

In [None]:
pertinent_topics.sort()
pertinent_topics

In [None]:
topic_info[topic_info.Topic.isin(pertinent_topics)].CustomName

In [None]:
import numpy as np
import plotly.express as px
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of topic embeddings. The first row of
# topic_embeddings_ is dropped so that matrix position i corresponds to topic i
# (assumes that row is the outlier topic -1 -- TODO confirm for models without
# outliers). Despite its name, `distance_matrix` holds similarities.
distance_matrix = cosine_similarity(np.array(topic_model.topic_embeddings_)[1:, :])

# Filter rows and columns based on topic ids. Duplicated ids are dropped so
# the matrix and its labels always have the same length.
pertinent_topics.sort()
rows_to_keep = list(dict.fromkeys(pertinent_topics))
cols_to_keep = rows_to_keep
filtered_distance_matrix = distance_matrix[np.ix_(rows_to_keep, cols_to_keep)]

# get labels
# Look each label up by topic id, in the same order as the matrix rows, rather
# than relying on topic_info being sorted by topic id.
labels = topic_info.set_index('Topic')['CustomName'].loc[rows_to_keep].to_list()

fig = px.imshow(filtered_distance_matrix,
                labels=dict(color="Similarity Score"),
                x=labels,
                y=labels,
                color_continuous_scale='GnBu'
                )
title: str = "<b>Similarity Matrix</b>"
fig.update_layout(
    title={
        'text': f"{title}",
        'y': .95,
        'x': 0.55,
        'xanchor': 'center',
        'yanchor': 'top',
        'font': dict(
            size=22,
            color="Black")
    },
    width=800,
    height=800,
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family="Rockwell"
    ),
)
fig.update_layout(showlegend=True)
fig.update_layout(legend_title_text='Trend')
# Static export; requires the kaleido package.
fig.write_image("heatmap_short.png", engine="kaleido", scale=3)
fig.show()

## Hierarchy Visualization

### Hierarchical clustering

In [None]:
topic_model.visualize_hierarchy(custom_labels=True, width=700, height=600) #The topics that were created can be hierarchically reduced.

### Hierarchical labels

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(docs_processed)
topic_model.visualize_hierarchy(custom_labels=True, width=700, height=600, hierarchical_topics=hierarchical_topics)

In [ ]:
# Print the topic hierarchy as a text tree.
# Reuse `hierarchical_topics` when it is already in memory instead of
# recomputing it. NOTE: delete the variable (or re-run the cell that builds
# it) after refitting the model, otherwise a stale hierarchy is used.
if 'hierarchical_topics' not in globals():
    hierarchical_topics = topic_model.hierarchical_topics(docs_processed)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

## Terms Visualization

### Wordclouds

In [None]:
# Show wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def create_wordcloud(model, topic):
    """Print and plot the word cloud of a single topic.

    Args:
        model: object whose ``get_topic(topic)`` yields (word, score) pairs.
        topic: topic id passed to ``model.get_topic``.
    """
    # Word -> score mapping used as the cloud's frequencies.
    word_scores = dict(model.get_topic(topic))
    print(word_scores)
    cloud = WordCloud(background_color="white", max_words=1000, width=800, height=400)
    cloud.generate_from_frequencies(word_scores)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

def create_wordcloud_multiple(topics, output_path='wordcloud.png', dpi=300, save=True, model=None):
    """Plot one word cloud built from the words of several topics.

    Args:
        topics: iterable of topic ids.
        output_path: image path used when ``save`` is true.
        dpi: resolution of the saved image.
        save: whether to write the figure to ``output_path``.
        model: object providing ``get_topic``; defaults to the notebook's
            global ``topic_model`` so existing calls keep working.
    """
    if model is None:
        model = topic_model

    # NOTE(review): a word shared by several topics keeps the score of the
    # last topic that contains it (dict.update overwrites) -- confirm whether
    # summing or taking the maximum was intended.
    merged_dict = {}
    for topic in topics:
        merged_dict.update(dict(model.get_topic(topic)))
    #pc.copy(str(merged_dict))

    plt.figure(figsize=(12, 8))

    wc = WordCloud(background_color="white", max_words=1000, width=1000, height=500)
    wc.generate_from_frequencies(merged_dict)

    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    # Save before show(): the figure is written while it is still current.
    if save:
        plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
    plt.show()

def create_wordcloud_from_corpus(corpus, output_path='wordcloud.png', dpi=300, save=True, replace=False):
    """Plot a word cloud from raw texts.

    Args:
        corpus: iterable of strings, joined with single spaces before the
            cloud is generated.
        output_path: image path used when ``save`` is true.
        dpi: resolution of the saved image.
        save: whether to write the figure to ``output_path``.
        replace: accepted but not used by this function.
    """
    # WordCloud tokenises one long string, so the corpus is flattened first.
    joined_text = ' '.join(corpus)
    cloud = WordCloud(background_color="white", max_words=1000, width=800, height=400)
    cloud.generate(joined_text)
    # Draw the cloud without axes.
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    if save:
        plt.savefig(output_path, dpi=dpi, bbox_inches='tight')
    plt.show()

In [None]:
create_wordcloud_multiple(pertinent_topics, dpi=500, save = True,output_path=save_path+'wordcloud_pert.png')

In [None]:
# create_wordcloud_from_corpus
# Word cloud of the pertinent abstracts. It is written to its own file: the
# topic-word cloud in the previous cell already saves to 'wordcloud_pert.png',
# and reusing that path here would overwrite it.
create_wordcloud_from_corpus(doc_info_s_pertinent.Abstract.to_list(), dpi=500, save=True, output_path=save_path+'wordcloud_pert_corpus.png')

In [None]:
# Show wordcloud
for i in pertinent_topics[:2]:
    print('')
    create_wordcloud(topic_model, topic=i)

### Topic Word score (c-TF-IDF)

In [None]:
# show the selected terms for a few topics by creating bar charts out of the c-TF-IDF scores.
topic_model.visualize_barchart(n_words = 20, 
                               topics = pertinent_topics,
                               #top_n_topics=len(topic_info)//4,
                              )  

### Term Score Decline 

In [None]:
#Term Score Decline 
# [TF-IDF — Term Frequency-Inverse Document] #https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/
# Each topic is represented by a set of words.
# These words, however, do not all equally represent the topic.
# This visualization shows how many words are needed to represent a topic and at which point the beneficial effect of adding words starts to decline.
# https://maartengr.github.io/BERTopic/getting_started/ctfidf/ctfidf.html
topic_model.visualize_term_rank()#(log_scale=True)

## Topic Over Time

In [None]:
# preprocess dates
# Derive, for every distinct (pubmed_id, publication_date) pair, a year string
# (`years`) and a "YYYY Mon" string (`dates`).
import re
if 'nbib_subset_clean' not in globals():
    print("loading 'nbib_subset_clean'...")
    nbib_subset_clean = pd.read_csv(save_path+'nbib_subset_clean_251.csv', index_col=0)
df_crop = nbib_subset_clean[['pubmed_id','publication_date']].drop_duplicates()
dates = df_crop.publication_date.to_list()

# Year: first space-separated token, cut at the first '-'.
years = [raw.split(' ')[0].split("-")[0] for raw in dates]


def _year_and_month(raw_date):
    """Reduce a raw publication date to 'YYYY Mon' ('Jan' when only a year is left)."""
    first_two_tokens = " ".join(raw_date.split(" ")[:2])
    before_dash = first_two_tokens.split("-")[0].strip()
    cleaned = before_dash.split("/")[0].strip()
    if len(cleaned) == 4:
        # Only a year remained: default the month to January.
        cleaned += ' Jan'
    return cleaned


dates = [_year_and_month(raw) for raw in dates]

# Disabled experiment: convert 'YYYY Mon' into 'MM/YYYY'.
# converted_dates = []
# import datetime
# for date in dates:
#     try:
#         dt = datetime.datetime.strptime(date, "%Y %b")
#         converted_dates.append(dt.strftime("%m/%Y"))
#     except ValueError:
#         converted_dates.append('Invalid Date')
# converted_dates[:10]

# Cell output: both lengths must match for topics_over_time.
len(years), len(docs_processed)
#pd.Series(years).drop_duplicates().sort_values(ascending=False)

In [None]:
# Compute the topic representation per timestamp, using the year strings.
datetime_formats = ['%Y %b', "%Y"]
year_only_format = datetime_formats[1]  # `years` holds plain 'YYYY' strings
#topics, probs = topic_model_t.fit_transform(docs_processed,embeddings)
topics_over_time = topic_model.topics_over_time(
    docs_processed,
    years,
    datetime_format=year_only_format,
    global_tuning=True,
    evolution_tuning=True,
)

In [None]:
topics_over_time

In [None]:
#topic_model_t.visualize_topics_over_time(topics_over_time[topics_over_time["Timestamp"]<2023],top_n_topics=10,width=1250,height=450).show()
#topic_model_t.visualize_topics_over_time(topics_over_time[topics_over_time["Timestamp"]<2023],top_n_topics=10,custom_labels = True, title = "<b>Topics over Time</b>", width=1400,height=800).show()

# Frequency over time of the pertinent topics, with custom labels and raw
# (non-normalised) frequencies.
over_time_figure = topic_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=10,
    custom_labels=True,
    topics=pertinent_topics,
    normalize_frequency=False,
    title="<b>Topics over Time</b>",
    width=1400,
    height=800,
)
over_time_figure.show()

# Extras

## Alternative NBIB Preprocessors

In [None]:
# Alternative ways to build `nbib_subdataset`, the rows of `nbib_dataset` whose
# descriptors match the reference MeSH terms. Every step is opt-in through an
# interactive y/n prompt.
timea = time.time()

if simple_bool('Preprocess source dataset (slow preprocessor)'):
    timea = time.time()  # restart: do not count the time spent at the prompt
    mesh_terms = set(list_mesh)  # constant-time membership tests
    num = len(nbib_dataset)  # lower this to process only the first rows
    matching_rows = []
    for j in tqdm(range(num)):
        # Each cell is the string form of a list of dicts carrying a
        # 'descriptor' key.
        descriptors = ast.literal_eval(nbib_dataset.descriptors.iloc[j])
        if any(entry['descriptor'] in mesh_terms for entry in descriptors):
            matching_rows.append(j)
    # Select all matches at once instead of growing a DataFrame with
    # pd.concat on every hit (quadratic, and it degrades column dtypes).
    nbib_subdataset = nbib_dataset.iloc[matching_rows].reset_index(drop=True)
    print((time.time()-timea)/60,'minutes')

if simple_bool('Use alternative preprocessor (fast unaccurate)'):
    timea = time.time()  # restart: do not count the time spent at the prompt
    # regex=False: '+' is removed as a literal character.
    mesh = df_mesh['Preferred Label'].drop_duplicates().dropna().str.replace('+', '', regex=False)
    # Escape every term so characters such as '(' or '.' are matched literally
    # instead of being interpreted as regular-expression syntax.
    mesh_pattern = '|'.join(re.escape(term) for term in mesh)
    # na=False: rows without descriptors are simply not selected.
    nbib_subdataset = nbib_dataset[nbib_dataset.descriptors.str.contains(mesh_pattern, na=False)]
    print((time.time()-timea)/60,'minutes')

if simple_bool('save output?'):
    # Write into the data directory; the previous 'data'+'nbib_subset' had no
    # path separator and produced 'datanbib_subset<N>.csv' in the working dir.
    nbib_subdataset.to_csv(save_path+'nbib_subset'+str(mesh_count)+'.csv')

# Cell output: distinct PubMed ids of the subset.
nbib_subdataset.pubmed_id.drop_duplicates()

## Train Model: Default setting

In [None]:
# Run model
# Fits a BERTopic model with an explicit HDBSCAN clustering step. The cell is
# interactive: it asks whether to load pre-computed embeddings and whether to
# save the embeddings at the end.
import time
from hdbscan.hdbscan_ import HDBSCAN

# Prepare embeddings
""" SentenceTransformers is a Python framework for state-of-the-art sentence, text and image embeddings. """
# NOTE(review): this is the file read by the "load" branch; the "save" step at
# the bottom writes to a different name ('grpm_abs_embeddings.txt').
embedding_file = 'grpm_abs_embeddings_313_nonltk.txt'

if simple_bool('Load pre-generated embeddings?'):
    embeddings = np.loadtxt(save_path+embedding_file)
else:
    from sentence_transformers import SentenceTransformer

    sentence_model = SentenceTransformer("all-mpnet-base-v2")
    embeddings = sentence_model.encode(docs_processed,
                                       show_progress_bar=True)
# Quick sanity check: first five values of the first embedding, and its type.
print(embeddings[0][0:5])
print('type',type(embeddings))

#Find the optimal number of clusters with HDBscan
""" HDBSCAN: Hierarchical Density-Based Spatial Clustering of Applications with Noise """
timea = time.time()
hdbscan_model = HDBSCAN(min_cluster_size=100, #15
                        metric='euclidean',
                        cluster_selection_method='eom',
                        prediction_data=True)

# NOTE: only the HDBSCAN object is constructed between timea and timeb, so
# this first timing does not include any fitting.
timeb = time.time()
print((timeb - timea)/60, 'minutes')

# Run BERTopic
""" BERTopic is a topic modeling technique """
# NOTE(review): `doc` is not used again in this cell, and `docs` must already
# exist in the notebook for this line to run.
doc = docs #docs_processed

topic_model_HBDSCAN = BERTopic(hdbscan_model=hdbscan_model).fit(docs_processed, embeddings)

# Elapsed time since timea, i.e. including the BERTopic fit.
print((time.time() - timea)/60, 'minutes')
'''
LOG: GRPM EMBEDDING 17.000 ABS IN 18.797277983029684 minutes  
LOG: GRPM EMBEDDING 44.000 ABS IN 3.5 minutes on server <***>  
LOG: GRPM EMBEDDING 42.700 ABS IN 2.993170352776845 minutes on server <***>
'''

# Saving the array as a txt file
# (the prompt is shown even when the embeddings were just loaded from disk)
if simple_bool('save grpm_abs_embeddings?'):
    np.savetxt(save_path+'grpm_abs_embeddings.txt', embeddings)

## accessory functions

In [None]:
# Prepare data
# Project the embeddings to 2-D with UMAP and attach one cluster label per row.
# NOTE(review): `UMAP` and `cluster` are not defined in the visible part of
# this notebook -- they must be created before this cell is run.
umap_data = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
# Label -1 marks points without a cluster; they are drawn in grey.
# NOTE(review): this rebinds `outliers`, a name the evaluation cell of this
# notebook uses for a document count -- re-run that cell before reusing it.
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
# Clustered points are coloured by their label.
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()

## optimize labels

In [None]:
from transformers import pipeline
# Zero-shot classifier used to score candidate labels against a topic's words.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli") #base_embedder

# A selected topic representation: the words of topic 1 joined by spaces
# (e.g. 'god jesus atheists atheism belief atheist believe exist beliefs existence'
# in the generic example this cell appears to be adapted from).
sequence_to_classify =  " ".join([word for word, _ in topic_model.get_topic(1)])

# Our set of potential topic labels
candidate_labels = ['brain', 'liver', 'cancer']

# Cell output: the classifier's result for the candidate labels.
classifier(sequence_to_classify, candidate_labels)

# Shape of the returned value, with values from that generic example
# (presumably not produced by this dataset):
#{'labels': ['cooking', 'dancing', 'religion'],
# 'scores': [0.086, 0.063, 0.850],
# 'sequence': 'god jesus atheists atheism belief atheist believe exist beliefs existence'}

In [None]:
# Reload the enriched output dataset, reading it the same way as the
# "Load Enriched Output Dataset" cell: from the data directory, with the first
# CSV column as index (otherwise it comes back as an extra 'Unnamed: 0' column).
df = pd.read_csv(save_path+'output_dataset_bertopic_nutrigenetics.csv', index_col=0)
# Ids are compared as strings against grpm_dataset.pmids in the next cell.
df.pubmed_id = df.pubmed_id.astype(str)
df

In [None]:
grpm_subste=grpm_dataset[grpm_dataset.pmids.isin(df.pubmed_id)]

In [None]:
# Distinct MeSH terms found in the subset ...
out_mesh = grpm_subste.mesh.drop_duplicates()
# ... and, as cell output, those that belong to the reference MeSH list
# (already unique, so no second de-duplication is needed).
out_mesh[out_mesh.isin(list_mesh)]