In [6]:
import ast
import datetime
import os

import numpy as np
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


# BERTopic function definitions

In [7]:
def get_now_str():
    """Return the current local date and time as a filename-friendly string.

    Returns
    -------
    str
        Timestamp formatted as ``YYYY-MM-DD_HH-MM-SS``.
    """
    timestamp_format = "%Y-%m-%d_%H-%M-%S"
    current_time = datetime.datetime.now()
    return current_time.strftime(timestamp_format)

In [8]:
def transform_strings_to_arrays(df, col_names = ['tokenized_sentences', 'tokenized_words', 'tokenized_words_processed', 'tokenized_words_no_stopwords', 'tokenized_words_lemmatize']):
    """Parse stringified Python literals in the given columns back into objects.

    Each string cell of every column in ``col_names`` is parsed with
    ``ast.literal_eval``, which accepts only Python literals (lists, strings,
    numbers, ...) and never executes code found in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are parsed. It is modified in place.

    col_names : list[str]
        Names of the columns to parse. Every name must be a column of ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns replaced by their
        parsed values.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    def parse_cell(value):
        # Only strings need parsing. Anything else (an already parsed list, a
        # NaN from an empty CSV cell) is passed through unchanged, so running
        # the function twice on the same frame is harmless.
        if isinstance(value, str):
            return ast.literal_eval(value)
        return value

    for col in col_names:
        df[col] = df[col].apply(parse_cell)
    return df

In [11]:
def get_keywords_bertopic(data_path, models_path, results_path, timestamp, min_topic_size=6, top_n_words=22):
    """Fit BERTopic on the lemmatized documents and export per-document topic keywords.

    The CSV at ``data_path`` is read, the token lists of each ``PMID`` are
    joined into one whitespace-separated document, and a BERTopic model is
    fitted on those documents and saved. The saved model is then loaded back
    from disk and used to assign a topic to every document; the assignments
    are written to a CSV file.

    Parameters
    ----------
    data_path : str
        Path to preprocessed dataset. Dataset must contain the columns 'PMID'
        and 'tokenized_words_lemmatize'.

    models_path : str
        Path to save the model to (folder must exist).

    results_path : str
        Path to save the results to. The CSV is written to the 'bertopic'
        subfolder of this path, which is created if it does not exist.

    timestamp : str
        Timestamp that will be added to filenames.

    min_topic_size : int
        Minimal number of datapoints in a topic; passed to BERTopic.

    top_n_words : int
        Number of extracted keywords per topic; passed to BERTopic.

    Returns
    -------
    (result_path, model_save_name) : tuple[str]
        First element is the path to the created file with extracted keywords,
        second is the path to the created model.
    """

    # Imported locally as well, so the function body does not depend on which
    # notebook cells were executed before it.
    import os

    import pandas as pd
    from bertopic import BERTopic

    def join_tokens(token_lists):
        # Merge the token lists of the first two rows of one PMID into a single
        # space-separated string. Slicing (rather than indexing rows 0 and 1)
        # keeps a PMID that has only one row from raising IndexError.
        # NOTE(review): rows beyond the second are ignored - confirm that this
        # is intended for datasets with more than two rows per PMID.
        return ' '.join(token for tokens in token_lists.values[:2] for token in tokens)

    def train_and_save(documents, model_save_name):
        # Fit a fresh model with the caller's settings and persist it.
        topic_model = BERTopic(min_topic_size=min_topic_size, top_n_words=top_n_words)
        topic_model.fit_transform(documents.values)
        topic_model.save(model_save_name)

    def load_transform_save(documents, model_save_name, csv_path):
        # The model is read back from disk, so the exported topics come from
        # the saved artefact rather than from the in-memory model.
        loaded_model = BERTopic.load(model_save_name)

        samples_topics, samples_probs = loaded_model.transform(documents.values)
        res_df = pd.DataFrame({
            # Taken straight from the documents' index so every PMID stays
            # aligned with the document that was transformed.
            'PMID': documents.index.to_numpy(),
            'topic_number': samples_topics,
            'topic_probs': samples_probs,
            "topic_keywords": [loaded_model.get_topic(topic_number) for topic_number in samples_topics]
        })
        res_df.to_csv(csv_path, index=False)
        return res_df

    ##############################################################################################################################

    full_data = transform_strings_to_arrays(pd.read_csv(data_path), col_names=['tokenized_words_lemmatize'])

    # One document (string) per PMID, indexed by PMID.
    data = full_data.groupby(by = ['PMID'])['tokenized_words_lemmatize'].agg(join_tokens)

    model_name = f'bertopic_keywords_{timestamp}'
    model_save_name = os.path.join(models_path, model_name)
    result_path = os.path.join(results_path, 'bertopic', f'{model_name}.csv')

    # The 'bertopic' subfolder is appended here, so make sure it exists before
    # the CSV is written.
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    train_and_save(data, model_save_name)
    load_transform_save(data, model_save_name, result_path)

    return result_path, model_save_name

# BERTopic for the CRAFT dataset

In [13]:
# Input dataset and output locations, relative to this notebook.
data_path = "../../0.RESULTS/preprocessing/data_whole.csv"
models_path = "../../0.RESULTS/bertopic/models/"
results_path = "../../0.RESULTS/"

# The timestamp becomes part of the saved model's and the result CSV's file names.
timestamp = get_now_str()

# min_topic_size=3 is the minimum topic size this run is trained with;
# it is passed explicitly so the setting is visible next to the results.
get_keywords_bertopic(
    data_path=data_path,
    models_path=models_path,
    results_path=results_path,
    timestamp=timestamp,
    min_topic_size=3,
)

('../../0.RESULTS/bertopic\\bertopic_keywords_2023-01-23_11-11-41.csv',
 '../../0.RESULTS/bertopic/models/bertopic_keywords_2023-01-23_11-11-41')

In [14]:
# Load the model saved by the run above. The path is rebuilt from that run's
# `models_path` and `timestamp`, following the naming used in
# get_keywords_bertopic (bertopic_keywords_<timestamp>), instead of a
# hard-coded timestamp that would point at an older run after re-execution.
topic_model = BERTopic.load(os.path.join(models_path, f'bertopic_keywords_{timestamp}'))

In [15]:
# Overview of the topics in the loaded model; the displayed table lists
# Topic, Count and Name for each topic.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,24,-1_gene_cell_expression_mouse
1,0,16,0_cell_mouse_embryonic_mutant
2,1,15,1_mouse_strain_muscle_background
3,2,13,2_mouse_protein_pax6_differentiation
4,3,8,3_annexin_a7_protein_ranbp2
5,4,7,4_olfactory_receptor_mouse_sox1
6,5,6,5_pulmonary_development_individual_lung
7,6,5,6_ear_sensory_cell_hair
8,7,3,7_bone_bmp2_bmp4_limb
