In [46]:
import ast
import datetime
import os

import numpy as np
import pandas as pd
from bertopic import BERTopic

## Basic BERTopic

### Functions/definitions

In [211]:
# Filesystem locations used throughout the notebook (all relative paths).
# Inputs: the train / test CSV tables read by the pipeline cells.
train_data_path = "../data/titles_and_abstracts_processed_train.csv"
test_data_path = "../data/titles_and_abstracts_processed_test.csv"
# Outputs: where fitted models and per-document result tables are written.
models_path = "../models/"
results_path = "../results/bertopic"

In [212]:
def get_now_str():
    """Return the current local time as a ``YYYY-MM-DD_HH-MM-SS`` string.

    The notebook embeds this stamp in model and result file names.
    """
    current_time = datetime.datetime.now()
    return f"{current_time:%Y-%m-%d_%H-%M-%S}"

In [213]:
def transform_strings_to_arrays(df, col_names = ['tokenized_sentences', 'tokenized_words', 'tokenized_words_processed']):
    """Parse columns that hold stringified Python literals back into objects.

    Each cell of the listed columns that is a string (e.g. ``"['a', 'b']"``)
    is replaced by the Python object it spells out.

    Args:
        df: DataFrame to convert. It is modified in place and also returned.
        col_names: Names of the columns to parse. The default list is only
            read, never mutated.

    Returns:
        The same ``df`` object with the listed columns parsed.

    Raises:
        ValueError, SyntaxError: If a string cell is not a valid Python
            literal.
    """
    def _parse(value):
        # Non-string cells (already parsed on an earlier run, or missing
        # values) are passed through, so the function can be re-applied.
        if not isinstance(value, str):
            return value
        # SECURITY: this used to be a bare ``eval`` on text read from a CSV
        # file, which executes arbitrary code. ``ast.literal_eval`` accepts
        # only literals (lists, strings, numbers, ...) and rejects the rest.
        return ast.literal_eval(value)

    for col in col_names:
        df[col] = df[col].apply(_parse)
    return df

In [214]:
def train_transform_save(train_data, model_save_name):
    """Fit a default BERTopic model on ``train_data`` and save it.

    Args:
        train_data: Object whose ``.values`` holds the training documents
            (a pandas Series of strings in this notebook).
        model_save_name: Path the fitted model is saved to.

    Returns:
        Tuple ``(topic_model, topics, probs)``: the fitted model and the two
        values returned by ``fit_transform`` for the training documents.
    """
    # Create the target directory *before* fitting: otherwise a missing
    # directory is only discovered at save time, after the expensive
    # training step, and the fitted model is lost.
    save_dir = os.path.dirname(model_save_name)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Fit the model and assign a topic to every training document.
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(train_data.values)

    # Persist the fitted model so it can be reloaded later.
    topic_model.save(model_save_name)

    return topic_model, topics, probs

In [228]:
def load_transform_save(data, model_save_name, results_path):
    """Load a saved BERTopic model, assign topics to ``data`` and save them.

    Args:
        data: Object exposing ``.values`` (the documents) and ``.index``
            (one identifier per document; PMIDs in this notebook).
        model_save_name: Path of the saved model to load.
        results_path: CSV file the per-document results are written to.

    Returns:
        Tuple ``(loaded_model, res_df)``. ``res_df`` has one row per document
        with columns ``PMID``, ``topic_number``, ``topic_probs`` and
        ``topic_keywords``.
    """
    # load model
    loaded_model = BERTopic.load(model_save_name)

    # transform for data
    samples_topics, samples_probs = loaded_model.transform(data.values)

    # Look every distinct topic up once instead of once per document.
    keywords_by_topic = {
        topic_number: loaded_model.get_topic(topic_number)
        for topic_number in set(samples_topics)
    }

    res_df = pd.DataFrame({
        # Use the index exactly as given: the topics are returned in the
        # order of ``data.values``, so sorting / de-duplicating the ids
        # (as ``np.unique`` does) could attach them to the wrong documents.
        'PMID': data.index.to_numpy(),
        'topic_number': samples_topics,
        'topic_probs': samples_probs,
        "topic_keywords": [keywords_by_topic[topic_number] for topic_number in samples_topics]
    })

    # Make sure the output directory exists before writing the table.
    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    res_df.to_csv(results_path)
    return loaded_model, res_df


### Processed data (with stemming and stop-word removal)

In [229]:
# Read both splits and parse the stringified token columns.
train_full_data = transform_strings_to_arrays(pd.read_csv(train_data_path))
test_full_data = transform_strings_to_arrays(pd.read_csv(test_data_path))

# Build one document per PMID: concatenate the token lists of the group's
# first two rows and join the tokens with spaces.
# NOTE(review): this indexes rows 0 and 1 directly, so it presumably expects
# exactly two rows per PMID (title + abstract?) — confirm against the data.
train_data = (
    train_full_data
    .groupby(by = ['PMID'])['tokenized_words_processed']
    .agg(lambda rows: ' '.join(rows.values[0] + rows.values[1]))
)
test_data = (
    test_full_data
    .groupby(by = ['PMID'])['tokenized_words_processed']
    .agg(lambda rows: ' '.join(rows.values[0] + rows.values[1]))
)

# Timestamped names for the saved model and the two result tables.
model_name = f'berttopic_processed_data_{get_now_str()}'
model_save_name = os.path.join(models_path, model_name)
result_path_train = os.path.join(results_path, f'{model_name}_train.csv')
result_path_test = os.path.join(results_path, f'{model_name}_test.csv')

# Fit on the training documents, then reload the saved model and write the
# per-document topic tables for both splits.
topic_model, topics, probs = train_transform_save(train_data, model_save_name)
_, res_df_train = load_transform_save(train_data, model_save_name, result_path_train)
_, res_df_test = load_transform_save(test_data, model_save_name, result_path_test)

#### Processed data example

In [230]:
# Show the first aggregated training document (processed tokens joined by spaces).
train_data.iloc[0]

'dctn4 modifi chronic pseudomona aeruginosa infect cystic fibrosi pseudomona aeruginosa infect cystic fibrosi cf patient associ wors pulmonari diseas shorter surviv chronic infect cpa associ reduc lung function faster rate lung declin increas rate exacerb shorter surviv exom sequenc extrem phenotyp design recent shown isoform dynactin 4 dctn4 influenc infect cf lead wors respiratori diseas purpos studi investig role dctn4 missens variant infect incid age infect chronic infect incid cohort adult cf patient singl centr polymeras chain reaction direct sequenc screen dna sampl dctn4 variant total 121 adult cf patient cochin hospit cf centr includ carri cftr defect 103 develop 1 pulmonari infect 68 patient cpa dctn4 variant identifi 24 cf patient infect 17 cf patient infect patient cpa 29 dctn4 missens variant vs 23 patient cpa interestingli tend frequent observ cf patient cpa patient cpa vs dctn4 missens variant tend frequent male cf patient cpa bear class mutat male cf patient cpa bear cl

In [231]:
# Per-topic summary table of the model fitted in the pipeline cell above.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1421,-1_patient_studi_effect_associ
1,0,124,0_neuron_brain_cell_express
2,1,120,1_cell_cancer_tumor_express
3,2,115,2_nanoparticl_surfac_coat_properti
4,3,100,3_speci_forest_popul_bird
5,4,100,4_fractur_knee_bone_spine
6,5,94,5_health_student_practic_profession
7,6,61,6_protein_bind_structur_ligand
8,7,58,7_ventricular_cardiac_patient_lv
9,8,58,8_cancer_surviv_patient_lung


In [241]:
# Example: pick one training document by position and collect its assigned
# topic, the matching probability entry, and the topic's summary and keywords.
i = 11
doc, doc_topic, doc_prob = train_data.iloc[i], topics[i], probs[i]
topic_info = topic_model.get_topic_info(doc_topic)
topic_words = topic_model.get_topic(doc_topic)

In [243]:
# The example document selected above (train_data.iloc[i]).
doc

'neutral forbidden link morpholog match assembl mutualist hawkmoth plant network major challeng evolutionari ecolog understand process shape pattern interact speci commun level pollin flower long corolla tube hawkmoth invok showcas model recent optim forag model predict close associ mouthpart length corolla depth visit flower favour trait converg special commun level assess hawkmoth frequent pollin plant floral tube length similar probosci length morpholog match hypothesi abund process neutral hypothesi ecolog trait mismatch constraint forbidden link hypothesi process structur hawkmoth plant mutualist network commun biogeograph region south america found converg morpholog trait commun distribut morpholog differ hawkmoth plant consist expect morpholog match hypothesi commun remain commun ecoton distinct biogeograph area interact predict neutral hypothesi find consist idea diffus drive evolut extrem long proboscis flower tube highlight import morpholog trait forbidden link hypothesi stru

In [244]:
# Summary row for the topic assigned to the example document.
topic_info

Unnamed: 0,Topic,Count,Name
0,3,100,3_speci_forest_popul_bird


In [246]:
# (word, score) pairs returned by get_topic for the example document's topic.
topic_words

[('speci', 0.04541899474237719),
 ('forest', 0.01986315689650549),
 ('popul', 0.01879033701326042),
 ('bird', 0.0155259175762611),
 ('climat', 0.01385799346446381),
 ('plant', 0.013240451447144554),
 ('divers', 0.013054858242023333),
 ('genet', 0.011344986058897222),
 ('tree', 0.01119505679918582),
 ('phylogenet', 0.0104014699784837)]

### Raw data (without stemming and stop-word removal)

In [247]:
# Read both splits and parse the stringified token columns.
train_full_data = transform_strings_to_arrays(pd.read_csv(train_data_path))
test_full_data = transform_strings_to_arrays(pd.read_csv(test_data_path))

# Build one document per PMID from the raw 'Content' column: the group's
# first two rows are joined as "<row 0>. <row 1>".
# NOTE(review): this indexes rows 0 and 1 directly, so it presumably expects
# exactly two rows per PMID (title + abstract?) — confirm against the data.
train_data = (
    train_full_data
    .groupby(by = ['PMID'])['Content']
    .agg(lambda rows: f'{rows.values[0]}. {rows.values[1]}')
)
test_data = (
    test_full_data
    .groupby(by = ['PMID'])['Content']
    .agg(lambda rows: f'{rows.values[0]}. {rows.values[1]}')
)

# Timestamped names for the saved model and the two result tables.
model_name = f'bertopic_raw_data_{get_now_str()}'
model_save_name = os.path.join(models_path, model_name)
result_path_train = os.path.join(results_path, f'{model_name}_train.csv')
result_path_test = os.path.join(results_path, f'{model_name}_test.csv')

# Fit on the training documents, then reload the saved model and write the
# per-document topic tables for both splits.
topic_model, topics, probs = train_transform_save(train_data, model_save_name)
_, res_df_train = load_transform_save(train_data, model_save_name, result_path_train)
_, res_df_test = load_transform_save(test_data, model_save_name, result_path_test)

#### Raw data example

In [249]:
# Show the first aggregated training document built from the raw 'Content' column.
train_data.iloc[0]

'DCTN4 as a modifier of chronic Pseudomonas aeruginosa infection in cystic fibrosis. Pseudomonas aeruginosa (Pa) infection in cystic fibrosis (CF) patients is associated with worse long-term pulmonary disease and shorter survival, and chronic Pa infection (CPA) is associated with reduced lung function, faster rate of lung decline, increased rates of exacerbations and shorter survival. By using exome sequencing and extreme phenotype design, it was recently shown that isoforms of dynactin 4 (DCTN4) may influence Pa infection in CF, leading to worse respiratory disease. The purpose of this study was to investigate the role of DCTN4 missense variants on Pa infection incidence, age at first Pa infection and chronic Pa infection incidence in a cohort of adult CF patients from a single centre. Polymerase chain reaction and direct sequencing were used to screen DNA samples for DCTN4 variants. A total of 121 adult CF patients from the Cochin Hospital CF centre have been included, all of them ca

In [250]:
# Per-topic summary table of the model fitted on the raw-text documents above.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1432,-1_the_of_and_in
1,0,168,0_care_health_and_to
2,1,106,1_brain_to_the_in
3,2,82,2_production_soil_the_of
4,3,78,3_cells_that_the_in
...,...,...,...
60,59,13,59_exercise_muscle_muscles_dystrophy
61,60,13,60_dpd_parks_spatial_travel
62,61,13,61_thyroid_tc_cancer_95
63,62,12,62_schizophrenia_asenapine_antipsychotic_bid


In [251]:
# Example: pick one training document by position and collect its assigned
# topic, the matching probability entry, and the topic's summary and keywords.
i = 11
doc, doc_topic, doc_prob = train_data.iloc[i], topics[i], probs[i]
topic_info = topic_model.get_topic_info(doc_topic)
topic_words = topic_model.get_topic(doc_topic)

In [252]:
# The example document selected above (train_data.iloc[i]).
doc

"Beyond neutral and forbidden links: morphological matches and the assembly of mutualistic hawkmoth - plant networks. A major challenge in evolutionary ecology is to understand how co-evolutionary processes shape patterns of interactions between species at community level. Pollination of flowers with long corolla tubes by long-tongued hawkmoths has been invoked as a showcase model of co-evolution. Recently, optimal foraging models have predicted that there might be a close association between mouthparts' length and the corolla depth of the visited flowers, thus favouring trait convergence and specialization at community level. Here, we assessed whether hawkmoths more frequently pollinate plants with floral tube lengths similar to their proboscis lengths (morphological match hypothesis) against abundance -based processes (neutral hypothesis) and ecological trait mismatches constraints (forbidden links hypothesis), and how these processes structure hawkmoth - plant mutualistic networks f

In [253]:
# Summary row for the topic assigned to the example document.
topic_info

Unnamed: 0,Topic,Count,Name
0,4,70,4_species_the_of_and


In [254]:
# (word, score) pairs returned by get_topic for the example document's topic.
topic_words

[('species', 0.040162911115070137),
 ('the', 0.016339623447882926),
 ('of', 0.014235148640351948),
 ('and', 0.012700373298775505),
 ('we', 0.01182749835321019),
 ('forests', 0.01173852168148737),
 ('in', 0.011656504596709786),
 ('that', 0.011253687294829333),
 ('to', 0.011098637279477231),
 ('climate', 0.010928281659616299)]