In [46]:
import pandas as pd
from bertopic import BERTopic
import datetime
import os 
import numpy as np

## Basic BertTopic

#### Functions/definitons

In [205]:
# PATHS
train_data_path = '../data/titles_and_abstracts_processed_train.csv'
test_data_path = '../data/titles_and_abstracts_processed_test.csv'
models_path = '../models/' 
raw_data_results_path = '../results/bertopic/raw_data'
processed_data_results_path = '../results/bertopic/processed_data'

In [201]:
def get_now_str():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [202]:
def transform_strings_to_arrays(df, col_names = ['tokenized_sentences', 'tokenized_words', 'tokenized_words_processed']):
    for col in col_names:
        df[col] = df[col].apply(eval)
    return df

In [203]:
# basic BertTopic keyword extraction
def train_transform_save(train_data, model_save_name):
    
    # train transform
    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(train_data.values)

    # save model
    topic_model.save(model_save_name)

    return topic_model, topics, probs

In [204]:
def load_transform_save(data, model_save_name, results_path):

    # load model
    loaded_model = BERTopic.load(model_save_name)

    # transform for data 
    samples_topics, samples_probs = loaded_model.transform(data.values)
    res_df = pd.DataFrame({
        'PMID': np.unique(data.PMID),
        'topic_number': samples_topics,
        'topic_probs': samples_probs,
        "topic_keywords": [loaded_model.get_topic(topic_number) for topic_number in samples_topics]
    })
    res_df.to_csv(results_path)
    return loaded_model, res_df


#### Raw data (without steming and stop words removal)

In [207]:
train_full_data = transform_strings_to_arrays(pd.read_csv(train_data_path))
test_full_data = transform_strings_to_arrays(pd.read_csv(test_data_path))

train_data = train_full_data.groupby(by = ['PMID'])['tokenized_words_processed'].agg(lambda x: ' '.join(x.values[0] + x.values[1]))
test_data = test_full_data.groupby(by = ['PMID'])['tokenized_words_processed'].agg(lambda x: ' '.join(x.values[0] + x.values[1]))

model_save_name = os.path.join(models_path, f'berttopic_raw_data_{get_now_str()}')
results_path = 

topic_model, topics, probs = train_transform_save(train_data, model_save_name)
loaded_model, res_df = load_transform_save(train_data, model_save_name, results_dir)

KeyboardInterrupt: 

In [137]:
ta_content_train.iloc[0]

'DCTN4 as a modifier of chronic Pseudomonas aeruginosa infection in cystic fibrosis. Pseudomonas aeruginosa (Pa) infection in cystic fibrosis (CF) patients is associated with worse long-term pulmonary disease and shorter survival, and chronic Pa infection (CPA) is associated with reduced lung function, faster rate of lung decline, increased rates of exacerbations and shorter survival. By using exome sequencing and extreme phenotype design, it was recently shown that isoforms of dynactin 4 (DCTN4) may influence Pa infection in CF, leading to worse respiratory disease. The purpose of this study was to investigate the role of DCTN4 missense variants on Pa infection incidence, age at first Pa infection and chronic Pa infection incidence in a cohort of adult CF patients from a single centre. Polymerase chain reaction and direct sequencing were used to screen DNA samples for DCTN4 variants. A total of 121 adult CF patients from the Cochin Hospital CF centre have been included, all of them ca

In [138]:
# basic BertTopic keyword extraction
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(ta_content_train.values)

In [159]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,1343,-1_the_of_and_in
1,0,292,0_health_care_to_and
2,1,138,1_brain_to_the_in
3,2,119,2_and_diabetes_vitamin_obesity
4,3,92,3_cancer_cell_expression_cells
...,...,...,...
60,59,11,59_asthma_allergic_medicine_sa
61,60,11,60_breast_cancer_bc_er
62,61,10,61_cardiac_mnk1_differentiation_bio
63,62,10,62_water_metal_dwtr_cu


In [148]:
# example 
i = 11
pmid = train_data.iloc[i].PMID
doc = '. '.join(train_data[train_data.PMID == pmid]['Content'].values)
doc_topic = topics[i]
doc_prob = probs[i]
topic_info = topic_model.get_topic_info(doc_topic)
topic_words = topic_model.get_topic(doc_topic)

In [150]:
doc

"Promoting lifestyle behaviour change and well-being in hospital patients: a pilot study of an evidence-based psychological intervention. Lifestyle risk behaviours show an inverse social gradient, clustering in vulnerable groups. We designed and piloted an intervention to address barriers to lifestyle behaviour change among hospital patients. We designed our intervention using effective components of behaviour change interventions informed by psychological theory. Delivered by a health psychologist based at the Royal Free London NHS Foundation Trust, the 4-week intervention included detailed baseline assessment, personalized goal setting, psychological skills development, motivation support and referral to community services. Primary outcomes were feasibility and patient acceptability. We also evaluated changes to health and well-being. From 1 July 2013 to 31 September 2014, 686 patients were referred, 338 (49.3%) attended a first appointment and 172 (25.1%) completed follow-up. Furthe

In [151]:
topic_info

Unnamed: 0,Topic,Count,Name
0,4,67,4_species_the_of_and


In [152]:
topic_words

[('species', 0.04094609274655893),
 ('the', 0.015919871735481973),
 ('of', 0.01407421291845833),
 ('and', 0.012802701613979894),
 ('forests', 0.012224788239035477),
 ('we', 0.012009514487563176),
 ('to', 0.011480034706860086),
 ('in', 0.011351481571335607),
 ('that', 0.011224663898045432),
 ('climate', 0.011025328387382053)]

In [154]:
# save model
model_save_name = os.path.join(models_path, f'berttopic_raw_content_{get_now_str()}')
topic_model.save(model_save_name)

In [160]:
# load model
loaded_model = BERTopic.load(model_save_name)

In [175]:
# tranform data both training and test
input_data = transform_strings_to_arrays(pd.read_csv(train_data_path))
model = loaded_model
save_path = os.path.join(results_path, f'{model_save_name}_test.csv')


samples_list = input_data.groupby(by = ['PMID'])['tokenized_words_processed'].agg(lambda x: ' '.join(x.values[0] + x.values[1])).values

samples_topics, samples_probs = model.transform(samples_list)
res_df = pd.DataFrame({
    'PMID': np.unique(input_data.PMID),
    'topic_number': samples_topics,
    "topic_keywords": [model.get_topic(topic_number) for topic_number in samples_topics]
})


In [197]:
res_df

Unnamed: 0,PMID,topic_number,topic_keywords
0,25763772,-1,"[(the, 0.013610109100331086), (of, 0.013344696..."
1,25847295,-1,"[(the, 0.013610109100331086), (of, 0.013344696..."
2,26316050,-1,"[(the, 0.013610109100331086), (of, 0.013344696..."
3,26406200,14,"[(walking, 0.023270379588251042), (players, 0...."
4,26424709,0,"[(health, 0.01563368814463759), (care, 0.01416..."
...,...,...,...
3508,28549399,0,"[(health, 0.01563368814463759), (care, 0.01416..."
3509,28549760,2,"[(and, 0.015258906726062352), (diabetes, 0.015..."
3510,28550154,36,"[(detection, 0.04332267580859146), (quantum, 0..."
3511,28550348,0,"[(health, 0.01563368814463759), (care, 0.01416..."


In [164]:
data

Unnamed: 0,PMID,Type,Content,tokenized_sentences,tokenized_words,tokenized_words_processed
0,25763772,t,DCTN4 as a modifier of chronic Pseudomonas aer...,['DCTN4 as a modifier of chronic Pseudomonas a...,"[['DCTN4', 'as', 'a', 'modifier', 'of', 'chron...","['dctn4', 'modifi', 'chronic', 'pseudomona', '..."
1,25763772,a,Pseudomonas aeruginosa (Pa) infection in cysti...,['Pseudomonas aeruginosa (Pa) infection in cys...,"[['Pseudomonas', 'aeruginosa', '(', 'Pa', ')',...","['pseudomona', 'aeruginosa', 'infect', 'cystic..."
2,25847295,t,Nonylphenol diethoxylate inhibits apoptosis in...,['Nonylphenol diethoxylate inhibits apoptosis ...,"[['Nonylphenol', 'diethoxylate', 'inhibits', '...","['nonylphenol', 'diethoxyl', 'inhibit', 'apopt..."
3,25847295,a,Nonylphenol and short-chain nonylphenol ethoxy...,['Nonylphenol and short-chain nonylphenol etho...,"[['Nonylphenol', 'and', 'short-chain', 'nonylp...","['nonylphenol', 'nonylphenol', 'ethoxyl', 'np2..."
4,26316050,t,Prevascularized silicon membranes for the enha...,['Prevascularized silicon membranes for the en...,"[['Prevascularized', 'silicon', 'membranes', '...","['prevascular', 'silicon', 'membran', 'enhanc'..."
...,...,...,...,...,...,...
7021,28550154,a,The nanoparticles (NPs) of hemoglobin (Hb) wer...,['The nanoparticles (NPs) of hemoglobin (Hb) w...,"[['The', 'nanoparticles', '(', 'NPs', ')', 'of...","['nanoparticl', 'np', 'hemoglobin', 'hb', 'pre..."
7022,28550348,t,Medication regimen complexity and prevalence o...,['Medication regimen complexity and prevalence...,"[['Medication', 'regimen', 'complexity', 'and'...","['medic', 'regimen', 'complex', 'preval', 'pot..."
7023,28550348,a,Background There is a relative paucity of info...,['Background There is a relative paucity of in...,"[['Background', 'There', 'is', 'a', 'relative'...","['background', 'rel', 'pauciti', 'inform', 'ch..."
7024,28550521,t,Assessment of periodontal bone level revisited...,['Assessment of periodontal bone level revisit...,"[['Assessment', 'of', 'periodontal', 'bone', '...","['assess', 'periodont', 'bone', 'level', 'revi..."


## Without stemming and stowords removal

In [None]:
ta_content_train = train_data.groupby(by = ['PMID'])['Content'].agg(lambda x: f'{x.values[0]}. {x.values[1]}')
