### Initialise

In [1]:
# import libraries
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Prepare input data

In [2]:
# load full dataset
full_df = pd.read_csv('../data/processed/journals/journals.csv')
full_df['lens_id'] = full_df['lens_id'].astype('string')
# load labelled data
labelled_df = pd.read_csv('../data/processed/journals/journals_filtered.csv')
labelled_df['tech'] = labelled_df[['quantum', 'semiconductors', 'cell-based meats', 'hydrogen power', 'personalised medicine']].idxmax(1)
labelled_df['tech'] = pd.factorize(labelled_df['tech'])[0] + 1
labelled_df['lens_id'] = labelled_df['lens_id'].astype('string')
# join labels to full dataset
joined_df = full_df.set_index('lens_id').join(labelled_df.set_index('lens_id'), rsuffix='_join', how='left')
joined_df['tech'] = joined_df['tech'].fillna(-1)

In [136]:
# clean abstract text for topic modelling
joined_df['abstract_cleaned'] = joined_df['abstract'].str.replace(r'(?<=\<)(.*?)(?=\>)|>|<|\r|\n', '', regex=True)
joined_df['abstract_cleaned'] = joined_df['abstract_cleaned'].str.replace(r'\s+', ' ', regex=True)

In [148]:
# remove empty texts
cleaned_df = joined_df.loc[(joined_df.abstract_cleaned!='Null.')&(joined_df.abstract_cleaned!=''),]

In [149]:
# create doc text and target classes lists 
docs = cleaned_df['abstract_cleaned'].astype('str').to_list()
target_classes = cleaned_df['tech'].astype('int').to_list()

In [None]:
# otherwise load pickle files for input
with open('journal_abstract_docs', 'wb') as f:
    pickle.dump(docs, f)
with open('journal_abstract_target_classes', 'wb') as f:
    pickle.dump(target_classes, f)

### Define topic model

In [184]:
# define model components
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
hdbscan_model = HDBSCAN(min_cluster_size=50, prediction_data=True)
representation_model = KeyBERTInspired()
# define topic model
topic_model = BERTopic(embedding_model=sentence_model, hdbscan_model=hdbscan_model, representation_model=representation_model,
                       top_n_words=10, calculate_probabilities=False)

### Embeddings

In [154]:
# compute embeddings
embeddings = sentence_model.encode(docs, show_progress_bar=True)

Batches: 100%|██████████| 3436/3436 [1:17:21<00:00,  1.35s/it]


In [155]:
# store as pickle file
with open('journal_abstract_embeddings', 'wb') as f:
    pickle.dump(embeddings, f)

In [4]:
# load previously stored embeddings
with open('journal_abstract_embeddings', 'rb') as f:
    embeddings = pickle.load(f)

### Fit topic model

In [185]:
# fit and transform model
topics, probs = topic_model.fit_transform(docs, embeddings, y=target_classes)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [186]:
# represent topics
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)

In [160]:
# save model
topic_model.save('journal_abstract_model', serialization='safetensors', save_ctfidf=True, save_embedding_model=sentence_model)

In [5]:
# save outputs
with open('journal_abstract_topics', 'wb') as f:
    pickle.dump(topics, f)
with open('journal_abstract_probs', 'wb') as f:
    pickle.dump(probs, f)

### Produce results

In [5]:
# load a saved model
topic_model = BERTopic.load('journal_abstract_model')
topics = topic_model.topics_
with open('journal_abstract_probs', 'rb') as f:
    probs = pickle.load(f)

In [9]:
# create a topic docs dataframe
topic_docs_df = joined_df.copy()
topic_docs_df['topic_number'] = topics
topic_docs_df['topic_probabilities'] = probs
# save as csv
topic_docs_df.to_csv('../data/dashboard/journal_abstract_topic_docs.csv')

In [10]:
# create a topic names dataframe
topic_names_df = topic_model.get_topic_info()
top_terms = (topic_model.get_topics().values())
topic_names_df['topic_terms'] = [[pair[0] for pair in topic] for topic in top_terms]
topic_names_df['term_probabilities'] = [[float(pair[1]) for pair in topic] for topic in top_terms]
# save as csv and display
topic_names_df.to_csv('../data/dashboard/journal_abstract_topic_names.csv')
topic_names_df

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs,topic_terms,term_probabilities
0,-1,525670,-1_data_memory_methods_semiconductor,"[data, memory, methods, semiconductor, method,...",,"[data, memory, methods, semiconductor, method,...","[0.0012342221353732418, 0.001184329463540932, ..."
1,0,16971,0_implant_surgical_ultrasound_bone,"[implant, surgical, ultrasound, bone, ultrason...",,"[implant, surgical, ultrasound, bone, ultrason...","[0.019720560007693996, 0.019325451915862407, 0..."
2,1,13547,1_battery_lithium_secondary battery_electrode,"[battery, lithium, secondary battery, electrod...",,"[battery, lithium, secondary battery, electrod...","[0.04086501931215215, 0.029120830357143107, 0...."
3,2,9317,2_uplink_downlink_wireless_radio,"[uplink, downlink, wireless, radio, communicat...",,"[uplink, downlink, wireless, radio, communicat...","[0.01762133110650405, 0.013877049838332479, 0...."
4,3,9273,3_tire_rotor_wheel_pneumatic tire,"[tire, rotor, wheel, pneumatic tire, stator, p...",,"[tire, rotor, wheel, pneumatic tire, stator, p...","[0.045568553714066214, 0.026842639076329364, 0..."
...,...,...,...,...,...,...,...
886,885,151,885_tool power_power tool_tool_power,"[tool power, power tool, tool, power, connecti...",,"[tool power, power tool, tool, power, connecti...","[0.4958888895812067, 0.4464668922085273, 0.145..."
887,886,151,886_soil_method soil_soil properties_measuring...,"[soil, method soil, soil properties, measuring...",,"[soil, method soil, soil properties, measuring...","[0.2611271884508649, 0.045580477275160965, 0.0..."
888,887,150,887_silicone_silicone composition_curable sili...,"[silicone, silicone composition, curable silic...",,"[silicone, silicone composition, curable silic...","[0.24614620900071665, 0.1104104833689966, 0.07..."
889,888,150,888_chimeric_chimeric antigen_antigen_antigen ...,"[chimeric, chimeric antigen, antigen, antigen ...",,"[chimeric, chimeric antigen, antigen, antigen ...","[0.22089086470877603, 0.22038928083446813, 0.1..."


In [None]:
# save a topic chart
fig = topic_model.visualize_topics()
fig.write_html('journal_abstract_topics.html')
fig

### Search topics

In [199]:
# search topics by keyword
similar_topics, similarity = topic_model.find_topics('hydrogen', top_n=5)
num=0
print(similar_topics[num])
topic_model.get_topic(similar_topics[num])

199


[('oil', 0.027549664562592505),
 ('biodiesel', 0.027468339950354106),
 ('pyrolysis', 0.022325249913568726),
 ('biomass', 0.021505980482731422),
 ('lignin', 0.021398605420019326),
 ('diesel', 0.0166505922154185),
 ('fuel', 0.014060187616488095),
 ('bio', 0.011374901601249054),
 ('bio oil', 0.01074939913906866),
 ('microwave', 0.010140563259358337)]