### Initialise

In [2]:
# import libraries
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
import pickle

  from .autonotebook import tqdm as notebook_tqdm


### Prepare input data

In [31]:
# Read the complete patents table. lens_id is cast to pandas' 'string' dtype so
# that it has the same dtype as the key on the labelled side of the join below.
full_df = pd.read_parquet('../data/processed/patents/patents_data.parquet')
full_df['lens_id'] = full_df['lens_id'].astype('string')

# Read the labelled subset and collapse its per-technology columns into a single
# integer class per row.
labelled_df = pd.read_csv('../data/processed/patents/patents_data_filtered.csv')
tech_columns = ['quantum', 'semiconductors', 'cell-based meats', 'hydrogen power', 'personalised medicine']
# For every row, take the name of the technology column holding the largest value.
labelled_df['tech'] = labelled_df[tech_columns].idxmax(axis=1)
# Turn the technology names into integer codes, shifted by one so they start at 1.
tech_codes, tech_names = pd.factorize(labelled_df['tech'])
labelled_df['tech'] = tech_codes + 1
labelled_df['lens_id'] = labelled_df['lens_id'].astype('string')

# Left-join the labels onto the full table by lens_id; columns present on both
# sides get the '_join' suffix on the labelled copy.
joined_df = full_df.set_index('lens_id').join(
    labelled_df.set_index('lens_id'),
    how='left',
    rsuffix='_join',
)
# Rows without a label receive the class -1.
joined_df['tech'] = joined_df['tech'].fillna(-1)

In [68]:
# Plain Python lists for the model: one title per patent, plus the matching
# integer class (the -1 fill value is used for rows that had no label).
# NOTE(review): assumes every row has a non-null 'title' - confirm before encoding.
docs = joined_df['title'].tolist()
target_classes = joined_df['tech'].astype(int).tolist()

In [None]:
# Persist the prepared inputs as pickle files.
# Note: this cell WRITES ('wb' + pickle.dump) the in-memory `docs` and
# `target_classes` to disk; it does not load them, so both names must already
# exist in the kernel when it runs.
with open('patent_title_docs', 'wb') as f:
    pickle.dump(docs, f)
with open('patent_title_target_classes', 'wb') as f:
    pickle.dump(target_classes, f)

### Define topic model

In [65]:
# Building blocks of the topic model: a sentence-embedding model, a clustering
# model and a topic-representation model.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
hdbscan_model = HDBSCAN(prediction_data=True, min_cluster_size=150)
representation_model = KeyBERTInspired()

# Assemble the components into one BERTopic instance.
# NOTE(review): no seeded dimensionality-reduction model is passed here, so
# results are presumably not reproducible between runs - confirm.
topic_model = BERTopic(
    embedding_model=sentence_model,
    hdbscan_model=hdbscan_model,
    representation_model=representation_model,
    top_n_words=10,
    nr_topics='auto',
    calculate_probabilities=False,
)

### Embeddings

In [None]:
# Encode every document in `docs` with the sentence model, showing a progress bar.
# The result is kept in `embeddings` so it can be pickled and passed to the
# topic model instead of being recomputed.
embeddings = sentence_model.encode(docs, show_progress_bar=True)

In [67]:
# Write the computed embeddings to disk so later sessions can skip the encoding step.
with open('patent_title_embeddings', 'wb') as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

In [4]:
# Restore embeddings written by an earlier run.
# pickle.load can execute arbitrary code: only open files produced by this notebook.
with open('patent_title_embeddings', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)

### Fit topic model

In [5]:
# Fit the topic model on the documents, reusing the precomputed embeddings and
# passing the integer classes as `y`. Returns the topic assigned to each
# document together with the probabilities.
topics, probs = topic_model.fit_transform(
    documents=docs,
    embeddings=embeddings,
    y=target_classes,
)

: 

In [None]:
# Recompute the topic terms with a vectorizer that drops English stop words and
# counts both single words and two-word phrases.
# NOTE(review): update_topics is called without a representation_model; check
# whether the KeyBERTInspired representation configured on the model still
# applies after this call.
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model.update_topics(
    docs,
    vectorizer_model=vectorizer_model,
)

In [None]:
# Save the fitted model (including the c-TF-IDF data) to the 'patent_title_model' directory.
# With safetensors serialization, `save_embedding_model` is expected to be a
# string pointer to the embedding model, which is written into the saved config
# so that BERTopic.load can restore the embedder. Passing the SentenceTransformer
# object itself leaves no pointer to store, and the reloaded model then has no
# embedding model for calls such as find_topics.
# The string below is the Hub id of the "all-MiniLM-L6-v2" model used to build `sentence_model`.
topic_model.save(
    'patent_title_model',
    serialization='safetensors',
    save_ctfidf=True,
    save_embedding_model='sentence-transformers/all-MiniLM-L6-v2',
)

In [None]:
# Persist the per-document outputs of fit_transform next to the saved model:
# the assigned topics and the matching probabilities, one pickle file each.
with open('patent_title_topics', 'wb') as topics_file:
    pickle.dump(topics, topics_file)
with open('patent_title_probs', 'wb') as probs_file:
    pickle.dump(probs, probs_file)

### Produce results

In [None]:
# Restore a previously saved model; the per-document topics are read from the
# model itself, while the probabilities come from their own pickle file.
topic_model = BERTopic.load('patent_title_model')
topics = topic_model.topics_
# pickle.load can execute arbitrary code: only open files produced by this notebook.
with open('patent_title_probs', 'rb') as probs_file:
    probs = pickle.load(probs_file)

In [None]:
# Per-document results: a copy of the joined patents table with the assigned
# topic and its probability attached as two extra columns (joined_df itself is
# left untouched).
topic_docs_df = joined_df.assign(
    topic_number=topics,
    topic_probabilities=probs,
)
# Export for the dashboard.
topic_docs_df.to_csv('../data/dashboard/patent_title_topic_docs.csv')

In [None]:
# Per-topic summary table: one row per topic from get_topic_info(), extended
# with the topic's terms and their scores.
topic_names_df = topic_model.get_topic_info()
# Mapping of topic id -> list of (term, score) pairs.
topic_terms_by_id = topic_model.get_topics()
# Look each row's terms up by its 'Topic' id instead of relying on the mapping's
# iteration order happening to match the row order of the info table; a topic
# without an entry gets an empty list rather than shifting every later row.
topic_names_df['topic_terms'] = [
    [term for term, _score in topic_terms_by_id.get(topic_id, [])]
    for topic_id in topic_names_df['Topic']
]
topic_names_df['term_probabilities'] = [
    [float(score) for _term, score in topic_terms_by_id.get(topic_id, [])]
    for topic_id in topic_names_df['Topic']
]
# Export for the dashboard, then display the table as the cell output.
topic_names_df.to_csv('../data/dashboard/patent_title_topic_names.csv')
topic_names_df

In [None]:
# Build the topic visualisation figure, write it to a standalone HTML file in
# the working directory, and display it as the cell output.
fig = topic_model.visualize_topics()
fig.write_html('patent_title_topics.html')
fig

### Search topics

In [None]:
# Ask the model for the five topics closest to the keyword 'hydrogen'.
similar_topics, similarity = topic_model.find_topics('hydrogen', top_n=5)
# Position within the returned matches to inspect (0 = first match).
num = 0
selected_topic = similar_topics[num]
print(selected_topic)
# Show the terms of the selected topic as the cell output.
topic_model.get_topic(selected_topic)