In [18]:
import os
import pandas as pd
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# Load Data
df_patents = pd.read_csv('/Users/juergenthiesen/Documents/Patentsview/Cleantech CSV Files/g_patent_Y02.csv')
df_cpc = pd.read_csv('/Users/juergenthiesen/Documents/Patentsview/Cleantech CSV Files/g_cpc_current_Y02.csv')

# Drop all classes uneqaul to Y02 from df_cpc
df_cpc = df_cpc[df_cpc['cpc_class'].str.contains('Y02')]

# Merge data on patent_id, keep duplicates
df = pd.merge(df_patents, df_cpc, on='patent_id', how='left')

# Drop all data in df with title or abstract NaN
df = df.dropna(subset=['patent_title', 'patent_abstract'])

# Merge title and abstract with [SEP] token
df['patent_title_abstract'] = df['patent_title'] + ' [SEP] ' + df['patent_abstract']

# Randomly sample 1000 patents per cpc_subclass
df = df.groupby('cpc_subclass').apply(lambda x: x.sample(1000, random_state=42))

In [40]:
# Walk through BERTopic initialization

# Generate embeddings
embedding_model = SentenceTransformer('climatebert/distilroberta-base-climate-f')

# Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='euclidean')

# Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Fine-tune topic representations
representation_model = KeyBERTInspired()

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    min_topic_size=5
)


No sentence-transformers model found with name /Users/juergenthiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/juergenthiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from

In [41]:
# Generate list from column patent_title_abstract
patent_title_abstract = df['patent_title_abstract'].tolist()
# Fit topic model
topic, probs = topic_model.fit_transform(patent_title_abstract)

In [46]:
# Visualize topics
topic_model.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity