In [7]:
import pandas as pd
import plotly.express as px
# from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
import torch
import hdbscan
import umap
from tqdm import tqdm

# Data Preprocessing and Wrangling

In [8]:
# Read the keyword table from its JSON file into a DataFrame
keywords_json_path = '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/Cleantech Concepts/df_cpc_subclass_keywords_yake.json'
df_keywords = pd.read_json(keywords_json_path)

In [10]:
# Expand 'keywords_yake_desc_5000' so each list element gets its own row,
# then renumber the rows from 0 (the exploded frame repeats the source index)
df_keywords_list = (
    df_keywords
    .explode('keywords_yake_desc_5000')
    .reset_index(drop=True)
)

In [11]:
# Remove the 'keywords_yake' and 'keywords_yake_desc' columns
df_keywords_list = df_keywords_list.drop(columns=['keywords_yake', 'keywords_yake_desc'])

# Embedding

In [13]:
# Pick the compute device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was chosen
if device.type == "cuda":
    print(f'Using GPU: {torch.cuda.get_device_name()}')
else:
    print('No GPU available, using CPU.')

Using GPU: NVIDIA RTX A4500


In [22]:
# Load the Sentence Transformer model onto the selected device.
# Other checkpoints to try: 'AI-Growth-Lab/PatentSBERTa' or 'anferico/bert-for-patents'
# NOTE(review): the load log reports "No sentence-transformers model found" for this
# checkpoint, so it is presumably wrapped with the library's default pooling — confirm.
model_name = 'climatebert/distilroberta-base-climate-f'
model = SentenceTransformer(model_name, device=device)

Downloading (…)9a318/.gitattributes: 100%|██████████| 1.23k/1.23k [00:00<00:00, 9.46MB/s]
Downloading (…)d2f289a318/README.md: 100%|██████████| 3.12k/3.12k [00:00<00:00, 17.3MB/s]
Downloading (…)18/added_tokens.json: 100%|██████████| 4.98k/4.98k [00:00<00:00, 12.0MB/s]
Downloading (…)f289a318/config.json: 100%|██████████| 752/752 [00:00<00:00, 4.24MB/s]
Downloading (…)2f289a318/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.47MB/s]
Downloading model.safetensors: 100%|██████████| 329M/329M [00:03<00:00, 104MB/s]  
Downloading pytorch_model.bin: 100%|██████████| 329M/329M [00:08<00:00, 38.8MB/s] 
Downloading (…)cial_tokens_map.json: 100%|██████████| 280/280 [00:00<00:00, 2.53MB/s]
Downloading (…)9a318/tokenizer.json: 100%|██████████| 2.15M/2.15M [00:00<00:00, 4.14MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 1.46k/1.46k [00:00<00:00, 12.1MB/s]
Downloading (…)2f289a318/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 1.95MB/s]
No sentence-transformers model found

In [23]:
# Pull the exploded keyword column out as a plain Python list
keywords_list = df_keywords_list['keywords_yake_desc_5000'].to_list()

# Encode every keyword; the result is kept as a tensor
# (the model was constructed with the device selected earlier)
keywords_embeddings = model.encode(keywords_list, convert_to_tensor=True)

# Move the tensor to host memory and store one embedding vector (list of floats) per row
embedding_rows = keywords_embeddings.cpu().numpy().tolist()
df_keywords_list['keywords_embeddings_climatebert'] = embedding_rows

In [26]:
# Write df_keywords_list to disk as JSON, one record per row
embeddings_json_path = '/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/Cleantech Concepts/df_cpc_subclass_keywords_yake_embeddings.json'
df_keywords_list.to_json(embeddings_json_path, orient='records')

# Dimensionality Reduction with UMAP

In [None]:
# Perform dimensionality reduction with UMAP.
# The embedding column is named after the model that produced it (this notebook writes
# 'keywords_embeddings_climatebert'), so resolve the column that actually exists instead
# of hard-coding 'keywords_embeddings_patentsberta', which raises KeyError when the
# embeddings were computed with a different model.
embedding_candidates = ['keywords_embeddings_patentsberta', 'keywords_embeddings_climatebert']
embedding_column = next(
    (col for col in embedding_candidates if col in df_keywords_list.columns),
    None,
)
if embedding_column is None:
    raise KeyError(f'None of the expected embedding columns found: {embedding_candidates}')

# Fixed random_state keeps the projection reproducible between runs
reducer = umap.UMAP(random_state=42)
embeddings = df_keywords_list[embedding_column].tolist()
umap_embeddings = reducer.fit_transform(embeddings)

# Add UMAP embeddings to dataframe, keeping the source model in the column name
df_keywords_list[f'{embedding_column}_umap'] = umap_embeddings.tolist()

# Clustering with HDBSCAN

In [None]:
# Perform clustering with HDBSCAN on the UMAP-reduced embeddings
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
clusterer.fit(umap_embeddings)

# Name the label column after the embedding column that is present in the dataframe,
# so labels are not filed under 'patentsberta' when another model (e.g. climatebert)
# produced the embeddings. Falls back to the original name if neither column exists.
embedding_candidates = ['keywords_embeddings_patentsberta', 'keywords_embeddings_climatebert']
embedding_column = next(
    (col for col in embedding_candidates if col in df_keywords_list.columns),
    'keywords_embeddings_patentsberta',
)

# Add HDBSCAN clusters to dataframe
df_keywords_list[f'{embedding_column}_hdbscan'] = clusterer.labels_.tolist()