In [None]:
import pandas as pd
import ast
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Read the keyword table from its JSON file into a DataFrame
yake_keywords_path = 'data/yake_cleantech_keywords.json'
df_yake_cleantech_keywords = pd.read_json(yake_keywords_path)

# Embedding

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Sentence-embedding model, referenced by its model id
bertforpatents_model_id = 'anferico/bert-for-patents'
model_bertforpatents = SentenceTransformer(bertforpatents_model_id)

In [None]:
# Pick the torch device: CUDA when a GPU is visible, otherwise fall back to CPU
gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")
if gpu_present:
    print("GPU available: {}".format(torch.cuda.get_device_name(0)))
else:
    print("GPU not available, CPU used")

In [None]:
# encode() on a list of sentences returns one 2-D array (one row per keyword).
# pandas cannot store a 2-D array in a single column (it raises ValueError),
# so split the matrix into one 1-D embedding vector per DataFrame row.
keyword_embeddings = model_bertforpatents.encode(
    df_yake_cleantech_keywords['keyword'].tolist(),
    show_progress_bar=True,
)
df_yake_cleantech_keywords['bertforpatents_embedding'] = list(keyword_embeddings)

# Dimensionality reduction (openTSNE)

In [None]:
import numpy as np
from openTSNE import TSNE
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the embedding vectors (fixed seed) so that the t-SNE layout
# can be fitted on one part and the other part projected into it afterwards
embedding_vectors = df_yake_cleantech_keywords['bertforpatents_embedding'].tolist()
train_part, test_part = train_test_split(embedding_vectors, test_size=0.2, random_state=42)
# Stack each partition into a numpy array
bertforpatents_x_train = np.array(train_part)
bertforpatents_x_test = np.array(test_part)

In [None]:
# openTSNE settings, collected in one place and unpacked into the constructor
tsne_settings = dict(
    perplexity=30,
    metric="euclidean",
    n_iter=1000,
    n_jobs=8,
    random_state=42,
    verbose=True,
)
tsne = TSNE(**tsne_settings)

In [None]:
# Learn the t-SNE layout from the training vectors ...
bertforpatents_embedding_train = tsne.fit(X=bertforpatents_x_train)
# ... then place the held-out vectors into that already-fitted layout
bertforpatents_embedding_test = bertforpatents_embedding_train.transform(X=bertforpatents_x_test)

In [None]:
# train_test_split shuffles rows by default, so stacking the train and test
# projections back-to-back does NOT follow the DataFrame's row order.
# Re-derive the positional indices of that split: the permutation depends only
# on the number of samples, test_size and random_state, which mirror the call
# that produced bertforpatents_x_train / bertforpatents_x_test. Then scatter
# every projected point back to the row it was computed from.
n_keyword_rows = len(df_yake_cleantech_keywords)
train_positions, test_positions = train_test_split(
    np.arange(n_keyword_rows), test_size=0.2, random_state=42
)
# Guard against the two splits drifting apart (e.g. edited split parameters)
assert len(train_positions) == len(bertforpatents_embedding_train), "train split size mismatch"
assert len(test_positions) == len(bertforpatents_embedding_test), "test split size mismatch"

train_coordinates = np.asarray(bertforpatents_embedding_train)
test_coordinates = np.asarray(bertforpatents_embedding_test)
tsne_coordinates = np.empty((n_keyword_rows, train_coordinates.shape[1]), dtype=float)
tsne_coordinates[train_positions] = train_coordinates
tsne_coordinates[test_positions] = test_coordinates

df_yake_cleantech_keywords['bertforpatents_embedding_tsne'] = tsne_coordinates.tolist()
df_yake_cleantech_keywords['bertforpatents_embedding_tsne_x'] = tsne_coordinates[:, 0]
df_yake_cleantech_keywords['bertforpatents_embedding_tsne_y'] = tsne_coordinates[:, 1]

# Clustering (HDBSCAN)

In [None]:
import hdbscan

In [None]:
# Perform HDBSCAN clustering on the t-SNE coordinates
tsne_points = df_yake_cleantech_keywords['bertforpatents_embedding_tsne'].tolist()
clusterer_bertforpatents = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=1).fit(tsne_points)
df_yake_cleantech_keywords['bertforpatents_cluster'] = clusterer_bertforpatents.labels_.tolist()

# Drop all rows with cluster label -1 (HDBSCAN's label for noise points).
# Take an explicit copy so that columns added to the filtered frame later are
# written to an independent DataFrame instead of a slice of the original
# (this avoids pandas' SettingWithCopyWarning).
is_clustered = df_yake_cleantech_keywords['bertforpatents_cluster'] != -1
df_yake_cleantech_keywords = df_yake_cleantech_keywords[is_clustered].copy()

# Cluster naming (Hugging Face)

In [None]:
from transformers import pipeline, set_seed

In [None]:
# Collect every keyword of a cluster into one list (one output row per cluster)
keywords_by_cluster = df_yake_cleantech_keywords.groupby(['bertforpatents_cluster'])['keyword']
df_yake_cleantech_keywords_cluster = keywords_by_cluster.apply(list).reset_index(name='keywords')

In [None]:
# Build a text2text-generation pipeline around Flan-T5-large, placed on the
# GPU when one is available and on the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
generator = pipeline(task='text2text-generation', model='google/flan-t5-large', device=device)
# Seed the random number generators once, right after the pipeline is created
set_seed(42)

In [None]:
def generate_cluster_name(keywords, max_keywords=2000, text_generator=None):
    """Ask a text2text model for a short theme describing a cluster of keywords.

    Args:
        keywords: The cluster's keywords, either a list of strings or a single
            ``', '``-separated string.
        max_keywords: Only the first ``max_keywords`` keywords are put into the
            prompt. Defaults to 2000, the value that used to be hard-coded.
        text_generator: Callable that produces the name. It is invoked as
            ``text_generator(prompt, max_length=10, do_sample=True, temperature=0.8)``
            and must return a list whose first item has a ``'generated_text'``
            key. Defaults to the notebook-level ``generator`` pipeline.

    Returns:
        The generated text, unmodified, used as the cluster name.
    """
    if text_generator is None:
        # Looked up at call time so the pipeline created in the notebook is used
        text_generator = generator

    # Accept a comma-separated string as well as a ready-made list
    if isinstance(keywords, str):
        keywords = keywords.split(', ')

    # Cap the number of keywords that go into the prompt
    keywords_str = ', '.join(keywords[:max_keywords])

    # NOTE: an earlier prompt variant that included an 'Electric Vehicle
    # Innovation' example was abandoned because the output only focused on
    # that example.
    prompt = f"Generate a concise and descriptive common theme or category for a cluster containing the following keywords: {keywords_str}."

    # Sampling is enabled, so the output depends on the random state at call time
    response = text_generator(prompt, max_length=10, do_sample=True, temperature=0.8)
    return response[0]['generated_text']

# Name every cluster from its aggregated keywords (progress_apply shows a progress bar)
cluster_names = df_yake_cleantech_keywords_cluster['keywords'].progress_apply(generate_cluster_name)
df_yake_cleantech_keywords_cluster['cluster_name'] = cluster_names
# Propagate each cluster's name to its keyword rows through a cluster-id -> name lookup
name_by_cluster = df_yake_cleantech_keywords_cluster.set_index('bertforpatents_cluster')['cluster_name']
df_yake_cleantech_keywords['cluster_name'] = df_yake_cleantech_keywords['bertforpatents_cluster'].map(name_by_cluster)

# Visualization (Plotly Express)

In [None]:
import plotly.express as px

In [None]:
# Scatter plot of the 2-D t-SNE coordinates, coloured by cluster id; hovering a
# point shows its keyword and the generated cluster name
scatter_options = dict(
    x='bertforpatents_embedding_tsne_x',
    y='bertforpatents_embedding_tsne_y',
    color='bertforpatents_cluster',
    hover_data=['keyword', 'cluster_name'],
    title='HDBSCAN clustering of YAKE keywords using BERT for patents embeddings',
    width=800,
    height=800,
)
fig_bertforpatents = px.scatter(df_yake_cleantech_keywords, **scatter_options)

# Render inline in the notebook
fig_bertforpatents.show()

# Also save a standalone interactive HTML copy
fig_bertforpatents.write_html('data/yake_keywords_noun_chunks_dim_reduction_cluster.html')