In [9]:
import os
import umap
import pandas as pd
import numpy as np
import tomotopy as tp
import plotly.express as px
from tqdm import tqdm
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence

In [15]:
# Load the saved PLDA topic model from disk. The later cells embed its top
# topic words, project them with UMAP and plot the result.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists; consider a configurable data directory.
PLDA_model = tp.PLDAModel.load('/Users/juergenthiesen/Documents/Patentsview/Cleantech Concepts/PLDA_PMI_model_cleantech_concepts.bin')

In [16]:
# Set up dataframe for word embeddings: one row per labelled PLDA topic,
# holding the label it belongs to and its single most probable word.
topics_per_label = PLDA_model.topics_per_label
num_label = len(PLDA_model.topic_label_dict)

# Collect plain records first and build the frame once; growing a DataFrame
# cell by cell with .at is slow and leaves every column as object dtype.
topic_records = []
for i in range(num_label):
    label = PLDA_model.topic_label_dict[i]
    for j in range(topics_per_label):
        # Same indexing as before: topic ids are taken to be grouped by label,
        # topics_per_label consecutive ids per label.
        topic_id = i * topics_per_label + j
        # Top-1 request: a one-element list holding a (word, probability) pair.
        topic_prob = PLDA_model.get_topic_words(topic_id, 1)
        top_word, top_prob = topic_prob[0]
        topic_records.append({
            'label': label,
            'topic_prob': topic_prob,
            'topic': top_word,
            'prob': top_prob,
        })

# Passing `columns` keeps the expected schema even if the model has no labels.
df_word_embeddings = pd.DataFrame(
    topic_records,
    columns=['label', 'topic_prob', 'topic', 'prob'],
)

In [17]:
# Initialize transformer model
embedding = TransformerWordEmbeddings('climatebert/distilroberta-base-climate-f')

# Generate word embeddings: one vector per row, collected in a list and
# attached to the frame in a single assignment.
token_vectors = []
for topic_word in tqdm(df_word_embeddings['topic']):
    sentence = Sentence(topic_word)
    embedding.embed(sentence)
    tokens = list(sentence)
    if not tokens:
        # Fail here with a clear message instead of leaving a placeholder in
        # the column that only blows up later inside UMAP.
        raise ValueError(f"No tokens produced for topic word {topic_word!r}")
    # If the word is split into several tokens, the last token's vector is
    # kept (this matches what the previous per-token overwrite ended up with).
    # NOTE(review): consider mean-pooling for multi-token words.
    # detach()/cpu() make the conversion work when the tensor lives on a GPU.
    token_vectors.append(tokens[-1].embedding.detach().cpu().numpy())

# An explicit object Series keeps each vector as one cell value.
df_word_embeddings['embedding'] = pd.Series(
    token_vectors,
    index=df_word_embeddings.index,
    dtype=object,
)

100%|██████████| 80/80 [00:01<00:00, 42.95it/s]


In [18]:
# Reduce dimensionality of word embeddings with UMAP
# Pull the per-row embedding vectors out of the frame as a plain Python list.
list_embeddings = df_word_embeddings['embedding'].tolist()

# Project down to 2 components; random_state is pinned to 42 so the
# projection is seeded rather than left to chance.
reducer = umap.UMAP(n_components = 2, random_state = 42)
embedding_UMAP = reducer.fit_transform(list_embeddings)

# Store each row's coordinates back on the frame as a list (one list per row).
df_word_embeddings['embedding_UMAP'] = embedding_UMAP.tolist()

In [19]:
# Split the stored UMAP coordinate pairs into two numeric columns on a
# plotting copy, so the source frame is not modified by this cell.
umap_coords = np.asarray(df_word_embeddings['embedding_UMAP'].tolist()).reshape(-1, 2)
plot_df = df_word_embeddings.assign(
    umap_x=umap_coords[:, 0],
    umap_y=umap_coords[:, 1],
)

# Scatter of the top topic words in UMAP space, coloured by their label.
# Columns are referenced by name so plotly can label axes and legend itself.
fig = px.scatter(
    plot_df,
    x="umap_x",
    y="umap_y",
    color="label",
    # Kept from the original call; a continuous scale only applies when the
    # colour column is numeric.
    color_continuous_scale="Plasma",
    hover_name="topic",
    labels={"umap_x": "UMAP 1", "umap_y": "UMAP 2"},
    title="UMAP Embeddings of Cleantech marked Y02x Patents",
)
# NOTE(review): absolute, machine-specific output path — consider making the
# output directory configurable.
fig.write_html("/Users/juergenthiesen/Documents/Patentsview/Cleantech Concepts/Cleantech_Patents_PLDA_PMI.html")