In [1]:
import pandas as pd 
import numpy as np
import math 

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import re

import hdbscan
import umap

import seaborn as sns
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool,  FactorRange
from bokeh.io import push_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import Spectral10
from bokeh.models import Legend, LegendItem

import warnings
# Silences every warning for the rest of the session (library deprecation notices included).
warnings.filterwarnings('ignore')

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus quietly; the English list is used to filter words in
# plot_cluster_word_frequencies.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

from sentence_transformers import SentenceTransformer
# Sentence embedding model used to encode search queries (see vector_search and umap_visualization).
vector_embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# Document table; the functions below read its 'embedding_llm_768_dim', 'title', 'abstract'
# and 'authors' columns.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
df = pd.read_pickle('./datasets/all_data_768dim_embeddings_fixed_authors.pkl')

In [2]:
def umap_visualization(clustered_df, query, vector_embedding_model):
    """
    Show an interactive Bokeh scatter plot of the clusters and the query in a 2-D UMAP projection.

    The documents and the embedded query are projected together so that they share one
    coordinate system. Points are coloured per cluster (label '-1', which HDBSCAN uses for
    noise, is drawn in black) and the query is drawn as a pink triangle.

    Parameters
    ----------
    clustered_df : pandas.DataFrame
        Needs the columns 'embedding_llm_768_dim', 'cluster' and 'title'. The frame is only
        read: the string conversion of the labels is done on a local copy, so the caller's
        'cluster' column keeps its dtype.
    query : str
        Text that is embedded with ``vector_embedding_model.encode``.
    vector_embedding_model : object
        Model exposing ``encode(query)``; presumably it has to produce vectors with the same
        dimensionality as the stored document embeddings.

    Returns
    -------
    None
        The plot is rendered in the notebook as a side effect.
    """

    def cluster_sort_key(label):
        # Numeric labels sort numerically ('2' before '10'); anything else sorts as text after them.
        try:
            return (0, int(label), '')
        except ValueError:
            return (1, 0, label)

    # Create UMAP reducer for visualization
    for_visual_umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean',
                                        n_components=2, random_state=42)

    # Project documents and query in a single fit so both live in the same 2-D space
    doc_embeddings = clustered_df['embedding_llm_768_dim'].tolist()
    query_embedding = [vector_embedding_model.encode(query)]
    all_umap_embeddings = for_visual_umap_reducer.fit_transform(doc_embeddings + query_embedding)

    # The query was appended last, so it is the final row
    for_visual_umap_embeddings = all_umap_embeddings[:-1]
    query_umap_embedding = all_umap_embeddings[-1]

    # Local string labels for the legend; the caller's DataFrame is not modified
    labels = clustered_df['cluster'].astype(str).to_numpy()
    unique_clusters = sorted(set(labels), key=cluster_sort_key)
    colors = ['black' if cluster == '-1' else Spectral10[i % len(Spectral10)]
              for i, cluster in enumerate(unique_clusters)]

    # Create the plot
    p = figure(title="UMAP projection of the HDBSCAN clusters",
               tools="pan,wheel_zoom,box_zoom,reset,hover,save",
               tooltips="@title",
               width=800, height=600)

    legend_items = []
    for cluster, color in zip(unique_clusters, colors):
        in_cluster = labels == cluster
        cluster_source = ColumnDataSource(data=dict(
            x=for_visual_umap_embeddings[in_cluster, 0],
            y=for_visual_umap_embeddings[in_cluster, 1],
            title=clustered_df.loc[in_cluster, 'title']
        ))

        # scatter(marker=...) is the generic marker API; the marker-specific glyph methods are deprecated in newer Bokeh
        renderer = p.scatter('x', 'y', marker='circle', size=10, source=cluster_source,
                             color=color, fill_alpha=0.6, line_color=None)
        legend_items.append((str(cluster), [renderer]))

    # Add query point with triangle marker in bright pink
    query_renderer = p.scatter(query_umap_embedding[0], query_umap_embedding[1], marker='triangle',
                               size=15, color='deeppink', fill_alpha=0.8, line_color='deeppink')
    # Listed in the same legend as the clusters so that only one legend is drawn
    legend_items.append(('Query', [query_renderer]))

    # Add the legend
    legend = Legend(items=legend_items, location="center")
    p.add_layout(legend, 'right')

    # Customize the plot
    p.xaxis.axis_label = 'UMAP Dimension 1'
    p.yaxis.axis_label = 'UMAP Dimension 2'

    # Show the plot
    output_notebook()
    show(p, notebook_handle=True)

def plot_word_clouds(clustered_df, query):
    """
    Draw one word cloud per cluster, built from the titles of the documents in that cluster.

    Parameters
    ----------
    clustered_df : pandas.DataFrame
        Needs the columns 'cluster' and 'title'. Missing titles are skipped.
    query : str
        Search text that is removed from the titles (case-insensitively) before the clouds
        are generated. It is matched literally, so characters such as '+' or '(' are safe.

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    clusters = sorted(clustered_df['cluster'].unique())
    if not clusters:
        # plt.subplots cannot create a grid with zero rows
        print("No clusters found in the data")
        return

    # Create a figure to hold all the word clouds (three per row)
    ncols = 3
    nrows = math.ceil(len(clusters) / ncols)
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5 * nrows), squeeze=False)
    axes = axes.flatten()

    # The query is user text, not a pattern: escape it so regex metacharacters are taken literally
    query_pattern = re.compile(re.escape(query), flags=re.IGNORECASE) if query else None

    # Iterate through each cluster and generate a word cloud
    for ax, cluster in zip(axes, clusters):
        cluster_data = clustered_df[clustered_df['cluster'] == cluster]
        titles_text = " ".join(cluster_data['title'].dropna().astype(str))

        # Remove the query from titles_text (case insensitive)
        if query_pattern is not None:
            titles_text = query_pattern.sub('', titles_text)

        ax.axis('off')
        ax.set_title(f'Cluster {cluster}')

        try:
            wordcloud = WordCloud(width=400, height=200, background_color='white').generate(titles_text)
        except ValueError:
            # WordCloud refuses to render when no words are left; keep the titled, empty panel
            continue

        ax.imshow(wordcloud, interpolation='bilinear')

    # Remove any unused subplots
    for unused_ax in axes[len(clusters):]:
        fig.delaxes(unused_ax)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

In [None]:
def vector_search(df, vector_embedding_model, query, threshold=0.2):
    """
    Return the rows of ``df`` whose embedding has cosine similarity > ``threshold`` with the query.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'embedding_llm_768_dim' column holding one vector per row. ``df`` itself is
        never modified.
    vector_embedding_model : object
        Model exposing ``encode(query)`` that returns the query vector.
    query : str
        Search text. An empty or whitespace-only query selects every row.
    threshold : float, default 0.2
        Rows are kept when their cosine similarity is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with an added 'cosine' column. For an empty
        query a copy of ``df`` is returned without a 'cosine' column.
    """

    if query is None or not str(query).strip():
        # Nothing to compare against: hand back everything (as a copy, so later column
        # assignments by the caller do not leak into the source frame)
        print('Returning all results')
        return df.copy()

    if len(df) == 0:
        # No vectors to score; keep the output schema consistent
        return df.assign(cosine=np.array([], dtype=float))

    # Embed the query in transformer vector space (768 dimensions)
    query_vector = np.asarray(vector_embedding_model.encode(query))

    # Build the document matrix once: one row per document
    doc_matrix = np.asarray(df['embedding_llm_768_dim'].tolist())

    # Cosine similarity = dot product divided by the product of the norms
    norm_product = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vector)
    with np.errstate(divide='ignore', invalid='ignore'):
        # A zero-length vector yields nan/inf here, which is handled by the comparison below
        cosines = (doc_matrix @ query_vector) / norm_product

    # Attach the scores to a new frame, then filter; nan compares False and is dropped
    scored_df = df.assign(cosine=cosines)
    return scored_df[cosines > threshold].copy()

def umap_reduce(df, n_neighbors=10, min_dist=0, n_components=50, random_state=42):
    """
    Fit a UMAP manifold on the queried vectors only and attach the reduced vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an 'embedding_llm_768_dim' column holding one vector per row. Not modified.
    n_neighbors, min_dist, n_components, random_state
        Passed to ``umap.UMAP``. The defaults (10, 0, 50, 42) are the configuration that was
        selected for this notebook.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``df`` with an added 'embedding_umap' column (one reduced vector per row),
        or None when UMAP could not build a manifold, e.g. because too few vectors were
        returned for the chosen ``n_neighbors`` / ``n_components``.
    """
    if len(df) == 0:
        # Nothing to fit
        return None

    # Read the column outside the try block so a missing column is reported as such
    embeddings = df['embedding_llm_768_dim'].tolist()

    # note if too few results are returned, then n_neighbors=10 is too much to create a manifold
    try:
        umap_reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                                 n_components=n_components, random_state=random_state)
        umap_embeddings = umap_reducer.fit_transform(embeddings)
    except Exception as e:
        # Best effort: the caller falls back to listing the papers. Show the cause so that a
        # genuine failure is not mistaken for "too few vectors".
        print(f'UMAP reduction failed ({type(e).__name__}: {e})')
        return None

    # assign() builds a new frame, leaving the caller's frame untouched
    return df.assign(embedding_umap=list(umap_embeddings))

def hdbscan_cluster(filtered_df):
    """
    Cluster the UMAP-reduced vectors with HDBSCAN and attach the cluster labels.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Needs an 'embedding_umap' column holding one vector per row. Not modified.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``filtered_df`` with an added 'cluster' column containing the labels
        returned by ``fit_predict``, or None (after printing a message) when the frame is empty.
    """
    n_papers = len(filtered_df)
    if n_papers == 0:
        print('No papers found')
        return None

    # Aim for clusters of roughly 5% of the result set, but never below 2:
    # HDBSCAN rejects min_cluster_size == 1, which ceil(n / 20) produces for exactly 20 papers.
    min_cluster_size = max(2, math.ceil(n_papers / 20))

    embeddings = filtered_df['embedding_umap'].tolist()
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                min_samples=5,
                                cluster_selection_epsilon=0.5,
                                metric='euclidean',
                                cluster_selection_method='eom',
                                allow_single_cluster=False,
                                gen_min_span_tree=True)
    cluster_labels = clusterer.fit_predict(embeddings)

    # Add cluster labels on a new frame instead of writing into the caller's frame
    return filtered_df.assign(cluster=cluster_labels)

def plot_cluster_word_frequencies(clustered_df):
    """
    Plot, for every cluster, the ten most frequent words and the ten most frequent authors.

    Each cluster gets one row with two horizontal bar charts: word counts from the titles and
    abstracts (lower-cased, stopwords and words of one or two characters removed) on the left,
    author counts on the right.

    Parameters
    ----------
    clustered_df : pandas.DataFrame
        Needs the columns 'cluster', 'title', 'abstract' and 'authors'. Missing titles or
        abstracts count as empty text. An 'authors' entry is expected to be a collection of
        names; a plain string is counted as one author and missing entries are skipped.

    Returns
    -------
    None
        The figure is shown as a side effect; a message is printed when there are no clusters.
    """

    def get_word_freq(text):
        # Tokenise on word characters, then drop stopwords and very short tokens
        words = re.findall(r'\w+', text.lower())
        return Counter(word for word in words if word not in stop_words and len(word) > 2)

    def get_author_freq(authors_lists):
        counts = Counter()
        for entry in authors_lists:
            if isinstance(entry, str):
                # A bare string is one author, not a sequence of characters
                counts[entry] += 1
                continue
            try:
                counts.update(entry)
            except TypeError:
                # Non-iterable entry such as NaN/None: no authors recorded
                continue
        return counts

    def draw_top_counts(ax, counts, title):
        # most_common is ordered high-to-low; reverse it so the largest bar ends up on top
        top_items = counts.most_common(10)[::-1]
        positions = list(range(len(top_items)))
        ax.barh(positions, [count for _, count in top_items])
        ax.set_yticks(positions)
        ax.set_yticklabels([name for name, _ in top_items])
        ax.set_title(title, fontsize=14)
        ax.tick_params(axis='both', labelsize=12)

    clusters = sorted(clustered_df['cluster'].unique())
    n_clusters = len(clusters)

    if n_clusters == 0:
        print("No clusters found in the data")
        return

    # squeeze=False keeps axs two-dimensional even when there is a single cluster
    fig, axs = plt.subplots(n_clusters, 2, figsize=(24, 6*n_clusters), squeeze=False)

    for idx, cluster in enumerate(clusters):
        cluster_docs = clustered_df[clustered_df['cluster'] == cluster]

        # Word frequencies plot; fillna('') prevents a missing title/abstract from turning the
        # concatenation into NaN, which str.join cannot handle
        titles = cluster_docs['title'].fillna('').astype(str)
        abstracts = cluster_docs['abstract'].fillna('').astype(str)
        combined_text = ' '.join(titles + ' ' + abstracts)
        draw_top_counts(axs[idx, 0], get_word_freq(combined_text), f'Cluster {cluster}: Top 10 Words')

        # Author frequencies plot
        draw_top_counts(axs[idx, 1], get_author_freq(cluster_docs['authors']), f'Cluster {cluster}: Top 10 Authors')

    plt.tight_layout()
    plt.show()

def cluster_from_query(df, vector_embedding_model, query, cosine_threshold=0.2, extra_visuals=False):
    """
    Run the full pipeline for one query: vector search, UMAP reduction, HDBSCAN clustering, plots.

    Parameters
    ----------
    df : pandas.DataFrame
        Document table; needs the 'embedding_llm_768_dim', 'title', 'abstract' and 'authors'
        columns used by the search and plotting steps.
    vector_embedding_model : object
        Model exposing ``encode(query)``.
    query : str
        Search text.
    cosine_threshold : float, default 0.2
        Minimum cosine similarity passed to ``vector_search``.
    extra_visuals : bool, default False
        When true, also show the UMAP scatter plot and the per-cluster word clouds.

    Returns
    -------
    pandas.DataFrame or None
        The matching papers with a 'cluster' column, or None when nothing matched or when
        there were too few vectors to cluster (the papers found are then displayed instead).
    """

    filtered_df = vector_search(df, vector_embedding_model, query, cosine_threshold)

    if len(filtered_df) == 0:
        print('No papers found')
        return None

    reduced_df = umap_reduce(filtered_df)

    if reduced_df is None:
        print('Too few vectors to run clustering algorithm')
        print('Here are the papers found:')
        display(filtered_df[['title', 'authors']])
        # Put every paper in one pseudo-cluster (-1) so the frequency plots still work;
        # assign() avoids writing into a slice of the source frame
        plot_cluster_word_frequencies(filtered_df.assign(cluster=-1))
        return None

    clustered_df = hdbscan_cluster(reduced_df)
    if clustered_df is None:
        # hdbscan_cluster already reported why there is nothing to show
        return None

    plot_cluster_word_frequencies(clustered_df)

    if extra_visuals:
        # The visualisation converts the labels to strings for its legend; give it a copy so
        # the returned frame keeps the same 'cluster' dtype with and without extra visuals
        umap_visualization(clustered_df.copy(), query, vector_embedding_model)
        plot_word_clouds(clustered_df, query)

    return clustered_df


In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
# Output area that captures everything a search run prints or displays
out = widgets.Output()


def run_cluster_and_visualize(query, cosine_threshold, extra_visuals):
    """Run the query pipeline on the notebook's data and render its output inside ``out``."""
    # Only the output widget is cleared, not the whole cell, so the controls stay in place
    out.clear_output(wait=True)
    with out:
        # Prints, tables and figures produced by cluster_from_query are captured by ``out``
        cluster_from_query(
            df,
            vector_embedding_model,
            query=query,
            cosine_threshold=cosine_threshold,
            extra_visuals=extra_visuals,
        )

# Short explanation shown above the controls
tool_label = widgets.HTML(
    value="""
    <b>This tool allows you to search for a topic and view the relevant papers and project pages within SINTEF, categorized into subgroups.<br>
    The cosine similarity threshold controls how close the results are to the query, 0.2 works well for general queries, and 0.4 for specific concepts.
    </b>
    """
)

# Free-text search box (starts empty)
search_bar = widgets.Text(description='Search:', placeholder='Enter search query')

# Cosine threshold selector; continuous_update=False reports a value only once the handle is released
threshold_slider = widgets.FloatSlider(
    description='Threshold:',
    min=0.2,
    max=0.5,
    step=0.01,
    value=0.3,
    readout_format='.2f',
    continuous_update=False,
)

# Opt-in switch for the UMAP scatter plot and the word clouds
toggle_checkbox = widgets.Checkbox(description='Extra cluster visuals', value=False)

# Button that starts a search
search_button = widgets.Button(description='Run', tooltip='Click to run the search', icon='search')

def on_button_click(b):
    """Click handler: forward the current widget values to the search pipeline."""
    run_cluster_and_visualize(
        query=search_bar.value,
        cosine_threshold=threshold_slider.value,
        extra_visuals=toggle_checkbox.value,
    )


# Register the handler on the button
search_button.on_click(on_button_click)

# Lay the controls out in one row; the blank labels act as spacers between them
hbox = widgets.HBox([
    search_bar,
    widgets.Label(" " * 20),
    threshold_slider,
    widgets.Label(" " * 2),
    toggle_checkbox,
    widgets.Label(" " * 20),
    search_button,
])

# Show the description, the control row and the output area
display(tool_label, hbox, out)

HTML(value='\n    <b>This tool allows you to search for a topic and view the relevant papers and project pages…

HBox(children=(Text(value='', description='Search:', placeholder='Enter search query'), Label(value='         …

Output()