# Ideas

- try adding `section['page']['title']` and `section['page']['summary']` to the text returned by `get_section_content`, so that all sections from the same page are better aligned in meaning
- assign topics to pages
- fix document visualizations / topic lengths
- try plugging in LangChain
- add an option to restrict the model to pages from certain domains in `pages-all.jsonl`

In [None]:
import logging
import torch
import pickle
import jsonlines
import os
import numpy as np
from sentence_transformers import SentenceTransformer, LoggingHandler

# Input and cache paths: the JSON Lines file is read when no cache exists yet,
# and the pickle holds the sections together with their embeddings.
raw_jsonl_data_location = '/tmp/pages-all.jsonl'
embeddings_location = '../page-embeddings.pkl'

# Route INFO-level (and above) log records through sentence-transformers'
# LoggingHandler, with a timestamp in front of every message.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(message)s',
)

def get_section_content(section):
    """Return the text of *section* as a single line.

    Reads ``section['content']``, replaces every newline with a space and
    strips leading/trailing whitespace from the result.
    """
    raw_text = section['content']
    single_line = raw_text.replace('\n', ' ')
    return single_line.strip()

# Build the section embeddings once and cache them on disk; later runs only
# reload the cache.
if not os.path.exists(embeddings_location):
    print('Creating embeddings for knowledge base.')
    # Flatten the pages into one record per section, carrying the page-level
    # metadata (url, title, summary) along with each section.
    sections = []
    with jsonlines.open(raw_jsonl_data_location, 'r') as pages:
        for page in pages:
            for page_section in page['sections']:
                section = {
                    'page': {
                        'url': page['url'],
                        'title': page['title'],
                        'summary': page['summaries']['sbert'],
                    },
                    'content': page_section['content']['raw'],
                    'tokens': page_section['tokens'],
                }
                sections.append(section)

    # One text per section, in the same order as `sections`.
    passages = [get_section_content(section) for section in sections]
    print('Passages:', len(passages))

    bi_encoder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    bi_encoder.max_seq_length = 350

    if not torch.cuda.is_available():
        print("Warning: No GPU found. Please add GPU to your notebook for better performance, falling back to CPU pooling.")
        pool = bi_encoder.start_multi_process_pool()
        try:
            corpus_embeddings = bi_encoder.encode_multi_process(passages, pool)
        finally:
            # Always shut the worker processes down, even if encoding fails,
            # so a failed run does not leave orphaned workers behind.
            bi_encoder.stop_multi_process_pool(pool)
    else:
        corpus_embeddings = bi_encoder.encode(passages)
    print('Corpus embeddings created.')

    # Store sections and embeddings together so they stay aligned by index.
    with open(embeddings_location, "wb") as writer:
        pickle.dump({'sections': sections, 'embeddings': corpus_embeddings}, writer)
    print("Embeddings stored on disc: " + embeddings_location)

else:
    print("Loading pre-computed embeddings from disc: " + embeddings_location)
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load a cache that was produced by this notebook.
    with open(embeddings_location, "rb") as reader:
        cache_data = pickle.load(reader)
    sections = cache_data['sections']
    corpus_embeddings = cache_data['embeddings']

print('Corpus embedding size:', corpus_embeddings.shape)
print('Sections:', len(sections))

print("Successfully initialized / loaded embeddings.")

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Cache files: the fitted BERTopic model, and the pickled topics/probabilities.
topics_location = 'BERTopic.model'
topics_propbs_location = 'BERTopic-topics-probs-pkl'

# --- BERTopic pipeline components, one per step ---

# Step 1 - Embedding model (same model name and max_seq_length as the
# bi-encoder used to create the cached corpus embeddings)
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embedding_model.max_seq_length = 350

# Step 2 - Dimensionality reduction
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
)

# Step 3 - Clustering of the reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenizer for the topic words (English stop words removed)
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Class-based TF-IDF used for the topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) fine-tuning of the topic representations with a
# `bertopic.representation` model
representation_model = KeyBERTInspired()

# Assemble all six steps into a single BERTopic instance.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# One text per section, in the same order as `sections`.
passages = [get_section_content(section) for section in sections]
print('Passages:', len(passages))

# Fit BERTopic only when either cache file is missing; otherwise reload both.
if not os.path.exists(topics_location) or not os.path.exists(topics_propbs_location):
    print('Creating BERTopic model.')
    # Reuse the pre-computed embeddings instead of re-encoding the passages.
    topics, probs = topic_model.fit_transform(passages, corpus_embeddings)

    topic_model.save(topics_location)
    print("BERTopic model stored on disc: " + topics_location)
    with open(topics_propbs_location, "wb") as writer:
        pickle.dump({'topics': topics, 'probs': probs}, writer)
    # Report the path of the topics/probs pickle (not the model path).
    print("BERTopic topics, probs stored on disc: " + topics_propbs_location)
else:
    print("Loading pre-computed BERTopic model from disc: " + topics_location)
    topic_model = BERTopic.load(topics_location, embedding_model=embedding_model)

    print("Loading pre-computed BERTopic topics, probs from disc: " + topics_propbs_location)
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load a cache that was produced by this notebook.
    with open(topics_propbs_location, "rb") as reader:
        cache_data = pickle.load(reader)
    topics = cache_data['topics']
    probs = cache_data['probs']

print("Connect sections to assigned topics")
# Group the section records by the topic id assigned at the same index.
topic_sections = {topic: [] for topic in set(topics)}
for topic, section in zip(topics, sections):
    topic_sections[topic].append(section)

print("Successfully initialized / loaded BERTopic.")

In [28]:
# Restore the saved BERTopic model, attaching the embedding model again.
topic_model = BERTopic.load(topics_location, embedding_model=embedding_model)

# Restore the topic assignments and probabilities pickled alongside the model.
with open(topics_propbs_location, "rb") as reader:
    cache_data = pickle.load(reader)
topics = cache_data['topics']
probs = cache_data['probs']

In [None]:
# Show BERTopic's topic overview for the current model; the returned figure is
# displayed as the cell's output.
topic_model.visualize_topics()

In [49]:
# Inspect the (term, score) pairs that represent topic 441.
# NOTE(review): the id 441 presumably refers to the saved model; topic ids may
# differ after a refit — verify before relying on it.
topic_model.get_topic(441)

[('checkboxgroup', 0.52888036),
 ('checkboxitem', 0.49878812),
 ('checkbox', 0.45189434),
 ('icons', 0.3672846),
 ('dialogtrigger', 0.35388327),
 ('itemindicator', 0.32335225),
 ('textfield', 0.30959886),
 ('icon', 0.3065734),
 ('searchfield', 0.3058104),
 ('workflow', 0.28506392)]

In [50]:
# List the documents BERTopic returns as representative of topic 441.
topic_model.get_representative_docs(441)

['Forms  Checkbox CheckboxGroup Form NumberField RadioGroup RangeSlider SearchField Slider Switch TextArea TextField  Icons  Custom Icons Workflow Icons  Navigation  Breadcrumbs Link Tabs',
 'Forms  Checkbox CheckboxGroup Form NumberField RadioGroup RangeSlider SearchField Slider Switch TextArea TextField  Icons  Custom Icons Workflow Icons  Navigation  Breadcrumbs Link Tabs',
 'Forms  Checkbox CheckboxGroup Form NumberField RadioGroup RangeSlider SearchField Slider Switch TextArea TextField  Icons  Custom Icons Workflow Icons  Navigation  Breadcrumbs Link Tabs']

In [54]:
# Search for up to 5 topics matching the query "checkbox" (topic ids plus their
# similarity scores), then show the terms of the first returned topic
# (presumably the closest match — confirm the ordering in the BERTopic docs).
similar_topics, similarity = topic_model.find_topics("checkbox", top_n=5)
topic_model.get_topic(similar_topics[0])

[('checkboxmultiple', 0.82010657),
 ('checkboxoption', 0.7888438),
 ('checkboxes', 0.7774192),
 ('checkboxalways', 0.77417445),
 ('checkbox_onsummarycontainer', 0.7508235),
 ('checkboxgroup', 0.74379194),
 ('checkboxesresponsive', 0.72956383),
 ('checkboxto', 0.7260996),
 ('checkbox_faux_container', 0.7236576),
 ('checkboxcheckbox', 0.7199409)]

In [55]:
# Show BERTopic's bar-chart view of the topics' terms; the figure is the cell's
# output.
topic_model.visualize_barchart()

In [None]:
# Project the section embeddings down to 2 dimensions for plotting.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(corpus_embeddings)
# visualize_documents expects the document texts (strings), not the section
# dicts, so pass the same per-section text that was embedded.
topic_model.visualize_documents(
    [get_section_content(section) for section in sections],
    reduced_embeddings=reduced_embeddings,
)

In [None]:
# Show BERTopic's heatmap view of the topics; the figure is the cell's output.
topic_model.visualize_heatmap()

In [None]:
# Show BERTopic's hierarchy view of the topics; the figure is the cell's output.
topic_model.visualize_hierarchy()

In [None]:
from langchain.chains.question_answering import load_qa_chain
# Build a LangChain question-answering chain of type "stuff" and run it with
# `docs` as the input documents and `query` as the question.
# NOTE(review): `llm`, `docs` and `query` are not defined anywhere in the
# visible notebook, so this cell raises NameError as written — TODO define
# them before running it.
chain = load_qa_chain(llm, chain_type="stuff")
chain.run(input_documents=docs, question=query)

In [None]:
import numpy as np
import sklearn.decomposition
import pickle
import time

# Apply 'Algorithm 1' to the ada-002 embeddings to make them isotropic, taken from the paper:
# ALL-BUT-THE-TOP: SIMPLE AND EFFECTIVE POST-PROCESSING FOR WORD REPRESENTATIONS
# Jiaqi Mu, Pramod Viswanath

# This uses Principal Component Analysis (PCA) to 'evenly distribute' the embedding vectors (make them isotropic)
# For more information on PCA, see https://jamesmccaffrey.wordpress.com/2021/07/16/computing-pca-using-numpy-without-scikit/


# The embedding data is a dict consisting of key / value pairs:
# the key is the hash of the message (SHA3-256), the value is the embedding from ada-002 (array of dimension 1536).
# The hash can be used to look up the original text in a database.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load files you trust.
# The `with` block closes the file as soon as the data is in memory.
with open('/path/to/your/data/Embedding-Latest.pkl', 'rb') as fp:
    E = pickle.load(fp)  # load the data into memory

# separate the keys (hashes) and values (embeddings) into separate vectors
K = list(E.keys())  # vector of all the hash values
X = np.array(list(E.values()))  # vector of all the embeddings, converted to numpy arrays


# list the total number of embeddings
# this can be truncated if there are too many embeddings to do PCA on
print(f"Total number of embeddings: {len(X)}")

# get dimension of embeddings, used later
Dim = len(X[0])

# flash out the first few embeddings
print("First two embeddings are: ")
print(X[0])
print(f"First embedding length: {len(X[0])}")
print(X[1])
print(f"Second embedding length: {len(X[1])}")


# compute the mean of all the embeddings, and flash the result
mu = np.mean(X, axis=0)  # same as mu in paper
print(f"Mean embedding vector: {mu}")
print(f"Mean embedding vector length: {len(mu)}")


# subtract the mean vector from each embedding vector ... vectorized in numpy
X_tilde = X - mu  # same as v_tilde(w) in paper


# do the heavy lifting of extracting the principal components
# note that this is a function of the embeddings you currently have here, and this set may grow over time
# therefore the PCA basis vectors may change over time, and your final isotropic embeddings may drift over time
# but the drift should stabilize after you have extracted enough embedding data to characterize the nature of the embedding engine
print(f"Performing PCA on the normalized embeddings ...")
pca = sklearn.decomposition.PCA()  # new object
TICK = time.time()  # start timer
pca.fit(X_tilde)  # do the heavy lifting!
TOCK = time.time()  # end timer
DELTA = TOCK - TICK

print(f"PCA finished in {DELTA} seconds ...")

# dimensional reduction stage (the only hyperparameter)
# pick max dimension of PCA components to express embeddings
# in general this is some integer less than or equal to the dimension of your embeddings
# it could be set as a high percentile, say 95th percentile of pca.explained_variance_ratio_
# but just hardcoding a constant here
D = 15  # hyperparameter on dimension (out of 1536 for ada-002), paper recommends D = Dim/100


# form the set of v_prime(w), which is the final embedding
# this could be vectorized in numpy to speed it up, but coding it directly here in a double for-loop to avoid errors and to be transparent
E_prime = dict()  # output dict of the new embeddings
N = len(X_tilde)
# progress is reported roughly every 10%; clamp to 1 so that small inputs
# (N <= 5 rounds to 0) do not cause a modulo-by-zero below
N10 = max(1, round(N/10))
U = pca.components_  # set of PCA basis vectors, sorted by most significant to least significant
print(f"Shape of full set of PCA componenents {U.shape}")
U = U[0:D,:]  # take the top D dimensions (or take them all if fewer than D are available)
print(f"Shape of downselected PCA componenents {U.shape}")
for ii in range(N):
    v_tilde = X_tilde[ii]
    v = X[ii]
    v_projection = np.zeros(Dim)  # start to build the projection
    # project the original embedding onto the selected PCA basis vectors;
    # iterating over the rows of U (instead of range(D)) stays in bounds when
    # PCA produced fewer than D components
    # NOTE(review): the dot product uses the un-centred v rather than v_tilde,
    # the same way the commented-out projectEmbedding helper does — confirm
    # against Algorithm 1 in the paper before changing it.
    for u_jj in U:
        v_jj = np.dot(u_jj, v)  # scalar
        v_projection += v_jj*u_jj  # vector
    v_prime = v_tilde - v_projection  # final embedding vector
    v_prime = v_prime/np.linalg.norm(v_prime)  # create unit vector
    E_prime[K[ii]] = v_prime

    if (ii%N10 == 0) or (ii == N-1):
        print(f"Finished with {ii+1} embeddings out of {N} ({round(100*ii/N)}% done)")


# save as new pickle: the transformed embeddings plus mu and U, which are
# needed to transform new embeddings the same way later
print("Saving new pickle ...")
embeddingName = '/path/to/your/data/Embedding-Latest-Isotropic.pkl'
with open(embeddingName, 'wb') as f:  # Python 3: open(..., 'wb')
    pickle.dump([E_prime,mu,U], f)
    print(embeddingName)

print("Done!")

# When working with live data with a new embedding from ada-002, be sure to transform it first with this function before comparing it
#
# def projectEmbedding(v,mu,U):
#     v = np.array(v)
#     v_tilde = v - mu
#     v_projection = np.zeros(len(v)) # start to build the projection
#     # project the original embedding onto the PCA basis vectors, use only first D dimensions
#     for u in U:
#         v_jj = np.dot(u,v) # scaler
#         v_projection += v_jj*u # vector
#     v_prime = v_tilde - v_projection # final embedding vector
#     v_prime = v_prime/np.linalg.norm(v_prime) # create unit vector
#     return v_prime 