In [1]:
import os
import json
import random
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import umap.umap_ as umap
from sentence_transformers import SentenceTransformer
from mpl_toolkits.mplot3d import Axes3D
import imageio

# Define the root directory and category folders
root_dir = './Thesis Data'
categories = {
    'Supreme Court': 'Supreme Court Jurisprudence',
    'Republic Acts': 'Republic Acts',
    'Commonwealth Acts': 'Commonwealth Acts',
    'Batas Pambata': 'Batas Pambata',
    'Acts': 'Acts'
}

# Define the sample sizes for each category for a total of Approx 1500 documents
sample_sizes = {
    'Supreme Court': 1059,
    'Republic Acts': 405,
    'Commonwealth Acts': 4,
    'Batas Pambata': 30,
    'Acts': 11
}


In [2]:
# Function to load and sample JSON files from a directory, including subdirectories
def sample_files_from_directory(directory, sample_size):
    files = []
    for root, _, filenames in os.walk(directory):
        files.extend([os.path.join(root, f) for f in filenames if f.endswith('.json')])
        
    sampled_files = random.sample(files, sample_size)
    data_list = []
    for file in sampled_files:
        with open(file, 'r') as f:
            data = json.load(f)
            if 'text' in data:
                data_list.append({
                    'text': data['text'],
                    'category': os.path.basename(directory)
                })
    return data_list



In [3]:
# Sample data from each category
sampled_data = []
for category, folder in categories.items():
    category_dir = os.path.join(root_dir, folder)
    sample_size = sample_sizes[category]
    sampled_data.extend(sample_files_from_directory(category_dir, sample_size))

# Convert sampled data to a DataFrame for further processing
df = pd.DataFrame(sampled_data)

# Check if a GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

# Generate embeddings using the Alibaba-NLP/gte-large-en-v1.5 model
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
model = model.to(device)



Using device: cuda




In [4]:
df.head()

Unnamed: 0,text,category
0,Under consideration is this petition for revie...,Supreme Court Jurisprudence
1,To entitle petitioner spouse to a declaration ...,Supreme Court Jurisprudence
2,Before the Court is a Petition for Review on C...,Supreme Court Jurisprudence
3,At bar is a Petition for Review on Certiorari ...,Supreme Court Jurisprudence
4,Before the Court is a Petition for Review on C...,Supreme Court Jurisprudence


In [5]:
np.shape(df)

(1509, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1509 entries, 0 to 1508
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1509 non-null   object
 1   category  1509 non-null   object
dtypes: object(2)
memory usage: 23.7+ KB


In [7]:
df.to_csv('sample_data_umap.csv', index=False)

In [8]:
# Function to process and embed text chunks
# Recursive Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,  # Adjusted chunk size for mini-LM
    chunk_overlap=200,  # Adding some overlap to maintain context between chunks
    separators=["\n\n", "\n", "(?<=\. )", " "],
    length_function=len
)

In [9]:
def embed_text_chunks(text, model, device, doc_index, category):
    embeddings = []
    categories = []
    doc_indices = []
    # If text is a list of strings, join them into a single string
    if isinstance(text, list):
        text = ' '.join(text)
    if not isinstance(text, str):
        print(f'Skipping non-string entry at document {doc_index+1}')
        return embeddings, categories, doc_indices
    chunks = text_splitter.split_text(text)
    for chunk_index, chunk in enumerate(chunks):
        embedding = model.encode(chunk, convert_to_tensor=True, device=device)
        embeddings.append(embedding)
        categories.append(category)  # Assign the same category to all chunks of the document
        doc_indices.append(doc_index)
        print(f'Embedded chunk {chunk_index+1}/{len(chunks)} of document {doc_index+1}/{len(df)}')
    return embeddings, categories, doc_indices

# Initialize lists to store embeddings, categories, and document indices
all_embeddings = []
all_categories = []
all_doc_indices = []

# Process documents
for doc_index in range(len(df['text'])):
    text = df['text'].iloc[doc_index]
    category = df['category'].iloc[doc_index]  # Get the category of the document
    chunk_embeddings, chunk_categories, chunk_doc_indices = embed_text_chunks(text, model, device, doc_index, category)
    all_embeddings.extend(chunk_embeddings)
    all_categories.extend(chunk_categories)
    all_doc_indices.extend(chunk_doc_indices)

# Convert embeddings to numpy arrays and check shapes
numpy_embeddings = []
for i, embedding in enumerate(all_embeddings):
    numpy_embedding = embedding.cpu().numpy()
    if i == 0:
        embedding_size = numpy_embedding.shape[0]
    if numpy_embedding.shape[0] != embedding_size:
        raise ValueError(f"Embedding at index {i} has incorrect shape: {numpy_embedding.shape}")
    numpy_embeddings.append(numpy_embedding)

# Create a DataFrame to store the embeddings, categories, and document indices
embeddings_df = pd.DataFrame({
    'embedding': numpy_embeddings,
    'doc_index': all_doc_indices,
    'category': all_categories
})

# Verify the dimensions of the embeddings
print(f"All document embeddings have size: {embedding_size}")

Embedded chunk 1/4 of document 1/1509
Embedded chunk 2/4 of document 1/1509
Embedded chunk 3/4 of document 1/1509
Embedded chunk 4/4 of document 1/1509
Embedded chunk 1/7 of document 2/1509
Embedded chunk 2/7 of document 2/1509
Embedded chunk 3/7 of document 2/1509
Embedded chunk 4/7 of document 2/1509
Embedded chunk 5/7 of document 2/1509
Embedded chunk 6/7 of document 2/1509
Embedded chunk 7/7 of document 2/1509
Embedded chunk 1/6 of document 3/1509
Embedded chunk 2/6 of document 3/1509
Embedded chunk 3/6 of document 3/1509
Embedded chunk 4/6 of document 3/1509
Embedded chunk 5/6 of document 3/1509
Embedded chunk 6/6 of document 3/1509
Embedded chunk 1/4 of document 4/1509
Embedded chunk 2/4 of document 4/1509
Embedded chunk 3/4 of document 4/1509
Embedded chunk 4/4 of document 4/1509
Embedded chunk 1/12 of document 5/1509
Embedded chunk 2/12 of document 5/1509
Embedded chunk 3/12 of document 5/1509
Embedded chunk 4/12 of document 5/1509
Embedded chunk 5/12 of document 5/1509
Embedde

In [10]:
print(all_categories[:10])

['Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence', 'Supreme Court Jurisprudence']


In [11]:
embeddings_df.to_csv('embeddings_with_doc_indices.csv', index=False)
print('Embeddings DataFrame saved to CSV file.')

Embeddings DataFrame saved to CSV file.


## UMAP without Query

### Euclidean Distance (Default)

In [97]:
all_embeddings_np = np.vstack(embeddings_df['embedding'].values)

# Fit and transform the embeddings using UMAP
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2'])
umap_df['category'] = embeddings_df['category']

# Plot the UMAP embeddings with categories
plt.figure(figsize=(12, 8))
sns.scatterplot(x='dim1', y='dim2', hue='category', data=umap_df, palette='viridis', alpha=0.7)
plt.title('UMAP Visualization of Document Embeddings')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend(loc='best', title='Category')

# Save the plot as an image
output_dir = '.'  # You can specify your desired directory here
plot_path = os.path.join(output_dir, 'euclidean_umap_visualization.png')
plt.savefig(plot_path)
plt.close()

print(f'Plot saved to {plot_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Plot saved to .\euclidean_umap_visualization.png


In [98]:
all_embeddings_np = np.vstack(embeddings_df['embedding'].values)

# Fit and transform the embeddings using UMAP for 3D visualization
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, random_state=42)
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2', 'dim3'])
umap_df['category'] = embeddings_df['category']

# Create a directory to save frames
output_dir = './umap_frames'
os.makedirs(output_dir, exist_ok=True)

# Generate frames for the GIF
angles = np.linspace(0, 360, 60)  # 60 frames for a full rotation
filenames = []

for angle in angles:
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    sc = ax.scatter(umap_df['dim1'], umap_df['dim2'], umap_df['dim3'], c=umap_df['category'].astype('category').cat.codes, cmap='viridis', alpha=0.7)
    
    # Add legend
    handles, labels = sc.legend_elements()
    legend = ax.legend(handles, umap_df['category'].astype('category').cat.categories, title='Category')
    
    ax.set_title('3D UMAP Visualization of Document Embeddings')
    ax.set_xlabel('UMAP Dimension 1')
    ax.set_ylabel('UMAP Dimension 2')
    ax.set_zlabel('UMAP Dimension 3')
    ax.view_init(30, angle)  # Rotate the view

    # Save the frame
    filename = os.path.join(output_dir, f'frame_{int(angle):03d}.png')
    plt.savefig(filename)
    plt.close()
    filenames.append(filename)

# Create a GIF from the saved frames
gif_path = os.path.join(output_dir, 'euclidean_umap_visualization_3d.gif')
with imageio.get_writer(gif_path, mode='I', duration=0.1) as writer:
    for filename in filenames:
        image = imageio.imread(filename)
        writer.append_data(image)

# Clean up (optional): Remove the individual frame files
for filename in filenames:
    os.remove(filename)

print(f'GIF saved to {gif_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



GIF saved to ./umap_frames\euclidean_umap_visualization_3d.gif


### Cosine Similarity

In [99]:
all_embeddings_np = np.vstack(embeddings_df['embedding'].values)

# Fit and transform the embeddings using UMAP
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42,metric='cosine')
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2'])
umap_df['category'] = embeddings_df['category']

# Plot the UMAP embeddings with categories
plt.figure(figsize=(12, 8))
sns.scatterplot(x='dim1', y='dim2', hue='category', data=umap_df, palette='viridis', alpha=0.7)
plt.title('UMAP Visualization of Document Embeddings')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend(loc='best', title='Category')

# Save the plot as an image
output_dir = '.'  # You can specify your desired directory here
plot_path = os.path.join(output_dir, 'cosine_umap_visualization.png')
plt.savefig(plot_path)
plt.close()

print(f'Plot saved to {plot_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Plot saved to .\cosine_umap_visualization.png


In [100]:
all_embeddings_np = np.vstack(embeddings_df['embedding'].values)

# Fit and transform the embeddings using UMAP for 3D visualization
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, random_state=42,metric='cosine')
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2', 'dim3'])
umap_df['category'] = embeddings_df['category']

# Create a directory to save frames
output_dir = './umap_frames'
os.makedirs(output_dir, exist_ok=True)

# Generate frames for the GIF
angles = np.linspace(0, 360, 60)  # 60 frames for a full rotation
filenames = []

for angle in angles:
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    sc = ax.scatter(umap_df['dim1'], umap_df['dim2'], umap_df['dim3'], c=umap_df['category'].astype('category').cat.codes, cmap='viridis', alpha=0.7)
    
    # Add legend
    handles, labels = sc.legend_elements()
    legend = ax.legend(handles, umap_df['category'].astype('category').cat.categories, title='Category')
    
    ax.set_title('3D UMAP Visualization of Document Embeddings')
    ax.set_xlabel('UMAP Dimension 1')
    ax.set_ylabel('UMAP Dimension 2')
    ax.set_zlabel('UMAP Dimension 3')
    ax.view_init(30, angle)  # Rotate the view

    # Save the frame
    filename = os.path.join(output_dir, f'frame_{int(angle):03d}.png')
    plt.savefig(filename)
    plt.close()
    filenames.append(filename)

# Create a GIF from the saved frames
gif_path = os.path.join(output_dir, 'cosine_umap_visualization_3d.gif')
with imageio.get_writer(gif_path, mode='I', duration=0.1) as writer:
    for filename in filenames:
        image = imageio.imread(filename)
        writer.append_data(image)

# Clean up (optional): Remove the individual frame files
for filename in filenames:
    os.remove(filename)

print(f'GIF saved to {gif_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



GIF saved to ./umap_frames\cosine_umap_visualization_3d.gif


## UMAP with Query

In [101]:
query = "The court found the defendant guilty of murder in the first degree, citing overwhelming evidence of premeditation and intent. The prosecution presented a compelling case, including eyewitness testimony and forensic evidence linking the defendant to the crime scene."
query_embedding = model.encode(query, convert_to_tensor=True, device=device)
query_embedding = query_embedding.cpu().numpy()

embeddings_np = np.array(embeddings_df['embedding'].tolist())
print(embeddings_np.shape)
all_embeddings_np = np.vstack([embeddings_np, query_embedding])
print(all_embeddings_np.shape)

(10135, 1024)
(10136, 1024)


In [102]:
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2'])
# Ensure the category list is properly constructed
categories = embeddings_df['category'].tolist()
categories.append('Query')  # Add 'Query' with capital Q
umap_df['category'] = categories

# Plot the UMAP embeddings with categories, excluding 'Query' for now
plt.figure(figsize=(12, 8))
non_query_points = umap_df[umap_df['category'] != 'Query']
sns.scatterplot(x='dim1', y='dim2', hue='category', data=non_query_points, palette='viridis', alpha=0.7)

# Highlight the query embedding separately to avoid duplicate legend entries
query_point = umap_df[umap_df['category'] == 'Query']
plt.scatter(query_point['dim1'], query_point['dim2'], color='red', edgecolor='black', s=100, label='Query')

plt.title('UMAP Visualization of Document Embeddings')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend(loc='best', title='Category')

# Save the plot as an image
output_dir = '.'  # You can specify your desired directory here
plot_path = os.path.join(output_dir, 'euclidean_query_umap_visualization.png')
plt.savefig(plot_path)
plt.close()

print(f'Plot saved to {plot_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Plot saved to .\euclidean_query_umap_visualization.png


In [103]:
query = "The court found the defendant guilty of murder in the first degree, citing overwhelming evidence of premeditation and intent. The prosecution presented a compelling case, including eyewitness testimony and forensic evidence linking the defendant to the crime scene."
query_embedding = model.encode(query, convert_to_tensor=True, device=device)
query_embedding = query_embedding.cpu().numpy()

embeddings_np = np.array(embeddings_df['embedding'].tolist())
print(embeddings_np.shape)
all_embeddings_np = np.vstack([embeddings_np, query_embedding])
print(all_embeddings_np.shape)

(10135, 1024)
(10136, 1024)


In [104]:
import imageio.v2 as imageio  # Use imageio.v2 to avoid deprecation warning
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.cm import viridis
from matplotlib.colors import Normalize

umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, random_state=42)
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2', 'dim3'])
categories = embeddings_df['category'].tolist()
categories.append('Query')  # Add 'Query' with capital Q
umap_df['category'] = categories

# Get the query point coordinates
query_point = umap_df[umap_df['category'] == 'Query']
query_x, query_y, query_z = query_point.iloc[0][['dim1', 'dim2', 'dim3']]

# Normalize the categories for color mapping
unique_categories = umap_df['category'].unique()
norm = Normalize(vmin=0, vmax=len(unique_categories) - 2)
cmap = plt.get_cmap('viridis')

# Generate GIF frames by zooming in and out
frames = []
frame_paths = []  # To store paths of frame images
zoom_levels = np.linspace(0.1, 4, 30).tolist() + np.linspace(4, 0.1, 30).tolist()

for zoom in zoom_levels:
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    
    # Plot non-query points with colors from viridis colormap
    non_query_points = umap_df[umap_df['category'] != 'Query']
    for idx, category in enumerate(non_query_points['category'].unique()):
        category_points = non_query_points[non_query_points['category'] == category]
        color = cmap(norm(idx))
        ax.scatter(category_points['dim1'], category_points['dim2'], category_points['dim3'],
                   label=category, s=20, alpha=0.7, color=color)
    
    # Plot the query point
    ax.scatter(query_x, query_y, query_z, color='red', edgecolor='black', s=100, label='Query')
    
    # Set labels and title
    ax.set_xlabel('UMAP Dimension 1')
    ax.set_ylabel('UMAP Dimension 2')
    ax.set_zlabel('UMAP Dimension 3')
    ax.set_title('3D UMAP Visualization of Document Embeddings')
    ax.legend(loc='best', title='Category')
    
    # Adjust the camera position to zoom in and out on the query point
    ax.set_xlim([query_x - zoom, query_x + zoom])
    ax.set_ylim([query_y - zoom, query_y + zoom])
    ax.set_zlim([query_z - zoom, query_z + zoom])
    
    # Save frame
    frame_path = os.path.join(output_dir, f'frame_{zoom:.2f}.png')
    plt.savefig(frame_path)
    frame_paths.append(frame_path)
    frames.append(imageio.imread(frame_path))
    plt.close()

# Save the GIF
gif_path = os.path.join(output_dir, 'euclidean_query_3d_umap_zoom.gif')
imageio.mimsave(gif_path, frames, fps=10)

# Clean up frame images

print(f'GIF saved to {gif_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



GIF saved to .\euclidean_query_3d_umap_zoom.gif


### Cosine Similarity

In [105]:
query = "The court found the defendant guilty of murder in the first degree, citing overwhelming evidence of premeditation and intent. The prosecution presented a compelling case, including eyewitness testimony and forensic evidence linking the defendant to the crime scene."
query_embedding = model.encode(query, convert_to_tensor=True, device=device)
query_embedding = query_embedding.cpu().numpy()

embeddings_np = np.array(embeddings_df['embedding'].tolist())
print(embeddings_np.shape)
all_embeddings_np = np.vstack([embeddings_np, query_embedding])
print(all_embeddings_np.shape)

(10135, 1024)
(10136, 1024)


In [106]:
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42,metric='cosine')
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2'])
# Ensure the category list is properly constructed
categories = embeddings_df['category'].tolist()
categories.append('Query')  # Add 'Query' with capital Q
umap_df['category'] = categories

# Plot the UMAP embeddings with categories, excluding 'Query' for now
plt.figure(figsize=(12, 8))
non_query_points = umap_df[umap_df['category'] != 'Query']
sns.scatterplot(x='dim1', y='dim2', hue='category', data=non_query_points, palette='viridis', alpha=0.7)

# Highlight the query embedding separately to avoid duplicate legend entries
query_point = umap_df[umap_df['category'] == 'Query']
plt.scatter(query_point['dim1'], query_point['dim2'], color='red', edgecolor='black', s=100, label='Query')

plt.title('UMAP Visualization of Document Embeddings')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')
plt.legend(loc='best', title='Category')

# Save the plot as an image
output_dir = '.'  # You can specify your desired directory here
plot_path = os.path.join(output_dir, 'cosine_query_umap_visualization.png')
plt.savefig(plot_path)
plt.close()

print(f'Plot saved to {plot_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Plot saved to .\cosine_query_umap_visualization.png


In [107]:
query = "The court found the defendant guilty of murder in the first degree, citing overwhelming evidence of premeditation and intent. The prosecution presented a compelling case, including eyewitness testimony and forensic evidence linking the defendant to the crime scene."
query_embedding = model.encode(query, convert_to_tensor=True, device=device)
query_embedding = query_embedding.cpu().numpy()

embeddings_np = np.array(embeddings_df['embedding'].tolist())
print(embeddings_np.shape)
all_embeddings_np = np.vstack([embeddings_np, query_embedding])
print(all_embeddings_np.shape)

(10135, 1024)
(10136, 1024)


In [108]:
import imageio.v2 as imageio  # Use imageio.v2 to avoid deprecation warning
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.cm import viridis
from matplotlib.colors import Normalize

umap_model = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, random_state=42,metric='cosine')
umap_embeddings = umap_model.fit_transform(all_embeddings_np)

# Create a DataFrame with UMAP embeddings and categories
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2', 'dim3'])
categories = embeddings_df['category'].tolist()
categories.append('Query')  # Add 'Query' with capital Q
umap_df['category'] = categories

# Get the query point coordinates
query_point = umap_df[umap_df['category'] == 'Query']
query_x, query_y, query_z = query_point.iloc[0][['dim1', 'dim2', 'dim3']]

# Normalize the categories for color mapping
unique_categories = umap_df['category'].unique()
norm = Normalize(vmin=0, vmax=len(unique_categories) - 2)
cmap = plt.get_cmap('viridis')

# Generate GIF frames by zooming in and out
frames = []
frame_paths = []  # To store paths of frame images
zoom_levels = np.linspace(0.1, 4, 30).tolist() + np.linspace(4, 0.1, 30).tolist()

for zoom in zoom_levels:
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    
    # Plot non-query points with colors from viridis colormap
    non_query_points = umap_df[umap_df['category'] != 'Query']
    for idx, category in enumerate(non_query_points['category'].unique()):
        category_points = non_query_points[non_query_points['category'] == category]
        color = cmap(norm(idx))
        ax.scatter(category_points['dim1'], category_points['dim2'], category_points['dim3'],
                   label=category, s=20, alpha=0.7, color=color)
    
    # Plot the query point
    ax.scatter(query_x, query_y, query_z, color='red', edgecolor='black', s=100, label='Query')
    
    # Set labels and title
    ax.set_xlabel('UMAP Dimension 1')
    ax.set_ylabel('UMAP Dimension 2')
    ax.set_zlabel('UMAP Dimension 3')
    ax.set_title('3D UMAP Visualization of Document Embeddings')
    ax.legend(loc='best', title='Category')
    
    # Adjust the camera position to zoom in and out on the query point
    ax.set_xlim([query_x - zoom, query_x + zoom])
    ax.set_ylim([query_y - zoom, query_y + zoom])
    ax.set_zlim([query_z - zoom, query_z + zoom])
    
    # Save frame
    frame_path = os.path.join(output_dir, f'frame_{zoom:.2f}.png')
    plt.savefig(frame_path)
    frame_paths.append(frame_path)
    frames.append(imageio.imread(frame_path))
    plt.close()

# Save the GIF
gif_path = os.path.join(output_dir, 'cosine_query_3d_umap_zoom.gif')
imageio.mimsave(gif_path, frames, fps=10)

# Clean up frame images

print(f'GIF saved to {gif_path}')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



GIF saved to .\cosine_query_3d_umap_zoom.gif
