# Load and Prepare Data
Load the UMAP embeddings and cluster labels from NumPy `.npy` files, then pair each embedding with its cluster label for plotting.

In [1]:
# Load and Prepare Data

import numpy as np
from pathlib import Path

def _load_required_array(filename):
    """Load a ``.npy`` file, failing with a clear error if it is missing.

    Args:
        filename: Path of the ``.npy`` file to read.

    Returns:
        The array stored in the file, as returned by ``np.load``.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
    """
    if not Path(filename).exists():
        raise FileNotFoundError(f"{filename} not found.")
    return np.load(filename)


# Load UMAP embeddings
umap_embeddings_file = 'umap_embeddings.npy'
umap_embeddings = _load_required_array(umap_embeddings_file)

# Load cluster labels
cluster_labels_file = 'cluster_labels.npy'
cluster_labels = _load_required_array(cluster_labels_file)

# zip() stops at the shorter input, so a length mismatch would silently drop
# points instead of failing; check it explicitly before pairing.
if len(umap_embeddings) != len(cluster_labels):
    raise ValueError(
        f"Length mismatch: {len(umap_embeddings)} embeddings in "
        f"{umap_embeddings_file} vs {len(cluster_labels)} labels in "
        f"{cluster_labels_file}."
    )

# Prepare data for plotting: one [coordinates, cluster] pair per point,
# converted to plain Python types (list of floats, int).
data = [
    [embedding.tolist(), int(cluster)]
    for embedding, cluster in zip(umap_embeddings, cluster_labels)
]

FileNotFoundError: umap_embeddings.npy not found.

# Create Basic Scatter Plot
Create a basic scatter plot using matplotlib to visualize the 2D UMAP embeddings.

In [None]:
import matplotlib.pyplot as plt

# Convert data to a numpy array for easier manipulation.
# Each row of `data` is [coordinates_list, cluster_int], i.e. ragged, so the
# array must be created with dtype=object: without it, recent NumPy releases
# raise ValueError ("inhomogeneous shape") instead of building the array.
data_array = np.array(data, dtype=object)

# Extract embeddings (one row of coordinates per point) and cluster labels
embeddings = np.vstack(list(data_array[:, 0]))
clusters = data_array[:, 1].astype(int)

# Create scatter plot on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(embeddings[:, 0], embeddings[:, 1], c=clusters, cmap='viridis', s=5)
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.set_title('2D UMAP Embeddings Scatter Plot')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.show()

# Enhanced Visualization with Colors
Create a colored scatter plot where points are colored by their cluster labels. Add a colorbar and legend.

In [2]:
# Enhanced Visualization with Colors

import matplotlib.pyplot as plt
import numpy as np

# Convert data to a numpy array for easier manipulation.
# Each row of `data` is [coordinates_list, cluster_int], i.e. ragged, so the
# array must be created with dtype=object: without it, recent NumPy releases
# raise ValueError ("inhomogeneous shape") instead of building the array.
data_array = np.array(data, dtype=object)

# Extract embeddings (one row of coordinates per point) and cluster labels
embeddings = np.vstack(list(data_array[:, 0]))
clusters = data_array[:, 1].astype(int)

# Create scatter plot colored by cluster label
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(embeddings[:, 0], embeddings[:, 1], c=clusters, cmap='viridis', s=5)
fig.colorbar(scatter, ax=ax, label='Cluster')

# Add the legend this section promises; legend_elements() derives a compact
# set of handles/labels from the values passed as `c`.
legend_handles, legend_labels = scatter.legend_elements()
ax.legend(legend_handles, legend_labels, title='Cluster', loc='best')

ax.set_title('2D UMAP Embeddings Scatter Plot')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.show()

NameError: name 'data' is not defined

# Interactive Plot with Plotly
Create an interactive scatter plot using Plotly that allows zooming and hovering over points.

In [None]:
import plotly.express as px
import pandas as pd

# Build the plotting frame: start from the [embedding, cluster] pairs, expand
# each embedding list into its own coordinate columns, and keep only the
# cluster label next to them (column order: cluster, UMAP1, UMAP2).
pairs = pd.DataFrame(data, columns=['embedding', 'cluster'])
coords = pd.DataFrame(pairs['embedding'].tolist(), index=pairs.index)
coords.columns = ['UMAP1', 'UMAP2']
df = pd.concat([pairs[['cluster']], coords], axis=1)

# Create interactive scatter plot with Plotly
axis_labels = {'UMAP1': 'UMAP Dimension 1', 'UMAP2': 'UMAP Dimension 2'}
fig = px.scatter(
    df,
    x='UMAP1',
    y='UMAP2',
    color='cluster',
    title='Interactive 2D UMAP Embeddings Scatter Plot',
    labels=axis_labels,
    hover_data=['cluster'],
)

# Show plot
fig.show()

# Plot Multiple Views
Create a subplot layout to show different aspects of the data, including cluster distributions and density plots.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Create a 2x2 subplot layout and name each panel after what it shows
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
(ax_scatter, ax_counts), (ax_density_1, ax_density_2) = axes

# Scatter plot of UMAP embeddings with cluster labels
scatter = ax_scatter.scatter(embeddings[:, 0], embeddings[:, 1], c=clusters, cmap='viridis', s=5)
ax_scatter.set_title('2D UMAP Embeddings Scatter Plot')
ax_scatter.set_xlabel('UMAP Dimension 1')
ax_scatter.set_ylabel('UMAP Dimension 2')
fig.colorbar(scatter, ax=ax_scatter, label='Cluster')

# Cluster distribution bar plot: number of points per cluster label
# NOTE(review): newer seaborn releases may warn about `palette` without `hue`;
# confirm against the installed version.
unique_clusters, counts = np.unique(clusters, return_counts=True)
sns.barplot(x=unique_clusters, y=counts, ax=ax_counts, palette='viridis')
ax_counts.set_title('Cluster Distribution')
ax_counts.set_xlabel('Cluster')
ax_counts.set_ylabel('Count')

# Density plots, one per UMAP dimension.
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
density_panels = (
    (ax_density_1, 0, 'blue'),
    (ax_density_2, 1, 'green'),
)
for density_ax, dim_index, line_color in density_panels:
    dim_name = f'UMAP Dimension {dim_index + 1}'
    sns.kdeplot(embeddings[:, dim_index], ax=density_ax, fill=True, color=line_color)
    density_ax.set_title(f'Density Plot of {dim_name}')
    density_ax.set_xlabel(dim_name)
    density_ax.set_ylabel('Density')

# Adjust layout
plt.tight_layout()
plt.show()