In [None]:
import torch

# Report whether PyTorch can see a CUDA device and, if so, which one.
# NOTE(review): none of the visible cells move data or models onto the GPU;
# confirm whether this check is still needed for this notebook.
if torch.cuda.is_available():
    print("✅ GPU is available!")
    # Index 0 is the first visible CUDA device.
    print("💻 GPU Name:", torch.cuda.get_device_name(0))
else:
    print("❌ GPU not available. Using CPU.")


✅ GPU is available!
💻 GPU Name: NVIDIA A100-SXM4-40GB


In [None]:
# üì¶ Install dependencies
!pip install umap-learn hdbscan -q

In [None]:
import numpy as np
import joblib
import hdbscan
from umap import UMAP
from tqdm import tqdm
import matplotlib.pyplot as plt
import time

# === Load Data ===
print("📥 Loading data...")
embeddings = np.load("movie_weighted_embeddings.npy")
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load metadata files that come from a trusted source.
metadata = joblib.load("movie_metadata.pkl")

# Fail fast: cluster labels are later written back to metadata by position, so
# the two inputs must describe the same number of items. Checking here avoids
# discovering a mismatch only after the long UMAP/HDBSCAN steps have run.
if len(metadata) != len(embeddings):
    raise ValueError(
        f"metadata has {len(metadata)} records but embeddings has "
        f"{len(embeddings)} rows; they must match one-to-one"
    )

# === Step 1: UMAP Reduction with tqdm ===
def umap_with_progress(X, batch_size=10000, **kwargs):
    """Fit UMAP on the first batch of ``X`` and transform all of ``X`` in batches.

    The model is fitted on ``X[:batch_size]`` only; every batch (including the
    first one) is then passed through ``transform`` while a tqdm progress bar
    tracks the batches.

    NOTE(review): because only the leading ``batch_size`` rows are used for
    fitting, the learned embedding reflects those rows alone. If ``X`` is
    ordered in some meaningful way, consider shuffling before calling.

    Args:
        X: Sliceable collection of samples supporting ``len()``
            (presumably a 2-D array of shape (n_samples, n_features) —
            confirm against the caller).
        batch_size: Number of rows used for fitting and per transform call.
            Must be at least 1.
        **kwargs: Forwarded unchanged to the ``UMAP`` constructor.

    Returns:
        A tuple ``(reduced, umap_model)`` where ``reduced`` is the row-wise
        stack of all transformed batches and ``umap_model`` is the fitted model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``X`` is empty.
    """
    # Validate up front so bad arguments fail with a clear message instead of
    # a ZeroDivisionError or an error from deep inside UMAP / np.vstack.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    n_rows = len(X)
    if n_rows == 0:
        raise ValueError("X is empty; nothing to reduce")

    print("🔄 Fitting UMAP model on first batch...")
    umap_model = UMAP(**kwargs)
    umap_model.fit(X[:batch_size])

    print("📉 Transforming all batches with progress bar...")
    # Stepping the range by batch_size yields one start offset per batch,
    # including a final partial batch; slicing past the end is clamped.
    batch_starts = range(0, n_rows, batch_size)
    reduced = [
        umap_model.transform(X[start:start + batch_size])
        for start in tqdm(batch_starts, desc="UMAP Reducing")
    ]

    return np.vstack(reduced), umap_model

# Step 1 Execution
# Reduce the embeddings to 10 dimensions before clustering. random_state is
# fixed so repeated runs produce the same reduction.
# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during this long-running step.
start_umap = time.perf_counter()
reduced_embeddings, umap_model = umap_with_progress(
    embeddings,
    batch_size=10000,
    n_neighbors=30,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
    random_state=42
)
print(f"✅ UMAP reduction completed in {(time.perf_counter() - start_umap):.2f} seconds\n")

# === Step 2: HDBSCAN Clustering ===
# Cluster the UMAP-reduced vectors; fit_predict yields one label per row.
print("🔍 Running HDBSCAN clustering...")
# perf_counter is monotonic, so the reported duration is immune to system
# clock adjustments.
start_hdb = time.perf_counter()
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,
    min_samples=10,
    metric='euclidean',
    # NOTE(review): nothing in the visible cells uses the prediction data;
    # confirm a later cell needs it, otherwise it can be dropped.
    prediction_data=True
)
labels = clusterer.fit_predict(reduced_embeddings)
print(f"✅ HDBSCAN clustering completed in {(time.perf_counter() - start_hdb):.2f} seconds\n")

# === Step 3: Assign cluster to metadata ===
print("📝 Adding cluster labels to metadata...")
start_meta = time.perf_counter()

# Labels are matched to metadata records purely by position, so a length
# mismatch would either raise IndexError midway or leave records unlabeled.
if len(metadata) != len(labels):
    raise ValueError(
        f"metadata has {len(metadata)} records but {len(labels)} cluster "
        "labels were produced; they must match one-to-one"
    )

# tolist() converts the NumPy integers to plain Python ints in a single pass,
# so the stored values are ordinary builtins when the metadata is pickled.
cluster_ids = np.asarray(labels).tolist()
for i, cluster_id in enumerate(tqdm(cluster_ids, desc="Metadata Update")):
    metadata[i]["cluster_umap"] = cluster_id
print(f"✅ Metadata update completed in {(time.perf_counter() - start_meta):.2f} seconds\n")

# === Step 4: Save to Disk ===
# The output path is defined once so the file written and the confirmation
# message can never drift apart.
clustered_metadata_path = "movie_metadata_umap_clusters.pkl"
print("💾 Saving clustered metadata...")
joblib.dump(metadata, clustered_metadata_path)
print(f"✅ Saved to '{clustered_metadata_path}'\n")

# === Step 5: Optional 2D Visualization ===
# A separate 2-D UMAP is fitted on the full embeddings purely for plotting;
# the cluster labels still come from the 10-D reduction above.
print("📊 Generating 2D UMAP projection for visualization...")
start_vis = time.perf_counter()
umap_2d = UMAP(n_components=2, metric='cosine', random_state=42).fit_transform(embeddings)

# HDBSCAN marks points it could not assign to any cluster with the label -1.
# Draw those in neutral gray so they are not mistaken for a real cluster.
noise_mask = labels == -1
clustered_mask = ~noise_mask

fig, ax = plt.subplots(figsize=(12, 8))
if noise_mask.any():
    ax.scatter(umap_2d[noise_mask, 0], umap_2d[noise_mask, 1], c='lightgray', s=1)
if clustered_mask.any():
    ax.scatter(
        umap_2d[clustered_mask, 0],
        umap_2d[clustered_mask, 1],
        c=labels[clustered_mask],
        cmap='tab20',
        s=1,
    )
ax.set_title("UMAP 2D + HDBSCAN Clustering (700k Movies)")
ax.axis("off")
plt.show()
print(f"✅ Visualization completed in {(time.perf_counter() - start_vis):.2f} seconds")

📥 Loading data...
🔄 Fitting UMAP model on first batch...


  warn(


📉 Transforming remaining batches with progress bar...


UMAP Reducing: 100%|██████████| 73/73 [09:38<00:00,  7.93s/it]


✅ UMAP reduction completed in 636.73 seconds

🔍 Running HDBSCAN clustering...
✅ HDBSCAN clustering completed in 251.41 seconds

📝 Adding cluster labels to metadata...


Metadata Update: 100%|██████████| 722317/722317 [00:00<00:00, 1652344.76it/s]


✅ Metadata update completed in 0.44 seconds

💾 Saving clustered metadata...
✅ Saved to 'movie_metadata_umap_clusters.pkl'

📊 Generating 2D UMAP projection for visualization...


  warn(


In [None]:
!nvidia-smi

Fri Jun 13 10:09:59 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             46W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                