In [2]:
import joblib
from collections import Counter

# Load metadata
metadata = joblib.load("movie_metadata_umap_clusters.pkl")

# Count clusters
cluster_ids = [movie["cluster_umap"] for movie in metadata]
cluster_counts = Counter(cluster_ids)

# Show number of clusters (excluding noise, which is usually -1)
num_clusters = len(set(cid for cid in cluster_ids if cid != -1))
print(f"✅ Number of valid clusters (excluding noise): {num_clusters}")

# Optional: show top 10 largest clusters
print("\n🔝 Top 10 clusters by size:")
for cid, count in cluster_counts.most_common(10):
    print(f"Cluster {cid}: {count} movies")


✅ Number of valid clusters (excluding noise): 790

🔝 Top 10 clusters by size:
Cluster -1: 499564 movies
Cluster 150: 53117 movies
Cluster 606: 20134 movies
Cluster 564: 6188 movies
Cluster 497: 4090 movies
Cluster 443: 3884 movies
Cluster 4: 3787 movies
Cluster 74: 3398 movies
Cluster 39: 3372 movies
Cluster 777: 3313 movies


In [4]:
cluster_id = 42

# Filter movies in that cluster
movies_in_cluster = [m for m in metadata if m["cluster_umap"] == cluster_id]

# Print a few
for movie in movies_in_cluster[:10]:
    print(f"🎬 {movie['title']}\n📝 {movie['overview']}\n")


🎬 Gladiator II
📝 Years after witnessing the death of the revered hero Maximus at the hands of his uncle Lucius is forced to enter the Colosseum after his home is conquered by the tyrannical Emperors who now lead Rome with an iron fist. With rage in his heart and the future of the Empire at stake Lucius must look to his past to find strength and honor to return the glory of Rome to its people.

🎬 Gladiator
📝 In the year 180 the death of emperor Marcus Aurelius throws the Roman Empire into chaos.  Maximus is one of the Roman army's most capable and trusted generals and a key advisor to the emperor.  As Marcus' devious son Commodus ascends to the throne Maximus is set to be executed.  He escapes but is captured by slave traders.  Renamed Spaniard and forced to become a gladiator Maximus must battle to the death with other men for the amusement of paying audiences.

🎬 300: Rise of an Empire
📝 Greek general Themistocles attempts to unite all of Greece by leading the charge that will change 

In [10]:
for cid in sorted(cluster_counts):
    if cid == -1: continue  # skip noise
    print(f"\n📦 Cluster {cid} ({cluster_counts[cid]} movies):")
    examples = [m["title"] for m in metadata if m["cluster_umap"] == cid][:5]
    print(" ➤ Examples:", ", ".join(examples))



📦 Cluster 0 (1158 movies):
 ➤ Examples: Step-Brother 2, Exchange Sex With A Friend Couple, Blood Type O Watermelon Maid, Young Aunt 3, Nice Sister-In-Law 2

📦 Cluster 1 (34 movies):
 ➤ Examples: Neon Sign, Neon Sign, City Foliage, Halley’s Comet, Fire Escape

📦 Cluster 2 (37 movies):
 ➤ Examples: Period Piece, Waiting Room Window, City Foliage, Fire Escape, Resting Merchant

📦 Cluster 3 (43 movies):
 ➤ Examples: Werewolf, Ivory Wave, Unseen Enemy, Forced Nightmare, Unseen Enemy

📦 Cluster 4 (3787 movies):
 ➤ Examples: Horizon: An American Saga - Chapter 1, Once Upon a Time in the West, Django Unchained, Dead for a Dollar, The Old Way

📦 Cluster 5 (45 movies):
 ➤ Examples: Spongebob Squarepants: It Came from Goo Lagoon, Spongebob Squarepants: It Came from Goo Lagoon, A Grand Night In: The Story of Aardman, Trombone Trouble, Rollergator

📦 Cluster 6 (69 movies):
 ➤ Examples: Manodrome, Skinned Deep, Clown Fear, Attic Panic, Baghdad in My Shadow

📦 Cluster 7 (36 movies):
 ➤ Examples: Bul