In [30]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

movies_df = pd.read_parquet("..\movies_processed.parquet")


In [31]:
#Just in case: Drop any rows with missing intro_vector values:
movies_df = movies_df.dropna(subset=["intro_vector"])


In [32]:
#Create a NumPy array of intro_vectors for clustering:
intro_vectors = np.stack(movies_df["intro_vector"].values)



##1: KMeans Clustering

##2: DBSCAN Clustering

In [None]:
# Create a StandardScaler to normalize the data
scaler = StandardScaler()
scaled_vectors = scaler.fit_transform(intro_vectors)

# Experiment with different eps and min_samples values
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Fit the model and predict the cluster labels
cluster_labels = dbscan.fit_predict(scaled_vectors)

# Exclude noise points (cluster label = -1)
core_samples_mask = np.zeros_like(cluster_labels, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True
non_noise_labels = cluster_labels[core_samples_mask]

##3: Agglomerative Clustering

In [33]:
from sklearn.model_selection import ParameterGrid

# KMeans parameter grid
kmeans_grid = {
    'n_clusters': [3, 4, 5, 6, 7, 8, 9, 10],
    'random_state': [42],
}

# DBSCAN parameter grid
dbscan_grid = {
    'eps': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_samples': [3, 5, 7, 10, 15, 20],
}

# Agglomerative Clustering parameter grid
agg_clustering_grid = {
    'n_clusters': [3, 4, 5, 6, 7, 8, 9, 10],
    'linkage': ['ward', 'complete', 'average', 'single'],
}

# KMeans clustering
print("KMeans Clustering")
for params in ParameterGrid(kmeans_grid):
    kmeans = KMeans(**params)
    cluster_labels = kmeans.fit_predict(intro_vectors)
    silhouette_avg = silhouette_score(intro_vectors, cluster_labels)
    print(f"Parameters: {params} - Silhouette score: {silhouette_avg}")

print("\nDBSCAN Clustering")
for params in ParameterGrid(dbscan_grid):
    dbscan = DBSCAN(**params)
    cluster_labels = dbscan.fit_predict(scaled_vectors)

    core_samples_mask = np.zeros_like(cluster_labels, dtype=bool)
    core_samples_mask[dbscan.core_sample_indices_] = True
    non_noise_labels = cluster_labels[core_samples_mask]

    unique_labels = np.unique(non_noise_labels)
    if len(unique_labels) >= 2:
        silhouette_avg = silhouette_score(scaled_vectors[core_samples_mask], non_noise_labels, metric='euclidean')
        print(f"Parameters: {params} - Silhouette score: {silhouette_avg}")
    else:
        print(f"Parameters: {params} - Not enough clusters for silhouette score calculation.")

print("\nAgglomerative Clustering")
for params in ParameterGrid(agg_clustering_grid):
    agg_clustering = AgglomerativeClustering(**params)
    cluster_labels = agg_clustering.fit_predict(intro_vectors)
    silhouette_avg = silhouette_score(intro_vectors, cluster_labels)
    print(f"Parameters: {params} - Silhouette score: {silhouette_avg}")


KMeans Clustering




Parameters: {'n_clusters': 3, 'random_state': 42} - Silhouette score: 0.093935526907444




Parameters: {'n_clusters': 4, 'random_state': 42} - Silhouette score: 0.0582636259496212




Parameters: {'n_clusters': 5, 'random_state': 42} - Silhouette score: 0.055404312908649445




Parameters: {'n_clusters': 6, 'random_state': 42} - Silhouette score: 0.04422995075583458




Parameters: {'n_clusters': 7, 'random_state': 42} - Silhouette score: 0.037781696766614914




Parameters: {'n_clusters': 8, 'random_state': 42} - Silhouette score: 0.03526654466986656




Parameters: {'n_clusters': 9, 'random_state': 42} - Silhouette score: 0.033546555787324905




Parameters: {'n_clusters': 10, 'random_state': 42} - Silhouette score: 0.03233844041824341

DBSCAN Clustering
Parameters: {'eps': 0.3, 'min_samples': 3} - Silhouette score: 1.0
Parameters: {'eps': 0.3, 'min_samples': 5} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.3, 'min_samples': 7} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.3, 'min_samples': 10} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.3, 'min_samples': 15} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.3, 'min_samples': 20} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.4, 'min_samples': 3} - Silhouette score: 1.0
Parameters: {'eps': 0.4, 'min_samples': 5} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.4, 'min_samples': 7} - Not enough clusters for silhouette score calculation.
Parameters: {'eps': 0.4, 'min_samples': 10} - Not enough clu

In [39]:


# Set the number of clusters (experiment with different values)
n_clusters = 3

# Create the AgglomerativeClustering model
agg_clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage= 'average')

# Fit the model and predict the cluster labels
cluster_labels = agg_clustering.fit_predict(intro_vectors)

# Calculate the silhouette score
silhouette_avg = silhouette_score(intro_vectors, cluster_labels)
print(f"Agglomerative Clustering - Silhouette score: {silhouette_avg}")


Agglomerative Clustering - Silhouette score: 0.570956289768219
