In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [4]:
item_df = pd.read_csv("D:/projects/item.csv")
user_df = pd.read_csv("D:/projects/users.csv")

In [6]:
#Data cleaning
item_df['title'] = item_df['title'].str.strip('[]').str.replace('"', '').str.strip().str.lower()
user_df['favorites'] = user_df['favorites'].str.strip('[]').str.replace('"', '').str.strip().str.lower()

#Creating a unique list of TED talk titles and map each title to an index
title_to_id = {title: idx for idx, title in enumerate(item_df['title'].unique())}
id_to_title = {idx: title for title, idx in title_to_id.items()}
print(title_to_id)
print(id_to_title)


{'patrick chappatte: the power of cartoons': 0, "karen armstrong: let's revive the golden rule": 1, "david macaulay's rome antics": 2, 'rachel armstrong: architecture that repairs itself?': 3, 'rick warren: a life of purpose': 4, 'bonnie bassler: how bacteria \\talk\\': 5, 'sam richards: a radical experiment in empathy': 6, 'imogen heap plays \\wait it out\\': 7, 'karen bass: unseen footage, untamed nature': 8, 'barry schwartz: our loss of wisdom': 9, 'sheryl sandberg: why we have too few women leaders': 10, 'diane kelly: what we didn’t know about penis anatomy': 11, 'steven johnson: where good ideas come from': 12, 'lakshmi pratury on letter-writing': 13, 'yoav medan: ultrasound surgery -- healing without cuts': 14, 'saul griffith on everyday inventions': 15, 'emily pilloton: teaching design for change': 16, 'benoit mandelbrot: fractals and the art of roughness': 17, "peter haas: haiti's disaster of engineering": 18, 'don tapscott: four principles for the open world': 19, 'isaac mizra

In [8]:
#creating user item matrix
user_ids = user_df['user_id'].unique()
user_item_matrix = pd.DataFrame(0, index=user_ids, columns=title_to_id.values())
for _, row in user_df.iterrows():
    user_id = row['user_id']
    favorites = [fav.strip() for fav in row['favorites'].split(',')]
    for title in favorites:
        title = title.lower().strip()  
        if title in title_to_id:  
            user_item_matrix.at[user_id, title_to_id[title]] = 1
user_item_matrix.columns = [id_to_title[col] for col in user_item_matrix.columns]
print(user_item_matrix.head(10))

                                          patrick chappatte: the power of cartoons  \
e57cec766488c5a72d02dd6bcdbd1d67201ddc7f                                         0   
4c3e7cf74b5c596cf234e9055a436a23d32cb1b7                                         0   
394723943ac2a83beb72c860d77a8eca22087185                                         0   
a2715f02d578bfc667e0fb4691f5a5b1572b9b2e                                         0   
2c0871325f6f3e10bdeee9059d7a2e745929f702                                         0   
ed358a28ce7626a0788284bf939479ab3de7527c                                         0   
b771db584db36ac504874dc7d780c27143fa1cf1                                         0   
fa51240c15b28a4a340ff1b6cdce45f2d8b262bc                                         0   
728210957335809402cb85f8196aa07db5db71ee                                         0   
4bc8accb4d275e834226cb71b405c00bc6bc5b19                                         0   

                                          karen armst

In [10]:
eating user similarity matrix
user_similarity_matrix = pd.DataFrame(cosine_similarity(user_item_matrix),
                                      index=user_item_matrix.index,
                                      columns=user_item_matrix.index)
print(user_similarity_matrix.head(10))

                                          e57cec766488c5a72d02dd6bcdbd1d67201ddc7f  \
e57cec766488c5a72d02dd6bcdbd1d67201ddc7f                                  1.000000   
4c3e7cf74b5c596cf234e9055a436a23d32cb1b7                                  0.094491   
394723943ac2a83beb72c860d77a8eca22087185                                  0.000000   
a2715f02d578bfc667e0fb4691f5a5b1572b9b2e                                  0.188982   
2c0871325f6f3e10bdeee9059d7a2e745929f702                                  0.000000   
ed358a28ce7626a0788284bf939479ab3de7527c                                  0.000000   
b771db584db36ac504874dc7d780c27143fa1cf1                                  0.000000   
fa51240c15b28a4a340ff1b6cdce45f2d8b262bc                                  0.000000   
728210957335809402cb85f8196aa07db5db71ee                                  0.000000   
4bc8accb4d275e834226cb71b405c00bc6bc5b19                                  0.000000   

                                          4c3e7cf74b5

In [12]:

#building a model with k means
n_clusters = 3  
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(user_item_matrix)

In [14]:
#clustering it
user_item_matrix['Cluster'] = kmeans.labels_
print(user_item_matrix['Cluster'])
print("Cluster Centers:")
print(kmeans.cluster_centers_)

e57cec766488c5a72d02dd6bcdbd1d67201ddc7f    1
4c3e7cf74b5c596cf234e9055a436a23d32cb1b7    1
394723943ac2a83beb72c860d77a8eca22087185    1
a2715f02d578bfc667e0fb4691f5a5b1572b9b2e    1
2c0871325f6f3e10bdeee9059d7a2e745929f702    1
                                           ..
8049cb3b03306049a46ccb33ed74272dbb05b2c6    1
f67e59e49cf88f06a7cb62ca5067d73a4d7d361a    1
3cf3758fdaa289febd7152b917b7e6af6973f337    1
510eaa7cb573cc7b2184d5b01ff004d943df3eaf    1
7cfdeb1bfdd06010293a0c96ccc3719ee5ec75f5    0
Name: Cluster, Length: 12605, dtype: int32
Cluster Centers:
[[0.10550459 0.0733945  0.04587156 ... 0.         0.39908257 0.2706422 ]
 [0.00207297 0.00279851 0.00093284 ... 0.         0.02394279 0.00995025]
 [0.00730194 0.00766703 0.00620664 ... 0.         0.12011683 0.04162103]]


In [16]:
import pandas as pd

def recommend_talks(user_id, user_item_matrix, user_cluster_map, n_recommendations=5):
    # Get the user's cluster
    user_cluster = user_cluster_map[user_id]
    
    # Select all users in the same cluster
    cluster_users = user_item_matrix[user_item_matrix['Cluster'] == user_cluster]
    
    # Drop the current user and sum interactions across the cluster (using vectorized sum)
    cluster_interactions = cluster_users.drop(user_id).sum(axis=0)
    
    # Sort the items based on sum of interactions
    recommended_items = cluster_interactions.sort_values(ascending=False)
    
    # Filter for items with interaction > 0
    recommendations = recommended_items[recommended_items > 0].index[:n_recommendations]
    
    # Convert item IDs to titles
    recommended_titles = [id_to_title.get(title_to_id.get(col), col) for col in recommendations]
    
    return recommended_titles

# Precompute the clusters for each user (once, outside of the loop)
user_cluster_map = user_item_matrix['Cluster'].to_dict()

# Create recommendations for all users
user_recommendations = {}
for user_id in user_item_matrix.index:
    user_recommendations[user_id] = recommend_talks(user_id, user_item_matrix, user_cluster_map)

# Display recommendations for a specific user
user_id_to_display = user_ids[0]
print(f"Recommendations for user {user_id_to_display}: {user_recommendations[user_id_to_display]}")


Recommendations for user e57cec766488c5a72d02dd6bcdbd1d67201ddc7f: ['Cluster', "jill bolte taylor's stroke of insight", 'elizabeth gilbert: your elusive creative genius', 'brené brown: the power of vulnerability', 'simon sinek: how great leaders inspire action']


In [34]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import pandas as pd
from sklearn.cluster import KMeans

# Assuming `user_item_matrix` is already defined and clustered
# Drop the 'Cluster' column to use only feature data
feature_data = user_item_matrix.drop(columns=['Cluster'])

# Silhouette Score
silhouette_avg_kmeans = silhouette_score(feature_data, user_item_matrix['Cluster'])
print(f"Silhouette Score: {silhouette_avg_kmeans}")

# Calinski-Harabasz Index
calinski_harabasz_kmeans = calinski_harabasz_score(feature_data, user_item_matrix['Cluster'])
print(f"Calinski-Harabasz Index: {calinski_harabasz_kmeans}")

# Davies-Bouldin Score
davies_bouldin_kmeans = davies_bouldin_score(feature_data, user_item_matrix['Cluster'])
print(f"Davies-Bouldin Score: {davies_bouldin_kmeans}")

from sklearn.metrics import pairwise_distances
import numpy as np

# Assuming K-Means has been applied to `feature_data`
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(feature_data)

# Inertia
inertia_kmeans = kmeans.inertia_
print(f"Inertia: {inertia_kmeans}")

# Dunn Index
def dunn_index(data, labels):
    # Compute pairwise distances between all points
    distances = pairwise_distances(data)
    
    # Get unique cluster labels
    unique_labels = np.unique(labels)
    
    # Calculate intra-cluster distances (max distance within each cluster)
    intra_cluster_distances = []
    for label in unique_labels:
        cluster_points = data[labels == label]
        if len(cluster_points) > 1:  # Avoid single-point clusters
            intra_cluster_distances.append(np.max(pairwise_distances(cluster_points)))
        else:
            intra_cluster_distances.append(0)
    
    # Calculate inter-cluster distances (min distance between clusters)
    inter_cluster_distances = []
    for i, label1 in enumerate(unique_labels):
        for label2 in unique_labels[i + 1:]:
            cluster_points1 = data[labels == label1]
            cluster_points2 = data[labels == label2]
            inter_cluster_distances.append(np.min(pairwise_distances(cluster_points1, cluster_points2)))
    
    # Dunn Index = min(inter-cluster distances) / max(intra-cluster distances)
    return min(inter_cluster_distances) / max(intra_cluster_distances)

# Dunn Index Calculation
labels_kmeans = kmeans.labels_
dunn_kmeans = dunn_index(feature_data.values, labels_kmeans)
print(f"Dunn Index: {dunn_kmeans}")


Silhouette Score: 0.24042372105655704
Calinski-Harabasz Index: 345.53814059487036
Davies-Bouldin Score: 4.6555905280375205
Inertia: 111389.33662579607
Dunn Index: 0.03892494720807615


In [26]:
scaler = StandardScaler()
user_item_matrix_scaled = scaler.fit_transform(user_item_matrix)
gmm = GaussianMixture(n_components=3, random_state=42)
user_item_matrix['GMM_Cluster'] = gmm.fit_predict(user_item_matrix_scaled)
gmm

In [28]:
def recommend_talks(user_id, user_item_matrix, cluster_column, n_recommendations=5):
    user_cluster = user_item_matrix.loc[user_id, cluster_column]
    cluster_users = user_item_matrix[user_item_matrix[cluster_column] == user_cluster]
    recommended_items = cluster_users.drop(user_id).sum(axis=0).sort_values(ascending=False)
    recommendations = recommended_items[recommended_items > 0].index[:n_recommendations]
    recommended_titles = [id_to_title[title_to_id[col]] for col in recommendations if col in title_to_id]
    return recommended_titles

user_recommendations_gmm = {}

for user_id in user_item_matrix.index:
    user_recommendations_gmm[user_id] = recommend_talks(user_id, user_item_matrix, 'GMM_Cluster')
user_id_to_display = user_ids[0]
print(f"Recommendations for user {user_id_to_display} (GMM): {user_recommendations_gmm[user_id_to_display]}")

Recommendations for user e57cec766488c5a72d02dd6bcdbd1d67201ddc7f (GMM): ['ken robinson says schools kill creativity', "jill bolte taylor's stroke of insight", 'elizabeth gilbert: your elusive creative genius']


In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, pairwise_distances
import numpy as np

# Assuming user_item_matrix is already defined
scaler = StandardScaler()
user_item_matrix_scaled = scaler.fit_transform(user_item_matrix.drop(columns=['GMM_Cluster'], errors='ignore'))

# Fit GMM model
gmm = GaussianMixture(n_components=3, random_state=42)
user_item_matrix['GMM_Cluster'] = gmm.fit_predict(user_item_matrix_scaled)

# Extract GMM cluster labels
gmm_labels = user_item_matrix['GMM_Cluster']

# Silhouette Score
silhouette_avg_gmm = silhouette_score(user_item_matrix_scaled, gmm_labels)
print(f"Silhouette Score (GMM): {silhouette_avg_gmm}")

# Calinski-Harabasz Index
calinski_harabasz_gmm = calinski_harabasz_score(user_item_matrix_scaled, gmm_labels)
print(f"Calinski-Harabasz Index (GMM): {calinski_harabasz_gmm}")

# Davies-Bouldin Score
davies_bouldin_gmm = davies_bouldin_score(user_item_matrix_scaled, gmm_labels)
print(f"Davies-Bouldin Score (GMM): {davies_bouldin_gmm}")

# Inertia (approximation for GMM)
def gmm_inertia(data, model):
    centers = model.means_
    labels = model.predict(data)
    inertia = 0
    for i, center in enumerate(centers):
        cluster_points = data[labels == i]
        inertia += np.sum((cluster_points - center) ** 2)
    return inertia

inertia_gmm = gmm_inertia(user_item_matrix_scaled, gmm)
print(f"Inertia (GMM approximation): {inertia_gmm}")

# Dunn Index
def dunn_index(data, labels):
    distances = pairwise_distances(data)
    unique_labels = np.unique(labels)
    intra_cluster_distances = []
    for label in unique_labels:
        cluster_points = data[labels == label]
        if len(cluster_points) > 1:
            intra_cluster_distances.append(np.max(pairwise_distances(cluster_points)))
        else:
            intra_cluster_distances.append(0)
    inter_cluster_distances = []
    for i, label1 in enumerate(unique_labels):
        for label2 in unique_labels[i + 1:]:
            cluster_points1 = data[labels == label1]
            cluster_points2 = data[labels == label2]
            inter_cluster_distances.append(np.min(pairwise_distances(cluster_points1, cluster_points2)))
    return min(inter_cluster_distances) / max(intra_cluster_distances)

dunn_gmm = dunn_index(user_item_matrix_scaled, gmm_labels)
print(f"Dunn Index (GMM): {dunn_gmm}")


Silhouette Score (GMM): 0.8365892623658919
Calinski-Harabasz Index (GMM): 106.92004329798898
Davies-Bouldin Score (GMM): 1.2503146798236326
Inertia (GMM approximation): 13857249.90168556
Dunn Index (GMM): 0.6521960291084403


In [40]:
from sklearn.cluster import SpectralClustering
# Spectral Clustering
spectral_clustering = SpectralClustering(n_clusters=3, affinity='nearest_neighbors', random_state=42)
user_item_matrix['Spectral_Cluster'] = spectral_clustering.fit_predict(user_item_matrix_scaled)
spectral_clustering

In [42]:
print("Spectral Clustering Assignments:")
print(user_item_matrix['Spectral_Cluster'])

# Recommendations
user_recommendations_spectral = {}
for user_id in user_item_matrix.index:
    user_recommendations_spectral[user_id] = recommend_talks(user_id, user_item_matrix, 'Spectral_Cluster')
print(f"Recommendations for user {user_id_to_display} (Spectral): {user_recommendations_spectral[user_id_to_display]}")

Spectral Clustering Assignments:
e57cec766488c5a72d02dd6bcdbd1d67201ddc7f    0
4c3e7cf74b5c596cf234e9055a436a23d32cb1b7    0
394723943ac2a83beb72c860d77a8eca22087185    0
a2715f02d578bfc667e0fb4691f5a5b1572b9b2e    0
2c0871325f6f3e10bdeee9059d7a2e745929f702    0
                                           ..
8049cb3b03306049a46ccb33ed74272dbb05b2c6    0
f67e59e49cf88f06a7cb62ca5067d73a4d7d361a    0
3cf3758fdaa289febd7152b917b7e6af6973f337    0
510eaa7cb573cc7b2184d5b01ff004d943df3eaf    0
7cfdeb1bfdd06010293a0c96ccc3719ee5ec75f5    0
Name: Spectral_Cluster, Length: 12605, dtype: int32
Recommendations for user e57cec766488c5a72d02dd6bcdbd1d67201ddc7f (Spectral): ['ken robinson says schools kill creativity', "jill bolte taylor's stroke of insight", 'elizabeth gilbert: your elusive creative genius']


In [44]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, pairwise_distances
import numpy as np
from sklearn.cluster import SpectralClustering

# Spectral Clustering
spectral_clustering = SpectralClustering(n_clusters=3, affinity='nearest_neighbors', random_state=42)
user_item_matrix['Spectral_Cluster'] = spectral_clustering.fit_predict(user_item_matrix_scaled)

# Extract Spectral Clustering labels
spectral_labels = user_item_matrix['Spectral_Cluster']

# Silhouette Score
silhouette_avg_spectral = silhouette_score(user_item_matrix_scaled, spectral_labels)
print(f"Silhouette Score (Spectral): {silhouette_avg_spectral}")

# Calinski-Harabasz Index
calinski_harabasz_spectral = calinski_harabasz_score(user_item_matrix_scaled, spectral_labels)
print(f"Calinski-Harabasz Index (Spectral): {calinski_harabasz_spectral}")

# Davies-Bouldin Score
davies_bouldin_spectral = davies_bouldin_score(user_item_matrix_scaled, spectral_labels)
print(f"Davies-Bouldin Score (Spectral): {davies_bouldin_spectral}")

# Inertia (approximation)
def spectral_inertia(data, labels):
    unique_labels = np.unique(labels)
    centers = np.array([data[labels == label].mean(axis=0) for label in unique_labels])
    inertia = 0
    for label in unique_labels:
        cluster_points = data[labels == label]
        inertia += np.sum((cluster_points - centers[label]) ** 2)
    return inertia

inertia_spectral = spectral_inertia(user_item_matrix_scaled, spectral_labels)
print(f"Inertia (Spectral approximation): {inertia_spectral}")

# Dunn Index
def dunn_index(data, labels):
    distances = pairwise_distances(data)
    unique_labels = np.unique(labels)
    
    # Intra-cluster distances (max distance within each cluster)
    intra_cluster_distances = []
    for label in unique_labels:
        cluster_points = data[labels == label]
        if len(cluster_points) > 1:
            intra_cluster_distances.append(np.max(pairwise_distances(cluster_points)))
        else:
            intra_cluster_distances.append(0)
    
    # Inter-cluster distances (min distance between clusters)
    inter_cluster_distances = []
    for i, label1 in enumerate(unique_labels):
        for label2 in unique_labels[i + 1:]:
            cluster_points1 = data[labels == label1]
            cluster_points2 = data[labels == label2]
            inter_cluster_distances.append(np.min(pairwise_distances(cluster_points1, cluster_points2)))
    
    # Dunn Index = min(inter-cluster distances) / max(intra-cluster distances)
    return min(inter_cluster_distances) / max(intra_cluster_distances)

dunn_spectral = dunn_index(user_item_matrix_scaled, spectral_labels)
print(f"Dunn Index (Spectral): {dunn_spectral}")


Silhouette Score (Spectral): -0.4239781869534046
Calinski-Harabasz Index (Spectral): 1.1109659165428167
Davies-Bouldin Score (Spectral): 5.463403803465757
Inertia (Spectral approximation): 14089905.726864018
Dunn Index (Spectral): 0.0


In [46]:
scaler = StandardScaler()
user_item_matrix_scaled = scaler.fit_transform(user_item_matrix)

# DBSCAN Clustering
dbscan = DBSCAN(eps=1.5, min_samples=2, metric='cosine')  # eps: max distance, min_samples: min points per cluster
user_item_matrix['Cluster'] = dbscan.fit_predict(user_item_matrix_scaled)
dbscan

In [62]:
def recommend_talks_dbscan(user_id, user_item_matrix, n_recommendations=5):
    user_cluster = user_item_matrix.loc[user_id, 'Cluster']
    if user_cluster == -1:
        # User is an outlier, no recommendations availablen   
        return ["No recommendations (outlier user)"]
    
    cluster_users = user_item_matrix[user_item_matrix['Cluster'] == user_cluster]
    recommended_items = cluster_users.drop(['Cluster'], axis=1).drop(user_id).sum(axis=0).sort_values(ascending=False)
    recommendations = recommended_items[recommended_items > 0].index[:n_recommendations]
    return list(recommendations)

# Generate and display recommendations
user_recommendations_dbscan = {}
for user_id in user_item_matrix.index:
    user_recommendations_dbscan[user_id] = recommend_talks_dbscan(user_id, user_item_matrix)

In [64]:
user_id_to_display = user_ids[0]
print(f"Recommendations for user {user_id_to_display}: {user_recommendations_dbscan[user_id_to_display]}")

Recommendations for user e57cec766488c5a72d02dd6bcdbd1d67201ddc7f: ['GMM_Cluster', 'ken robinson says schools kill creativity', "jill bolte taylor's stroke of insight", 'elizabeth gilbert: your elusive creative genius', 'sir ken robinson: bring on the learning revolution!']


In [67]:
# Extract DBSCAN labels
dbscan_labels = user_item_matrix['Cluster']

dbscan_labels_silhouette = dbscan_labels[dbscan_labels != -1]
user_item_matrix_scaled_silhouette = user_item_matrix_scaled[dbscan_labels != -1]

silhouette_avg_dbscan = silhouette_score(user_item_matrix_scaled_silhouette, dbscan_labels_silhouette)
print(f"Silhouette Score (DBSCAN): {silhouette_avg_dbscan}")


calinski_harabasz_dbscan = calinski_harabasz_score(user_item_matrix_scaled_silhouette, dbscan_labels_silhouette)
print(f"Calinski-Harabasz Index (DBSCAN): {calinski_harabasz_dbscan}")


davies_bouldin_dbscan = davies_bouldin_score(user_item_matrix_scaled_silhouette, dbscan_labels_silhouette)
print(f"Davies-Bouldin Score (DBSCAN): {davies_bouldin_dbscan}")


def dbscan_inertia(data, labels):
    unique_labels = np.unique(labels)
    inertia = 0
    # Calculate the cluster centers
    centers = np.array([data[labels == label].mean(axis=0) for label in unique_labels if label != -1])  # Exclude noise points
    for label in unique_labels:
        if label != -1:
            cluster_points = data[labels == label]
            inertia += np.sum((cluster_points - centers[label]) ** 2)
    return inertia

inertia_dbscan = dbscan_inertia(user_item_matrix_scaled, dbscan_labels)
print(f"Inertia (DBSCAN approximation): {inertia_dbscan}")


def dunn_index(data, labels):
    distances = pairwise_distances(data)
    unique_labels = np.unique(labels)
    

    intra_cluster_distances = []
    for label in unique_labels:
        if label == -1:  # Skip noise points
            continue
        cluster_points = data[labels == label]
        if len(cluster_points) > 1:
            intra_cluster_distances.append(np.max(pairwise_distances(cluster_points)))
        else:
            intra_cluster_distances.append(0)

    inter_cluster_distances = []
    for i, label1 in enumerate(unique_labels):
        if label1 == -1:
            continue
        for label2 in unique_labels[i + 1:]:
            if label2 == -1:
                continue
            cluster_points1 = data[labels == label1]
            cluster_points2 = data[labels == label2]
            inter_cluster_distances.append(np.min(pairwise_distances(cluster_points1, cluster_points2)))
    

    return min(inter_cluster_distances) / max(intra_cluster_distances)

dunn_dbscan = dunn_index(user_item_matrix_scaled, dbscan_labels)
print(f"Dunn Index (DBSCAN): {dunn_dbscan}")

Silhouette Score (DBSCAN): 0.4884512987985067
Calinski-Harabasz Index (DBSCAN): 199.1400101342609
Davies-Bouldin Score (DBSCAN): 7.107526824793923
Inertia (DBSCAN approximation): 13685089.469043996
Dunn Index (DBSCAN): 0.03870306325558448


In [69]:
n_clusters = 3  
hac = AgglomerativeClustering(n_clusters=n_clusters, metric='euclidean', linkage='ward')
user_item_matrix['Cluster'] = hac.fit_predict(user_item_matrix)
hac

In [54]:
print("User Clusters:")
print(user_item_matrix['Cluster'])

User Clusters:
e57cec766488c5a72d02dd6bcdbd1d67201ddc7f    1
4c3e7cf74b5c596cf234e9055a436a23d32cb1b7    1
394723943ac2a83beb72c860d77a8eca22087185    1
a2715f02d578bfc667e0fb4691f5a5b1572b9b2e    1
2c0871325f6f3e10bdeee9059d7a2e745929f702    1
                                           ..
8049cb3b03306049a46ccb33ed74272dbb05b2c6    1
f67e59e49cf88f06a7cb62ca5067d73a4d7d361a    1
3cf3758fdaa289febd7152b917b7e6af6973f337    1
510eaa7cb573cc7b2184d5b01ff004d943df3eaf    1
7cfdeb1bfdd06010293a0c96ccc3719ee5ec75f5    0
Name: Cluster, Length: 12605, dtype: int64


In [56]:
# Generate Recommendations
def recommend_talks_hac(user_id, user_item_matrix, n_recommendations=5):
    user_cluster = user_item_matrix.loc[user_id, 'Cluster']
    cluster_users = user_item_matrix[user_item_matrix['Cluster'] == user_cluster]
    recommended_items = cluster_users.drop(['Cluster'], axis=1).drop(user_id).sum(axis=0).sort_values(ascending=False)
    recommendations = recommended_items[recommended_items > 0].index[:n_recommendations]
    return list(recommendations)

# Generate and display recommendations
user_recommendations_hac = {}
for user_id in user_item_matrix.index:
    user_recommendations_hac[user_id] = recommend_talks_hac(user_id, user_item_matrix)

user_id_to_display = user_ids[0]
print(f"Recommendations for user {user_id_to_display}: {user_recommendations_hac[user_id_to_display]}")

Recommendations for user e57cec766488c5a72d02dd6bcdbd1d67201ddc7f: ['GMM_Cluster', 'ken robinson says schools kill creativity', "jill bolte taylor's stroke of insight", 'elizabeth gilbert: your elusive creative genius', 'sir ken robinson: bring on the learning revolution!']


In [61]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cdist


# Step 3: Calculate the clustering metrics

# 3.1: Silhouette Score
# We need the feature matrix for silhouette score, which could be the user-item matrix or a reduced version (e.g., PCA).
X = user_item_matrix.drop('Cluster', axis=1)
silhouette_avg = silhouette_score(X, user_item_matrix['Cluster'])
print(f"Silhouette Score: {silhouette_avg}")

# 3.2: Calinski-Harabasz Index
ch_score = calinski_harabasz_score(X, user_item_matrix['Cluster'])
print(f"Calinski-Harabasz Index: {ch_score}")

# 3.3: Davies-Bouldin Score
db_score = davies_bouldin_score(X, user_item_matrix['Cluster'])
print(f"Davies-Bouldin Score: {db_score}")

# 3.4: Inertia (approximation)
# Inertia is typically associated with KMeans, but we can compute an approximation using pairwise distances for HAC.
# Calculate distances within clusters
distances = pairwise_distances(X)
inertia = np.sum(np.min(distances, axis=1))
print(f"Inertia (approximation): {inertia}")

# 3.5: Dunn Index
# The Dunn Index requires the distances between clusters and the distance within clusters.
# We will compute this using the minimum distance between clusters and the maximum distance within clusters.
def dunn_index(X, labels):
    unique_labels = np.unique(labels)
    cluster_distances = np.array([[np.min(cdist(X[labels == l1], X[labels == l2])) for l2 in unique_labels] for l1 in unique_labels])
    cluster_distances[np.diag_indices_from(cluster_distances)] = np.inf  # Ignore self-distances
    min_intercluster_distance = np.min(cluster_distances)
    
    intra_distances = [np.mean(cdist(X[labels == label], X[labels == label])) for label in unique_labels]
    max_intracluster_distance = np.max(intra_distances)
    
    return min_intercluster_distance / max_intracluster_distance

dunn_score = dunn_index(X, user_item_matrix['Cluster'])
print(f"Dunn Index: {dunn_score}")


Silhouette Score: 0.46181406242864365
Calinski-Harabasz Index: 286.14934623079694
Davies-Bouldin Score: 5.593234240027816
Inertia (approximation): 0.0
Dunn Index: 0.2044287125539241
