# 3. Clustering - Pol.is Math Python Implementation

Clustering is a critical component of the Pol.is system. It groups participants into opinion groups based on their voting patterns. This notebook explores the clustering implementation in the Python conversion.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from polismath.math.named_matrix import NamedMatrix
from polismath.math.pca import pca_project_named_matrix
from polismath.math.clusters import cluster_named_matrix

## 3.1 Creating a Test Vote Matrix with Distinct Groups

We'll create a vote matrix with three clear opinion groups to demonstrate clustering.

In [None]:
# Build a fully-populated vote matrix containing three clear opinion groups
num_participants = 150
num_comments = 30
participant_ids = ["p" + str(n) for n in range(num_participants)]
comment_ids = ["c" + str(n) for n in range(num_comments)]

# Start from an all-NaN matrix; every cell is overwritten below
votes_matrix = np.full((num_participants, num_comments), np.nan)

# Group layout:
#   Group 1 (participants 0-49):    agrees with comments 0-9,   disagrees with the rest
#   Group 2 (participants 50-99):   agrees with comments 10-19, disagrees with the rest
#   Group 3 (participants 100-149): agrees with comments 20-29, disagrees with the rest
# Each participant gets a group index (0, 1, 2) and each comment gets the index
# of the group that agrees with it; a vote is +1 exactly where the two match.
participant_group = np.minimum(np.arange(num_participants) // 50, 2)
comment_owner = np.minimum(np.arange(num_comments) // 10, 2)
agrees = participant_group[:, np.newaxis] == comment_owner[np.newaxis, :]
votes_matrix[:, :] = np.where(agrees, 1, -1)  # 1 = agree, -1 = disagree

# Wrap the raw array together with its row/column names
vote_matrix = NamedMatrix(votes_matrix, participant_ids, comment_ids)

print(f"Created vote matrix with {len(participant_ids)} participants and {len(comment_ids)} comments")
print(f"Number of votes: {np.sum(~np.isnan(votes_matrix))}")

## 3.2 Performing PCA and Visualizing Participants

Before clustering, we'll perform PCA and visualize the participants in 2D space to see if there are clear groups.

In [None]:
from matplotlib.lines import Line2D

# Project the participants with PCA
print("Running PCA...")
pca_results, proj_dict = pca_project_named_matrix(vote_matrix)

# Keep only participants that have a projection, remembering each one's
# original position so the known group can be derived from it
projected = [(idx, pid) for idx, pid in enumerate(participant_ids) if pid in proj_dict]
x_coords = [proj_dict[pid][0] for _, pid in projected]
y_coords = [proj_dict[pid][1] for _, pid in projected]
true_groups = [0 if idx < 50 else (1 if idx < 100 else 2) for idx, _ in projected]

# One colour per known group
group_palette = {0: "blue", 1: "red", 2: "green"}
colors = [group_palette[g] for g in true_groups]

# Scatter plot of the first two projection coordinates
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(x_coords, y_coords, c=colors, alpha=0.6, s=50)

ax.set_title("PCA Projection of Participants - Three Groups")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")

# Proxy artists so the legend shows one entry per group rather than per point
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=group_palette[g],
           markersize=10, label=f"Group {g + 1}")
    for g in (0, 1, 2)
]
ax.legend(handles=legend_elements)

ax.grid(True, linestyle="--", alpha=0.7)

plt.show()

## 3.3 Running Clustering on the PCA Projections

Now we'll run the clustering algorithm to identify the opinion groups automatically.

In [None]:
# Assemble the projections into a NamedMatrix: one row per projected
# participant, one column per projection dimension
proj_participants = [pid for pid in participant_ids if pid in proj_dict]
proj_matrix = np.array([proj_dict[pid] for pid in proj_participants])
dimension_names = [f"dim{d}" for d in range(proj_matrix.shape[1])]
proj_named_matrix = NamedMatrix(proj_matrix, proj_participants, dimension_names)

# We know the synthetic data contains 3 groups, so ask for k=3
print("Running clustering with k=3...")
clusters = cluster_named_matrix(proj_named_matrix, k=3)

# Summarise each returned cluster: its size and a few member IDs
print("\nClustering Results:")
for cluster_number, members in enumerate(clusters):
    print(f"Cluster {cluster_number}: {len(members)} participants")
    print(f"  First 5 members: {members[:5]}")

## 3.4 Visualizing the Clustering Results

Let's visualize the PCA projections again, but this time colored by the detected clusters.

In [None]:
# Invert the cluster listing: participant ID -> index of the cluster holding it
id_to_cluster = {
    pid: cluster_idx
    for cluster_idx, cluster_members in enumerate(clusters)
    for pid in cluster_members
}

# Gather coordinates, detected cluster and known group for every participant
# that has both a projection and a cluster assignment
x_coords_clustered = []
y_coords_clustered = []
assigned_clusters = []
true_groups_clustered = []

for position, pid in enumerate(participant_ids):
    if pid not in proj_dict or pid not in id_to_cluster:
        continue
    point = proj_dict[pid]
    x_coords_clustered.append(point[0])
    y_coords_clustered.append(point[1])
    assigned_clusters.append(id_to_cluster[pid])
    # Known group follows from the participant's position (0-49, 50-99, 100+)
    true_groups_clustered.append(0 if position < 50 else (1 if position < 100 else 2))

# Side-by-side panels: known groups on the left, detected clusters on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

panels = [
    (ax1, true_groups_clustered, ("blue", "red", "green"),
     "Participants Colored by True Groups", "True Group"),
    (ax2, assigned_clusters, ("purple", "orange", "cyan"),
     "Participants Colored by Detected Clusters", "Cluster"),
]

for axis, labels, palette, title, legend_prefix in panels:
    # Labels 0 and 1 take the first two colours; anything else takes the third
    point_colors = [
        palette[0] if label == 0 else palette[1] if label == 1 else palette[2]
        for label in labels
    ]
    axis.scatter(x_coords_clustered, y_coords_clustered, c=point_colors, alpha=0.6, s=50)
    axis.set_title(title)
    axis.set_xlabel("Principal Component 1")
    axis.set_ylabel("Principal Component 2")
    # Proxy artists give one legend entry per colour
    handles = [
        Line2D([0], [0], marker='o', color='w', markerfacecolor=shade,
               markersize=10, label=f"{legend_prefix} {number}")
        for number, shade in enumerate(palette, start=1)
    ]
    axis.legend(handles=handles)
    axis.grid(True, linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

## 3.5 Evaluating Clustering Performance

Let's evaluate how well the clustering algorithm performed by comparing the detected clusters to the true groups.

In [None]:
from sklearn.metrics import adjusted_rand_score, confusion_matrix, normalized_mutual_info_score
import seaborn as sns

# Agreement scores between the known groups and the detected clusters.
# Both metrics are invariant to how the cluster indices are permuted, so it
# does not matter which detected cluster happens to be numbered 0, 1 or 2.
ari = adjusted_rand_score(true_groups_clustered, assigned_clusters)
nmi = normalized_mutual_info_score(true_groups_clustered, assigned_clusters)

print("Clustering Evaluation Metrics:")
print(f"Adjusted Rand Index: {ari:.3f} (1.0 is perfect clustering)")
print(f"Normalized Mutual Information: {nmi:.3f} (1.0 is perfect clustering)")

# Confusion matrix: rows are known groups, columns are detected clusters.
# The label set is pinned so the matrix is always 3x3 and lines up with the
# tick labels even if a cluster turns out empty. Because cluster numbering is
# arbitrary, a perfect result may appear as a permuted diagonal.
label_values = [0, 1, 2]
cm = confusion_matrix(true_groups_clustered, assigned_clusters, labels=label_values)

fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
# Tick labels are 1-based to match the "Group 1..3" / "Cluster 1..3" naming
# used in the scatter-plot legends and the surrounding text.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=[f"Cluster {value + 1}" for value in label_values],
            yticklabels=[f"True Group {value + 1}" for value in label_values],
            ax=ax_cm)
ax_cm.set_title("Confusion Matrix: True Groups vs Detected Clusters")
ax_cm.set_ylabel("True Group")
ax_cm.set_xlabel("Detected Cluster")
fig_cm.tight_layout()
plt.show()

## 3.6 Testing Clustering with Noisy Data

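Let's create a more realistic scenario: each participant votes on only about 70% of the comments, and about 20% of the votes that are cast go against the position of the participant's group.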

In [None]:
# Create a vote matrix with three opinion groups, but with noisy and missing votes
num_participants = 150
num_comments = 30
participant_ids_noisy = [f"p{i}" for i in range(num_participants)]
comment_ids_noisy = [f"c{i}" for i in range(num_comments)]

# A dedicated, seeded generator makes this cell produce the same matrix on
# every run (Restart & Run All) without touching NumPy's global random state.
noise_rng = np.random.default_rng(42)
vote_probability = 0.7       # chance that a participant votes on a given comment
on_group_probability = 0.8   # chance that a cast vote follows the group's position

# Group position for every (participant, comment) pair: +1 where the
# participant's group agrees with the comment, -1 elsewhere.
#   Group 1 (participants 0-49)    agrees with comments 0-9
#   Group 2 (participants 50-99)   agrees with comments 10-19
#   Group 3 (participants 100-149) agrees with comments 20-29
participant_group = np.minimum(np.arange(num_participants) // 50, 2)
comment_owner = np.minimum(np.arange(num_comments) // 10, 2)
group_position = np.where(
    participant_group[:, np.newaxis] == comment_owner[np.newaxis, :], 1.0, -1.0
)

# Draw, independently for each cell, whether a vote is cast and whether that
# vote follows the group position (otherwise it is flipped).
matrix_shape = (num_participants, num_comments)
has_voted = noise_rng.random(matrix_shape) < vote_probability
follows_group = noise_rng.random(matrix_shape) < on_group_probability
cast_votes = np.where(follows_group, group_position, -group_position)

# Cells without a vote stay NaN (no vote recorded)
votes_matrix_noisy = np.full(matrix_shape, np.nan)
votes_matrix_noisy[has_voted] = cast_votes[has_voted]

# Create the NamedMatrix
vote_matrix_noisy = NamedMatrix(votes_matrix_noisy, participant_ids_noisy, comment_ids_noisy)

print(f"Created noisy vote matrix with {len(participant_ids_noisy)} participants and {len(comment_ids_noisy)} comments")
num_votes = np.sum(~np.isnan(votes_matrix_noisy))
print(f"Number of votes: {num_votes} ({num_votes/(num_participants*num_comments)*100:.1f}% of possible votes)")
print(f"Number of agrees: {np.sum(votes_matrix_noisy == 1)} ({np.sum(votes_matrix_noisy == 1)/num_votes*100:.1f}%)")
print(f"Number of disagrees: {np.sum(votes_matrix_noisy == -1)} ({np.sum(votes_matrix_noisy == -1)/num_votes*100:.1f}%)")

In [None]:
# PCA on the noisy vote matrix
print("Running PCA on noisy data...")
pca_results_noisy, proj_dict_noisy = pca_project_named_matrix(vote_matrix_noisy)

# Assemble the projections into a NamedMatrix: one row per projected
# participant, one column per projection dimension
proj_participants_noisy = [pid for pid in participant_ids_noisy if pid in proj_dict_noisy]
proj_matrix_noisy = np.array([proj_dict_noisy[pid] for pid in proj_participants_noisy])
dimension_names_noisy = [f"dim{d}" for d in range(proj_matrix_noisy.shape[1])]
proj_named_matrix_noisy = NamedMatrix(proj_matrix_noisy, proj_participants_noisy, dimension_names_noisy)

# Cluster the noisy projections, again asking for three groups
print("Running clustering with k=3 on noisy data...")
clusters_noisy = cluster_named_matrix(proj_named_matrix_noisy, k=3)

# Summarise each returned cluster: its size and a few member IDs
print("\nClustering Results on Noisy Data:")
for cluster_number, members in enumerate(clusters_noisy):
    print(f"Cluster {cluster_number}: {len(members)} participants")
    print(f"  First 5 members: {members[:5]}")

In [None]:
# Invert the noisy cluster listing: participant ID -> index of its cluster
id_to_cluster_noisy = {
    pid: cluster_idx
    for cluster_idx, cluster_members in enumerate(clusters_noisy)
    for pid in cluster_members
}

# Gather coordinates, detected cluster and known group for every participant
# that has both a projection and a cluster assignment
x_coords_noisy = []
y_coords_noisy = []
assigned_clusters_noisy = []
true_groups_noisy = []

for position, pid in enumerate(participant_ids_noisy):
    if pid not in proj_dict_noisy or pid not in id_to_cluster_noisy:
        continue
    point = proj_dict_noisy[pid]
    x_coords_noisy.append(point[0])
    y_coords_noisy.append(point[1])
    assigned_clusters_noisy.append(id_to_cluster_noisy[pid])
    # Known group follows from the participant's position (0-49, 50-99, 100+)
    true_groups_noisy.append(0 if position < 50 else (1 if position < 100 else 2))

# Side-by-side panels: known groups on the left, detected clusters on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

panels_noisy = [
    (ax1, true_groups_noisy, ("blue", "red", "green"),
     "Noisy Data: Participants Colored by True Groups", "True Group"),
    (ax2, assigned_clusters_noisy, ("purple", "orange", "cyan"),
     "Noisy Data: Participants Colored by Detected Clusters", "Cluster"),
]

for axis, labels, palette, title, legend_prefix in panels_noisy:
    # Labels 0 and 1 take the first two colours; anything else takes the third
    point_colors = [
        palette[0] if label == 0 else palette[1] if label == 1 else palette[2]
        for label in labels
    ]
    axis.scatter(x_coords_noisy, y_coords_noisy, c=point_colors, alpha=0.6, s=50)
    axis.set_title(title)
    axis.set_xlabel("Principal Component 1")
    axis.set_ylabel("Principal Component 2")
    # Proxy artists give one legend entry per colour
    handles = [
        Line2D([0], [0], marker='o', color='w', markerfacecolor=shade,
               markersize=10, label=f"{legend_prefix} {number}")
        for number, shade in enumerate(palette, start=1)
    ]
    axis.legend(handles=handles)
    axis.grid(True, linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

# Score the detected clusters against the known groups
ari_noisy = adjusted_rand_score(true_groups_noisy, assigned_clusters_noisy)
nmi_noisy = normalized_mutual_info_score(true_groups_noisy, assigned_clusters_noisy)

print("Clustering Evaluation Metrics for Noisy Data:")
print(f"Adjusted Rand Index: {ari_noisy:.3f} (1.0 is perfect clustering)")
print(f"Normalized Mutual Information: {nmi_noisy:.3f} (1.0 is perfect clustering)")

## 3.7 Testing Automatic K Selection

The Pol.is clustering system can automatically determine the optimal number of clusters using silhouette analysis.

In [None]:
from polismath.math.clusters import best_k, silhouette_score

# Let the library choose k for the clean projections, searching k = 2..5.
# The second return value is used below as a mapping from k to its score.
print("Determining optimal number of clusters...")
selected_k, silhouettes = best_k(proj_named_matrix, min_k=2, max_k=5)

candidate_ks = list(silhouettes.keys())
candidate_scores = list(silhouettes.values())

print(f"Best number of clusters: {selected_k}")
print("Silhouette scores:")
for candidate, candidate_score in zip(candidate_ks, candidate_scores):
    print(f"k={candidate}: {candidate_score:.3f}")

# Plot score against k and highlight the selected k in red
sil_fig, sil_ax = plt.subplots(figsize=(10, 6))
sil_ax.plot(candidate_ks, candidate_scores, 'o-')
sil_ax.scatter([selected_k], [silhouettes[selected_k]], c='red', s=100, zorder=10)
sil_ax.set_title('Silhouette Scores for Different Numbers of Clusters')
sil_ax.set_xlabel('Number of Clusters (k)')
sil_ax.set_ylabel('Silhouette Score')
sil_ax.set_xticks(candidate_ks)
sil_ax.grid(True)
plt.show()

## 3.8 Examining the Clustering Algorithm Implementation

In [None]:
# Print the source code of the library's kmeans function for inspection
import inspect
from polismath.math.clusters import kmeans

kmeans_source = inspect.getsource(kmeans)

print("K-means Clustering Implementation:")
print(kmeans_source)

## 3.9 Weighted K-means

The Pol.is clustering implementation supports weighted k-means, where some participants can have more influence on the cluster centroids.

In [None]:
# Draw one random weight in [0.5, 1.5) per projected participant
np.random.seed(42)  # For reproducibility
weights = np.random.uniform(0.5, 1.5, len(proj_participants))
weights_dict = dict(zip(proj_participants, weights))

# Cluster again, this time passing the per-participant weights
print("Running weighted clustering with k=3...")
clusters_weighted = cluster_named_matrix(proj_named_matrix, k=3, weights=weights_dict)

# Summarise each returned cluster: its size and a few member IDs
print("\nWeighted Clustering Results:")
for cluster_number, members in enumerate(clusters_weighted):
    print(f"Cluster {cluster_number}: {len(members)} participants")
    print(f"  First 5 members: {members[:5]}")

# Invert the cluster listing: participant ID -> index of the cluster holding it
id_to_cluster_weighted = {
    pid: cluster_idx
    for cluster_idx, cluster_members in enumerate(clusters_weighted)
    for pid in cluster_members
}

# Gather coordinates, cluster and weight for every participant that has both
# a projection and a cluster assignment
x_coords_weighted = []
y_coords_weighted = []
assigned_clusters_weighted = []
point_weights = []

for pid in participant_ids:
    if pid not in proj_dict or pid not in id_to_cluster_weighted:
        continue
    point = proj_dict[pid]
    x_coords_weighted.append(point[0])
    y_coords_weighted.append(point[1])
    assigned_clusters_weighted.append(id_to_cluster_weighted[pid])
    point_weights.append(weights_dict.get(pid, 1.0))  # fall back to weight 1.0

# Scatter plot: colour encodes the cluster, marker area scales with the weight
cluster_palette = ("purple", "orange", "cyan")
point_colors = [
    cluster_palette[0] if label == 0 else cluster_palette[1] if label == 1 else cluster_palette[2]
    for label in assigned_clusters_weighted
]
point_sizes = [weight * 50 for weight in point_weights]

weighted_fig, weighted_ax = plt.subplots(figsize=(12, 10))
weighted_ax.scatter(x_coords_weighted, y_coords_weighted,
                    c=point_colors, s=point_sizes, alpha=0.6)
weighted_ax.set_title("Weighted Clustering Results (point size indicates weight)")
weighted_ax.set_xlabel("Principal Component 1")
weighted_ax.set_ylabel("Principal Component 2")
# Proxy artists give one legend entry per cluster colour
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=shade,
           markersize=10, label=f"Cluster {number}")
    for number, shade in enumerate(cluster_palette, start=1)
]
weighted_ax.legend(handles=legend_elements)
weighted_ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

## 3.10 Summary

The Pol.is clustering implementation:

1. Takes the PCA projections of participants and groups them into opinion clusters
2. Uses a k-means algorithm with optional weighting of participants
3. Can automatically determine the optimal number of clusters using silhouette analysis
4. Works well even with noisy and sparse data
5. Successfully identifies distinct opinion groups in the conversation

These clusters form the basis for the opinion group analysis in Pol.is, enabling the identification of representative comments for each group.