# 4. Representativeness - Pol.is Math Python Implementation

Representativeness analysis is a key feature of Pol.is. It identifies comments that best represent the opinions of each cluster of participants. This notebook explores the representativeness calculation in the Python conversion.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from polismath.math.named_matrix import NamedMatrix
from polismath.math.pca import pca_project_named_matrix
from polismath.math.clusters import cluster_named_matrix
from polismath.math.repness import conv_repness

## 4.1 Creating a Test Vote Matrix

We'll create a vote matrix with several opinion groups and distinctive voting patterns.

In [None]:
# Build a synthetic vote matrix in which three equally sized groups each agree
# with their "own" third of the comments and disagree with all the others.
num_participants = 150
num_comments = 30
num_groups = 3
vote_probability = 0.85  # chance that a participant votes on any given comment
random_seed = 42         # fixed so that Restart & Run All reproduces the same matrix

participant_ids = [f"p{i}" for i in range(num_participants)]
comment_ids = [f"c{i}" for i in range(num_comments)]

# A dedicated, seeded generator keeps this cell deterministic without
# touching NumPy's global random state.
rng = np.random.default_rng(random_seed)

# Group layout (by position):
#   Group 1 (participants 0-49):    agrees with comments 0-9,   disagrees with the rest
#   Group 2 (participants 50-99):   agrees with comments 10-19, disagrees with the rest
#   Group 3 (participants 100-149): agrees with comments 20-29, disagrees with the rest
# np.minimum folds any remainder into the last group when the sizes do not divide evenly.
participant_group = np.minimum(
    np.arange(num_participants) // (num_participants // num_groups), num_groups - 1
)
comment_group = np.minimum(
    np.arange(num_comments) // (num_comments // num_groups), num_groups - 1
)

# agrees_with[p, c] is True when comment c belongs to participant p's group
agrees_with = participant_group[:, np.newaxis] == comment_group[np.newaxis, :]
# has_voted[p, c] is True for the cells that receive a vote at all
has_voted = rng.random((num_participants, num_comments)) < vote_probability

# 1 = agree, -1 = disagree, NaN = did not vote
votes_matrix = np.where(has_voted, np.where(agrees_with, 1.0, -1.0), np.nan)

# Create the NamedMatrix
vote_matrix = NamedMatrix(votes_matrix, participant_ids, comment_ids)

print(f"Created vote matrix with {len(participant_ids)} participants and {len(comment_ids)} comments")
num_votes = np.sum(~np.isnan(votes_matrix))
num_agrees = np.sum(votes_matrix == 1)
num_disagrees = np.sum(votes_matrix == -1)
print(f"Number of votes: {num_votes} ({num_votes/(num_participants*num_comments)*100:.1f}% of possible votes)")
print(f"Number of agrees: {num_agrees} ({num_agrees/num_votes*100:.1f}%)")
print(f"Number of disagrees: {num_disagrees} ({num_disagrees/num_votes*100:.1f}%)")

## 4.2 Performing PCA and Clustering

Before calculating representativeness, we need to perform PCA and clustering to identify the opinion groups.

In [None]:
# Step 1: project the vote matrix with PCA
print("Running PCA...")
pca_results, proj_dict = pca_project_named_matrix(vote_matrix)

# Step 2: stack the projections of every participant that received one into a
# NamedMatrix (rows = participants, columns = projection dimensions)
proj_participants = [pid for pid in participant_ids if pid in proj_dict]
proj_matrix = np.array([proj_dict[pid] for pid in proj_participants])
dimension_names = [f"dim{d}" for d in range(proj_matrix.shape[1])]
proj_named_matrix = NamedMatrix(proj_matrix, proj_participants, dimension_names)

# Step 3: cluster the projections; k=3 because the synthetic data has 3 groups
print("Running clustering with k=3...")
clusters = cluster_named_matrix(proj_named_matrix, k=3)

# Step 4: summarise each cluster
# NOTE(review): this treats every cluster as a sliceable sequence of member
# IDs -- confirm against cluster_named_matrix's actual return type.
print("\nClustering Results:")
for cluster_number, members in enumerate(clusters):
    print(f"Cluster {cluster_number}: {len(members)} participants")
    print(f"  First 5 members: {members[:5]}")

## 4.3 Visualizing the PCA and Clustering Results

In [None]:
# Map every participant ID to the index of the cluster that contains it
id_to_cluster = {
    pid: cluster_idx
    for cluster_idx, cluster_members in enumerate(clusters)
    for pid in cluster_members
}

# Only participants that have both a projection and a cluster assignment can be plotted
plotted_ids = [pid for pid in participant_ids if pid in proj_dict and pid in id_to_cluster]
x_coords = np.array([proj_dict[pid][0] for pid in plotted_ids])
y_coords = np.array([proj_dict[pid][1] for pid in plotted_ids])
assigned_clusters = np.array([id_to_cluster[pid] for pid in plotted_ids])

# One colour per cluster index; the same 0-based index is used for the legend
# label so the plot agrees with the "Cluster 0/1/2" naming used elsewhere.
cluster_palette = ["purple", "orange", "cyan"]

fig, ax = plt.subplots(figsize=(12, 10))

# Draw one labelled scatter per cluster so the legend is generated from the
# plotted data itself instead of hand-built proxy handles.
for cluster_idx in range(len(clusters)):
    in_cluster = assigned_clusters == cluster_idx
    ax.scatter(
        x_coords[in_cluster],
        y_coords[in_cluster],
        color=cluster_palette[cluster_idx % len(cluster_palette)],
        alpha=0.7,
        s=60,
        label=f"Cluster {cluster_idx}",
    )

ax.legend()

# Add title and labels
ax.set_title("PCA Projection with Cluster Assignments")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

## 4.4 Calculating Representativeness

Now we'll calculate the representativeness of each comment for each cluster.

In [None]:
# conv_repness is handed the clusters as a dict keyed by the cluster index
# rendered as a string ("0", "1", ...), each mapping to that cluster's entry.
formatted_clusters = {str(idx): members for idx, members in enumerate(clusters)}

# Run the representativeness calculation on the full vote matrix
print("Calculating representativeness...")
repness = conv_repness(vote_matrix, formatted_clusters)

# Show which top-level keys and which group keys came back
print(
    "\nRepresentativeness Results Structure:",
    f"Keys: {list(repness.keys())}",
    f"Group repness keys: {list(repness['group_repness'].keys())}",
    sep="\n",
)

## 4.5 Analyzing Representativeness Results

Let's examine the representative comments for each cluster.

In [None]:
# For every cluster, list its five strongest "agree" and five strongest
# "disagree" comments, ranked by the absolute repness value.
report_sections = (
    ("Top 'Agree' comments:", 'agree'),
    ("\nTop 'Disagree' comments:", 'disagree'),
)

for group_id, comments in repness['group_repness'].items():
    print(f"\nTop Representative Comments for Cluster {group_id}:")
    print("-" * 50)

    # Strongest representativeness first, regardless of sign
    ranked = sorted(comments, key=lambda entry: abs(entry['repness']), reverse=True)

    for heading, direction in report_sections:
        print(heading)
        top_entries = [entry for entry in ranked if entry['repful'] == direction][:5]
        for position, entry in enumerate(top_entries, start=1):
            print(f"{position}. Comment {entry['comment_id']}: repness={entry['repness']:.3f}, z-score={entry['repness_z']:.3f}")

## 4.6 Visualizing Comment Agreement by Cluster

Let's create a visualization of how each cluster voted on each comment.

In [None]:
def _mean_vote(votes):
    """Return the mean of the non-NaN entries of `votes`, or 0 if there are none."""
    cast_votes = votes[~np.isnan(votes)]
    return np.mean(cast_votes) if len(cast_votes) > 0 else 0


# Average vote per comment within each cluster, keyed by cluster index.
# A comment nobody in the cluster voted on is recorded as 0.
cluster_comment_agreement = {}
for cluster_idx, cluster_members in enumerate(clusters):
    # Restrict the vote matrix to this cluster's participants
    cluster_matrix = vote_matrix.rowname_subset(cluster_members)
    cluster_comment_agreement[cluster_idx] = [
        _mean_vote(cluster_matrix.get_column_by_name(cid)) for cid in comment_ids
    ]

# Rows = clusters, columns = comments
agreement_matrix = np.array(
    [cluster_comment_agreement[idx] for idx in range(len(clusters))]
)

# Heatmap on a fixed [-1, 1] scale using the diverging RdYlGn colormap:
# red = disagree, yellow = neutral, green = agree.
fig, ax = plt.subplots(figsize=(15, 8))
im = ax.imshow(agreement_matrix, cmap=plt.cm.RdYlGn, norm=plt.Normalize(-1, 1))

# One tick per comment (x) and per cluster (y)
ax.set_xticks(np.arange(len(comment_ids)))
ax.set_xticklabels(comment_ids)
ax.set_yticks(np.arange(len(clusters)))
ax.set_yticklabels([f"Cluster {idx}" for idx in range(len(clusters))])

# Turn the comment labels vertical so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90, ha="right", rotation_mode="anchor")

# Colorbar with human-readable labels in place of raw numbers
cbar = fig.colorbar(im, ticks=[-1, -0.5, 0, 0.5, 1])
cbar.ax.set_yticklabels(['Disagree', 'Mostly Disagree', 'Neutral', 'Mostly Agree', 'Agree'])

ax.set(
    title="Average Agreement by Cluster for Each Comment",
    xlabel="Comments",
    ylabel="Clusters",
)

fig.tight_layout()
plt.show()

## 4.7 Understanding the Z-scores

The representativeness calculation uses z-scores to identify comments with statistically significant differences in voting patterns between clusters.

In [None]:
# Tabulate the repness records of cluster "0" and rank them by |z-score|
df_cluster0 = pd.DataFrame(repness['group_repness']['0'])
df_cluster0_sorted = df_cluster0.sort_values(by='repness_z', key=abs, ascending=False)

print("Top 10 Representative Comments for Cluster 0 by Z-score:")
print(df_cluster0_sorted[['comment_id', 'repful', 'repness', 'repness_z']].head(10))

# Scatter the z-scores against the comment number, one series per direction
fig, ax = plt.subplots(figsize=(12, 8))

series_styles = (
    ('agree', 'green', 'Agree'),
    ('disagree', 'red', 'Disagree'),
)
for direction, colour, legend_label in series_styles:
    subset = df_cluster0[df_cluster0['repful'] == direction]
    # Comment IDs look like "c12"; drop the leading letter to get the number
    comment_numbers = subset['comment_id'].apply(lambda cid: int(cid[1:]))
    ax.scatter(comment_numbers, subset['repness_z'],
               color=colour, alpha=0.7, s=100, label=legend_label)

# Zero baseline
ax.axhline(y=0, color='k', linestyle='--', alpha=0.3)

# Reference lines at y = +/-2 (typically considered statistically significant)
for threshold in (2, -2):
    ax.axhline(y=threshold, color='b', linestyle=':', alpha=0.5)

ax.set_title("Z-scores for Comments in Cluster 0")
ax.set_xlabel("Comment Number")
ax.set_ylabel("Z-score")
ax.legend()
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 4.8 Examining the Representativeness Algorithm Implementation

In [None]:
# Examine the representativeness calculation implementation
import inspect
from polismath.math.repness import conv_repness, compute_repness

print("Representativeness High Level Implementation:")
print(inspect.getsource(conv_repness))

print("\nRepresentativeness Core Calculation:")
print(inspect.getsource(compute_repness))

## 4.9 Comparing Representativeness Across Clusters

Let's compare how the same comment can have different representativeness scores for different clusters.

In [None]:
# Flatten the nested repness results into one long-format table:
# one row per record in repness['group_repness'][cluster].
repness_fields = ['comment_id', 'repful', 'repness', 'repness_z']
all_repness_data = [
    {'cluster': group_id, **{field: comment[field] for field in repness_fields}}
    for group_id, comments in repness['group_repness'].items()
    for comment in comments
]
df_all_repness = pd.DataFrame(all_repness_data, columns=['cluster'] + repness_fields)

# Pivot so each comment is a row and each (cluster, direction) pair is a column
pivot_repness = df_all_repness.pivot_table(
    index='comment_id',
    columns=['cluster', 'repful'],
    values='repness_z',
    aggfunc='first'  # Take the first (only) value
)

# Display the pivot table for the first 10 comments
print("Representativeness Z-scores Across Clusters:")
print(pivot_repness.head(10))

# Compare a few comments across clusters: c0, c10 and c20 are the first
# comment of each third of the comment range.
selected_comments = ['c0', 'c10', 'c20']
plot_data = df_all_repness[df_all_repness['comment_id'].isin(selected_comments)]

# Index the z-scores by (cluster, comment, direction); setdefault keeps the
# first record if a combination appears more than once.
z_lookup = {}
for row in plot_data.itertuples(index=False):
    z_lookup.setdefault((row.cluster, row.comment_id, row.repful), row.repness_z)

# Use the cluster keys actually present in the results so the lookups always
# match the 'cluster' column; combinations with no record are plotted as 0.
cluster_ids = list(repness['group_repness'].keys())
z_scores = {
    direction: {
        cluster_id: [
            z_lookup.get((cluster_id, comment_id, direction), 0)
            for comment_id in selected_comments
        ]
        for cluster_id in cluster_ids
    }
    for direction in ('agree', 'disagree')
}

# Bar geometry: all bars of one comment share 0.8 of the unit slot, so the
# last bar of one comment can never overlap the first bar of the next.
cluster_colors = ['purple', 'orange', 'cyan']
group_positions = np.arange(len(selected_comments))
bar_width = 0.8 / max(len(cluster_ids), 1)
tick_positions = group_positions + bar_width * (len(cluster_ids) - 1) / 2

# One panel per direction, drawn by the same code
fig, axes = plt.subplots(2, 1, figsize=(15, 12))
panels = (
    ('agree', 'Agreement Z-scores by Cluster'),
    ('disagree', 'Disagreement Z-scores by Cluster'),
)
for ax, (direction, title) in zip(axes, panels):
    for slot, cluster_id in enumerate(cluster_ids):
        ax.bar(
            group_positions + slot * bar_width,
            z_scores[direction][cluster_id],
            width=bar_width,
            label=f'Cluster {cluster_id}',
            color=cluster_colors[slot % len(cluster_colors)],
            alpha=0.7,
        )
    ax.set_title(title)
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(selected_comments)
    ax.set_ylabel('Z-score')
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.5)

fig.tight_layout()
plt.show()

## 4.10 Summary

The Pol.is representativeness calculation:

1. Takes a vote matrix and cluster assignments as input
2. Calculates the statistical significance (z-scores) of agreement patterns for each comment in each cluster
3. Identifies comments that a cluster agrees with significantly more than the other clusters do (representative by agreement, `repful == 'agree'`)
4. Identifies comments that a cluster disagrees with significantly more than the other clusters do (representative by disagreement, `repful == 'disagree'`)
5. Provides a sorted list of the most representative comments for each cluster

This enables Pol.is to highlight the comments that best represent the opinions of each group, helping users understand the different perspectives in the conversation.