# 5. Correlation Analysis - Pol.is Math Python Implementation

Pol.is includes correlation analysis to understand relationships between comments based on how participants vote. This notebook explores the correlation and hierarchical clustering implementation in the Python conversion.

In [None]:
import inspect

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster import hierarchy

from polismath.math.named_matrix import NamedMatrix
from polismath.math.corr import comment_correlation, hierarchical_clustering

## 5.1 Creating a Test Vote Matrix

We'll create a vote matrix with clear patterns of correlation between comments.

In [None]:
# Build a synthetic vote matrix whose comments fall into three groups;
# votes inside a group are strongly correlated for each participant.
num_participants = 100
num_comments = 15  # 5 comments in each of 3 groups
participant_ids = [f"p{i}" for i in range(num_participants)]
comment_ids = [f"c{i}" for i in range(num_comments)]

# Column ranges of the correlated comment groups: 0-4, 5-9 and 10-14
vote_groups = [range(0, 5), range(5, 10), range(10, 15)]

# Initialize a matrix with all NaN values (NaN = participant did not vote)
votes_matrix = np.full((num_participants, num_comments), np.nan)

# Fill in the matrix with votes
for p_idx in range(num_participants):
    # One generator per participant, seeded with the row index for reproducibility
    random_state = np.random.RandomState(p_idx)

    # The draw order (participation, leaning, then one draw per comment) is the
    # same for every group, so a single loop replaces three copy-pasted blocks.
    for group in vote_groups:
        # 80% chance to vote on the comments of this group at all
        if random_state.random() >= 0.8:
            continue

        # Determine if this participant tends to agree or disagree with this group
        agrees_with_group = random_state.random() < 0.5
        group_vote = 1 if agrees_with_group else -1

        # Vote on each comment in the group with some noise
        for c_idx in group:
            # 90% of the votes follow the leaning; the rest are flipped
            is_consistent = random_state.random() < 0.9
            votes_matrix[p_idx, c_idx] = group_vote if is_consistent else -group_vote

# Create the NamedMatrix (presumably rows are named by participant id and
# columns by comment id -- confirm against NamedMatrix's constructor)
vote_matrix = NamedMatrix(votes_matrix, participant_ids, comment_ids)

# Summary statistics; each count is computed once and reused in the report
num_votes = np.sum(~np.isnan(votes_matrix))
num_agrees = np.sum(votes_matrix == 1)
num_disagrees = np.sum(votes_matrix == -1)

print(f"Created vote matrix with {len(participant_ids)} participants and {len(comment_ids)} comments")
print(f"Number of votes: {num_votes} ({num_votes/(num_participants*num_comments)*100:.1f}% of possible votes)")
print(f"Number of agrees: {num_agrees} ({num_agrees/num_votes*100:.1f}%)")
print(f"Number of disagrees: {num_disagrees} ({num_disagrees/num_votes*100:.1f}%)")

## 5.2 Calculating Comment Correlations

We'll use the Pol.is correlation function to calculate correlations between comments.

In [None]:
# Compute the comment-by-comment correlation matrix from the vote matrix
print("Calculating comment correlations...")
corr_matrix = comment_correlation(vote_matrix)

# Report the overall shape, then preview only the top-left 5x5 corner
print(f"Correlation matrix shape: {corr_matrix.shape}")
print("\nSample of the correlation matrix:")
top_left_corner = corr_matrix[:5, :5]
print(top_left_corner)

## 5.3 Visualizing the Correlation Matrix

Let's create a heatmap to visualize the correlations between comments.

In [None]:
# Label rows and columns with the comment ids so the plot and table are readable
corr_df = pd.DataFrame(corr_matrix, index=comment_ids, columns=comment_ids)

# Draw an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_df,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 10},
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,  # keep zero correlation at the neutral colour
    ax=ax,
)
ax.set_title('Comment Correlation Matrix')
fig.tight_layout()
plt.show()

# Same values again as a plain-text table, rounded to two decimals
print("Correlation Matrix as Table:")
rounded_corr = corr_df.round(2)
print(rounded_corr)

## 5.4 Performing Hierarchical Clustering of Comments

Now we'll use hierarchical clustering to group similar comments.

In [None]:
# Cluster the comments hierarchically, starting from their correlation matrix
print("Performing hierarchical clustering of comments...")
linkage_matrix = hierarchical_clustering(corr_matrix)

# Report the shape, then preview the first five rows of the linkage matrix
print(f"Linkage matrix shape: {linkage_matrix.shape}")
print("\nSample of the linkage matrix:")
first_rows = linkage_matrix[:5]
print(first_rows)

## 5.5 Visualizing the Hierarchical Clustering

Let's create a dendrogram to visualize the hierarchical clustering of comments.

In [None]:
# Height used both for colouring the branches and for the dashed guide line:
# 70% of the largest value in the linkage matrix's distance column
cut_height = 0.7 * np.max(linkage_matrix[:, 2])

# Draw the dendrogram on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(15, 8))
dendrogram = hierarchy.dendrogram(
    linkage_matrix,
    labels=comment_ids,
    orientation='top',
    leaf_font_size=12,
    color_threshold=cut_height,  # Threshold for coloring
    ax=ax,
)
ax.set_title('Hierarchical Clustering of Comments')
ax.set_xlabel('Comments')
ax.set_ylabel('Distance')
ax.axhline(y=cut_height, c='k', ls='--', alpha=0.3)  # Mark the cut threshold
fig.tight_layout()
plt.show()

## 5.6 Creating a Clustered Heatmap

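Let's combine the correlation matrix with hierarchical clustering to create a clustered heatmap. Note that `sns.clustermap` computes its own average-linkage clustering of the correlation matrix here rather than reusing the `linkage_matrix` from section 5.4, so the ordering may differ slightly from the dendrogram above.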

In [None]:
# Create a clustered heatmap of the comment correlations.
# sns.clustermap creates and sizes its own figure (see figsize below), so no
# plt.figure() call is made first -- that would only leave an empty figure.
# NOTE(review): seaborn computes its own average-linkage clustering of corr_df
# here; pass row_linkage/col_linkage to reuse a precomputed linkage instead.
clustergrid = sns.clustermap(
    corr_df,
    method='average',  # Use average linkage
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 9},
    figsize=(14, 12),
    dendrogram_ratio=0.2,
    cbar_pos=(0.02, 0.8, 0.05, 0.18),
)
# Title the figure as a whole: plt.title() would attach the text to whichever
# axes happens to be current inside the grid rather than to the figure.
# plt.tight_layout() is intentionally not called; the grid manages its own
# layout and the manually positioned colorbar is not tight_layout-compatible.
clustergrid.fig.suptitle('Clustered Correlation Matrix of Comments', fontsize=14, y=1.02)
plt.show()

## 5.7 Extracting Comment Clusters

We can extract distinct groups of comments from the hierarchical clustering.

In [None]:
# Extract clusters from the hierarchical clustering
def extract_clusters(linkage_matrix, labels, threshold):
    """Group labels by the flat cluster they fall into at a distance cut.

    Args:
        linkage_matrix: Linkage matrix accepted by scipy's ``fcluster``.
        labels: Sequence of labels, indexed in the same order as the
            observations of the linkage matrix.
        threshold: Distance at which the tree is cut (``criterion='distance'``).

    Returns:
        dict mapping each flat cluster id to the list of labels assigned to it,
        in the order the labels appear.
    """
    # One flat cluster id per observation
    flat_assignments = hierarchy.fcluster(linkage_matrix, threshold, criterion='distance')

    # Collect the labels under their cluster id, creating buckets on demand
    members_by_cluster = {}
    for position, assigned_id in enumerate(flat_assignments):
        members_by_cluster.setdefault(assigned_id, []).append(labels[position])

    return members_by_cluster

# Cut the tree at 70% of the largest value in the linkage distance column
max_merge_distance = np.max(linkage_matrix[:, 2])
threshold = 0.7 * max_merge_distance
comment_clusters = extract_clusters(linkage_matrix, comment_ids, threshold)

# List every cluster together with the comments it contains
print(f"Extracted {len(comment_clusters)} comment clusters with threshold {threshold:.2f}:")
for cluster_id in comment_clusters:
    members = comment_clusters[cluster_id]
    print(f"Cluster {cluster_id}: {members}")

## 5.8 Examining Different Distance Thresholds

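The number of clusters depends on the distance threshold at which we cut the tree. Let's explore several thresholds, expressed as fractions (20% to 100%) of the largest merge distance in the linkage matrix.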

In [None]:
# Try different thresholds, given as fractions of the largest value in the
# linkage matrix's distance column and converted to absolute cut heights.
threshold_fractions = [0.2, 0.4, 0.6, 0.8, 1.0]
threshold_max = np.max(linkage_matrix[:, 2])
thresholds = [fraction * threshold_max for fraction in threshold_fractions]

# Dedicated loop names keep `threshold` and `comment_clusters` from the
# previous section intact instead of silently overwriting them.
for sweep_threshold in thresholds:
    sweep_clusters = extract_clusters(linkage_matrix, comment_ids, sweep_threshold)
    print(f"\nThreshold {sweep_threshold:.2f}: {len(sweep_clusters)} clusters")
    for cluster_id, comments in sweep_clusters.items():
        print(f"  Cluster {cluster_id}: {comments}")

## 5.9 Examining the Implementation of Correlation and Clustering

In [None]:
# Print the source code of the correlation and clustering functions.
# `inspect` is imported in the notebook's import cell at the top.
implementations = [
    ("Comment Correlation Implementation:", comment_correlation),
    ("\nHierarchical Clustering Implementation:", hierarchical_clustering),
]

for heading, function in implementations:
    print(heading)
    print(inspect.getsource(function))

## 5.10 Correlation Analysis with a Larger Dataset

Let's create a larger dataset with more complex correlation patterns.

In [None]:
# Dimensions and labels of the larger synthetic conversation
num_participants_large = 200
num_comments_large = 30
participant_ids_large = [f"p{row}" for row in range(num_participants_large)]
comment_ids_large = [f"c{col}" for col in range(num_comments_large)]

# NaN everywhere: nobody has voted yet
votes_matrix_large = np.full((num_participants_large, num_comments_large), np.nan)

# Five consecutive blocks of six comments each: 0-5, 6-11, 12-17, 18-23, 24-29
comment_groups = [range(first, first + 6) for first in range(0, 30, 6)]

# Simulate every participant with their own seeded generator (reproducible)
for participant_row in range(num_participants_large):
    rng = np.random.RandomState(participant_row)

    for block in comment_groups:
        # Only an 80% chance that the participant votes on this block at all
        if not rng.random() < 0.8:
            continue

        # Overall leaning towards the block: +1 (agree) or -1 (disagree)
        leaning = 1 if rng.random() < 0.5 else -1

        for comment_col in block:
            # 90% of the votes follow the leaning, the remainder are flipped
            follows_leaning = rng.random() < 0.9
            votes_matrix_large[participant_row, comment_col] = leaning if follows_leaning else -leaning

# Wrap the raw array together with its participant and comment ids
vote_matrix_large = NamedMatrix(votes_matrix_large, participant_ids_large, comment_ids_large)

print(f"Created large vote matrix with {len(participant_ids_large)} participants and {len(comment_ids_large)} comments")
num_votes_large = np.sum(~np.isnan(votes_matrix_large))
print(f"Number of votes: {num_votes_large} ({num_votes_large/(num_participants_large*num_comments_large)*100:.1f}% of possible votes)")

In [None]:
# Calculate correlations for the larger dataset
print("Calculating correlations for the larger dataset...")
corr_matrix_large = comment_correlation(vote_matrix_large)

# Perform hierarchical clustering
print("Performing hierarchical clustering on the larger dataset...")
linkage_matrix_large = hierarchical_clustering(corr_matrix_large)

# Create a clustered heatmap.
# sns.clustermap creates and sizes its own figure (see figsize below), so no
# plt.figure() call is made first -- that would only leave an empty figure.
# NOTE(review): seaborn computes its own average-linkage clustering of
# corr_df_large here; pass row_linkage/col_linkage to reuse a precomputed one.
corr_df_large = pd.DataFrame(corr_matrix_large, index=comment_ids_large, columns=comment_ids_large)
clustergrid_large = sns.clustermap(
    corr_df_large,
    method='average',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    figsize=(16, 14),
    dendrogram_ratio=0.2,
    cbar_pos=(0.02, 0.8, 0.05, 0.18),
    # Don't show annotations for larger matrix
)
# Title the figure as a whole: plt.title() would attach the text to whichever
# axes happens to be current inside the grid rather than to the figure.
# plt.tight_layout() is intentionally not called; the grid manages its own
# layout and the manually positioned colorbar is not tight_layout-compatible.
clustergrid_large.fig.suptitle(
    'Clustered Correlation Matrix of Comments (Larger Dataset)', fontsize=16, y=1.02
)
plt.show()

## 5.11 Extract and Visualize Clusters from the Larger Dataset

In [None]:
# Cut the larger tree at 70% of the largest value in its distance column
max_merge_distance_large = np.max(linkage_matrix_large[:, 2])
threshold_large = 0.7 * max_merge_distance_large
comment_clusters_large = extract_clusters(linkage_matrix_large, comment_ids_large, threshold_large)

# List every cluster together with the comments it contains
print(f"Extracted {len(comment_clusters_large)} comment clusters with threshold {threshold_large:.2f}:")
for cluster_id in comment_clusters_large:
    members = comment_clusters_large[cluster_id]
    print(f"Cluster {cluster_id}: {members}")

# Draw the dendrogram on an explicit figure/axes pair, coloured at the same cut
fig, ax = plt.subplots(figsize=(18, 10))
dendrogram_large = hierarchy.dendrogram(
    linkage_matrix_large,
    labels=comment_ids_large,
    orientation='top',
    leaf_font_size=12,
    color_threshold=threshold_large,
    ax=ax,
)
ax.set_title('Hierarchical Clustering of Comments (Larger Dataset)', fontsize=16)
ax.set_xlabel('Comments', fontsize=14)
ax.set_ylabel('Distance', fontsize=14)
ax.axhline(y=threshold_large, c='k', ls='--', alpha=0.3)  # Mark the cut threshold
fig.tight_layout()
plt.show()

## 5.12 Summary

The Pol.is correlation and hierarchical clustering implementation:

1. Calculates correlations between comments based on how participants vote
2. Identifies groups of comments that tend to receive similar voting patterns
3. Uses hierarchical clustering to arrange comments into a tree structure
4. Allows extraction of comment clusters at various similarity thresholds

This analysis provides valuable insights into how comments relate to each other and helps identify themes or topics within the conversation.