# 2. PCA - Pol.is Math Python Implementation

Principal Component Analysis (PCA) is a key part of the Pol.is system. It reduces the high-dimensional vote matrix to a lower-dimensional space (typically 2D) for visualization and clustering. This notebook demonstrates the PCA implementation in the Python conversion.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from polismath.math.named_matrix import NamedMatrix
from polismath.math.pca import pca_project_named_matrix

## 2.1 Creating a Test Vote Matrix

We'll create a vote matrix with two clear opinion groups to demonstrate PCA.

In [None]:
# Build a fully populated vote matrix containing two opposed opinion groups
num_participants = 100
num_comments = 20
participant_ids = [f"p{i}" for i in range(num_participants)]
comment_ids = [f"c{i}" for i in range(num_comments)]

# Row mask: participants 50-99 form the second group.
# Column mask: comments 10-19 form the second half.
in_second_group = np.arange(num_participants) >= 50
in_second_half = np.arange(num_comments) >= 10

# Group 1 (participants 0-49) agrees with the first half of the comments and
# disagrees with the second half; group 2 does the opposite. That is the same
# as saying a participant agrees exactly when the two masks match.
agrees = in_second_group[:, np.newaxis] == in_second_half[np.newaxis, :]

# 1 = agree, -1 = disagree. Float values keep the dtype NaN-compatible.
votes_matrix = np.where(agrees, 1.0, -1.0)

# Wrap the raw array together with its participant and comment labels
vote_matrix = NamedMatrix(votes_matrix, participant_ids, comment_ids)

print(f"Created vote matrix with {len(participant_ids)} participants and {len(comment_ids)} comments")
print(f"Number of votes: {np.sum(~np.isnan(votes_matrix))}")

## 2.2 Running PCA on the Vote Matrix

In [None]:
# Perform PCA with our custom implementation.
# The call returns two values: `pca_results` (indexed below by 'center' and
# 'comps') and `proj_dict` (indexed below by participant ID).
print("Running PCA...")
pca_results, proj_dict = pca_project_named_matrix(vote_matrix)

# Examine the results: report the container type and the keys it exposes
print("\nPCA Results:")
print(f"PCA Results type: {type(pca_results)}")
print(f"Keys: {list(pca_results.keys())}")

# Examine center and components: both entries expose a `.shape`
print(f"\nCenter vector shape: {pca_results['center'].shape}")
print(f"PCA components shape: {pca_results['comps'].shape}")

# Examine the projection dictionary: count the entries and show the one
# stored for the first participant ID, 'p0'
print(f"\nProjection dictionary contains {len(proj_dict)} participant projections")
print(f"Example projection for p0: {proj_dict['p0']}")

## 2.3 Visualizing the PCA Results

Now we'll visualize the participants in the 2D space defined by the first two principal components.

In [None]:
# Extract the projection coordinates for each participant
x_coords = []
y_coords = []
groups = []  # Known group of each plotted participant, used only for colouring

for i, p_id in enumerate(participant_ids):
    # Participants that have no entry in the projection dictionary are skipped
    if p_id in proj_dict:
        projection = proj_dict[p_id]
        x_coords.append(projection[0])
        y_coords.append(projection[1])
        groups.append(0 if i < 50 else 1)  # Known group assignment

# Create a scatter plot of the projections
fig, ax = plt.subplots(figsize=(10, 8))

# Draw each group with its own labelled scatter call. A single scatter call
# creates only one artist, so a two-label legend would show just "Group 1".
for group_idx, (group_label, group_color) in enumerate(
    [("Group 1", "blue"), ("Group 2", "red")]
):
    ax.scatter(
        [x for x, g in zip(x_coords, groups) if g == group_idx],
        [y for y, g in zip(y_coords, groups) if g == group_idx],
        color=group_color,
        alpha=0.6,
        s=50,
        label=group_label,
    )

# Add title and labels
ax.set_title("PCA Projection of Participants")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")

# Add a legend built from the per-group labels above
ax.legend()

# Add grid lines
ax.grid(True, linestyle="--", alpha=0.7)

# Show the plot
plt.show()

## 2.4 Understanding the PCA Components

Let's examine the principal components to understand what each dimension represents in terms of the original comments.

In [None]:
from matplotlib.patches import Patch

# The DataFrame below needs one row per comment.
# NOTE(review): the orientation of pca_results['comps'] is not visible in this
# notebook; if it arrives as (components, comments) it is transposed here.
# Confirm the layout against the PCA implementation.
comps = np.asarray(pca_results['comps'])
if (
    comps.ndim == 2
    and comps.shape[0] != len(comment_ids)
    and comps.shape[1] == len(comment_ids)
):
    comps = comps.T

# Create a DataFrame to examine the principal components
pca_components_df = pd.DataFrame(
    comps,
    index=comment_ids,
    columns=[f"PC{i+1}" for i in range(comps.shape[1])],
)

# Display the first two principal components
print("Principal Component Loadings:")
print(pca_components_df[["PC1", "PC2"]])

# Visualize the component loadings
fig, ax = plt.subplots(figsize=(12, 8))

# Sort comments by their PC1 loading
sorted_comments = pca_components_df.sort_values("PC1", ascending=False).index

# Colour each bar by which half of the comment list the comment belongs to.
# Colouring by sorted position would contradict the legend whenever the sort
# order does not put the first-half comments first (e.g. a flipped PC1 sign).
first_half_comments = set(comment_ids[:len(comment_ids) // 2])
bar_colors = [
    "green" if c_id in first_half_comments else "orange"
    for c_id in sorted_comments
]

# Create a bar chart of PC1 loadings
bars = ax.bar(
    range(len(sorted_comments)),
    pca_components_df.loc[sorted_comments, "PC1"],
    color=bar_colors,
)

# Add labels
ax.set_title("Principal Component 1 Loadings by Comment")
ax.set_xlabel("Comments")
ax.set_ylabel("Loading")
ax.set_xticks(range(len(sorted_comments)))
ax.set_xticklabels(sorted_comments, rotation=90)

# Add a horizontal line at y=0
ax.axhline(y=0, color="k", linestyle="-", alpha=0.3)

# Add a legend; proxy patches are used because the bars share one container
legend_elements = [
    Patch(facecolor="green", label="First Half Comments"),
    Patch(facecolor="orange", label="Second Half Comments"),
]
ax.legend(handles=legend_elements)

# Show the plot
plt.tight_layout()
plt.show()

## 2.5 More Realistic Example with Noise

Let's create a more realistic example with some noise in the votes.

In [None]:
# Create a vote matrix with two opinion groups, but with noise and missing votes
num_participants = 100
num_comments = 20
participant_ids_noisy = [f"p{i}" for i in range(num_participants)]
comment_ids_noisy = [f"c{i}" for i in range(num_comments)]

# A seeded generator keeps the matrix, and every figure derived from it,
# identical from one notebook run to the next.
rng = np.random.default_rng(42)
matrix_shape = (num_participants, num_comments)

# Base pattern: group 1 (participants 0-49) agrees with the first half of the
# comments, group 2 (participants 50-99) agrees with the second half.
in_second_group = np.arange(num_participants) >= 50
in_second_half = np.arange(num_comments) >= 10
expected_vote = np.where(
    in_second_group[:, np.newaxis] == in_second_half[np.newaxis, :], 1.0, -1.0
)

# Each participant votes on each comment with 80% probability ...
has_voted = rng.random(matrix_shape) < 0.8
# ... and a cast vote follows the group's pattern with 80% probability,
# otherwise it is flipped (the noise).
follows_group = rng.random(matrix_shape) < 0.8

# Cells without a vote stay NaN
votes_matrix_noisy = np.where(
    has_voted,
    np.where(follows_group, expected_vote, -expected_vote),
    np.nan,
)

# Create the NamedMatrix
vote_matrix_noisy = NamedMatrix(votes_matrix_noisy, participant_ids_noisy, comment_ids_noisy)

print(f"Created noisy vote matrix with {len(participant_ids_noisy)} participants and {len(comment_ids_noisy)} comments")
num_votes = np.sum(~np.isnan(votes_matrix_noisy))
print(f"Number of votes: {num_votes} ({num_votes/(num_participants*num_comments)*100:.1f}% of possible votes)")
print(f"Number of agrees: {np.sum(votes_matrix_noisy == 1)} ({np.sum(votes_matrix_noisy == 1)/num_votes*100:.1f}%)")
print(f"Number of disagrees: {np.sum(votes_matrix_noisy == -1)} ({np.sum(votes_matrix_noisy == -1)/num_votes*100:.1f}%)")

In [None]:
# Run PCA on the noisy matrix
print("Running PCA on noisy data...")
pca_results_noisy, proj_dict_noisy = pca_project_named_matrix(vote_matrix_noisy)

# Extract the projection coordinates
x_coords_noisy = []
y_coords_noisy = []
groups_noisy = []  # Known group of each plotted participant

for i, p_id in enumerate(participant_ids_noisy):
    # Participants that have no entry in the projection dictionary are skipped
    if p_id in proj_dict_noisy:
        projection = proj_dict_noisy[p_id]
        x_coords_noisy.append(projection[0])
        y_coords_noisy.append(projection[1])
        groups_noisy.append(0 if i < 50 else 1)

# Create a scatter plot of the noisy projections
fig, ax = plt.subplots(figsize=(10, 8))

# Draw each group with its own labelled scatter call. A single scatter call
# creates only one artist, so a two-label legend would show just "Group 1".
for group_idx, (group_label, group_color) in enumerate(
    [("Group 1", "blue"), ("Group 2", "red")]
):
    ax.scatter(
        [x for x, g in zip(x_coords_noisy, groups_noisy) if g == group_idx],
        [y for y, g in zip(y_coords_noisy, groups_noisy) if g == group_idx],
        color=group_color,
        alpha=0.6,
        s=50,
        label=group_label,
    )

# Add title and labels
ax.set_title("PCA Projection of Participants (Noisy Data)")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")

# Add a legend built from the per-group labels above
ax.legend()

# Add grid lines
ax.grid(True, linestyle="--", alpha=0.7)

# Show the plot
plt.show()

## 2.6 Custom Power Iteration Method

The Pol.is implementation uses a custom power iteration method for PCA instead of calling an off-the-shelf numerical routine (for example, a library SVD or eigendecomposition solver) directly. This gives it flexibility and control over the process.

In [None]:
# Show the source of the PCA helper functions so their logic can be read here
import inspect
from polismath.math.pca import power_iteration, prepare_matrix

# Each heading is printed first, followed by the source of its function
for heading, func in (
    ("Power Iteration Implementation:", power_iteration),
    ("\nMatrix Preparation Implementation:", prepare_matrix),
):
    print(heading)
    print(inspect.getsource(func))

## 2.7 Handling Missing Values

Let's test PCA with very sparse data to see how the system handles large numbers of missing values.

In [None]:
# Create a very sparse vote matrix
num_participants = 100
num_comments = 50
participant_ids_sparse = [f"p{i}" for i in range(num_participants)]
comment_ids_sparse = [f"c{i}" for i in range(num_comments)]

# A seeded generator keeps the matrix, and the figure derived from it,
# identical from one notebook run to the next.
rng = np.random.default_rng(42)

# Initialize with all NaN values; only the sampled cells are filled in below
votes_matrix_sparse = np.full((num_participants, num_comments), np.nan)

# Add very few votes (only about 10% of possible votes)
for p_idx in range(num_participants):
    group = 0 if p_idx < 50 else 1

    # Each participant only votes on a small subset of comments:
    # 3-7 distinct comments (the upper bound of `integers` is exclusive)
    num_votes_per_participant = rng.integers(3, 8)
    comment_indices = rng.choice(
        num_comments, size=num_votes_per_participant, replace=False
    )

    # Group 1 mostly agrees with comments 0-24, group 2 with comments 25-49
    in_second_half = comment_indices >= 25
    expected_vote = np.where(in_second_half == (group == 1), 1.0, -1.0)

    # A vote follows the group's pattern with 80% probability, else it is flipped
    follows_group = rng.random(num_votes_per_participant) < 0.8
    votes_matrix_sparse[p_idx, comment_indices] = np.where(
        follows_group, expected_vote, -expected_vote
    )

# Create the NamedMatrix
vote_matrix_sparse = NamedMatrix(votes_matrix_sparse, participant_ids_sparse, comment_ids_sparse)

num_votes_sparse = np.sum(~np.isnan(votes_matrix_sparse))
print(f"Created sparse vote matrix with {len(participant_ids_sparse)} participants and {len(comment_ids_sparse)} comments")
print(f"Number of votes: {num_votes_sparse} ({num_votes_sparse/(num_participants*num_comments)*100:.1f}% of possible votes)")

# Run PCA on the sparse matrix
print("\nRunning PCA on sparse data...")
pca_results_sparse, proj_dict_sparse = pca_project_named_matrix(vote_matrix_sparse)

# Extract the projection coordinates
x_coords_sparse = []
y_coords_sparse = []
groups_sparse = []  # Known group of each plotted participant

for i, p_id in enumerate(participant_ids_sparse):
    # Participants that have no entry in the projection dictionary are skipped
    if p_id in proj_dict_sparse:
        projection = proj_dict_sparse[p_id]
        x_coords_sparse.append(projection[0])
        y_coords_sparse.append(projection[1])
        groups_sparse.append(0 if i < 50 else 1)

# Visualize the sparse PCA results
fig, ax = plt.subplots(figsize=(10, 8))

# Draw each group with its own labelled scatter call. A single scatter call
# creates only one artist, so a two-label legend would show just "Group 1".
for group_idx, (group_label, group_color) in enumerate(
    [("Group 1", "blue"), ("Group 2", "red")]
):
    ax.scatter(
        [x for x, g in zip(x_coords_sparse, groups_sparse) if g == group_idx],
        [y for y, g in zip(y_coords_sparse, groups_sparse) if g == group_idx],
        color=group_color,
        alpha=0.6,
        s=50,
        label=group_label,
    )

ax.set_title("PCA Projection of Participants (Sparse Data)")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.7)
plt.show()

## 2.8 Summary

The Pol.is PCA implementation:

1. Takes a `NamedMatrix` of votes and projects participants into a low-dimensional space
2. Uses a custom power iteration method for PCA computation
3. Handles missing values (passes) effectively
4. Produces a dictionary mapping participant IDs to their projections
5. Works even with sparse data where many votes are missing

This projection is a critical step for both visualization and clustering in the Pol.is system.