In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.neighbors import kneighbors_graph
from scipy.sparse import csgraph
from scipy.sparse.linalg import eigsh
from sklearn.cluster import KMeans

In [None]:
# Display configuration: lift pandas' limits on rows, columns and cell width
# so rendered frames are never truncated.
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)
sns.set_context('notebook')

# CSV file to load, given relative to the notebook's working directory.
data_path = Path('../data/MACH_data/data.cleaned.csv')
df = pd.read_csv(data_path)

# Quick sanity check of what was loaded.
print(f"Raw data shape: {df.shape}")
display(df.head())

Choose Features:

In [None]:
# Answer columns for the 20 questionnaire items: "Q1A" through "Q20A".
question_responses = [f"Q{item_number}A" for item_number in range(1, 21)]

# Independent copy restricted to the answer columns.
X = df.loc[:, question_responses].copy()

print(f"Using features: {question_responses}")
print(f"Feature shape: {X.shape}")
display(X.head())

Data Preprocessing:

In [None]:
# Keep only respondents who answered every item, and store answers as integers.
clean_data = X.dropna().copy()
clean_data[question_responses] = clean_data[question_responses].astype(int)
print(f"Cleaned data shape (dropped NA): {clean_data.shape}")

mach_data = pd.DataFrame(clean_data, columns=question_responses, index=clean_data.index)

# Reproducible random subsample. DataFrame.sample raises ValueError when asked
# for more rows than exist (sampling without replacement), so cap the request
# at the number of complete rows instead of assuming at least 5000 remain.
sample_size = min(5000, len(mach_data))
mach_sample = mach_data.sample(n=sample_size, random_state=42)

print(f"Sampled data shape: {mach_sample.shape}")
display(mach_sample.head())
display(mach_sample.describe().T.round(3))

Standardize the Data:

In [None]:
# Learn per-column mean and standard deviation from the sample, then apply
# the same transformation to that sample.
scaler = StandardScaler().fit(mach_sample)
X_scaled = scaler.transform(mach_sample)

# Displayed as the cell output.
X_scaled.shape

Function to Run Spectral Clustering:

In [None]:
def run_spectral(X, n_clusters, *, affinity='rbf', random_state=42):
    """Run spectral clustering on ``X`` and report its silhouette score.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``SpectralClustering.fit_predict`` and to
        ``silhouette_score``.
    n_clusters : int
        Number of clusters to request.
    affinity : str, keyword-only, default ``'rbf'``
        Affinity passed through to ``SpectralClustering``.
    random_state : int or None, keyword-only, default ``42``
        Seed passed through to ``SpectralClustering``.

    Returns
    -------
    tuple
        ``(labels, sil_score)``: the cluster label of every row of ``X`` and
        the silhouette score of that labelling computed on ``X``.
    """
    model = SpectralClustering(
        n_clusters=n_clusters,
        affinity=affinity,
        assign_labels='kmeans',
        random_state=random_state,
    )
    labels = model.fit_predict(X)
    # Silhouette is evaluated in the original feature space, not in the
    # spectral embedding the model uses internally.
    sil_score = silhouette_score(X, labels)

    print(f"Spectral Clustering with k = {n_clusters} clusters")
    print(f"Silhouette Score: {sil_score}")

    return labels, sil_score

Running Spectral with K = 2, 3, 4:

In [None]:
# Maps each tried cluster count to its (labels, silhouette score) pair.
results = {}

for k in (2, 3, 4):
    results[k] = run_spectral(X_scaled, n_clusters=k)
    labels, sil = results[k]

Build a k-nearest-neighbors graph, convert its distances to Gaussian similarities, and compute the normalized Laplacian:

In [None]:
# Number of nearest neighbours each point is linked to.
k = 15

# Sparse k-nearest-neighbour graph whose stored entries are the distances
# between a point and each of its k neighbours (the point itself excluded).
W = kneighbors_graph(
    X_scaled,
    n_neighbors=k,
    mode='distance',
    include_self=False
)

# Symmetrise the graph. The neighbour relation is not mutual, so an edge may
# be stored in one direction only; averaging W with its transpose would then
# mix the real distance with an implicit 0 and halve it. Taking the
# element-wise maximum keeps the true distance for one-directional edges and
# leaves mutual edges (same distance both ways) unchanged.
W = W.maximum(W.T)

In [None]:
# Kernel bandwidth: the median of the distances stored in the graph.
sigma = np.median(W.data)
two_sigma_sq = 2 * sigma ** 2

# Same sparsity pattern as W, with every stored distance d replaced by the
# Gaussian similarity exp(-d^2 / (2 * sigma^2)).
W_sim = W.copy()
W_sim.data = np.exp(-np.square(W_sim.data) / two_sigma_sq)

In [None]:
# Graph Laplacian of the similarity graph; normed=True requests the
# symmetrically normalized form rather than the plain (unnormalized) one.
L = csgraph.laplacian(W_sim, normed=True)
# Bare expression so the notebook shows the resulting matrix as cell output.
L

Extract the first 2 non-trivial eigenvectors (the three smallest eigenpairs are computed and the first is skipped):

In [None]:
# Dimensionality of the embedding; one extra eigenpair is requested because
# the first returned eigenvector is skipped below.
n_components = 2

# Eigenpairs of L with the smallest-magnitude eigenvalues.
vals, vecs = eigsh(L, k=n_components + 1, which='SM')

# Drop column 0 and keep the next two eigenvectors as 2-D coordinates.
spectral_embedding = vecs[:, 1:n_components + 1]

print("Embedding shape:", spectral_embedding.shape)

Visualize Spectral Embedding with K=3:

In [None]:
# Partition the 2-D spectral embedding into three groups.
km = KMeans(n_clusters=3, random_state=42).fit(spectral_embedding)
labels_k3 = km.labels_

# Scatter plot of the embedding, one colour per cluster.
fig, ax = plt.subplots(figsize=(7, 6))
embedding_x = spectral_embedding[:, 0]
embedding_y = spectral_embedding[:, 1]
ax.scatter(embedding_x, embedding_y, c=labels_k3, cmap='Set1', s=10)
ax.set_title("K-means on Spectral Embedding (k=3)")
plt.show()

Add clustering labels back to sampled data:

In [None]:
# New frame: the sampled answers plus a 'cluster' column holding the k=3
# labels; mach_sample itself is left untouched.
Xs_sample_with_labels = mach_sample.assign(cluster=labels_k3)

Compute MACH means per cluster:

In [None]:
# Average answer to every item within each cluster, rounded to 3 decimals
# (one row per cluster, one column per item).
per_cluster = Xs_sample_with_labels.groupby('cluster')
cluster_means = per_cluster[question_responses].mean().round(3)

print("Mean MACH item responses per cluster:")
display(cluster_means)

Compute a MACH distance score (L1 distance from the maximally Machiavellian response profile; lower = more Machiavellian):

In [None]:
# Reference answer pattern: 5 on every item except the ones listed here
# (1-based item numbers), whose reference answer is 1.
items_answered_one = np.array([3, 4, 6, 7, 9, 10, 11, 14, 16, 17])
mach_high = np.full(20, 5)
mach_high[items_answered_one - 1] = 1

X_mat = mach_sample[question_responses].to_numpy()

# L1 (sum of absolute differences) distance of every respondent's answers
# from the reference pattern.
answer_gap = np.absolute(X_mat - mach_high)
mach_distance = np.sum(answer_gap, axis=1)

Xs_sample_with_labels['mach_score'] = mach_distance

Cluster level MACH scores:

In [None]:
# Mean of the 'mach_score' column within each cluster, rounded to 3 decimals.
cluster_mach_scores = Xs_sample_with_labels.groupby('cluster')['mach_score'].mean().round(3)

# 'mach_score' holds the L1 distance from the high-Mach reference profile
# (mach_high), so a smaller value means answers closer to that profile.
# The label states this direction explicitly.
print("Average distance from the high-Mach profile per cluster (lower = more Mach):")
display(cluster_mach_scores)

Interpret clusters:

In [None]:
# Per-cluster summary: mean distance from the high-Mach profile, followed by
# the five items with the highest and the five with the lowest mean answer.
# The items are ranked by raw mean response only, so the labels describe them
# that way instead of calling them "highest-Mach" / "lowest-Mach".
for c in cluster_mach_scores.index:
    print(f"\nCluster {c}:")
    print(f"  Mean Mach distance (lower = more Mach): {cluster_mach_scores.loc[c]}")

    sorted_items = cluster_means.loc[c]
    print("  Items with the highest mean response in this cluster:")
    print(sorted_items.sort_values(ascending=False).head(5))
    print("  Items with the lowest mean response in this cluster:")
    print(sorted_items.sort_values().head(5))

## Interpretation of Results:

After running spectral clustering on the MACH data with k = 3, I identified 3 distinct clusters. The value reported for each cluster is its mean L1 distance from the maximally Machiavellian response profile (`mach_high`), so a smaller distance means more Machiavellian responses:

- **Cluster 0 - Low Machiavellianism** (MACH-distance of 55.485, the farthest from the high-Mach profile)
    - This suggests these respondents have:
        - High trust
        - Low cynicism
        - Preference for honesty and straightforward behavior
- **Cluster 1 - High Machiavellianism** (MACH-distance of 10.332, the closest to the high-Mach profile)
    - This suggests these respondents are:
        - Willing to manipulate others
        - Skeptical of people's motives
        - Inclined to consider deception sometimes necessary
- **Cluster 2 - Moderate Machiavellianism** (MACH-distance of 31.02)
    - This suggests that these respondents might:
        - Occasionally use manipulation or strategic thinking
        - Be wary of fully trusting others
        - Use flexible morals depending on the situation

These three clusters reveal a non-linear structure in the MACH personality data that other clustering algorithms may miss.

The three clusters:
    - Align with psychological theory
    - Display meaningful differences in response patterns
    - Allow clear interpretation of the three MACH clusters

This suggests that Machiavellianism may be better understood as a spectrum than as a set of discrete categories, and that spectral clustering is well suited to revealing this kind of graded structure.