In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import ast

# Load the data
df = pd.read_csv('candidates_with_questions.csv')


In [74]:
# Process the answers column to get a numeric matrix
def extract_answers(row):
    # Convert string representation of list to actual list
    answers = ast.literal_eval(row['answers'])
    # Create dictionary with questionId as key and value as value
    return {item['questionId']: item['value']/100 for item in answers}

# Apply extraction to each row
candidates_answers = df.apply(extract_answers, axis=1)

# Create a dataframe with all questions
question_ids = set()
for answers in candidates_answers:
    question_ids.update(answers.keys())
question_ids = sorted(question_ids)


In [75]:
# Create feature matrix
X = np.zeros((len(df), len(question_ids)))
for i, answers in enumerate(candidates_answers):
    for j, q_id in enumerate(question_ids):
        if q_id in answers:
            X[i, j] = answers[q_id]


In [76]:
print(X.shape[1])

58


In [None]:
input_dim = X.shape[1]
encoding_dim = 4  # Number of dimensions for smart spider

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.GELU(),
            nn.Linear(32, 16),
            nn.GELU(),
            nn.Linear(16, encoding_dim),
            nn.Sigmoid()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 16),
            nn.GELU(),
            nn.Linear(16, 32),
            nn.GELU(),
            nn.Linear(32, input_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Prepare data
torch_X = torch.tensor(X, dtype=torch.float32).to(device)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(torch_X, test_size=0.2, random_state=42)

# Model, loss, optimizer
model = AutoEncoder(input_dim, encoding_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop
num_epochs = 100
batch_size = 64
for epoch in range(num_epochs):
    model.train()
    perm = torch.randperm(X_train.size(0))
    for i in range(0, X_train.size(0), batch_size):
        idx = perm[i:i+batch_size]
        batch = X_train[idx]
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

Epoch 1/100, Loss: 0.6171
Epoch 2/100, Loss: 0.5209
Epoch 3/100, Loss: 0.5190
Epoch 2/100, Loss: 0.5209
Epoch 3/100, Loss: 0.5190
Epoch 4/100, Loss: 0.4534
Epoch 4/100, Loss: 0.4534
Epoch 5/100, Loss: 0.5352
Epoch 6/100, Loss: 0.4713
Epoch 5/100, Loss: 0.5352
Epoch 6/100, Loss: 0.4713
Epoch 7/100, Loss: 0.4551
Epoch 7/100, Loss: 0.4551
Epoch 8/100, Loss: 0.4985
Epoch 9/100, Loss: 0.4926
Epoch 8/100, Loss: 0.4985
Epoch 9/100, Loss: 0.4926
Epoch 10/100, Loss: 0.4806
Epoch 10/100, Loss: 0.4806
Epoch 11/100, Loss: 0.4748
Epoch 11/100, Loss: 0.4748
Epoch 12/100, Loss: 0.5104
Epoch 12/100, Loss: 0.5104
Epoch 13/100, Loss: 0.4489
Epoch 14/100, Loss: 0.4988
Epoch 13/100, Loss: 0.4489
Epoch 14/100, Loss: 0.4988
Epoch 15/100, Loss: 0.4688
Epoch 15/100, Loss: 0.4688
Epoch 16/100, Loss: 0.4758
Epoch 17/100, Loss: 0.4479
Epoch 16/100, Loss: 0.4758
Epoch 17/100, Loss: 0.4479
Epoch 18/100, Loss: 0.5057
Epoch 18/100, Loss: 0.5057
Epoch 19/100, Loss: 0.4805
Epoch 20/100, Loss: 0.4945
Epoch 19/100, Loss

In [None]:
# Get encoded features for test set
model.eval()
with torch.no_grad():
    encoded_features = model.encoder(X_test).cpu().numpy()
    last_layer = model(X_test).cpu().numpy()

In [78]:
# import numpy.testing as npt
# npt.assert_almost_equal(X_test.cpu().numpy()/2.5, last_layer/2.5, 1)

In [79]:
display(X)

array([[0.25, 0.75, 0.25, ..., 1.  , 0.33, 0.67],
       [1.  , 0.25, 0.25, ..., 0.67, 0.67, 0.67],
       [0.  , 1.  , 0.  , ..., 0.83, 0.  , 1.  ],
       ...,
       [0.  , 1.  , 0.  , ..., 1.  , 0.5 , 1.  ],
       [0.  , 1.  , 0.  , ..., 0.33, 0.  , 1.  ],
       [0.  , 1.  , 0.  , ..., 0.33, 0.  , 1.  ]], shape=(1877, 58))

In [80]:
display(last_layer)

array([[0.26927754, 0.95728064, 0.01795026, ..., 0.5350335 , 0.15194316,
        0.9411502 ],
       [0.34452948, 0.910514  , 0.08846618, ..., 0.66244686, 0.23519601,
        0.8548601 ],
       [0.815977  , 0.7519792 , 0.14170876, ..., 0.672533  , 0.26227337,
        0.8030926 ],
       ...,
       [0.55907816, 0.8304879 , 0.23911926, ..., 0.5837071 , 0.3165381 ,
        0.79590183],
       [0.5956442 , 0.5217943 , 0.6502328 , ..., 0.6148864 , 0.5641203 ,
        0.37151697],
       [0.94849455, 0.0244402 , 0.76786256, ..., 0.9675941 , 0.38622114,
        0.3559718 ]], shape=(376, 58), dtype=float32)

In [81]:

# Create spider charts
def create_spider_chart(features, candidate_name, party):
    # Number of variables
    N = features.shape[0]
    
    # What will be the angle of each axis in the plot
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # Close the loop
    
    # Spider chart setup
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    
    # Add the feature values
    values = features.tolist()
    values += values[:1]  # Close the loop
    
    # Plot
    ax.plot(angles, values, linewidth=2, linestyle='solid', label=candidate_name)
    ax.fill(angles, values, alpha=0.25)
    
    # Set labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([f'Dimension {i+1}' for i in range(N)])
    
    plt.title(f"{candidate_name} ({party}) Political Profile", size=20)
    plt.tight_layout()
    return fig


In [82]:
for i in range(min(5, len(df))):
    features = encoded_features[i]
    features = (features - features.min()) / (features.max() - features.min() + 1e-10)
    fig = create_spider_chart(features, df.iloc[i]['firstname'] + ' ' + df.iloc[i]['lastname'], 
                              df.iloc[i]['partyAbbreviation'])
    plt.savefig(f"candidate_{i}_spider.png")
    plt.close()


In [83]:
correlations = []
for dim in range(encoding_dim):
    dim_corr = {}
    for j, q_id in enumerate(question_ids):
        corr = np.corrcoef(encoded_features[:, dim], X[:, j])[0, 1]
        dim_corr[q_id] = corr
    correlations.append(dim_corr)


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 376 and the array at index 1 has size 1877

In [None]:
for dim, corr_dict in enumerate(correlations):
    print(f"Dimension {dim+1} most correlated with:")
    top_corr = sorted(corr_dict.items(), key=lambda x: abs(x[1]), reverse=True)[:5]
    for q_id, corr in top_corr:
        print(f"  Question {q_id}: {corr:.3f}")