In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import ast

# Load the data
input_df = pd.read_csv('candidates_with_questions.csv')


In [22]:
# Process the answers column to get a numeric matrix
def extract_answers(row):
    # Convert string representation of list to actual list
    answers = ast.literal_eval(row['answers'])
    # Create dictionary with questionId as key and value as value
    return {item['questionId']: item['value']/100 for item in answers}

# Apply extraction to each row
candidates_answers = input_df.apply(extract_answers, axis=1)

# Create a dataframe with all questions
question_ids = set()
for answers in candidates_answers:
    question_ids.update(answers.keys())
question_ids = sorted(question_ids)


In [23]:
# Create feature matrix
X = np.zeros((len(input_df), len(question_ids)))
for i, answers in enumerate(candidates_answers):
    for j, q_id in enumerate(question_ids):
        if q_id in answers:
            X[i, j] = answers[q_id]


In [24]:
print(X.shape[1])

58


In [25]:
input_dim = X.shape[1]
encoding_dim = 4  # Number of dimensions for smart spider

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.GELU(),
            nn.Linear(32, 16),
            nn.GELU(),
            nn.Linear(16, encoding_dim),
            nn.Sigmoid()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 16),
            nn.GELU(),
            nn.Linear(16, 32),
            nn.GELU(),
            nn.Linear(32, input_dim),
            nn.Sigmoid()
        )
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# Prepare data
torch_X = torch.tensor(X, dtype=torch.float32).to(device)
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(torch_X, test_size=0.2, random_state=42)

# Model, loss, optimizer
model = AutoEncoder(input_dim, encoding_dim).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop
num_epochs = 100
batch_size = 64
for epoch in range(num_epochs):
    model.train()
    perm = torch.randperm(X_train.size(0))
    for i in range(0, X_train.size(0), batch_size):
        idx = perm[i:i+batch_size]
        batch = X_train[idx]
        optimizer.zero_grad()
        output = model(batch)
        loss = criterion(output, batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

Epoch 1/100, Loss: 0.6450
Epoch 2/100, Loss: 0.5539
Epoch 3/100, Loss: 0.5162
Epoch 4/100, Loss: 0.4370
Epoch 5/100, Loss: 0.4889
Epoch 6/100, Loss: 0.4855
Epoch 7/100, Loss: 0.4599
Epoch 8/100, Loss: 0.4405
Epoch 9/100, Loss: 0.5063
Epoch 10/100, Loss: 0.4995
Epoch 11/100, Loss: 0.5286
Epoch 12/100, Loss: 0.4533
Epoch 13/100, Loss: 0.4416
Epoch 14/100, Loss: 0.4498
Epoch 15/100, Loss: 0.4712
Epoch 16/100, Loss: 0.5049
Epoch 17/100, Loss: 0.4858
Epoch 18/100, Loss: 0.4790
Epoch 19/100, Loss: 0.4510
Epoch 20/100, Loss: 0.4564
Epoch 21/100, Loss: 0.5524
Epoch 22/100, Loss: 0.4630
Epoch 23/100, Loss: 0.4250
Epoch 24/100, Loss: 0.4365
Epoch 25/100, Loss: 0.4913
Epoch 26/100, Loss: 0.4951
Epoch 27/100, Loss: 0.5128
Epoch 28/100, Loss: 0.4121
Epoch 29/100, Loss: 0.4276
Epoch 30/100, Loss: 0.5722
Epoch 31/100, Loss: 0.5244
Epoch 32/100, Loss: 0.4562
Epoch 33/100, Loss: 0.4761
Epoch 34/100, Loss: 0.4824
Epoch 35/100, Loss: 0.4917
Epoch 36/100, Loss: 0.4380
Epoch 37/100, Loss: 0.4936
Epoch 38/1

In [26]:
# Get encoded features for test set
model.eval()
with torch.no_grad():
    test_encoded_features = model.encoder(X_test).cpu().numpy()
    test_last_layer = model(X_test).cpu().numpy()
    all_encoded_features = model.encoder(torch_X).cpu().numpy()

In [34]:
mismatch = np.average(np.abs(X_test.cpu().numpy() - test_last_layer))
print(f"Average mismatch: {mismatch:.4f}")

Average mismatch: 0.1824


In [28]:
features_df = pd.DataFrame(all_encoded_features)
id_names = input_df[["firstname","lastname"]]
joined_df = pd.concat([id_names, features_df], axis=1)

joined_df = joined_df.rename(columns={"firstname": "first_name", "lastname": "last_name"})

joined_df.to_csv("spiders.csv", index=False)
torch.save(model.state_dict(), "neural_net_1.pt")

In [29]:

# Create spider charts
def create_spider_chart(features, candidate_name, party):
    # Number of variables
    N = features.shape[0]
    
    # What will be the angle of each axis in the plot
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]  # Close the loop
    
    # Spider chart setup
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    
    # Add the feature values
    values = features.tolist()
    values += values[:1]  # Close the loop
    
    # Plot
    ax.plot(angles, values, linewidth=2, linestyle='solid', label=candidate_name)
    ax.fill(angles, values, alpha=0.25)
    
    # Set labels
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([f'Dimension {i+1}' for i in range(N)])
    
    plt.title(f"{candidate_name} ({party}) Political Profile", size=20)
    plt.tight_layout()
    return fig


In [30]:
for i in range(min(5, len(input_df))):
    features = test_encoded_features[i]
    features = (features - features.min()) / (features.max() - features.min() + 1e-10)
    fig = create_spider_chart(features, input_df.iloc[i]['firstname'] + ' ' + input_df.iloc[i]['lastname'], 
                              input_df.iloc[i]['partyAbbreviation'])
    plt.savefig(f"candidate_{i}_spider.png")
    plt.close()
