In [7]:
import pandas as pd
import json
from sklearn.utils import shuffle
import numpy as np

In [8]:
with open("merged_votes.json", "r") as f:
    data = json.load(f)

id_names = []
X = []
Y = []

def norm(vote):
    vote = int(vote)
    if vote == 0:
        return (0, 1)
    elif vote == 1:
        return (1, 1)
    elif vote == 2:
        return (-1, 1)
    else:
        return (0, 0)

for person in data:
    id_names.append(
        {
            "id": person["id"],
            "first_name": person["first_name"],
            "last_name": person["last_name"],
        }
    )
    X.append([item for vote in person["votes"] for item in norm(vote)])
    Y.append(person["spider"])

    id_names, X, Y = shuffle(id_names, X, Y, random_state=42)

id_names = pd.DataFrame(id_names)
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

display(id_names.head())
display(X.head())
display(Y.head())

Unnamed: 0,id,first_name,last_name
0,95761945c08c4ddd80c4860fee1ff50e,Dominique,Bühler
1,ae95d67aee0a4e5988122059dca1727a,Ernst,Wandfluh
2,6180905781604d9ebb8bcb4d9b564ffb,Sandra,Hess
3,e9b225fd214c4297a97c8d19441a4d49,Beatrix,Hurni
4,4021b2678cc34f008dbd134997fbae58,Maurane,Riesen


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,-1,1,-1,1,1,1,1,1,...,-1,1,1,1,-1,1,1,1,1,1
3,1,1,-1,1,-1,1,-1,1,-1,1,...,1,1,-1,1,1,1,1,1,1,1
4,1,1,-1,1,1,1,-1,1,-1,1,...,1,1,-1,1,1,1,1,1,1,1


Unnamed: 0,0,1,2,3
0,0.089736745,0.8947511,0.48076287,0.5860747
1,0.4604929,0.4059707,0.028332038,0.2100404
2,0.68369,0.3712907,0.025119677,0.31691334
3,0.1973908,0.57748884,0.054053936,0.4970517
4,0.32184508,0.91632247,0.380341,0.6413553


In [9]:
import torch

import torch.nn as nn

# Convert pandas DataFrames to numpy arrays
X_np = X.values.astype('float32')
Y_np = Y.values.astype('float32')



# Define the neural network
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(X.shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, Y.shape[1]),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

def train_variant(i, X_train, Y_train, X_test, Y_test):
    # Convert to torch tensors
    X_train_tensor = torch.tensor(X_train)
    Y_train_tensor = torch.tensor(Y_train)
    X_test_tensor = torch.tensor(X_test)
    Y_test_tensor = torch.tensor(Y_test)

    model = SimpleNN()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    epochs = 200
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, Y_train_tensor)
        loss.backward()
        optimizer.step()

    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, Y_test_tensor)
        print(f"Test Loss: {test_loss.item():.4f}")

    print(Y_test_tensor[0])
    print(test_outputs[0])

    return test_outputs

group_count = X_np.shape[0] // 10

X_group = np.array_split(X_np, group_count, axis=0)
Y_group = np.array_split(Y_np, group_count, axis=0)


all_outputs = []

for i in range(group_count):
    print(f"Training variant {i+1}/{group_count}...")
    # Combine all groups except the current one for training
    X_train = np.concatenate([X_group[j] for j in range(group_count) if j != i])
    Y_train = np.concatenate([Y_group[j] for j in range(group_count) if j != i])

    # Use the current group for testing
    X_test = X_group[i]
    Y_test = Y_group[i]

    all_outputs.append(train_variant(i, X_train, Y_train, X_test, Y_test))

all_outputs = np.concatenate(all_outputs, axis=0)

Training variant 1/17...
Test Loss: 0.0118
tensor([0.0897, 0.8948, 0.4808, 0.5861])
tensor([0.2362, 0.8007, 0.3876, 0.5874])
Training variant 2/17...
Test Loss: 0.0186
tensor([0.5223, 0.4314, 0.0452, 0.1448])
tensor([0.5011, 0.3330, 0.0409, 0.1963])
Training variant 3/17...
Test Loss: 0.0077
tensor([0.7503, 0.5498, 0.0632, 0.3876])
tensor([0.6853, 0.4687, 0.1316, 0.4014])
Training variant 4/17...
Test Loss: 0.0159
tensor([0.3128, 0.5623, 0.1225, 0.1735])
tensor([0.4701, 0.3977, 0.0347, 0.2133])
Training variant 5/17...
Test Loss: 0.0153
tensor([0.7801, 0.7286, 0.1756, 0.7593])
tensor([0.6727, 0.6688, 0.1438, 0.6232])
Training variant 6/17...
Test Loss: 0.0068
tensor([0.4515, 0.2871, 0.0505, 0.0632])
tensor([0.4608, 0.3719, 0.0138, 0.2161])
Training variant 7/17...
Test Loss: 0.0112
tensor([0.2268, 0.9244, 0.5247, 0.6315])
tensor([0.2259, 0.9126, 0.4064, 0.5658])
Training variant 8/17...
Test Loss: 0.0108
tensor([0.7696, 0.4067, 0.0516, 0.4593])
tensor([0.6867, 0.4464, 0.0539, 0.3584])


In [10]:
print(Y_np[1])
print(all_outputs[1])
print(id_names.iloc[1])

[0.4604929  0.4059707  0.02833204 0.2100404 ]
[0.5076801  0.40779567 0.09097707 0.20966896]
id            ae95d67aee0a4e5988122059dca1727a
first_name                               Ernst
last_name                             Wandfluh
Name: 1, dtype: object


In [11]:
# Prepare DataFrame for export
spiders_votes_df = id_names.copy()
for idx in range(4):
    spiders_votes_df[str(idx)] = all_outputs[:, idx]

spiders_votes_df = spiders_votes_df.sort_values(by=["first_name", "last_name"]).reset_index(drop=True)

spiders_votes_df.to_csv("spiders_votes.csv", columns=["id", "first_name", "last_name", "0", "1", "2", "3"], index=False)