In [1]:
GROUP_SIZE = 3

In [2]:
import pandas as pd
import json
from sklearn.utils import shuffle

with open("merged_votes.json", "r") as f:
    data = json.load(f)

id_names = []
X = []
Y = []

def norm(vote):
    vote = int(vote)
    if vote == 0:
        return (0, 1)
    elif vote == 1:
        return (1, 1)
    elif vote == 2:
        return (-1, 1)
    else:
        return (0, 0)

for person in data:
    id_names.append(
        {
            "id": person["id"],
            "first_name": person["first_name"],
            "last_name": person["last_name"],
        }
    )
    X.append([item for vote in person["votes"] for item in norm(vote)])
    Y.append(person["spider"])

    id_names, X, Y = shuffle(id_names, X, Y, random_state=42)

id_names = pd.DataFrame(id_names)
X = pd.DataFrame(X)
Y = pd.DataFrame(Y)

display(id_names.head())
display(X.head())
display(Y.head())

Unnamed: 0,id,first_name,last_name
0,95761945c08c4ddd80c4860fee1ff50e,Dominique,Bühler
1,ae95d67aee0a4e5988122059dca1727a,Ernst,Wandfluh
2,6180905781604d9ebb8bcb4d9b564ffb,Sandra,Hess
3,e9b225fd214c4297a97c8d19441a4d49,Beatrix,Hurni
4,4021b2678cc34f008dbd134997fbae58,Maurane,Riesen


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3916,3917,3918,3919,3920,3921,3922,3923,3924,3925
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1,1,-1,1,1,1,-1,1,1,1,...,1,1,1,1,-1,1,-1,1,1,1
3,-1,1,-1,1,0,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,-1,1,1,1,-1,1,1,1,1,1,...,1,1,-1,1,1,1,1,1,1,1


Unnamed: 0,0,1,2,3
0,0.7335763,0.891815,0.47802696,0.77272904
1,0.41777325,0.44645295,0.27670503,0.23962282
2,0.18807982,0.3732473,0.23439717,0.31234184
3,0.62956667,0.6221469,0.48247215,0.3682272
4,0.5900239,0.58399355,0.60055274,0.79612595


In [3]:
import numpy as np
import torch
import torch.nn as nn

# Convert pandas DataFrames to numpy arrays
X_np = X.values.astype('float32')
Y_np = Y.values.astype('float32')



# Define the neural network
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(X.shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, Y.shape[1]),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

def train_variant(i, X_train, Y_train, X_test, Y_test):
    # Convert to torch tensors
    X_train_tensor = torch.tensor(X_train)
    Y_train_tensor = torch.tensor(Y_train)
    X_test_tensor = torch.tensor(X_test)
    Y_test_tensor = torch.tensor(Y_test)

    model = SimpleNN()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    epochs = 200
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, Y_train_tensor)
        loss.backward()
        optimizer.step()

    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_loss = criterion(test_outputs, Y_test_tensor)
        print(f"Test Loss: {test_loss.item():.4f}")

    print(Y_test_tensor[0])
    print(test_outputs[0])

    return test_outputs

group_count = X_np.shape[0] // GROUP_SIZE

X_group = np.array_split(X_np, group_count, axis=0)
Y_group = np.array_split(Y_np, group_count, axis=0)


all_outputs = []

for i in range(group_count):
    print(f"Training variant {i+1}/{group_count}...")
    # Combine all groups except the current one for training
    X_train = np.concatenate([X_group[j] for j in range(group_count) if j != i])
    Y_train = np.concatenate([Y_group[j] for j in range(group_count) if j != i])

    # Use the current group for testing
    X_test = X_group[i]
    Y_test = Y_group[i]

    all_outputs.append(train_variant(i, X_train, Y_train, X_test, Y_test))

all_outputs = np.concatenate(all_outputs, axis=0)

Training variant 1/57...
Test Loss: 0.0086
tensor([0.7336, 0.8918, 0.4780, 0.7727])
tensor([0.5362, 0.7704, 0.5328, 0.5864])
Training variant 2/57...
Test Loss: 0.0194
tensor([0.6296, 0.6221, 0.4825, 0.3682])
tensor([0.6301, 0.7976, 0.5141, 0.6826])
Training variant 3/57...
Test Loss: 0.0169
tensor([0.5865, 0.4815, 0.3992, 0.7798])
tensor([0.4623, 0.5189, 0.3702, 0.6889])
Training variant 4/57...
Test Loss: 0.0081
tensor([0.6265, 0.8045, 0.5452, 0.7240])
tensor([0.4778, 0.7323, 0.5350, 0.5796])
Training variant 5/57...
Test Loss: 0.0074
tensor([0.6671, 0.8435, 0.4938, 0.8171])
tensor([0.4741, 0.7934, 0.5072, 0.7022])
Training variant 6/57...
Test Loss: 0.0212
tensor([0.5041, 0.4015, 0.3036, 0.1927])
tensor([0.4173, 0.4939, 0.3283, 0.3513])
Training variant 7/57...
Test Loss: 0.0034
tensor([0.6180, 0.6712, 0.3819, 0.6536])
tensor([0.6341, 0.7159, 0.3794, 0.7752])
Training variant 8/57...
Test Loss: 0.0077
tensor([0.2047, 0.4577, 0.1327, 0.4326])
tensor([0.3114, 0.4361, 0.2584, 0.4634])


In [4]:
print(Y_np[1])
print(all_outputs[1])
print(id_names.iloc[1])

[0.41777325 0.44645295 0.27670503 0.23962282]
[0.424086   0.47529915 0.26487848 0.23789741]
id            ae95d67aee0a4e5988122059dca1727a
first_name                               Ernst
last_name                             Wandfluh
Name: 1, dtype: object


In [5]:
# Prepare DataFrame for export
spiders_votes_df = id_names.copy()
for idx in range(4):
    spiders_votes_df[str(idx)] = all_outputs[:, idx]

spiders_votes_df = spiders_votes_df.sort_values(by=["first_name", "last_name"]).reset_index(drop=True)

spiders_votes_df.to_csv("spiders_votes.csv", columns=["id", "first_name", "last_name", "0", "1", "2", "3"], index=False)