juliette.jin

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import CIFAR10, MNIST

# MNIST

Load the MNIST dataset (or any other dataset like HAM 10000)

In [2]:
dataset = MNIST(root='data/', download=True, transform=transforms.ToTensor())
train_ds, val_ds = random_split(dataset, [50000, 10000])
train_loader = DataLoader(train_ds, batch_size=128)
val_loader = DataLoader(val_ds, batch_size=128)

Extract two subsets of 600 data points each (without intersection)


In [3]:
subset1_indices = torch.randperm(50000)[:600]
subset2_indices = torch.randperm(50000)[:600]
subset1 = torch.utils.data.Subset(train_ds, subset1_indices)
subset2 = torch.utils.data.Subset(train_ds, subset2_indices)

Create a simple Convolutional Neural Network (2 convolutional layers and 2 dense layers, for example)

In [4]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(512, 64)
        self.fc2 = nn.Linear(64, 32)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(x.size(0), -1)  # Adjusted shape to match input size
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

model = Net()

Create a function average_model_parameters(models: iterable, average_weight): iterable that takes a list of models as an argument and returns the weighted average of the parameters of each model.

In [5]:
def average_model_parameters(models, average_weight):
    new_model = Net()
    # For each parameter in the new model, average the corresponding parameters in the input models
    for new_param, *params in zip(new_model.parameters(), *[model.parameters() for model in models]):
        new_param.data = sum(param.data for param in params) * average_weight
    return new_model

Create a function that updates the parameters of a model from a list of values

In [6]:
def update_model_parameters(model, parameters):
    for param, new_param in zip(model.parameters(), parameters):
        param.data = new_param.data

Create a script/code/function that reproduces Algorithm 1, considering that both models are on your machine. Use an average_weight=[1/2, 1/2]. Reuse the same setup as in the article (50 examples per local batch)

In [7]:
def algorithm1(model1, model2, X_train1, y_train1, X_train2, y_train2, average_weight):
    # Initialize the models with the same parameters
    update_model_parameters(model1, model2.parameters())

    # Split the training data into local batches
    local_batch_size = 50
    X_train1_batches = X_train1.split(local_batch_size)
    y_train1_batches = y_train1.split(local_batch_size)
    X_train2_batches = X_train2.split(local_batch_size)
    y_train2_batches = y_train2.split(local_batch_size)

    # Train the models on the local data
    for X1, y1, X2, y2 in zip(X_train1_batches, y_train1_batches, X_train2_batches, y_train2_batches):
        # Train model 1 on the local data
        optimizer1 = torch.optim.SGD(model1.parameters(), lr=0.01)
        model1.train()
        optimizer1.zero_grad()
        output1 = model1(X1)
        loss1 = F.nll_loss(output1, y1)
        loss1.backward()
        optimizer1.step()
        # Train model 2 on the local data
        optimizer2 = torch.optim.SGD(model2.parameters(), lr=0.01)
        model2.train()
        optimizer2.zero_grad()
        output2 = model2(X2)
        loss2 = F.nll_loss(output2, y2)
        loss2.backward()
        optimizer2.step()
   
    # Average the parameters of the models
    new_model = average_model_parameters([model1, model2], average_weight)
    return new_model

Train your models without initializing the common parameters and measure the performance on the entire dataset.

In [8]:
model1 = Net()
model2 = Net()

optimizer1 = optim.SGD(model1.parameters(), lr=0.01)
optimizer2 = optim.SGD(model2.parameters(), lr=0.01)

def train(model, optimizer, train_loader):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

def test(model, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    return test_loss, accuracy

epochs = 20

for epoch in range(epochs):
    train(model1, optimizer1, train_loader)
    train(model2, optimizer2, train_loader)
    test_loss1, accuracy1 = test(model1, val_loader)
    test_loss2, accuracy2 = test(model2, val_loader)
    print(f'Epoch: {epoch+1}/{epochs}, Model 1: Test Loss: {test_loss1}, Accuracy: {accuracy1}, Model 2: Test Loss: {test_loss2}, Accuracy: {accuracy2}')

Epoch: 1/20, Model 1: Test Loss: 1.824346145248413, Accuracy: 62.83, Model 2: Test Loss: 1.8087377834320069, Accuracy: 54.92
Epoch: 2/20, Model 1: Test Loss: 0.6467162405967712, Accuracy: 82.0, Model 2: Test Loss: 0.6595020809173584, Accuracy: 80.24
Epoch: 3/20, Model 1: Test Loss: 0.433419891166687, Accuracy: 88.38, Model 2: Test Loss: 0.4738588337421417, Accuracy: 86.25
Epoch: 4/20, Model 1: Test Loss: 0.36560064415931703, Accuracy: 89.83, Model 2: Test Loss: 0.4000449149131775, Accuracy: 88.31
Epoch: 5/20, Model 1: Test Loss: 0.30979505019187925, Accuracy: 91.19, Model 2: Test Loss: 0.34133145456314085, Accuracy: 90.13
Epoch: 6/20, Model 1: Test Loss: 0.27706118597984314, Accuracy: 92.29, Model 2: Test Loss: 0.310067565536499, Accuracy: 90.65
Epoch: 7/20, Model 1: Test Loss: 0.2525043376922607, Accuracy: 92.83, Model 2: Test Loss: 0.2768687617182732, Accuracy: 91.78
Epoch: 8/20, Model 1: Test Loss: 0.22703383046388625, Accuracy: 93.55, Model 2: Test Loss: 0.2491548327922821, Accurac

Train your models with the initialization of common parameters and verify that the performance is better.

In [9]:
X_train1 = torch.cat([data for data, _ in DataLoader(subset1, batch_size=600)])
X_train2 = torch.cat([data for data, _ in DataLoader(subset2, batch_size=600)])
y_train1 = torch.cat([target for _, target in DataLoader(subset1, batch_size=600)])
y_train2 = torch.cat([target for _, target in DataLoader(subset2, batch_size=600)])

average_weight = torch.tensor([1/2, 1/2])

# Fix the average_model_parameters function to correctly average the parameters
def average_model_parameters(models, average_weight):
    new_model = Net()
    for new_param, params in zip(new_model.parameters(), zip(*[model.parameters() for model in models])):
        new_param.data = sum(w * param.data for w, param in zip(average_weight, params))
    return new_model

new_model = algorithm1(model1, model2, X_train1, y_train1, X_train2, y_train2, average_weight)
test_loss, accuracy = test(new_model, val_loader)
print(f'New Model: Test Loss: {test_loss}, Accuracy: {accuracy}')

New Model: Test Loss: 0.13597940123975277, Accuracy: 95.8


Reduce the number of data points in each sub-batch. What is the minimum number of data points necessary for the final model to have acceptable performance? Repeat the study on CIFAR-10

In [10]:
for batch_size in [256, 128, 64, 32, 16, 8, 4, 2, 1]:
    subset1_indices = torch.randperm(50000)[:batch_size]
    subset2_indices = torch.randperm(50000)[:batch_size]
    subset1 = torch.utils.data.Subset(train_ds, subset1_indices)
    subset2 = torch.utils.data.Subset(train_ds, subset2_indices)
    X_train1 = torch.cat([data for data, _ in DataLoader(subset1, batch_size=batch_size)])
    X_train2 = torch.cat([data for data, _ in DataLoader(subset2, batch_size=batch_size)])
    y_train1 = torch.cat([target for _, target in DataLoader(subset1, batch_size=batch_size)])
    y_train2 = torch.cat([target for _, target in DataLoader(subset2, batch_size=batch_size)])
    new_model = algorithm1(model1, model2, X_train1, y_train1, X_train2, y_train2, average_weight)
    test_loss, accuracy = test(new_model, val_loader)
    print(f'Batch Size: {batch_size}, New Model: Test Loss: {test_loss}, Accuracy: {accuracy}')

Batch Size: 256, New Model: Test Loss: 0.132783861297369, Accuracy: 96.0
Batch Size: 128, New Model: Test Loss: 0.13173209536969663, Accuracy: 95.88
Batch Size: 64, New Model: Test Loss: 0.13790862982869148, Accuracy: 95.77
Batch Size: 32, New Model: Test Loss: 0.1397119868248701, Accuracy: 95.76
Batch Size: 16, New Model: Test Loss: 0.1367542265355587, Accuracy: 95.99
Batch Size: 8, New Model: Test Loss: 0.13529405716359616, Accuracy: 95.96
Batch Size: 4, New Model: Test Loss: 0.1372001058280468, Accuracy: 95.84
Batch Size: 2, New Model: Test Loss: 0.2897350154221058, Accuracy: 90.33
Batch Size: 1, New Model: Test Loss: 0.28947960694432257, Accuracy: 90.34


**What is the minimum number of data points necessary for the final model to have acceptable performance?**

A baseline accuracy would be around 92% or 93% for MNIST.

But as we are using a CNN we should aim for a higher accuracy, more around 97% or more, which we are closely getting to the state of the art.

Maybe 96% would be a good threshold. And if we had done a bit more of training we could have reached 97% or more.

So I'd go with at least 256 here.

# CIFAR-10

In [11]:
cf_dataset = CIFAR10(root='data/', download=True, transform=transforms.ToTensor())
cf_train_ds, cf_val_ds = random_split(cf_dataset, [45000, 5000])
cf_train_loader = DataLoader(cf_train_ds, batch_size=128)
cf_val_loader = DataLoader(cf_val_ds, batch_size=128)

Files already downloaded and verified


In [12]:
cf_subset1_indices = torch.randperm(len(cf_train_ds))[:600]
cf_subset2_indices = torch.randperm(len(cf_train_ds))[:600]
cf_subset1 = torch.utils.data.Subset(cf_train_ds, cf_subset1_indices)
cf_subset2 = torch.utils.data.Subset(cf_train_ds, cf_subset2_indices)

In [13]:
class CFNet(nn.Module):
    def __init__(self):
        super(CFNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(32 * 5 * 5, 64)
        self.fc2 = nn.Linear(64, 32)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

cf_model = CFNet()
cf_model1 = CFNet()
cf_model2 = CFNet()

cf_optimizer1 = optim.SGD(cf_model1.parameters(), lr=0.01)
cf_optimizer2 = optim.SGD(cf_model2.parameters(), lr=0.01)

In [14]:
epochs = 20

for epoch in range(epochs):
    train(cf_model1, cf_optimizer1, cf_train_loader)
    train(cf_model2, cf_optimizer2, cf_train_loader)
    cf_test_loss1, cf_accuracy1 = test(cf_model1, cf_val_loader)
    cf_test_loss2, cf_accuracy2 = test(cf_model2, cf_val_loader)
    print(f'Epoch: {epoch+1}/{epochs}, Model 1: Test Loss: {cf_test_loss1}, Accuracy: {cf_accuracy1}, Model 2: Test Loss: {cf_test_loss2}, Accuracy: {cf_accuracy2}')

Epoch: 1/20, Model 1: Test Loss: 2.263892604446411, Accuracy: 15.06, Model 2: Test Loss: 2.269119260787964, Accuracy: 15.82
Epoch: 2/20, Model 1: Test Loss: 2.225563650894165, Accuracy: 15.68, Model 2: Test Loss: 2.2297130237579346, Accuracy: 14.46
Epoch: 3/20, Model 1: Test Loss: 2.202763724517822, Accuracy: 16.58, Model 2: Test Loss: 2.203890519332886, Accuracy: 15.52
Epoch: 4/20, Model 1: Test Loss: 2.1784815063476564, Accuracy: 18.46, Model 2: Test Loss: 2.16588244972229, Accuracy: 15.8
Epoch: 5/20, Model 1: Test Loss: 2.15202410697937, Accuracy: 18.94, Model 2: Test Loss: 2.1069220703125, Accuracy: 17.44
Epoch: 6/20, Model 1: Test Loss: 2.1064886562347414, Accuracy: 23.04, Model 2: Test Loss: 2.059283430480957, Accuracy: 23.76
Epoch: 7/20, Model 1: Test Loss: 2.047286938095093, Accuracy: 25.36, Model 2: Test Loss: 2.021228925704956, Accuracy: 24.52
Epoch: 8/20, Model 1: Test Loss: 1.9901579093933106, Accuracy: 28.14, Model 2: Test Loss: 2.0120998863220216, Accuracy: 26.92
Epoch: 9

In [15]:
cf_X_train1 = torch.cat([data for data, _ in DataLoader(cf_subset1, batch_size=len(cf_subset1))])
cf_X_train2 = torch.cat([data for data, _ in DataLoader(cf_subset2, batch_size=len(cf_subset2))])
cf_y_train1 = torch.cat([target for _, target in DataLoader(cf_subset1, batch_size=len(cf_subset1))])
cf_y_train2 = torch.cat([target for _, target in DataLoader(cf_subset2, batch_size=len(cf_subset2))])

cf_average_weight = torch.tensor([1/2, 1/2])

cf_new_model = algorithm1(cf_model1, cf_model2, cf_X_train1, cf_y_train1, cf_X_train2, cf_y_train2, cf_average_weight)
cf_test_loss, cf_accuracy = test(cf_new_model, cf_val_loader)
print(f'New Model: Test Loss: {cf_test_loss}, Accuracy: {cf_accuracy}')

New Model: Test Loss: 1.6659834827423097, Accuracy: 40.06


In [16]:
for batch_size in [256, 128, 64, 32, 16, 8, 4, 2, 1]:
    cf_subset1_indices = torch.randperm(len(cf_train_ds))[:batch_size]
    cf_subset2_indices = torch.randperm(len(cf_train_ds))[:batch_size]
    cf_subset1 = torch.utils.data.Subset(cf_train_ds, cf_subset1_indices)
    cf_subset2 = torch.utils.data.Subset(cf_train_ds, cf_subset2_indices)
    cf_X_train1 = torch.cat([data for data, _ in DataLoader(cf_subset1, batch_size=len(cf_subset1))])
    cf_X_train2 = torch.cat([data for data, _ in DataLoader(cf_subset2, batch_size=len(cf_subset2))])
    cf_y_train1 = torch.cat([target for _, target in DataLoader(cf_subset1, batch_size=len(cf_subset1))])
    cf_y_train2 = torch.cat([target for _, target in DataLoader(cf_subset2, batch_size=len(cf_subset2))])
    cf_new_model = algorithm1(cf_model1, cf_model2, cf_X_train1, cf_y_train1, cf_X_train2, cf_y_train2, cf_average_weight)
    cf_test_loss, cf_accuracy = test(cf_new_model, cf_val_loader)
    print(f'Batch Size: {batch_size}, New Model: Test Loss: {cf_test_loss}, Accuracy: {cf_accuracy}')

Batch Size: 256, New Model: Test Loss: 1.8403869047164918, Accuracy: 29.02
Batch Size: 128, New Model: Test Loss: 1.6881108165740968, Accuracy: 39.82
Batch Size: 64, New Model: Test Loss: 1.689157536506653, Accuracy: 38.9
Batch Size: 32, New Model: Test Loss: 1.719276428604126, Accuracy: 36.38
Batch Size: 16, New Model: Test Loss: 1.7397148279190064, Accuracy: 33.98
Batch Size: 8, New Model: Test Loss: 1.7265831121444701, Accuracy: 34.28
Batch Size: 4, New Model: Test Loss: 1.7187852586746215, Accuracy: 35.54
Batch Size: 2, New Model: Test Loss: 2.701066907119751, Accuracy: 14.06
Batch Size: 1, New Model: Test Loss: 2.4819874057769775, Accuracy: 18.96


So now we've repeated the study on CIFAR-10.

The baseline accuracy would be around 10% or ~20 to 30% for the simpliest models (random, linear models, etc)

For a basic CNN we should aim for a higher accuracy, more around 70% or more, which is not what we are getting here. We are in between a random model and a basic CNN.