In [1]:
!git clone https://github.com/weiaicunzai/pytorch-cifar100

Cloning into 'pytorch-cifar100'...
remote: Enumerating objects: 1188, done.[K
remote: Total 1188 (delta 0), reused 0 (delta 0), pack-reused 1188[K
Receiving objects: 100% (1188/1188), 530.69 KiB | 383.00 KiB/s, done.
Resolving deltas: 100% (753/753), done.


In [2]:
# Work from inside the cloned repository so that `from models.resnet import ...`
# in the import cell resolves against the repository's `models` package.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel will look for a nested `pytorch-cifar100/` directory and fail.
%cd pytorch-cifar100/

/mnt/batch/tasks/shared/LS_root/mounts/clusters/jcyuan-gpu/code/fedml-pipeline/pytorch-cifar100


In [3]:
from models.resnet import resnet18

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.utils.data import random_split

import torchvision
import torchvision.transforms as transforms

import time
import numpy as np

In [4]:
class Node(object):
    """One participant in a simulated federated-learning run.

    A node owns a ``DataLoader`` over the dataset it was given and, once one of
    the model-creating methods has been called, a ``resnet18`` (from
    ``models.resnet``) wrapped in ``nn.DataParallel`` and stored in
    ``self.model``.

    The model is placed on CUDA when it is available and on the CPU otherwise;
    input batches are always moved to whichever device the model lives on.
    """

    def __init__(self, dataset, shuffle, batch_size=128):
        """Wrap ``dataset`` in a ``DataLoader``.

        Args:
            dataset: dataset handed to ``torch.utils.data.DataLoader``.
            shuffle: whether the loader reshuffles the data every epoch.
            batch_size: number of samples per batch.
        """
        self.dataloader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle, num_workers=2)

    def set_more_hyperp(self, lr, mom, wd):
        """Store the SGD hyper-parameters that ``train_model`` reads."""
        self.LEARNING_RATE = lr
        self.MOMENTUM = mom
        self.WEIGHT_DECAY = wd

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _default_device():
        """Return the device new models are placed on (CUDA if available)."""
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _model_device(self):
        """Return the device holding the current model's parameters."""
        return next(self.model.parameters()).device

    def _install_model(self, state_dict=None):
        """Build a fresh ``resnet18``, optionally load weights, and store it.

        ``self.model`` is only assigned once the weights have loaded
        successfully, so a failed load never leaves a half-initialised,
        unwrapped network behind.
        """
        network = resnet18()
        if state_dict is not None:
            network.load_state_dict(state_dict)
        self.model = nn.DataParallel(network).to(self._default_device())

    # ------------------------------------------------------------------
    # Model life-cycle
    # ------------------------------------------------------------------
    def create_model(self):
        """Replace ``self.model`` with a newly initialised network."""
        self._install_model()

    def load_model(self, other_node):
        """Replace ``self.model`` with a copy of ``other_node``'s weights."""
        self._install_model(other_node.model.module.state_dict())

    def aggregate_then_load_model(self, other_nodes):
        """Replace ``self.model`` with the uniform average of other models.

        Every entry of the state dict is averaged with weight
        ``1 / len(other_nodes)``. Non-floating entries, if any, are averaged in
        floating point and cast back when they are copied into the new model.

        Raises:
            ValueError: if ``other_nodes`` is empty (nothing to average).
        """
        states = [node.model.module.state_dict() for node in other_nodes]
        if not states:
            raise ValueError('aggregate_then_load_model() needs at least one node')

        weight = 1.0 / len(states)
        averaged = {}
        for name in states[0]:
            total = states[0][name] * weight
            for state in states[1:]:
                total = total + state[name] * weight
            averaged[name] = total
        self._install_model(averaged)

    def save_model(self, path):
        """Write the unwrapped model's state dict to ``path``."""
        torch.save(self.model.module.state_dict(), path)

    def load_saved_model(self, path):
        """Replace ``self.model`` with weights previously written by ``save_model``.

        NOTE(security): ``torch.load`` unpickles the file; only load checkpoints
        from a trusted source.
        """
        # map_location='cpu' lets a checkpoint saved from a GPU be restored on
        # any host; _install_model moves the network to the right device after.
        weights = torch.load(path, map_location='cpu')
        self._install_model(weights)

    # ------------------------------------------------------------------
    # Metrics
    # ------------------------------------------------------------------
    class AverageMeter(object):
        """Tracks the latest value and the weighted running average."""

        def __init__(self):
            self.reset()

        def reset(self):
            """Clear all accumulated statistics."""
            self.val = 0
            self.avg = 0
            self.sum = 0
            self.count = 0

        def update(self, val, n=1):
            """Record ``val`` observed over ``n`` samples."""
            self.val = val
            self.sum += val * n
            self.count += n
            # Guard against a zero total weight (e.g. a first update with n=0).
            self.avg = self.sum / self.count if self.count else 0

    def accuracy(self, output, target, topk=(1,)):
        """Computes the precision@k for the specified values of k.

        Args:
            output: score tensor indexed as (sample, class).
            target: tensor with one class index per sample.
            topk: iterable of ``k`` values to evaluate.

        Returns:
            A list with one 0-dim tensor per ``k`` holding the percentage of
            samples whose target is among the ``k`` highest-scoring classes.
        """
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            # `correct` inherits the transposed layout of `pred`, so a slice of
            # more than one row is not contiguous; reshape() copies when needed
            # whereas view() raises.
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

    # ------------------------------------------------------------------
    # Training / evaluation
    # ------------------------------------------------------------------
    def train_model(self, which_epoch):
        """Run one pass of SGD over ``self.dataloader``.

        A new optimizer is created on every call, so momentum state does not
        carry over from one call to the next.

        Args:
            which_epoch: label copied into the returned dict under ``'epoch'``.

        Returns:
            Dict with ``'epoch'`` and the averages ``'batch_time.avg'``,
            ``'data_time.avg'``, ``'losses.avg'`` and ``'top1.avg'``.

        Raises:
            AttributeError: if ``set_more_hyperp`` has not been called.
        """
        missing = [attr for attr in ('LEARNING_RATE', 'MOMENTUM', 'WEIGHT_DECAY')
                   if not hasattr(self, attr)]
        if missing:
            raise AttributeError(
                'call set_more_hyperp() before train_model(); missing: '
                + ', '.join(missing))

        device = self._model_device()
        criterion = nn.CrossEntropyLoss().to(device)
        optimizer = optim.SGD(
            self.model.parameters(), self.LEARNING_RATE,
            momentum=self.MOMENTUM, weight_decay=self.WEIGHT_DECAY)

        batch_time = self.AverageMeter()
        data_time = self.AverageMeter()
        losses = self.AverageMeter()
        top1 = self.AverageMeter()

        self.model.train()

        end = time.time()
        for images, target in self.dataloader:
            # measure data loading time
            data_time.update(time.time() - end)

            images, target = images.to(device), target.to(device)

            # compute output
            output = self.model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec = self.accuracy(output, target)[0]
            losses.update(loss.item(), images.size(0))
            top1.update(prec.item(), images.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

        return {
            'epoch': which_epoch,
            'batch_time.avg': batch_time.avg,
            'data_time.avg': data_time.avg,
            'losses.avg': losses.avg,
            'top1.avg': top1.avg}

    def evaluate_model(self):
        """Score the current model on ``self.dataloader`` without gradients.

        Returns:
            Dict with the averages ``'batch_time.avg'``, ``'losses.avg'`` and
            ``'top1.avg'``.
        """
        device = self._model_device()
        criterion = nn.CrossEntropyLoss().to(device)
        batch_time = self.AverageMeter()
        losses = self.AverageMeter()
        top1 = self.AverageMeter()

        # switch to evaluate mode
        self.model.eval()

        end = time.time()
        with torch.no_grad():
            for images, target in self.dataloader:
                images, target = images.to(device), target.to(device)

                # compute output
                output = self.model(images)
                loss = criterion(output, target)

                # measure accuracy and record loss
                prec = self.accuracy(output, target)[0]
                losses.update(loss.item(), images.size(0))
                top1.update(prec.item(), images.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

        return {
            'batch_time.avg': batch_time.avg,
            'losses.avg': losses.avg,
            'top1.avg': top1.avg}

In [6]:
# Per-channel statistics handed to transforms.Normalize for both splits
# (presumably measured on the CIFAR-100 training images -- TODO confirm).
CIFAR100_TRAIN_MEAN = [
    0.5070751592371323,
    0.48654887331495095,
    0.4409178433670343,
]
CIFAR100_TRAIN_STD = [
    0.2673342858792401,
    0.2564384629170883,
    0.27615047132568404,
]

# Training split: random crop / flip / rotation augmentation, then normalise.
# This call is the one that downloads the archive into ./data.
train_dataset = torchvision.datasets.CIFAR100(
    root='./data',
    train=True,
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR100_TRAIN_MEAN, CIFAR100_TRAIN_STD),
    ]),
    download=True,
)

# Test split: no augmentation, same normalisation. It does not download, so it
# relies on the files already being present under ./data.
test_dataset = torchvision.datasets.CIFAR100(
    root='./data',
    train=False,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR100_TRAIN_MEAN, CIFAR100_TRAIN_STD),
    ]),
    download=False,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [None]:
class Simulator:
    """Skeleton for the experiment driver; every method is still a stub.

    All methods take ``self`` so they can be called on an instance. The method
    bodies are intentionally empty placeholders.
    """

    def __init__(self):
        pass

    def __trainSingleModel(self):
        """Placeholder (not implemented yet)."""
        pass

    def __trainFederatedModel(self):
        """Placeholder (not implemented yet)."""
        pass

    # NOTE(review): the name is spelled `evalute`; kept as-is so that existing
    # references keep working -- confirm before renaming to `evaluate`.
    def evalute(self):
        """Placeholder (not implemented yet)."""
        pass

    def training(self):
        """Placeholder (not implemented yet)."""
        pass
        