In [1]:
import os
import datetime

import numpy as np
import pandas as pd

import torch
import torch.multiprocessing as mp
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
from swadist.utils import spawn_fn

# select the GNU threading layer for MKL -- mp.spawn may throw an error without this
os.environ['MKL_THREADING_LAYER'] = 'GNU'

# hand unused cached GPU memory back before any training starts
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# base seed = number of seconds between 2022-04-11 and today's date.
# Subtracting two dates yields a whole number of days, so days * 86400 is
# exactly int(total_seconds()). NOTE: the seed changes every calendar day.
seed = (datetime.date.today() - datetime.date(2022, 4, 11)).days * 86400
print(f'seed: {int(seed)}')

seed: 259200


### Two-phase training: SGD + Codistillation

In phase 1, we train ResNet-8 asynchronously over the whole training set using the optimal hyperparameters given in [Shallue et al. 2019](http://arxiv.org/abs/1811.03600) (SGD w/ Nesterov momentum).

In phase 2, we run codistillation ([Hinton et al. 2015](http://arxiv.org/abs/1503.02531)).

In [2]:
# send training logs to Tensorboard when True
log = False

# how many model replicas are trained
world_size = 2

# effective batch size: overall size of the training minibatches
eff_batch_size = 256

# optimizer settings
lr0 = 2.0 ** -5
momentum = 0.975

# scheduler settings
alpha = 0.25
decay_epochs = 5

# number of training epochs in each phase
epochs_sgd = 5
epochs_codist = 10

# keyword-argument bundles, one per component
dataloader_kwargs = dict(
    dataset='cifar10',
    # the effective batch is divided evenly between the replicas
    batch_size=eff_batch_size // world_size,
    num_workers=4,
    data_parallel=True,
)
model_kwargs = dict(
    in_kernel_size=3,
    stack_sizes=[1, 1, 1],
    n_classes=10,
    batch_norm=False,
)
optimizer_kwargs = dict(
    lr=lr0,
    momentum=momentum,
    nesterov=True,
)
trainer_kwargs = dict(
    log=log,
    name='codist',
)
train_kwargs = dict(
    epochs_sgd=epochs_sgd,
    epochs_codist=epochs_codist,
)
scheduler_kwargs = dict(
    alpha=alpha,
    decay_epochs=decay_epochs,
)

# Positional arguments handed to mp.spawn; the tuple stores references to the
# kwargs dicts themselves (no copies), so later edits to those dicts are
# visible through `args`.
args = (
    world_size,
    dataloader_kwargs,
    model_kwargs,
    optimizer_kwargs,
    trainer_kwargs,
    train_kwargs,
    scheduler_kwargs,
    None,  # swa_scheduler_kwargs
    seed,  # seed on rank i = seed + i
)

In [3]:
%%time

# begin training: start `world_size` processes running spawn_fn and block
# until all of them have finished (join=True)
mp.spawn(
    fn=spawn_fn,
    args=args,
    nprocs=world_size,
    join=True,
)

Rank 1: joined process group on device cuda
Rank 1: torch.manual_seed(259201)
Using DistributedSampler
Number of training samples: 45000
Number of training batches: 176

Rank 1: torch.cuda.manual_seed(259203)
Param preview:
tensor([[[-0.0704,  0.1421,  0.0931],
         [-0.0356,  0.0484,  0.2311],
         [-0.1204,  0.0516, -0.1511]],

        [[-0.1264, -0.0484,  0.2217],
         [ 0.0109,  0.0558, -0.1112],
         [ 0.0887, -0.0500,  0.1206]],

        [[-0.0312,  0.0647, -0.0694],
         [-0.1086, -0.1056,  0.0937],
         [-0.1952, -0.0664, -0.1055]]], device='cuda:1',
       grad_fn=<SelectBackward0>) 

Worker 2/2 starting 15-epoch training loop...
Rank 0: joined process group on device cuda
Rank 0: torch.manual_seed(259200)
Using DistributedSampler
Number of training samples: 45000
Number of training batches: 176

Rank 0: torch.cuda.manual_seed(259202)
Param preview:
tensor([[[-0.0462, -0.0369, -0.0188],
         [-0.2006, -0.0966, -0.1857],
         [ 0.1451, -0.0573,  

### Two-phase training: SGD + Codistillation w/ data partitioning

As before, but this time each model replica is trained on its own disjoint partition of the training set in both phase 1 and phase 2.

In [4]:
%%time

# Switch the shared configuration to the partitioned setup.
# NOTE: these dicts are the very objects stored inside `args`, so updating
# them in place changes what spawn_fn receives below -- and what any later
# re-run of an earlier cell that uses `args` would receive.
dataloader_kwargs.update(data_parallel=False, split_training=True)
trainer_kwargs.update(name='codist-partitioned')

# begin training: start `world_size` processes and wait for all to finish
mp.spawn(
    fn=spawn_fn,
    args=args,
    nprocs=world_size,
    join=True,
)

Rank 0: joined process group on device cuda
Rank 0: torch.manual_seed(259200)
Using SubsetRandomSampler with samples 0 to 22499
Number of training samples: 45000
Number of training batches: 176

Rank 0: torch.cuda.manual_seed(259202)
Param preview:
tensor([[[-0.0462, -0.0369, -0.0188],
         [-0.2006, -0.0966, -0.1857],
         [ 0.1451, -0.0573,  0.0963]],

        [[ 0.0555, -0.0825, -0.1073],
         [-0.0361, -0.0622, -0.1082],
         [ 0.2288,  0.1807, -0.0981]],

        [[-0.0205, -0.0812, -0.0820],
         [ 0.1013,  0.0604, -0.1982],
         [ 0.1049,  0.0410,  0.0205]]], device='cuda:0',
       grad_fn=<SelectBackward0>) 

Worker 1/2 starting 15-epoch training loop...
SGD epochs: 5 | Codistillation epochs: 10 | SWA epochs: 0
DistributedDataParallel: False
Stopping accuracy: None

Rank 1: joined process group on device cuda
Rank 1: torch.manual_seed(259201)
Using SubsetRandomSampler with samples 22500 to 44999
Number of training samples: 45000
Number of training batch