# Comment

1. `nn.DistributedDataParallel` is reported to be much faster than `nn.DataParallel` (~50%, according to the posts below):

  - https://zhuanlan.zhihu.com/p/95700549

  - https://zhuanlan.zhihu.com/p/68717029

---

2. apex uses `nn.DistributedDataParallel` by default (but the code below uses `nn.DataParallel`):

  - https://github.com/NVIDIA/apex
  - https://github.com/NVIDIA/apex/tree/master/examples/imagenet
  
**TODO: try replacing `nn.DataParallel` with `nn.DistributedDataParallel`.**

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell executes, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import torch.optim as optim
import torch

from efficientnet_pytorch import EfficientNet
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torch.optim import lr_scheduler

from hw_grapheme.train import generate_stratified_k_fold_index, train_model
from hw_grapheme.utils import load_model_weight
from hw_grapheme.data_pipeline import create_dataloaders, load_data
from hw_grapheme.model import EfficientNet_0
from hw_grapheme.loss_func import Loss_combine

from torchtools.optim import RangerLars, RAdam
from torch.optim import Optimizer

from apex import amp
from apex.parallel import DistributedDataParallel

To use this log_lamb_rs, please run 'pip install tensorboardx'. Also you must have Tensorboard running to see results


In [3]:
# Presumably an exploratory look at which DistributedDataParallel class to use.
# NOTE(review): `or` returns its first truthy operand, so this expression always
# evaluates to the torch class and the apex side is never evaluated. That is also
# why the unbound name `apex` (only `amp` and `DistributedDataParallel` are imported
# from it at this point) does not raise a NameError here. It does not compare the two.
torch.nn.parallel.DistributedDataParallel or apex.parallel.DistributedDataParallel

torch.nn.parallel.distributed.DistributedDataParallel

In [4]:
def test_apex(mixed_precision, cuda_parallel, batch_size, opt_level):
    # not support in nb
    # if mixed_precision and cuda_parallel:
    #     torch.cuda.set_device(0)
    #     torch.distributed.init_process_group(backend="nccl", init_method='env://')

    # load data 
    pickle_paths = [
        "../data/processed_data/size_224/train_data_0.pickle",
    #     "../data/processed_data/size_224/train_data_1.pickle",
    #     "../data/processed_data/size_224/train_data_2.pickle",
    #     "../data/processed_data/size_224/train_data_3.pickle",
    ]

    image_data, name_data, label_data = load_data(pickle_paths)
    
    batch_size = batch_size
    num_workers = 6

    pin_memory = True
    n_epoch = 1

    n_splits = 5
    random_seed = 2020

    train_idx_list, valid_idx_list = generate_stratified_k_fold_index(
        image_data, label_data, n_splits, random_seed
    )

    train_idx = train_idx_list[0]
    valid_idx = valid_idx_list[0]

    # create loss function
    criterion = Loss_combine()

    # create data_transforms
    data_transforms = {
        'train': transforms.Compose([
            transforms.ToPILImage(),
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
        ]),
        'val': transforms.Compose([
            transforms.ToPILImage(),
            transforms.Grayscale(num_output_channels=3),
            transforms.ToTensor(),
        ]),
    }

    # create model 
    eff_b0 = EfficientNet_0()

    # create optimizer
    optimizer_ft = optim.Adam(eff_b0.parameters())

    # create data loader
    data_loaders = create_dataloaders(
        image_data, name_data, label_data, train_idx, valid_idx, 
        data_transforms, batch_size, num_workers, pin_memory
    )
    
    if mixed_precision and cuda_parallel:
        eff_b0.to("cuda")
        eff_b0, optimizer_ft = amp.initialize(eff_b0, optimizer_ft, opt_level=opt_level)
        eff_b0 = nn.DataParallel(eff_b0)
#         eff_b0 = apex.parallel.DistributedDataParallel(eff_b0)
    elif mixed_precision and not cuda_parallel:
        eff_b0.to("cuda")
        eff_b0, optimizer_ft = amp.initialize(eff_b0, optimizer_ft, opt_level=opt_level)
    elif not mixed_precision and cuda_parallel:
        eff_b0.to("cuda")
        eff_b0 = nn.DataParallel(eff_b0)
    elif not mixed_precision and not cuda_parallel:
        eff_b0.to("cuda")
        
    callbacks = {}

    callbacks = train_model(
        eff_b0, criterion, optimizer_ft, data_loaders,
        mixed_precision, callbacks, num_epochs=n_epoch,
        epoch_scheduler=None, save_dir=None
    )

In [5]:
# Snapshot of per-GPU memory use and utilisation before the benchmark runs below.
!nvidia-smi

Sat Feb  8 20:26:15 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.40       Driver Version: 430.40       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:17:00.0 Off |                  N/A |
| 40%   40C    P0    74W / 250W |      1MiB / 11019MiB |     39%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:65:00.0  On |                  N/A |
| 41%   35C    P8    24W / 250W |   1350MiB / 11016MiB |      3%      Default |
+-------------------------------+----------------------+----------------------+
                                                                            

In [6]:
# Run 1 - baseline: full precision, single GPU, batch size 64.
# Throughput: ~5.2 iterations/s (~5.2 x 64 = ~333 samples/s).
# Value pairs below are presumably "GPU 0, GPU 1" (memory likely in MiB, as shown by nvidia-smi).
#   GPU RAM  - before start: 1, 1288    after start: 7146, 1299
#   GPU util - before start: 0%, 7%     after start: 94%, 10%
# opt_level is still passed, but test_apex only uses it when mixed_precision is on.

mixed_precision, cuda_parallel = False, False
batch_size, opt_level = 64, "O1"

test_apex(
    mixed_precision=mixed_precision,
    cuda_parallel=cuda_parallel,
    batch_size=batch_size,
    opt_level=opt_level,
)

Load data done, shape: (50210, 224, 224), (50210,), (50210, 3)
StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
Creating train dataloader...
Creating test dataloader...
Epoch 0/0
----------


HBox(children=(FloatProgress(value=0.0, max=628.0), HTML(value='')))


Train Loss: 2.4181, root_acc: 0.2782, vowel_acc: 0.4066, consonant_acc: 0.6763, combined_acc: 0.4098


HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))


Val Loss: 1.7417, root_acc: 0.4415, vowel_acc: 0.5016, consonant_acc: 0.7637, combined_acc: 0.5371
In epoch 0, highest val accuracy increases from 0.0 to 0.5370942043417646.
In epoch 0, lowest val loss decreases from 999 to 1.7417345596865055.

Training complete in 2m 9s
Best Combnied Acc: 0.537094


In [7]:
# Run 2 - full precision, nn.DataParallel, batch size 128.
# Throughput (mode): ~6.3 iterations/s (~6.3 x 128 = ~806 samples/s).
# Value pairs below are presumably "GPU 0, GPU 1" (memory likely in MiB, as shown by nvidia-smi).
#   GPU RAM  - before start: 1, 1288    after start: 4090, 5368
#   GPU util - before start: 0%, 7%     after start: 61%, 57%
# opt_level is still passed, but test_apex only uses it when mixed_precision is on.

mixed_precision, cuda_parallel = False, True
batch_size, opt_level = 128, "O1"

test_apex(
    mixed_precision=mixed_precision,
    cuda_parallel=cuda_parallel,
    batch_size=batch_size,
    opt_level=opt_level,
)

Load data done, shape: (50210, 224, 224), (50210,), (50210, 3)
StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
Creating train dataloader...
Creating test dataloader...
Epoch 0/0
----------


HBox(children=(FloatProgress(value=0.0, max=314.0), HTML(value='')))


Train Loss: 2.5650, root_acc: 0.2556, vowel_acc: 0.4271, consonant_acc: 0.6486, combined_acc: 0.3967


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Val Loss: 5.2745, root_acc: 0.0083, vowel_acc: 0.1295, consonant_acc: 0.6252, combined_acc: 0.1928
In epoch 0, highest val accuracy increases from 0.0 to 0.1927902808205537.
In epoch 0, lowest val loss decreases from 999 to 5.274504893473291.

Training complete in 1m 24s
Best Combnied Acc: 0.192790


In [8]:
# Run 3 - apex AMP (opt level O1), single GPU, batch size 128.
# Throughput (mode): ~6.0 iterations/s (~6.0 x 128 = ~768 samples/s).
# Value pairs below are presumably "GPU 0, GPU 1" (memory likely in MiB, as shown by nvidia-smi).
#   GPU RAM  - before start: 1, 1288    after start: 4532, 1299
#   GPU util - before start: 0%, 7%     after start: 80%, 10%

mixed_precision, cuda_parallel = True, False
batch_size, opt_level = 128, "O1"

test_apex(
    mixed_precision=mixed_precision,
    cuda_parallel=cuda_parallel,
    batch_size=batch_size,
    opt_level=opt_level,
)

Load data done, shape: (50210, 224, 224), (50210,), (50210, 3)
StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
Creating train dataloader...
Creating test dataloader...
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Epoch 0/0
----------


HBox(children=(FloatProgress(value=0.0, max=314.0), HTML(value='')))


Train Loss: 2.6059, root_acc: 0.2483, vowel_acc: 0.4407, consonant_acc: 0.6387, combined_acc: 0.3940


HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))


Val Loss: 5.0669, root_acc: 0.0254, vowel_acc: 0.2098, consonant_acc: 0.6252, combined_acc: 0.2214
In epoch 0, highest val accuracy increases from 0.0 to 0.22144493128858794.
In epoch 0, lowest val loss decreases from 999 to 5.066860043890945.

Training complete in 1m 38s
Best Combnied Acc: 0.221445


In [9]:
import apex

In [10]:
# Run 4 - apex AMP (opt level O1) + nn.DataParallel, batch size 256.
# Throughput (mode): ~4.9 iterations/s (~4.9 x 256 = ~1254 samples/s).
# Value pairs below are presumably "GPU 0, GPU 1" (memory likely in MiB, as shown by nvidia-smi).
#   GPU RAM  - before start: 1, 1473    after start: 2656, 4057
#   GPU util - before start: 0%, 5%     after start: 44%, 40%
#
# A DistributedDataParallel variant exists in test_apex_mp_parallel.py, but it is
# NOT complete (e.g. it still needs DistributedSampler(train_dataset)). It is launched with:
#   python -m torch.distributed.launch --nproc_per_node=2 test_apex_mp_parallel.py

mixed_precision, cuda_parallel = True, True
batch_size, opt_level = 256, "O1"

test_apex(
    mixed_precision=mixed_precision,
    cuda_parallel=cuda_parallel,
    batch_size=batch_size,
    opt_level=opt_level,
)

Load data done, shape: (50210, 224, 224), (50210,), (50210, 3)
StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
Creating train dataloader...
Creating test dataloader...
Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Epoch 0/0
----------


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


Train Loss: 2.8607, root_acc: 0.1905, vowel_acc: 0.4305, consonant_acc: 0.6312, combined_acc: 0.3607


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


Val Loss: 3.8864, root_acc: 0.0219, vowel_acc: 0.2098, consonant_acc: 0.6252, combined_acc: 0.2197
In epoch 0, highest val accuracy increases from 0.0 to 0.21970225054769965.
In epoch 0, lowest val loss decreases from 999 to 3.886394448689649.

Training complete in 1m 1s
Best Combnied Acc: 0.219702
