# Example of ColossalAI applied on an external dataset


Creating the dataset

Kindly download the train.zip from https://www.kaggle.com/c/dogs-vs-cats/data

And create Train directory in Colab Notebooks directory
or import dataset and change code accordingly


Mounting the drive

In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

path = "/content/drive/My Drive"

os.chdir(path)
os.listdir(path)

unzipping train.zip


In [None]:
!unzip "/content/drive/MyDrive/Colab Notebooks/train.zip" -d "/content/drive/MyDrive/Colab Notebooks/Train/"

In [3]:
import pandas as pd
import os
import torch

train_df = pd.DataFrame(columns=["img_name","label"])
train_df["img_name"] = os.listdir("/content/drive/MyDrive/Colab Notebooks/Train/train/")
for idx, i in enumerate(os.listdir("/content/drive/MyDrive/Colab Notebooks/Train/train/")):
    if "cat" in i:
        train_df["label"][idx] = 0
    if "dog" in i:
        train_df["label"][idx] = 1

train_df.to_csv (r'train_csv.csv', index = False, header=True)

created train.csv containg the image title along with label

In [4]:
import numpy as np

In [5]:
from torch.utils.data import Dataset
import pandas as pd
import os
from PIL import Image
import torch

class CatsAndDogsDataset(Dataset):
    def __init__(self, root_dir, annotation_file, transform=None):
        self.root_dir = root_dir
        self.annotations = pd.read_csv(annotation_file)
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        img_id = self.annotations.iloc[index, 0]
        img = Image.open(os.path.join(self.root_dir, img_id)).convert("RGB")
        y_label = torch.tensor(float(self.annotations.iloc[index, 1]))
       
        
        if self.transform is not None:
            img = self.transform(img)
        # print(img.dtype)
        # print(y_label.dtype)
        img = np.asarray(np.copy(img), dtype='float32')
        label = np.asarray(np.copy(y_label), dtype='float32')
        img = np.expand_dims(img, axis=0)
        # Load the data into PyTorch tensors
        img = torch.from_numpy(img)
        label = torch.from_numpy(label)
        targets = label.view(1)
        return (img, targets)

Creating the custom dataset


In [6]:
!pip install ColossalAI deepspeed

Collecting ColossalAI
  Downloading colossalai-0.0.1b0-py3-none-any.whl (234 kB)
[K     |████████████████████████████████| 234 kB 3.1 MB/s 
[?25hCollecting deepspeed
  Downloading deepspeed-0.5.8.tar.gz (517 kB)
[K     |████████████████████████████████| 517 kB 40.6 MB/s 
Collecting tensorboardX
  Downloading tensorboardX-2.4.1-py2.py3-none-any.whl (124 kB)
[K     |████████████████████████████████| 124 kB 42.6 MB/s 
Collecting ninja
  Downloading ninja-1.10.2.3-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (108 kB)
[K     |████████████████████████████████| 108 kB 46.5 MB/s 
[?25hCollecting hjson
  Downloading hjson-3.0.2-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 2.5 MB/s 
[?25hCollecting triton
  Downloading triton-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[K     |████████████████████████████████| 18.2 MB 241 kB/s 
Building wheels for collected packages: deepspeed
  Building wheel for deepspeed (setup.py) 

In [7]:
import colossalai
from colossalai.engine import Engine, NoPipelineSchedule
from colossalai.trainer import Trainer
from colossalai.context import Config
import torch

Colossalai should be built with cuda extension to use the FP16 optimizer
Colossalai should be built with cuda extension to use the FP16 optimizer
apex is required for mixed precision training


First, we should initialize distributed environment. Though we just use single GPU in this example, we still need initialize distributed environment for compatibility. We just consider the simplest case here, so we just set the number of parallel processes to 1.

In [8]:
parallel_cfg = Config(dict(parallel=dict(
    data=dict(size=1),
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None),
)))
colossalai.init_dist(config=parallel_cfg,
          local_rank=0,
          world_size=1,
          host='127.0.0.1',
          port=8888,
          backend='nccl')

colossalai - torch.distributed.distributed_c10d - 2021-12-09 18:33:08,596 INFO: Added key: store_based_barrier_key:1 to store for rank: 0
colossalai - torch.distributed.distributed_c10d - 2021-12-09 18:33:08,598 INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 1 nodes.
colossalai - torch.distributed.distributed_c10d - 2021-12-09 18:33:08,602 INFO: Added key: store_based_barrier_key:2 to store for rank: 0
colossalai - torch.distributed.distributed_c10d - 2021-12-09 18:33:08,610 INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:2 with 1 nodes.
colossalai - torch.distributed.distributed_c10d - 2021-12-09 18:33:08,611 INFO: Added key: store_based_barrier_key:3 to store for rank: 0
colossalai - torch.distributed.distributed_c10d - 2021-12-09 18:33:08,615 INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:3 with 1 nodes.


process rank 0 is bound to device 0


Using transfer learning on Resnet50 and creating a CNN model

In [39]:
import torch.nn as nn
import torchvision.models as models

class CNN(nn.Module):
    def __init__(self, train_CNN=False, num_classes=1):
        super(CNN, self).__init__()
        self.train_CNN = train_CNN
        # self.inception = models.inception_v3(pretrained=True, aux_logits=False)
        # self.inception.fc = nn.Linear(self.inception.fc.in_features, num_classes)

        self.resnet50 = models.resnet50(pretrained=True)
        self.resnet50.fc = nn.Linear(self.resnet50.fc.in_features, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0)
        self.sigmoid = nn.Sigmoid()

    def forward(self, images):
        features = self.resnet50(images)
        return self.sigmoid(self.dropout(self.relu(features))).squeeze(1)

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

Doing data augmentation and normalization

In [11]:
transform = transforms.Compose(
        [
            transforms.Resize((356, 356)),
            transforms.RandomCrop((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
        ]
    )

In [12]:
learning_rate = 0.00001
train_CNN = False
batch_size = 1
shuffle = True
pin_memory = True
num_workers = 1

Creating Dataloader

In [26]:
dataset = CatsAndDogsDataset("/content/drive/MyDrive/Colab Notebooks/Train/train","train_csv.csv",transform=transform)

train_set, validation_set = torch.utils.data.random_split(dataset,[20000,5000])
train_loader = DataLoader(dataset=train_set, shuffle=shuffle, batch_size=batch_size,num_workers=num_workers,pin_memory=pin_memory)
validation_loader = DataLoader(dataset=validation_set, shuffle=shuffle, batch_size=batch_size,num_workers=num_workers, pin_memory=pin_memory)

In [40]:
model = CNN().cuda()

for name, param in model.resnet50.named_parameters():
    if "fc.weight" in name or "fc.bias" in name:
        param.requires_grad = True
    else:
        param.requires_grad = train_CNN


Define a Loss function and optimizer. And then we use them to initialize Engine and Trainer. We provide various training / evaluating hooks. In this case, we just use the simplest hooks which can compute and print loss and accuracy.

In [41]:
import torch.optim as optim

# optimizer = optim.Adam(model.parameters(), lr=0.0001)
# criterion = nn.CrossEntropyLoss()

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
schedule = NoPipelineSchedule()

engine = Engine(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=None,
        schedule=schedule
    )
trainer = Trainer(engine=engine,
          hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],
          verbose=True)

colossalai - rank_0 - 2021-12-09 22:14:30,024 INFO: build LogMetricByEpochHook for train, priority = 1
colossalai - rank_0 - 2021-12-09 22:14:30,028 INFO: build LossHook for train, priority = 10
colossalai - rank_0 - 2021-12-09 22:14:30,030 INFO: build AccuracyHook for train, priority = 10


Easily training on a GPU using ColossalAI which doesn't require much change from normal pytorch coding practices

In [42]:
num_epochs = 2
test_interval = 1
trainer.fit(
        train_dataloader=train_loader,
        test_dataloader=validation_loader,
        max_epochs=num_epochs,
        display_progress=True,
        test_interval=test_interval
    )

[Epoch 0 train]: 100%|██████████| 20000/20000 [09:48<00:00, 33.99it/s]
colossalai - rank_0 - 2021-12-09 22:24:21,380 INFO: Training - Epoch 1 - LogMetricByEpochHook: Loss = 0.69329
[Epoch 0 val]: 100%|██████████| 5000/5000 [02:16<00:00, 36.71it/s]
colossalai - rank_0 - 2021-12-09 22:26:37,669 INFO: Testing - Epoch 1 - LogMetricByEpochHook: Loss = 0.69581, Accuracy = 0.51040
[Epoch 1 train]: 100%|██████████| 20000/20000 [09:49<00:00, 33.92it/s]
colossalai - rank_0 - 2021-12-09 22:36:27,366 INFO: Training - Epoch 2 - LogMetricByEpochHook: Loss = 0.69329
[Epoch 1 val]: 100%|██████████| 5000/5000 [02:16<00:00, 36.73it/s]
colossalai - rank_0 - 2021-12-09 22:38:43,603 INFO: Testing - Epoch 2 - LogMetricByEpochHook: Loss = 0.69293, Accuracy = 0.51040


In [37]:
import torch.optim as optim

# optimizer = optim.Adam(model.parameters(), lr=0.0001)
# criterion = nn.CrossEntropyLoss()

# criterion = nn.BCELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
schedule = NoPipelineSchedule()

engine = Engine(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=None,
        schedule=schedule
    )
trainer = Trainer(engine=engine,
          hooks_cfg=[dict(type='LossHook'), dict(type='LogMetricByEpochHook'), dict(type='AccuracyHook')],
          verbose=True)

colossalai - rank_0 - 2021-12-09 21:48:19,860 INFO: build LogMetricByEpochHook for train, priority = 1
colossalai - rank_0 - 2021-12-09 21:48:19,862 INFO: build LossHook for train, priority = 10
colossalai - rank_0 - 2021-12-09 21:48:19,863 INFO: build AccuracyHook for train, priority = 10


In [38]:
num_epochs = 2
test_interval = 1
trainer.fit(
        train_dataloader=train_loader,
        test_dataloader=validation_loader,
        max_epochs=num_epochs,
        display_progress=True,
        test_interval=test_interval
    )

[Epoch 0 train]: 100%|██████████| 20000/20000 [09:48<00:00, 34.01it/s]
colossalai - rank_0 - 2021-12-09 21:58:11,553 INFO: Training - Epoch 1 - LogMetricByEpochHook: Loss = 0.69329
[Epoch 0 val]:  18%|█▊        | 920/5000 [00:25<01:51, 36.64it/s]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff604c469e0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
[Epoch 0 val]: 100%|██████████| 5000/5000 [02:16<00:00, 36.66it/s]
colossalai - rank_0 - 2021-12-09 22:00:28,045 INFO: Testing - Epoch 1 - LogMetricByEpochHook: Loss