In [1]:
! pip install --user torch
! pip install --user torchvision
! cd /data0
! pip install --user tqdm


You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [None]:
from torch.nn.parameter import Parameter

class LARC(object): 

    def __init__(self, optimizer, trust_coefficient=0.02, clip=True, eps=1e-8):
        self.param_groups = optimizer.param_groups
        self.optim = optimizer
        self.trust_coefficient = trust_coefficient
        self.eps = eps
        self.clip = clip

    def __getstate__(self):
        return self.optim.__getstate__()

    def __setstate__(self, state):
        self.optim.__setstate__(state)

    def __repr__(self):
        return self.optim.__repr__()

    def state_dict(self):
        return self.optim.state_dict()

    def load_state_dict(self, state_dict):
        self.optim.load_state_dict(state_dict)

    def zero_grad(self):
        self.optim.zero_grad()

    def add_param_group(self, param_group):
        self.optim.add_param_group( param_group)

    def step(self):
        with torch.no_grad():
            weight_decays = []
            for group in self.optim.param_groups:
                # absorb weight decay control from optimizer
                weight_decay = group['weight_decay'] if 'weight_decay' in group else 0
                weight_decays.append(weight_decay)
                group['weight_decay'] = 0
                for p in group['params']:
                    if p.grad is None:
                        continue
                    param_norm = torch.norm(p.data)
                    grad_norm = torch.norm(p.grad.data)

                    if param_norm != 0 and grad_norm != 0:
                        # calculate adaptive lr + weight decay
                        adaptive_lr = self.trust_coefficient * (param_norm) / (grad_norm + param_norm * weight_decay + self.eps)

                        # clip learning rate for LARC
                        if self.clip:
                            # calculation of adaptive_lr so that when multiplied by lr it equals `min(adaptive_lr, lr)`
                            adaptive_lr = min(adaptive_lr/group['lr'], 1)

                        p.grad.data += weight_decay * p.data
                        p.grad.data *= adaptive_lr

        self.optim.step()
        # return weight decay control to optimizer
        for i, group in enumerate(self.optim.param_groups):
            group['weight_decay'] = weight_decays[i]

In [2]:
import argparse
import os
import numpy as np
#from tqdm import tqdm_notebook as tqdm
"""
import importlib.util
spec = importlib.util.spec_from_file_location("notebook", "/global/homes/s/stuti/simclr/src/tqdm/tqdm/notebook.py")
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
foo.MyClass()

"""
import sys
sys.path.append('/home/stuti/.local/lib/python3.6/site-packages')
print ('\n'.join(sys.path))
from PIL import Image


import torch
print(torch.__version__)
print(os.path.abspath(torch.__file__))
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as tfs
import torchvision.datasets as datasets
from torchvision.datasets import *
from torchvision.models import *
import torchvision.models as models

#distributed
import torch.distributed as dist
import torch.utils.data.distributed

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

/usr/lib/python36.zip
/usr/lib/python3.6
/usr/lib/python3.6/lib-dynload

/home/stuti/.local/lib/python3.6/site-packages
/usr/local/lib/python3.6/dist-packages
/usr/lib/python3/dist-packages
/usr/lib/python3.6/dist-packages
/usr/local/lib/python3.6/dist-packages/IPython/extensions
/home/stuti/.ipython
/home/stuti/.local/lib/python3.6/site-packages
1.4.0
/home/stuti/.local/lib/python3.6/site-packages/torch/__init__.py


device(type='cuda')

In [3]:
tf_tr = tfs.Compose([
    tfs.RandomResizedCrop(32),
    tfs.RandomHorizontalFlip(),
    tfs.ColorJitter(0.5, 0.5, 0.5, 0.5),
    tfs.ToTensor(),
    tfs.Normalize(mean=[0.485, 0.456, 0.406], 
                  std=[0.229, 0.224, 0.225])
])

tf_de = tfs.Compose([
    tfs.Resize(32),
    tfs.ToTensor(),
    tfs.Normalize(mean=[0.485, 0.456, 0.406], 
                  std=[0.229, 0.224, 0.225])
])

tf_te = tfs.Compose([
    tfs.Resize(32),
    tfs.ToTensor(),
    tfs.Normalize(mean=[0.485, 0.456, 0.406], 
                  std=[0.229, 0.224, 0.225])
])

In [4]:
model_names = sorted(name for name in models.__dict__
    if name.islower() and not name.startswith("__")
    and callable(models.__dict__[name]))

In [5]:
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

parser.add_argument('data', metavar='/data0/imagenet',
                    help='path to dataset')
parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18',
                    choices=model_names,
                    help='model architecture: ' +
                        ' | '.join(model_names) +
                        ' (default: resnet18)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=90, type=int, metavar='N',
                    help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
                    metavar='N',
                    help='mini-batch size (default: 256), this is the total '
                         'batch size of all GPUs on the current node when '
                         'using Data Parallel or Distributed Data Parallel')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate', dest='lr')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
                    help='momentum')
parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)',
                    dest='weight_decay')
parser.add_argument('-p', '--print-freq', default=10, type=int,
                    metavar='N', help='print frequency (default: 10)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
                    help='use pre-trained model')


#distribution arguments:
parser.add_argument('--world-size', default=-1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=-1, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--seed', default=None, type=int,
                    help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
                    help='GPU id to use.')
parser.add_argument('--multiprocessing-distributed', action='store_true',
                    help='Use multi-processing distributed training to launch '
                         'N processes per node, which has N GPUs. This is the '
                         'fastest way to use PyTorch for either single node or '
                         'multi node data parallel training')
parser.add_argument("--local_rank", default=0, type=int)






#args = parser.parse_args()

args, unknown = parser.parse_known_args()
traindir = os.path.join(args.data, 'train')
print(traindir)
traindir = '/data0/imagenet'
ds_tr = datasets.ImageFolder(traindir, tf_tr)
ds_de = datasets.ImageFolder(traindir, tf_de)
ds_te = datasets.ImageFolder(traindir, tf_te)




/run/user/1030/jupyter/kernel-d86d7a8f-3858-4aa6-a01f-e4c0e41fdd70.json/train


In [16]:
args.distributed = False
if 'WORLD_SIZE' in os.environ:
    args.distributed = int(os.environ['WORLD_SIZE']) > 1

if args.distributed:
    # FOR DISTRIBUTED:  Set the device according to local_rank.
    torch.cuda.set_device(args.local_rank)

    # FOR DISTRIBUTED:  Initialize the backend.  torch.distributed.launch will provide
    # environment variables, and requires that you use init_method=`env://`.
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')
assert torch.backends.cudnn.enabled


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified
Files already downloaded and verified


In [None]:
if args.dist_url == "env://" and args.world_size == -1:
    args.world_size = int(os.environ["WORLD_SIZE"])

args.distributed = args.world_size > 1 or args.multiprocessing_distributed

ngpus_per_node = torch.cuda.device_count()


if args.dist_url == "env://" and args.rank == -1:
    args.rank = int(os.environ["RANK"])
dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,world_size=args.world_size, rank=args.rank)

In [None]:
train_sampler = torch.utils.data.distributed.DistributedSampler(ds_tr)

In [6]:
dl_tr = DataLoader(ds_tr, batch_size=128, shuffle=True,sampler=train_sampler)
dl_de = DataLoader(ds_de, batch_size=128, shuffle=True)
dl_te = DataLoader(ds_te, batch_size=128, shuffle=False)
model = resnet50(pretrained=False)
model.conv1 = nn.Conv2d(3, 64, 3, 1, 1, bias=False)
model.maxpool = nn.Identity()

In [7]:

ch = model.fc.in_features
model.fc = nn.Sequential(nn.Linear(ch, ch),nn.ReLU(),nn.Linear(ch, ch))
#distributed:
#model = torch.nn.parallel.DistributedDataParallel(model)
model.to(device)
model.train()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): Identity()
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): BatchNorm2d(256, eps=1e-05, momentu

In [8]:

def pair_cosine_similarity(x, eps=1e-8):
    n = x.norm(p=2, dim=1, keepdim=True)
    return (x @ x.t()) / (n * n.t()).clamp(min=eps)

def nt_xent(x, t=0.5):
    x = pair_cosine_similarity(x)
    x = torch.exp(x / t)
    idx = torch.arange(x.size()[0])
    # Put positive pairs on the diagonal
    idx[::2] += 1
    idx[1::2] -= 1
    x = x[idx]
    # subtract the similarity of 1 from the numerator
    x = x.diag() / (x.sum(0) - torch.exp(torch.tensor(1 / t)))
    return -torch.log(x.mean())

In [9]:

optimizer = Adam(model.parameters(), lr=0.001)
optimizer = LARC(optimizer)

if args.distributed:
    # FOR DISTRIBUTED:  After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = torch.nn.parallel.DistributedDataParallel(model)

In [12]:
from tqdm import tqdm_notebook
#print(dl_tr)
model.train()
for i in range(100):
    c, s = 0, 0
    pBar = tqdm_notebook(dl_tr)
    for data, target in pBar:
        #d = data.size()
        #print (d)
        #x = data.view(d[0], d[1], d[2], d[3]).to(device)
        x = data.to(device)
        y= target.to(device)
        optimizer.zero_grad()
        p = model(x)
        loss = nt_xent(p)
        s = ((s*c)+(float(loss)*len(p)))/(c+len(p))
        c += len(p)
        pBar.set_description('Train: ' +str(round(float(s),3)))
        loss.backward()
        optimizer.step()
             
    if (i+1) % 10 == 0:
        torch.save(model.state_dict(), path+'rn50-mlp-b2-t0.5-e'+str(i+1)+'.pt')

HBox(children=(IntProgress(value=0, max=10400), HTML(value='')))

RuntimeError: CUDA error: device-side assert triggered

In [11]:
print("done")

done


In [None]:
for param in model.parameters():
    param.requires_grad = False

In [None]:
model.fc = nn.Linear(ch, len(ds_de.classes))
model.to(device)

In [None]:
optimizer = LARC(optim.SGD(model.parameters(), lr=0.1))
criterion = nn.CrossEntropyLoss()

In [None]:

model.train()
for i in range(5):
    c, s = 0, 0
    pBar = tqdm(dl_de)
    for data in pBar:
        x, y = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        p = model(x)
        loss = criterion(p, y)
        s = ((s*c)+(float(loss)*len(p)))/(c+len(p))
        c += len(p)
        pBar.set_description('Train: '+str(round(float(s),3)))
        loss.backward()
        optimizer.step()

In [None]:
model.eval()
c, s = 0, 0
pBar = tqdm(dl_te)
for data in pBar:
    x, y, = data[0].to(device), data[1].to(device)
    p = model(x)
    loss = criterion(p, y)
    s = ((s*c)+(float(loss)*len(p)))/(c+len(p))
    c += len(p)
    pBar.set_description('Test: '+str(round(float(s),3)))

In [None]:
model.eval()
y_pred, y_true = [], []
pBar = tqdm(dl_te)
for data in pBar:
    x, y = data[0].to(device), data[1].to(device)
    p = model(x)
    y_pred.append(p.cpu().detach().numpy())
    y_true.append(y.cpu().detach().numpy())
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

In [None]:
(y_true == y_pred.argmax(axis=1)).mean()
