In [1]:
import time
import math
import tables

import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from resnet_custom import resnet18_baseline # imports from custom_resnet file adapted from CLAM source code.

from torch.utils.tensorboard import SummaryWriter
import torch.utils.tensorboard as tensorboard
import torchvision.models as models #built-in NN, using ResNet18 in this case.

Input training parameters

In [2]:
dataname = 'synthetic32'
gpuid=0

# --- resnet params
n_classes = 2

# not sure if these will be necessary in the future. might be able to exclude these from state.
'''growth_rate=8 
block_config=(4, 4, 4, 4)
num_init_features=2
bn_size=4
drop_rate=0'''

# --- training params
batch_size=64
patch_size=32 #based on resnet architecture. Changed from 224
num_epochs = 1
phases = ["train","val", "test"] #how many phases did we create databases for?
validation_phases= ["val"] #when should we do valiation? note that validation is *very* time consuming, so as opposed to doing for both training and validation, we do it only for vlaidation at the end of the epoch
                           #additionally, using simply [], will skip validation entirely, drastically speeding things up

In [3]:
#helper function for pretty printing of current time and remaining time
def asMinutes(s):
   m = math.floor(s / 60)
   s -= m * 60
   return '%dm %ds' % (m, s)
def timeSince(since, percent):
   now = time.time()
   s = now - since
   es = s / (percent+.00001)
   rs = es - s
   return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [4]:
#specify if we should use a GPU (cuda) or only the CPU
if(torch.cuda.is_available()):
   print(torch.cuda.get_device_properties(gpuid))
   torch.cuda.set_device(gpuid)
   device = torch.device(f'cuda:{gpuid}')
else:
   device = torch.device(f'cpu')

In [5]:
resnet18 = models.resnet18()
# resnet18 = resnet18_baseline()
resnet18.fc = nn.Linear(resnet18.fc.in_features, 1)
print(f"total params: \t{sum([np.prod(p.size()) for p in resnet18.parameters()])}") # script for counting total params
                                                                                    # might need to look into docs for resnet18 to count parameters.


total params: 	11177538


Create Dataset class.

In [6]:
#this defines our dataset class which will be used by the dataloader
class Dataset(object):
   def __init__(self, fname ,img_transform=None):
      #nothing special here, just internalizing the constructor parameters
      self.fname=fname

      self.img_transform=img_transform
      
      with tables.open_file(self.fname,'r') as db:
         self.classsizes=db.root.classsizes[:]
         self.nitems=db.root.imgs.shape[0]
      
      self.imgs = None
      self.labels = None
      
   def __getitem__(self, index):
      #opening should be done in __init__ but seems to be
      #an issue with multithreading so doing here. need to do it everytime, otherwise hdf5 crashes

      with tables.open_file(self.fname,'r') as db:
         self.imgs=db.root.imgs
         self.labels=db.root.labels

         #get the requested image
         img = self.imgs[index,::]
         # img = img[:,:,None].repeat(3,axis=2) #convert to 3 channel RGB, MIGHT BE UNNECESSARY DUE TO 3 CHANNEL IMAGES
         # img = np.transpose(img, (2,0,1))  # 
         label = self.labels[index] 
      
      img_new = img
      
      if self.img_transform is not None:
         print(index)
         img_new = self.img_transform(img)
      return img_new, label, img
   def __len__(self):
      return self.nitems

In [7]:
img_transform = transforms.Compose([
   transforms.ToPILImage(),
   # transforms.RandomVerticalFlip(),
   # transforms.RandomHorizontalFlip(),
   # transforms.RandomCrop(size=(patch_size,patch_size),pad_if_needed=True), #these need to be in a reproducible order, first affine transforms and then color
   transforms.ToTensor()
   ])

dataset={}
dataLoader={}
for phase in phases: #now for each of the phases, we're creating the dataloader
                     #interestingly, given the batch size, i've not seen any improvements from using a num_workers>0
   
   dataset[phase]=Dataset(f"./data/{dataname}_{phase}.pytable", img_transform=img_transform)
   dataLoader[phase]=DataLoader(dataset[phase], batch_size=batch_size, 
                              shuffle=False, num_workers=0,pin_memory=True) 
   print(f"{phase} dataset size:\t{len(dataset[phase])}")

train dataset size:	10000
val dataset size:	1000
test dataset size:	3000


initialize optimizer. Again might need to look at resnet18 docs to figure out how to get its parameters

In [8]:
optim = torch.optim.Adam(resnet18.parameters()) #adam is going to be the most robust, though perhaps not the best performing, typically a good place to start.

In [9]:
#we have the ability to weight individual classes, in this case we'll do so based on their presense in the trainingset
#to avoid biasing any particular class
nclasses = dataset["train"].classsizes.shape[0]
class_weight=dataset["train"].classsizes
class_weight = torch.from_numpy(1-class_weight/class_weight.sum()).type('torch.FloatTensor').to(device)

print(class_weight) #show final used weights, make sure that they're reasonable before continouing
criterion = torch.nn.BCEWithLogitsLoss(weight = class_weight)

tensor([0.5012, 0.4988])


In [10]:
%load_ext line_profiler
#%lprun
# %%prun

In [11]:
def trainnetwork():
   writer=SummaryWriter() #open the tensorboard visualiser
   best_loss_on_test = np.Infinity

   start_time = time.time()
   for epoch in range(num_epochs):
      #zero out epoch based performance variables 
      all_acc = {key: 0 for key in phases} 
      all_loss = {key: torch.zeros(0).to(device) for key in phases} #keep this on GPU for greatly improved performance
      cmatrix = {key: np.zeros((n_classes,n_classes)) for key in phases}

      for phase in phases: #iterate through both training and validation states

            if phase == 'train':
               resnet18.train()  # Set model to training mode
            else: #when in eval mode, we don't want parameters to be updated
               resnet18.eval()   # Set model to evaluate mode

            for ii , (X, label, img_orig) in enumerate(dataLoader[phase]): #for each of the batches
               
               X = X.to(device)  # [Nbatch, 3, H, W]
               label = label.type('torch.LongTensor').to(device)  # [Nbatch, 1] with class indices (0, 1, 2,...n_classes)

               with torch.set_grad_enabled(phase == 'train'): #dynamically set gradient computation, in case of validation, this isn't needed
                                                               #disabling is good practice and improves inference time
                  prediction = resnet18(X)  # [Nbatch, Nclass]
                  print(prediction)
                  loss = criterion(prediction, label)
               
                  if phase=="train": #in case we're in train mode, need to do back propogation
                        optim.zero_grad()
                        loss.backward()
                        optim.step()
                        train_loss = loss


                  all_loss[phase]=torch.cat((all_loss[phase],loss.detach().view(1,-1)))

                  if phase in validation_phases: #if this phase is part of validation, compute confusion matrix
                        p=prediction.detach().cpu().numpy()
                        cpredflat=np.argmax(p,axis=1).flatten()
                        yflat=label.cpu().numpy().flatten()

                        cmatrix[phase]=cmatrix[phase]+confusion_matrix(yflat,cpredflat, labels=range(nclasses))
            print(cmatrix[phase].sum())
            all_acc[phase]=(cmatrix[phase]/cmatrix[phase].sum()).trace()
            all_loss[phase] = all_loss[phase].cpu().numpy().mean()

            #save metrics to tensorboard
            writer.add_scalar(f'{phase}/loss', all_loss[phase], epoch)
            if phase in validation_phases:
               writer.add_scalar(f'{phase}/acc', all_acc[phase], epoch)
               for r in range(nclasses):
                  for c in range(nclasses): #essentially write out confusion matrix
                        writer.add_scalar(f'{phase}/{r}{c}', cmatrix[phase][r][c],epoch)

      print('%s ([%d/%d] %d%%), train loss: %.4f test loss: %.4f' % (timeSince(start_time, (epoch+1) / num_epochs), 
                                                   epoch+1, num_epochs ,(epoch+1) / num_epochs * 100, all_loss["train"], all_loss["val"]),end="")    

      #if current loss is the best we've seen, save model state with all variables
      #necessary for recreation
      if all_loss["val"] < best_loss_on_test:
            best_loss_on_test = all_loss["val"]
            print("  **")
            state = {'epoch': epoch + 1,
            'model_dict': resnet18.state_dict(),
            'optim_dict': optim.state_dict(),
            'best_loss_on_test': all_loss,
            'n_classes': n_classes}
            torch.save(state, f"{dataname}_resnet_best_model.pth")
      else:
            print("")

In [12]:
%lprun -f trainnetwork trainnetwork()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
tensor([[-1.1434,  0.3688],
        [-0.0395, -0.4349],
        [-0.0709, -0.4173],
        [-0.1445, -1.6772],
        [ 0.3168, -0.6194],
        [-0.2608, -0.6784],
        [-0.5176, -1.2054],
        [ 0.1055, -0.6822],
        [ 0.4776, -0.5569],
        [ 0.0599,  0.7958],
        [-1.2158, -0.3529],
        [-0.0043, -0.4555],
        [ 0.3510, -0.9297],
        [ 0.3022, -0.2135],
        [-0.2484,  0.2391],
        [-0.3166, -1.1087],
        [-0.0321, -0.2343],
        [ 0.3498,  0.2570],
        [ 0.5330, -1.6882],
        [ 1.0666, -1.0328],
        [ 0.7162, -1.0815],
        [ 0.1801, -1.1505],
        [ 0.7750,  0.2026],
        [-0.0105, -0.9249],
        [-0.5498, -1.3738],
        [ 0.9631, -1.2126],
        [-0.3486, -1.1025],
        [ 0.3963, -0.7383],
        [-0.3298, -1.0002],
      

ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([64, 2]))

In [13]:
trainnetwork()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
tensor([[-1.1434,  0.3688],
        [-0.0395, -0.4349],
        [-0.0709, -0.4173],
        [-0.1445, -1.6772],
        [ 0.3168, -0.6194],
        [-0.2608, -0.6784],
        [-0.5176, -1.2054],
        [ 0.1055, -0.6822],
        [ 0.4776, -0.5569],
        [ 0.0599,  0.7958],
        [-1.2158, -0.3529],
        [-0.0043, -0.4555],
        [ 0.3510, -0.9297],
        [ 0.3022, -0.2135],
        [-0.2484,  0.2391],
        [-0.3166, -1.1087],
        [-0.0321, -0.2343],
        [ 0.3498,  0.2570],
        [ 0.5330, -1.6882],
        [ 1.0666, -1.0328],
        [ 0.7162, -1.0815],
        [ 0.1801, -1.1505],
        [ 0.7750,  0.2026],
        [-0.0105, -0.9249],
        [-0.5498, -1.3738],
        [ 0.9631, -1.2126],
        [-0.3486, -1.1025],
        [ 0.3963, -0.7383],
        [-0.3298, -1.0002],
      

ValueError: Target size (torch.Size([64])) must be the same as input size (torch.Size([64, 2]))

In [None]:
%reload_ext tensorboard

In [None]:
%tensorboard --logdir runs --host localhost --port 6008