In [56]:
import time
import functools
import random
from pathlib import Path
import numpy as np

from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from torch.optim import lr_scheduler, swa_utils
import torchvision
from torchvision.datasets import ImageFolder, DatasetFolder
from torchvision import transforms


In [57]:
from diskcache import Cache
cache = Cache('.cache/diskcache')

class Ageset(Dataset):
    def __init__(self, path, transforms = None, valid=False, split_pct = 0.3):
        self.image_paths = list(Path(path).rglob("*.png"))
        random.seed(42)
        random.shuffle(self.image_paths)
        split_point = int(len(self)*0.3)
        if valid:
            self.image_paths = self.image_paths[:split_point]
            print('len validation dataset', len(self.image_paths))
        else:
            self.image_paths = self.image_paths[split_point:]
            print('len train dataset', len(self.image_paths))
    def __len__(self):
        return len(self.image_paths)

    @classmethod # somehow this is needed for diskcache to work properly. Or define the function outside of the class
    @functools.lru_cache(maxsize=None)
    @cache.memoize()
    def imgpath_to_normalized_tensor(cls,imgpath):
            # makes a tensor, scales range to 0-1 and normalizes to same as imagenet
            normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                        std=[0.229, 0.224, 0.225])
            return normalize(transforms.PILToTensor()(Image.open(imgpath)).float()/255)
            
    def __getitem__(self,i):
        if isinstance(i, slice):
            return [self[n] for n,_ in enumerate(self.image_paths[i])]
        
        return (self.imgpath_to_normalized_tensor(self.image_paths[i]),
                int(self.image_paths[i].parent.name))

In [58]:
def construct_tensor_dataset(path):
    image_paths = list(Path(path).rglob("*.png"))
    random.shuffle(image_paths)
    xs = torch.stack([Ageset.imgpath_to_normalized_tensor(loc) for loc in image_paths[:10]])
    ys = torch.stack([torch.Tensor([int(Path(loc).parent.name)]) for loc in image_paths[:10]])
    return xs,ys

In [59]:
class AgeTensorDataset(TensorDataset):
    def __init__(self, xs, ys, valid=False, split_pct=0.3):
        length = len(xs)
        split = int(xs.shape[0]*split_pct)
        if valid:
            super().__init__(xs[:split],ys[:split])
        else:
            super().__init__(xs[split:],ys[split:])

    def __getitem__(self,x):
        return super().__getitem__(x)

In [60]:
class AgeResnet(nn.Module):
    def __init__(self, size='18', feat_extract=False):
        super().__init__()
        resnet = 'torchvision.models.resnet'+size+'(pretrained=True)'
        resnet = eval(resnet)
        modules=list(resnet.children())[:-1]
        self.resnet =nn.Sequential(*modules)

        if feat_extract:
            # with feature extraction we only train the linear layer and keep the resnet parameters fixed 
            for m in self.modules():
                m.requires_grad_(False)

        self.fc = nn.Linear(in_features=512, out_features=1, bias=True)
        nn.init.kaiming_normal_(self.fc.weight)

    def forward(self,x):
        out = self.resnet(x)
        x = torch.flatten(out, 1)
        return self.fc(x)

In [101]:
def determine_size(dataset):
    num_items = len(dataset)
    img_dimensions = list(dataset[0][0].shape)
    bytes_per_fp32 = 4
    bytes_per_gb = 1024**3
    size_in_gb = num_items * int(np.product(img_dimensions)) * bytes_per_fp32 / bytes_per_gb
    print('size of ds in memory in gb:', size_in_gb)

In [63]:
def train():
    best_loss = 1000000000
    not_improve_count = 0
    loss = {'train':[], 'val':[]}

    for epoch in range(NUM_EPOCH):
        print(f'Starting epoch {epoch}')
        start_time = time.time()
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            total_loss = 0
            for data in dls[phase]:
                x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                with torch.set_grad_enabled(phase == 'train'):
                    pred = model(x)
                    loss = loss_fn(y, pred)
                    total_loss += loss * len(y)
                    if phase == 'train':
                        loss.backward()
                        opt.step()
                        opt.zero_grad()
                        if SWA_ENABLED and epoch > SWA_START:
                            swa_model.update_parameters(model)
                            swa_sched.step()
                        elif SCHED_ENABLED:
                            sched.step(loss)
                            writer.add_scalar('lr/scheduler', sched.get_last_lr()[0], epoch)
                        writer.add_scalar('lr/optparamgroup0', opt.param_groups[0]['lr'], epoch)
                writer.add_scalar('batchloss/'+phase, loss, epoch)
            
            writer.add_scalar('loss/'+phase, total_loss/len(dls[phase].dataset), epoch)
    
        if total_loss < best_loss:
            best_loss = total_loss
            not_improve_count = 0
        else:
            not_improve_count += 1
            if not_improve_count > 4:
                print('early stopping!')
                break
            
        print(f"loss after epoch {epoch} : {total_loss / len(dls['val'].dataset)}")
        writer.add_scalar('time', (time.time()-start_time)/60, epoch)


    if SWA_ENABLED:
        # swa_utils.update_bn(train_dl, swa_model)
        total_loss_train = 0
        total_loss_val = 0
        with torch.no_grad():
            for data in train_dl:
                    x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                    total_loss_train += loss_fn(y, model(x)) * len(y)
            writer.add_scalar('loss/train', total_loss_train/len(train_set), epoch+1)

            for data in val_dl:
                    x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                    total_loss_val += loss_fn(y, model(x)) * len(y)
            writer.add_scalar('loss/val', total_loss_train/len(val_set), epoch+1)

    writer.flush()
    writer.close()

In [104]:
# adam works best with lr of 0.001 (tested 0.1 and 0.01)
# adam without scheduler works best
# first it took 6 minutes to load all the datasets. With lru cache it was immediate (6GB memory use). With disk cache it took about 1-2 minutes. Great result


def mae_loss(y, pred):
    return (torch.abs(y-pred.T)).mean()
loss_fn = mae_loss

NUM_EPOCH = 40
SWA_START = 30
LR = 0.001
BATCH_SIZE = 64
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
SWA_ENABLED = False
SCHED_ENABLED = False

# xs,ys = construct_tensor_dataset('data/face_age')

train_set = AgeTensorDataset(xs,ys, valid=False)
# train_set = Ageset("data/face_age")[:4000]
train_dl = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
determine_size(Ageset("data/face_age"))

val_set = AgeTensorDataset(xs,ys, valid=True)
# val_set = Ageset("data/face_age", valid=True)[:1000]
val_dl = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True)
dls = {'train': train_dl, 'val': val_dl}


feat={True:'feat_ext', False:'finetune'}
opts = {0:'adam',1:'adabound'}

for i in [True, False]:
    for j in range(2):

        writer = SummaryWriter(comment=f'{feat[i]} opt {opts[j]} epoch {NUM_EPOCH} SWA_START {SWA_START} LR BATCH_SIZE {LR}')
        model = AgeResnet(size='18', feat_extract=i)
        model.to(DEVICE)

        if j ==0:
            opt = torch.optim.Adam(model.parameters(), LR)
        if j ==1:
            opt = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)


        if SCHED_ENABLED:
            # if i ==0:
            #     sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt)
            #     sched.get_last_lr = lambda: [1]
            # if i == 1:
            #     sched = torch.optim.lr_scheduler.OneCycleLR(opt, LR, steps_per_epoch=len(train_dl), epochs=NUM_EPOCH)
            # if i == 2:
            sched = torch.optim.lr_scheduler.MultiplicativeLR(opt, lr_lambda=lambda x: 1)
        if SWA_ENABLED:
            swa_model = swa_utils.AveragedModel(model)
            swa_sched = swa_utils.SWALR(opt, swa_lr = 0.0005)
        train()

len train dataset 160
size of ds in memory in gb: 0.07152557373046875


KeyboardInterrupt: 

In [24]:

for i in test_set[:10]:
    preds = i[1],model(i[0][None].to(DEVICE)).item()
    print(f'target {preds[0]}, predicted {preds[1]}')
    loss = abs(preds[0]-preds[1])
    # print('loss',loss)
    running += loss
print(running, running/2)

target 16, predicted -0.47240400314331055


NameError: name 'running' is not defined

In [12]:
!tensorboard.main --logdir='runs' ip

'tensorboard.main' is not recognized as an internal or external command,
operable program or batch file.


In [11]:
a=time.time()

In [13]:
(a-time.time())/60

-0.2206122080485026

In [None]:
#launch tensorboard
python -m tensorboard.main --logdir=runs --host=0.0.0.0 --port=6006

In [None]:
tensorboard dev upload --logdir runs \
--name "My latest experiment" \ # optional
--description "Simple comparison of several hyperparameters" # optional

In [42]:
class Learner():
    def __init__(self, dls, model, opt, metric, parameters, sched=None, swa_model=None, swa_sched=None):
        self.dls, self.model, self.opt, self.metric, self.p = dls, model, opt, metric, parameters
        self.sched, self.swa_model, self.swa_sched = sched, swa_model, swa_sched
        
    def train(self:)
        for epoch in range(NUM_EPOCH):
                print('epoch start')
                self.model.train()
                total_loss_train = 0
                for data in self.train_dl:
                    x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                    opt.zero_grad()
                    pred = model(x)
                    loss = loss_fn(y, pred)
                    total_loss_train += loss * len(y)
                    loss.backward()
                    opt.step()
                    if epoch > SWA_START:
                        swa_model.update_parameters(model)
                        swa_sched.step(loss)
                    else:
                        sched.step()
                
                model.eval()
                total_loss_test = 0
                with torch.no_grad():
                    for data in test_dl:
                        x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                        total_loss_test += loss_fn(y, model(x)) * len(y)
                register_runs(total_loss_train, total_loss_test, epoch)

    def register_runs(total_loss_train, total_loss_test, epoch): 
        writer.add_scalar('loss/train', total_loss_train/len(train_set), epoch)
        writer.add_scalar('loss/test', total_loss_test/len(test_set), epoch)
        writer.add_scalar('lr/optparamgroup0', opt.param_groups[0]['lr'], epoch)
        writer.add_scalar('lr/scheduler', sched.get_last_lr()[0], epoch)
        print(f'loss after epoch {epoch}: {total_loss_train/len(train_set)}, {total_loss_test/len(test_set)}')

    def validate_swa_model(swa_model):
        # swa_utils.update_bn(train_dl, swa_model)
        total_loss_train = 0
            total_loss_test = 0
            with torch.no_grad():
            for data in train_dl:
                    x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                    total_loss_train += loss_fn(y, model(x)) * len(y)
            writer.add_scalar('loss/train', total_loss_train/len(train_set), epoch+1)

            for data in test_dl:
                    x, y = data[0].to(DEVICE), data[1].to(DEVICE)
                    total_loss_test += loss_fn(y, model(x)) * len(y)
            writer.add_scalar('loss/test', total_loss_train/len(test_set), epoch+1)

            writer.flush()
            writer.close()

SyntaxError: invalid syntax (<ipython-input-42-fcd86dbf9b07>, line 6)