In [None]:
from google.colab import files

Test Markdown
second row

In [2]:
# Copy the project modules, the JSON config and the zipped training data from
# Google Drive into the Colab working directory /content.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; confirm before running on a fresh runtime.
!cp "/content/drive/MyDrive/8. Semester/architectures.py" /content
!cp "/content/drive/MyDrive/8. Semester/dataloader.py" /content
!cp "/content/drive/MyDrive/8. Semester/ex4_sample.py" /content
!cp "/content/drive/MyDrive/8. Semester/utils.py" /content
!cp "/content/drive/MyDrive/8. Semester/working_config.json" /content
!cp "/content/drive/MyDrive/8. Semester/training.zip" /content

In [None]:
# Unpack the training archive copied from Drive. No -d target is given, so the
# files are extracted relative to the current working directory.
!unzip /content/training.zip

In [4]:
import os
import pickle

import numpy as np
import torch
import torch.utils.data
import torch.utils.data
import tqdm
import shutil

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision.transforms import transforms
from tqdm import tqdm

from architectures import ChallengeCNN
from dataloader import AugmentedImages, image_collate_fn, TestDataset, SimpleImageDataset, AugmentedImages
from utils import plot, RMSELoss, store_to_drive, restore_from_drive

In [5]:
def evaluate_model(model: torch.nn.Module, dataloader: torch.utils.data.DataLoader, loss_fn, device: torch.device):
    """Return the mean per-batch loss of `model` over all batches in `dataloader`.

    Each batch must unpack into `(images, inputs, known, means, stds, file_names)`.
    The network output is de-normalised with `stds` / `means`, positions where
    `known != 0` are replaced by the ground-truth pixels, and both prediction and
    target are scaled by 255 before being handed to `loss_fn`.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate; it is expected to live on `device` already.
    dataloader : iterable of batches
        Source of evaluation batches.
    loss_fn : callable
        `loss_fn(prediction, target)` returning a scalar tensor.
    device : torch.device
        Device the batch tensors are moved to.

    Returns
    -------
    float
        Sum of the per-batch losses divided by the number of batches.

    Raises
    ------
    ValueError
        If `dataloader` yields no batches (there is nothing to average).

    Notes
    -----
    * The batch tensors are never modified in place, so the same batches can be
      evaluated repeatedly.
    * The model is switched to eval mode for scoring and afterwards put back into
      whatever mode (train/eval) it was in when this function was called.
    * No integer quantisation is applied to predictions or targets; the loss is
      computed on floating point values.
    """
    was_training = model.training
    model.eval()

    total_loss = 0.0
    n_batches = 0
    try:
        with torch.no_grad():  # gradients are not needed for scoring
            for data in tqdm(dataloader, desc="scoring", position=0):
                images, inputs, known, means, stds, file_names = data

                inputs = inputs.to(device)
                known = known.to(device)
                means = means.to(device)
                stds = stds.to(device)
                images = images.to(device)

                outputs = model(inputs)

                # Move the two leading axes to the back so `stds` / `means` broadcast
                # against them, undo the normalisation, then restore the axis order.
                outputs = (outputs.permute(2, 3, 0, 1) * stds + means).permute(2, 3, 0, 1)

                # Only positions with known == 0 are taken from the prediction;
                # everywhere else the ground truth is used.
                masked_outputs = torch.where(known == 0, outputs, images) * 255
                # Out-of-place scaling: `.to(device)` can return the very same tensor
                # object, and an in-place `*=` would then corrupt the caller's batch.
                targets = images * 255

                # Here we could clamp the outputs to the minimum and maximum values
                # of the inputs for better performance.

                total_loss += loss_fn(masked_outputs, targets).item()
                n_batches += 1
    finally:
        # Restore the caller's mode even if scoring failed half-way.
        model.train(was_training)

    if n_batches == 0:
        raise ValueError("evaluate_model: dataloader yielded no batches")

    loss = total_loss / n_batches
    print(f"loss: {loss}")
    return loss

In [6]:
def main(results_path,
         training_path: str,
         network_config: dict,
         learningrate: float = 1e-3,
         weight_decay: float = 1e-5,
         n_updates: int = int(1e5),
         device: torch.device = torch.device("cuda:0"),
         num_workers: int = 0,
         batch_size: int = 2):
    """Train, evaluate and/or run challenge predictions for a `ChallengeCNN`.

    Which stages run is controlled by the module-level flags `TRAIN`,
    `TRAIN_CONTINUE`, `EVALUATE`, `PREDICT` and `PLOT_TRAINING`; they must be
    defined before this function is called.

    Parameters
    ----------
    results_path
        Prefix of the output directory. The final directory name is this prefix
        plus `training_path` with the substring 'training' removed, plus the
        kernel size and number of hidden layers from `network_config`.
    training_path : str
        Folder handed to `SimpleImageDataset` as `data_folder`.
    network_config : dict
        Keyword arguments for `ChallengeCNN`; must contain 'kernel_size' and
        'n_hidden_layers' (both are used in the output directory name).
    learningrate : float
        Adam learning rate.
    weight_decay : float
        Adam weight decay.
    n_updates : int
        Total number of optimiser steps to train for (including steps already
        done when resuming).
    device : torch.device
        Device used for the network and all batches.
    num_workers : int
        Worker count for every `DataLoader`.
    batch_size : int
        Minibatch size of the augmented training loader (the evaluation loaders
        always use batch size 1).

    Files written below the output directory: `best_model.pt`, `last_model.pt`,
    `progress.txt` (update counter and best validation loss), `results.txt`,
    plus the `plots`, `tensorboard` and `predictions` sub-folders.

    NOTE(review): `predict_unknown`, used when `PREDICT` is true, is neither
    defined nor imported in the visible part of this notebook — confirm it is
    available before enabling `PREDICT`.
    """

    results_path = f"{results_path}{training_path.replace('training', '')}_ks{network_config['kernel_size']}_l{network_config['n_hidden_layers']}"

    # Prepare a path to plot to
    plotpath = os.path.join(results_path, 'plots')
    os.makedirs(plotpath, exist_ok=True)

    # Load our dataset
    image_dataset = SimpleImageDataset(data_folder=training_path)

    # Split the dataset by index order (no shuffling): the first 3/5 of the
    # samples are used for training, the next 1/5 for validation and the last
    # 1/5 for testing.
    n_samples = len(image_dataset)
    train_end = int(n_samples * (3 / 5))
    val_end = int(n_samples * (4 / 5))
    training_set = torch.utils.data.Subset(image_dataset, indices=np.arange(train_end))
    validation_set = torch.utils.data.Subset(image_dataset, indices=np.arange(train_end, val_end))
    test_set = torch.utils.data.Subset(image_dataset, indices=np.arange(val_end, n_samples))

    # Create datasets and dataloaders without augmentation (for evaluation)
    train_loader = DataLoader(AugmentedImages(training_set),
                              batch_size=1,
                              shuffle=False,
                              num_workers=num_workers,
                              collate_fn=image_collate_fn)
    val_loader = DataLoader(AugmentedImages(validation_set),
                            batch_size=1, shuffle=False,
                            num_workers=num_workers,
                            collate_fn=image_collate_fn)
    test_loader = DataLoader(AugmentedImages(test_set),
                             batch_size=1, shuffle=False,
                             num_workers=num_workers,
                             collate_fn=image_collate_fn)

    # Create dataset and dataloader with random horizontal/vertical flips (for training)
    transform_chain = transforms.Compose([transforms.RandomHorizontalFlip(),
                                          transforms.RandomVerticalFlip()])

    training_set_augmented = AugmentedImages(dataset=training_set, transform_chain=transform_chain)
    train_loader_augmented = DataLoader(training_set_augmented, batch_size=batch_size, shuffle=True,
                                        num_workers=num_workers,
                                        collate_fn=image_collate_fn)

    # Define a tensorboard summary writer that writes to directory "results_path/tensorboard"
    writer = SummaryWriter(log_dir=os.path.join(results_path, 'tensorboard'))

    # Create Network
    net = ChallengeCNN(**network_config)
    net.to(device)

    # Loss function (the variable keeps its historical name `mse`, but the
    # object bound to it is `RMSELoss` from utils)
    mse = RMSELoss

    interval = 1000
    print_stats_at = 100  # print status to tensorboard every x updates
    plot_at = interval  # plot every x updates
    validate_at = interval  # evaluate model on validation set and check for new best model every x updates

    best_model_file = os.path.join(results_path, "best_model.pt")
    last_model_file = os.path.join(results_path, "last_model.pt")
    progress_file = os.path.join(results_path, "progress.txt")

    # SECURITY NOTE: the checkpoints are whole pickled modules; `torch.load`
    # executes pickle code, so only load checkpoint files you created yourself.
    # NOTE(review): recent PyTorch releases changed the `weights_only` default of
    # `torch.load`, which may reject whole-module checkpoints — confirm against
    # the installed version.

    if TRAIN:  # Train until n_updates updates have been reached
        update_progress_bar = tqdm(total=n_updates, desc=f"loss: {np.nan:7.5f}", position=0)

        if TRAIN_CONTINUE:
            #restore_from_drive(results_path)
            # progress.txt holds the update counter on line 1 and the best
            # validation loss so far on line 2.
            with open(progress_file, 'r') as f:
                progress_lines = f.read().split('\n')
            update = int(progress_lines[0])
            best_validation_loss = float(progress_lines[1])

            update_progress_bar.n = update
            update_progress_bar.refresh()
            net = torch.load(last_model_file, map_location=device)
        else:
            update = 0  # current update counter
            best_validation_loss = np.inf  # best validation loss so far
            # Save initial model as "best" model (will be overwritten later)
            torch.save(net, best_model_file)

        # The optimiser must be built AFTER the final `net` object is known.
        # Building it before a checkpoint is loaded would bind it to the
        # parameters of the discarded fresh network, so `optimizer.step()` would
        # never change the model that is actually being trained.
        # Adam's moment estimates are not part of the checkpoint and start from
        # scratch when training is resumed.
        optimizer = torch.optim.Adam(net.parameters(), lr=learningrate, weight_decay=weight_decay)

        while update < n_updates:
            for data in train_loader_augmented:
                # Get next samples
                images, inputs, known, means, stds, ids = data

                inputs = inputs.to(device)
                known = known.to(device)
                means = means.to(device)
                stds = stds.to(device)
                images = images.to(device)

                # Reset gradients
                optimizer.zero_grad()

                # Get outputs for network
                outputs = net(inputs)

                # Undo the input normalisation (see evaluate_model for the same step)
                outputs = (outputs.permute(2, 3, 0, 1) * stds + means).permute(2, 3, 0, 1)

                # Calculate loss, do backward pass, and update weights
                # mask out known values
                masked_outputs = torch.where(known == 0.0, outputs, images) * 255
                images *= 255

                loss = mse(masked_outputs, images)
                loss.backward()
                optimizer.step()

                # Print current status and score
                if (update + 1) % print_stats_at == 0:
                    writer.add_scalar(tag="training/loss", scalar_value=loss.cpu(), global_step=update)

                # Plot output
                if PLOT_TRAINING and (update + 1) % plot_at == 0:
                    plot(inputs.detach().cpu().numpy(), images.detach().cpu().numpy(),
                         masked_outputs.detach().cpu().numpy(),
                         plotpath, update)

                # Evaluate model on validation set
                if (update + 1) % validate_at == 0:
                    val_loss = evaluate_model(net, dataloader=val_loader, loss_fn=mse, device=device)
                    writer.add_scalar(tag="validation/loss", scalar_value=val_loss, global_step=update)
                    # Add weights and gradients as arrays to tensorboard
                    for i, (name, param) in enumerate(net.named_parameters()):
                        writer.add_histogram(tag=f"validation/param_{i} ({name})", values=param.cpu(),
                                             global_step=update)
                        # A parameter that took no part in the backward pass has no gradient.
                        if param.grad is not None:
                            writer.add_histogram(tag=f"validation/gradients_{i} ({name})",
                                                 values=param.grad.cpu(),
                                                 global_step=update)
                    # Save best model for early stopping
                    if val_loss < best_validation_loss:
                        best_validation_loss = val_loss
                        torch.save(net, best_model_file)

                    # Checkpoint the current state so training can be resumed
                    torch.save(net, last_model_file)
                    with open(progress_file, 'w') as f:
                        f.write(str(update + 1))
                        f.write('\n')
                        f.write(str(best_validation_loss))

                    #store_to_drive(results_path)

                update_progress_bar.set_description(f"loss: {loss:7.5f}", refresh=True)
                update_progress_bar.update()

                # Increment update counter, exit if maximum number of updates is reached
                # Here, we could apply some early stopping heuristic and also exit if its
                # stopping criterion is met
                update += 1
                if update >= n_updates:
                    break

        update_progress_bar.close()
        writer.close()
        print("\nFinished Training!")

    else:

        writer.close()
        print("\nNo Training!")

    if EVALUATE:
        # Load best model and compute score on test set
        print(f"Computing scores for best model")
        net = torch.load(best_model_file, map_location=device)
        train_loss = evaluate_model(net, dataloader=train_loader, loss_fn=mse, device=device)
        val_loss = evaluate_model(net, dataloader=val_loader, loss_fn=mse, device=device)
        test_loss = evaluate_model(net, dataloader=test_loader, loss_fn=mse, device=device)

        print(f"Scores:")
        print(f"  training loss: {train_loss}")
        print(f"validation loss: {val_loss}")
        print(f"      test loss: {test_loss}")

        # Write result to file
        with open(os.path.join(results_path, "results.txt"), "w") as rf:
            print(f"Scores:", file=rf)
            print(f"  training loss: {train_loss}", file=rf)
            print(f"validation loss: {val_loss}", file=rf)
            print(f"      test loss: {test_loss}", file=rf)

    if PREDICT:
        print(f"Computing pixels for challenge")
        net = torch.load(best_model_file, map_location=device)
        challenge_test_dataset = TestDataset("test")
        challenge_plotpath = os.path.join(results_path, 'predictions')
        os.makedirs(challenge_plotpath, exist_ok=True)
        challenge_test_loader = DataLoader(challenge_test_dataset, shuffle=False, batch_size=1,
                                           collate_fn=image_collate_fn)
        # NOTE(review): `predict_unknown` is not defined in the visible notebook — confirm.
        predict_unknown(net, dataloader=challenge_test_loader, results_path=results_path, plot_path=challenge_plotpath,
                        device=device)

In [7]:
# Run-mode switches. main() reads these as module-level globals, so this cell
# has to be executed before main() is called.

# Training stage: TRAIN enables the optimisation loop. With TRAIN_CONTINUE the
# loop resumes from last_model.pt / progress.txt in the results folder; without
# it a fresh model is created and best_model.pt is overwritten.
TRAIN_CONTINUE = True
TRAIN = True

# Post-training stages: score the best model on the train/val/test splits,
# and/or compute predictions for the challenge test set.
PREDICT = False
EVALUATE = True

# Plot switches.
PLOT_PREDICTIONS = False
PLOT_TRAINING = False


In [8]:
import json

# Read the keyword arguments for main() from the JSON file copied from Drive.
with open("working_config.json") as cf:
    config = json.loads(cf.read())

# Start the run; every key in the config file is passed as a keyword argument.
main(**config)

scoring: 100%|██████████| 5882/5882 [00:53<00:00, 109.51it/s]
loss: 16.52635:  20%|██        | 4000/20000 [23:01<77:32:21, 17.45s/it]

loss: 15.700774341446735


scoring: 100%|██████████| 5882/5882 [00:55<00:00, 106.65it/s]
loss: 15.99991:  25%|██▌       | 5000/20000 [45:48<74:24:24, 17.86s/it]

loss: 15.71925485853596


loss: 16.89904:  27%|██▋       | 5333/20000 [53:02<5:24:41,  1.33s/it]

KeyboardInterrupt: ignored

In [None]:
# Show which GPU (if any) is attached to this runtime and its current usage.
!nvidia-smi

In [9]:
# Store to drive: copy the local results folder to Google Drive under the name
# results_ks3_l6_nopad.
# NOTE: if the destination folder already exists, `cp -r` places the source
# folder inside it instead of replacing it.
!cp -r /content/results_ks3_l6 "/content/drive/MyDrive/8. Semester/results_ks3_l6_nopad"

In [1]:
# Restore from drive: copy the saved results folder from Google Drive back to
# /content/results_ks3_l6 so training can be continued from it.
# NOTE: if /content/results_ks3_l6 already exists, `cp -r` places the copied
# folder inside it instead of replacing it.
!cp -r "/content/drive/MyDrive/8. Semester/results_ks3_l6_nopad" /content/results_ks3_l6

In [None]:
# WARNING: irreversibly deletes the local results folder (checkpoints, progress
# file, tensorboard logs). Make sure it has been stored to Drive first.
!rm -rf /content/results_ks3_l6

In [None]:
def store_to_drive(results_path):
    """Copy the results folder from the Colab disk to Google Drive.

    Copies `/content/<results_path>` to
    `/content/drive/MyDrive/8. Semester/<results_path>`.

    The copy merges into an already existing destination (files with the same
    name are overwritten), so the function can be called repeatedly, e.g. after
    every validation step, without failing on the second call.

    Parameters
    ----------
    results_path : str
        Path of the results folder relative to `/content`. It should be a
        relative path: `os.path.join` discards the base directory when the
        second argument is absolute.

    Note: this definition has the same name as the `store_to_drive` imported
    from `utils` in the import cell and replaces it once this cell is run.
    """
    local_path = os.path.join("/content", results_path)
    drive_path = os.path.join("/content/drive/MyDrive/8. Semester/", results_path)
    # dirs_exist_ok requires Python 3.8+; without it copytree raises
    # FileExistsError as soon as the backup folder exists.
    shutil.copytree(local_path, drive_path, dirs_exist_ok=True)

def restore_from_drive(results_path):
    """Copy the results folder from Google Drive back to the Colab disk.

    Copies `/content/drive/MyDrive/8. Semester/<results_path>` to
    `/content/<results_path>`.

    The copy merges into an already existing local folder (files with the same
    name are overwritten) instead of failing because the folder exists.

    Parameters
    ----------
    results_path : str
        Path of the results folder relative to `/content`. It should be a
        relative path: `os.path.join` discards the base directory when the
        second argument is absolute.

    Note: this definition has the same name as the `restore_from_drive` imported
    from `utils` in the import cell and replaces it once this cell is run.
    """
    drive_path = os.path.join("/content/drive/MyDrive/8. Semester/", results_path)
    local_path = os.path.join("/content", results_path)
    # dirs_exist_ok requires Python 3.8+; without it copytree raises
    # FileExistsError when the local results folder is already present.
    shutil.copytree(drive_path, local_path, dirs_exist_ok=True)
