In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torchvision import transforms
import torchvision

import matplotlib.pyplot as plt
import random
import numpy as np
import copy
import time
import os
import cv2
from PIL import Image
from torchvision.models import vgg19

# controllare

# Load data

In [2]:
def set_seed(seed, use_gpu = True):
    """
    Set SEED for PyTorch reproducibility
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if use_gpu:
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

SEED = 44

USE_SEED = False

if USE_SEED:
    set_seed(SEED, torch.cuda.is_available())

In [3]:
def load_image(image_path, device, output_size=None):
    """Loads an image by transforming it into a tensor."""
    img = Image.open(image_path)

    output_dim = None
    if output_size is None:
        output_dim = (img.size[1], img.size[0])
    elif isinstance(output_size, int):
        output_dim = (output_size, output_size)
    elif isinstance(output_size, tuple):
        if (len(output_size) == 2) and isinstance(output_size[0], int) and isinstance(output_size[1], int):
            output_dim = output_size
    else:
        raise ValueError("ERROR: output_size must be an integer or a 2-tuple of (height, width) if provided.")

    torch_loader = transforms.Compose(
        [
            transforms.Resize(output_dim),
            transforms.ToTensor()
        ]
    )
    
    img_tensor = torch_loader(img).unsqueeze(0)
    return img_tensor.to(device)

In [4]:
def image_style_transfer(config):
    """Implements neural style transfer on a content image using a style image, applying provided configuration."""
    ...
    
    # load content and style images
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    output_size = config.get('output_image_size')
    if output_size is not None:
        if len(output_size) > 1: 
            output_size = tuple(output_size)
        else:
            output_size = output_size[0]

    content_tensor = load_image(content_path, device, output_size=output_size)
    output_size = (content_tensor.shape[2], content_tensor.shape[3])
    style_tensor = load_image(style_path, device, output_size=output_size)

In [5]:
"""Gloria´s paths"""
#content_path = "/home/gloria/Scrivania/Vision_and_cognitive_system/content_style/content.jpg"
#style_path = "/home/gloria/Scrivania/Vision_and_cognitive_system/content_style/style1.jpg"

"""Sara´s paths"""
content_path = "/home/sara/Scrivania/Physics_of_Data/2nd Year/Vision_cognitive_sys/Projects/neural_style_transfer/home.jpeg"
style_path = "/home/sara/Scrivania/Physics_of_Data/2nd Year/Vision_cognitive_sys/Projects/neural_style_transfer/vangogh.jpg"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
output_size = 512

content_tensor = load_image(content_path, device, output_size=output_size)
output_size = (content_tensor.shape[2], content_tensor.shape[3])
style_tensor = load_image(style_path, device, output_size=output_size)

In [6]:
print(content_tensor.shape)
print(style_tensor.shape)

torch.Size([1, 3, 512, 512])
torch.Size([1, 3, 512, 512])


In [7]:
train_mean = style_tensor.mean(axis=(0,2,3)) 
train_std = content_tensor.std(axis=(0,2,3))

print("-----  TRAIN NORMALIZATION VALUES  -----")
print(f"Mean: {train_mean}")
print(f"Standard Deviation: {train_std}")

-----  TRAIN NORMALIZATION VALUES  -----
Mean: tensor([0.3010, 0.3623, 0.4342])
Standard Deviation: tensor([0.2239, 0.2188, 0.2747])


In [8]:
style_tensor

tensor([[[[0.1843, 0.1098, 0.1098,  ..., 0.6706, 0.6941, 0.7255],
          [0.1882, 0.1373, 0.1412,  ..., 0.6980, 0.6784, 0.7059],
          [0.1608, 0.1412, 0.1373,  ..., 0.7412, 0.7490, 0.6902],
          ...,
          [0.5412, 0.5098, 0.5059,  ..., 0.3216, 0.4078, 0.5294],
          [0.5490, 0.5686, 0.5373,  ..., 0.3765, 0.5686, 0.6549],
          [0.6039, 0.5333, 0.5176,  ..., 0.6157, 0.6980, 0.6941]],

         [[0.2039, 0.1255, 0.1255,  ..., 0.6039, 0.6353, 0.6471],
          [0.1922, 0.1412, 0.1490,  ..., 0.6314, 0.6157, 0.6235],
          [0.1608, 0.1412, 0.1451,  ..., 0.6824, 0.6902, 0.6196],
          ...,
          [0.4314, 0.4314, 0.4745,  ..., 0.2902, 0.3529, 0.4745],
          [0.4784, 0.5176, 0.5137,  ..., 0.3373, 0.5059, 0.5882],
          [0.5490, 0.4902, 0.4863,  ..., 0.5686, 0.6196, 0.6118]],

         [[0.2078, 0.1804, 0.2353,  ..., 0.4941, 0.4588, 0.5059],
          [0.2784, 0.2863, 0.3294,  ..., 0.5176, 0.4824, 0.5176],
          [0.2588, 0.2902, 0.3176,  ..., 0

# VGG model

In [9]:
class VGG19(nn.Module):
    def __init__(self):
        super(VGG19, self).__init__()

        #select 5 convolutional layers
        self.chosen_features = {0: 'conv1_1', 5: 'conv2_1', 10: 'conv3_1', 19: 'conv4_1', 21: 'conv4_2', 28: 'conv5_1'}
        self.vgg = torchvision.models.vgg19(pretrained=True).features[:37]
        
    def forward(self, x):
        feature_maps = dict()
        for idx, layer in enumerate(self.vgg):
            x = layer(x)
            if idx in self.chosen_features.keys():
                feature_maps[self.chosen_features[idx]] = x
        
        return feature_maps



In [10]:
#load the model
vgg = VGG19().to(device).eval()



In [11]:
vgg

VGG19(
  (vgg): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding

### Structure of VGG
The structure is:
- Conv1:
    - conv 1_1 [0]
    - conv 1_2 [2]
- Conv2:
    - conv 2_1 [5]
    - conv 2_2 [7]
- Conv3:
    - conv 3_1 [10]
    - conv 3_2 [12]
    - conv 3_3 [14]
    - conv 3_4 [16]
- Conv4:
    - conv 4_1 [19]
    - conv 4_2 [21]
    - conv 4_3 [23]
    - conv 4_4 [25]
- Conv5:
    - conv 5_1 [28]
    - conv 5_2 [30]
    - conv 5_3 [32]
    - conv 5_4 [34]
    
We will use:
- For content loss: conv4_2 
- For style loss: conv1_1,conv2_1,conv3_1,conv4_1,conv5_1

# Losses

The overall loss is constituted by the loss of the target image with respect to the content image, and the loss of the target image with respect to the style image. $$L_{tot}=L_{content}+L_{style}$$
For this process it wouldn´t make sense to compare the images pixel by pixel: for example if the content image contains a house and the predominant style of the style image is to have diagonal lines, we would want the target image to be a house which is inclinated diagonally; comparing pixel by pixel an image with a diagonal house and an image with a house would return a much higher loss than we expect, because the pixel by pixel comparison doesn´t take into account more ǵeneric'features
In order to perform a more accurate comparison, both these losses are evaluated between **feature maps** which take into account the more generic features of both images.

### Content loss
The content loss is computed at the end of the CNN; we compute the mean squared error between the target feature map and the content feature map.

In [12]:
def get_content_loss(target_map, content_map):
    #return torch.mean((content_original-content_current)**2)
    return torch.nn.MSELoss(reduction='mean')(target_map, content_map)

### Style loss
For the style loss, the procedure is more complicated.
We are interested in co-occurrences of pairs of features to highlight important stylistic combinations.    

If we have a feature map, of height and width $h,w$ and lenght $k$, which is the number of maps applied, we want to compute cooccurrences between each pair of  maps $i,j$ with values in $[0,k]$ range (these are also called *channels*): we obtain a $kxk$ matrix in which each entry is the dot product between two maps, a scalar.    
Given the feature map of an image, this matrix, called the **Gram matrix** can be computed easily as the sum of the matrix multiplication between the whole feature map and its transpose.
   
This is done both with the feature map of the STYLE IMAGE and the feature map of the TARGET IMAGE.
We compute a Gram matrix for both images for each convolutional layer considered $l$, and end up with:
   - 5 Gram matrices of the style image feature maps $G_{style}^l$
   - 5 Gram matrices of the target image feature maps  $G_{target}^l$    
   
The loss of each layer $l$ is computed via MSE between the two gram matrices, and the overall style loss will be the average of these values over the number of layers (in our case 5).

In [13]:
def get_style_loss(target_map,style_map):
    """Compute MSE between gram matrix of style feature map and of generated feature map as style loss."""
    _, channel, height, width = target_map.shape
    
    #computing Gram matrix of the style feature map
    style_gram = style_map.view(channel, height*width).mm(
        style_map.view(channel, height*width).t()
    )
    #computing Gram matrix of the target feature map
    target_gram = target_map.view(channel, height*width).mm(
        target_map.view(channel, height*width).t()
    )

    return torch.nn.MSELoss(reduction='sum')(target_gram,style_gram)



# Training

### Initialize random (target) image

In [14]:
img=content_tensor
img.shape         #to take the right dimensions for the generated image

torch.Size([1, 3, 512, 512])

In [15]:
'Generating random noise as input image '
#gaussian_noise_img = np.random.normal(loc=0, scale=90., size=img.shape).astype(np.float32)
white_noise_img = np.random.uniform(-90., 90., img.shape).astype(np.float32)
init_img = torch.from_numpy(white_noise_img).float().to(device)

In [16]:
init_img = (init_img - init_img.min()) / (init_img.max() - init_img.min())

In [17]:
init_img.shape

torch.Size([1, 3, 512, 512])

### Parameters

In [36]:
style_layers = ['conv1_1','conv2_1','conv3_1','conv4_1']
content_layers = ['conv4_2']

In [37]:
content = content_tensor
style = style_tensor
target = init_img.requires_grad_(True)  #requires_grad is needed to make sure that the image is updated


learn_rate=1e1
alpha=1
beta=1000


from torchvision.utils import save_image
intermediate_dir="/home/sara/Scrivania/Physics_of_Data/2nd Year/Vision_cognitive_sys/Projects/neural_style_transfer/intermediate"

In [39]:
def train_image(content, style, target, device, output_img_fmt, content_img_name, style_img_name, num_epochs):
    """Update the output image using pre-trained VGG19 model."""
    ...
    
    model = VGG19().to(device).eval()    # freeze parameters in the model
    optimizer = torch.optim.Adam([target], lr=learn_rate)
    
    
    for epoch in range(num_epochs):
        # get features maps of content, style and generated images from chosen layers
        content_features = model(content)
        style_features = model(style)
        target_features = model(target)

        content_loss  = 0.0
        style_loss= 0.0
        
        """Computing loss"""
        for layer in target_features.keys(): 
            content_feature = content_features[layer]
            style_feature = style_features[layer]
            target_feature = target_features[layer]
   
            
            if layer in content_layers:
            # computes content loss on layer 4_2
                content_loss_per_feature = get_content_loss(content_feature, target_feature)
                content_loss += content_loss_per_feature
                
            if layer in style_layers:
            # computes style loss for all 5 style layers
                style_loss_per_feature = get_style_loss(style_feature, target_feature)
                style_loss+=style_loss_per_feature
        
        #average style loss over all 5 layers
        style_loss /= len(style_layers)
        
        # Total loss 
        total_loss = alpha * content_loss + beta * style_loss

        #optimizer.zero_grad()
      #compute the gradient
        total_loss.backward()
      #update parameters
        optimizer.step()
        optimizer.zero_grad()
        
        #Save every 100 steps
        if ((epoch+1)%10)==0:
            save_image(target, os.path.join(intermediate_dir, f'nst-{content_img_name}-{style_img_name}-{epoch + 1}.{output_img_fmt}'))

        print(f"\tEpoch {epoch + 1}/{num_epochs}, loss = {total_loss}")
    ...

    return 1

In [None]:
train_image(content, style, target, device,'jpeg', 'house', 'vangogh',500)

	Epoch 1/500, loss = 6.758702270906368e+16
	Epoch 2/500, loss = 1.217178323658214e+23
	Epoch 3/500, loss = 1.8044223598014245e+22
	Epoch 4/500, loss = 5.318443113710044e+22
	Epoch 5/500, loss = 5.15630767244519e+22
	Epoch 6/500, loss = 4.4495505771625345e+22
	Epoch 7/500, loss = 4.233692596662704e+22
	Epoch 8/500, loss = 3.978865418947199e+22
	Epoch 9/500, loss = 3.6103439679188007e+22
	Epoch 10/500, loss = 3.210899901128951e+22
	Epoch 11/500, loss = 2.834294263589441e+22
	Epoch 12/500, loss = 2.497508102875253e+22
	Epoch 13/500, loss = 2.197009921338584e+22


In [34]:
save_image(content, os.path.join(intermediate_dir, 'content_after.jpg'))
save_image(style, os.path.join(intermediate_dir, 'style_after.jpg'))