#1.  GAN generation and manipulation 

# Setup GenForce
GenForce is an open-source library for generative models. For this assignment, we will mainly focus on StyleGAN2. 

In [None]:
# Do not change this code block
# Colab environment setup: clone the GenForce sources and install their requirements.
import os
os.chdir('/content')
CODE_DIR = 'GenForce'
# Lines starting with `!` are IPython shell escapes; `$CODE_DIR` is filled in from the Python variable.
!git clone https://github.com/genforce/genforce.git $CODE_DIR
# Work from inside the clone; presumably this is what lets the later
# `from models import ...` / `from utils.visualizer import ...` imports resolve.
os.chdir(f'./{CODE_DIR}')
# pip's output is redirected to a file instead of being printed in the notebook.
!pip install -r requirements.txt > installation_output.txt

import os
import subprocess
import io
import IPython.display
import numpy as np
import PIL.Image
import torch

from models import MODEL_ZOO
from models import build_generator
from utils.visualizer import fuse_images
import tqdm


# Load StyleGAN2 Model for human face
First we need to initialize the StyleGAN2 model and load the checkpoint pre-trained on FF-HQ. We will use the model with 256*256 resolution to save time. If you are seeking better generation quality, you may change to 512 or 1024 resolution after you finish all the implementation.

In [None]:
# Do not change this code block
# Download checkpoint, this should take ~30s
model_name = "stylegan_ffhq256"
model_url = "https://mycuhk-my.sharepoint.com/:u:/g/personal/1155082926_link_cuhk_edu_hk/ES-NAUCC2qdHg87BftvlBiQBVpbJ8-005Q4TNr5KrOxQEw?e=00AnWt&download=1"
os.makedirs('checkpoints', exist_ok=True)
checkpoint_path = os.path.join('checkpoints', model_name + '.pth')
# check_call raises CalledProcessError when wget exits non-zero, so a failed
# download stops here instead of surfacing later as a confusing torch.load error.
subprocess.check_call(['wget', '-O', checkpoint_path, model_url])

# Initialize StyleGAN generator
model_config = MODEL_ZOO[model_name].copy()  # copy so the shared MODEL_ZOO entry is not mutated
model_config.pop('url')  # removed before the config is passed to build_generator as keyword arguments
generator = build_generator(**model_config)
generator = generator.cuda()
generator.eval()

# Load checkpoint
# NOTE: torch.load unpickles the file; only load checkpoints from a source you trust.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
generator.load_state_dict(checkpoint['generator'])
print('Finish loading checkpoint.')

# Define utility functions


In [None]:
# Do not change this code block
def postprocess(images):
  """Convert a batch of generator outputs into displayable uint8 arrays.

  Args:
    images: `torch.Tensor` whose axes are permuted with (0, 2, 3, 1) below,
      so it must be 4-D; values in [-1, 1] map onto [0, 255].

  Returns:
    `numpy.ndarray` of dtype uint8 with the second axis moved to the end.
  """
  pixels = images.detach().cpu().numpy()
  # Affine map [-1, 1] -> [0, 255]; the extra 0.5 makes the uint8 cast round
  # to nearest instead of truncating.
  pixels = np.clip((pixels + 1) * 255 / 2 + 0.5, 0, 255)
  return np.transpose(pixels.astype(np.uint8), (0, 2, 3, 1))

def imshow(images, viz_size=256, col=0, spacing=0):
  """Render a batch of images as a single JPEG in the notebook output.

  Args:
    images: image batch handed to `fuse_images` unchanged.
    viz_size: forwarded to `fuse_images` as `image_size`.
    col: forwarded to `fuse_images` as `col`.
    spacing: forwarded as both `row_spacing` and `col_spacing`.

  Returns:
    Whatever `IPython.display.display` returns for the encoded image.
  """
  canvas = np.asarray(
    fuse_images(
      images,
      col=col,
      image_size=viz_size,
      row_spacing=spacing,
      col_spacing=spacing,
    ),
    dtype=np.uint8,
  )
  # Encode in memory so nothing is written to disk.
  buffer = io.BytesIO()
  PIL.Image.fromarray(canvas).save(buffer, 'jpeg')
  jpeg_bytes = buffer.getvalue()
  return IPython.display.display(IPython.display.Image(jpeg_bytes))

# Generate 10 images and store the corresponding latent codes (both z latent and w latent)
This is sample code for generating 10 human face images. StyleGAN2 has two latent spaces, z and w, and we will test both latent spaces on their disentangling ability.

In [None]:
# Do not change this code block
# Set random seed.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
num = 10  # total number of images to synthesize
batch_size = 1  # number of latent codes drawn per generator call

# data structure to store latent codes (both z and w), you may use any structure as you like
z = []
w = []

# Sample and synthesize.
outputs = []
for idx in tqdm.tqdm(range(0, num, batch_size)):
  # The final batch is smaller when num is not a multiple of batch_size.
  batch = min(batch_size, num - idx)
  latent_z = torch.randn(batch, generator.z_space_dim).cuda()
  
  # Inference only: no autograd graph is needed for sampling.
  with torch.no_grad():
    latent_w = generator.mapping(latent_z)['w']
    # Keep CPU copies of both latents for later use.
    z.append(latent_z.detach().cpu())
    w.append(latent_w.detach().cpu())
    wp = generator.truncation(latent_w)
    images = generator.synthesis(wp)['image']
    images = postprocess(images)
  outputs.append(images)
# Join the per-batch arrays along the first axis into one array.
img_tensor = np.concatenate(outputs, axis=0)


imshow(img_tensor)

# Extract face attributes
We will use deepface API to extract face attributes for each image.
The code below is a simple example of using the deepface API to extract the age and emotion attributes of the 10 people generated in the previous section. Your task is:

1. Generate 10k images using StyleGAN, store the z and w latent code
2. Get the age and emotion attributes of the 10k generated images, and store the attributes.
3. Convert the attributes to binary labels. For the age attribute, label age > 20 as 1 and age < 20 as -1. For the emotion attribute, label "happy" as 1 and everything else as -1.

You can choose any data structure you like. We suggest storing the labels and latent codes in your drive, since computing them might take a long time and you would otherwise need to re-run everything when you resume your work, but this is not required.

In [None]:
!pip install deepface

In [None]:
from deepface import DeepFace
# Example call: analyze two of the generated images for the 'age' and 'emotion' actions.
# NOTE(review): `enforce_detection` and `prog_bar` are deepface keyword arguments whose
# availability and meaning depend on the installed release — confirm against its documentation.
attribute = DeepFace.analyze(img_path = [img_tensor[1], img_tensor[2]], actions = ['age','emotion'], enforce_detection = False, prog_bar = False)
# Last expression of the cell, so the notebook displays the result.
attribute

## 1. Generate 5k latent code (z and w)   
Similar to the provided example, please generate 5k images and store the z and w latent codes separately. Visualization might help you debug, but it is not required.

In [None]:
def get_latent(generator):
  """Student stub: sample latent codes and synthesize the matching images.

  The code written between the markers must define the three names that are
  returned below; as delivered, the function raises NameError.

  Args:
    generator: the generator object built earlier in the notebook.

  Returns:
    Tuple `(latent_z_list, latent_w_list, img_tensor)`. `img_tensor` is later
    iterated image by image (see `tensor2list`); presumably it has the same
    layout as the output of `postprocess` — confirm in your implementation.
  """
  #################################
  ##### your code starts here #####
  #################################

  #################################
  ##### your code ends here #######
  #################################
  return latent_z_list, latent_w_list, img_tensor
latent_z_list, latent_w_list, img_tensor = get_latent(generator)

## 2. Get attributes    
Get labels for the generated images. This part might take 30 minutes to complete. For more info about deepface attributes detection, you can check https://github.com/serengil/deepface and https://github.com/serengil/deepface/blob/master/deepface/DeepFace.py#L267

Store the attributes and return.

In [None]:
# we need to convert image tensor to image list for deepface 
def tensor2list(img_tensor):
  """Split a batch into a plain Python list with one entry per image.

  Iterates `img_tensor` along its first axis and collects each item, in
  order, into a new list.
  """
  return [single_image for single_image in img_tensor]

def get_attribute(img_tensor):
  """Student stub: extract face attributes for every image in `img_tensor`.

  The code written between the markers must define `attribute`; as
  delivered, the function raises NameError.

  Args:
    img_tensor: batch of images; it is split into a per-image list first.

  Returns:
    `attribute`: the raw attribute results, later passed to `get_label`.
  """
  img_list = tensor2list(img_tensor)
  #################################
  ##### your code starts here #####
  #################################

  #################################
  ##### your code ends here #######
  #################################
  
  return attribute

attribute = get_attribute(img_tensor)


## 3. Process attributes to get binary labels 
Convert the attributes to binary labels. For the age attribute, label age > 20 as 1 and age < 20 as -1. For the emotion attribute, label "happy" as 1 and everything else as -1.

In [None]:
def get_label(raw_attributes):
  """Student stub: turn raw attributes into binary age and emotion labels.

  The code written between the markers must define `label_age` and
  `label_emotion`; as delivered, the function raises NameError.

  Args:
    raw_attributes: the value returned by `get_attribute`.

  Returns:
    Tuple `(label_age, label_emotion)`. Per the task text, labels are 1 / -1:
    age > 20 -> 1, age < 20 -> -1; "happy" -> 1, anything else -> -1.
    NOTE(review): the task text does not say how age == 20 is labelled.
  """
  #################################
  ##### your code starts here #####
  #################################

  #################################
  ##### your code ends here #######
  #################################

  return label_age, label_emotion

label_age, label_emotion = get_label(attribute)

# InterfaceGAN
[InterfaceGAN](https://arxiv.org/abs/2005.09635) is a simple approach to manipulating the latent space. It takes latent codes as training data and their corresponding attributes as labels, trains an SVM to classify the latent codes according to the labels, and then extracts the classification boundary as the attribute boundary for StyleGAN. 

## Get decision boundary 
In this part we will implement InterfaceGAN to derive the boundary of each attribute. Unlike the original paper, which uses an SVM, we will use logistic regression, since SVMs are not covered in class. The latent codes are your data; apply logistic regression with the binary labels. You may re-use any code from previous assignments, or you can use functions in sklearn. You should get FOUR boundaries: z_age_boundary, z_emotion_boundary, w_age_boundary, w_emotion_boundary

In [None]:
import sklearn.linear_model
def get_boundary(latent, label):
  """Student stub: fit a classifier on latent codes and return its boundary.

  The code written between the markers must define `boundary`; as
  delivered, the function raises NameError.

  Args:
    latent: latent codes used as training data (z or w).
    label: binary labels (1 / -1) aligned with `latent`.

  Returns:
    `boundary`: the attribute direction in the same latent space as `latent`.
  """
  #################################
  ##### your code starts here #####
  #################################

  #################################
  ##### your code ends here #######
  #################################
  # The four callers below assign the result, so the stub must return it.
  return boundary
z_age_boundary = get_boundary(latent_z_list, label_age)
z_emotion_boundary = get_boundary(latent_z_list, label_emotion)
w_age_boundary = get_boundary(latent_w_list, label_age)
w_emotion_boundary = get_boundary(latent_w_list, label_emotion)

## Apply InterfaceGAN 
To apply InterfaceGAN, add the scaled boundary to the initial latent code, i.e. w_elder_latent = w_latent + scale * w_age_boundary, where you can choose your own scale parameter to improve the generated image. We were able to get a decent result by setting scale=2. Feed the "edited" latent to the generator and visualize the result.

In [None]:
def apply_interfacegan(generator, age_boundary, emotion_boundary, scale):
  """Student stub: edit latent codes along the two boundaries and synthesize.

  The code written between the markers must define the three returned
  names; as delivered, the function raises NameError.

  Args:
    generator: the generator object built earlier in the notebook.
    age_boundary: boundary for the age attribute.
    emotion_boundary: boundary for the emotion attribute.
    scale: multiplier applied to a boundary before it is added to a latent
      (per the task text: latent + scale * boundary).

  Returns:
    Tuple `(img_tensor, img_tensor_edit_age, img_tensor_edit_emotion)`:
    the unedited images and the age- and emotion-edited images.
  """
  #################################
  ##### your code starts here #####
  #################################

  #################################
  ##### your code ends here #######
  #################################
  return img_tensor, img_tensor_edit_age, img_tensor_edit_emotion
  


## Visualize your result
The face might not be high quality since we are using the StyleGAN model of resolution 256 for fast inference time. If you are interested in higher quality for image generation or InterfaceGAN, feel free to change the resolution to 1024.

Here we provide an expected output. As long as your manipulation makes sense you will get full credit.

In [None]:
# Edit in w space and display the raw, age-edited and emotion-edited images.
# NOTE(review): `scale` is not assigned in any cell shown before this one; define it
# first (the task text mentions scale=2), otherwise this call raises NameError.
raw, w_age_editted, w_emotion_editted = apply_interfacegan(generator, w_age_boundary, w_emotion_boundary, scale)
print("Raw image")
imshow(raw)
# NOTE(review): whether the edit looks younger or older depends on the sign of `scale`
# and on the boundary orientation — confirm the caption matches your implementation.
print("Younger face")
imshow(w_age_editted)
print("Happier face")
imshow(w_emotion_editted)

# Interpolation 
Generate interpolations of the age and happiness attributes. You should first generate 1 image, then edit it along the age boundary and the happiness boundary with scales from [-5, 5]. Return the interpolated image tensor. 

In [None]:
def interpolation(generator, boundary):
  """Student stub: synthesize one image edited along `boundary` at several scales.

  The code written between the markers is expected to append one image
  batch per scale value to the local `interpolation` list.

  Args:
    generator: the generator object built earlier in the notebook.
    boundary: attribute boundary to move the latent code along.

  Returns:
    `img_tensor`: the appended batches concatenated along axis 0.
  """
  # NOTE(review): np.arange(-5, 5, 1) yields -5..4 (5 is excluded), while the task
  # text says "[-5, 5]" — confirm whether the endpoint should be included.
  scale = np.arange(-5, 5, 1)
  # Local list; it shadows the function's own name inside this body.
  interpolation = []
  #################################
  ##### your code starts here #####
  #################################
  
  #################################
  ##### your code ends here #######
  #################################
  img_tensor = np.concatenate(interpolation, axis=0)
  return img_tensor

In [None]:
# NOTE(review): no visible cell calls interpolation(); `img_tensor` here is whatever the
# last global assignment left. Presumably it should be set from interpolation(generator, <boundary>) first.
imshow(img_tensor)

# Question: From your observation, which latent space is more powerful (disentangled) for image manipulation? w space or z space?

## Your Answer: 

# GAN Inversion
We have now seen how to generate an image from a random latent code and how to manipulate the latent code to edit images. However, many real-life applications require taking an image as input and editing it, that is, "encoding" the image into the GAN's latent space. We call this process GAN Inversion. The idea of GAN Inversion is to first generate an image from a starting latent code, compute the loss between the features of the generated image and those of the real image (extracted by a neural network, VGG), and then backpropagate through that network and the generator to optimize the latent code until the output image is similar enough.

## 1. process input image
Upload img.jpg to your colab, and copy the path to read image

In [None]:
# process input image
from PIL import Image
from torchvision import transforms

# Open the image and force 3-channel RGB. Image.open keeps the file's native
# mode, so a grayscale, RGBA or CMYK file would otherwise break the
# reshape(..., 3) below and the 3-channel Normalize.
input_image = Image.open(r"/content/img.jpg").convert("RGB")
# Copy for display only: 256x256, with a leading batch axis for imshow.
input_image_vis = np.asarray(input_image.resize((256, 256))).reshape(1,256,256,3)

# transform
# Resize/crop to 256x256, convert to a tensor, then map [0, 1] to [-1, 1]
# (mean 0.5, std 0.5 per channel), the inverse of the mapping in postprocess().
transform = transforms.Compose(
        [
            transforms.Resize((256, 256)),
            transforms.CenterCrop((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )

input_image = transform(input_image)
imshow(input_image_vis)

def vis(input_img):
  """Synthesize and display the image(s) for a latent code.

  Args:
    input_img: despite the name, a latent tensor rather than an image. It is
      moved to the GPU and passed through `generator.truncation` and then
      `generator.synthesis`.

  Returns:
    None; the image is shown in the notebook output via `imshow`.
  """
  # Display only: skip autograd bookkeeping, like the sampling loops do.
  with torch.no_grad():
    wp = generator.truncation(input_img.cuda())
    images = generator.synthesis(wp)['image']
  imshow(postprocess(images))

## 2. build VGG network

In [None]:
import torchvision.models as models
# VGG-16 with pretrained weights; it is moved to the GPU and put in eval mode in later cells.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version.
vgg16 = models.vgg16(pretrained=True)

## 3. Get mean latent and visualize the mean face
Optimizing from a purely random latent code is very difficult. Instead, we first compute the mean latent of 10k generated faces, which represents an average human face, and start the optimization from it.

In [None]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
num = 10000
batch_size = 1

w_samples = []

# Sample w latents only; no images are synthesized in this loop.
for idx in tqdm.tqdm(range(0, num, batch_size)):
  # The final batch is smaller when num is not a multiple of batch_size.
  batch = min(batch_size, num - idx)
  latent_z = torch.randn(batch, generator.z_space_dim).cuda()

  with torch.no_grad():
    latent_w = generator.mapping(latent_z)['w']
    w_samples.append(latent_w.detach().cpu())

# Keep the statistics as torch tensors: the code below and the optimization
# loop call `.cuda()` on values derived from w_avg / w_std, which numpy
# arrays and numpy scalars do not provide.
# Concatenate along the batch axis and insert a singleton axis after it, which
# gives the same layout as stacking the batch_size=1 samples.
w_samples = torch.cat(w_samples, dim=0).unsqueeze(1)
w_avg = w_samples.mean(dim=0, keepdim=True)
# Scalar spread of the samples around the mean (0-dim tensor).
w_std = (((w_samples - w_avg) ** 2).sum() / num) ** 0.5

vis(w_avg[0].cuda())

## 4. Use VGG to Get image feature

In [None]:
# Move the preprocessed target image and the VGG network to the GPU;
# eval() switches the network to inference mode.
input_image = input_image.cuda()
vgg16.eval().cuda()

# Student placeholder: assign the VGG features of `input_image` to `target_features`.
# NOTE(review): `input_image` comes straight from the transform above; presumably it
# needs a leading batch axis before being fed to the network — confirm.
#################################
##### your code starts here #####
#################################
target_features = 
#################################
##### your code ends here #######
#################################



## 5. Noise and optimizer initialization

In [None]:
# Collect the synthesis-network buffers whose name contains 'apply_noise'.
noise_bufs = { name: buf for (name, buf) in generator.synthesis.named_buffers() if 'apply_noise' in name }

# NOTE(review): `device` and `initial_learning_rate` are assigned in the
# "Hyperparameter setup" cell that follows this one; run that cell first or
# these two lines raise NameError.
# The optimized latent starts from the mean latent.
w_opt = torch.tensor(w_avg[0], dtype=torch.float32, device=device, requires_grad=True) 
# Adam updates the latent and the noise buffers together.
optimizer = torch.optim.Adam([w_opt] + list(noise_bufs.values()), betas=(0.9, 0.999), lr=initial_learning_rate)

# Init noise.
for buf in noise_bufs.values():
    # Gradients are switched off around the in-place fill: writing in place to a
    # leaf tensor that requires grad raises an error (matters when re-running the cell).
    buf.requires_grad = False

    buf[:] = torch.randn_like(buf)
    buf.requires_grad = True

## 6. Hyperparameter setup

In [None]:
import torch.nn.functional as F

device = "cuda"
# Number of optimization iterations.
num_steps                  = 1500
# Peak learning rate; the loop multiplies it by the ramp-up/ramp-down factor each step.
initial_learning_rate      = 0.05
# Multiplied by w_std to give the scale of the noise added to w at step 0.
initial_noise_factor       = 0.05
# Fraction of the run, at the end, over which the learning rate follows a cosine ramp-down.
lr_rampdown_length         = 0.25
# Fraction of the run, at the start, over which the learning rate ramps up linearly.
lr_rampup_length           = 0.05
# Fraction of the run after which the noise added to w has decayed to zero.
noise_ramp_length          = 0.75
# Weight of the noise regularization term in the total loss.
regularize_noise_weight    = 1e5

# Make sure the target image and VGG are on the GPU and VGG is in eval mode.
input_image = input_image.cuda()
vgg16.eval().cuda()


# Start optimization process
We now obtain the inversion by optimization. The image inverted by this method might not look like our target (and usually looks completely different). Don't worry too much about the final result; as long as the implementation and the result make sense you will get full credit.

Your task in this part are:

*    Get synth_images from w latent code
*    Get synthesis feature from synth_images
*    Implement MSE loss between target feature and synthesis feature for the optimization 
*    Tune the hyperparameters to get a "good enough" result. It's fine if your result looks very different from the reference image. However, you are expected to get a human image with some identity traits similar to the reference image (e.g. race). 

In [None]:
for step in tqdm.tqdm(range(num_steps)):
    # Learning rate schedule.
    t = step / num_steps
    w_noise_scale = w_std * initial_noise_factor * max(0.0, 1.0 - t / noise_ramp_length) ** 2
    lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length)
    lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
    lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length)
    lr = initial_learning_rate * lr_ramp
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Synth images from opt_w.

    w_noise = torch.randn_like(w_opt) * w_noise_scale.cuda()
    w_latent = (w_opt + w_noise).repeat([1, 1, 1])

    #################################
    ##### your code starts here #####
    #################################
    synth_images = 
    # Features for synth images.
    synth_features = 
    # MSE Loss
    mse_loss = 
    #################################
    ##### your code ends here #######
    #################################

    
    
    # Noise regularization.
    reg_loss = 0.0
    for v in noise_bufs.values():
        noise = v[None,None,:,:] # must be [1,1,H,W] for F.avg_pool2d()
        while True:
            reg_loss += (noise*torch.roll(noise, shifts=1, dims=3)).mean()**2
            reg_loss += (noise*torch.roll(noise, shifts=1, dims=2)).mean()**2
            if noise.shape[2] <= 8:
                break
            noise = F.avg_pool2d(noise, kernel_size=2)
    loss = mse_loss + reg_loss * regularize_noise_weight

    # Step
    optimizer.zero_grad(set_to_none=True)
    loss.backward(retain_graph=True)
    optimizer.step()
   

    # Save projected W for each optimization step.
    w_out = w_opt.detach()[0]

    # Normalize noise.
    with torch.no_grad():
        for buf in noise_bufs.values():
            buf = buf - buf.mean()
            buf = buf * buf.square().mean().rsqrt()


In [None]:
# Display the final projected latent; unsqueeze(0) restores the leading axis
# that `[0]` removed when w_out was saved in the loop.
vis(w_out.unsqueeze(0))