# Практическое занятие №5

## Необходимые сегодня библиотеки

* torch
* timm
* scikit-learn
* pandas

## [Автоэнкодер](https://neurohive.io/ru/osnovy-data-science/avtojenkoder-tipy-arhitektur-i-primenenie/)

Инициализация конфига для дальнейшей работы

In [None]:
# Hyperparameters and paths for the plain autoencoder experiment.
config = dict(
    batch_size=32,
    total_epochs=40,
    learning_rate=1e-4,
    save_path='./weights',  # directory where the models are saved
    seed=42,
    dim_code=512,  # length of the latent-space vector
)

In [None]:
import os

In [None]:
# Create the weights directory. makedirs also creates missing parent
# directories, and exist_ok=True removes the check-then-create race.
os.makedirs(config['save_path'], exist_ok=True)

In [None]:
# Shortcut to the weights directory; used below when saving models and logs.
save_path = config['save_path']

### Подготовка данных

In [None]:
import numpy as np
from torch.autograd import Variable
from torchvision import datasets
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils
import torch
import matplotlib.pyplot as plt
import random

import os
import pandas as pd
import skimage.io
from skimage.transform import resize

from IPython.display import clear_output

%matplotlib inline

Будем использовать базу лиц [LFW](https://vis-www.cs.umass.edu/lfw/)

In [None]:
def fetch_dataset(attrs_name = "lfw_attributes.txt",
                      images_name = "lfw-deepfunneled",
                      dx=80,dy=80,
                      dimx=64,dimy=64
    ):
    """Load the LFW face images and their attribute table.

    Missing inputs are downloaded with ``wget`` into the current directory.

    Args:
        attrs_name: path of the tab-separated attributes file.
        images_name: directory that is walked recursively for ``.jpg`` files.
        dx, dy: number of pixels cropped from each side along the column (x)
            and row (y) axes before resizing.
        dimx, dimy: width and height of the resized crop.

    Returns:
        ``(all_photos, all_attrs)``: a NumPy array stacking the cropped and
        resized images, and a DataFrame with the attribute columns
        (``photo_path``, ``person`` and ``imagenum`` are dropped).

    Raises:
        AssertionError: if the image directory is still missing after the
            download, or if rows were lost when joining attributes to photos.
    """
    # download if not exists
    if not os.path.exists(images_name):
        print("images not found, donwloading...")
        os.system("wget http://vis-www.cs.umass.edu/lfw/lfw-deepfunneled.tgz -O tmp.tgz")
        print("extracting...")
        os.system("tar xvzf tmp.tgz && rm tmp.tgz")
        print("done")
        assert os.path.exists(images_name)

    if not os.path.exists(attrs_name):
        print("attributes not found, downloading...")
        os.system("wget http://www.cs.columbia.edu/CAVE/databases/pubfig/download/%s" % attrs_name)
        print("done")

    # read attrs from the file that was actually requested (the path used to
    # be hard-coded, so a custom `attrs_name` was silently ignored)
    df_attrs = pd.read_csv(attrs_name, sep='\t', skiprows=1)
    # Shift the column names one position to the left and drop the last data
    # column (presumably the header line carries one extra leading token --
    # TODO confirm against the file).
    df_attrs = pd.DataFrame(df_attrs.iloc[:, :-1].values, columns=df_attrs.columns[1:])

    # read photos: file names look like "<person words>_<number>.jpg"
    photo_ids = []
    for dirpath, dirnames, filenames in os.walk(images_name):
        for fname in filenames:
            if fname.endswith(".jpg"):
                fpath = os.path.join(dirpath, fname)
                photo_id = fname[:-4].replace('_', ' ').split()
                person_id = ' '.join(photo_id[:-1])
                photo_number = int(photo_id[-1])
                photo_ids.append({'person': person_id, 'imagenum': photo_number, 'photo_path': fpath})

    photo_ids = pd.DataFrame(photo_ids)
    df = pd.merge(df_attrs, photo_ids, on=['person', 'imagenum'])

    assert len(df)==len(df_attrs),"lost some data when merging dataframes"

    # image preprocessing
    def _load_photo(path):
        # Read one image, crop the borders and resize the crop.
        img = skimage.io.imread(path)
        img = img[dy:-dy, dx:-dx]
        # resize takes (rows, cols), i.e. (height, width); identical to the
        # previous [dimx, dimy] for the square defaults.
        return resize(img, [dimy, dimx])

    all_photos = np.stack([_load_photo(path) for path in df['photo_path']])
    all_attrs = df.drop(["photo_path", "person", "imagenum"], axis=1)

    return all_photos, all_attrs

In [None]:
# Load the images and their attributes
data, attrs = fetch_dataset()

Фиксируем `seed` для обучения

In [None]:
# Seed every random number generator used in this notebook.
random.seed(config['seed'])
np.random.seed(config['seed'])
torch.manual_seed(config['seed'])

# Use the first GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(config['seed'])
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

device

Разбиваем выборку на `train` и `val`

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for validation; the split is seeded from the config.
train_data, val_data, train_attrs, val_attrs = train_test_split(data, attrs, test_size=0.2, random_state=config['seed'])

Выведем несколько изображений

In [None]:
# Show a 2x3 grid of randomly chosen training images.
fig, ax = plt.subplots(nrows=2, ncols=3, sharex=True, sharey=True)

for fig_x in ax.flat:
    idx = random.randrange(train_data.shape[0])
    fig_x.imshow(train_data[idx])
    # NOTE(review): these pyplot calls go through the implicit "current axes",
    # not through `fig_x` -- confirm this is the intended target.
    plt.xticks([])
    plt.yticks([])
    plt.grid(True)
plt.show()

Приведем картинки к тензорам и создадим даталоадеры

Размерность изображений выглядит следующим образом (height, width, channels)

In [None]:
# Shape of a single training sample before the tensor conversion.
print(train_data[0].shape)

Мы хотим, чтобы было: (channels, height, width) поэтому применяем permute

In [None]:
# Convert both splits to float tensors and move the channel axis from the
# last position to position 1: (N, H, W, C) -> (N, C, H, W).
train_data = torch.FloatTensor(train_data).permute(0,3,1,2)
val_data = torch.FloatTensor(val_data).permute(0,3,1,2)

In [None]:
# Shape of a single training sample after the permute.
print(train_data[0].shape)

Создаём даталоадеры

In [None]:
# Reshuffle the training samples every epoch so batches differ between epochs.
# The validation loader keeps a fixed order, so its first batch stays stable.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=config['batch_size'], shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=config['batch_size'])

### Архитектура модели

Реализуем класс автоэнкодера

Полезные ссылки:
* https://indoml.com/2018/03/07/student-notes-convolutional-neural-networks-cnn-introduction/
* https://towardsdatascience.com/conv2d-to-finally-understand-what-happens-in-the-forward-pass-1bbaafb0b148

In [None]:
class Autoencoder(nn.Module):
    """Convolutional autoencoder for 64x64 images.

    The encoder stacks three stride-2 convolutions (64 -> 32 -> 16 -> 8) and
    a linear layer that produces a latent vector of length ``dim_code``.
    The decoder mirrors it: a linear layer, three transposed convolutions
    and a final sigmoid.
    """

    def __init__(self, dim_code, in_channels=3, out_channels=32):
        super().__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.dim_code = dim_code

        # Channel widths along the encoder: input -> c -> 2c -> 4c.
        widths = [in_channels, out_channels, out_channels * 2, out_channels * 4]
        # 8x8 feature map: the 64x64 input is halved by each of the three
        # stride-2 convolutions.
        side = 8
        flat_features = widths[-1] * side * side

        encoder_layers = []
        for c_in, c_out in zip(widths[:-1], widths[1:]):
            encoder_layers.extend([
                nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, stride=2, padding=1),
                nn.BatchNorm2d(num_features=c_out),
                nn.ReLU(),
            ])
        encoder_layers.append(nn.Flatten(start_dim=1))
        encoder_layers.append(nn.Linear(flat_features, dim_code))
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(dim_code, flat_features),
            nn.Unflatten(dim=1, unflattened_size=(widths[-1], side, side)),
        ]
        # Hidden upsampling stages: 4c -> 2c -> c, each doubling the size.
        for c_in, c_out in ((widths[3], widths[2]), (widths[2], widths[1])):
            decoder_layers.extend([
                nn.ConvTranspose2d(in_channels=c_in, out_channels=c_out, kernel_size=3,
                                   stride=2, padding=1, output_padding=1),
                nn.BatchNorm2d(num_features=c_out),
                nn.ReLU(),
            ])
        # Output stage: back to the input channel count, squashed by a sigmoid.
        decoder_layers.append(
            nn.ConvTranspose2d(in_channels=widths[1], out_channels=widths[0], kernel_size=3,
                               stride=2, padding=1, output_padding=1)
        )
        decoder_layers.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for the batch ``x``."""
        latent_code = self.get_latent(x)
        return self.get_reconstruction(latent_code), latent_code

    def get_latent(self, x):
        """Return only the latent representation of ``x``."""
        return self.encoder(x)

    def get_reconstruction(self, x):
        """Return the reconstruction decoded from the latent batch ``x``."""
        return self.decoder(x)

Инициализация начальных параметров

In [None]:
# Reconstruction objective: mean squared error between output and input.
criterion = nn.MSELoss()

autoencoder = Autoencoder(dim_code=config['dim_code'])
autoencoder = autoencoder.to(device)

optimizer = torch.optim.AdamW(params=autoencoder.parameters(), lr=config['learning_rate'])
# Scale the learning rate by 0.1 after every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

### Обучение

In [None]:
def train(config, model, optimizer, criterion, train_loader, val_loader, scheduler):
    """Train an autoencoder, plotting reconstructions and loss curves each epoch.

    Args:
        config: mapping; only ``config['total_epochs']`` is read here.
        model: module whose forward returns ``(reconstruction, latent_code)``.
        optimizer: optimizer stepped once per training batch.
        criterion: callable ``criterion(reconstruction, batch)`` returning a
            scalar loss.
        train_loader: iterable of image batches used for optimization.
        val_loader: iterable of image batches used for validation; its first
            batch is reused as the fixed preview batch.
        scheduler: learning-rate scheduler stepped once per epoch.

    Returns:
        dict with per-epoch lists under ``'epoch'``, ``'train_loss'`` and
        ``'val_loss'`` (losses are averages of the per-batch values).
    """
    # Run on the device the model lives on instead of relying on a global.
    device = next(model.parameters()).device
    # Fixed validation batch used for the per-epoch preview.
    target = next(iter(val_loader))
    # Do not index past the end of a preview batch smaller than 6.
    n_preview = min(6, len(target))
    log = {"epoch": [], "train_loss": [],  "val_loss": []}

    for epoch in range(config['total_epochs']):
        log['epoch'].append(epoch)

        # train
        model.train()
        avg_train_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            reconstruction, _ = model(batch)
            loss = criterion(reconstruction, batch)
            loss.backward()
            optimizer.step()
            avg_train_loss += loss.item() / len(train_loader)
        scheduler.step()
        log['train_loss'].append(avg_train_loss)

        # val
        model.eval()
        avg_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                reconstruction, _ = model(batch)
                loss = criterion(reconstruction, batch)
                avg_val_loss += loss.item() / len(val_loader)
            # The preview is inference only, so it also runs without autograd.
            predict, _ = model(target.to(device))
        log['val_loss'].append(avg_val_loss)

        # Visualize tools: originals on the top row, reconstructions below.
        clear_output(wait=True)
        for k in range(n_preview):
            plt.subplot(2, 6, k+1)
            plt.imshow(target[k].cpu().permute(1,2,0))
            plt.title('Target')
            plt.axis('off')

            plt.subplot(2, 6, k+7)
            plt.imshow(predict[k].cpu().permute(1,2,0))
            plt.title('Predict')
            plt.axis('off')
        plt.suptitle('%d / %d - train_loss: %f, val_loss: %f' %
                    (epoch+1, config['total_epochs'], log['train_loss'][-1], log['val_loss'][-1]))
        plt.show()

        plt.plot(log['epoch'], log['train_loss'], label='train')
        plt.plot(log['epoch'], log['val_loss'], label='val')
        plt.legend()
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.title('Loss')
        plt.show()
    return log

In [None]:
# Train the autoencoder; `log` receives the per-epoch loss history.
log = train(config,
            autoencoder, optimizer, criterion,
            train_loader, val_loader, scheduler)

Сохраняем модельку и логи обучения

In [None]:
# Save the autoencoder's state_dict and the training log into `save_path`.
torch.save(autoencoder.state_dict(), f'{save_path}/autoencoder_model.pt')
torch.save(log, f'{save_path}/autoencoder_log.pt')

Посмотрим, как автоэнкодер кодирует и восстанавливает картинки

In [None]:
# Plot samples 6..11 of the first validation batch: originals on the top row,
# reconstructions on the bottom row.
target = next(iter(val_loader))
autoencoder.eval()
with torch.no_grad():
    predict, _ = autoencoder(target.to(device))

    for k in range(6):
        for row, (title, images) in enumerate((('Target', target), ('Predict', predict))):
            plt.subplot(2, 6, row * 6 + k + 1)
            plt.imshow(images[k + 6].detach().cpu().permute(1, 2, 0))
            plt.title(title)
            plt.axis('off')
    plt.show()

### Сэмплинг

In [None]:
# Encode the whole training set and estimate per-dimension latent statistics.
autoencoder.eval()
latent_batches = []
with torch.no_grad():
    for _data in train_loader:
        latent_batches.append(autoencoder.get_latent(_data.to(device)).to('cpu'))
# Concatenate once instead of re-allocating a growing tensor on every batch.
real_latents = torch.cat(latent_batches)

mu = torch.mean(real_latents, dim=0).to(device)
sigma = torch.std(real_latents, dim=0).to(device)

In [None]:
# Latent codes of the current `target` batch, used as the "real" reference.
with torch.no_grad():
    output = autoencoder.get_latent(target.to(device))
# Draw 25 latent vectors: standard normal noise scaled by half of the
# per-dimension sigma and shifted by mu.
z = 0.5 * sigma * torch.randn(25, config['dim_code']).to(device) + mu
# take the first vector
fig, ax = plt.subplots(2)
ax[0].hist(output[0].detach().cpu())
ax[0].set_title('Real')
ax[1].hist(z[0].detach().cpu())
ax[1].set_title('Generated')
fig.tight_layout()
plt.show()

Теперь пропустим через модель

In [None]:
# Decode the sampled latent vectors. This is inference only, so no autograd
# graph is built.
with torch.no_grad():
    reconstruction = autoencoder.get_reconstruction(z.to(device))

plt.figure(figsize=(10, 10))
for i in range(reconstruction.shape[0]):
    plt.subplot(5, 5, i + 1)
    plt.imshow(reconstruction[i].cpu().permute(1,2,0))
    plt.axis('off')

plt.show()

## [Вариационный автоэнкодер](https://proglib.io/p/variacionnye-avtoenkodery-vae-dlya-chaynikov-poshagovoe-rukovodstvo-2021-07-05)

Инициализация конфига

In [None]:
# Hyperparameters and paths for the variational autoencoder experiment.
config = dict(
    batch_size=32,
    total_epochs=40,
    learning_rate=1e-4,
    save_path='./weights',  # directory where the models are saved
    seed=42,
    features=16,  # length of the sampled latent vector
)

### Подготовка данных

В этот раз используем набор рукописных цифр [MNIST](https://blog.skillfactory.ru/glossary/mnist-dataset/)

In [None]:
# NOTE(review): `batch_size` is not used in the visible code; the loaders
# below read config['batch_size'] instead.
batch_size = 32
# MNIST Dataset
# Only the train split requests a download; the validation split is created with download=False.
train_dataset = datasets.MNIST(root='./mnist_data/', train=True, transform=transforms.ToTensor(), download=True)
val_dataset = datasets.MNIST(root='./mnist_data/', train=False, transform=transforms.ToTensor(), download=False)

# Data Loader (Input Pipeline)
# Training batches are shuffled; validation batches keep a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=config['batch_size'], shuffle=False)

### Архитектура модели

In [None]:
class VAE(nn.Module):
    """Convolutional variational autoencoder for 28x28 images.

    The encoder maps an image to two vectors of length ``features``: ``mu``
    and ``logsigma``. ``logsigma`` is consumed as a log-variance
    (``std = exp(0.5 * logsigma)``). The decoder maps a latent vector back to
    an image and ends with a sigmoid.

    In training mode the latent vector is sampled with the
    reparameterization trick; in eval mode it is ``mu``, so inference is
    deterministic.
    """

    def __init__(self, features, in_channels=1, out_channels=32):
        super().__init__()
        # the encoder must map an image to 2 variables -- mu and logsigma

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.features = features

        # encoder: five stride-2 convolutions, 28 -> 14 -> 7 -> 4 -> 2 -> 1
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channels=self.in_channels, out_channels=self.out_channels, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(num_features=self.out_channels),
            nn.ReLU(),
            nn.Conv2d(in_channels=self.out_channels, out_channels=self.out_channels * 2, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(num_features=self.out_channels * 2),
            nn.ReLU(),
            nn.Conv2d(in_channels=self.out_channels * 2, out_channels=self.out_channels * 4, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(num_features=self.out_channels * 4),
            nn.ReLU(),
            nn.Conv2d(in_channels=self.out_channels * 4, out_channels=self.out_channels * 8, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(num_features=self.out_channels * 8),
            nn.ReLU(),
            nn.Conv2d(in_channels=self.out_channels * 8, out_channels=self.out_channels * 16, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(num_features=self.out_channels * 16),
            nn.ReLU()
        )

        # Two linear heads on the flattened encoder output.
        self.mu = nn.Linear(self.out_channels * 16, self.features)
        self.logsigma = nn.Linear(self.out_channels * 16, self.features)

        # decoder
        # the decoder input is a separate layer because the latent vector
        # has to be brought back to the encoder's output width
        self.decoder_input = nn.Linear(self.features, self.out_channels * 16)

        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(in_channels=self.out_channels * 16, out_channels=self.out_channels * 8, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(num_features=self.out_channels * 8),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=self.out_channels * 8, out_channels=self.out_channels * 4, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(num_features=self.out_channels * 4),
            nn.ReLU(),
            # output_padding is 0 here because the input size is 28*28;
            # otherwise the output would be 32*32
            nn.ConvTranspose2d(in_channels=self.out_channels * 4, out_channels=self.out_channels * 2, kernel_size=3, stride=2, padding=1, output_padding=0),
            nn.BatchNorm2d(num_features=self.out_channels * 2),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=self.out_channels * 2, out_channels=self.out_channels, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.BatchNorm2d(num_features=self.out_channels),
            nn.ReLU(),
            nn.ConvTranspose2d(in_channels=self.out_channels, out_channels=self.in_channels, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )

    def reparameterize(self, mu, log_var):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, 1)``.

        :param mu: mean from the encoder's latent space
        :param log_var: log variance from the encoder's latent space
        """
        std = torch.exp(0.5 * log_var) # standard deviation
        eps = torch.randn_like(std) # `randn_like` as we need the same size
        sample = mu + (eps * std) # sampling as if coming from the input space
        return sample

    def encode(self, x):
        """Return ``(mu, logsigma)`` for the image batch ``x``."""
        x = self.encoder(x)
        x = torch.flatten(x, start_dim=1)

        mu = self.mu(x)
        logsigma = self.logsigma(x)

        return mu, logsigma

    def gaussian_sampler(self, mu, logsigma):
        """Return a sampled latent vector in training mode and ``mu`` in eval mode."""
        if self.training:
            # sample the latent vector from a normal distribution with parameters mu and sigma
            return self.reparameterize(mu, logsigma)
        else:
            # at inference return the central vector mu rather than a random sample:
            # the autoencoder output must be deterministic at inference.
            return mu

    def decode(self, z):
        """Decode the latent batch ``z`` into images."""
        z = self.decoder_input(z)
        # Reshape to a 1x1 feature map with out_channels * 16 channels.
        z = z.view(-1, self.out_channels * 16, 1, 1)
        reconstruction = self.decoder(z)
        return reconstruction

    def forward(self, x):
        """Return ``(mu, logsigma, reconstruction)`` for the image batch ``x``."""
        mu, logsigma = self.encode(x)
        # Go through gaussian_sampler (not reparameterize directly) so that
        # eval-mode passes are deterministic.
        z = self.gaussian_sampler(mu, logsigma)
        reconstruction = self.decode(z)
        return mu, logsigma, reconstruction

    def get_latent(self, x):
        """Return the latent vector for ``x``: sampled in training mode, ``mu`` in eval mode."""
        mu, logsigma = self.encode(x)
        return self.gaussian_sampler(mu, logsigma)

    def get_reconstruction(self, x):
        """Decode the latent batch ``x`` into images."""
        return self.decode(x)

Определяем лосс

In [None]:
def KL_divergence(mu, logsigma):
    """
    KL term of the VAE loss: pulls the latent distributions of different
    inputs towards a standard normal. `logsigma` is used as a log-variance;
    the result is summed over all elements.
    """
    per_element = 1 + logsigma - mu.pow(2) - logsigma.exp()
    return -0.5 * per_element.sum()

def log_likelihood(x, reconstruction):
    """
    Reconstruction term of the VAE loss (the counterpart of MSE in the plain
    autoencoder): binary cross-entropy between `reconstruction` and `x`,
    summed over all elements.
    """
    return F.binary_cross_entropy(reconstruction, x, reduction='sum')

def loss_vae(x, mu, logsigma, reconstruction):
    """Total VAE loss: the KL term plus the reconstruction term."""
    kl_term = KL_divergence(mu, logsigma)
    reconstruction_term = log_likelihood(x, reconstruction)
    return kl_term + reconstruction_term

### Обучение

In [None]:
# The VAE is trained with the combined KL + reconstruction loss.
criterion = loss_vae
vae = VAE(features=config['features']).to(device)

# NOTE(review): `lr` is not used in the visible code; the optimizer takes its
# learning rate from config['learning_rate'].
lr = 1e-4
optimizer = torch.optim.AdamW(vae.parameters(), lr=config['learning_rate'])
# Scale the learning rate by 0.1 after every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.1)

In [None]:
def train(model, optimizer, criterion, epochs, train_loader, val_loader, scheduler):
    """Train a VAE, plotting reconstructions and loss curves each epoch.

    Args:
        model: module whose forward returns ``(mu, logsigma, reconstruction)``.
        optimizer: optimizer stepped once per training batch.
        criterion: callable ``criterion(batch, mu, logsigma, reconstruction)``
            returning a scalar loss.
        epochs: number of epochs to run.
        train_loader: iterable of batches whose first element holds the
            images; used for optimization.
        val_loader: same batch layout, used for validation; its first batch
            is reused as the fixed preview batch.
        scheduler: learning-rate scheduler stepped once per epoch.

    Returns:
        dict with per-epoch lists under ``'epoch'``, ``'train_loss'`` and
        ``'val_loss'`` (losses are averages of the per-batch values).
    """
    # Run on the device the model lives on instead of relying on a global.
    device = next(model.parameters()).device
    # Fixed validation batch used for the per-epoch preview.
    target = next(iter(val_loader))
    # Do not index past the end of a preview batch smaller than 6.
    n_preview = min(6, len(target[0]))
    log = {"epoch": [], "train_loss": [],  "val_loss": []}

    for epoch in range(epochs):
        log['epoch'].append(epoch)

        # train
        model.train()
        avg_train_loss = 0
        for batch in train_loader:
            images = batch[0].to(device)
            optimizer.zero_grad()
            mu, logsigma, reconstruction = model(images)
            loss = criterion(images, mu, logsigma, reconstruction)
            loss.backward()
            optimizer.step()
            avg_train_loss += loss.item() / len(train_loader)
        scheduler.step()
        log['train_loss'].append(avg_train_loss)

        # val
        model.eval()
        avg_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                images = batch[0].to(device)
                mu, logsigma, reconstruction = model(images)
                loss = criterion(images, mu, logsigma, reconstruction)
                avg_val_loss += loss.item() / len(val_loader)
            # The preview is inference only, so it also runs without autograd.
            _, _, predict = model(target[0].to(device))
        log['val_loss'].append(avg_val_loss)

        # Visualize tools: originals on the top row, reconstructions below.
        clear_output(wait=True)
        for k in range(n_preview):
            plt.subplot(2, 6, k+1)
            plt.imshow(target[0][k].cpu().permute(1,2,0))
            plt.title('Target')
            plt.axis('off')

            plt.subplot(2, 6, k+7)
            plt.imshow(predict[k].cpu().permute(1,2,0))
            plt.title('Predict')
            plt.axis('off')
        plt.suptitle('%d / %d - train_loss: %f, val_loss: %f' %
                    (epoch+1, epochs, log['train_loss'][-1], log['val_loss'][-1]))
        plt.show()

        plt.plot(log['epoch'], log['train_loss'], label='train')
        plt.plot(log['epoch'], log['val_loss'], label='val')
        plt.legend()
        plt.xlabel('epoch')
        plt.ylabel('loss')
        plt.title('Loss')
        plt.show()
    return log

In [None]:
# Train the VAE; `log` receives the per-epoch loss history.
log = train(vae, optimizer, criterion, config['total_epochs'],
            train_loader, val_loader, scheduler)

Сохраняем модельку и лог обучения

In [None]:
# Save the trained VAE weights (previously the plain autoencoder's weights
# were written under the VAE file name) together with the training log.
torch.save(vae.state_dict(), f'{save_path}/vae_model.pt')
torch.save(log, f'{save_path}/vae_log.pt')

Посмотрим, как VAE кодирует и восстанавливает картинки

In [None]:
# Plot samples 6..11 of the first validation batch: originals on the top row,
# VAE reconstructions on the bottom row.
target = next(iter(val_loader))
vae.eval()
with torch.no_grad():
    _, _, predict = vae(target[0].to(device))

    for k in range(6):
        for row, (title, images) in enumerate((('Target', target[0]), ('Predict', predict))):
            plt.subplot(2, 6, row * 6 + k + 1)
            plt.imshow(images[k + 6].detach().cpu().permute(1, 2, 0))
            plt.title(title)
            plt.axis('off')
    plt.show()

### Сэмплинг

In [None]:
# Draw 25 latent vectors from a standard normal distribution, decode them
# and show the results on a 5x5 grid.
z = np.array([np.random.normal(0, 1, config['features']) for _ in range(25)])
with torch.no_grad():
    output = vae.get_reconstruction(torch.FloatTensor(z).to(device))
fig, ax = plt.subplots(5, 5, figsize=(6, 6))
# ax.flat walks the grid row by row, matching the 5*i+j sample index.
for axes, image in zip(ax.flat, output):
    axes.imshow(image.cpu().squeeze(0))
    axes.axis('off')
plt.tight_layout()
plt.show()