**Использование псевдоразметки. Семинар.**

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import random
import numpy as np

In [2]:
torch.manual_seed(123)
torch.cuda.manual_seed(123)
np.random.seed(123)
random.seed(123)
torch.backends.cudnn.deterministic = True

Начнем с загрузки датасета. Речевые данные (и модели, обучаемые на них) очень тяжелые, поэтому мы обойдемся чем-нибудь попроще.

In [3]:
train_dataset = \
    datasets.MNIST('./data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
test_dataset = \
    datasets.MNIST('./data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [4]:
len(train_dataset), len(test_dataset)

(60000, 10000)

Итак, трейн состоит из 60000 картинок цифр. Для того, чтобы получше увидеть эффект от псевдолейблов, мы оставим только 100 этих картинок в качестве размеченных данных. Остальные 59900 будут в качестве неразмеченных. 

На масштабах 100 записей могут проявиться неприятные эффекты, если какие-то из классов не будут достаточно хорошо представлены. Чтобы этого избежать, будем аккуратно семплировать. Самый простой вариант - просто случайно разделять, пока не получится удачное разбиение.

Для начала определим удачность разбиения. Будем считать размеченный датасет хорошим, если из 100 примеров в нем есть хотя бы по 8 представителей каждого класса. Напишите функцию, которая делает такую проверку.

In [12]:
test_dataset.__getitem__(2)[1]

1

In [13]:
...

def check_dataset(dataset):
    l = dataset.__len__()
    total_classes = set()
    for i in range(100):
      rand_id = random.randint(0, l)
      total_classes.add(dataset.__getitem__(rand_id)[1])
    if len(total_classes) > 7:
      return True
    else:
      return False

In [18]:
sampling_iteration = 0
while True:
    labeled_train_dataset, unlabeled_train_dataset = torch.utils.data.random_split(train_dataset, [100, 59900])
    if check_dataset(labeled_train_dataset):
        break
    sampling_iteration += 1
print(f'Split the dataset after {sampling_iteration} resamplings')

Split the dataset after 0 resamplings


In [19]:
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=64, shuffle=False)
labeled_train_loader = torch.utils.data.DataLoader(
    labeled_train_dataset, batch_size=64, shuffle=True)
unlabeled_train_loader = torch.utils.data.DataLoader(
    unlabeled_train_dataset, batch_size=64, shuffle=False)

Теперь, когда мы получили данные, определим архитектуру сети. Возьмем простую сверточную сетку с droupout'ом.

In [20]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, kernel_size=5)
        self.conv2 = nn.Conv2d(20, 40, kernel_size=5)
        self.dropout = nn.Dropout2d(p=0.5)
        self.fc1 = nn.Linear(640, 150)
        self.fc2 = nn.Linear(150, 10)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 1, 28, 28)
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.dropout(self.conv2(x)), 2))
        x = x.view(-1, 640)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.log_softmax(x)
        return x

Опишем вспомогательные функции.

In [21]:
def train(epoch_idx, model, optimizer, train_loader, loss_func=F.nll_loss):
    model.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        x, target = x.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(x)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()

In [22]:
def test(epoch_idx, model, test_loader):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.cuda(), target.cuda()
            output = model(x)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('Epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        epoch_idx, test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [23]:
def predict(model, loader):
    model.eval()
    result = []
    with torch.no_grad():
        for x, _ in loader:
            result.append(model(x.cuda()))
    return torch.cat(result)

Создадим модель и обучим ее на нашем размеченном датасете.

In [24]:
model = Net().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [25]:
for i in range(400):
    train(i, model, optimizer, labeled_train_loader)
    if i % 10 == 0:
        test(i, model, test_loader)



Epoch 0: Average loss: 2.2950, Accuracy: 1253/10000 (13%)
Epoch 10: Average loss: 1.8219, Accuracy: 4334/10000 (43%)
Epoch 20: Average loss: 0.9842, Accuracy: 7412/10000 (74%)
Epoch 30: Average loss: 0.7605, Accuracy: 7487/10000 (75%)
Epoch 40: Average loss: 0.5439, Accuracy: 8425/10000 (84%)
Epoch 50: Average loss: 0.5683, Accuracy: 8399/10000 (84%)
Epoch 60: Average loss: 0.4262, Accuracy: 8818/10000 (88%)
Epoch 70: Average loss: 0.4740, Accuracy: 8735/10000 (87%)
Epoch 80: Average loss: 0.4628, Accuracy: 8794/10000 (88%)
Epoch 90: Average loss: 0.4893, Accuracy: 8789/10000 (88%)
Epoch 100: Average loss: 0.4700, Accuracy: 8871/10000 (89%)
Epoch 110: Average loss: 0.4666, Accuracy: 8882/10000 (89%)
Epoch 120: Average loss: 0.4790, Accuracy: 8878/10000 (89%)
Epoch 130: Average loss: 0.5363, Accuracy: 8788/10000 (88%)
Epoch 140: Average loss: 0.5816, Accuracy: 8787/10000 (88%)
Epoch 150: Average loss: 0.5435, Accuracy: 8812/10000 (88%)
Epoch 160: Average loss: 0.5552, Accuracy: 8779/100

Теперь попробуем побить этот результат с помощью псевдолейблов. Напишем функцию, которая принимает модель и возращает DataLoader с хард-лейблами, и запустим обучение.

In [59]:
def get_pseudo_loader(model):
    dataset = list(unlabeled_train_dataset)
    soft_labels = predict(model, dataset)
    hard_labels = torch.argmax(soft_labels, 1)
    for idx, i in enumerate(dataset):
      # print(i[1], hard_labels[idx])
      dataset[idx] = (i[0], hard_labels[idx])
    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True)

In [60]:
model_hard = Net().cuda()
model_hard.load_state_dict(model.state_dict())
optimizer_hard = torch.optim.SGD(model_hard.parameters(), lr=0.1)

In [61]:
hard_labeled_loader = get_pseudo_loader(model)
for i in range(10):
    train(i, model_hard, optimizer_hard, hard_labeled_loader)
    train(i, model_hard, optimizer_hard, labeled_train_loader)
    test(i, model_hard, test_loader)



Epoch 0: Average loss: 0.3560, Accuracy: 9014/10000 (90%)
Epoch 1: Average loss: 0.4231, Accuracy: 8994/10000 (90%)
Epoch 2: Average loss: 0.3778, Accuracy: 9105/10000 (91%)
Epoch 3: Average loss: 0.4300, Accuracy: 9056/10000 (91%)
Epoch 4: Average loss: 0.4270, Accuracy: 9007/10000 (90%)
Epoch 5: Average loss: 0.4094, Accuracy: 9032/10000 (90%)
Epoch 6: Average loss: 0.3992, Accuracy: 9053/10000 (91%)
Epoch 7: Average loss: 0.4301, Accuracy: 9017/10000 (90%)
Epoch 8: Average loss: 0.4489, Accuracy: 9050/10000 (90%)
Epoch 9: Average loss: 0.4518, Accuracy: 9077/10000 (91%)


**Итеративная псевдоразметка.**

Мы уже видим небольшое улучшение, но можно пойти дальше.

In [62]:
model_hard_iter = Net().cuda()
model_hard_iter.load_state_dict(model.state_dict())
optimizer_hard_iter = torch.optim.SGD(model_hard_iter.parameters(), lr=0.1)

In [63]:
for i in range(20):
    hard_labeled_loader = get_pseudo_loader(model_hard_iter)
    train(i, model_hard_iter, optimizer_hard_iter, hard_labeled_loader)
    train(i, model_hard_iter, optimizer_hard_iter, labeled_train_loader)
    test(i, model_hard_iter, test_loader)



Epoch 0: Average loss: 0.3581, Accuracy: 9062/10000 (91%)
Epoch 1: Average loss: 0.3643, Accuracy: 9105/10000 (91%)
Epoch 2: Average loss: 0.3638, Accuracy: 9127/10000 (91%)
Epoch 3: Average loss: 0.3718, Accuracy: 9217/10000 (92%)
Epoch 4: Average loss: 0.3325, Accuracy: 9247/10000 (92%)
Epoch 5: Average loss: 0.3360, Accuracy: 9296/10000 (93%)
Epoch 6: Average loss: 0.3014, Accuracy: 9322/10000 (93%)
Epoch 7: Average loss: 0.2760, Accuracy: 9367/10000 (94%)
Epoch 8: Average loss: 0.2848, Accuracy: 9373/10000 (94%)
Epoch 9: Average loss: 0.3037, Accuracy: 9376/10000 (94%)
Epoch 10: Average loss: 0.3252, Accuracy: 9364/10000 (94%)
Epoch 11: Average loss: 0.3257, Accuracy: 9374/10000 (94%)
Epoch 12: Average loss: 0.2874, Accuracy: 9427/10000 (94%)
Epoch 13: Average loss: 0.2856, Accuracy: 9434/10000 (94%)
Epoch 14: Average loss: 0.2901, Accuracy: 9423/10000 (94%)
Epoch 15: Average loss: 0.2860, Accuracy: 9437/10000 (94%)
Epoch 16: Average loss: 0.2794, Accuracy: 9466/10000 (95%)
Epoch 1

**Домашнее задание.**

Модифицировать функцию `get_pseudo_loader`, чтобы она могла возвращать софт-лейблы (2 балла).

Правильно запустить обучение - в качестве лосса используем KL-дивергенцию. Получить accuracy 90% или выше. (+5 баллов).

Интуитивно кажется, что модель не должна ничему учиться, т.к. ее выход будет полностью совпадать с софт-лейблами. Напишите (текстом), почему тем не менее удается сильно выиграть относительно бейзлайна. (+3 балла).

In [109]:
model_soft_iter = Net().cuda()
model_soft_iter.load_state_dict(model.state_dict())
optimizer_soft_iter = torch.optim.SGD(model_soft_iter.parameters(), lr=0.1)
KL_loss = torch.nn.KLDivLoss(log_target=True, reduction='batchmean')

In [102]:
def get_pseudo_loader(model, soft=False):
    dataset = list(unlabeled_train_dataset)
    soft_labels = predict(model, dataset)
    if soft == True:
      for idx, i in enumerate(dataset):
        # print(i[1], hard_labels[idx])
        dataset[idx] = (i[0], soft_labels[idx])
    else:
      hard_labels = torch.argmax(soft_labels, 1)
      for idx, i in enumerate(dataset):
        # print(i[1], hard_labels[idx])
        dataset[idx] = (i[0], hard_labels[idx])
    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True)

In [103]:
def test(epoch_idx, model, test_loader):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.cuda(), target.cuda()
            output = model(x)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            # print(pred)
            correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('Epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        epoch_idx, test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [105]:
def train(epoch_idx, model, optimizer, train_loader, loss_func=F.nll_loss):
    model.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        x, target = x.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(x)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()

In [96]:
first = torch.Tensor([-3.2228e+01, -2.4380e+01, -2.5240e+01, -2.0968e+01, -3.2228e+01,-3.2228e+01, -3.2228e+01,  0.0000e+00, -3.1462e+01, -3.2108e+01])
second = torch.Tensor([-3.8978e+01, -3.6266e+01, -3.6124e+01, -3.1969e+01, -3.8322e+01, -3.8978e+01, -3.8978e+01,  0.0000e+00, -3.7471e+01, -3.3333e+01])

In [110]:
KL_loss(first, second)

tensor(-1.5284e-14)

In [111]:
for i in range(20):
    soft_labeled_loader = get_pseudo_loader(model_soft_iter, soft=True)
    train(i, model_soft_iter, optimizer_soft_iter, soft_labeled_loader, loss_func=torch.nn.KLDivLoss(log_target=True, reduction='batchmean'))
    train(i, model_soft_iter, optimizer_soft_iter, labeled_train_loader)
    test(i, model_soft_iter, test_loader)




Epoch 0: Average loss: 0.3404, Accuracy: 9095/10000 (91%)
Epoch 1: Average loss: 0.2865, Accuracy: 9202/10000 (92%)
Epoch 2: Average loss: 0.2748, Accuracy: 9226/10000 (92%)
Epoch 3: Average loss: 0.2846, Accuracy: 9194/10000 (92%)
Epoch 4: Average loss: 0.2725, Accuracy: 9221/10000 (92%)
Epoch 5: Average loss: 0.2653, Accuracy: 9237/10000 (92%)
Epoch 6: Average loss: 0.2595, Accuracy: 9275/10000 (93%)
Epoch 7: Average loss: 0.2522, Accuracy: 9290/10000 (93%)
Epoch 8: Average loss: 0.2471, Accuracy: 9284/10000 (93%)
Epoch 9: Average loss: 0.2415, Accuracy: 9310/10000 (93%)
Epoch 10: Average loss: 0.2400, Accuracy: 9316/10000 (93%)
Epoch 11: Average loss: 0.2422, Accuracy: 9301/10000 (93%)
Epoch 12: Average loss: 0.2397, Accuracy: 9313/10000 (93%)
Epoch 13: Average loss: 0.2305, Accuracy: 9335/10000 (93%)
Epoch 14: Average loss: 0.2330, Accuracy: 9312/10000 (93%)
Epoch 15: Average loss: 0.2357, Accuracy: 9328/10000 (93%)
Epoch 16: Average loss: 0.2351, Accuracy: 9319/10000 (93%)
Epoch 1