# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

# Cap the native thread pools (MKL, numexpr, OpenMP) at two threads each.
# These are set before numpy/torch are imported below.
os.environ["MKL_NUM_THREADS"] = "2"
os.environ["NUMEXPR_NUM_THREADS"] = "2"
os.environ["OMP_NUM_THREADS"] = "2"
import numpy as np
import tqdm
import time

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.nn.functional as F
from torch.optim.lr_scheduler import LambdaLR
from advertorch.attacks import GradientSignAttack
from torch.utils.tensorboard import SummaryWriter

In [3]:
import sys
sys.path.append("../src/")

from datasetManager import DatasetManager
from generators import Generator, CoTrainingGenerator
from samplers import CoTrainingSampler
import signal_augmentations as sa 

# Utils

## Metrics

In [4]:
class Metrics:
    """Base class for metrics that keep a running sum over successive calls.

    Attributes:
        value: result of the most recent update (starts at 0).
        accumulate_value: sum of the per-call values since the last reset.
        count: number of calls since the last reset.
        epsilon: small constant stored on the instance; not used by this class.
    """

    def __init__(self, epsilon=1e-10):
        self.value = 0
        self.epsilon = epsilon
        # The initial state is the same as the state after an explicit reset().
        self.reset()

    def reset(self):
        """Zero the accumulator and the call counter; ``value`` is left untouched."""
        self.accumulate_value, self.count = 0, 0

    def __call__(self):
        """Register one more update. Subclasses call this before computing their value."""
        self.count = self.count + 1

        
class BinaryAccuracy(Metrics):
    """Running accuracy for binary / multi-label predictions thresholded at 0.5."""

    def __init__(self, epsilon=1e-10):
        super().__init__(epsilon)

    def __call__(self, y_pred, y_true):
        """Update the metric with one batch and return the running mean accuracy.

        Args:
            y_pred: tensor of scores; entries strictly above 0.5 count as positive.
            y_true: tensor of 0/1 targets that can be compared element-wise with ``y_pred``.

        Returns:
            The mean of the per-batch accuracies seen since the last ``reset()``.
        """
        super().__call__()

        with torch.set_grad_enabled(False):
            y_pred = (y_pred > 0.5).float()
            correct = (y_pred == y_true).float().sum()
            # numel() counts every target element, so targets of any rank are
            # handled (shape[0] * shape[1] failed on 1-D targets and
            # undercounted targets with more than two dimensions).
            self.value = correct / y_true.numel()

            self.accumulate_value += self.value
            return self.accumulate_value / self.count
        
        
class CategoricalAccuracy(Metrics):
    """Running accuracy for class-index predictions."""

    def __init__(self, epsilon=1e-10):
        super().__init__(epsilon)

    def __call__(self, y_pred, y_true):
        """Update with one batch and return the running mean accuracy.

        Args:
            y_pred: tensor of predicted values, compared element-wise with ``y_true``.
            y_true: tensor of target values.

        Returns:
            The mean of the per-batch accuracies seen since the last ``reset()``.
        """
        super().__call__()

        # Metric bookkeeping never needs gradients.
        with torch.no_grad():
            hits = (y_true == y_pred).float()
            self.value = hits.mean()
            self.accumulate_value += self.value

            return self.accumulate_value / self.count

        
class Ratio(Metrics):
    """Running percentage of predictions that differ between two prediction sequences."""

    def __init__(self, epsilon=1e-10):
        super().__init__(epsilon)

    def __call__(self, y_pred, y_adv_pred):
        """Update with one batch and return the running mean mismatch percentage.

        Args:
            y_pred: iterable of predictions.
            y_adv_pred: iterable of predictions compared pair-wise with ``y_pred``.

        Returns:
            The mean of the per-batch mismatch percentages since the last ``reset()``.
        """
        super().__call__()

        pairs = list(zip(y_pred, y_adv_pred))
        mismatches = sum(int(clean != adv) for clean, adv in pairs)
        # Expressed as a percentage of the compared pairs.
        self.value = mismatches / len(pairs) * 100
        self.accumulate_value += self.value

        return self.accumulate_value / self.count

In [5]:
import datetime


def get_datetime():
    """Return the current local time formatted as ``YYYY-MM-DD_HH:MM:SS``.

    ``strftime`` is used instead of slicing ``str(now)``: the string form of a
    datetime omits the fractional part when the microsecond field is 0, so a
    fixed ``[11:-7]`` slice would then cut into the time itself.
    """
    return datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")

# Initialization

## Set seeds

In [6]:
def reset_seed(seed=42):
    """Seed every random number generator this notebook may rely on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), so that repeated runs draw the same numbers.

    Args:
        seed: integer seed shared by all generators.
    """
    import random  # local import: only needed here

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Explicit for CUDA; this call is harmless when no GPU is present.
    torch.cuda.manual_seed_all(seed)


reset_seed()

## Prepare GPU

In [7]:
# Pick the compute device: the first CUDA GPU when one is available, else the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
# cudnn.benchmark = True

# Model definition

## CNN
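Architecture reference: Salamon & Bello, "Deep Convolutional Neural Networks and Data Augmentation for Environmental Sound Classification" (https://arxiv.org/pdf/1608.04363.pdf)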

In [8]:
class ConvPoolReLU(nn.Sequential):
    """Convolution block: Conv2d -> MaxPool2d(4x2) -> BatchNorm2d -> ReLU6.

    Args:
        in_size: number of input channels.
        out_size: number of output channels.
        kernel_size, stride, padding: forwarded to the convolution.
    """

    def __init__(self, in_size, out_size, kernel_size, stride, padding):
        conv = nn.Conv2d(
            in_size,
            out_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        # Pooling window and stride are both (4, 2): non-overlapping windows.
        pool = nn.MaxPool2d(kernel_size=(4, 2), stride=(4, 2))
        norm = nn.BatchNorm2d(out_size)
        activation = nn.ReLU6(inplace=True)

        super().__init__(conv, pool, norm, activation)
        
class ConvReLU(nn.Sequential):
    """Convolution block without pooling or normalisation: Conv2d -> ReLU6.

    Args:
        in_size: number of input channels.
        out_size: number of output channels.
        kernel_size, stride, padding: forwarded to the convolution.
    """

    def __init__(self, in_size, out_size, kernel_size, stride, padding):
        conv = nn.Conv2d(
            in_size,
            out_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
        )
        super().__init__(conv, nn.ReLU6(inplace=True))

In [9]:
class cnn(nn.Module):
    """Small convolutional classifier producing 10 logits per sample.

    The feature extractor stacks three ``ConvPoolReLU`` blocks and one
    ``ConvReLU`` block; the classifier flattens the result, applies dropout
    and a single linear layer.
    """

    def __init__(self):
        super().__init__()

        conv_blocks = [
            ConvPoolReLU(1, 24, 3, 1, 1),
            ConvPoolReLU(24, 48, 3, 1, 1),
            ConvPoolReLU(48, 48, 3, 1, 1),
            ConvReLU(48, 48, 3, 1, 1),
        ]
        self.features = nn.Sequential(*conv_blocks)

        # The linear layer expects 1008 flattened features, which ties the
        # network to one input size. NOTE(review): confirm the expected shape.
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(1008, 10),
        )

    def forward(self, x):
        """Return the class logits for a batch ``x`` given without a channel axis."""
        # Insert a singleton channel axis right after the batch axis.
        trailing_dims = x.shape[1:]
        x = x.view(-1, 1, *trailing_dims)

        return self.classifier(self.features(x))

# ======== Co-Training ========

## Define loss functions

In [10]:
def Lsup(logit_S1, logit_S2, labels_S1, labels_S2):
    """Supervised loss: sum of the two views' mean cross-entropies.

    Args:
        logit_S1, logit_S2: logits of the first and second network.
        labels_S1, labels_S2: class-index targets matching each set of logits.

    Returns:
        Scalar tensor ``CE(logit_S1, labels_S1) + CE(logit_S2, labels_S2)``.
    """
    loss_view1 = F.cross_entropy(logit_S1, labels_S1)
    loss_view2 = F.cross_entropy(logit_S2, labels_S2)
    return loss_view1 + loss_view2

def Lcot(U_p1, U_p2):
    """Co-training loss: Jensen-Shannon divergence between the two predictions.

    Computes ``H(0.5 * (p1 + p2)) - 0.5 * (H(p1) + H(p2))`` where ``p1`` and
    ``p2`` are the softmax of the given logits along ``dim=1``. The result is
    summed over all entries and divided by the batch size.

    Args:
        U_p1: logits of the first network, batch on dim 0 and classes on dim 1.
        U_p2: logits of the second network, same shape as ``U_p1``.

    Returns:
        Scalar tensor with the batch-averaged divergence.
    """
    U_batch_size = U_p1.size()[0]

    p1 = F.softmax(U_p1, dim=1)
    p2 = F.softmax(U_p2, dim=1)
    mixture = 0.5 * (p1 + p2)

    # A softmax output can underflow to exactly 0, and 0 * log(0) is NaN.
    # Clamping inside the log keeps the term at its limit value 0.
    tiny = torch.finfo(mixture.dtype).tiny
    entropy_mixture = -torch.sum(mixture * torch.log(mixture.clamp(min=tiny)))

    # log_softmax is finite even where the probability underflows.
    entropy_p1 = -torch.sum(p1 * F.log_softmax(U_p1, dim=1))
    entropy_p2 = -torch.sum(p2 * F.log_softmax(U_p2, dim=1))

    return (entropy_mixture - 0.5 * (entropy_p1 + entropy_p2)) / U_batch_size


def Ldiff(logit_S1, logit_S2, perturbed_logit_S1, perturbed_logit_S2, logit_U1, logit_U2, perturbed_logit_U1, perturbed_logit_U2):
    """View-difference loss built from cross terms between clean and perturbed logits.

    Each of the four terms is ``sum(softmax(clean) * log_softmax(perturbed))``
    along ``dim=1``, pairing the clean logits of one network with the
    perturbed logits of the other. The negated total is divided by the
    combined number of supervised and unsupervised samples.

    Returns:
        Scalar tensor with the averaged loss.
    """
    n_samples = logit_S1.size(0) + logit_U1.size(0)

    def _cross_term(clean_logits, perturbed_logits):
        # Summed over every sample and class.
        target = F.softmax(clean_logits, dim=1)
        log_pred = F.log_softmax(perturbed_logits, dim=1)
        return torch.sum(target * log_pred)

    sup_1 = _cross_term(logit_S2, perturbed_logit_S1)
    sup_2 = _cross_term(logit_S1, perturbed_logit_S2)
    unsup_1 = _cross_term(logit_U2, perturbed_logit_U1)
    unsup_2 = _cross_term(logit_U1, perturbed_logit_U2)

    return -(sup_1 + sup_2 + unsup_1 + unsup_2) / n_samples

In [11]:
def adjust_lamda(epoch, rampup_epochs=80):
    """Update the global loss weights ``lambda_cot`` and ``lambda_diff``.

    While ``epoch + 1 <= rampup_epochs`` both weights follow the ramp
    ``max * exp(-5 * (1 - (epoch + 1) / rampup_epochs) ** 2)``; afterwards they
    stay at ``lambda_cot_max`` / ``lambda_diff_max``.

    Args:
        epoch: zero-based epoch index.
        rampup_epochs: length of the ramp-up phase in epochs (default 80).
            A value <= 0 disables the ramp and uses the maxima immediately.
    """
    global lambda_cot
    global lambda_diff

    epoch = epoch + 1
    if rampup_epochs > 0 and epoch <= rampup_epochs:
        # One shared ramp factor keeps the two schedules in lock-step.
        ramp = np.exp(-5 * (1 - epoch / rampup_epochs) ** 2)
        lambda_cot = lambda_cot_max * ramp
        lambda_diff = lambda_diff_max * ramp
    else:
        lambda_cot = lambda_cot_max
        lambda_diff = lambda_diff_max

## Prep model

In [12]:
torch.cuda.empty_cache()

# Two independently initialised copies of the same architecture, one per view.
model_func = cnn
m1, m2 = model_func(), model_func()

for _net in (m1, m2):
    _net.cuda()

# Optionally replicate each network across the visible GPUs.
multi_gpu = True
if multi_gpu:
    m1, m2 = nn.DataParallel(m1), nn.DataParallel(m2)

## Prep data

In [13]:
# Dataset locations, relative to the notebook's working directory.
audio_root = "../dataset/audio"
metadata_root = "../dataset/metadata"

# Build the project's dataset manager from the metadata and audio folders.
# NOTE(review): the progress bars in this cell's output suggest the files are
# loaded eagerly here — confirm against DatasetManager.
dataset = DatasetManager(metadata_root, audio_root, verbose=1)

100%|██████████| 9/9 [00:13<00:00,  1.50s/it]
100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


## Prep training

In [14]:
# Loss criterion. Despite the "bce" in its name this is a cross-entropy loss.
criterion_bce = nn.CrossEntropyLoss(reduction="mean")

# One SGD optimizer updates the parameters of both networks together.
parameters = [*m1.parameters(), *m2.parameters()]
optimizer = torch.optim.SGD(
    parameters,
    lr=0.05,
    momentum=0.9,
    weight_decay=1e-4,
)

# Signal augmentations applied by the generator (none for this run). Examples:
# ps1 = sa.PitchShift(0.5, DatasetManager.SR, (-2, 3))
# n = sa.Noise(0.5, (0.05, 0.2))
augments = []


In [15]:
# Both gradient-sign attacks share the same settings: a step of 0.02, no
# clipping of the perturbed input, untargeted. Each attack gets its own loss
# instance and is bound to one of the two networks.
_fgsm_settings = dict(eps=0.02, clip_min=-np.inf, clip_max=np.inf, targeted=False)

adv_gen_1 = GradientSignAttack(
    m1,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    **_fgsm_settings,
)

adv_gen_2 = GradientSignAttack(
    m2,
    loss_fn=nn.CrossEntropyLoss(reduction="sum"),
    **_fgsm_settings,
)

In [16]:
# Upper bounds of the co-training and view-difference loss weights.
lambda_cot_max, lambda_diff_max = 10, 0.5
# Current weights; both start at zero.
lambda_cot = lambda_diff = 0.0
best_acc = 0.0

In [17]:
# training parameters
ratio = 0.1
batch_size = 100
nb_epoch = 600

# prepare the sampler with the specified ratio of supervised fime and a specific batch size
nb_train_file = len(dataset.audio["train"])
nb_s_file = int(nb_train_file * ratio)   # theorical number of supervised file
nb_s_file = nb_s_file - (nb_s_file % DatasetManager.NB_CLASS)  # need to be a multiple of number of class
nb_u_file = nb_train_file - nb_s_file
sampler = CoTrainingSampler(batch_size, nb_s_file, nb_u_file, nb_view=2, ratio=None, method="duplicate")

# create the generator and the loader
generator = CoTrainingGenerator(dataset, sampler, augments=augments)
train_loader = data.DataLoader(generator, batch_sampler=sampler)

# val loader
x, y = generator.validation
x = torch.from_numpy(x)
y = torch.from_numpy(y)
val_dataset = torch.utils.data.TensorDataset(x, y)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# scheduler
lr_lambda = lambda epoch: 0.5 * (np.cos(np.pi * epoch / nb_epoch) + 1)
lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
callbacks = [lr_scheduler]
# callbacks = []

# tensorboard
title = "%s_cnn_Cosd-lr_sgd-0.05lr-wd0.0001-m0.9_%de_noaugment" % ( get_datetime(), nb_epoch )
tensorboard = SummaryWriter(log_dir="cotraining/%s" % title, comment=model_func.__name__)

100%|██████████| 837/837 [00:04<00:00, 189.14it/s]


In [18]:
# Sanity check: gather the indices the sampler yields for each view over one
# pass, then print how many were drawn and how many of them are distinct.
S1, S2, U = [], [], []
for batch in tqdm.tqdm(sampler):
    views = batch[0]
    for collected, position in ((S1, 0), (S2, 1), (U, 2)):
        collected.extend(views[position])

for collected in (S1, S2, U):
    print(len(collected), len(np.unique(collected)))
      

 99%|█████████▊| 77/78 [00:00<00:00, 126750.95it/s]

693 693
693 693
6930 6930





## Training

In [19]:
acc1_func = CategoricalAccuracy()
acc2_func = CategoricalAccuracy()


def SU_train(epoch, train_loader):
    """Run one co-training epoch over ``train_loader`` and log it to tensorboard.

    For every batch, ``m1`` and ``m2`` are evaluated on their own supervised
    view and on the shared unsupervised view. Adversarial examples are then
    generated with ``adv_gen_1`` / ``adv_gen_2`` and the weighted sum
    ``Lsup + lambda_cot * Lcot + lambda_diff * Ldiff`` is minimised with the
    shared optimizer.

    Args:
        epoch: zero-based epoch index; also drives the loss-weight schedule.
        train_loader: iterable yielding ``(X, y)`` pairs. The last entry of
            each is the unsupervised view, the preceding entries are the
            supervised views (one per network).
    """
    m1.train()
    m2.train()

    adjust_lamda(epoch)

    acc1_func.reset()
    acc2_func.reset()

    running_loss = 0.0
    ls = 0.0
    lc = 0.0
    ld = 0.0

    start_time = time.time()

    for i, (X, y) in enumerate(train_loader):
        # The loader adds one extra leading dimension: (1, B, ...) instead of (B, ...)
        X = [x.squeeze() for x in X]
        y = [y_.squeeze() for y_ in y]

        # Separate the supervised (S) and unsupervised (U) parts.
        # The targets of the unsupervised view are never used for training.
        X_S, X_U = X[:-1], X[-1]
        y_S = y[:-1]

        # Move to GPU
        X_S = [xs.cuda().float() for xs in X_S]
        y_S = [ys.cuda().long() for ys in y_S]
        X_U = X_U.cuda().float()

        # Prediction
        logits_S1 = m1(X_S[0])
        logits_S2 = m2(X_S[1])
        logits_U1 = m1(X_U)
        logits_U2 = m2(X_U)

        # Pseudo labels of U
        # TODO pseudo labels ? how many class ?
        _, pred_U1 = torch.max(logits_U1, 1)
        _, pred_U2 = torch.max(logits_U2, 1)

        # Freeze batchnorm & dropout while generating the adversarial examples
        m1.eval()
        m2.eval()

        # Generate adversarial examples.
        # Multi-target doesn't work with advertorch, so the pseudo label of
        # the prediction is used for the unsupervised view.
        perturbed_data_S1 = adv_gen_1.perturb(X_S[0], y_S[0])
        perturbed_data_S2 = adv_gen_2.perturb(X_S[1], y_S[1])

        perturbed_data_U1 = adv_gen_1.perturb(X_U, pred_U1)
        perturbed_data_U2 = adv_gen_2.perturb(X_U, pred_U2)

        m1.train()
        m2.train()

        # Each network predicts on the adversarial samples crafted against the other one
        perturbed_logits_S1 = m1(perturbed_data_S2)
        perturbed_logits_S2 = m2(perturbed_data_S1)

        perturbed_logits_U1 = m1(perturbed_data_U2)
        perturbed_logits_U2 = m2(perturbed_data_U1)

        Loss_sup = Lsup(logits_S1, logits_S2, y_S[0], y_S[1])
        Loss_cot = Lcot(logits_U1, logits_U2)
        Loss_diff = Ldiff(
            logits_S1, logits_S2, perturbed_logits_S1, perturbed_logits_S2,
            logits_U1, logits_U2, perturbed_logits_U1, perturbed_logits_U2
        )

        total_loss = Loss_sup + lambda_cot*Loss_cot + lambda_diff*Loss_diff

        # The optimizer owns every parameter of m1 and m2, so one zero_grad()
        # is enough. It also clears any gradient left on the parameters before
        # this point (presumably by the attacks' backward pass — confirm).
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Metrics: the predicted class is the arg-max of the logits. Softmax is
        # monotonic, so it is not needed to find the arg-max.
        _, pred_S1 = torch.max(logits_S1, 1)
        _, pred_S2 = torch.max(logits_S2, 1)
        acc_1 = acc1_func(pred_S1, y_S[0])
        acc_2 = acc2_func(pred_S2, y_S[1])

        running_loss += total_loss.item()
        ls += Loss_sup.item()
        lc += Loss_cot.item()
        ld += Loss_diff.item()

        # Progress line: running accuracies and running mean of each loss
        print("Epoch {:4}, {:3d}% \t acc: {:3.4e} {:3.4e} - loss {:3.4e} {:3.4e} {:3.4e} {:3.4e} took: {:.2f}s".format(
            epoch+1,
            int(100 * (i+1) / sampler.nb_batch),

            acc_1, acc_2,
            running_loss/(i+1), ls/(i+1), lc/(i+1), ld/(i+1),
            time.time() - start_time,
        ), end="\r")

    # Tensorboard: the losses logged are those of the last batch of the epoch,
    # while the accuracies are the running means over the epoch.
    tensorboard.add_scalar('train/total_loss', total_loss.item(), epoch)
    tensorboard.add_scalar('train/Lsup', Loss_sup.item(), epoch )
    tensorboard.add_scalar('train/Lcot', Loss_cot.item(), epoch )
    tensorboard.add_scalar('train/Ldiff', Loss_diff.item(), epoch )
    tensorboard.add_scalar("train/acc_1", acc_1, epoch )
    tensorboard.add_scalar("train/acc_2", acc_2, epoch )
        
def test(epoch):
    """Evaluate both networks on ``val_loader`` and log their accuracies.

    After the evaluation every entry of ``callbacks`` is stepped once (this is
    where the learning-rate scheduler advances) and a summary line is printed.

    Args:
        epoch: zero-based epoch index used as the tensorboard step.
    """
    m1.eval()
    m2.eval()

    acc1_func.reset()
    acc2_func.reset()

    with torch.no_grad():
        for X, y in val_loader:
            X = X.cuda().float()
            y = y.cuda().long()

            logits_S1 = m1(X)
            logits_S2 = m2(X)

            # Metrics: the predicted class is the arg-max of the logits.
            _, pred_S1 = torch.max(logits_S1, 1)
            _, pred_S2 = torch.max(logits_S2, 1)
            acc_1_val = acc1_func(pred_S1, y)
            acc_2_val = acc2_func(pred_S2, y)

    # Running mean accuracies over the whole validation set
    tensorboard.add_scalar('val/acc_1', acc_1_val, epoch)
    tensorboard.add_scalar('val/acc_2', acc_2_val, epoch)

    for callback in callbacks:
        callback.step()

    print('\nnet1 test acc: %.3e | net2 test acc: %.3e' % (acc_1_val, acc_2_val))

In [None]:
# Full Co-training
nb_epoch = 600

for epoch in range(nb_epoch):
    SU_train(epoch, train_loader)
    test(epoch)

Epoch    1,  98% 	 acc: 1.8470e-01 1.7605e-01 - loss 4.5511e+00 4.5268e+00 6.8578e-02 5.0059e+00 took: 51.70s
net1 test acc: 1.053e-01 | net2 test acc: 1.109e-01
Epoch    2,  98% 	 acc: 1.1111e-01 1.2698e-01 - loss 4.6088e+00 4.5861e+00 2.4825e-02 4.7559e+00 took: 45.32s
net1 test acc: 1.209e-01 | net2 test acc: 1.804e-01
Epoch    3,  98% 	 acc: 8.6580e-02 1.3709e-01 - loss 4.5852e+00 4.5592e+00 2.9029e-02 4.7562e+00 took: 44.31s
net1 test acc: 6.568e-02 | net2 test acc: 2.189e-01
Epoch    4,  98% 	 acc: 1.2554e-01 1.5440e-01 - loss 4.5174e+00 4.4871e+00 3.6703e-02 4.7839e+00 took: 44.37s
net1 test acc: 1.787e-01 | net2 test acc: 1.052e-01
Epoch    5,  98% 	 acc: 1.5296e-01 1.8903e-01 - loss 4.4310e+00 4.3940e+00 5.3192e-02 4.9257e+00 took: 44.35s
net1 test acc: 1.825e-01 | net2 test acc: 2.411e-01
Epoch    6,  98% 	 acc: 1.7605e-01 2.0202e-01 - loss 4.3261e+00 4.2866e+00 5.0039e-02 4.6991e+00 took: 47.37s
net1 test acc: 1.925e-01 | net2 test acc: 2.990e-01
Epoch    7,  98% 	 acc: 1.87

# ♫♪.ılılıll|̲̅̅●̲̅̅|̲̅̅=̲̅̅|̲̅̅●̲̅̅|llılılı.♫♪