In [49]:
import pandas as pd
from random import sample
import random
import numpy as np
from sklearn.model_selection import train_test_split
import os

for folder in ['multitask', 'para_single', 'sts_single', 'multitask_pcgrad', 'multitask_gradnorm', 'multitask_cagrad']:
    try:
        os.mkdir('./models/' + folder)
    except:
        pass

random.seed(0)

sts_train = pd.read_csv('./data/sts-train.csv', sep="\t")
sts_train = sts_train.dropna()

para_train = pd.read_csv('./data/quora-train.csv', sep="\t")
para_train = para_train.dropna()[:len(sts_train)]

sts_dev = pd.read_csv('./data/sts-dev.csv', sep="\t")
sts_dev = sts_dev.dropna() 

para_dev = pd.read_csv('./data/quora-dev.csv', sep="\t")
para_dev = para_dev.dropna()[:len(sts_dev)]

sts_combined_set = pd.concat([sts_train, sts_dev], ignore_index=True, axis=0)
para_combined_set = pd.concat([para_train, para_dev], ignore_index=True, axis=0)

sts_train, sts_dev = train_test_split(sts_combined_set, test_size=0.3, train_size=0.7, shuffle=False)
sts_dev, sts_test = train_test_split(sts_dev, test_size=0.5, train_size=0.5, shuffle=False)
para_train, para_dev = train_test_split(para_combined_set, test_size=0.3, train_size=0.7, shuffle=False)
para_dev, para_test = train_test_split(para_dev, test_size=0.5, train_size=0.5, shuffle=False)

In [50]:
from sentence_transformers import SentenceTransformer
# sentences = ["This is an example sentence", "Each sentence is converted"]

# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# embeddings = model.encode(sentences)
# print(embeddings)
# print(para_train.head())

In [51]:
from torch import nn
import torch
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.utils.data import DataLoader, Dataset


N_PARAPHRASE_CLASSES = 1
N_SIMILARITY_CLASSES = 1
DROPOUT_PROB = 0.5
INPUT_SIZE = 768

class NLP_Model(nn.Module):
    def __init__(self, model):
        super(NLP_Model, self).__init__()
        self.model = model
        self.dropout = nn.Dropout(DROPOUT_PROB)
        self.paraphrase_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.paraphrase_linear_interact = nn.Linear(INPUT_SIZE, N_PARAPHRASE_CLASSES)
        self.similarity_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.similarity_linear_interact = nn.Linear(INPUT_SIZE, N_SIMILARITY_CLASSES)
    
    def forward(self, sentences1, sentences2, task, device):
        '''
        Task 0 is para. Task 1 is similarity.
        '''
        sentences1 = torch.as_tensor(self.model.encode(sentences1.tolist()))
        sentences1 = sentences1.to(device)
        sentences2 = torch.as_tensor(self.model.encode(sentences2.tolist()))
        sentences2 = sentences2.to(device)
        if task == 0:
            sentences1 = self.dropout(sentences1)
            sentences1 = F.relu(self.paraphrase_linear(sentences1))
            sentences2 = self.dropout(sentences2)
            sentences2 = F.relu(self.paraphrase_linear(sentences2))
            combined = torch.concat((sentences1, sentences2), dim=-1)
            combined = self.dropout(combined)
            return F.sigmoid(self.paraphrase_linear_interact(combined))
        if task == 1:
            sentences1 = self.dropout(sentences1)
            sentences1 = F.relu(self.similarity_linear(sentences1))
            sentences2 = self.dropout(sentences2)
            sentences2 = F.relu(self.similarity_linear(sentences2))
            combined = torch.concat((sentences1, sentences2), dim=-1)
            combined = self.dropout(combined)
            return F.sigmoid(self.similarity_linear_interact(combined)) * 5


In [52]:
from torch.optim import AdamW

def save_model(model, filepath):
    save_info = {
        'model': model.state_dict(),
        'system_rng': random.getstate(),
        'numpy_rng': np.random.get_state(),
        'torch_rng': torch.random.get_rng_state(),
    }

    torch.save(save_info, f'{filepath}/model')
    model.model.save(f'{filepath}/transformer')
    print(f"saved the model to {filepath}")

def load_model(filepath, device):
    with torch.no_grad():
        save_info = torch.load(f'{filepath}/model')
        transformer_model = SentenceTransformer(f'{filepath}/transformer')
        transformer_model.to(device)
        
        model = NLP_Model(transformer_model)
        model.load_state_dict(save_info['model'])
        model.to(device)
        
        random.setstate(save_info['system_rng'])
        np.random.set_state(save_info['numpy_rng'])
        torch.random.set_rng_state(save_info['torch_rng'])
    return model, optimizer

In [53]:
from math import ceil

def get_batches(dataset, batch_size=512):
    """
    Pass in dataset and batch size.
    Get generator which yields batches.
    """
    return enumerate(dataset[i*batch_size:(i+1)*batch_size] for i in range(ceil(dataset.shape[0] / batch_size)))

In [54]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
NUM_EPOCHS = 30

def train_singletask_para_model(para_train, para_dev, filepath):
    '''
    use AdamW optimizer.
    binary cross-entropy loss.
    make sure to save model at end to a specific path.
    '''
    device = torch.device('cuda')
    
    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=1e-2) #~SGD with weight decay 0.01
    scheduler = ReduceLROnPlateau(optimizer, 'max')

    best_dev_acc = 0

    train_para_accuracy = eval_singletask_model(model, device, para_train, 0, 'train')
    dev_para_accuracy = eval_singletask_model(model, device, para_dev, 0, 'dev')
    print(f"epoch number: 0, para train accuracy: {train_para_accuracy}, para dev accuracy: {dev_para_accuracy}")

    for epoch in range(NUM_EPOCHS):
        model.train()
        transformer.train()

        for step, batch in tqdm(get_batches(para_train), desc='train'):
            b_sentences1, b_sentences2, b_labels = batch['sentence1'], batch['sentence2'], batch['is_duplicate']
            optimizer.zero_grad()
            logits = model.forward(b_sentences1, b_sentences2, 0, device).flatten()

            b_labels = torch.as_tensor(b_labels.values, dtype=torch.float32)
            b_labels = b_labels.to(device)

            loss = F.binary_cross_entropy(logits, b_labels, reduction='mean')
            loss.backward()
            optimizer.step()

        train_para_accuracy = eval_singletask_model(model, device, para_train, 0, 'train')
        dev_para_accuracy = eval_singletask_model(model, device, para_dev, 0, 'dev')
        print(f"epoch number: {epoch + 1}, para train accuracy: {train_para_accuracy}, para dev accuracy: {dev_para_accuracy}")

        scheduler.step(dev_para_accuracy)

        if dev_para_accuracy >= best_dev_acc:
            best_dev_acc = dev_para_accuracy
            print('New best model. Saving.')
            save_model(model, filepath)


def train_singletask_sts_model(sts_train, sts_dev, filepath):
    '''
    use AdamW optimizer.
    multi-class cross-entropy loss.
    make sure to save model at end to a specific path.
    '''
    device = torch.device('cuda')
    
    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=1e-2) #~SGD with weight decay 0.01
    scheduler = ReduceLROnPlateau(optimizer, 'max')

    best_dev_acc = 0

    train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
    dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
    print(f"epoch number: 0, sts train accuracy: {train_sts_acc}, sts dev accuracy: {dev_sts_acc}")

    for epoch in range(NUM_EPOCHS):
        model.train()
        transformer.train()

        for step, batch in tqdm(get_batches(sts_train), desc='train'):
            b_sentences1, b_sentences2, b_labels = batch['sentence1'], batch['sentence2'], batch['similarity']
            optimizer.zero_grad()
            logits = model.forward(b_sentences1, b_sentences2, 1, device).flatten()

            b_labels = torch.as_tensor(b_labels.values, dtype=torch.float32)
            b_labels = b_labels.to(device)

            loss = F.mse_loss(logits, b_labels, reduction='mean')
            loss.backward()
            optimizer.step()

        train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
        dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
        print(f"epoch number: {epoch + 1}, sts train accuracy: {train_sts_acc}, sts dev accuracy: {dev_sts_acc}")

        scheduler.step(dev_sts_acc)

        if dev_sts_acc >= best_dev_acc:
            best_dev_acc = dev_sts_acc
            print('New best model. Saving.')
            save_model(model, filepath)

https://huggingface.co/docs/transformers/training#train-in-native-pytorch

In [55]:
def eval_singletask_model(model, device, dataset, task, flag):
    '''
    given dataloader, 2 task-specific finetuned models, and device
    return the accuracy for para and for sts
    '''
    model.eval()
    model.model.eval()
    with torch.no_grad():
        truth = []
        predictions = []
        for step, batch in tqdm(get_batches(dataset), desc=f"{flag} eval"):
            b_sentences1, b_sentences2, b_labels = batch['sentence1'], batch['sentence2'], batch['is_duplicate' if task == 0 else 'similarity']
            truth.extend(b_labels)
            logits = model.forward(b_sentences1, b_sentences2, task, device)
            logits = logits.detach().cpu().numpy().flatten()
            if task == 0:
                new_predictions = np.round(logits).flatten()
            else:
                new_predictions = logits.flatten()
            predictions.extend(new_predictions)
        if task == 0:
            accuracy = (np.array(truth).flatten() == np.array(predictions).flatten()).mean()
        else:
            accuracy = (np.round(np.array(truth).flatten()) == np.round(np.array(predictions).flatten())).mean()
    return accuracy

In [56]:
from pcgrad import PCGrad

def train_multitask_model(para_train, para_dev, sts_train, sts_dev, filepath, pcgrad_flag):
    '''
    use AdamW optimizer.
    binary cross-entropy loss for para, multi-class cross-entropy loss for sts, sum loss functions. 
    make sure to save model at end to a specific path.
    '''
    device = torch.device('cuda')
    
    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=1e-2) #~SGD with weight decay 0.01
    scheduler = ReduceLROnPlateau(optimizer, 'min')
    if pcgrad_flag: optimizer = PCGrad(optimizer)

    best_dev_acc = 0

    train_para_acc = eval_singletask_model(model, device, para_train, 0, 'train')
    dev_para_acc = eval_singletask_model(model, device, para_dev, 0, 'dev')
    train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
    dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
    print(f"epoch number: 0, para train accuracy: {train_para_acc}, para dev accuracy: {dev_para_acc}, sts train accuracy: {train_sts_acc}, sts dev accuracy: {dev_sts_acc}")

    train_acc = (train_para_acc + train_sts_acc) / 2
    dev_acc = (dev_para_acc + dev_sts_acc) / 2
    print(f"epoch number: 0, avg train accuracy: {train_acc}, avg dev accuracy: {dev_acc}")

    for epoch in range(NUM_EPOCHS):
        model.train()
        transformer.train()

        for (para_step, para_batch), (sts_step, sts_batch) in zip(tqdm(get_batches(para_train), desc='train'), tqdm(get_batches(sts_train), desc='train')):
            b_para_sentences1, b_para_sentences2, b_para_labels = para_batch['sentence1'], para_batch['sentence2'], para_batch['is_duplicate']
            b_sts_sentences1, b_sts_sentences2, b_sts_labels = sts_batch['sentence1'], sts_batch['sentence2'], sts_batch['similarity']

            optimizer.zero_grad()

            para_logits = model.forward(b_para_sentences1, b_para_sentences2, 0, device).flatten()
            sts_logits = model.forward(b_sts_sentences1, b_sts_sentences2, 1, device).flatten()

            b_para_labels = torch.as_tensor(b_para_labels.values, dtype=torch.float32)
            b_para_labels = b_para_labels.to(device)
            b_sts_labels = torch.as_tensor(b_sts_labels.values, dtype=torch.float32)
            b_sts_labels = b_sts_labels.to(device)

            loss = (F.binary_cross_entropy(para_logits, b_para_labels, reduction='mean') + F.mse_loss(sts_logits, b_sts_labels, reduction='mean')) / 2
            loss.backward()
            optimizer.step()

        train_para_acc = eval_singletask_model(model, device, para_train, 0, 'train')
        dev_para_acc = eval_singletask_model(model, device, para_dev, 0, 'dev')
        train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
        dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
        print(f"epoch number: {epoch + 1}, para train accuracy: {train_para_acc}, para dev accuracy: {dev_para_acc}, sts train accuracy: {train_sts_acc}, sts dev accuracy: {dev_sts_acc}")

        train_acc = (train_para_acc + train_sts_acc) / 2
        dev_acc = (dev_para_acc + dev_sts_acc) / 2
        print(f"epoch number: {epoch + 1}, avg train accuracy: {train_acc}, avg dev accuracy: {dev_acc}")

        scheduler.step(dev_acc, 'max')

        if dev_acc >= best_dev_acc:
            best_dev_acc = dev_acc
            print('New best model. Saving.')
            save_model(model, filepath) 

In [57]:
def train_multitask_model_gradnorm(para_train, para_dev, sts_train, sts_dev, filepath, alpha, layer):
    '''
    use AdamW optimizer.
    binary cross-entropy loss for para, multi-class cross-entropy loss for sts, sum loss functions. 
    make sure to save model at end to a specific path.
    '''
    device = torch.device('cuda')
    
    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=1e-2) #~SGD with weight decay 0.01
    scheduler = ReduceLROnPlateau(optimizer, 'min')

    best_dev_acc = 0

    train_para_acc = eval_singletask_model(model, device, para_train, 0, 'train')
    dev_para_acc = eval_singletask_model(model, device, para_dev, 0, 'dev')
    train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
    dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
    print(f"epoch number: 0, para train accuracy: {train_para_acc}, para dev accuracy: {dev_para_acc}, sts train accuracy: {train_sts_acc}, sts dev accuracy: {dev_sts_acc}")

    train_acc = (train_para_acc + train_sts_acc) / 2
    dev_acc = (dev_para_acc + dev_sts_acc) / 2
    print(f"epoch number: 0, avg train accuracy: {train_acc}, avg dev accuracy: {dev_acc}")

    iters = 0

    for epoch in range(NUM_EPOCHS):
        model.train()
        transformer.train()

        for (para_step, para_batch), (sts_step, sts_batch) in zip(tqdm(get_batches(para_train), desc='train'), tqdm(get_batches(sts_train), desc='train')):
            b_para_sentences1, b_para_sentences2, b_para_labels = para_batch['sentence1'], para_batch['sentence2'], para_batch['is_duplicate']
            b_sts_sentences1, b_sts_sentences2, b_sts_labels = sts_batch['sentence1'], sts_batch['sentence2'], sts_batch['similarity']

            optimizer.zero_grad()

            para_logits = model.forward(b_para_sentences1, b_para_sentences2, 0, device).flatten()
            sts_logits = model.forward(b_sts_sentences1, b_sts_sentences2, 1, device).flatten()

            b_para_labels = torch.as_tensor(b_para_labels.values, dtype=torch.float32)
            b_para_labels = b_para_labels.to(device)
            b_sts_labels = torch.as_tensor(b_sts_labels.values, dtype=torch.float32)
            b_sts_labels = b_sts_labels.to(device)

            loss = (F.binary_cross_entropy(para_logits, b_para_labels, reduction='mean') + F.mse_loss(sts_logits, b_sts_labels, reduction='mean')) / 2
            if iters == 0:
                weights = torch.ones_like(loss)
                weights = torch.nn.Parameter(weights)
                T = weights.sum().detach()
                optimizer2 = torch.optim.Adam([weights], lr=1e-2)
                l0 = loss.detach()
            weighted_loss = np.dot(weights, loss)
            weighted_loss.backward()
            gradient_weights = []
            for i in range(len(loss)):
                d_l = torch.autograd.grad(weights[i] * loss[i], layer.parameters())[0]
                gradient_weights.append(torch.norm(d_l))
            gradient_weights = torch.stack(gradient_weights)
            lossratio = loss.detach() / l0
            r_t = lossratio / lossratio.mean()
            avg_gradient_weight = gradient_weights.mean().detach()
            const_factor = (avg_gradient_weight * r_t ** alpha).detach()
            gradnormloss = torch.abs(gradient_weights - const_factor).sum()
            optimizer2.zero_grad()
            gradnormloss.backward()
            optimizer.step()
            optimizer2.step()
            weights = (weights / weights.sum() * T).detach()
            weights = torch.nn.Parameter(weights)
            optimizer2 = torch.optim.Adam([weights], lr=1e-2)
            iters += 1

        train_para_acc = eval_singletask_model(model, device, para_train, 0, 'train')
        dev_para_acc = eval_singletask_model(model, device, para_dev, 0, 'dev')
        train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
        dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
        print(f"epoch number: {epoch + 1}, para train accuracy: {train_para_acc}, para dev accuracy: {dev_para_acc}, sts train accuracy: {train_sts_acc}, sts dev accuracy: {dev_sts_acc}")

        train_acc = (train_para_acc + train_sts_acc) / 2
        dev_acc = (dev_para_acc + dev_sts_acc) / 2
        print(f"epoch number: {epoch + 1}, avg train accuracy: {train_acc}, avg dev accuracy: {dev_acc}")

        scheduler.step(dev_acc, 'max')

        if dev_acc >= best_dev_acc:
            best_dev_acc = dev_acc
            print('New best model. Saving.')
            save_model(model, filepath) 

In [58]:
def test_singletask_model(filepath1, filepath2):
    train_singletask_para_model(para_train, para_dev, filepath1)
    train_singletask_sts_model(sts_train, sts_dev, filepath2)

    device = torch.device('cuda')

    para_model, _ = load_model(filepath1, device)
    sts_model, _ = load_model(filepath2, device)

    para_acc = eval_singletask_model(para_model, device, para_test, 0, 'test')
    sts_acc = eval_singletask_model(sts_model, device, sts_test, 1, 'test')

    print(f'Final test accuracy. PARA: {para_acc}, STS: {sts_acc}.')

In [59]:
test_singletask_model('./models/para_single', './models/sts_single')

In [None]:
def test_multitask_model(filepath, pcgrad_flag):
    train_multitask_model(para_train, para_dev, sts_train, sts_dev, filepath, pcgrad_flag)

    device = torch.device('cuda')

    model, _ = load_model(filepath, device)

    para_acc = eval_singletask_model(model, device, para_test, 0, 'test')
    sts_acc = eval_singletask_model(model, device, sts_test, 1, 'test')

    print(f'Final test accuracy. PARA: {para_acc}, STS: {sts_acc}.')

In [None]:
test_multitask_model('./models/multitask', False)

train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.25it/s]
train eval: 1it [00:01,  1.28s/it]
dev eval: 1it [00:00,  3.26it/s]


epoch number: 0, para train accuracy: 0.36007827788649704, para dev accuracy: 0.36363636363636365, sts train accuracy: 0.1643835616438356, sts dev accuracy: 0.19090909090909092
epoch number: 0, avg train accuracy: 0.2622309197651663, avg dev accuracy: 0.2772727272727273


train: 1it [00:02,  2.69s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.19it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.24it/s]


epoch number: 1, para train accuracy: 0.649706457925636, para dev accuracy: 0.6363636363636364, sts train accuracy: 0.2172211350293542, sts dev accuracy: 0.23636363636363636
epoch number: 1, avg train accuracy: 0.4334637964774951, avg dev accuracy: 0.43636363636363634
New best model. Saving.
saved the model to ./models/multitask


train: 1it [00:02,  2.68s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.16it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.24it/s]


epoch number: 2, para train accuracy: 0.7103718199608611, para dev accuracy: 0.6818181818181818, sts train accuracy: 0.2054794520547945, sts dev accuracy: 0.18181818181818182
epoch number: 2, avg train accuracy: 0.45792563600782776, avg dev accuracy: 0.43181818181818177


train: 1it [00:02,  2.67s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.23it/s]


epoch number: 3, para train accuracy: 0.7925636007827789, para dev accuracy: 0.7272727272727273, sts train accuracy: 0.26418786692759294, sts dev accuracy: 0.2636363636363636
epoch number: 3, avg train accuracy: 0.5283757338551859, avg dev accuracy: 0.4954545454545455
New best model. Saving.
saved the model to ./models/multitask


train: 1it [00:02,  2.70s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.19it/s]
train eval: 1it [00:01,  1.27s/it]
dev eval: 1it [00:00,  3.21it/s]


epoch number: 4, para train accuracy: 0.8160469667318982, para dev accuracy: 0.7090909090909091, sts train accuracy: 0.2857142857142857, sts dev accuracy: 0.2
epoch number: 4, avg train accuracy: 0.550880626223092, avg dev accuracy: 0.4545454545454546


train: 1it [00:02,  2.66s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.40s/it]
dev eval: 1it [00:00,  3.24it/s]
train eval: 1it [00:01,  1.30s/it]
dev eval: 1it [00:00,  3.28it/s]


epoch number: 5, para train accuracy: 0.8199608610567515, para dev accuracy: 0.7, sts train accuracy: 0.2974559686888454, sts dev accuracy: 0.2545454545454545
epoch number: 5, avg train accuracy: 0.5587084148727984, avg dev accuracy: 0.47727272727272724


train: 1it [00:02,  2.71s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.41s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.32s/it]
dev eval: 1it [00:00,  3.26it/s]


epoch number: 6, para train accuracy: 0.8395303326810176, para dev accuracy: 0.6818181818181818, sts train accuracy: 0.30332681017612523, sts dev accuracy: 0.2727272727272727
epoch number: 6, avg train accuracy: 0.5714285714285714, avg dev accuracy: 0.47727272727272724


train: 1it [00:02,  2.73s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.41s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.32s/it]
dev eval: 1it [00:00,  3.11it/s]


epoch number: 7, para train accuracy: 0.8551859099804305, para dev accuracy: 0.6818181818181818, sts train accuracy: 0.3268101761252446, sts dev accuracy: 0.2545454545454545
epoch number: 7, avg train accuracy: 0.5909980430528375, avg dev accuracy: 0.46818181818181814


train: 1it [00:02,  2.74s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.43s/it]
dev eval: 1it [00:00,  3.08it/s]
train eval: 1it [00:01,  1.32s/it]
dev eval: 1it [00:00,  3.11it/s]


epoch number: 8, para train accuracy: 0.8688845401174168, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.37377690802348335, sts dev accuracy: 0.2727272727272727
epoch number: 8, avg train accuracy: 0.6213307240704501, avg dev accuracy: 0.4590909090909091


train: 1it [00:02,  2.74s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.42s/it]
dev eval: 1it [00:00,  3.18it/s]
train eval: 1it [00:01,  1.32s/it]
dev eval: 1it [00:00,  3.11it/s]


epoch number: 9, para train accuracy: 0.8943248532289628, para dev accuracy: 0.6636363636363637, sts train accuracy: 0.41487279843444225, sts dev accuracy: 0.2818181818181818
epoch number: 9, avg train accuracy: 0.6545988258317026, avg dev accuracy: 0.4727272727272728


train: 1it [00:02,  2.71s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.40s/it]
dev eval: 1it [00:00,  3.22it/s]
train eval: 1it [00:01,  1.30s/it]
dev eval: 1it [00:00,  3.21it/s]


epoch number: 10, para train accuracy: 0.9412915851272016, para dev accuracy: 0.6636363636363637, sts train accuracy: 0.4520547945205479, sts dev accuracy: 0.23636363636363636
epoch number: 10, avg train accuracy: 0.6966731898238747, avg dev accuracy: 0.45


train: 1it [00:02,  2.67s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.19it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.24it/s]


epoch number: 11, para train accuracy: 0.9608610567514677, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.4598825831702544, sts dev accuracy: 0.19090909090909092
epoch number: 11, avg train accuracy: 0.7103718199608611, avg dev accuracy: 0.4181818181818182


train: 1it [00:02,  2.68s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.17it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.28it/s]


epoch number: 12, para train accuracy: 0.9726027397260274, para dev accuracy: 0.6636363636363637, sts train accuracy: 0.5264187866927593, sts dev accuracy: 0.21818181818181817
epoch number: 12, avg train accuracy: 0.7495107632093934, avg dev accuracy: 0.4409090909090909


train: 1it [00:02,  2.67s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.19it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.29it/s]


epoch number: 13, para train accuracy: 0.974559686888454, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.6007827788649707, sts dev accuracy: 0.24545454545454545
epoch number: 13, avg train accuracy: 0.7876712328767124, avg dev accuracy: 0.4454545454545455


train: 1it [00:02,  2.66s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.22it/s]


epoch number: 14, para train accuracy: 0.9784735812133072, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.6105675146771037, sts dev accuracy: 0.24545454545454545
epoch number: 14, avg train accuracy: 0.7945205479452054, avg dev accuracy: 0.4454545454545455


train: 1it [00:02,  2.67s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.22it/s]
train eval: 1it [00:01,  1.27s/it]
dev eval: 1it [00:00,  3.23it/s]


epoch number: 15, para train accuracy: 0.9902152641878669, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.5792563600782779, sts dev accuracy: 0.2
epoch number: 15, avg train accuracy: 0.7847358121330723, avg dev accuracy: 0.42272727272727273


train: 1it [00:02,  2.66s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.21it/s]
train eval: 1it [00:01,  1.28s/it]
dev eval: 1it [00:00,  3.27it/s]


epoch number: 16, para train accuracy: 0.9921722113502935, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.639921722113503, sts dev accuracy: 0.2
epoch number: 16, avg train accuracy: 0.8160469667318982, avg dev accuracy: 0.42272727272727273


train: 1it [00:02,  2.66s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.16it/s]


epoch number: 17, para train accuracy: 0.9960861056751468, para dev accuracy: 0.6545454545454545, sts train accuracy: 0.7045009784735812, sts dev accuracy: 0.2636363636363636
epoch number: 17, avg train accuracy: 0.850293542074364, avg dev accuracy: 0.4590909090909091


train: 1it [00:02,  2.68s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.38s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.25it/s]


epoch number: 18, para train accuracy: 0.9960861056751468, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.6986301369863014, sts dev accuracy: 0.24545454545454545
epoch number: 18, avg train accuracy: 0.8473581213307241, avg dev accuracy: 0.4454545454545455


train: 1it [00:02,  2.68s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.21it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.27it/s]


epoch number: 19, para train accuracy: 0.9960861056751468, para dev accuracy: 0.6545454545454545, sts train accuracy: 0.6908023483365949, sts dev accuracy: 0.22727272727272727
epoch number: 19, avg train accuracy: 0.8434442270058709, avg dev accuracy: 0.4409090909090909


train: 1it [00:02,  2.69s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.19it/s]
train eval: 1it [00:01,  1.30s/it]
dev eval: 1it [00:00,  3.24it/s]


epoch number: 20, para train accuracy: 0.9960861056751468, para dev accuracy: 0.6454545454545455, sts train accuracy: 0.7162426614481409, sts dev accuracy: 0.22727272727272727
epoch number: 20, avg train accuracy: 0.8561643835616438, avg dev accuracy: 0.4363636363636364


test eval: 1it [00:00,  3.35it/s]
test eval: 1it [00:00,  3.45it/s]

Final test accuracy. PARA: 0.6454545454545455, STS: 0.2636363636363636.





In [None]:
test_multitask_model('./models/multitask_pcgrad', True)

train eval: 1it [00:01,  1.40s/it]
dev eval: 1it [00:00,  3.26it/s]
train eval: 1it [00:01,  1.25s/it]
dev eval: 1it [00:00,  3.39it/s]


epoch number: 0, para train accuracy: 0.649706457925636, para dev accuracy: 0.6363636363636364, sts train accuracy: 0.2035225048923679, sts dev accuracy: 0.21818181818181817
epoch number: 0, avg train accuracy: 0.42661448140900193, avg dev accuracy: 0.42727272727272725


train: 1it [00:02,  2.61s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.35s/it]
dev eval: 1it [00:00,  3.29it/s]
train eval: 1it [00:01,  1.26s/it]
dev eval: 1it [00:00,  3.32it/s]


epoch number: 1, para train accuracy: 0.649706457925636, para dev accuracy: 0.6363636363636364, sts train accuracy: 0.22113502935420742, sts dev accuracy: 0.22727272727272727
epoch number: 1, avg train accuracy: 0.43542074363992167, avg dev accuracy: 0.4318181818181818
New best model. Saving.
saved the model to ./models/multitask_pcgrad


train: 1it [00:02,  2.65s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.37s/it]
dev eval: 1it [00:00,  3.22it/s]
train eval: 1it [00:01,  1.28s/it]
dev eval: 1it [00:00,  3.31it/s]


epoch number: 2, para train accuracy: 0.8082191780821918, para dev accuracy: 0.6909090909090909, sts train accuracy: 0.14481409001956946, sts dev accuracy: 0.18181818181818182
epoch number: 2, avg train accuracy: 0.4765166340508806, avg dev accuracy: 0.4363636363636364
New best model. Saving.
saved the model to ./models/multitask_pcgrad


train: 1it [00:02,  2.71s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.18it/s]
train eval: 1it [00:01,  1.28s/it]
dev eval: 1it [00:00,  3.25it/s]


epoch number: 3, para train accuracy: 0.350293542074364, para dev accuracy: 0.36363636363636365, sts train accuracy: 0.14481409001956946, sts dev accuracy: 0.18181818181818182
epoch number: 3, avg train accuracy: 0.24755381604696672, avg dev accuracy: 0.2727272727272727


train: 1it [00:02,  2.67s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.20it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.22it/s]


epoch number: 4, para train accuracy: 0.4050880626223092, para dev accuracy: 0.36363636363636365, sts train accuracy: 0.14481409001956946, sts dev accuracy: 0.18181818181818182
epoch number: 4, avg train accuracy: 0.2749510763209393, avg dev accuracy: 0.2727272727272727


train: 1it [00:02,  2.68s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.22it/s]
train eval: 1it [00:01,  1.30s/it]
dev eval: 1it [00:00,  3.24it/s]


epoch number: 5, para train accuracy: 0.7534246575342466, para dev accuracy: 0.6181818181818182, sts train accuracy: 0.14481409001956946, sts dev accuracy: 0.18181818181818182
epoch number: 5, avg train accuracy: 0.449119373776908, avg dev accuracy: 0.4


train: 1it [00:02,  2.69s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.17it/s]
train eval: 1it [00:01,  1.30s/it]
dev eval: 1it [00:00,  3.14it/s]


epoch number: 6, para train accuracy: 0.7553816046966731, para dev accuracy: 0.7, sts train accuracy: 0.14481409001956946, sts dev accuracy: 0.18181818181818182
epoch number: 6, avg train accuracy: 0.4500978473581213, avg dev accuracy: 0.4409090909090909
New best model. Saving.
saved the model to ./models/multitask_pcgrad


train: 1it [00:02,  2.74s/it]
train: 0it [00:02, ?it/s]
train eval: 1it [00:01,  1.39s/it]
dev eval: 1it [00:00,  3.19it/s]
train eval: 1it [00:01,  1.29s/it]
dev eval: 1it [00:00,  3.12it/s]


epoch number: 7, para train accuracy: 0.735812133072407, para dev accuracy: 0.7090909090909091, sts train accuracy: 0.14481409001956946, sts dev accuracy: 0.18181818181818182
epoch number: 7, avg train accuracy: 0.44031311154598823, avg dev accuracy: 0.44545454545454544
New best model. Saving.
saved the model to ./models/multitask_pcgrad


train: 0it [00:01, ?it/s]
train: 0it [00:01, ?it/s]


KeyboardInterrupt: 