In [1]:
import pandas as pd
from random import sample
import random
import numpy as np

# Seed the stdlib RNG so anything drawing from `random` is repeatable.
random.seed(0)

# Similarity (STS) training pairs; rows with any missing field are dropped.
sts_train = pd.read_csv('./data/sts-train.csv', sep="\t").dropna()

# Quora paraphrase training pairs, capped at the STS training-set size so both
# tasks contribute the same number of rows.
para_train = pd.read_csv('./data/quora-train.csv', sep="\t").dropna().iloc[:len(sts_train)]

sts_dev = pd.read_csv('./data/sts-dev.csv', sep="\t").dropna()

# Paraphrase dev pairs, capped at the STS dev-set size.
para_dev = pd.read_csv('./data/quora-dev.csv', sep="\t").dropna().iloc[:len(sts_dev)]

# Split each dev set in half by row order: the first half stays "dev", the
# second half becomes the held-out "test" split.
para_dev, para_test = para_dev.iloc[:len(para_dev) // 2], para_dev.iloc[len(para_dev) // 2:]
sts_dev, sts_test = sts_dev.iloc[:len(sts_dev) // 2], sts_dev.iloc[len(sts_dev) // 2:]

In [2]:
from sentence_transformers import SentenceTransformer
# sentences = ["This is an example sentence", "Each sentence is converted"]

# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# embeddings = model.encode(sentences)
# print(embeddings)
# print(para_train.head())

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from torch import nn
import torch
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.utils.data import DataLoader, Dataset


# Output width of each task head: one score per sentence pair.
N_PARAPHRASE_CLASSES = 1
N_SIMILARITY_CLASSES = 1
# Dropout probability used on the embeddings and on the combined pair features.
DROPOUT_PROB = 0.5
# Width of one sentence embedding as consumed by the heads.
# presumably the encoder's output size — verify if the encoder is swapped
INPUT_SIZE = 768


class NLP_Model(nn.Module):
    """Sentence-pair scorer: a shared sentence encoder plus one head per task.

    Task 0 (paraphrase) yields a score in [0, 1]; task 1 (similarity) yields a
    score in [0, 5]. Each head projects both sentence embeddings with the same
    linear layer, concatenates the two projections, and maps the result to a
    single sigmoid-activated score.

    NOTE(review): the embeddings come from ``self.model.encode(...)`` and are
    re-wrapped with ``torch.as_tensor``; that call is believed to run without
    gradient tracking, in which case only the heads (not the encoder) receive
    gradients — confirm against the sentence-transformers documentation.
    """

    def __init__(self, model):
        """Store the encoder `model` (must expose ``encode(list_of_str)``) and build both heads."""
        super().__init__()
        self.model = model
        self.dropout = nn.Dropout(DROPOUT_PROB)
        # Each head: per-sentence projection to half width, then a layer over
        # the concatenation of the two projections (2 * INPUT_SIZE // 2 wide).
        self.paraphrase_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.paraphrase_linear_interact = nn.Linear(INPUT_SIZE, N_PARAPHRASE_CLASSES)
        self.similarity_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.similarity_linear_interact = nn.Linear(INPUT_SIZE, N_SIMILARITY_CLASSES)

    def _embed(self, sentences, device):
        """Encode a batch of sentences and return the embeddings as a tensor on `device`."""
        # Accept pandas Series / numpy arrays (which have .tolist()) as well as plain sequences.
        texts = sentences.tolist() if hasattr(sentences, 'tolist') else list(sentences)
        return torch.as_tensor(self.model.encode(texts)).to(device)

    def _score_pair(self, emb1, emb2, project, interact):
        """Run one task head over a pair of embedding batches; returns sigmoid scores in [0, 1]."""
        emb1 = F.relu(project(self.dropout(emb1)))
        emb2 = F.relu(project(self.dropout(emb2)))
        combined = self.dropout(torch.cat((emb1, emb2), dim=-1))
        return torch.sigmoid(interact(combined))

    def forward(self, sentences1, sentences2, task, device):
        '''
        Score each (sentences1[i], sentences2[i]) pair for the given task.

        Task 0 is para. Task 1 is similarity.

        Returns a (batch, 1) tensor on `device`: values in [0, 1] for task 0
        and in [0, 5] for task 1.
        Raises ValueError for any other task id (previously this fell through
        and returned None, which only failed later at the call site).
        '''
        if task == 0:
            project, interact = self.paraphrase_linear, self.paraphrase_linear_interact
        elif task == 1:
            project, interact = self.similarity_linear, self.similarity_linear_interact
        else:
            raise ValueError(f"unknown task {task!r}: expected 0 (para) or 1 (similarity)")

        emb1 = self._embed(sentences1, device)
        emb2 = self._embed(sentences2, device)
        scores = self._score_pair(emb1, emb2, project, interact)
        # Similarity scores are stretched from [0, 1] to the [0, 5] label range.
        return scores * 5 if task == 1 else scores


In [4]:
from torch.optim import AdamW

def save_model(model, optimizer, filepath):
    '''
    Checkpoint the model, optimizer and RNG states under the directory `filepath`.

    Writes two entries:
      * `{filepath}/model`       - torch.save of the model/optimizer state dicts
                                   plus the `random`, numpy and torch RNG states.
      * `{filepath}/transformer` - the wrapped encoder, via `model.model.save`.

    The directory is created first if it does not exist; previously the
    torch.save call ran before anything created it, so a fresh path failed.
    '''
    import os  # local import so this cell does not depend on another cell's imports

    os.makedirs(filepath, exist_ok=True)

    save_info = {
        'model': model.state_dict(),
        'optim': optimizer.state_dict(),
        'system_rng': random.getstate(),
        'numpy_rng': np.random.get_state(),
        'torch_rng': torch.random.get_rng_state(),
    }

    torch.save(save_info, f'{filepath}/model')
    model.model.save(f'{filepath}/transformer')
    print(f"saved the model to {filepath}")

def load_model(filepath, device):
    '''
    Restore a checkpoint written by save_model and return `(model, optimizer)`.

    filepath: checkpoint directory containing `model` and `transformer`.
    device:   device the encoder and NLP_Model are moved to.

    Also restores the `random`, numpy and torch RNG states stored in the
    checkpoint, so calling this changes global RNG state.
    '''
    with torch.no_grad():
        # Deserialize onto the CPU: without map_location, tensors are restored to
        # the device they were saved from, which fails on a host without that
        # device. It also keeps the torch RNG state tensor on the CPU, as
        # torch.random.set_rng_state below is given it directly.
        # NOTE(review): newer PyTorch releases reportedly default torch.load to
        # weights_only=True, which may reject the pickled RNG states — confirm
        # against the installed version.
        save_info = torch.load(f'{filepath}/model', map_location='cpu')
        transformer_model = SentenceTransformer(f'{filepath}/transformer')
        transformer_model.to(device)

        model = NLP_Model(transformer_model)
        model.load_state_dict(save_info['model'])
        model.to(device)

        # The parameter list is built exactly as in the training functions so the
        # saved optimizer state lines up with it. The lr given here is only a
        # placeholder: load_state_dict restores the saved param-group settings.
        optimizer = AdamW(list(model.parameters()) + list(transformer_model.parameters()), lr=1e-4)
        optimizer.load_state_dict(save_info['optim'])

        random.setstate(save_info['system_rng'])
        np.random.set_state(save_info['numpy_rng'])
        torch.random.set_rng_state(save_info['torch_rng'])
    return model, optimizer

In [5]:
from math import ceil

def get_batches(dataset, batch_size=1024):
    """
    Lazily cut `dataset` into consecutive row slices of at most `batch_size` rows.

    Returns an iterator of `(batch_index, batch)` pairs; the final batch holds
    whatever rows remain. `dataset` must expose `.shape[0]` and support slicing.
    """
    n_batches = ceil(dataset.shape[0] / batch_size)
    offsets = range(0, n_batches * batch_size, batch_size)
    return enumerate(dataset[lo:lo + batch_size] for lo in offsets)

In [6]:
from torch.optim import AdamW
from tqdm import tqdm
NUM_EPOCHS = 12

def train_singletask_para_model(para_train, para_dev, filepath):
    '''
    Train the paraphrase head (task 0) and checkpoint the best epoch.

    Builds a fresh 'sentence-transformers/all-mpnet-base-v2' encoder wrapped in
    NLP_Model, optimizes with AdamW on binary cross-entropy over the
    `is_duplicate` labels for NUM_EPOCHS epochs, and calls save_model whenever
    the dev accuracy matches or beats the best seen so far.

    para_train / para_dev: tables with `sentence1`, `sentence2` and
        `is_duplicate` columns.
    filepath: checkpoint directory handed to save_model.
    Returns nothing; per-epoch accuracies are printed.
    '''
    # Use the GPU when there is one instead of failing outright on CPU-only hosts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    # NOTE(review): model.parameters() presumably already yields the encoder's
    # parameters, so they appear twice in this list. The layout is kept because
    # load_model rebuilds the optimizer the same way before loading its state.
    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=3e-3) #~SGD with weight decay 0.01
    best_dev_acc = 0

    # Baseline accuracy before any update.
    train_para_accuracy = eval_singletask_model(model, device, para_train, 0, 'train')
    dev_para_accuracy = eval_singletask_model(model, device, para_dev, 0, 'dev')
    print(f"epoch number: 0, para train accuracy: {train_para_accuracy}, para dev accuracy: {dev_para_accuracy}")

    for epoch in range(NUM_EPOCHS):
        # Evaluation switches both modules to eval mode, so re-enable training each epoch.
        model.train()
        transformer.train()

        for _, batch in tqdm(get_batches(para_train), desc='train'):
            b_sentences1, b_sentences2, b_labels = batch['sentence1'], batch['sentence2'], batch['is_duplicate']
            optimizer.zero_grad()
            # Call the module (not .forward) so nn.Module hooks are honoured.
            # The model already applies a sigmoid, so these are probabilities.
            probs = model(b_sentences1, b_sentences2, 0, device).flatten()

            b_labels = torch.as_tensor(b_labels.values, dtype=torch.float32)
            b_labels = b_labels.to(device)

            loss = F.binary_cross_entropy(probs, b_labels, reduction='mean')
            loss.backward()
            optimizer.step()

        train_para_accuracy = eval_singletask_model(model, device, para_train, 0, 'train')
        dev_para_accuracy = eval_singletask_model(model, device, para_dev, 0, 'dev')
        print(f"epoch number: {epoch + 1}, para train accuracy: {train_para_accuracy}, para dev accuracy: {dev_para_accuracy}")

        # ">=" means a tie with the best dev accuracy also overwrites the checkpoint.
        if dev_para_accuracy >= best_dev_acc:
            best_dev_acc = dev_para_accuracy
            print('New best model. Saving.')
            save_model(model, optimizer, filepath)


def train_singletask_sts_model(sts_train, sts_dev, filepath):
    '''
    Train the similarity head (task 1) and checkpoint the best epoch.

    Builds a fresh 'sentence-transformers/all-mpnet-base-v2' encoder wrapped in
    NLP_Model, optimizes with AdamW on mean-squared error against the
    `similarity` labels for NUM_EPOCHS epochs, and calls save_model whenever
    the dev accuracy matches or beats the best seen so far.

    sts_train / sts_dev: tables with `sentence1`, `sentence2` and `similarity`
        columns.
    filepath: checkpoint directory handed to save_model.
    Returns nothing; per-epoch accuracies are printed.
    '''
    # Use the GPU when there is one instead of failing outright on CPU-only hosts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    # NOTE(review): model.parameters() presumably already yields the encoder's
    # parameters, so they appear twice in this list. The layout is kept because
    # load_model rebuilds the optimizer the same way before loading its state.
    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=3e-3) #~SGD with weight decay 0.01
    best_dev_acc = 0

    # Baseline accuracy before any update.
    train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
    dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
    print(f"epoch number: 0, sts train acc: {train_sts_acc}, sts dev acc: {dev_sts_acc}")

    for epoch in range(NUM_EPOCHS):
        # Evaluation switches both modules to eval mode, so re-enable training each epoch.
        model.train()
        transformer.train()

        for _, batch in tqdm(get_batches(sts_train), desc='train'):
            b_sentences1, b_sentences2, b_labels = batch['sentence1'], batch['sentence2'], batch['similarity']
            optimizer.zero_grad()
            # Call the module (not .forward) so nn.Module hooks are honoured.
            # The model scales its sigmoid output by 5 for this task.
            scores = model(b_sentences1, b_sentences2, 1, device).flatten()

            b_labels = torch.as_tensor(b_labels.values, dtype=torch.float32)
            b_labels = b_labels.to(device)

            loss = F.mse_loss(scores, b_labels, reduction='mean')
            loss.backward()
            optimizer.step()

        train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
        dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
        print(f"epoch number: {epoch + 1}, sts train acc: {train_sts_acc}, sts dev acc: {dev_sts_acc}")

        # ">=" means a tie with the best dev accuracy also overwrites the checkpoint.
        if dev_sts_acc >= best_dev_acc:
            best_dev_acc = dev_sts_acc
            print('New best model. Saving.')
            save_model(model, optimizer, filepath)

Reference: Hugging Face guide on training in native PyTorch — https://huggingface.co/docs/transformers/training#train-in-native-pytorch

In [7]:
def eval_singletask_model(model, device, dataset, task, flag):
    '''
    Evaluate `model` on one task over `dataset` and return its accuracy.

    model:   NLP_Model-style module; it and its wrapped encoder (`model.model`)
             are switched to eval mode and left that way.
    device:  device passed through to the model's forward call.
    dataset: table with `sentence1`, `sentence2` and the task's label column
             (`is_duplicate` for task 0, `similarity` otherwise).
    task:    0 for paraphrase, anything else is treated as similarity.
    flag:    label shown in the progress bar (e.g. 'train', 'dev', 'test').

    Accuracy is the fraction of rows whose rounded prediction equals the label;
    for the similarity task the label is rounded as well, so a prediction
    counts as correct when it rounds to the same integer as the gold score.
    '''
    label_column = 'is_duplicate' if task == 0 else 'similarity'

    model.eval()
    model.model.eval()

    truth = []
    predictions = []
    with torch.no_grad():
        for _, batch in tqdm(get_batches(dataset), desc=f"{flag} eval"):
            truth.extend(batch[label_column])
            # Call the module (not .forward) so nn.Module hooks are honoured.
            scores = model(batch['sentence1'], batch['sentence2'], task, device)
            predictions.extend(scores.detach().cpu().numpy().flatten())

    truth = np.array(truth).flatten()
    predictions = np.round(np.array(predictions).flatten())
    if task != 0:
        # Gold similarity scores are real-valued; compare at integer granularity.
        truth = np.round(truth)
    accuracy = (truth == predictions).mean()
    return accuracy


def train_multitask_model(para_train, para_dev, sts_train, sts_dev, filepath):
    '''
    Train one NLP_Model on both tasks jointly and checkpoint the best epoch.

    Each step draws one paraphrase batch and one similarity batch, and
    minimizes the average of binary cross-entropy (paraphrase) and
    mean-squared error (similarity) with AdamW. After every epoch the
    train/dev accuracy of both tasks is printed, and save_model is called
    whenever the mean of the two dev accuracies matches or beats the best so far.

    para_train / para_dev: tables with `sentence1`, `sentence2`, `is_duplicate`.
    sts_train / sts_dev:   tables with `sentence1`, `sentence2`, `similarity`.
    filepath: checkpoint directory handed to save_model.
    Returns nothing.
    '''
    # Use the GPU when there is one instead of failing outright on CPU-only hosts.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    model = NLP_Model(transformer)
    model = model.to(device)

    # NOTE(review): model.parameters() presumably already yields the encoder's
    # parameters, so they appear twice in this list. The layout is kept because
    # load_model rebuilds the optimizer the same way before loading its state.
    optimizer = AdamW(list(model.parameters()) + list(transformer.parameters()), lr=3e-5) #~SGD with weight decay 0.01
    best_dev_acc = 0

    # Baseline accuracy before any update.
    train_para_acc = eval_singletask_model(model, device, para_train, 0, 'train')
    dev_para_acc = eval_singletask_model(model, device, para_dev, 0, 'dev')
    train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
    dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
    print(f"epoch number: 0, para train acc: {train_para_acc}, para dev acc: {dev_para_acc}, sts train acc: {train_sts_acc}, sts dev acc: {dev_sts_acc}")

    train_acc = (train_para_acc + train_sts_acc) / 2
    dev_acc = (dev_para_acc + dev_sts_acc) / 2
    print(f"epoch number: 0, avg train acc: {train_acc}, avg dev acc: {dev_acc}")

    for epoch in range(NUM_EPOCHS):
        # Evaluation switches both modules to eval mode, so re-enable training each epoch.
        model.train()
        transformer.train()

        # zip stops at the shorter stream, so any surplus batches of the longer
        # dataset are not visited. A single progress bar wraps the paired stream.
        paired_batches = zip(get_batches(para_train), get_batches(sts_train))
        for (_, para_batch), (_, sts_batch) in tqdm(paired_batches, desc='train'):
            b_para_sentences1, b_para_sentences2, b_para_labels = para_batch['sentence1'], para_batch['sentence2'], para_batch['is_duplicate']
            b_sts_sentences1, b_sts_sentences2, b_sts_labels = sts_batch['sentence1'], sts_batch['sentence2'], sts_batch['similarity']

            optimizer.zero_grad()

            # Call the module (not .forward) so nn.Module hooks are honoured.
            para_probs = model(b_para_sentences1, b_para_sentences2, 0, device).flatten()
            sts_scores = model(b_sts_sentences1, b_sts_sentences2, 1, device).flatten()

            b_para_labels = torch.as_tensor(b_para_labels.values, dtype=torch.float32)
            b_para_labels = b_para_labels.to(device)
            b_sts_labels = torch.as_tensor(b_sts_labels.values, dtype=torch.float32)
            b_sts_labels = b_sts_labels.to(device)

            # Equal-weight average of the two task losses.
            para_loss = F.binary_cross_entropy(para_probs, b_para_labels, reduction='mean')
            sts_loss = F.mse_loss(sts_scores, b_sts_labels, reduction='mean')
            loss = (para_loss + sts_loss) / 2
            loss.backward()
            optimizer.step()

        train_para_acc = eval_singletask_model(model, device, para_train, 0, 'train')
        dev_para_acc = eval_singletask_model(model, device, para_dev, 0, 'dev')
        train_sts_acc = eval_singletask_model(model, device, sts_train, 1, 'train')
        dev_sts_acc = eval_singletask_model(model, device, sts_dev, 1, 'dev')
        print(f"epoch number: {epoch + 1}, para train acc: {train_para_acc}, para dev acc: {dev_para_acc}, sts train acc: {train_sts_acc}, sts dev acc: {dev_sts_acc}")

        train_acc = (train_para_acc + train_sts_acc) / 2
        dev_acc = (dev_para_acc + dev_sts_acc) / 2
        print(f"epoch number: {epoch + 1}, avg train acc: {train_acc}, avg dev acc: {dev_acc}")

        # ">=" means a tie with the best averaged dev accuracy also overwrites the checkpoint.
        if dev_acc >= best_dev_acc:
            best_dev_acc = dev_acc
            print('New best model. Saving.')
            save_model(model, optimizer, filepath)

In [8]:
def test_singletask_model(filepath1, filepath2):
    '''
    Train the two single-task models, reload their checkpoints, and print test accuracy.

    filepath1: checkpoint directory for the paraphrase model.
    filepath2: checkpoint directory for the similarity model.

    Reads the module-level train/dev/test splits rather than taking them as arguments.
    '''
    # Each trainer writes its checkpoint to the given directory.
    train_singletask_para_model(para_train, para_dev, filepath1)
    train_singletask_sts_model(sts_train, sts_dev, filepath2)

    gpu = torch.device('cuda')

    # Only the restored models are needed here; the optimizers are discarded.
    paraphrase_model = load_model(filepath1, gpu)[0]
    similarity_model = load_model(filepath2, gpu)[0]

    para_acc = eval_singletask_model(paraphrase_model, gpu, para_test, 0, 'test')
    sts_acc = eval_singletask_model(similarity_model, gpu, sts_test, 1, 'test')

    print(f'Final test accuracy. PARA: {para_acc}, STS: {sts_acc}.')

In [9]:
# Train both single-task models, then print their accuracy on the held-out test splits.
test_singletask_model('./models/para_single', './models/sts_single')

  super().__init__(params, defaults)
train eval: 6it [00:15,  2.56s/it]
dev eval: 1it [00:01,  1.15s/it]


epoch number: 0, para train accuracy: 0.6296357615894039, para dev accuracy: 0.6241299303944315


train: 6it [00:14,  2.48s/it]
train eval: 6it [00:15,  2.52s/it]
dev eval: 1it [00:01,  1.17s/it]


epoch number: 1, para train accuracy: 0.6301324503311259, para dev accuracy: 0.6218097447795824
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:15,  2.55s/it]
train eval: 6it [00:15,  2.57s/it]
dev eval: 1it [00:01,  1.19s/it]


epoch number: 2, para train accuracy: 0.6753311258278145, para dev accuracy: 0.642691415313225
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:15,  2.59s/it]
train eval: 6it [00:15,  2.60s/it]
dev eval: 1it [00:01,  1.21s/it]


epoch number: 3, para train accuracy: 0.7192052980132451, para dev accuracy: 0.6705336426914154
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:15,  2.63s/it]
train eval: 6it [00:15,  2.65s/it]
dev eval: 1it [00:01,  1.24s/it]


epoch number: 4, para train accuracy: 0.729635761589404, para dev accuracy: 0.6844547563805105
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:16,  2.68s/it]
train eval: 6it [00:16,  2.70s/it]
dev eval: 1it [00:01,  1.24s/it]


epoch number: 5, para train accuracy: 0.7437086092715232, para dev accuracy: 0.6937354988399071
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:16,  2.70s/it]
train eval: 6it [00:16,  2.71s/it]
dev eval: 1it [00:01,  1.26s/it]


epoch number: 6, para train accuracy: 0.7516556291390728, para dev accuracy: 0.7053364269141531
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:16,  2.73s/it]
train eval: 6it [00:16,  2.72s/it]
dev eval: 1it [00:01,  1.26s/it]


epoch number: 7, para train accuracy: 0.7622516556291391, para dev accuracy: 0.7122969837587007
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:16,  2.72s/it]
train eval: 6it [00:16,  2.72s/it]
dev eval: 1it [00:01,  1.26s/it]


epoch number: 8, para train accuracy: 0.7687086092715232, para dev accuracy: 0.7192575406032483
New best model. Saving.
saved the model to ./models/para_single


train: 6it [00:16,  2.72s/it]
train eval: 6it [00:16,  2.76s/it]
dev eval: 1it [00:01,  1.26s/it]


epoch number: 9, para train accuracy: 0.7754966887417218, para dev accuracy: 0.7053364269141531


train: 6it [00:16,  2.75s/it]
train eval: 6it [00:16,  2.76s/it]
dev eval: 1it [00:01,  1.28s/it]


epoch number: 10, para train accuracy: 0.7836092715231788, para dev accuracy: 0.7006960556844548


train: 6it [00:16,  2.78s/it]
train eval: 6it [00:16,  2.79s/it]
dev eval: 1it [00:01,  1.28s/it]


epoch number: 11, para train accuracy: 0.7928807947019868, para dev accuracy: 0.691415313225058


train: 6it [00:16,  2.80s/it]
train eval: 6it [00:16,  2.79s/it]
dev eval: 1it [00:01,  1.29s/it]


epoch number: 12, para train accuracy: 0.7988410596026491, para dev accuracy: 0.6960556844547564


train eval: 6it [00:14,  2.50s/it]
dev eval: 1it [00:01,  1.08s/it]


epoch number: 0, sts train acc: 0.2195364238410596, sts dev acc: 0.25986078886310904


train: 6it [00:15,  2.50s/it]
train eval: 6it [00:14,  2.50s/it]
dev eval: 1it [00:01,  1.09s/it]


epoch number: 1, sts train acc: 0.22913907284768212, sts dev acc: 0.26218097447795824
New best model. Saving.
saved the model to ./models/sts_single


train: 6it [00:14,  2.48s/it]
train eval: 6it [00:14,  2.50s/it]
dev eval: 1it [00:01,  1.08s/it]


epoch number: 2, sts train acc: 0.22483443708609271, sts dev acc: 0.2482598607888631


train: 6it [00:15,  2.52s/it]
train eval: 6it [00:15,  2.51s/it]
dev eval: 1it [00:01,  1.09s/it]


epoch number: 3, sts train acc: 0.23410596026490066, sts dev acc: 0.25986078886310904


train: 6it [00:14,  2.49s/it]
train eval: 6it [00:14,  2.48s/it]
dev eval: 1it [00:01,  1.07s/it]


epoch number: 4, sts train acc: 0.24056291390728476, sts dev acc: 0.25986078886310904


train: 6it [00:14,  2.49s/it]
train eval: 6it [00:15,  2.50s/it]
dev eval: 1it [00:01,  1.09s/it]


epoch number: 5, sts train acc: 0.24536423841059601, sts dev acc: 0.25754060324825984


train: 6it [00:15,  2.53s/it]
train eval: 6it [00:14,  2.50s/it]
dev eval: 1it [00:01,  1.09s/it]


epoch number: 6, sts train acc: 0.2544701986754967, sts dev acc: 0.24361948955916474


train: 6it [00:15,  2.50s/it]
train eval: 6it [00:14,  2.49s/it]
dev eval: 1it [00:01,  1.08s/it]


epoch number: 7, sts train acc: 0.25612582781456955, sts dev acc: 0.24361948955916474


train: 6it [00:14,  2.49s/it]
train eval: 6it [00:14,  2.48s/it]
dev eval: 1it [00:01,  1.08s/it]


epoch number: 8, sts train acc: 0.2589403973509934, sts dev acc: 0.2482598607888631


train: 6it [00:14,  2.48s/it]
train eval: 6it [00:14,  2.49s/it]
dev eval: 1it [00:01,  1.07s/it]


epoch number: 9, sts train acc: 0.26109271523178806, sts dev acc: 0.2529002320185615


train: 6it [00:14,  2.50s/it]
train eval: 6it [00:14,  2.49s/it]
dev eval: 1it [00:01,  1.08s/it]


epoch number: 10, sts train acc: 0.2662251655629139, sts dev acc: 0.2482598607888631


train: 6it [00:14,  2.49s/it]
train eval: 6it [00:14,  2.50s/it]
dev eval: 1it [00:01,  1.08s/it]


epoch number: 11, sts train acc: 0.2683774834437086, sts dev acc: 0.26218097447795824
New best model. Saving.


In [None]:
def test_multitask_model(filepath):
    '''
    Train the joint model, reload its checkpoint, and print test accuracy for both tasks.

    filepath: checkpoint directory used for both saving and reloading.

    Reads the module-level train/dev/test splits rather than taking them as arguments.
    '''
    train_multitask_model(para_train, para_dev, sts_train, sts_dev, filepath)

    gpu = torch.device('cuda')

    # Only the restored model is needed here; the optimizer is discarded.
    restored = load_model(filepath, gpu)[0]

    para_acc = eval_singletask_model(restored, gpu, para_test, 0, 'test')
    sts_acc = eval_singletask_model(restored, gpu, sts_test, 1, 'test')

    print(f'Final test accuracy. PARA: {para_acc}, STS: {sts_acc}.')

In [None]:
# Train the joint two-task model, then print its accuracy on the held-out test splits.
test_multitask_model('./models/multitask')

In [None]:
class NLP_Model_CAGrad(nn.Module):
    """Two-task sentence-pair scorer that handles both tasks in one forward call.

    Same layers as NLP_Model (shared encoder, one projection + interaction
    head per task), but `forward` takes a paraphrase batch and a similarity
    batch together and returns both sets of scores.

    NOTE(review): the previous body could not run — `__init__` called
    `super(NLP_Model, self)` (wrong class) and `forward` referenced the
    undefined names `task`, `sentences1` and `sentences2`. The tuple return
    used here is an assumption about the intended contract; confirm it
    against the CAGrad training loop that will consume it.
    """

    def __init__(self, model):
        """Store the encoder `model` (must expose ``encode(list_of_str)``) and build both heads."""
        super().__init__()
        self.model = model
        self.dropout = nn.Dropout(DROPOUT_PROB)
        self.paraphrase_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.paraphrase_linear_interact = nn.Linear(INPUT_SIZE, N_PARAPHRASE_CLASSES)
        self.similarity_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.similarity_linear_interact = nn.Linear(INPUT_SIZE, N_SIMILARITY_CLASSES)

    def _embed(self, sentences, device):
        """Encode a batch of sentences and return the embeddings as a tensor on `device`."""
        return torch.as_tensor(self.model.encode(sentences.tolist())).to(device)

    def _score_pair(self, emb1, emb2, project, interact):
        """Run one task head over a pair of embedding batches; returns sigmoid scores in [0, 1]."""
        emb1 = F.relu(project(self.dropout(emb1)))
        emb2 = F.relu(project(self.dropout(emb2)))
        combined = self.dropout(torch.cat((emb1, emb2), dim=-1))
        return torch.sigmoid(interact(combined))

    def forward(self, para_sentences1, para_sentences2, sts_sentences1, sts_sentences2, device, compute_grad=False):
        '''
        Score a paraphrase batch and a similarity batch in one call.

        Returns `(para_scores, sts_scores)`: paraphrase scores in [0, 1] and
        similarity scores in [0, 5], one row per input pair.

        compute_grad is accepted for interface compatibility but is not used
        yet. TODO(review): wire it up once the CAGrad update is implemented.
        '''
        para_emb1 = self._embed(para_sentences1, device)
        para_emb2 = self._embed(para_sentences2, device)
        sts_emb1 = self._embed(sts_sentences1, device)
        sts_emb2 = self._embed(sts_sentences2, device)

        para_scores = self._score_pair(
            para_emb1, para_emb2, self.paraphrase_linear, self.paraphrase_linear_interact
        )
        # Similarity scores are stretched from [0, 1] to the [0, 5] label range.
        sts_scores = self._score_pair(
            sts_emb1, sts_emb2, self.similarity_linear, self.similarity_linear_interact
        ) * 5
        return para_scores, sts_scores