In [155]:
import pandas as pd
from random import sample
import random
import numpy as np

# Seed every RNG whose state save_model() later checkpoints that is in scope in this cell.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

DATA_DIR = './data'
# Row caps applied after dropna(); presumably temporary debugging limits (they were
# flagged with '#!!!!' originally). Set to None to use the full split.
PARA_TRAIN_LIMIT = 10000
PARA_DEV_LIMIT = 10


def _load_tsv(filename, limit=None):
    '''Read a tab-separated file from DATA_DIR, drop rows with missing values, optionally keep the first `limit` rows.'''
    frame = pd.read_csv(f'{DATA_DIR}/{filename}', sep="\t").dropna()
    return frame if limit is None else frame.head(limit)


para_train = _load_tsv('quora-train.csv', limit=PARA_TRAIN_LIMIT)
sts_train = _load_tsv('sts-train.csv')
para_dev = _load_tsv('quora-dev.csv', limit=PARA_DEV_LIMIT)
sts_dev = _load_tsv('sts-dev.csv')
para_test = _load_tsv('quora-test-student.csv')
sts_test = _load_tsv('sts-test-student.csv')

In [21]:
from sentence_transformers import SentenceTransformer
# sentences = ["This is an example sentence", "Each sentence is converted"]

# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# embeddings = model.encode(sentences)
# print(embeddings)
# print(para_train.head())

   Unnamed: 0                         id  \
0      321296  a4da64a4e943bf8b008d174ca   
1      373075  f965ab2d29d51008ab3872f47   
2      213557  5aca073f4a977d55176f68572   
3      209343  a8abbd08d3401287da42acfb9   
4      389061  65dffaf8ac0ed7f3ebfcf0b02   

                                           sentence1  \
0               How can I master myself in geometry?   
1    Can deleted pictures on Instagram be recovered?   
2  Who is the best prime minister India has ever ...   
3  What qualifications do I need to get a job in ...   
4              In what situations is it okay to lie?   

                                           sentence2  is_duplicate  
0          How can I master geometry for the CAT-14?           0.0  
1            How do I delete a picture on Instagram?           0.0  
2  Who is the best prime minister of the india ti...           1.0  
3  What qualifications do I need to be able to wo...           1.0  
4                            When is it okay to lie? 

In [149]:
from torch import nn
import torch
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.utils.data import DataLoader, Dataset


N_PARAPHRASE_CLASSES = 1
N_SIMILARITY_CLASSES = 5
DROPOUT_PROB = 0.5
INPUT_SIZE = 768  # width of the sentence embedding fed to the task heads


class NLP_Model(nn.Module):
    '''
    Two task-specific heads (paraphrase, similarity) on top of a sentence encoder.

    `model` must expose `encode(sentences)` returning a NumPy array whose last
    dimension is INPUT_SIZE. Because the embeddings pass through NumPy, no
    gradient reaches the encoder: only the linear heads defined here are trained.
    '''

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.dropout = nn.Dropout(DROPOUT_PROB)
        # Each sentence is projected to half width so the concatenated pair is INPUT_SIZE wide again.
        self.paraphrase_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.paraphrase_linear_interact = nn.Linear(INPUT_SIZE, N_PARAPHRASE_CLASSES)
        self.similarity_linear = nn.Linear(INPUT_SIZE, INPUT_SIZE // 2)
        self.similarity_linear_interact = nn.Linear(INPUT_SIZE, N_SIMILARITY_CLASSES)

    def _encode(self, sentence, device):
        '''Embed `sentence` with the wrapped encoder and move the result to `device`.'''
        return torch.from_numpy(self.model.encode(sentence)).to(device)

    def _pair_features(self, emb1, emb2, projection):
        '''Apply dropout -> shared projection -> ReLU to each embedding, then concatenate and apply dropout.'''
        left = F.relu(projection(self.dropout(emb1)))
        right = F.relu(projection(self.dropout(emb2)))
        return self.dropout(torch.cat((left, right), dim=-1))

    def forward(self, sentence1, sentence2, task, device):
        '''
        Task 0 is para. Task 1 is similarity.

        Returns sigmoid probabilities (last dim N_PARAPHRASE_CLASSES) for task 0
        and a softmax distribution (last dim N_SIMILARITY_CLASSES) for task 1.
        The task-1 output is already normalised, so it is not a logit vector.

        Raises:
            ValueError: if `task` is neither 0 nor 1.
        '''
        if task == 0:
            projection, head = self.paraphrase_linear, self.paraphrase_linear_interact
        elif task == 1:
            projection, head = self.similarity_linear, self.similarity_linear_interact
        else:
            # Fail loudly (and before paying for encoding) instead of silently returning None.
            raise ValueError(f"unknown task {task!r}; expected 0 (paraphrase) or 1 (similarity)")

        emb1 = self._encode(sentence1, device)
        emb2 = self._encode(sentence2, device)
        scores = head(self._pair_features(emb1, emb2, projection))

        if task == 0:
            return torch.sigmoid(scores)
        return F.softmax(scores, dim=-1)


In [150]:
def save_model(model, optimizer, config, filepath):
    '''
    Write a training checkpoint to `filepath` with torch.save.

    The checkpoint bundles the model and optimizer state dicts, the given
    `config`, and the current states of the Python, NumPy and torch RNGs.
    '''
    checkpoint = dict(
        model=model.state_dict(),
        optim=optimizer.state_dict(),
        model_config=config,
        system_rng=random.getstate(),
        numpy_rng=np.random.get_state(),
        torch_rng=torch.random.get_rng_state(),
    )
    torch.save(checkpoint, filepath)
    print(f"save the model to {filepath}")

In [152]:
from torch.optim import AdamW
from tqdm import tqdm
NUM_EPOCHS = 10

def train_singletask_para_model(para_train, para_dev):
    '''
    Train the paraphrase head of NLP_Model, one sentence pair per optimizer step.

    Uses the AdamW optimizer and binary cross-entropy loss. Accuracy on
    `para_train` and `para_dev` is printed before training and after every epoch.

    Both frames must provide 'sentence1', 'sentence2' and 'is_duplicate' columns.

    TODO: the trained model is not saved yet (see save_model).
    '''
    # Fall back to CPU so the notebook still runs on machines without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    # The transformer becomes a sub-module of NLP_Model, so .to(), .train() and
    # .parameters() on the wrapper already cover it.
    model = NLP_Model(transformer).to(device)

    # Pass each parameter exactly once: concatenating transformer.parameters() onto
    # model.parameters() lists every encoder tensor twice, which the optimizer warns about.
    # AdamW applies its default weight decay (0.01).
    optimizer = AdamW(model.parameters(), lr=1e-4)

    train_para_accuracy = test_singletask_model(model, device, para_train, 0, 0)
    dev_para_accuracy = test_singletask_model(model, device, para_dev, 0, 1)
    print(f"epoch number: 0, para train accuracy: {train_para_accuracy}, para dev accuracy: {dev_para_accuracy}")

    for epoch in range(NUM_EPOCHS):
        # The evaluation helper leaves the model in eval mode, so re-enable dropout each epoch.
        model.train()

        for _, row in tqdm(para_train.iterrows(), total=len(para_train), desc='train'):
            optimizer.zero_grad()
            probs = model(row['sentence1'], row['sentence2'], 0, device)

            # Build the target with the same dtype/device as the prediction;
            # binary_cross_entropy expects matching floating-point tensors.
            target = torch.tensor([row['is_duplicate']], dtype=probs.dtype, device=device)

            loss = F.binary_cross_entropy(probs, target, reduction='mean')
            loss.backward()
            optimizer.step()

        train_para_accuracy = test_singletask_model(model, device, para_train, 0, 0)
        dev_para_accuracy = test_singletask_model(model, device, para_dev, 0, 1)
        print(f"epoch number: {epoch + 1}, para train accuracy: {train_para_accuracy}, para dev accuracy: {dev_para_accuracy}")

     

def train_singletask_sts_model():
    '''
    Placeholder for STS fine-tuning; currently it only loads the sentence encoder.

    Planned recipe (not implemented yet): AdamW optimizer, multi-class
    cross-entropy loss, and saving the model at the end to a specific path.
    '''
    # Fall back to CPU so the stub does not fail on machines without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    transformer.to(device)

    # TODO: build the STS training loop. Note that a DataLoader cannot wrap a pandas
    # DataFrame directly (integer indexing on a DataFrame selects by column label);
    # iterate rows as the paraphrase trainer does, or wrap the frame in a Dataset first.

Reference — Hugging Face guide to training in native PyTorch: https://huggingface.co/docs/transformers/training#train-in-native-pytorch

In [153]:
def test_singletask_model(model, device, dataset, task, dev_flag):
    '''
    given dataloader, 2 task-specific finetuned models, and device
    return the accuracy for para and for sts
    '''
    model.eval()
    model.model.eval()
    with torch.no_grad():
        #saved = torch.load(filepath)
        #config = saved['model_config']
        #model = NLP_Model(config, task)
        #model.load_state_dict(saved['model'])
        #model = model.to(device)
        #para_saved = torch.load(para_filepath)
        #sts_saved = torch.load(sts_filepath)
        #para_config = para_saved['model_config']
        #sts_config = sts_saved['model_config']
        #para_model = NLP_Model(para_config, 0)
        #sts_model = NLP_Model(sts_config, 1)
        #para_model.load_state_dict(para_saved['model'])
        #sts_model.load_state_dict(sts_saved['model'])
        #para_model = para_model.to(device)
        #sts_model = sts_model.to(device)
        #print(f"Loaded model")

        #para_test_dataloader = DataLoader(para_test, shuffle=True, batch_size=16)
        #para_dev_dataloader = DataLoader(para_dev, shuffle=True, batch_size=16)
        #sts_test_dataloader = DataLoader(sts_test, shuffle=True, batch_size=16)
        #sts_dev_dataloader = DataLoader(sts_dev, shuffle=True, batch_size=16)
        truth = []
        predictions = []
        for step, batch in tqdm(dataset.iterrows(), desc=f"{'dev ' if dev_flag else 'train '}eval"):
            b_sentence1, b_sentence2, b_labels = batch['sentence1'], batch['sentence2'], batch['is_duplicate']
            truth.extend([b_labels])
            logits = model.forward(b_sentence1, b_sentence2, task, device)
            logits = logits.detach().cpu().numpy()
            if task == 0:
                new_predictions = np.round(logits).flatten()
            else:
                new_predictions = np.argmax(logits, axis=1).flatten()
            predictions.extend(new_predictions)
        accuracy = (np.array(truth).flatten() == np.array(predictions).flatten()).mean()
    return accuracy

def train_multitask_model():
    '''
    Placeholder for joint paraphrase + STS training; not implemented yet.

    Planned recipe: AdamW optimizer; binary cross-entropy loss for para and
    multi-class cross-entropy loss for sts, with the two losses summed;
    save the model at the end to a specific path.
    '''
    return None

def test_multitask_model():
    '''
    Placeholder for evaluating the multitask model; not implemented yet.

    Planned contract: given a dataloader, the multitask fine-tuned model and a
    device, return the accuracy for para and for sts.
    '''
    return None

In [156]:
# Run paraphrase training on the Quora train/dev frames loaded in the first cell
# (both are truncated there with .head(), so the accuracies below are from a reduced run).
train_singletask_para_model(para_train, para_dev)

  super(AdamW, self).__init__(params, defaults)
train eval: 10000it [03:54, 42.70it/s]
dev eval: 10it [00:00, 42.57it/s]


epoch number: 0, para train accuracy: 0.4106, para dev accuracy: 0.4


train: 10000it [04:07, 40.36it/s]
train eval: 10000it [03:54, 42.59it/s]
dev eval: 10it [00:00, 40.07it/s]


epoch number: 1, para train accuracy: 0.6828, para dev accuracy: 0.8


train: 1081it [00:26, 40.58it/s]


KeyboardInterrupt: 