In [6]:
import pandas as pd
from random import sample
import random
import numpy as np

# Seed Python's built-in RNG so any `random`-based sampling in this notebook is repeatable.
random.seed(0)

# Every split is a tab-separated file. Rows with any missing field are dropped
# right after reading, so downstream cells never see NaN values.

# Training splits
para_train = pd.read_csv('./data/quora-train.csv', sep="\t").dropna()
sts_train = pd.read_csv('./data/sts-train.csv', sep="\t").dropna()

# Development splits
para_dev = pd.read_csv('./data/quora-dev.csv', sep="\t").dropna()
sts_dev = pd.read_csv('./data/sts-dev.csv', sep="\t").dropna()

# Test splits
para_test = pd.read_csv('./data/quora-test.csv', sep="\t").dropna()
sts_test = pd.read_csv('./data/sts-test.csv', sep="\t").dropna()

# Disabled experiment (references `quora_df` / `sts_df`, which this cell does not define):
# rand_data = sample(range(1, len(quora_df)-1), len(sts_df)-1)
# rand_data.insert(0, 0)
# quora_df_cut = pd.DataFrame([quora_df.iloc[i] for i in rand_data])

In [7]:
from sentence_transformers import SentenceTransformer
# sentences = ["This is an example sentence", "Each sentence is converted"]

# model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
# embeddings = model.encode(sentences)
# print(embeddings)

In [8]:
# quora_1 = list(quora_df_cut['sentence1'])
# quora_2 = list(quora_df_cut['sentence2'])
# sts_1 = list(sts_df['sentence1'])
# sts_2 = list(sts_df['sentence2'])

In [9]:
# sentence_groups = [quora_1, quora_2, sts_1, sts_2]
# embeddings = [model.encode(group, show_progress_bar=True) for group in sentence_groups]
# quora_embeddings_1 = pd.DataFrame(embeddings[0])
# quora_embeddings_2 = pd.DataFrame(embeddings[1])
# sts_embeddings_1 = pd.DataFrame(embeddings[2])
# sts_embeddings_2 = pd.DataFrame(embeddings[3])
# quora_embeddings_1.to_csv("quora_embeddings_1.csv")
# quora_embeddings_2.to_csv("quora_embeddings_2.csv")
# sts_embeddings_1.to_csv("sts_embeddings_1.csv")
# sts_embeddings_2.to_csv("sts_embeddings_2.csv")

In [10]:
from torch import nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Output width of the paraphrase head: a single unit that NLP_Model.forward passes through a sigmoid.
N_PARAPHRASE_CLASSES = 1
# Output width of the similarity head: NLP_Model.forward applies a softmax over this many classes.
N_SIMILARITY_CLASSES= 5
# Probability used for the nn.Dropout layer inside NLP_Model.
DROPOUT_PROB = 0.5
# Width of each sentence embedding fed into the task heads; it has to equal the
# encoder's output size (presumably 768 for all-mpnet-base-v2 — TODO confirm).
INPUT_SIZE = 768

class NLP_Model(nn.Module):
    '''
    Two task-specific heads (paraphrase, similarity) on top of a sentence encoder.

    Each sentence of a pair is embedded by ``model.encode``, projected by a
    per-task linear layer shared between both sentences, and the two
    projections are concatenated and passed through a per-task output layer.

    Args:
        model: object exposing ``encode(sentences)`` that returns a NumPy array
            of shape ``(batch, INPUT_SIZE)``.

    NOTE(review): the embeddings arrive as a NumPy array, so no gradient can
    flow back into the encoder from these heads — confirm whether fine-tuning
    the encoder is intended.
    NOTE(review): ``forward`` returns probabilities (sigmoid / softmax), not
    logits; pick the loss function accordingly.
    '''

    def __init__(self, model):
        super().__init__()
        self.model = model
        self.dropout = nn.Dropout(DROPOUT_PROB)

        # Each sentence is projected to `hidden` features; the interaction layer
        # consumes both projections side by side, hence 2 * hidden inputs
        # (equal to INPUT_SIZE whenever INPUT_SIZE is even).
        hidden = INPUT_SIZE // 2
        self.paraphrase_linear = nn.Linear(INPUT_SIZE, hidden)
        self.paraphrase_linear_interact = nn.Linear(2 * hidden, N_PARAPHRASE_CLASSES)
        self.similarity_linear = nn.Linear(INPUT_SIZE, hidden)
        self.similarity_linear_interact = nn.Linear(2 * hidden, N_SIMILARITY_CLASSES)

    def _embed(self, sentences, like):
        '''Encode `sentences` and return the result on the device/dtype of tensor `like`.'''
        encoded = torch.from_numpy(self.model.encode(sentences))
        # encode() yields a CPU array; without this move the heads fail as soon
        # as the module has been sent to another device with .to(device).
        return encoded.to(device=like.device, dtype=like.dtype)

    def forward(self, sentence1, sentence2, task):
        '''
        Score a batch of sentence pairs for one task.

        Args:
            sentence1: first sentences of the pairs, passed as-is to ``model.encode``.
            sentence2: second sentences of the pairs, passed as-is to ``model.encode``.
            task: 0 for paraphrase detection, 1 for similarity.

        Returns:
            task 0: tensor of shape ``(batch, N_PARAPHRASE_CLASSES)`` with sigmoid outputs.
            task 1: tensor of shape ``(batch, N_SIMILARITY_CLASSES)`` with softmax outputs.

        Raises:
            ValueError: if ``task`` is neither 0 nor 1.
        '''
        # Validate the task before paying for two encoder passes.
        if task == 0:
            project, interact = self.paraphrase_linear, self.paraphrase_linear_interact
        elif task == 1:
            project, interact = self.similarity_linear, self.similarity_linear_interact
        else:
            raise ValueError(f"unknown task {task!r}; expected 0 (paraphrase) or 1 (similarity)")

        sentence1 = self._embed(sentence1, project.weight)
        sentence2 = self._embed(sentence2, project.weight)

        # Same order of operations as before: dropout -> projection -> ReLU for
        # sentence 1, then for sentence 2, then dropout on the concatenation.
        sentence1 = F.relu(project(self.dropout(sentence1)))
        sentence2 = F.relu(project(self.dropout(sentence2)))
        combined = self.dropout(torch.cat((sentence1, sentence2), dim=-1))

        scores = interact(combined)
        if task == 0:
            return torch.sigmoid(scores)
        return F.softmax(scores, dim=-1)


In [11]:
def save_model(model, optimizer, config, filepath):
    '''
    Write a training checkpoint to `filepath` with torch.save.

    The checkpoint is a dict holding the model and optimizer state dicts, the
    given `config`, and the current states of the Python, NumPy and torch
    random number generators.
    '''
    checkpoint = dict(
        model=model.state_dict(),
        optim=optimizer.state_dict(),
        model_config=config,
        # RNG snapshots, captured at save time
        system_rng=random.getstate(),
        numpy_rng=np.random.get_state(),
        torch_rng=torch.random.get_rng_state(),
    )
    torch.save(checkpoint, filepath)
    print(f"save the model to {filepath}")

Reference: Hugging Face Transformers guide, "Train in native PyTorch" — https://huggingface.co/docs/transformers/training#train-in-native-pytorch

In [14]:
def test_singletask_model(para_filepath, sts_filepath, device, dataloader):
    '''
    Load the saved paraphrase and similarity checkpoints for evaluation (unfinished).

    Intended contract: given a dataloader, two saved models and a device,
    return the accuracy for para and for sts. As written, the function only
    rebuilds the two models and then returns None; no accuracy is computed.

    Args:
        para_filepath: path of the paraphrase checkpoint, read with torch.load.
        sts_filepath: path of the similarity checkpoint, read with torch.load.
        device: device both rebuilt models are moved to.
        dataloader: currently unused.
    '''
    with torch.no_grad():
        # Both checkpoints are expected to carry 'model_config' and 'model' keys.
        # NOTE(review): torch.load is called without map_location — confirm the
        # checkpoints can be loaded on the machine running the evaluation.
        para_saved = torch.load(para_filepath)
        sts_saved = torch.load(sts_filepath)
        para_config = para_saved['model_config']
        sts_config = sts_saved['model_config']
        # NOTE(review): NLP_Model.__init__ in this notebook accepts a single
        # `model` argument; these two-argument calls do not match it.
        para_model = NLP_Model(para_config, 0)
        sts_model = NLP_Model(sts_config, 1)
        para_model.load_state_dict(para_saved['model'])
        sts_model.load_state_dict(sts_saved['model'])
        para_model = para_model.to(device)
        sts_model = sts_model.to(device)
        print(f"Loaded models")

        # NOTE(review): this wraps the global `para_test` rather than the
        # `dataloader` argument, and the result is never iterated — confirm
        # which data source is intended and that it is indexable by position.
        para_test_dataloader = DataLoader(para_test, shuffle=True)
    pass

def train_multitask_model():
    '''
    Placeholder for joint training on both tasks; currently does nothing.

    Plan:
      * optimizer: AdamW
      * loss: binary cross-entropy for para plus multi-class cross-entropy
        for sts, summed into one objective
      * save the model to a specific path once training ends
    '''
    return None

def test_multitask_model():
    '''
    Placeholder for evaluating the multitask model; currently does nothing.

    Plan:
      * para: accuracy only
      * five-class task: accuracy and MSE
    '''
    return None

In [None]:
# Number of passes over the training data.
NUM_EPOCHS = 3

def train_singletask_para_model():
    '''
    Set up single-task training for the paraphrase head (training step unfinished).

    Builds the data loader, the sentence encoder, the NLP_Model wrapper and an
    AdamW optimizer, then runs an epoch loop whose body is still a TODO.

    Plan for the missing part:
      * binary cross-entropy loss
      * save the model to a specific path once training ends

    Returns:
        None.
    '''
    # Use the GPU when one is present, otherwise fall back to the CPU instead of
    # failing when the model is moved to the device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # DataLoader fetches samples by integer position, but `df[i]` on a DataFrame
    # is a column lookup; hand it one dict per row so batches collate by column.
    para_train_dataloader = DataLoader(
        para_train.to_dict('records'), shuffle=True, batch_size=16
    )

    transformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    # NOTE(review): NLP_Model only calls transformer.encode(); confirm that
    # putting the encoder in train mode has the intended effect there.
    transformer.train()

    model = NLP_Model(transformer)
    model = model.to(device)
    # model.parameters() is a generator, so printing it shows nothing useful;
    # report the number of trainable weights instead.
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"trainable parameters: {n_trainable}")

    # AdamW with weight decay 0.01, spelled out rather than left to the default.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)

    for epoch in range(NUM_EPOCHS):
        # TODO: iterate para_train_dataloader, compute the binary cross-entropy
        # loss, and step `optimizer`; then save the model after the last epoch.
        print(f"epoch {epoch + 1}/{NUM_EPOCHS}: training step not implemented yet")


     

def train_singletask_sts_model():
    '''
    Placeholder for single-task training on sts; currently does nothing.

    Plan:
      * optimizer: AdamW
      * loss: multi-class cross-entropy
      * save the model to a specific path once training ends
    '''
    return None