In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [2]:
import csv
data_path = '/home/ubuntu/NLP-brain-biased-robustness/data/stsb/stsbenchmark/'

#wget https://data.deepai.org/Stsbenchmark.zip

def read_csv(csv_file):
    file = open(csv_file)
    csvreader = csv.reader(file, delimiter="\t")
    header = next(csvreader)
    rows = []
    for row in csvreader:
        rows.append(row)
    file.close()
    return rows

In [3]:
train_set = read_csv(data_path+'sts-train.csv')
dev_set = read_csv(data_path+'sts-dev.csv')
test_set = read_csv(data_path+'sts-test.csv')

In [4]:
def split_data():
    headlines = []
    images = []
    MSRpar = []
    MSRvid = []
    for dataset in [train_set, dev_set, test_set]:
        for i in range(len(dataset)):
            if dataset[i][1] == 'headlines':
                headlines.append(dataset[i])
            if dataset[i][1] == 'images':
                images.append(dataset[i])
            if dataset[i][1] == 'MSRpar':
                MSRpar.append(dataset[i])
            if dataset[i][1] == 'MSRvid':
                MSRvid.append(dataset[i])
    return headlines, images, MSRpar, MSRvid

In [5]:
headlines, images, MSRpar, MSRvid = split_data()

In [53]:
def create_dataset(split):
    dataset = []
    for example in split:
        if not len(example) < 7:
            data = {}
            data['sentence_1'] = example[5]
            data['sentence_2'] = example[6]
            data['labels'] = float(example[4])
            dataset.append(data)
    return dataset

headlines_dataset = create_dataset(headlines)
images_dataset = create_dataset(images)
MSRpar_dataset = create_dataset(MSRpar)
MSRvid_dataset = create_dataset(MSRvid)

headlines_dataloader = DataLoader(headlines_dataset)
images_dataloader = DataLoader(images_dataset)
MSRpar_dataloader = DataLoader(MSRpar_dataset)
MSRvid_dataloader = DataLoader(MSRvid_dataset)

In [95]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import BertModel
from datasets import load_dataset

from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm


class PlaceHolderBERT(nn.Module):
    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    def forward(self, x):
        embeddings = self.tokenizer(x, return_tensors='pt', padding=True, truncation=True)
        embeddings.to(self.device)
        representations = self.bert(**embeddings).last_hidden_state
        cls_representation = representations[:,0,:]
        return cls_representation
    
    
def train(model, dataloader, num_epochs=1): #can scrap keyword
    #optimizer as usual
    optimizer = AdamW(model.parameters(), lr=5e-5)
    loss_function = torch.nn.MSELoss()
    #learning rate scheduler
    num_training_steps = num_epochs * len(dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    #auto logging; progress bar
    progress_bar = tqdm(range(num_training_steps))
    
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)

    #training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in dataloader: #tryin unpacking text from 'labels' as in model development
            #batch = {k: v.to(device) for k, v in batch.items()}
            #features = {k: v for k, v in batch.items() if k != 'labels'}
            vec_1 = model(batch['sentence_1'])
            vec_2 = model(batch['sentence_2'])
            cosine_similarity_times_5 = cos(vec_1, vec_2) * 5
            targets = batch['labels'].float().to(device)
            loss = loss_function(cosine_similarity_times_5, targets) #replace .loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            

def evaluate(model, dataloader):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    cosine_similarities = []
    gold = []
    for batch in dataloader:
        with torch.no_grad():
            vec_1 = model(batch['sentence_1'])
            vec_2 = model(batch['sentence_2'])
            cosine_similarity = cos(vec_1, vec_2)
            golds = batch['labels'].float()
            for idx, similarity in enumerate(cosine_similarity):
                cosine_similarities.append(similarity)
                gold.append(golds[idx])
    torch_cosines = torch.tensor(cosine_similarities)
    torch_gold = torch.tensor(gold)
    
    torch_cosines = torch_cosines.reshape((1,torch_cosines.shape[0]))
    torch_gold = torch_gold.reshape((1,torch_gold.shape[0]))
    
    combined = torch.cat((torch_cosines, torch_gold), axis=0)
    
    return torch.corrcoef(combined)


In [58]:
model = PlaceHolderBERT()
train(model, headlines_dataloader)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2249 [00:00<?, ?it/s]

In [96]:
evaluate(model, headlines_dataloader) #still have to do train/test splits

tensor([[1.0000, 0.8946],
        [0.8946, 1.0000]])

In [97]:
evaluate(model, images_dataloader)

tensor([[1.0000, 0.8254],
        [0.8254, 1.0000]])

In [98]:
evaluate(model, MSRpar_dataloader)

tensor([[1.0000, 0.5131],
        [0.5131, 1.0000]])

In [99]:
evaluate(model, MSRvid_dataloader)

tensor([[1.0000, 0.8309],
        [0.8309, 1.0000]])