In [1]:
#turing data json into pandas dataframe
# wic_df = pd.read_json('WiC/train.jsonl', lines=True)
# print(wic_df.iloc[0:1])

In [2]:
import torch
from torch import nn
from torch import optim
import os
import string
import copy
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_transformers import *
import numpy as np
import json
import collections
import transformers
import pandas as pd
import random
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

In [3]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [15]:
BATCH_SIZE = 4
EPOCHS = 3
PATIENCE = 3
# Prepare Torch to use GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #torch.device("cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 970'

In [4]:
# Converts the jsonl files into json files
def load_json_objects_from_file(filename):
    json_objects = []
    #Iterates throught the jsonl files and appends them to the json list
    with open(filename, mode = "r") as jsonl_file:
        for line in jsonl_file:
            json_objects.append(json.loads(line))
    return json_objects

#This will find the word within the given roBERTa tokenized sentence
def find_word_in_tokenized_sentence(word,token_ids):
    decomposedWord = tokenizer.encode(word)
    #This will iterate through the token ids, ie the given tokenized sentence, and find the word
    for i in range(len(token_ids)):
        if token_ids[i] == decomposedWord[0] and token_ids[i:i+len(decomposedWord)] == decomposedWord:
            return (i,i+len(decomposedWord)-1)
    # This is the ouput when no matching pattern is found
    return (-1,-1)
  
def find_words_in_tokenized_sentences(wordList,token_ids):
    intList = []
    for word in wordList:
        if len(intList) == 0:
            intList.append(find_word_in_tokenized_sentence(word,token_ids))
        else:
            afterLastInterval = intList[-1][1]+1
            interv = find_word_in_tokenized_sentence(word,token_ids[afterLastInterval:])
            actualPositions = (interv[0] + afterLastInterval,interv[1]+afterLastInterval)
            intList.append(actualPositions)
    return intList

In [6]:
# Create a function to preprocess the WiC data
def wic_preprocessing(json_objects, training = True, shuffle_data = False, verbose = False):
    wic_sentences = []
    wic_encoded = []
    wic_labels = []
    wic_word_locs = []
    wic_indexes = []
    for index, example in enumerate(json_objects):
        #wic_indexes.append(example['idx']) # Is it the index??
        wic_indexes.append(index)
        sentence = f"<s>{example['sentence1']}</s><s>{example['sentence2']}</s>"
        wic_sentences.append(sentence)
        # Then encode the sentences
        wic_encoded.append(tokenizer.encode(sentence, add_special_tokens=False))
        # Find the word in each sentences
        word = example['word']
        word_locs = (-1, -1)
        # Split the 2 sentences on space. (Also, lemmatize and uncapitilize each word)
        sent1_split = example['sentence1'].split(' ')
        sent2_split = example['sentence2'].split(' ')
        # Get the index of word in both sentences
        sent1_word_char_loc = (example['start1'], example['end1'])
        sent2_word_char_loc = (example['start2'], example['end2'])
        # Create a variable to keep track of the number of characters parsed in each sentence as we loop
        sent_chars = 0
        # Loop over the words in the first sentence
        i, j = 0, 0
        word1_not_found, word2_not_found = True, True
        while word1_not_found and i < len(sent1_split):
            word_len = len(sent1_split[i])
            if sent_chars >= sent1_word_char_loc[0] or sent_chars + word_len >= sent1_word_char_loc[1]:
                word_locs = (i, -1) # Found the word in the sentence
                word1_not_found = False
            elif sent_chars > sent1_word_char_loc[1]:
                # If we somehow got past the word. Assume it was the previous word
                word_locs = (i - 1, -1) # Found the word in the sentence
                word1_not_found = False
            else:
                # Look at the next word
                sent_chars += word_len + 1 # Plus one for the space
                i += 1
        # Loop over the words in the second
        sent_chars = 0 # Reset
        while word2_not_found and j < len(sent2_split):
            word_len = len(sent2_split[j])
            if sent_chars >= sent2_word_char_loc[0] or sent_chars + word_len >= sent2_word_char_loc[1]:
                word_locs = (i, j) # Found the word in the sentence
                word2_not_found = False
            elif sent_chars > sent2_word_char_loc[1]:
                # If we somehow got past the word. Assume it was the previous word
                word_locs = (i, j - 1) # Found the word in the sentence
                word2_not_found = False
            else:
                # Look at the next word
                sent_chars += word_len + 1 # Plus one for the space
                j += 1
        # For testing
        if verbose:
            print(word)
            print(sent1_split)
            print(sent2_split)
            print(word_locs)
        # Now to find the word in the tokenized sentences
        word1 = sent1_split[word_locs[0]].translate(str.maketrans('', '', string.punctuation)) #Remove punctuation (See https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string)
        word2 = sent2_split[word_locs[1]].translate(str.maketrans('', '', string.punctuation)) #Remove punctuation
        token_word_locs = find_words_in_tokenized_sentences([word1, word2], wic_encoded[-1])
        wic_word_locs.append(token_word_locs)
        # Get the label if we expect it to be there
        if training:
            if example['label']:
                wic_labels.append(1)
            else:
                wic_labels.append(0)
    # Pad the sequences and find the encoded word location in the combined input
    max_len = np.array([len(ex) for ex in wic_encoded]).max()
    wic_padded = {"input_ids" : [], "attention_mask" : [], "token_type_ids" : [], "word1_locs": [], "word2_locs" : [], "index" : wic_indexes}
    for i in range(0, len(wic_encoded)):
        enc_sentence = wic_encoded[i]
        word_locs = wic_word_locs[i]
        # Pad the sequences
        ex_len = len(enc_sentence)
        padded_sentence = enc_sentence.copy()
        padded_sentence.extend([0]*(max_len - ex_len))
        wic_padded["input_ids"].append(padded_sentence)
        padded_mask = [1] * ex_len
        padded_mask.extend([0]*(max_len - ex_len))
        wic_padded["attention_mask"].append(padded_mask)
        # Create the vector to get back the words after RoBERTa
        token_word_locs = wic_word_locs[i]
        first_word_loc = []
        second_word_loc = []
        len_first_word = token_word_locs[0][1] - token_word_locs[0][0] + 1
        len_second_word = token_word_locs[1][1] - token_word_locs[1][0] + 1
        for j in range(0, max_len):
            if j >= token_word_locs[0][0] and j <= token_word_locs[0][1]:
                #Part of the first word
                first_word_loc.append(1.0 / len_first_word)
            else:
                first_word_loc.append(0.0)
            if j >= token_word_locs[1][0] and j <= token_word_locs[1][1]:
                #Part of the second word
                second_word_loc.append(1.0 / len_second_word)
            else:
                second_word_loc.append(0.0)
        # We want to append a [1, max_len] vector instead of a [max_len] vector so wrap in an array
        wic_padded["word1_locs"].append([first_word_loc])
        wic_padded["word2_locs"].append([second_word_loc])
        # token_type_ids is a mask that tells where the first and second sentences are
        token_type_id = []
        first_sentence = True
        sentence_start = True
        for token in padded_sentence:
            if first_sentence and sentence_start and token == 0:
                # Allows 0 at the start of the first sentence
                token_type_id.append(0)
            elif first_sentence and token > 0:
                if sentence_start:
                    sentence_start = False
                token_type_id.append(0)
            elif first_sentence and not sentence_start and token == 0:
                first_sentence = False
                # Start of second sentence
                token_type_id.append(1)
            else:
                # Second sentence
                token_type_id.append(1)
        wic_padded["token_type_ids"].append(token_type_id)
    if training:
        if shuffle_data:
            # Shuffle the data
            raw_set = {"input_ids": [], "token_type_ids": [], "attention_mask": [], "labels": [], "word1_locs": [], "word2_locs" : [], "index" : []}
            raw_set["input_ids"], raw_set["token_type_ids"], raw_set["attention_mask"], raw_set["labels"], raw_set["word1_locs"], raw_set["word2_locs"], raw_set["index"] = shuffle(
              wic_padded["input_ids"], wic_padded["token_type_ids"], wic_padded["attention_mask"], wic_labels, wic_padded["word1_locs"], wic_padded["word2_locs"], wic_padded["index"])
        else:
            raw_set = {"input_ids": wic_padded["input_ids"], "token_type_ids": wic_padded["token_type_ids"],
                     "attention_mask": wic_padded["attention_mask"], "labels": wic_labels, "index" : wic_padded["index"],
                     "word1_locs": wic_padded["word1_locs"], "word2_locs" : wic_padded["word2_locs"]}
    else: # No labels present (Testing set)
        # Do not shuffle the testing set
        raw_set = {"input_ids": wic_padded["input_ids"], "token_type_ids": wic_padded["token_type_ids"], 
               "attention_mask": wic_padded["attention_mask"], "index" : wic_padded["index"], 
               "word1_locs": wic_padded["word1_locs"], "word2_locs" : wic_padded["word2_locs"]}
    # Return the raw data (Need to put them in a PyTorch tensor and dataset)
    return raw_set

In [7]:
# Process the data
train_json_objs = load_json_objects_from_file("WiC/train.jsonl")
raw_train_set = wic_preprocessing(train_json_objs, shuffle_data=True, verbose = False) # We do not want to shuffle for now.
print(len(raw_train_set["labels"])/BATCH_SIZE)

1357.0


In [8]:
# Create a PyTorch dataset for it
train_data = TensorDataset(
    torch.tensor(raw_train_set["input_ids"]).to(device),
    torch.tensor(raw_train_set["token_type_ids"]).to(device),
    torch.tensor(raw_train_set["attention_mask"]).to(device),
    torch.tensor(raw_train_set["labels"]).to(device),
    torch.tensor(raw_train_set["word1_locs"]).to(device),
    torch.tensor(raw_train_set["word2_locs"]).to(device),
    torch.tensor(raw_train_set["index"]).to(device)
)
# Create a sampler and loader
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

In [9]:
# Load the json objects from each file
test_json_objs = load_json_objects_from_file("WiC/test.jsonl")
valid_json_objs = load_json_objects_from_file("WiC/val.jsonl")
# Process the objects
raw_test_set = wic_preprocessing(test_json_objs, training = False) # The labels for the testing set are unknown
raw_valid_set = wic_preprocessing(valid_json_objs)
# Create PyTorch datasets
test_data = TensorDataset(
    torch.tensor(raw_test_set["input_ids"]).to(device),
    torch.tensor(raw_test_set["token_type_ids"]).to(device),
    torch.tensor(raw_test_set["attention_mask"]).to(device),
    torch.tensor(raw_test_set["word1_locs"]).to(device),
    torch.tensor(raw_test_set["word2_locs"]).to(device),
    torch.tensor(raw_test_set["index"]).to(device)
)
validation_data = TensorDataset(
    torch.tensor(raw_valid_set["input_ids"]).to(device),
    torch.tensor(raw_valid_set["token_type_ids"]).to(device),
    torch.tensor(raw_valid_set["attention_mask"]).to(device),
    torch.tensor(raw_valid_set["labels"]).to(device),
    torch.tensor(raw_valid_set["word1_locs"]).to(device),
    torch.tensor(raw_valid_set["word2_locs"]).to(device),
    torch.tensor(raw_valid_set["index"]).to(device)
)
# Create a sampler and loader for each
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

In [10]:
#Loading in model
model = RobertaForMaskedLM.from_pretrained('roberta-base')

class WiC_Head(torch.nn.Module):
    def __init__(self, roberta_based_model, embedding_size = 768):
        """
        Keeps a reference to the provided RoBERTa model. 
        It then adds a linear layer that takes the distance between two 
        """
        super(WiC_Head, self).__init__()
        self.embedding_size = embedding_size
        self.embedder = roberta_based_model
        self.linear_diff = torch.nn.Linear(embedding_size, 250, bias = True)
        self.linear_seperator = torch.nn.Linear(250, 2, bias = True)
        self.loss = torch.nn.CrossEntropyLoss()
        self.activation = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax()

    def forward(self, input_ids=None, attention_mask=None, labels=None,
                word1_locs = None, word2_locs = None):
        """
        Takes in the same argument as RoBERTa forward plus two tensors for the location of the 2 words to compare
        """
        if word1_locs is None or word2_locs is None:
            raise ValueError("The tensors (word1_locs, word1_locs) containing the location of the words to compare in the input vector must be provided.")
        elif input_ids is None:
            raise ValueError("The input_ids tensor must be provided.")
        elif word1_locs.shape[0] != input_ids.shape[0] or word2_locs.shape[0] != input_ids.shape[0]:
            raise ValueError("All provided vectors should have the same batch size.")
        batch_size = word1_locs.shape[0]
        # Get the embeddings
        embs, _ = self.embedder.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Get the words
        word1s = torch.matmul(word1_locs, embs).view(batch_size, self.embedding_size)
        word2s = torch.matmul(word2_locs, embs).view(batch_size, self.embedding_size)
        diff = word1s - word2s
        # Calculate outputs using activation
        layer1_results = self.activation(self.linear_diff(diff))
        logits = self.softmax(self.linear_seperator(layer1_results))
        outputs = logits
        # Calculate the loss
        if labels is not None:
            #  We want seperation like a SVM so use Hinge loss
            loss = self.loss(logits.view(-1, 2), labels.view(-1))
            outputs = (loss, logits)
        return outputs

In [11]:
#CREATE WiC MODEL
class_model = WiC_Head(model, embedding_size = 768)

In [16]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels, return_predict_correctness = False):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    if return_predict_correctness:
        return np.sum(pred_flat == labels_flat) / len(labels_flat), pred_flat == labels_flat
    else:
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [12]:
#TRAINING LOOP
# Variable for minimal accuracy
MIN_ACCURACY = 0.73 # Based on the average accuracy
REACHED_MIN_ACCURACY = False
best_weights = class_model.state_dict()
# Want to maximize accuracy
max_val_acc = (0, 0)
# Put the model in CPU
class_model.cuda()
# Create the optimizer
param_optimizer = list(class_model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# I use the one that comes with the models, but any other optimizer could be used
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
# Store our loss and accuracy for plotting
fit_history = {"loss": [],  "accuracy": [], "val_loss": [], "val_accuracy": []}
epoch_number = 0
epoch_since_max = 0
continue_learning = True
while epoch_number < EPOCHS and continue_learning:
    epoch_number += 1
    print(f"Training epoch #{epoch_number}")
    # Tracking variables
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Training
    # Set our model to training mode (as opposed to evaluation mode)
    class_model.train()
    # Freeze RoBERTa weights
    #class_model.embedder.eval()
    class_model.embedder.requires_grad_ = False
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.cuda() for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)   
        loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, labels=b_labels, word1_locs = b_word1, word2_locs = b_word2) 
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        # Calculate the accuracy
        b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
        # Append to fit history
        fit_history["loss"].append(loss.item()) 
        fit_history["accuracy"].append(b_accuracy) 
        # Update tracking variables
        tr_loss += loss.item()
        tr_accuracy += b_accuracy
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        if nb_tr_steps%10 == 0:
            print("\t\tTraining Batch {}: Loss: {}; Accuracy: {}".format(nb_tr_steps, loss.item(), b_accuracy))
    print("Training:\n\tLoss: {}; Accuracy: {}".format(tr_loss/nb_tr_steps, tr_accuracy/nb_tr_steps))
    # Validation
    # Put model in evaluation mode to evaluate loss on the validation set
    class_model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.cuda() for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)
            loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, labels=b_labels, word1_locs = b_word1, word2_locs = b_word2)
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        # Calculate the accuracy
        b_accuracy = flat_accuracy(logits, label_ids) # For RobertaForClassification
        # Append to fit history
        fit_history["val_loss"].append(loss.item()) 
        fit_history["val_accuracy"].append(b_accuracy) 
        # Update tracking variables
        eval_loss += loss.item()
        eval_accuracy += b_accuracy
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
        if nb_eval_steps%10 == 0:
            print("\t\tValidation Batch {}: Loss: {}; Accuracy: {}".format(nb_eval_steps, loss.item(), b_accuracy))
    eval_acc = eval_accuracy/nb_eval_steps
    if eval_acc >= max_val_acc[0]:
        max_val_acc = (eval_acc, epoch_number)
        continue_learning = True
        epoch_since_max = 0 # New max
        best_weights = copy.deepcopy(class_model.state_dict()) # Keep the best weights
        # See if we have reached min_accuracy
        if eval_acc >= MIN_ACCURACY:
            REACHED_MIN_ACCURACY = True
        # Save to file only if it has reached min acc
        if REACHED_MIN_ACCURACY:
            # Save the best weights to file
            torch.save(best_weights, os.path.join(PATH,'WiCHead.pt'))
            continue_learning = False # Stop learning. Reached baseline acc for this model
    else:
        epoch_since_max += 1
        if epoch_since_max > PATIENCE:
            continue_learning = False # Stop learning, starting to overfit
    print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
print(f"Best accuracy ({max_val_acc[0]}) obtained at epoch #{max_val_acc[1]}.")
# Reload the best weights (from memory)
class_model.load_state_dict(best_weights)


#I am getting the error CUDA out of memory, I think this is because the batch size is too big, 
#though I'm not too sure what to change it too or where to find that variable.
#I think it might be from lines 41 to 43

Training epoch #1


  logits = self.softmax(self.linear_seperator(layer1_results))
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1025.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


		Training Batch 10: Loss: 0.6881631016731262; Accuracy: 0.5
		Training Batch 20: Loss: 0.6880441904067993; Accuracy: 0.5
		Training Batch 30: Loss: 0.7051643133163452; Accuracy: 0.25
		Training Batch 40: Loss: 0.70003342628479; Accuracy: 0.25
		Training Batch 50: Loss: 0.688072681427002; Accuracy: 0.75
		Training Batch 60: Loss: 0.6815245151519775; Accuracy: 0.75
		Training Batch 70: Loss: 0.6829959750175476; Accuracy: 0.75
		Training Batch 80: Loss: 0.6893532276153564; Accuracy: 0.75
		Training Batch 90: Loss: 0.6876422166824341; Accuracy: 0.75
		Training Batch 100: Loss: 0.6964164972305298; Accuracy: 0.75
		Training Batch 110: Loss: 0.6777034401893616; Accuracy: 0.5
		Training Batch 120: Loss: 0.6960470676422119; Accuracy: 0.5
		Training Batch 130: Loss: 0.6608750820159912; Accuracy: 0.5
		Training Batch 140: Loss: 0.6653028726577759; Accuracy: 0.75
		Training Batch 150: Loss: 0.6744694113731384; Accuracy: 1.0
		Training Batch 160: Loss: 0.6807105541229248; Accuracy: 0.75
		Training

		Training Batch 1320: Loss: 0.6387513279914856; Accuracy: 0.5
		Training Batch 1330: Loss: 0.7209321856498718; Accuracy: 0.5
		Training Batch 1340: Loss: 0.593829333782196; Accuracy: 0.75
		Training Batch 1350: Loss: 0.5783306360244751; Accuracy: 0.75
Training:
	Loss: 0.6466828497921445; Accuracy: 0.6285924834193073
		Validation Batch 10: Loss: 0.7363885641098022; Accuracy: 0.5
		Validation Batch 20: Loss: 0.5707270503044128; Accuracy: 0.75
		Validation Batch 30: Loss: 0.581841766834259; Accuracy: 0.75
		Validation Batch 40: Loss: 0.6844767332077026; Accuracy: 0.5
		Validation Batch 50: Loss: 0.5066680312156677; Accuracy: 0.75
		Validation Batch 60: Loss: 0.7180450558662415; Accuracy: 0.5
		Validation Batch 70: Loss: 0.4405924081802368; Accuracy: 1.0
		Validation Batch 80: Loss: 0.7519993782043457; Accuracy: 0.5
		Validation Batch 90: Loss: 0.6914889812469482; Accuracy: 0.5
		Validation Batch 100: Loss: 0.48350176215171814; Accuracy: 0.75
		Validation Batch 110: Loss: 0.55197590589523

		Training Batch 1100: Loss: 0.6262776851654053; Accuracy: 0.75
		Training Batch 1110: Loss: 0.3270082175731659; Accuracy: 1.0
		Training Batch 1120: Loss: 0.36454126238822937; Accuracy: 1.0
		Training Batch 1130: Loss: 0.3289235532283783; Accuracy: 1.0
		Training Batch 1140: Loss: 0.32174021005630493; Accuracy: 1.0
		Training Batch 1150: Loss: 0.317534476518631; Accuracy: 1.0
		Training Batch 1160: Loss: 0.33529213070869446; Accuracy: 1.0
		Training Batch 1170: Loss: 0.5210258364677429; Accuracy: 1.0
		Training Batch 1180: Loss: 0.31546705961227417; Accuracy: 1.0
		Training Batch 1190: Loss: 0.3348250985145569; Accuracy: 1.0
		Training Batch 1200: Loss: 0.358184814453125; Accuracy: 1.0
		Training Batch 1210: Loss: 0.5650465488433838; Accuracy: 0.75
		Training Batch 1220: Loss: 0.7161327004432678; Accuracy: 0.5
		Training Batch 1230: Loss: 0.4000394642353058; Accuracy: 1.0
		Training Batch 1240: Loss: 1.0525469779968262; Accuracy: 0.25
		Training Batch 1250: Loss: 0.3844912052154541; A

		Training Batch 880: Loss: 0.31482023000717163; Accuracy: 1.0
		Training Batch 890: Loss: 0.7099626660346985; Accuracy: 0.5
		Training Batch 900: Loss: 0.31866365671157837; Accuracy: 1.0
		Training Batch 910: Loss: 0.4655955731868744; Accuracy: 0.75
		Training Batch 920: Loss: 0.4028598666191101; Accuracy: 1.0
		Training Batch 930: Loss: 0.4758474826812744; Accuracy: 1.0
		Training Batch 940: Loss: 0.5676104426383972; Accuracy: 0.75
		Training Batch 950: Loss: 0.31748053431510925; Accuracy: 1.0
		Training Batch 960: Loss: 0.32019251585006714; Accuracy: 1.0
		Training Batch 970: Loss: 0.42996299266815186; Accuracy: 1.0
		Training Batch 980: Loss: 0.5887423157691956; Accuracy: 0.75
		Training Batch 990: Loss: 0.43447205424308777; Accuracy: 1.0
		Training Batch 1000: Loss: 0.5006168484687805; Accuracy: 0.75
		Training Batch 1010: Loss: 0.41142570972442627; Accuracy: 1.0
		Training Batch 1020: Loss: 0.3224804997444153; Accuracy: 1.0
		Training Batch 1030: Loss: 0.4566463530063629; Accurac

<All keys matched successfully>

In [17]:
def flat_predictions(preds):
    pred_flat = np.argmax(preds, axis=1).flatten()
    return pred_flat == 1

In [18]:
validation_predictions_correctness = {}
# Validation
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Put model in evaluation mode
class_model.eval()
# Evaluate data for one epoch
for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.cuda() for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_ids, b_input_mask, b_labels, b_word1, b_word2, b_index = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        #loss, logits = class_model(b_input_ids, token_type_ids=b_token_ids, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = class_model(b_input_ids, attention_mask=b_input_mask, 
                                    labels=b_labels, word1_locs = b_word1, word2_locs = b_word2)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()
    # Calculate the accuracy
    b_accuracy, b_pred_correctness = flat_accuracy(logits, label_ids, return_predict_correctness = True) # For RobertaForClassification
    indexes = b_index.detach().cpu().numpy() # Get the indexes
    # Add to predictions
    for index, pred in zip(indexes, b_pred_correctness):
        validation_predictions_correctness[index] = pred
    # Update tracking variables
    eval_loss += loss.item()
    eval_accuracy += b_accuracy
    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1
    if nb_eval_steps%10 == 0:
        print("\t\tValidation Batch {}: Loss: {}; Accuracy: {}".format(nb_eval_steps, loss.item(), b_accuracy))
print("Validation:\n\tLoss={}; Accuracy: {}".format(eval_loss/nb_eval_steps, eval_accuracy/nb_eval_steps))
validation_predictions_correctness = collections.OrderedDict(sorted(validation_predictions_correctness.items()))
print(validation_predictions_correctness)

  logits = self.softmax(self.linear_seperator(layer1_results))


		Validation Batch 10: Loss: 0.5798935294151306; Accuracy: 0.75
		Validation Batch 20: Loss: 0.6736460328102112; Accuracy: 0.5
		Validation Batch 30: Loss: 0.5676725506782532; Accuracy: 0.75
		Validation Batch 40: Loss: 0.7799647450447083; Accuracy: 0.5
		Validation Batch 50: Loss: 0.48344919085502625; Accuracy: 1.0
		Validation Batch 60: Loss: 0.5075159072875977; Accuracy: 1.0
		Validation Batch 70: Loss: 0.42874273657798767; Accuracy: 0.75
		Validation Batch 80: Loss: 0.8265653848648071; Accuracy: 0.25
		Validation Batch 90: Loss: 0.5445793867111206; Accuracy: 0.75
		Validation Batch 100: Loss: 0.5663015246391296; Accuracy: 0.75
		Validation Batch 110: Loss: 0.5275027751922607; Accuracy: 0.75
		Validation Batch 120: Loss: 0.5846569538116455; Accuracy: 0.75
		Validation Batch 130: Loss: 0.8868913054466248; Accuracy: 0.25
		Validation Batch 140: Loss: 0.8078813552856445; Accuracy: 0.5
		Validation Batch 150: Loss: 0.5321898460388184; Accuracy: 0.75
		Validation Batch 160: Loss: 0.81270

In [19]:
test_predictions = {}
test_loss, test_accuracy = 0, 0
nb_test_examples, nb_test_steps = 0, 0
# Testing
# Put model in evaluation mode to evaluate loss on the validation set
class_model.eval()
# Evaluate data for one epoch
for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.cuda() for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_token_ids, b_input_mask, b_word1, b_word2, b_index = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = class_model(b_input_ids, attention_mask=b_input_mask, word1_locs = b_word1, word2_locs = b_word2)
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    # Get the predictions
    b_preds = flat_predictions(logits)
    indexes = b_index.detach().cpu().numpy() # Get the indexes
    for index, pred in zip(indexes, b_preds):
        test_predictions[index] = pred
    # Update tracking variables
    test_loss += loss.item()
    test_accuracy += b_accuracy
    nb_test_examples += b_input_ids.size(0)
    nb_test_steps += 1
    if nb_test_steps%10 == 0:
        print("\t\tTest Batch {}".format(nb_test_steps))
# Print final results
print("Testing results")
test_predictions = collections.OrderedDict(sorted(test_predictions.items()))
print(test_predictions)

  logits = self.softmax(self.linear_seperator(layer1_results))


		Test Batch 10
		Test Batch 20
		Test Batch 30
		Test Batch 40
		Test Batch 50
		Test Batch 60
		Test Batch 70
		Test Batch 80
		Test Batch 90
		Test Batch 100
		Test Batch 110
		Test Batch 120
		Test Batch 130
		Test Batch 140
		Test Batch 150
		Test Batch 160
		Test Batch 170
		Test Batch 180
		Test Batch 190
		Test Batch 200
		Test Batch 210
		Test Batch 220
		Test Batch 230
		Test Batch 240
		Test Batch 250
		Test Batch 260
		Test Batch 270
		Test Batch 280
		Test Batch 290
		Test Batch 300
		Test Batch 310
		Test Batch 320
		Test Batch 330
		Test Batch 340
		Test Batch 350
Testing results
OrderedDict([(0, True), (1, False), (2, True), (3, True), (4, False), (5, False), (6, False), (7, True), (8, False), (9, True), (10, True), (11, True), (12, False), (13, False), (14, True), (15, True), (16, True), (17, False), (18, False), (19, True), (20, False), (21, False), (22, True), (23, True), (24, False), (25, False), (26, True), (27, False), (28, True), (29, True), (30, True), (31, Fals