In [None]:
import torch
import torchvision
import torchaudio

In [None]:
torch.__version__

In [None]:
#check for GPU
torch.cuda.is_available()

In [None]:
import numpy as np
import json
import random
import time
import pickle
from transformers import BertTokenizer

In [None]:
# Relation-type name -> integer class id (17 relation types, ids 0-16).
map_relations = {
    'Comment': 0,
    'Contrast': 1,
    'Correction': 2,
    'Question-answer_pair': 3,
    'Acknowledgement': 4,
    'Elaboration': 5,
    'Clarification_question': 6,
    'Conditional': 7,
    'Continuation': 8,
    'Result': 9,
    'Explanation': 10,
    'Q-Elab': 11,
    'Alternation': 12,
    'Narration': 13,
    'Confirmation_question': 14,
    'Sequence': 15,
    'Break': 16,
}

NB: Same pre-processing as in previous finetuning notebook

In [None]:
home=%pwd
filename = home + '/data/TRAIN+VAL_407_bert.json'

In [None]:
from utils import load_data, input_format, position_ids_compute, tokenize
from bert_format import undersample, format_time, flat_accuracy

In [None]:
data = load_data(filename, map_relations)

In [None]:
#split out a certain portion of validation data 
train_data = data[40:]
valid_data = data[:40]

In [None]:
input_text_train, labels_train, raw_train = input_format(train_data, 10)

In [None]:
input_text_valid, labels_valid, raw_valid = input_format(valid_data, 10)

In [None]:
#load tokenizer and token ids
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', use_fast=True)

In [None]:
put = ['1','0']
colors = ['r', 'b', 'g', 'o', 'y', 'p']
listx = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n']
listy = ['0', '1', '2', '3', '4', '5', '6', '7', '8']
listz = ['a', 'e', 'i', 'o', 'u', 'p', 'q', 'r', 'x', 'y', 'z']

In [None]:
# Build every coordinate token: one symbol from each of put, colors, listx,
# listy and listz (in that order), concatenated into a single string.
coord_tokens = []
for p in put:
    for c in colors:
        for x in listx:
            for y in listy:
                for z in listz:
                    coord_tokens.append(''.join((p, c, x, y, z)))

In [None]:
tokenizer.add_tokens(coord_tokens)

In [None]:
len(tokenizer)

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of crashing
# at the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
input_ids_train, attention_masks_train, token_type_ids_train = tokenize(input_text_train, tokenizer, device)

In [None]:
input_ids_valid, attention_masks_valid, token_type_ids_valid = tokenize(input_text_valid, tokenizer, device)

compute position ids

In [None]:
position_ids_train = position_ids_compute(tokenizer, input_ids_train, raw_train, labels_train)

In [None]:
position_ids_valid = position_ids_compute(tokenizer, input_ids_valid, raw_valid, labels_valid)

In [None]:
position_ids_train = torch.tensor(position_ids_train)

In [None]:
position_ids_valid = torch.tensor(position_ids_valid)

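Undersample <br>
For BertLinear we use the `undersample` function to reduce the number of training candidates (TODO: state the reason here; presumably it is the imbalance between attached and unattached candidates counted below).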

In [None]:
from bert_format import undersample

In [None]:
#all cands
print(len(labels_train))
#unattached cands
print(sum([1 for i in labels_train if i[3] == 0]))
#attached cands
print(sum([1 for i in labels_train if i[3] == 1]))

In [None]:
labels_attach_train = [l[3] for l in labels_train]

In [None]:
labels_attach_valid = [l[3] for l in labels_valid]

In [None]:
labels_train = torch.tensor(labels_train)
labels_valid = torch.tensor(labels_valid)
labels_attach_train = torch.tensor(labels_attach_train)
labels_attach_valid = torch.tensor(labels_attach_valid)

In [None]:
#NB need to choose a number to keep
#usually about 60% of total candidates
labels_train, labels_attach_train, input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train = undersample(103400, labels_train, labels_attach_train, input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train)

gather metadata from labels

In [None]:
# Build the per-candidate metadata that is later appended to each embedding:
# [labels[2], labels[2] - labels[1]] for every candidate row.
# NOTE(review): labels[1] / labels[2] look like the two endpoint indices of a
# candidate pair, making the second value their distance — confirm in utils.
meta_data_train = [
    [values[2], values[2] - values[1]]
    for values in (row.tolist() for row in labels_train)
]

meta_data_valid = [
    [values[2], values[2] - values[1]]
    for values in (row.tolist() for row in labels_valid)
]

create batches

In [None]:
#create metadata batches
def get_batches(len_data, batch_size):
    """Split the indices ``0 .. len_data - 1`` into consecutive batches.

    Args:
        len_data: Number of items to batch.
        batch_size: Maximum number of indices per batch; must be positive.

    Returns:
        A list of index lists in ascending order. Every batch holds
        ``batch_size`` indices except possibly the last one, which holds the
        remainder. No empty batch is produced, so there are exactly
        ``ceil(len_data / batch_size)`` batches (``[]`` when ``len_data`` is 0).

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    indices = list(range(len_data))
    # Stepping through the start offsets never yields a trailing empty batch,
    # including when len_data is an exact multiple of batch_size.
    return [indices[start:start + batch_size]
            for start in range(0, len_data, batch_size)]

In [None]:
train_batches = get_batches(len(meta_data_train), 32)

In [None]:
valid_batches = get_batches(len(meta_data_valid), 32)

In [None]:
meta_train_batches = []
for ba in train_batches:
  meta_train_batches.append([meta_data_train[b] for b in ba])

In [None]:
meta_valid_batches = []
for ba in valid_batches:
  meta_valid_batches.append([meta_data_valid[b] for b in ba])

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from transformers import AdamW, BertForSequenceClassification

In [None]:
train_dataset = TensorDataset(input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train, labels_attach_train)

In [None]:
input_ids_train.size(), attention_masks_train.size(), position_ids_train.size(), labels_attach_train.size()

In [None]:
input_ids_valid.size(), attention_masks_valid.size(), position_ids_valid.size(), labels_attach_valid.size()

In [None]:
val_dataset = TensorDataset(input_ids_valid, attention_masks_valid, token_type_ids_valid, position_ids_valid, labels_attach_valid)

In [None]:
# NOTE: the batches must stay in dataset order (SequentialSampler, no
# shuffling) and use the same batch size (32) as get_batches(): the training
# loop looks up metadata with meta_train_batches[e][i], which is aligned with
# these batches purely by position.
train_dataloader = DataLoader(
            train_dataset,
            sampler = SequentialSampler(train_dataset),
            batch_size = 32
        )

In [None]:
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = 32
        )

Load the finetuned model from the first step

In [None]:
model_path = home + '<name of your model folder>/<name of your finetune .pth file output>'

In [None]:
embedder = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    output_attentions = False,
    output_hidden_states = True, attention_probs_dropout_prob=0, hidden_dropout_prob=0
)

In [None]:
#!!resize embedder to account for new embeddings!
embedder.resize_token_embeddings(len(tokenizer))

In [None]:
checkpoint = torch.load(model_path, map_location=device)
embedder.load_state_dict(checkpoint['model_state_dict'])
embedder.to(device)

load NN

In [None]:
import random
import pickle
from torch import nn

In [None]:
#fix neural net here
#hidden_size = 774
#hidden_size = 772 for just incoming rels

class NeuralNetwork(nn.Module):
    """Feed-forward head that maps one feature vector to a single raw logit.

    The layer stack is Dropout -> Linear -> Dropout -> Tanh -> Linear. No
    sigmoid is applied here; callers get raw logits.

    Args:
        input_size: Width of each input vector. The default (770) matches the
            vectors built in this notebook: an embedder hidden vector with two
            metadata values appended (assumes a 768-wide hidden state).
        hidden_size: Width of the hidden layer (default 2000).
        dropout: Dropout probability used by both dropout layers (default 0.3).
    """

    def __init__(self, input_size=770, hidden_size=2000, dropout=0.3):
        super().__init__()

        # Keep the layer order fixed: state_dict keys are positional
        # ("linear.1.*" and "linear.4.*"), so reordering the layers would
        # break checkpoints saved from this class.
        self.linear = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(input_size, hidden_size),
            nn.Dropout(p=dropout),
            nn.Tanh(),
            nn.Linear(hidden_size, 1))

    def forward(self, x):
        """Return raw logits of shape ``(..., 1)`` for ``x`` of shape ``(..., input_size)``."""
        return self.linear(x)

In [None]:
linear = NeuralNetwork().to(device)

In [None]:
linear.train()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(params=linear.parameters(), lr=0.0002)

In [None]:
linear_model_path = home + '<name of your model folder>'
save_linear_name =  '<name of your bertlinear .pth file output>'

train the model

In [None]:
# Train the linear head on top of the finetuned embedder for 15 epochs,
# printing the average training and validation loss after every epoch, then
# save the head's weights.
#
# Only `linear` is trained (the optimizer holds just its parameters), so the
# embedder is kept in inference mode and run without autograd throughout.
embedder.eval()

for epoch in range(15):
    loss_sum_train = 0
    linear.train()
    for e, batch in enumerate(train_dataloader):
        if e in [0, len(train_dataloader)-1]:
            print("epoch ", epoch)
            print("batch no ", e)
        # No gradient is needed through the embedder: running it under
        # no_grad avoids storing its activations and accumulating grads on its
        # weights that nothing ever clears. The loss w.r.t. `linear` is
        # unchanged.
        with torch.no_grad():
            output = embedder(batch[0].to(device),
                              token_type_ids = batch[2].to(device),
                              attention_mask = batch[1].to(device),
                              position_ids = batch[3].to(device),
                              labels = batch[4].to(device),
                              return_dict=True)
        # For each candidate, take the first-token vector of the last hidden
        # layer (presumably the [CLS] position — confirm against tokenize())
        # and append its two metadata values; metadata is aligned with the
        # dataloader by batch index `e` and in-batch position `i`.
        H_embed = torch.stack([
            torch.cat((r[0], torch.tensor(meta_train_batches[e][i]).to(device)), 0)
            for i, r in enumerate(output.hidden_states[-1])
        ])
        H_embed = H_embed.to(device)
        # (batch, 1) -> (1, batch, 1) -> (1, batch), matching `target` below.
        logits = linear(H_embed).unsqueeze(0)
        logits = logits.squeeze(-1)

        # BCEWithLogitsLoss needs float targets; shape is (1, batch).
        target = torch.tensor([[float(b) for b in batch[4]]]).to(device)

        loss = criterion(input=logits, target=target)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_sum_train += loss.item()

    # Calculate the average train loss over all of the batches.
    avg_train_loss = loss_sum_train / len(train_dataloader)
    print("avg train loss: ", avg_train_loss)

    print("going to eval")
    linear.eval()
    loss_sum_valid = 0

    for e, batch in enumerate(validation_dataloader):
        with torch.no_grad():
            output = embedder(batch[0].to(device),
                              token_type_ids = batch[2].to(device),
                              attention_mask = batch[1].to(device),
                              position_ids = batch[3].to(device),
                              labels = batch[4].to(device),
                              return_dict=True)

            H_embed = torch.stack([
                torch.cat((r[0], torch.tensor(meta_valid_batches[e][i]).to(device)), 0)
                for i, r in enumerate(output.hidden_states[-1])
            ])
            H_embed = H_embed.to(device)
            logits = linear(H_embed).unsqueeze(0)

        logits = logits.squeeze(-1)

        target = torch.tensor([[float(b) for b in batch[4]]]).to(device)

        loss = criterion(input=logits, target=target)

        loss_sum_valid += loss.item()

    # Calculate the average loss over all of the batches.
    avg_val_loss = loss_sum_valid / len(validation_dataloader)
    print("avg val loss: ", avg_val_loss)

    print('--------------------------------------')

# Persist only the linear head; the embedder checkpoint is left untouched.
output_model = linear_model_path + save_linear_name

print('finished_training, saving to : ', output_model)

torch.save({
    'model_state_dict': linear.state_dict(),
}, output_model)

Get scores on test

In [None]:
home=%pwd
filename = home + '/data/TEST_101_bert.json'
test_data = load_data(filename, map_relations)

In [None]:
input_text_test, labels_test, raw_test = input_format(test_data, 10)

In [None]:
input_ids_test, attention_masks_test, token_type_ids_test = tokenize(input_text_test, tokenizer, device)

In [None]:
position_ids_test = position_ids_compute(tokenizer, input_ids_test, raw_test, labels_test)

In [None]:
position_ids_test = torch.tensor(position_ids_test)

In [None]:
labels_attach_test = [l[3] for l in labels_test]

In [None]:
# Per-candidate test metadata, built the same way as for train/valid:
# [labels[2], labels[2] - labels[1]] for every candidate row.
meta_data_test = [[row[2], row[2] - row[1]] for row in labels_test]

In [None]:
test_batches = get_batches(len(meta_data_test), 32)

In [None]:
meta_test_batches = []
for ba in test_batches:
  meta_test_batches.append([meta_data_test[b] for b in ba])

In [None]:
labels_test_batches = []
for ba in test_batches:
  labels_test_batches.append([labels_test[b] for b in ba])

In [None]:
test_dataset = TensorDataset(input_ids_test, attention_masks_test, token_type_ids_test, position_ids_test)

In [None]:
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = 32
        )

In [None]:
# Debugging aid: the problem is somewhere around i = 2010, so print the test
# batch at that index and stop iterating.
for i, e in enumerate(test_dataloader):
    if i == 2010:
        print(e)
        break

Start :: run this cell only if you need to reload the linear model to run the test

In [None]:
# Reload a previously saved linear head so the test section can be run
# without retraining.
# NOTE: this reuses the name `model_path`, overwriting the embedder checkpoint
# path set earlier; re-run that earlier cell before reloading the embedder.
model_path = home + '<name of your model folder>/<name of your linear .pth file output>'
linear = NeuralNetwork().to(device)
# Map the checkpoint onto the notebook's configured device (as is done for the
# embedder checkpoint) instead of a hard-coded 'cuda'.
checkpoint = torch.load(model_path, map_location=device)
linear.load_state_dict(checkpoint['model_state_dict'])
linear.to(device)

End :: if you needed to reload the linear model

In [None]:
# Score every test candidate with the embedder + linear head and record a
# binary attachment prediction for each one.
#
# Each entry of `predictions` is a NEW list: the candidate's label row with
# the prediction (1 = attached, 0 = not attached) appended as the last
# element. The rows of `labels_test` are not modified, so this cell can be
# re-run safely and always yields rows of the same length.

# Sigmoid probability above which a candidate is predicted as attached.
ATTACH_THRESHOLD = 0.81

predictions = []

linear.eval()

for e, batch in enumerate(test_dataloader):
  print(e)
  with torch.no_grad():
    output = embedder(batch[0].to(device),
                    token_type_ids = batch[2].to(device),
                    attention_mask = batch[1].to(device),
                    position_ids = batch[3].to(device),
                    return_dict=True)

    # First-token vector of the last hidden layer plus the candidate's two
    # metadata values, aligned by batch index `e` and in-batch position `i`.
    H_embed = torch.stack([
        torch.cat((r[0], torch.tensor(meta_test_batches[e][i]).to(device)), 0)
        for i, r in enumerate(output.hidden_states[-1])
    ])
    H_embed = H_embed.to(device)
    # (batch, 1) logits -> (batch,) probabilities as a plain Python list.
    probs = torch.sigmoid(linear(H_embed)).squeeze(-1).cpu().tolist()

  attached = {idx for idx, prob in enumerate(probs) if prob > ATTACH_THRESHOLD}

  for idx, label_row in enumerate(labels_test_batches[e]):
    predictions.append(list(label_row) + [1 if idx in attached else 0])



In [None]:
len(predictions)

In [None]:
attach_predictions = [i[5] for i in predictions]
true_labels = [i[3] for i in predictions]

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
precision_recall_fscore_support(true_labels, attach_predictions, average='binary')

Change the output to a list of lists so it can be fed to the multitask model <br>
It needs to be a list of lists, with one inner list per game

In [None]:
# One entry per game index 0..100. Each entry lists the [e[1], e[2]] pairs of
# the candidates in that game (e[0] == game index) predicted as attached
# (e[5] == 1).
multitask_inputs = [
    [[pred[1], pred[2]] for pred in predictions if pred[0] == game and pred[5] == 1]
    for game in range(101)  #32 for the minecraft data
]

In [None]:
len(multitask_inputs)

In [None]:
with open(home + '<name of your pickle folder>/<name of your linear preds pickle file>', 'wb') as f:
    pickle.dump(multitask_inputs, f)