In [1]:
import torch
import torchvision
import torchaudio

In [43]:
torch.cuda.empty_cache()

In [2]:
torch.__version__

'2.1.1+cu121'

In [3]:
#check for GPU
torch.cuda.is_available()
# device = torch.device('cuda')
# device

True

In [4]:
import numpy as np
import json
import random
import time
from transformers import BertTokenizer

In [5]:
map_relations = {'Comment':0, 'Contrast':1, 'Correction':2, 'Question-answer_pair':3, 'Acknowledgement':4,'Elaboration':5,
                 'Clarification_question':6, 'Conditional':7, 'Continuation':8, 'Result':9, 'Explanation':10, 'Q-Elab':11,
                 'Alternation':12, 'Narration':13, 'Confirmation_question':14, 'Sequence':15, 'Break':16}

In [6]:
home=%pwd
filename = home + '/data/TRAIN_407.json'

load and preprocess the training data

In [7]:
from utils import load_data, input_format, position_ids_compute, tokenize
from bert_format import undersample, format_time, flat_accuracy

In [8]:
data = load_data(filename, map_relations)

Loading data: /home/kate/LREC/data/TRAIN_407.json
407 dialogs, 21822 edus, 26299 relations, 194 backward relations
4787 edus have multiple parents


In [9]:
#split out a certain portion of validation data (a function of length?)
train_data = data[40:]
valid_data = data[:40]

In [10]:
input_text_train, labels_complete_train, raw_train = input_format(train_data, 7)


22781 relations
126994 candidates
104213 non attached


In [11]:
input_text_val, labels_complete_val, raw_val = input_format(valid_data, 7)

2554 relations
14364 candidates
11810 non attached


In [8]:
#load tokenizer and token ids
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', use_fast=True)

In [9]:
put = ['1','0']
colors = ['r', 'b', 'g', 'o', 'y', 'p']
listx = ['b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n']
listy = ['0', '1', '2', '3', '4', '5', '6', '7', '8']
listz = ['a', 'e', 'i', 'o', 'u', 'p', 'q', 'r', 'x', 'y', 'z']

In [10]:
coord_tokens = [''.join([s, t, i, j, k]) for s in put
                for t in colors
                for i in listx
                for j in listy
                for k in listz]

In [11]:
tokenizer.add_tokens(coord_tokens)

13068

In [12]:
len(tokenizer)

42064

In [13]:
device = torch.device('cuda')

In [18]:
input_ids_train, attention_masks_train, token_type_ids_train = tokenize(input_text_train, tokenizer, device)

In [19]:
len(input_ids_train)

126994

In [20]:
input_ids_val, attention_masks_val, token_type_ids_val = tokenize(input_text_val, tokenizer, device)

Compute position ids

In [21]:
position_ids_train = position_ids_compute(tokenizer, input_ids_train, raw_train, labels_complete_train)

In [22]:
position_ids_val = position_ids_compute(tokenizer, input_ids_val, raw_val, labels_complete_val)

In [23]:
position_ids_train = torch.tensor(position_ids_train)

In [24]:
position_ids_val = torch.tensor(position_ids_val)

Undersample

In [25]:
labels_train = [l[3] for l in labels_complete_train]

In [26]:
labels_val = [l[3] for l in labels_complete_val]

In [27]:
labels_train = torch.tensor(labels_train)
labels_val = torch.tensor(labels_val)
labels_complete_train = torch.tensor(labels_complete_train)
labels_complete_val = torch.tensor(labels_complete_val)

In [28]:
#NB need to choose a drop number
labels_complete_train, labels_train, input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train = undersample(68576, labels_complete_train, labels_train, input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train)

Load data loader and model

In [14]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, BertForSequenceClassification

In [30]:
train_dataset = TensorDataset(input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train, labels_train)

In [31]:
val_dataset = TensorDataset(input_ids_val, attention_masks_val, token_type_ids_val, position_ids_val, labels_val)

In [32]:
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = 32
        )

In [33]:
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = 32
        )

In [34]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    output_attentions = False,
    output_hidden_states = True, attention_probs_dropout_prob=0, hidden_dropout_prob=0)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
#!!important -- must add new token embeddings to BERT
model.resize_token_embeddings(len(tokenizer))

Embedding(42064, 768)

In [36]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42064, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [37]:
optimizer = AdamW(model.parameters(),
                  lr = 1.5e-5,
                  eps = 1e-8
                )



In [38]:
total_steps = len(train_dataloader) * 2
seed_val = 18
total_t0 = time.time()

Run model

In [39]:
model_path = home + '/models/'
bert_name = 'finetune_d7'

In [40]:
for epoch_i in range(2):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, 2))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        model.zero_grad()

        result = model(batch[0].to(device),
                       token_type_ids=batch[2].to(device),
                       attention_mask=batch[1].to(device),
                       position_ids = batch[3].to(device),
                       labels=batch[4].to(device),
                       return_dict=True)

        loss = result.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("  Training Loss: ",avg_train_loss)
    print("  Training took: ", training_time)
    print("Running Validation")
    t0 = time.time()

    # Evaluation step
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:

        with torch.no_grad():
            result = model(batch[0].to(device),
                           token_type_ids=batch[2].to(device),
                           attention_mask=batch[1].to(device),
                           position_ids = batch[3].to(device),
                           labels=batch[4].to(device),
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = batch[4].to(device).cpu().numpy()
        # if device == 'cuda' :
        #     # logits = logits.detach().cpu().numpy()
        #     logits = logits.detach().cpu().numpy()
        #     label_ids = batch[4].to(device).cpu().numpy()
        #     # label_ids = batch[4].to(device).to('cpu').numpy()
        #     # Move logits and labels to CPU
        # else : label_ids = batch[4].to(device)

        # Compute the accuracy
        #print(label_ids)
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: ", avg_val_accuracy)

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: ", avg_val_loss)
    print("  Validation took: ",validation_time)
print("Training complete!")

torch.save({
    'model_state_dict': model.state_dict(),
}, model_path + bert_name + '.pth')


  Batch   500  of  2,855.    Elapsed: 0:01:24.
  Batch 1,000  of  2,855.    Elapsed: 0:02:50.
  Batch 1,500  of  2,855.    Elapsed: 0:04:13.
  Batch 2,000  of  2,855.    Elapsed: 0:05:36.
  Batch 2,500  of  2,855.    Elapsed: 0:06:58.
  Training Loss:  0.23582668830877115
  Training took:  0:08:00
Running Validation
  Accuracy:  0.9350342029907732
  Validation Loss:  0.18419285180391473
  Validation took:  0:00:20

  Batch   500  of  2,855.    Elapsed: 0:01:26.
  Batch 1,000  of  2,855.    Elapsed: 0:02:53.
  Batch 1,500  of  2,855.    Elapsed: 0:04:19.
  Batch 2,000  of  2,855.    Elapsed: 0:05:46.
  Batch 2,500  of  2,855.    Elapsed: 0:07:11.
  Training Loss:  0.18037610872927565
  Training took:  0:08:12
Running Validation
  Accuracy:  0.9323098950047725
  Validation Loss:  0.18006433696301452
  Validation took:  0:00:19
Training complete!


In [2]:
torch.cuda.empty_cache()

Get scores on Test

In [15]:
#load test data
# filename = home + '/data/DEV_32_bert.json'
filename = home + '/data/TEST_102_bert.json'

In [16]:
test_data = load_data(filename, map_relations)

Loading data: /home/kate/LREC/data/TEST_102_bert.json
102 dialogs, 5032 edus, 6041 relations, 56 backward relations
1081 edus have multiple parents


In [17]:
input_text_test, labels_complete_test, raw_test = input_format(test_data, 10)

5886 relations
44710 candidates
38824 non attached


In [18]:
input_ids_test, attention_masks_test, token_type_ids_test = tokenize(input_text_test, tokenizer, device)

In [20]:
position_ids_test = position_ids_compute(tokenizer, input_ids_test, raw_test, labels_complete_test)

In [21]:
position_ids_test = torch.tensor(position_ids_test)

In [22]:
labels_test = [l[3] for l in labels_complete_test]

In [23]:
labels_test = torch.tensor(labels_test)
labels_complete_test = torch.tensor(labels_complete_test)

In [24]:
test_dataset = TensorDataset(input_ids_test, attention_masks_test, token_type_ids_test, position_ids_test, labels_test)

In [25]:
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = 32
        )

In [26]:
model_path = home + '/models/finetune_d10.pth'

In [27]:
embedder = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    output_attentions = False,
    output_hidden_states = True, attention_probs_dropout_prob=0, hidden_dropout_prob=0
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
embedder.resize_token_embeddings(len(tokenizer))

Embedding(42064, 768)

In [29]:
checkpoint = torch.load(model_path, map_location=device)
embedder.load_state_dict(checkpoint['model_state_dict'])
embedder.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42064, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [30]:
model=embedder

In [31]:
# Prediction on test set

print('Predicting labels for {:,} test cands...'.format(len(input_ids_test)))

# Put model in evaluation mode
model.eval()

# Tracking variables
predictions , true_labels = [], []

# Predict
for batch in test_dataloader:
  # # Add batch to GPU
  # batch = tuple(t.to(device) for t in batch)

  # # Unpack the inputs from our dataloader
  # b_input_ids, b_input_mask, b_labels = batch

  # # Telling the model not to compute or store gradients, saving memory and
  # # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(batch[0].to(device),
                      token_type_ids=batch[2].to(device),
                      attention_mask=batch[1].to(device),
                      position_ids = batch[3].to(device),
                      labels=batch[4].to(device),
                      return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  # logits = logits.detach().cpu().numpy()
  # label_ids = labels.to('cpu').numpy()

  logits = logits.detach().cpu().numpy()
  label_ids = batch[4].to(device).cpu().numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

Predicting labels for 44,710 test cands...
    DONE.


In [32]:

len(true_labels)

1398

In [33]:
len(predictions)

1398

In [34]:
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [35]:
from sklearn.metrics import precision_recall_fscore_support

In [36]:
precision_recall_fscore_support(flat_true_labels, flat_predictions, average='binary')

(0.858414079493771, 0.737512742099898, 0.7933838983825277, None)

save output <br>
list of lists with [dialogue index, x index, y index, true attach, true label, predicted attach]

In [37]:
full_labels = labels_complete_test.tolist()

In [38]:
full_labels[0:10]

[[0, 0, 1, 1, 0],
 [0, 0, 2, 1, 8],
 [0, 1, 2, 0, -1],
 [0, 0, 3, 0, -1],
 [0, 1, 3, 0, -1],
 [0, 2, 3, 1, 5],
 [0, 0, 4, 0, -1],
 [0, 1, 4, 0, -1],
 [0, 2, 4, 0, -1],
 [0, 3, 4, 1, 9]]

In [39]:
save_stats = []
for i, lab in enumerate(full_labels):
    li = []
    li += lab
    li.append(flat_predictions[i])
    save_stats.append(li)

In [41]:
#%pip install pickle-mixin
import pickle

In [42]:
with open(home + '/pickles/' + 'scores_finetune_d10_test.pkl', 'wb') as f:
    pickle.dump(save_stats, f)