In [None]:
import numpy as np
import torch
import torchaudio
import torchvision

In [None]:
# Ask PyTorch to release cached GPU memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# Display the installed PyTorch version.
torch.__version__

In [None]:
#check for GPU
torch.cuda.is_available()
# device = torch.device('cuda')
# device

In [None]:
# Standard-library helpers plus the BERT tokenizer class used below.
# (A stray '+' line that made this cell a SyntaxError has been removed.)
import random
import time
from transformers import BertTokenizer

In [None]:
# Relation-type name -> integer class id; ids follow the order of the names listed here.
map_relations = {
    relation: index
    for index, relation in enumerate([
        'Comment', 'Contrast', 'Correction', 'Question-answer_pair', 'Acknowledgement',
        'Elaboration', 'Clarification_question', 'Conditional', 'Continuation', 'Result',
        'Explanation', 'Q-Elab', 'Alternation', 'Narration', 'Confirmation_question',
        'Sequence', 'Break',
    ])
}

In [None]:
# Capture the notebook's current working directory via the IPython %pwd magic.
home=%pwd
# Path of the combined train+validation JSON file under <home>/data.
filename = home + '/data/TRAIN+VAL_407_bert.json'

Load and preprocess the training data

In [None]:
from utils import load_data, input_format, position_ids_compute, tokenize
from bert_format import undersample, format_time, flat_accuracy

In [None]:
# Load the dataset from `filename`, passing the relation-name -> id mapping to the loader.
data = load_data(filename, map_relations)

In [None]:
#split out a certain portion of validation data (a function of length?)
# The first 40 items are held out for validation; everything after them is the training split.
train_data = data[40:]
valid_data = data[:40]

In [None]:
input_text_train, labels_complete_train, raw_train = input_format(data, 10)


In [None]:
input_text_val, labels_complete_val, raw_val = input_format(valid_data, 10)

In [None]:
#load tokenizer and token ids
# Load the pretrained cased BERT vocabulary.
# NOTE(review): BertTokenizer is the pure-Python tokenizer class; `use_fast` is
# presumably only honoured by AutoTokenizer — confirm it has any effect here.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', use_fast=True)

Add special tokens for moves 

In [None]:
# Single-character alphabets; one symbol from each is combined into a move token below.
put = list('10')
colors = list('rbgoyp')
listx = list('bcdfghjklmn')
listy = list('012345678')
listz = list('aeioupqrxyz')

In [None]:
# Every combination of one symbol per alphabet, generated in nested-loop order
# (put outermost, listz innermost) and concatenated into a five-character token.
coord_tokens = [
    flag + color + x + y + z
    for flag in put
    for color in colors
    for x in listx
    for y in listy
    for z in listz
]

In [None]:
# Register the generated move tokens as additional vocabulary entries.
tokenizer.add_tokens(coord_tokens)

In [None]:
# Vocabulary size after the additions; the model embeddings are resized to this value later.
len(tokenizer)

In [None]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# on the first .to(device) call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Tokenize the formatted training text with the extended tokenizer; the helper
# returns input ids, attention masks and token type ids.
input_ids_train, attention_masks_train, token_type_ids_train = tokenize(input_text_train, tokenizer, device)

In [None]:
# Same for the validation text.
input_ids_val, attention_masks_val, token_type_ids_val = tokenize(input_text_val, tokenizer, device)

Compute position ids

In [None]:
# Compute position ids for the training inputs with the project helper.
position_ids_train = position_ids_compute(tokenizer, input_ids_train, raw_train, labels_complete_train)

In [None]:
# Same for the validation inputs.
position_ids_val = position_ids_compute(tokenizer, input_ids_val, raw_val, labels_complete_val)

In [None]:
# Convert the helper's output to a tensor so it can be placed in a TensorDataset.
# Note: this rebinds the name, so re-running the cell operates on the tensor.
position_ids_train = torch.tensor(position_ids_train)

In [None]:
position_ids_val = torch.tensor(position_ids_val)

Undersample

In [None]:
labels_train = [l[3] for l in labels_complete_train]

In [None]:
labels_val = [l[3] for l in labels_complete_val]

In [None]:
labels_train = torch.tensor(labels_train)
labels_val = torch.tensor(labels_val)
labels_complete_train = torch.tensor(labels_complete_train)
labels_complete_val = torch.tensor(labels_complete_val)

In [None]:
#NB need to choose a number to keep
#usually about 60% of total candidates
(
    labels_complete_train,
    labels_train,
    input_ids_train,
    attention_masks_train,
    token_type_ids_train,
    position_ids_train,
) = undersample(
    103400,
    labels_complete_train,
    labels_train,
    input_ids_train,
    attention_masks_train,
    token_type_ids_train,
    position_ids_train,
)

Load data loader and model

In [None]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, BertForSequenceClassification

In [None]:
train_dataset = TensorDataset(input_ids_train, attention_masks_train, token_type_ids_train, position_ids_train, labels_train)

In [None]:
val_dataset = TensorDataset(input_ids_val, attention_masks_val, token_type_ids_val, position_ids_val, labels_val)

In [None]:
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = 32
        )

In [None]:
validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = 32
        )

In [None]:
# Load pretrained bert-base-cased with a sequence-classification head. Both dropout
# probabilities are set to 0, attentions are not returned, hidden states are.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    output_attentions = False,
    output_hidden_states = True, attention_probs_dropout_prob=0, hidden_dropout_prob=0)

In [None]:
#!!important -- must add new token embeddings to BERT
# The embedding matrix is resized to the tokenizer's length after the move tokens were added.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Move the model's parameters to the selected device.
model.to(device) 

In [None]:
# AdamW over all model parameters with a small fine-tuning learning rate.
optimizer = AdamW(model.parameters(), lr=1.5e-5, eps=1e-8)

In [None]:
# Number of optimizer steps across the 2 training epochs.
total_steps = len(train_dataloader) * 2

# Fix the random seeds so the batch shuffling order is repeatable between runs.
# Previously seed_val was defined but never applied to any generator.
# Note: only randomness that happens after this cell is affected; the
# classifier-head initialisation and the undersampling ran earlier.
seed_val = 18
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Wall-clock start, used to measure the whole training run.
total_t0 = time.time()

Run model

In [None]:
# Output-location placeholders — replace both before running.
# NOTE(review): the checkpoint path is later built by plain string concatenation
# (model_path + bert_name + '.pth'), so the folder value presumably needs a
# leading and a trailing '/' — confirm when filling these in.
model_path = home + '<name of your model folder>'
bert_name = '<name of your .pth file output>'

In [None]:
# Fine-tune the classifier: each epoch makes one pass over the training batches,
# then evaluates on the validation batches. The weights are saved afterwards.
# Dataset column order is: input ids, attention mask, token type ids,
# position ids, labels (batch[0] .. batch[4]).
n_epochs = 2

for epoch_i in range(n_epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, n_epochs))

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 500 batches (skipping the very first one).
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Clear gradients accumulated by the previous step.
        model.zero_grad()

        result = model(batch[0].to(device),
                       token_type_ids=batch[2].to(device),
                       attention_mask=batch[1].to(device),
                       position_ids = batch[3].to(device),
                       labels=batch[4].to(device),
                       return_dict=True)

        loss = result.loss
        total_train_loss += loss.item()

        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("  Training Loss: ",avg_train_loss)
    print("  Training took: ", training_time)
    print("Running Validation")
    t0 = time.time()

    # Evaluation step
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(batch[0].to(device),
                           token_type_ids=batch[2].to(device),
                           attention_mask=batch[1].to(device),
                           position_ids = batch[3].to(device),
                           labels=batch[4].to(device),
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        # Bring logits back to the CPU for scoring. The labels are only needed
        # as a NumPy array here, so they are not sent to the device and back.
        logits = logits.detach().cpu().numpy()
        label_ids = batch[4].cpu().numpy()

        # Accumulate per-batch accuracy (averaged over batches below).
        total_eval_accuracy += flat_accuracy(logits, label_ids)


    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: ", avg_val_accuracy)

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: ", avg_val_loss)
    print("  Validation took: ",validation_time)
print("Training complete!")
# total_t0 was recorded in the setup cell before training started.
print("Total training took: ", format_time(time.time() - total_t0))

# Persist only the model weights; the file name is model_path + bert_name + '.pth'.
torch.save({
    'model_state_dict': model.state_dict(),
}, model_path + bert_name + '.pth')

In [None]:
# Release cached GPU memory left over from training before the test pipeline runs.
torch.cuda.empty_cache()

Get scores on Test

In [None]:
#load test data
filename = home + '/data/TEST_101_bert.json'

In [None]:
test_data = load_data(filename, map_relations)

In [None]:
input_text_test, labels_complete_test, raw_test = input_format(test_data, 10)

In [None]:
input_ids_test, attention_masks_test, token_type_ids_test = tokenize(input_text_test, tokenizer, device)

In [None]:
position_ids_test = position_ids_compute(tokenizer, input_ids_test, raw_test, labels_complete_test)

In [None]:
position_ids_test = torch.tensor(position_ids_test)

In [None]:
labels_test = [l[3] for l in labels_complete_test]

In [None]:
labels_test = torch.tensor(labels_test)
labels_complete_test = torch.tensor(labels_complete_test)

In [None]:
test_dataset = TensorDataset(input_ids_test, attention_masks_test, token_type_ids_test, position_ids_test, labels_test)

In [None]:
test_dataloader = DataLoader(
            test_dataset,
            sampler = SequentialSampler(test_dataset),
            batch_size = 32
        )

Start :: run the following cells only if you need to reload the model to run the test

In [None]:
# NOTE(review): this rebinds model_path from the output folder to the full checkpoint
# file path, and the placeholder has no leading '/' after `home` — confirm when filling it in.
model_path = home + '<name of your model folder>/<name of your .pth file output>'

In [None]:
# Recreate the model with the same configuration that was used for training.
embedder = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    output_attentions = False,
    output_hidden_states = True, attention_probs_dropout_prob=0, hidden_dropout_prob=0
)

In [None]:
# The embedding matrix must match the extended tokenizer before the saved weights are loaded.
embedder.resize_token_embeddings(len(tokenizer))

In [None]:
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
checkpoint = torch.load(model_path, map_location=device)
embedder.load_state_dict(checkpoint['model_state_dict'])
embedder.to(device)

In [None]:
# Expose the reloaded network under the name the prediction cell uses.
model=embedder

End :: if you needed to reload the model

In [None]:
# Prediction on test set: run the model over every test batch and collect the
# raw logits together with the gold labels, one array per batch.

print('Predicting labels for {:,} test cands...'.format(len(input_ids_test)))

# Put model in evaluation mode
model.eval()

# Tracking variables
predictions , true_labels = [], []

# Predict
# Dataset column order: input ids, attention mask, token type ids, position ids, labels.
for batch in test_dataloader:
    # No gradients are needed for prediction; this saves memory and time.
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        result = model(batch[0].to(device),
                       token_type_ids=batch[2].to(device),
                       attention_mask=batch[1].to(device),
                       position_ids = batch[3].to(device),
                       labels=batch[4].to(device),
                       return_dict=True)

    logits = result.logits

    logits = logits.detach().cpu().numpy()
    # The labels are only needed as a NumPy array here, so they are not sent
    # to the device and back.
    label_ids = batch[4].cpu().numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

In [None]:

# Number of collected label arrays (one per test batch).
len(true_labels)

In [None]:
# Number of collected logit arrays; should equal the count above.
len(predictions)

In [None]:
# Combine the results across all batches.
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Precision, recall and F-score on the test set. With average='binary'
# scikit-learn reports the scores for the positive class only (label 1 by default).
precision_recall_fscore_support(flat_true_labels, flat_predictions, average='binary')

save output <br>
list of lists with [dialogue index, x index, y index, true attach, true label, predicted attach]