# Predicting Judicial Decisions of the European Court of Human Rights

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In this notebook, we aim to train a classification model to classify cases as 'violation' or 'non-violation' using a Bert Sequence Classification model from the Transformer library. 
The cases were originally downloaded from HUDOC and structured based on the articles they fall under.

In [0]:
!pip install transformers



In [0]:
import tensorflow as tf
from transformers import *
import torch

In [0]:
import numpy as np
import re
import os
import copy

### Activate logging

In [0]:
import logging
logging.basicConfig(level=logging.INFO)

In [0]:
device = torch.device("cuda")

In [0]:
n_gpu = torch.cuda.device_count()
n_gpu

1

In [0]:
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

## Data

In [0]:
def read_dataset(PATH):
    X_dataset = {}
    Y_dataset = {}
    for path, dirs, files in os.walk(PATH):
        for filename in files:
            fullpath = os.path.join(path, filename)
            if "both" not in fullpath:
                with open(fullpath, 'r', encoding="utf8") as file:
                    X_dataset, Y_dataset = add_file_to_dataset(fullpath, X_dataset, Y_dataset, file.read())

    return X_dataset, Y_dataset       

In [0]:
def add_file_to_dataset(fullpath, x_dataset, y_dataset, file):
    article = extract_article(fullpath)
    file = preprocess(file)
    if article not in x_dataset.keys() :
        x_dataset[article] = []
        y_dataset[article] = []
    x_dataset[article] = x_dataset[article] + [file]
    label = 0 if "non-violation" in fullpath else 1
    y_dataset[article] = y_dataset[article] + [label]
    return x_dataset, y_dataset  

In [0]:
def extract_article(path): 
    pattern = r"(Article\d+)"
    result = re.search(pattern, path)
    article = result.group(1)
    return article

### Preprocessing 

Similar to the research paper this work is based on, we will only use the PROCEDURE and THE FACTS paragraphs of the cases as our training set. Otherwise, the model may be biased.

In [0]:
def preprocess(file): 
    file = extract_paragraphs(file)
    return file

In [0]:
def extract_paragraphs(file): 
  # Remove any non-ASCII characters
  file = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', file)

  # # Remove any number at the beginning of a new line
  # pat = r'\n[0-9].'
  # result = re.findall(pat, file, re.S | re.IGNORECASE)
  # for group in result:
  #     file = file.replace(group, "\n")

  # Extract THE FACTS paragraphs
  pat = r'((THE CIRCUMSTANCES OF THE CASE\s*\n.+?RELEVANT DOMESTIC LAW.+?)|(\n(AS TO THE FACTS|THE FACTS)\s*\n.+?))(\nIII\.|THE LAW\s*\n|PROCEEDINGS BEFORE THE COMMISSION\s*\n|ALLEGED VIOLATION OF ARTICLE [0-9]+ OF THE CONVENTION \s*\n)'
  result = re.search(pat, file, re.S |  re.IGNORECASE)

  content = ""
  content += result.group(1)
  return content

### Loading the data

In [0]:
base_path = "/content/drive/My Drive/Colab Notebooks/Datasets/Human rights dataset"

In [0]:
X_train_docs, Y_train_docs = read_dataset(base_path + "/train")
X_test_docs, Y_test_docs = read_dataset(base_path + "/test20")

Also, similarly to Medvedeva, M., Vols, M. & Wieling, M. Artif Intell Law (2019), we want to remove the articles which contain too few cases. We include Article 11 "as an estimate of how well the model performs when only very few cases are available".

In [0]:
def select_articles(train_set):
    selected_training_set = copy.deepcopy(train_set)
    
    for key in train_set.keys():
        if len(train_set[key]) <= 50:
            selected_training_set.pop(key)
            continue
    return selected_training_set

In [0]:
X_train_docs = select_articles(X_train_docs)

In [0]:
print(len(X_train_docs))

9


In [0]:
print(X_train_docs.keys())

dict_keys(['Article11', 'Article10', 'Article13', 'Article5', 'Article3', 'Article6', 'Article14', 'Article2', 'Article8'])


### Combining all the articles according to class

In [0]:
X_train = X_train_docs["Article2"] + X_train_docs["Article3"] + X_train_docs["Article5"] + X_train_docs["Article6"] + X_train_docs["Article8"] + X_train_docs["Article10"] + X_train_docs["Article11"] + X_train_docs["Article13"] + X_train_docs["Article14"]

In [0]:
Y_train = Y_train_docs["Article2"] + Y_train_docs["Article3"] + Y_train_docs["Article5"] + Y_train_docs["Article6"] + Y_train_docs["Article8"] + Y_train_docs["Article10"] + Y_train_docs["Article11"] + Y_train_docs["Article13"] + Y_train_docs["Article14"]

In [0]:
X_test = X_test_docs["Article2"] + X_test_docs["Article3"] + X_test_docs["Article5"] + X_test_docs["Article6"] + X_test_docs["Article8"] + X_test_docs["Article10"] + X_test_docs["Article11"] + X_test_docs["Article13"] + X_test_docs["Article14"]

In [0]:
Y_test = Y_test_docs["Article2"] + Y_test_docs["Article3"] + Y_test_docs["Article5"] + Y_test_docs["Article6"] + Y_test_docs["Article8"] + Y_test_docs["Article10"] + Y_test_docs["Article11"] + Y_test_docs["Article13"] + Y_test_docs["Article14"]

In [0]:
len(Y_train)

3131

## Formatting Data

Credit to https://mccormickml.com/2019/07/22/BERT-fine-tuning/ for explaining and demonstrating how to train Bert

#### Tokenization training set

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

INFO:filelock:Lock 139863659028888 acquired on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpb2pgdp65


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt in cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
INFO:filelock:Lock 139863659028888 released on /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f7




In [0]:
# Tokenize all of the sentences, map the tokens to their word IDs and retrieve attention masks.
attention_masks = []
input_ids = []

# `encode` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Pad shorter sentences until they all have the maximum length.
#   (5) Map tokens to their IDs.
#   (6) Map which tokens are actual words versus which are padding.
encoded_sent = tokenizer.batch_encode_plus(
                    X_train,                   # list of sentences to encode.
                    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    pad_to_max_length = True,  # Add padding
                    max_length = 512
               )
    
# Retrieve attention mask and token IDs.
attention_masks = encoded_sent['attention_mask']
input_ids = encoded_sent['input_ids']

In [0]:
train_masks = np.array(attention_masks)
train_inputs = np.array(input_ids)
train_labels = np.array(Y_train)

#### Tokenization test set

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [0]:
# Tokenize all of the sentences, map the tokens to their word IDs and retrieve attention masks.
attention_masks = []
input_ids = []

# `encode` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Pad shorter sentences until they all have the maximum length.
#   (5) Map tokens to their IDs.
#   (6) Map which tokens are actual words versus which are padding.
encoded_sent = tokenizer.batch_encode_plus(
                    X_test,                   # list of sentences to encode.
                    add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                    pad_to_max_length = True,  # Add padding
                    max_length = 512
               )
    
# Retrieve attention mask and token IDs.
validation_masks = encoded_sent['attention_mask']
validation_inputs = encoded_sent['input_ids']

In [0]:
validation_masks = np.array(validation_masks)
validation_inputs = np.array(validation_inputs)
validation_labels = np.array(Y_test)

#### Save tensors

In [0]:
base_path = "/content/drive/My Drive/Colab Notebooks/BertModel/"

# Converting to PyTorch Data Types
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
#print("Shape: " + str(validation_inputs.shape))

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
#print("Shape: " + str(validation_labels.shape))


train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks) 
#print("Shape: " + str(validation_masks.shape))


# Saving our training data
torch.save(train_inputs, base_path + 'train_inputs_test20.pt')
torch.save(validation_inputs, base_path + 'validation_inputs_test20.pt')

torch.save(train_labels, base_path + 'train_labels_test20.pt')
torch.save(validation_labels, base_path + 'validation_labels_test20.pt')

torch.save(train_masks, base_path + 'train_masks_test20.pt')
torch.save(validation_masks, base_path + 'validation_masks_test20.pt')

## Training with Bert Model

### Metrics

In [0]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [0]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
# accuracy: (tp + tn) / (p + n)
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [0]:
# Function to calculate the precision of our predictions vs labels
# precision tp / (tp + fp)
from sklearn.metrics import precision_score

def flat_precision(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return precision_score(labels_flat, pred_flat)   

In [0]:
# Function to calculate the recall of our predictions vs labels
# recall: tp / (tp + fn)
from sklearn.metrics import recall_score

def flat_recall(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    recall = recall_score(labels_flat, pred_flat)   
    return recall

In [0]:
# Function to calculate the f1 score of our predictions vs labels
# f1: 2 tp / (2 tp + fp + fn)
from sklearn.metrics import f1_score

def flat_f1(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    f1 = f1_score(labels_flat, pred_flat)   
    return f1

Credit to https://mccormickml.com/2019/07/22/BERT-fine-tuning/ for explaining and demonstrating how to train Bert

### Loading PyTorch Tensors

In [0]:
base_path = "/content/drive/My Drive/Colab Notebooks/BertModel/"

In [0]:
train_inputs = torch.load(base_path + 'train_inputs_test20.pt')
validation_inputs = torch.load(base_path + 'validation_inputs_test20.pt')

train_labels = torch.load(base_path + 'train_labels_test20.pt')
validation_labels = torch.load(base_path + 'validation_labels_test20.pt')

train_masks = torch.load(base_path + 'train_masks_test20.pt')
validation_masks = torch.load(base_path + 'validation_masks_test20.pt')

In [0]:
train_inputs.shape, train_labels.shape, train_masks.shape

(torch.Size([3131, 512]), torch.Size([3131]), torch.Size([3131, 512]))

In [0]:
validation_inputs.shape, validation_labels.shape, validation_masks.shape

(torch.Size([784, 512]), torch.Size([784]), torch.Size([784, 512]))

In [0]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 16

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


### Fine-tuning

In [0]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",          # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,               # The number of output labels--2 for binary classification.
    output_attentions = False,    # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
model.cuda()

INFO:filelock:Lock 140497941182616 acquired on /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpvosj519i


HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json in cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
INFO:filelock:Lock 140497941182616 released on /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685.lock
INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5




INFO:filelock:Lock 140501548774456 acquired on /root/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.lock
INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmpuprii0ze


HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin in cache at /root/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
INFO:filelock:Lock 140501548774456 released on /root/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.lock
INFO:transformers.modeling_utils:loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin from cache at /root/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5




INFO:transformers.modeling_utils:Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
INFO:transformers.modeling_utils:Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# The 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 3e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [0]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

### Training

In [0]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        # It returns different numbers of parameters depending on what arguments
        # arge given and what flags are set. For our useage here, it returns
        # the loss (because we provided labels) and the "logits"--the model
        # outputs prior to activation.
        loss, logits = model(b_input_ids, 
                             token_type_ids=None, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_precision = 0
    total_eval_recall = 0
    total_eval_f1 = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # Get the "logits" output by the model. The "logits" are the output
            # values prior to applying an activation function like the softmax.
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=None, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        total_eval_precision += flat_precision(logits, label_ids)
        total_eval_recall += flat_recall(logits, label_ids)
        total_eval_f1 += flat_f1(logits, label_ids)

        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Report the final precision  for this validation run.
    avg_val_precision = total_eval_precision / len(validation_dataloader)
    print("  Precision: {0:.2f}".format(avg_val_precision))

    # Report the final recall for this validation run.
    avg_val_recall = total_eval_recall / len(validation_dataloader)
    print("  Recall: {0:.2f}".format(avg_val_recall))

    # Report the final f1 for this validation run.
    avg_val_f1 = total_eval_f1 / len(validation_dataloader)
    print("  F1: {0:.2f}".format(avg_val_f1))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Valid. Prec.': avg_val_precision,
            'Valid. Recall': avg_val_recall,
            'Valid. F1': avg_val_f1,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    196.    Elapsed: 0:00:35.
  Batch    80  of    196.    Elapsed: 0:01:10.
  Batch   120  of    196.    Elapsed: 0:01:44.
  Batch   160  of    196.    Elapsed: 0:02:19.

  Average training loss: 0.63
  Training epcoh took: 0:02:50

Running Validation...


  _warn_prf(average, modifier, msg_start, len(result))


  Accuracy: 0.69
  Precision: 0.53
  Recall: 0.55
  F1: 0.52
  Validation Loss: 0.58
  Validation took: 0:00:13

Training...
  Batch    40  of    196.    Elapsed: 0:00:35.
  Batch    80  of    196.    Elapsed: 0:01:10.
  Batch   120  of    196.    Elapsed: 0:01:44.
  Batch   160  of    196.    Elapsed: 0:02:19.

  Average training loss: 0.54
  Training epcoh took: 0:02:50

Running Validation...
  Accuracy: 0.74
  Precision: 0.54
  Recall: 0.47
  F1: 0.48
  Validation Loss: 0.53
  Validation took: 0:00:13

Training complete!
Total training took 0:06:07 (h:mm:ss)


### Results

In [0]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Valid. Prec.,Valid. Recall,Valid. F1,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.63,0.58,0.69,0.53,0.55,0.52,0:02:50,0:00:13
2,0.54,0.53,0.74,0.54,0.47,0.48,0:02:50,0:00:13


In [0]:
model.save_pretrained('/content/drive/My Drive/Colab Notebooks/BertModel/Model')
print("Saved")  # save

INFO:transformers.configuration_utils:Configuration saved in /content/drive/My Drive/Colab Notebooks/BertModel/Model/config.json
INFO:transformers.modeling_utils:Model weights saved in /content/drive/My Drive/Colab Notebooks/BertModel/Model/pytorch_model.bin


Saved
