In [1]:
!pip install torch==1.2.0 torchvision==0.4.0 -f https://download.pytorch.org/whl/torch_stable.html
!pip install seaborn

Looking in links: https://download.pytorch.org/whl/torch_stable.html
You should consider upgrading via the '/export/data/swisspost/enviroments/word_embeddings/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/export/data/swisspost/enviroments/word_embeddings/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import tensorflow as tf
import torch
from transformers import CamembertTokenizer, CamembertForSequenceClassification, CamembertModel

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda", 0)

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(1))
    print(device)

There are 2 GPU(s) available.
We will use the GPU: GeForce GTX TITAN X
cuda:0


In [6]:
DOMAIN = 'abdominal'
PATH = '20200410'

In [7]:
import pandas as pd
# df = pd.read_csv('training_data/simplify_label/ellipsis_training.csv')
df = pd.read_csv(PATH+'/{0}_compactExpansion_200410.txt'.format(DOMAIN), sep=";", names=['canonical', 'variation'])

In [8]:
df = df[~df.variation.isna()]

In [9]:
df = df.drop_duplicates()

In [10]:
import re
preprocess = lambda sent: re.sub(r'\s+', ' ', re.sub('-', ' ', re.sub("'", "' ",  sent.strip())))
preprocess_canonical = lambda sent: re.sub(r'\s+', ' ', sent.strip())

In [11]:
df.variations = df.variation.apply(preprocess)
df.canonical = df.canonical.apply(preprocess_canonical)

  """Entry point for launching an IPython kernel.


In [12]:
df

Unnamed: 0,canonical,variation
0,avez-vous mal au ventre ?,vous êtes venu parce que vous aviez cette doul...
1,avez-vous mal au ventre ?,on ma dit que vous aviez mal à l'abdomen
2,avez-vous mal au ventre ?,est-ce que vous avez noté ce mal abdominale
3,avez-vous mal au ventre ?,on m'a dit que vous aviez un mal à l'abdomen
4,avez-vous mal au ventre ?,cela vous fait-il maux de ventre
...,...,...
218761,avez-vous senti une masse dans les testicules ?,vous vous êtes déjà palpé une boule au niveau ...
218762,avez-vous senti une masse dans les testicules ?,avez-vous noté une masse au niveau du testicule
218763,avez-vous senti une masse dans les testicules ?,avez-vous palpé une masse dans les testicules
218764,avez-vous senti une masse dans les testicules ?,vous arrive-t-il d'éprouver une masse au nivea...


In [13]:
import pickle
def save_model(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path):
    with open(path, 'rb') as f:
        model = pickle.load(f)
    return model

In [14]:
index2canonical = df.canonical.unique()
canonical2index = dict([(canonical, i) for (i, canonical) in enumerate(index2canonical)])

In [15]:
import os
if not os.path.isdir(os.path.join(PATH, DOMAIN)):
    os.mkdir(os.path.join(PATH, DOMAIN))

In [16]:
save_model(index2canonical, PATH+'/{0}/index2canonical'.format(DOMAIN))
save_model(canonical2index, PATH+'/{0}/canonical2index'.format(DOMAIN))

In [17]:
df['label'] = df.canonical.apply(lambda sent: canonical2index[sent])

In [18]:
X = df.variation.values
y = df.label.values

In [19]:
len(df)

218607

In [20]:
cambertTokenizer = CamembertTokenizer.from_pretrained('camembert-base')

In [21]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in X:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = cambertTokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'

                        # This function also supports truncation and conversion
                        # to pytorch tensors, but we need to do padding, so we
                        # can't use these features :( .
                        #max_length = 128,          # Truncate all sentences.
                        #return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_sent)

# print last sentence, now as a list of IDs.
print('Original: ', X[-1])
print('Token IDs:', input_ids[-1])

Original:  vous arrive-t-il d'avoir une masse au niveau du testicule
Token IDs: [5, 39, 1242, 26, 110, 26, 62, 18, 11, 443, 28, 2269, 36, 359, 25, 2006, 14194, 6]


In [22]:
print('Max sentence length: ', max([len(sen) for sen in input_ids]))

Max sentence length:  39


In [23]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [24]:
# max length sentence + 5. For test data.

MAX_LEN = 50

print('\nPadding/truncating all sentences to {} values...'.format(MAX_LEN))

print('\nPadding token: "{:}", ID: {:}'.format(cambertTokenizer.pad_token, cambertTokenizer.pad_token_id))

# Pad our input tokens with value 0.
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")

print('\nDone.')


Padding/truncating all sentences to 50 values...

Padding token: "<pad>", ID: 1

Done.


In [25]:
# Create attention masks
attention_masks = []

# For each sentence...
for sent in input_ids:
    
    # Create the attention mask.
    #   - If a token ID is 0, then it's padding, set the mask to 0.
    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
    att_mask = [int(token_id > 0) for token_id in sent]
    
    # Store the attention mask for this sentence.
    attention_masks.append(att_mask)

In [26]:
# Use train_test_split to split our data into train and validation sets for
# training
from sklearn.model_selection import train_test_split

# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, y, 
                                                            random_state=2018, test_size=0.1)

# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, y,
                                             random_state=2018, test_size=0.1)

In [27]:
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [28]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 25

# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [29]:
model = CamembertForSequenceClassification.from_pretrained('camembert-base', # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = len(index2canonical), # The number of output labels for multi-class classification.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

In [30]:
# train the model on GPU
model.cuda()

CamembertForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [31]:
from transformers import AdamW
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [32]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs 
epochs = 15

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [33]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [34]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [None]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# seed value to make it reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy

        # Track the number of batches
        nb_eval_steps += 1

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
  Batch    40  of  7,870.    Elapsed: 0:00:05.
  Batch    80  of  7,870.    Elapsed: 0:00:10.
  Batch   120  of  7,870.    Elapsed: 0:00:15.
  Batch   160  of  7,870.    Elapsed: 0:00:19.
  Batch   200  of  7,870.    Elapsed: 0:00:24.
  Batch   240  of  7,870.    Elapsed: 0:00:29.
  Batch   280  of  7,870.    Elapsed: 0:00:34.
  Batch   320  of  7,870.    Elapsed: 0:00:39.
  Batch   360  of  7,870.    Elapsed: 0:00:44.
  Batch   400  of  7,870.    Elapsed: 0:00:49.
  Batch   440  of  7,870.    Elapsed: 0:00:54.
  Batch   480  of  7,870.    Elapsed: 0:00:59.
  Batch   520  of  7,870.    Elapsed: 0:01:03.
  Batch   560  of  7,870.    Elapsed: 0:01:08.
  Batch   600  of  7,870.    Elapsed: 0:01:13.
  Batch   640  of  7,870.    Elapsed: 0:01:18.
  Batch   680  of  7,870.    Elapsed: 0:01:23.
  Batch   720  of  7,870.    Elapsed: 0:01:28.
  Batch   760  of  7,870.    Elapsed: 0:01:33.
  Batch   800  of  7,870.    Elapsed: 0:01:38.
  Batch   840  of  7,870.    Elapsed: 0:01:43.


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(loss_values, 'b-o')

# Label the plot.
plt.title("Training loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")

plt.show()

In [None]:
import os

# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = PATH+'/{0}/CambemBERT/'.format(DOMAIN)

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
cambertTokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


In [None]:
with open('test_data/elipsis_test/data_preprocessed/source_concatenate_without_special_char', 'r') as f:
    sentences = [sent.strip() for sent in f.readlines()]

In [None]:
with open('test_data/elipsis_test/data_preprocessed/core_sentences', 'r') as f:
    labels = [canonical2index.get(sent.strip(), -1) for sent in f.readlines()]

In [None]:
import pandas as pd

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(len(sentences)))

# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []

# For every sentence...
for sent in sentences:
    # `encode` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    encoded_sent = cambertTokenizer.encode(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                   )
    
    input_ids.append(encoded_sent)

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, 
                          dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

# Convert to tensors.
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
    with torch.no_grad():
      # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

    logits = outputs[0]

  # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')

In [None]:
from sklearn.metrics import matthews_corrcoef, classification_report

matthews_set = []

# Evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')

# For each input batch...
for i in range(len(true_labels)):
  
  # The predictions for this batch are a 2-column ndarray (one column for "0" 
  # and one column for "1"). Pick the label with the highest value and turn this
  # in to a list of 0s and 1s.
    pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
  # Calculate and store the coef for this batch.  
    matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
    matthews_set.append(matthews)

In [73]:
a = list(filter(lambda prediction: prediction[0] > 0, zip(flat_true_labels, flat_predictions)))

NameError: name 'flat_true_labels' is not defined

In [None]:
true_labels, predictions = list(zip(*a))

In [None]:
print(classification_report(true_labels, predictions))

In [None]:
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]

# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)