## 1.1. Using Colab GPU for Training


In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install transformers



In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [6]:
from transformers import BertForSequenceClassification, BertTokenizer
# Load a trained model and vocabulary that you have fine-tuned
main_dir = "./drive/My Drive/Lancet/"
model_dir = "model_save_health/"
saved_model_number = 0
output_dir = main_dir+model_dir+str(saved_model_number)
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
import pandas as pd
import numpy as np
import zipfile
downloaded = drive.CreateFile({"id": "13jbSPsFXAj2DWaUdoSxPD6BFm6EFfg0G"})
downloaded.GetContentFile('TXT.zip')
with zipfile.ZipFile('TXT.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [0]:
import os
my_directory = 'Session 74 - 2019/'
files = os.listdir(my_directory)

In [14]:
import nltk
nltk.download('punkt')
#Create dataset of all sentences of that Session
li = []
for filename in files:
  curr_file = my_directory+filename
  df = pd.read_csv(open(curr_file), delimiter='\t', header=None, names=['paragraph'])
  li.append(df)
df = pd.concat(li, axis=0, ignore_index=True)
df=df.dropna()
df['sentences'] = df['paragraph'].map(lambda x: nltk.tokenize.sent_tokenize(x))
df = df.explode('sentences')
df=df.sample(20).reset_index(drop=True)

# Report the number of sentences.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

# Create sentence and label lists
sentences = df.sentences.values

max_len = 0
id=[]
k=0
# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
    if len(input_ids) > 100:
      id.append(k)
    k=k+1


print('Max sentence length: ', max_len)
sentences = np.delete(sentences,id)
len(sentences)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Number of test sentences: 20

Max sentence length:  126


19

In [0]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]' tokens
                        max_length = 110,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

#For evalualtion on an unlabelled dataset we do not have a labels tensor

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [16]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions = []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  
  # Store predictions and true labels
  predictions.append(logits)

print('    DONE.')

Predicting labels for 19 test sentences...
    DONE.


In [24]:
from sklearn.metrics import precision_recall_fscore_support, classification_report, get_scorer, matthews_corrcoef, accuracy_score, cohen_kappa_score, f1_score

# Combine the results across all batches. 
flat_predictions = torch.from_numpy(np.concatenate(predictions, axis=0))

#convert to probabilities with sigmoid activation function
sigmoid = torch.nn.Sigmoid()
prob_predictions = sigmoid(flat_predictions)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

print("Probabilities of label 0 (anything) or 1 (health):")
print(prob_predictions)
print("Predicted label:")
print(flat_predictions)
print("Sentences:")
print(sentences)

Probabilities of label 0 (anything) or 1 (health):
tensor([[0.9844, 0.0063],
        [0.9834, 0.0070],
        [0.8146, 0.1156],
        [0.9395, 0.0262],
        [0.9782, 0.0087],
        [0.9802, 0.0092],
        [0.9699, 0.0142],
        [0.9879, 0.0054],
        [0.9829, 0.0069],
        [0.9816, 0.0071],
        [0.9731, 0.0110],
        [0.9862, 0.0057],
        [0.9840, 0.0065],
        [0.9731, 0.0105],
        [0.9849, 0.0078],
        [0.9863, 0.0062],
        [0.9548, 0.0182],
        [0.9693, 0.0151],
        [0.5827, 0.3263]])
Predicted label:
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Sentences:
['New opportunities have emerged for the region following the first informal consultations at the highest level held in our capital last year.'
 'This went on forever, a bit like the experience of Brexit in the United Kingdom, if some of our parliamentarians had their way.'
 '“I take this opportunity to reaffirm from this rostrum my country’s commitment to c