<a href="https://colab.research.google.com/github/isaacmg/task-vt/blob/re_model_revised/drug_treatment_extraction/notebooks/ROBERTA_RE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training Roberta

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 17.2MB/s eta 0:00:01[K     |█▏                              | 20kB 3.1MB/s eta 0:00:01[K     |█▊                              | 30kB 4.1MB/s eta 0:00:01[K     |██▎                             | 40kB 2.9MB/s eta 0:00:01[K     |███                             | 51kB 3.4MB/s eta 0:00:01[K     |███▌                            | 61kB 4.0MB/s eta 0:00:01[K     |████                            | 71kB 4.3MB/s eta 0:00:01[K     |████▋                           | 81kB 4.5MB/s eta 0:00:01[K     |█████▎                          | 92kB 5.0MB/s eta 0:00:01[K     |█████▉                          | 102kB 4.9MB/s eta 0:00:01[K     |██████▍                         | 112kB 4.9MB/s eta 0:00:01[K     |███████                         | 122kB 4.9M

In [0]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import EarlyStopping

In [3]:
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_train_data.csv drug_train_data.csv
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_dev_data.csv drug_dev_data.csv 
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_test_data.csv drug_test_data.csv
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_train_data.csv synth_train_data.csv
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_dev_data.csv synth_dev_data.csv 
!gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_test_data.csv synth_test_data.csv

Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_train_data.csv...
/ [0 files][    0.0 B/ 50.6 KiB]                                                / [1 files][ 50.6 KiB/ 50.6 KiB]                                                
Operation completed over 1 objects/50.6 KiB.                                     
Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_dev_data.csv...
/ [1 files][  1.5 KiB/  1.5 KiB]                                                
Operation completed over 1 objects/1.5 KiB.                                      
Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_test_data.csv...
/ [1 files][ 35.2 KiB/ 35.2 KiB]                                                
Operation completed over 1 objects/35.2 KiB.                                     
Copying gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_train_data.csv...
/ [1 files][386.1 KiB/386.1 KiB]                                                
Operation completed over 1 obj

In [0]:
def create_train_dev_test(prefix):
  train = pd.read_csv(prefix + "_train_data.csv")
  dev = pd.read_csv(prefix + "_dev_data.csv")
  test = pd.read_csv(prefix + "_test_data.csv")
  return train, test, dev

transfer_train, transfer_test, transfer_dev = create_train_dev_test("synth")
drug_train, drug_test, drug_dev = create_train_dev_test("drug")
drug_full = pd.concat([drug_train, drug_test])


In [0]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    return np.array(enc_di['input_ids'])

In [0]:
def build_model(transformer, max_len=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision(), 'accuracy'])
    return model

In [0]:
max_len = int(int(drug_train.sentence.str.len().max()))
avg_len = int(drug_train.sentence.str.len().median())

# Configuration
EPOCHS = 4
MAX_LEN = max_len

In [8]:
MODEL = "allenai/biomed_roberta_base" #'xlnet-base-cased'

# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(IntProgress(value=0, description='Downloading', max=495, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=898822, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=2, style=ProgressStyle(description_width='i…




HBox(children=(IntProgress(value=0, description='Downloading', max=150, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=185, style=ProgressStyle(description_width=…




In [9]:
%%time 
x_train = regular_encode(transfer_train.sentence.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(transfer_dev.sentence.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(transfer_test.sentence.values, tokenizer, maxlen=MAX_LEN)
y_train = transfer_train.label.values
y_valid = transfer_dev.label.values
y_test  = transfer_test.label.values

CPU times: user 2.58 s, sys: 76.2 ms, total: 2.66 s
Wall time: 2.66 s


In [0]:
es = EarlyStopping(monitor='val_accuracy', 
                    min_delta=0.001, 
                    patience=2,
                    verbose=1, 
                    mode='max', 
                    restore_best_weights=True)

In [11]:
!pip install wandb
!wandb login
import wandb
from wandb.keras import WandbCallback
wandb.init(project="vt-relation-extract", sync_tensorboard=True)

Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/ca/18/ef5215832f523c29f6e0c19a5b87e0dd90fe40fb48ba38362f961be14e4f/wandb-0.8.31-py2.py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.0MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading https://files.pythonhosted.org/packages/f5/e8/f6bd1eee09314e7e6dee49cbe2c5e22314ccdb38db16c9fc72d2fa80d054/docker_pycreds-0.4.0-py2.py3-none-any.whl
Collecting configparser>=3.8.1
  Downloading https://files.pythonhosted.org/packages/4b/6b/01baa293090240cf0562cc5eccb69c6f5006282127f2b846fad011305c79/configparser-5.0.0-py3-none-any.whl
Collecting subprocess32>=3.5.3
[?25l  Downloading https://files.pythonhosted.org/packages/32/c8/564be4d12629b912ea431f1a50eb8b3b9d00f1a0b1ceff17f266be190007/subprocess32-3.5.4.tar.gz (97kB)
[K     |████████████████████████████████| 102kB 10.1MB/s 
Collecting GitPython>=1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/19/1a/0df85d2bddbca33665d2148173d3

W&B Run: https://app.wandb.ai/igodfried/vt-relation-extract/runs/2s31r5r7

In [12]:
from transformers import AutoModel
strategy = tf.distribute.MirroredStrategy(cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
model = AutoModel.from_pretrained("allenai/biomed_roberta_base")
!mkdir biomed_roberta_base
model.save_pretrained("biomed_roberta_base")
with strategy.scope():
  model = TFAutoModel.from_pretrained("biomed_roberta_base", from_pt=True)
  model = build_model(model)
BATCH_SIZE = 2 * strategy.num_replicas_in_sync


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


HBox(children=(IntProgress(value=0, description='Downloading', max=655615582, style=ProgressStyle(description_…




In [13]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 512, 768), (None, 124645632 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 124,646,401
Trainable params: 124,646,401
Non-trainable params: 0
_________________________________________________________________


In [14]:
x_test = regular_encode(drug_test.sentence.values, tokenizer, maxlen=MAX_LEN)
y_test  = drug_test.label.values

train_history = model.fit(
                        x_train, y_train,
                        batch_size = BATCH_SIZE,
                        validation_data=(x_test, y_test),
                        callbacks=[es, WandbCallback()],
                        epochs=EPOCHS
                        )

Epoch 1/4
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tenso

ResourceExhaustedError: ignored

In [0]:
x_train = regular_encode(drug_train.sentence.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(drug_dev.sentence.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(drug_test.sentence.values, tokenizer, maxlen=MAX_LEN)
y_train = drug_train.label.values
y_valid = drug_dev.label.values
y_test  = drug_test.label.values

In [16]:
train_history = model.fit(
                        x_train, y_train,
                        batch_size=BATCH_SIZE,
                        validation_data=(x_test, y_test),
                        callbacks=[es],
                        epochs=8
                        )

Epoch 1/8


KeyboardInterrupt: ignored

In [0]:
model.save("best_epoch_roberta3.h5")


NotImplementedError: ignored

In [0]:
tf.keras.models.save_model(model, "the_model.h5")

NotImplementedError: ignored

In [0]:
from transformers import TFBertForSequenceClassification
#!cp biomed_roberta_base/config.json 
TFBertForSequenceClassification.from_pretrained('best_epoch_roberta')

OSError: ignored

In [0]:
#!cp biomed_roberta_base/config.json  best_epoch_roberta/config.json  

AttributeError: ignored

In [0]:
from google.colab import auth
from datetime import datetime
auth.authenticate_user()
!gsutil cp -r best_epoch_roberta gs://coronaviruspublicdata/temp_data/snapshots

Copying file://best_epoch_roberta/config.json [Content-Type=application/json]...
Copying file://best_epoch_roberta/saved_model.pb [Content-Type=application/octet-stream]...
Copying file://best_epoch_roberta/variables/variables.index [Content-Type=application/octet-stream]...
Copying file://best_epoch_roberta/variables/variables.data-00001-of-00002 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite obje

In [0]:
tf.keras.models.load_model("best_epoch_roberta")

ValueError: ignored

In [110]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [112]:
import pickle 
pickle.dump(model, open( "model.pickle", "wb" ) )
!gsutil cp model.pickle gs://coronaviruspublicdata/model.pickle

Copying file://model.pickle [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/475.6 MiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

\
Operation completed over 1 objects/475.6 MiB.                                    


In [0]:
def save_model(model, transformer_dir='transformer'):
    """
    Special function to load a keras model that uses a transformer layer
    """
    transformer = model.layers[1]
    touch_dir(transformer_dir)
    transformer.save_pretrained(transformer_dir)
    sigmoid = model.get_layer('sigmoid').get_weights()
    pickle.dump(sigmoid, open('sigmoid.pickle', 'wb'))

def load_model(transformer_dir='transformer', max_len=256):
    """
    Special function to load a keras model that uses a transformer layer
    """
    transformer = TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)
    sigmoid = pickle.load(open('sigmoid.pickle', 'rb'))
    model.get_layer('sigmoid').set_weights(sigmoid)

    return model

In [17]:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7413 sha256=05834e7a8995c349ac00c967859edc9d2114aab304b17cf3e1659582c1db14da
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
Gen RAM Free: 22.4 GB  | Proc size: 6.3 GB
GPU RAM Free: 913MB | Used: 15367MB | Util  94% | Total 16280MB
