# Training Roberta for relation classification
The code is adapted from [this Kaggle notebook](https://www.kaggle.com/xhlulu/jigsaw-tpu-xlm-roberta).
Our goal is to train a binary classification model to determine if drug-treatment sentences actually contain a relation.

In [None]:
# Notebook mode switch: cells guarded by `run_type == 'save'` train and export
# the model, the cell guarded by `run_type == 'load'` restores an exported one.
run_type = "load"

In [None]:
#!pip install transformers

In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import AutoModel
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import pickle

In [None]:
# data was made with the make-train notebook also found in the repo.

# !gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_train_data.csv drug_train_data.csv
# !gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_dev_data.csv drug_dev_data.csv 
# !gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/drug_test_data.csv drug_test_data.csv
# !gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_train_data.csv synth_train_data.csv
# !gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_dev_data.csv synth_dev_data.csv 
# !gsutil cp  gs://coronaviruspublicdata/snapshot_re_4_12_2020/synth_test_data.csv synth_test_data.csv

In [None]:
# def create_train_dev_test(prefix):
#   train = pd.read_csv(prefix + "_train_data.csv")
#   dev = pd.read_csv(prefix + "_dev_data.csv")
#   test = pd.read_csv(prefix + "_test_data.csv")
#   return train, test, dev

# transfer_train, transfer_test, transfer_dev = create_train_dev_test("synth")
# drug_train, drug_test, drug_dev = create_train_dev_test("drug")
# drug_full = pd.concat([drug_train, drug_test])

if run_type == 'save':
    # Training data comes from the 'Syn_Data' sheet of the annotation workbook;
    # the expert annotation column is used as the label.
    transfer_data = pd.read_excel(
        '/kaggle/input/drugvisdata-syn/DrugVisData - All Annotations_Aradhana.xlsx',
        sheet_name='Syn_Data',
    )
    transfer_data = transfer_data.rename(columns={'annotation_expert_1': 'label'})
    #transfer_data = transfer_data.fillna(2)
    # Drop every row containing a missing value. `axis` is passed by keyword
    # because pandas >= 2.0 made dropna's arguments keyword-only, so the
    # positional form dropna(0) raises TypeError there.
    transfer_data = transfer_data.dropna(axis=0)
    # Alternative input that prefixes the drug name to the sentence:
    #x_train,x_test,y_train,y_test=train_test_split(transfer_data.drug+'[SEP]'+transfer_data.sentence,transfer_data['label'],test_size=0.2)
    # Hold out 20% of the sentences/labels for validation.
    x_train, x_test, y_train, y_test = train_test_split(
        transfer_data.sentence,
        transfer_data['label'],
        test_size=0.2,
    )

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of sentences and return their token ids as an array.

    Args:
        texts: the sentences to encode; forwarded to the tokenizer unchanged.
        tokenizer: object exposing ``batch_encode_plus`` whose result has an
            ``'input_ids'`` entry.
        maxlen: value forwarded as ``max_length``; padding to the maximum
            length is requested as well.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` of the encoded batch.
    """
    # Only the token ids are needed downstream, so attention masks and
    # token type ids are switched off.
    encode_options = {
        'return_attention_masks': False,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': maxlen,
        'sep_token': '[SEP]',
    }
    encoded_batch = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoded_batch['input_ids']
    return np.array(token_ids)

In [None]:
def build_model(transformer, max_len=512):
    """Wrap a transformer in a compiled Keras binary classifier.

    The network takes ``max_len`` int32 token ids, passes them through
    ``transformer``, keeps the vector at sequence position 0 of the first
    transformer output and feeds it to a single sigmoid unit.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: callable transformer layer (the notebook passes a model
            loaded with ``TFAutoModel``).
        max_len: length of the token-id input.

    Returns:
        The compiled ``Model`` (Adam with lr=1e-6, binary cross-entropy,
        recall / precision / accuracy metrics).
    """
    token_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    hidden_states = transformer(token_ids)[0]
    first_token_vector = hidden_states[:, 0, :]
    # The layer name 'sigmoid' is what load_model uses to restore the head.
    probability = Dense(1, activation='sigmoid', name='sigmoid')(first_token_vector)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        Adam(lr=1e-6),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.Recall(), tf.keras.metrics.Precision(), 'accuracy'],
    )
    return classifier

In [None]:
#max_len = int(int(drug_train.sentence.str.len().max()))
#avg_len = int(drug_train.sentence.str.len().median())
if run_type == 'save':
    # Character-length statistics of the raw training sentences.
    # Note that `avg_len` holds the median, not the mean.
    max_len = int(x_train.str.len().max())
    avg_len = int(x_train.str.len().median())

In [None]:
# Configuration parameters
MAX_LEN = 512  # max_length handed to the tokenizer and input width of the model
EPOCHS = 4     # NOTE: the model.fit call below hard-codes epochs=5 and does not read this

In [None]:
# Hugging Face checkpoint identifier of the pretrained model
MODEL = 'allenai/biomed_roberta_base'

# Load the tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
%%time 
# x_train = regular_encode(transfer_train.sentence.values, tokenizer, maxlen=MAX_LEN)
# x_valid = regular_encode(transfer_dev.sentence.values, tokenizer, maxlen=MAX_LEN)
# x_test = regular_encode(transfer_test.sentence.values, tokenizer, maxlen=MAX_LEN)
# y_train = transfer_train.label.values
# y_valid = transfer_dev.label.values
# y_test  = transfer_test.label.values

if run_type == 'save':
    # Replace the raw sentence Series of both splits with token-id arrays.
    x_train, x_test = (
        regular_encode(split.values, tokenizer, maxlen=MAX_LEN)
        for split in (x_train, x_test)
    )

In [None]:
# Early-stopping callback driven by validation accuracy (higher is better);
# the best weights seen are restored when training stops.
es = EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    min_delta=0.001,
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# !pip install wandb
# !wandb login
# import wandb
# from wandb.keras import WandbCallback
# wandb.init(project="vt-relation-extract", sync_tensorboard=True)

In [None]:
if run_type == 'save':
    strategy = tf.distribute.MirroredStrategy(
        cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()
    )

    # Fetch the checkpoint with the PyTorch loader and store a local copy, so
    # the TF model below can be created from it with from_pt=True.
    pt_model = AutoModel.from_pretrained("allenai/biomed_roberta_base")
    # os.makedirs replaces the `!mkdir` shell escape: it is plain Python,
    # portable, and does not fail when the cell is re-run.
    os.makedirs("biomed_roberta_base", exist_ok=True)
    pt_model.save_pretrained("biomed_roberta_base")
    del pt_model  # the PyTorch copy is not needed once it is on disk

    # Build and compile the classifier inside the distribution scope.
    with strategy.scope():
        transformer_layer = TFAutoModel.from_pretrained("biomed_roberta_base", from_pt=True)
        model = build_model(transformer_layer)

    # Two examples per replica.
    BATCH_SIZE = 2 * strategy.num_replicas_in_sync

In [None]:
# Show the layer/parameter overview of the freshly built classifier.
if run_type == 'save':
    model.summary()

In [None]:
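# NOTE: the two commented-out lines below would replace x_test/y_test with the
# annotated drug data. With them enabled, validating on different data than we
# train on may look like a bug, but in reality we only care about the
# performance on the annotated drug data and not what we are training on.
# While they stay commented out, validation uses the held-out split instead.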

#x_test = regular_encode(drug_full.sentence.values, tokenizer, maxlen=MAX_LEN)
#y_test  = drug_full.label.values

if run_type == 'save':
    # Train on the encoded sentences and validate on the held-out split;
    # `es` may stop the run early.
    # The epoch count is hard-coded here instead of using EPOCHS.
    # (To log to W&B, use callbacks=[es, WandbCallback()].)
    train_history = model.fit(
        x_train,
        y_train,
        validation_data=(x_test, y_test),
        batch_size=BATCH_SIZE,
        epochs=5,
        callbacks=[es],
    )

In [None]:
# Show the model overview again once training has finished.
if run_type == 'save':
    model.summary()

### Saving/Exporting
A model isn't useful if it cannot be used in a production pipeline.

In [None]:
# from google.colab import auth
# from datetime import datetime
# auth.authenticate_user()
# !gsutil cp -r best_epoch_roberta gs://coronaviruspublicdata/temp_data/snapshots

In [None]:
#import pickle 
#save model, input: sentence, output: binary
#pickle.dump(model, open( "bioERT_model1.pickle", "wb" ) )
# !gsutil cp model.pickle gs://coronaviruspublicdata/model.pickle

In [None]:
def save_model(model, transformer_dir='transformer'):
    """Export a Keras model that wraps a transformer layer, in two parts.

    The transformer layer is written with its own ``save_pretrained`` into
    ``transformer_dir``; the weights of the dense head named ``'sigmoid'`` are
    pickled to ``sigmoid.pickle`` in the current working directory.

    Args:
        model: model built by ``build_model`` (transformer at ``layers[1]``,
            dense head named ``'sigmoid'``).
        transformer_dir: directory that receives the transformer files; it is
            created if it does not exist yet.
    """
    # In the model produced by build_model the transformer directly follows
    # the input layer.
    transformer = model.layers[1]
    # Create the directory that is actually written to. The previous
    # `!mkdir transformer` ignored `transformer_dir` and is not plain Python.
    os.makedirs(transformer_dir, exist_ok=True)
    transformer.save_pretrained(transformer_dir)

    # Look the head up by name, the same way load_model restores it, rather
    # than by a positional layer index.
    sigmoid = model.get_layer('sigmoid').get_weights()
    with open('sigmoid.pickle', 'wb') as handle:
        pickle.dump(sigmoid, handle)

def load_model(pickle_path, transformer_dir='transformer', max_len=512):
    """Rebuild a classifier from the two artifacts written by ``save_model``.

    Note: this definition shadows the ``load_model`` imported from
    ``tensorflow.keras.models`` at the top of the notebook.

    Args:
        pickle_path: path of the pickled weights of the ``'sigmoid'`` head.
        transformer_dir: directory containing the saved transformer.
        max_len: input length passed on to ``build_model``.

    Returns:
        The compiled model from ``build_model`` with the stored head weights
        applied.
    """
    transformer = TFAutoModel.from_pretrained(transformer_dir)
    model = build_model(transformer, max_len=max_len)

    # SECURITY: unpickling can execute arbitrary code, so only load pickle
    # files from a trusted source (e.g. ones written by save_model).
    with open(pickle_path, 'rb') as handle:
        sigmoid = pickle.load(handle)
    model.get_layer('sigmoid').set_weights(sigmoid)

    return model

In [None]:
if run_type == 'save':
    # Write the transformer directory and sigmoid.pickle ...
    save_model(model)
    # ... then zip the contents of /kaggle/working/ into biobert_output.zip.
    shutil.make_archive(
        base_name='biobert_output',
        format='zip',
        root_dir='/kaggle/working/',
    )

In [None]:
if run_type == 'load':
    # Restore the classifier from the previously exported artifacts.
    model = load_model(
        pickle_path="/kaggle/input/biobert-model1/sigmoid.pickle",
        transformer_dir="/kaggle/input/biobert-model1/transformer",
    )

In [None]:
# model.summary()
# model.get_layer(index=3)

In [None]:
# !gsutil cp -r transformer3 gs://coronaviruspublicdata/re_final_best2/s
# !gsutil cp sigmoid3.pickle gs://coronaviruspublicdata/re_final_best2/s

### Qualitative Evaluation
We will now qualitatively look at a few examples.

In [None]:
# test_examples = regular_encode(["As with Acacia and PAN, the LAC prospectus affirms that digital technologies and ICTbased solutions provide a powerful tool to change the ways in which health services are managed and delivered to the population at large, and to low-income and marginalized communities in particular"], tokenizer, maxlen=MAX_LEN)
# model.predict(test_examples)

In [None]:
# test_examples = regular_encode(["Glatiramer acetate (Copaxone) therapy induces an oligoclonal CD8+ T cell response with cytotoxic ability R"], tokenizer, maxlen=MAX_LEN)
# model.predict(test_examples)

In [None]:
# Load the annotation sheet and expose the expert annotation as `label`.
new_data = pd.read_excel(
    '/kaggle/input/drugvisdata-syn/DrugVisData - All Annotations_Aradhana.xlsx',
    sheet_name='DrugVisData - Copy',
).rename(columns={'annotation_expert_1': 'label'})
# Keep only the rows whose label is neither 0 nor 1.
new_data = new_data[(new_data.label != 0) & (new_data.label != 1)]
# Label-based slice on the original index: rows labelled 0 through 250.
new_data = new_data.loc[0:250, :]

In [None]:
# One row per distinct sentence, re-indexed from 0.
output_data = new_data.drop_duplicates(subset='sentence').reset_index(drop=True)

# Score all sentences with a single encode + predict call instead of invoking
# model.predict once per row, which pays the call overhead for every sentence.
# str() mirrors the per-row conversion used before (missing values become 'nan').
sentences = [str(sentence) for sentence in output_data['sentence']]
if sentences:
    encoded_sentences = regular_encode(sentences, tokenizer, maxlen=MAX_LEN)
    # The model has one sigmoid output per sentence: take that column.
    # Cast to float64 so the CSV is written with the same column dtype as
    # the former scalar-by-scalar assignment produced.
    scores = model.predict(encoded_sentences, batch_size=8)[:, 0]
    output_data['BioBERT_Prediction'] = scores.astype('float64')

output_data.to_csv('bioBERT_Output_Negated.csv', header=True)

In [None]:
# print(predictions[0])
# print("sentence is " + drug_dev.sentence.values[0])
# print("real label is " + str(drug_dev.label.values[0]))

In [None]:
# print(predictions[2])
# print("sentence is: " + drug_dev.sentence.values[2])
# print("real label is " + str(drug_dev.label.values[2]))

In [None]:
# print(predictions[4])
# print("sentence is: " + drug_dev.sentence.values[4])
# print("real label is " + str(drug_dev.label.values[4]))

### Tests for RAM usage
Basic check to determine how much RAM is available.

In [None]:
# !ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
# !pip install gputil
# !pip install psutil
# !pip install humanize
# import psutil
# import humanize
# import os
# import GPUtil as GPU
# GPUs = GPU.getGPUs()
# # XXX: only one GPU on Colab and isn’t guaranteed
# gpu = GPUs[0]
# def printm():
#  process = psutil.Process(os.getpid())
#  print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
#  print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
# printm()

In [None]:
# from google.colab import auth
# from datetime import datetime
# auth.authenticate_user()


In [None]:
# !gsutil cp -r transformer gs://coronaviruspublicdata/re_snapshot/4_13_2020
# !gsutil cp sigmoid.pickle gs://coronaviruspublicdata/re_snapshot/4_13_2020