In [1]:
import os

os.environ['NO_PROXY'] = '169.254.169.254'

os.environ['HTTP_PROXY'] = '10.239.228.20:8000'

os.environ['HTTPS_PROXY'] = '10.239.228.20:8000'

!cat /etc/resolv.conf

!cat ~/.wgetrcb

!echo "use_proxy=yes\nhttp_proxy=http.proxy.fmr.com:8000\nhttps_proxy=http.proxy.fmr.com:8000" > ~/.wgetrc

 

#cat ~/.wgetrc

!echo $HTTP_PROXY

!echo $HTTPS_PROXY

nameserver 172.16.0.10
search fmr-a642163.svc.gpu-cluster.local svc.gpu-cluster.local gpu-cluster.local fmr.com
options ndots:5
cat: /home/jovyan/.wgetrcb: No such file or directory
10.239.228.20:8000
10.239.228.20:8000


In [4]:
import sys
sys.path.append("/home/jovyan/TF_NEW/tf-transformers/src/")



In [5]:
import os
import tempfile
import json
import glob
import datasets
import shutil

from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

from tf_transformers.data import TFReader, TFWriter
from model import get_model, get_tokenizer, get_optimizer, get_trainer

In [6]:
with initialize(config_path="conf/"):
    cfg = compose(config_name="config", overrides=["data.take_sample=true", "+glue=mrpc"])
    print(cfg)

{'data': {'train_batch_size': 32, 'eval_batch_size': 64, 'take_sample': True, 'max_seq_length': 128}, 'trainer': {'type': 'gpu', 'dtype': 'fp32', 'num_gpus': 2, 'tpu_address': None, 'epochs': 3, 'strategy': 'mirrored'}, 'optimizer': {'learning_rate': 2e-05, 'loss_type': None}, 'glue': {'task': {'name': 'mrpc'}, 'data': {'name': 'mrpc'}}}


In [5]:
# Steps

# 1. Download the data
# 2. Prepare TFRecords
# 3. Read TFrecords to tf.data
# 4. Train the model

In [7]:
# Convert data to features using specific length
# into a temp dir (and log it as well for monitoring)

def get_dataset(data, batch_size, tokenizer, max_seq_length, mode, tfrecord_dir, take_sample=False):
    
    if mode not in ["train", "eval"]:
        raise ValueError("Inavlid mode `{}` specified. Available mode is ['train', 'eval']".format(mode))
    
    def get_tfrecord_example(data):
        result = {}
        for f in data:
            input_ids_s1 = [tokenizer.cls_token] + tokenizer.tokenize(f['sentence1'])[: max_seq_length-2] + [tokenizer.sep_token] # -2 to add CLS and SEP
            input_ids_s1 = tokenizer.convert_tokens_to_ids(input_ids_s1)
            input_type_ids_s1 = [0] * len(input_ids_s1) # 0 for s1

            input_ids_s2 = tokenizer.tokenize(f['sentence2'])[: max_seq_length-1] + [tokenizer.sep_token] # -1 to add SEP
            input_ids_s2 = tokenizer.convert_tokens_to_ids(input_ids_s2)
            input_type_ids_s2 = [1] * len(input_ids_s2)
            
            # concatanate two sentences
            input_ids =  input_ids_s1 + input_ids_s2
            input_type_ids = input_type_ids_s1 + input_type_ids_s2
            input_mask = [1] * len(input_ids) # 1 for s2
            
            result = {}
            result['input_ids'] = input_ids
            result['input_mask'] = input_mask
            result['input_type_ids'] = input_type_ids

            result['labels'] = f['label']
            yield result
            
    schema = {
        "input_ids": ("var_len", "int"),
        "input_mask": ("var_len", "int"),
        "input_type_ids": ("var_len", "int"),
        "labels": ("var_len", "int"),
    }
    
    # Create a temp dir
    if mode == "train":
        # Write tf records
        train_data_dir = os.path.join(tfrecord_dir,"train")        
        tfrecord_filename = 'mrpc'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=train_data_dir,
                            tag='train',
                            overwrite=False
                     )
        data_train = data['train']
        # Take sample
        if take_sample:
            data_train = data_train.select(range(500))
            
        tfwriter.process(parse_fn=get_tfrecord_example(data_train))
        
        # Read tfrecord to dataset
        schema = json.load(open("{}/schema.json".format(train_data_dir)))
        stats  = json.load(open('{}/stats.json'.format(train_data_dir)))
        all_files = glob.glob("{}/*.tfrecord".format(train_data_dir))
        tf_reader = TFReader(schema=schema, 
                            tfrecord_files=all_files)

        x_keys = ['input_ids', 'input_type_ids', 'input_mask']
        y_keys = ['labels']
        train_dataset = tf_reader.read_record(auto_batch=True, 
                                           keys=x_keys,
                                           batch_size=batch_size, 
                                           x_keys = x_keys, 
                                           y_keys = y_keys,
                                           shuffle=True, 
                                           drop_remainder=True
                                          )
        return train_dataset, stats['total_records']
    if mode == "eval":
        # Write tfrecords
        eval_data_dir = os.path.join(tfrecord_dir,"eval")
        tfrecord_filename = 'mrpc'
        tfwriter = TFWriter(schema=schema, 
                            file_name=tfrecord_filename, 
                            model_dir=eval_data_dir,
                            tag='dev',
                            overwrite=False
                            )
        data_eval = data['validation']
        # Take sample
        if take_sample:
            data_eval = data_eval.select(range(500))
        tfwriter.process(parse_fn=get_tfrecord_example(data_eval))
        
        
        # Read tfrecord to dataset
        schema = json.load(open("{}/schema.json".format(eval_data_dir)))
        stats  = json.load(open('{}/stats.json'.format(eval_data_dir)))
        all_files = glob.glob("{}/*.tfrecord".format(eval_data_dir))
        tf_reader = TFReader(schema=schema, 
                            tfrecord_files=all_files)

        x_keys = ['input_ids', 'input_type_ids', 'input_mask']
        y_keys = ['labels']
        eval_dataset = tf_reader.read_record(auto_batch=True, 
                                           keys=x_keys,
                                           batch_size=batch_size, 
                                           x_keys = x_keys, 
                                           y_keys = y_keys,
                                           shuffle=False, 
                                           drop_remainder=False
                                          )
        return eval_dataset, stats['total_records']

In [8]:
cfg

{'data': {'train_batch_size': 32, 'eval_batch_size': 64, 'take_sample': True, 'max_seq_length': 128}, 'trainer': {'type': 'gpu', 'dtype': 'fp32', 'num_gpus': 2, 'tpu_address': None, 'epochs': 3, 'strategy': 'mirrored'}, 'optimizer': {'learning_rate': 2e-05, 'loss_type': None}, 'glue': {'task': {'name': 'mrpc'}, 'data': {'name': 'mrpc'}}}

In [9]:

# Data specific configuration
max_seq_len = cfg.data.max_seq_length
take_sample = cfg.data.take_sample
max_seq_length = cfg.data.max_seq_length
train_batch_size = cfg.data.train_batch_size
eval_batch_size  = cfg.data.eval_batch_size

# Trainer specifics
device = cfg.trainer.type
num_gpus = cfg.trainer.num_gpus
tpu_address = cfg.trainer.tpu_address
dtype = cfg.trainer.dtype
epochs = cfg.trainer.epochs
strategy = cfg.trainer.strategy

# Optimizer
learning_rate = cfg.optimizer.learning_rate
loass_type = cfg.optimizer.loss_type

# Data name
data_name = cfg.glue.data.name



In [10]:
# Load tokenizer
tokenizer = get_tokenizer()

# Load data
data = datasets.load_dataset("glue", data_name)
tfrecord_dir = tempfile.mkdtemp()

train_dataset, total_train_examples = get_dataset(data, train_batch_size,tokenizer, max_seq_length, "train", tfrecord_dir, take_sample)
eval_dataset, total_eval_examples  = get_dataset(data, eval_batch_size,tokenizer, max_seq_len, "eval", tfrecord_dir, take_sample)

Reusing dataset glue (/home/jovyan/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
INFO:absl:Total individual observations/examples written is 500 in 0.36572909355163574 seconds
INFO:absl:All writer objects closed
INFO:absl:Total individual observations/examples written is 500 in 0.3345170021057129 seconds
INFO:absl:All writer objects closed


In [11]:
# Load optimizer
optimizer_fn = get_optimizer(learning_rate, total_train_examples, train_batch_size, epochs)

# Load trainer
trainer = get_trainer(device, dtype, strategy, num_gpus, tpu_address)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


In [21]:
# shutil.rmtree(tfrecord_dir)

In [30]:
total_train_examples

500

In [14]:
model = get_model()

You are using a model of type albert to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
INFO:absl:Successful: Model checkpoints matched and loaded from /tmp/tf_transformers_cache/albert-base-v2/ckpt-1


In [12]:
for (batch_inputs, batch_labels) in train_dataset.take(1):
    print(batch_inputs['input_ids'].shape, batch_labels['labels'].shape)

(32, 93) (32, 1)


In [15]:
outputs = model(batch_inputs)

In [16]:
outputs

{'cls_output': <tf.Tensor: shape=(32, 768), dtype=float32, numpy=
 array([[ 0.7143398 , -0.65813   , -0.9785618 , ...,  0.99996495,
          0.99999976,  0.5848551 ],
        [ 0.0306453 ,  0.00973307, -0.99677414, ...,  0.9994981 ,
         -0.38448253, -0.11974888],
        [-0.40589306,  0.49201494, -0.99863875, ...,  0.99995124,
          0.41152513, -0.48804575],
        ...,
        [-0.45018688,  0.6705278 , -0.98600817, ...,  0.9998185 ,
         -0.31839788, -0.6285347 ],
        [-0.42206424,  0.6171094 , -0.992722  , ...,  0.9996224 ,
          0.9642419 , -0.5217529 ],
        [ 0.2351484 , -0.28753084, -0.99572015, ...,  0.99625677,
         -0.9023692 ,  0.3213496 ]], dtype=float32)>,
 'token_embeddings': <tf.Tensor: shape=(32, 93, 768), dtype=float32, numpy=
 array([[[ 1.5806692 , -1.2263874 ,  1.4549923 , ..., -1.0687431 ,
          -0.5606896 ,  1.2192034 ],
         [-0.7002921 , -0.40231714, -1.216911  , ..., -0.23197146,
           0.25958478, -0.28148273],
       

In [11]:
class Callback():
    
    def __init__(self):
        pass
    
    def call(trainer_kwargs):
        
        for k, v in trainer_kwargs.items():
            print(k, '-->', v)
callback = Callback()

In [None]:
history = trainer.run(
    model_fn = get_model,
    optimizer_fn = optimizer_fn,
    train_dataset = train_dataset,
    train_loss_fn,
    epochs,
    steps_per_epoch,
    model_checkpoint_dir,
    batch_size,
    training_loss_names=None,
    validation_loss_names=None,
    validation_dataset=None,
    validation_loss_fn=None,
    validation_interval_steps=None,
    steps_per_call=100,
    enable_xla=True,
    callbacks=None,
    callbacks_interval_steps=None,
    overwrite_checkpoint_dir=False,
    max_number_of_models=10,
    model_save_interval_steps=None,
    repeat_dataset=True,
    latest_checkpoint=None,
)