# Setup

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime







from pybert.bert_tf_lib import run_classifier
from pybert.bert_tf_lib import optimization
from pybert.bert_tf_lib import tokenization

W0430 23:09:36.247522 140358862772032 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
from pybert.config.basic_config import configs as config

In [3]:
# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = config['train']['batch_size'] 
LEARNING_RATE = config['train']['learning_rate'] 
NUM_TRAIN_EPOCHS = config['train']['epochs'] 
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = config['train']['warmup_proportion'] 


MAX_SEQ_LEN=config['train']['max_seq_len']

# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

## Define Output Directory

In [4]:
OUTPUT_DIR = config['output']['checkpoint_dir']       #@param {type:"string"}



#If true, deletes previous checkpoints
DO_DELETE = False #@param {type:"boolean"}



if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass

print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: pybert/output/checkpoints *****


# Data Preprocessing
We'll need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. 
- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.
- `label` is the label for our example, i.e. True, False

Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):

    Lowercase our text (if we're using a BERT lowercase model)
    Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
    Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
    Map our words to indexes using a vocab file that BERT provides
    Add special "CLS" and "SEP" tokens (see the readme)
    Append "index" and "segment" tokens to each input (see the BERT paper)

Happily, we don't have to worry about most of these details.

To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:


## Load Data

In [5]:
import warnings

from torch.utils.data import DataLoader
from pybert.io.dataset import CreateDataset
from pybert.io.data_transformer import DataTransformer
from pybert.utils.logginger import init_logger
from pybert.utils.utils import seed_everything
from pybert.preprocessing.preprocessor import EnglishPreProcessor
from pytorch_pretrained_bert.tokenization import BertTokenizer
warnings.filterwarnings("ignore")


In [6]:
logger = init_logger(log_name=config['model']['arch'], log_dir=config['output']['log_dir'])
logger.info(f"seed is {config['train']['seed']}")
device = f"cuda: {config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'}"
seed_everything(seed=config['train']['seed'],device=device)
logger.info('starting load data from disk')
id2label = {value: key for key, value in config['label2id'].items()}

[2019-04-30 23:09:45]: bert <ipython-input-6-92bf49f76f77>[line:2] INFO  seed is 2018
I0430 23:09:45.865490 140358862772032 <ipython-input-6-92bf49f76f77>:2] seed is 2018
[2019-04-30 23:09:45]: bert <ipython-input-6-92bf49f76f77>[line:5] INFO  starting load data from disk
I0430 23:09:45.868750 140358862772032 <ipython-input-6-92bf49f76f77>:5] starting load data from disk


In [7]:
DT = DataTransformer(logger = logger,seed = config['train']['seed'])

In [8]:
targets,sentences = DT.read_data(raw_data_path = config['data']['raw_data_path'],
                                    preprocessor = EnglishPreProcessor(),
                                    is_train = True)

100%|██████████| 159571/159571 [00:06<00:00, 22853.01it/s]


In [9]:
# TRAIN VALIDATION SPLIT
train, valid = DT.train_val_split(X = sentences,y = targets,save=True,shuffle=True,stratify=False,
                                      valid_size  = config['train']['valid_size'],
                                      train_path  = config['data']['train_file_path'],
                                      valid_path  = config['data']['valid_file_path'])

[2019-04-30 23:09:55]: bert data_transformer.py[line:82] INFO  train val split
I0430 23:09:55.317772 140358862772032 data_transformer.py:82] train val split
Merge: 159571it [00:00, 2154265.25it/s]
write data to disk: 100%|██████████| 127657/127657 [00:00<00:00, 273361.46it/s]
write data to disk: 100%|██████████| 31914/31914 [00:00<00:00, 267859.75it/s]


In [10]:
 # TOKENIZE WITH BERT TOKENIZER
tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])


In [11]:
# train
train_dataset   = CreateDataset(data = train,
                                    tokenizer = tokenizer,
                                    max_seq_len = config['train']['max_seq_len'],
                                    seed = config['train']['seed'],
                                    example_type = 'train')

In [12]:
#valid
valid_dataset   = CreateDataset(data= valid,
                                    tokenizer = tokenizer,
                                    max_seq_len  = config['train']['max_seq_len'],
                                    seed = config['train']['seed'],
                                    example_type = 'valid')

In [13]:
train_features = []


for example in train_dataset.examples:
    feature = train_dataset.build_features(example)
    feature.label_id = [int(label) for label in feature.label_id]
    train_features.append(feature)

In [14]:
test_features = []


for example in valid_dataset.examples:
    feature = valid_dataset.build_features(example)
    feature.label_id = [int(label) for label in feature.label_id]
    test_features.append(feature)

In [15]:
train_features[0]

<pybert.io.dataset.InputFeatures at 0x7fa6f02aabe0>

In [16]:
train_features[55].label_id

[1, 0, 1, 0, 1, 1]

# Creating a model

Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning).

In [25]:
def multi_label_hot(prediction, threshold=0.5):
    prediction = tf.cast(prediction, tf.float32)
    threshold = float(threshold)
    return tf.cast(tf.greater(prediction, threshold), tf.int64)


In [26]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,num_labels):
  #Creates a classification model.

    bert_module = hub.Module(BERT_MODEL_HUB,trainable=True)
    
    bert_inputs = dict(input_ids=input_ids,
                       input_mask=input_mask,
                       segment_ids=segment_ids)
  
    bert_outputs = bert_module(inputs=bert_inputs,
                               signature="tokens",
                               as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

  # Create our own layer to tune for politeness data.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        
        


    # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=one_hot_labels)
        
        log_probs = tf.sigmoid(logits)
        one_hot_prediction = multi_label_hot(log_probs)
        
        
    # If we're predicting, we want predicted labels and the probabiltiies.
        if is_predicting:
            return (one_hot_prediction,log_probs)

    # If we're train/eval, compute loss between predicted and actual label
        #per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(tf.reduce_sum(loss, axis=1))
        return (loss,one_hot_prediction, log_probs)


 Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction.

In [27]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.

def model_fn_builder(num_labels, learning_rate, num_train_steps,num_warmup_steps):
    
    
#Returns `model_fn` closure for TPUEstimator."""
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    #The `model_fn` for TPUEstimator."""

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_id"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
    
        # TRAIN and EVAL
        if not is_predicting:

            (loss, predicted_labels, log_probs) = create_model(is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

            train_op = bert.optimization.create_optimizer(loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics. 
            def metric_fn(label_id, predicted_labels):
                
                accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
            
                #f1_score = tf.contrib.metrics.f1_score(label_ids,predicted_labels)
            
                auc = tf.metrics.auc(label_ids,predicted_labels)
        
                #recall = tf.metrics.recall(label_ids,predicted_labels)
        
                #precision = tf.metrics.precision(label_ids,predicted_labels) 
        
                #true_pos = tf.metrics.true_positives(label_ids,predicted_labels)
                #true_neg = tf.metrics.true_negatives(label_ids,predicted_labels)   
        
                #false_pos = tf.metrics.false_positives(label_ids,predicted_labels)
            
            
                #false_neg = tf.metrics.false_negatives(label_ids,predicted_labels)
        
                return {
                    "eval_accuracy": accuracy,
                    #"f1_score": f1_score,
                    "auc": auc,
                    #"precision": precision,
                    #"recall": recall,
                    #"true_positives": true_pos,
                    #"true_negatives": true_neg,
                    #"false_positives": false_pos,
                    #"false_negatives": false_neg
                }

            eval_metrics = metric_fn(label_ids, predicted_labels)

            if mode == tf.estimator.ModeKeys.TRAIN:
                return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)
        
            else:
                return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              eval_metric_ops=eval_metrics)
        else:
            (predicted_labels, log_probs) = create_model(is_predicting, input_ids, input_mask, segment_ids, label_id, num_labels)

            predictions = {
                'probabilities': log_probs,
                'labels': predicted_labels
                          }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
    return model_fn

In [28]:
# Compute # train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [29]:
# Specify outpit directory and number of checkpoint steps to save
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [30]:
model_fn = model_fn_builder(num_labels=6,
                            learning_rate=LEARNING_RATE,
                            num_train_steps=num_train_steps,
                            num_warmup_steps=num_warmup_steps)




estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   config=run_config,
                                   params={"batch_size": BATCH_SIZE})

INFO:tensorflow:Using config: {'_model_dir': 'pybert/output/checkpoints', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa6eb448940>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


I0430 23:20:15.105865 140358862772032 estimator.py:201] Using config: {'_model_dir': 'pybert/output/checkpoints', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa6eb448940>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators).

In [31]:
# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LEN,
    is_training=True,
    drop_remainder=False)

In [32]:
print(f'Beginning Training!')
current_time = datetime.now()
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Beginning Training!
INFO:tensorflow:Calling model_fn.


I0430 23:21:07.807149 140358862772032 estimator.py:1111] Calling model_fn.


KeyError: 'label_id'

6.0